exec.c source code [linux/fs/exec.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/exec.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* #!-checking implemented by tytso.
10	*/
11	/*
12	* Demand-loading implemented 01.12.91 - no need to read anything but
13	* the header into memory. The inode of the executable is put into
14	* "current->executable", and page faults do the actual loading. Clean.
15	*
16	* Once more I can proudly say that linux stood up to being changed: it
17	* was less than 2 hours work to get demand-loading completely implemented.
18	*
19	* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
20	* current->executable is only used by the procfs. This allows a dispatch
21	* table to check for several different types of binary formats. We keep
22	* trying until we recognize the file or we run out of supported binary
23	* formats.
24	*/
25
26	#include <linux/kernel_read_file.h>
27	#include <linux/slab.h>
28	#include <linux/file.h>
29	#include <linux/fdtable.h>
30	#include <linux/mm.h>
31	#include <linux/stat.h>
32	#include <linux/fcntl.h>
33	#include <linux/swap.h>
34	#include <linux/string.h>
35	#include <linux/init.h>
36	#include <linux/sched/mm.h>
37	#include <linux/sched/coredump.h>
38	#include <linux/sched/signal.h>
39	#include <linux/sched/numa_balancing.h>
40	#include <linux/sched/task.h>
41	#include <linux/pagemap.h>
42	#include <linux/perf_event.h>
43	#include <linux/highmem.h>
44	#include <linux/spinlock.h>
45	#include <linux/key.h>
46	#include <linux/personality.h>
47	#include <linux/binfmts.h>
48	#include <linux/utsname.h>
49	#include <linux/pid_namespace.h>
50	#include <linux/module.h>
51	#include <linux/namei.h>
52	#include <linux/mount.h>
53	#include <linux/security.h>
54	#include <linux/syscalls.h>
55	#include <linux/tsacct_kern.h>
56	#include <linux/cn_proc.h>
57	#include <linux/audit.h>
58	#include <linux/kmod.h>
59	#include <linux/fsnotify.h>
60	#include <linux/fs_struct.h>
61	#include <linux/oom.h>
62	#include <linux/compat.h>
63	#include <linux/vmalloc.h>
64	#include <linux/io_uring.h>
65	#include <linux/syscall_user_dispatch.h>
66	#include <linux/coredump.h>
67	#include <linux/time_namespace.h>
68	#include <linux/user_events.h>
69
70	#include <linux/uaccess.h>
71	#include <asm/mmu_context.h>
72	#include <asm/tlb.h>
73
74	#include <trace/events/task.h>
75	#include "internal.h"
76
77	#include <trace/events/sched.h>
78
static int bprm_creds_from_file(struct linux_binprm *bprm);

/* Initialised to 0 at build time. */
int suid_dumpable = 0;

/* List of registered binary formats and the rwlock that guards it. */
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
85
86	void __register_binfmt(struct linux_binfmt * fmt, int insert)
87	{
88	write_lock(&binfmt_lock);
89	insert ? list_add(new: &fmt->lh, head: &formats) :
90	list_add_tail(new: &fmt->lh, head: &formats);
91	write_unlock(&binfmt_lock);
92	}
93
94	EXPORT_SYMBOL(__register_binfmt);
95
96	void unregister_binfmt(struct linux_binfmt * fmt)
97	{
98	write_lock(&binfmt_lock);
99	list_del(entry: &fmt->lh);
100	write_unlock(&binfmt_lock);
101	}
102
103	EXPORT_SYMBOL(unregister_binfmt);
104
105	static inline void put_binfmt(struct linux_binfmt * fmt)
106	{
107	module_put(module: fmt->module);
108	}
109
110	bool path_noexec(const struct path *path)
111	{
112	return (path->mnt->mnt_flags & MNT_NOEXEC) \|\|
113	(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
114	}
115
116	#ifdef CONFIG_USELIB
117	/*
118	* Note that a shared library must be both readable and executable due to
119	* security reasons.
120	*
121	* Also note that we take the address to load from the file itself.
122	*/
123	SYSCALL_DEFINE1(uselib, const char __user *, library)
124	{
125	struct linux_binfmt *fmt;
126	struct file *file;
127	struct filename *tmp = getname(library);
128	int error = PTR_ERR(ptr: tmp);
129	static const struct open_flags uselib_flags = {
130	.open_flag = O_LARGEFILE \| O_RDONLY \| __FMODE_EXEC,
131	.acc_mode = MAY_READ \| MAY_EXEC,
132	.intent = LOOKUP_OPEN,
133	.lookup_flags = LOOKUP_FOLLOW,
134	};
135
136	if (IS_ERR(ptr: tmp))
137	goto out;
138
139	file = do_filp_open(AT_FDCWD, pathname: tmp, op: &uselib_flags);
140	putname(name: tmp);
141	error = PTR_ERR(ptr: file);
142	if (IS_ERR(ptr: file))
143	goto out;
144
145	/*
146	* may_open() has already checked for this, so it should be
147	* impossible to trip now. But we need to be extra cautious
148	* and check again at the very end too.
149	*/
150	error = -EACCES;
151	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) \|\|
152	path_noexec(&file->f_path)))
153	goto exit;
154
155	error = -ENOEXEC;
156
157	read_lock(&binfmt_lock);
158	list_for_each_entry(fmt, &formats, lh) {
159	if (!fmt->load_shlib)
160	continue;
161	if (!try_module_get(module: fmt->module))
162	continue;
163	read_unlock(&binfmt_lock);
164	error = fmt->load_shlib(file);
165	read_lock(&binfmt_lock);
166	put_binfmt(fmt);
167	if (error != -ENOEXEC)
168	break;
169	}
170	read_unlock(&binfmt_lock);
171	exit:
172	fput(file);
173	out:
174	return error;
175	}
176	#endif /* #ifdef CONFIG_USELIB */
177
178	#ifdef CONFIG_MMU
179	/*
180	* The nascent bprm->mm is not visible until exec_mmap() but it can
181	* use a lot of memory, account these pages in current->mm temporary
182	* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
183	* change the counter back via acct_arg_size(0).
184	*/
185	static void acct_arg_size(struct linux_binprm bprm, unsigned* long pages)
186	{
187	struct mm_struct *mm = current->mm;
188	long diff = (long)(pages - bprm->vma_pages);
189
190	if (!mm \|\| !diff)
191	return;
192
193	bprm->vma_pages = pages;
194	add_mm_counter(mm, member: MM_ANONPAGES, value: diff);
195	}
196
197	static struct page get_arg_page(struct* linux_binprm bprm, unsigned* long pos,
198	int write)
199	{
200	struct page *page;
201	struct vm_area_struct *vma = bprm->vma;
202	struct mm_struct *mm = bprm->mm;
203	int ret;
204
205	/*
206	* Avoid relying on expanding the stack down in GUP (which
207	* does not work for STACK_GROWSUP anyway), and just do it
208	* by hand ahead of time.
209	*/
210	if (write && pos < vma->vm_start) {
211	mmap_write_lock(mm);
212	ret = expand_downwards(vma, address: pos);
213	if (unlikely(ret < `0`)) {
214	mmap_write_unlock(mm);
215	return NULL;
216	}
217	mmap_write_downgrade(mm);
218	} else
219	mmap_read_lock(mm);
220
221	/*
222	* We are doing an exec(). 'current' is the process
223	* doing the exec and 'mm' is the new process's mm.
224	*/
225	ret = get_user_pages_remote(mm, start: pos, nr_pages: `1`,
226	gup_flags: write ? FOLL_WRITE : `0`,
227	pages: &page, NULL);
228	mmap_read_unlock(mm);
229	if (ret <= `0`)
230	return NULL;
231
232	if (write)
233	acct_arg_size(bprm, pages: vma_pages(vma));
234
235	return page;
236	}
237
/* Release @page by handing it to put_page(). */
static void put_arg_page(struct page *page)
{
	put_page(page);
}
242
/* Intentionally empty in this (CONFIG_MMU) branch. */
static void free_arg_pages(struct linux_binprm *bprm)
{
}
246
247	static void flush_arg_page(struct linux_binprm bprm, unsigned* long pos,
248	struct page *page)
249	{
250	flush_cache_page(vma: bprm->vma, vmaddr: pos, page_to_pfn(page));
251	}
252
253	static int __bprm_mm_init(struct linux_binprm *bprm)
254	{
255	int err;
256	struct vm_area_struct *vma = NULL;
257	struct mm_struct *mm = bprm->mm;
258
259	bprm->vma = vma = vm_area_alloc(mm);
260	if (!vma)
261	return -ENOMEM;
262	vma_set_anonymous(vma);
263
264	if (mmap_write_lock_killable(mm)) {
265	err = -EINTR;
266	goto err_free;
267	}
268
269	/*
270	* Place the stack at the largest stack address the architecture
271	* supports. Later, we'll move this to an appropriate place. We don't
272	* use STACK_TOP because that can depend on attributes which aren't
273	* configured yet.
274	*/
275	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
276	vma->vm_end = STACK_TOP_MAX;
277	vma->vm_start = vma->vm_end - PAGE_SIZE;
278	vm_flags_init(vma, VM_SOFTDIRTY \| VM_STACK_FLAGS \| VM_STACK_INCOMPLETE_SETUP);
279	vma->vm_page_prot = vm_get_page_prot(vm_flags: vma->vm_flags);
280
281	err = insert_vm_struct(mm, vma);
282	if (err)
283	goto err;
284
285	mm->stack_vm = mm->total_vm = `1`;
286	mmap_write_unlock(mm);
287	bprm->p = vma->vm_end - sizeof(void *);
288	return `0`;
289	err:
290	mmap_write_unlock(mm);
291	err_free:
292	bprm->vma = NULL;
293	vm_area_free(vma);
294	return err;
295	}
296
297	static bool valid_arg_len(struct linux_binprm bprm, long* len)
298	{
299	return len <= MAX_ARG_STRLEN;
300	}
301
302	#else
303
/* No accounting is done in this (!CONFIG_MMU) branch. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}
307
308	static struct page get_arg_page(struct* linux_binprm bprm, unsigned* long pos,
309	int write)
310	{
311	struct page *page;
312
313	page = bprm->page[pos / PAGE_SIZE];
314	if (!page && write) {
315	page = alloc_page(GFP_HIGHUSER\|__GFP_ZERO);
316	if (!page)
317	return NULL;
318	bprm->page[pos / PAGE_SIZE] = page;
319	}
320
321	return page;
322	}
323
/* Intentionally empty in this (!CONFIG_MMU) branch. */
static void put_arg_page(struct page *page)
{
}
327
328	static void free_arg_page(struct linux_binprm bprm, int* i)
329	{
330	if (bprm->page[i]) {
331	__free_page(bprm->page[i]);
332	bprm->page[i] = NULL;
333	}
334	}
335
336	static void free_arg_pages(struct linux_binprm *bprm)
337	{
338	int i;
339
340	for (i = `0`; i < MAX_ARG_PAGES; i++)
341	free_arg_page(bprm, i);
342	}
343
/* Intentionally empty in this (!CONFIG_MMU) branch. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
}
348
349	static int __bprm_mm_init(struct linux_binprm *bprm)
350	{
351	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
352	return `0`;
353	}
354
355	static bool valid_arg_len(struct linux_binprm bprm, long* len)
356	{
357	return len <= bprm->p;
358	}
359
360	#endif /* CONFIG_MMU */
361
362	/*
363	* Create a new mm_struct and populate it with a temporary stack
364	* vm_area_struct. We don't have enough context at this point to set the stack
365	* flags, permissions, and offset, so we use temporary values. We'll update
366	* them later in setup_arg_pages().
367	*/
368	static int bprm_mm_init(struct linux_binprm *bprm)
369	{
370	int err;
371	struct mm_struct *mm = NULL;
372
373	bprm->mm = mm = mm_alloc();
374	err = -ENOMEM;
375	if (!mm)
376	goto err;
377
378	/ Save current stack limit for all calculations made during exec. /
379	task_lock(current->group_leader);
380	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
381	task_unlock(current->group_leader);
382
383	err = __bprm_mm_init(bprm);
384	if (err)
385	goto err;
386
387	return `0`;
388
389	err:
390	if (mm) {
391	bprm->mm = NULL;
392	mmdrop(mm);
393	}
394
395	return err;
396	}
397
398	struct user_arg_ptr {
399	#ifdef CONFIG_COMPAT
400	bool is_compat;
401	#endif
402	union {
403	const char __user *const __user *native;
404	#ifdef CONFIG_COMPAT
405	const compat_uptr_t __user *compat;
406	#endif
407	} ptr;
408	};
409
410	static const char __user get_user_arg_ptr(struct* user_arg_ptr argv, int nr)
411	{
412	const char __user *native;
413
414	#ifdef CONFIG_COMPAT
415	if (unlikely(argv.is_compat)) {
416	compat_uptr_t compat;
417
418	if (get_user(compat, argv.ptr.compat + nr))
419	return ERR_PTR(error: -EFAULT);
420
421	return compat_ptr(uptr: compat);
422	}
423	#endif
424
425	if (get_user(native, argv.ptr.native + nr))
426	return ERR_PTR(error: -EFAULT);
427
428	return native;
429	}
430
431	/*
432	* count() counts the number of strings in array ARGV.
433	*/
434	static int count(struct user_arg_ptr argv, int max)
435	{
436	int i = `0`;
437
438	if (argv.ptr.native != NULL) {
439	for (;;) {
440	const char __user *p = get_user_arg_ptr(argv, nr: i);
441
442	if (!p)
443	break;
444
445	if (IS_ERR(ptr: p))
446	return -EFAULT;
447
448	if (i >= max)
449	return -E2BIG;
450	++i;
451
452	if (fatal_signal_pending(current))
453	return -ERESTARTNOHAND;
454	cond_resched();
455	}
456	}
457	return i;
458	}
459
460	static int count_strings_kernel(const char *const *argv)
461	{
462	int i;
463
464	if (!argv)
465	return `0`;
466
467	for (i = `0`; argv[i]; ++i) {
468	if (i >= MAX_ARG_STRINGS)
469	return -E2BIG;
470	if (fatal_signal_pending(current))
471	return -ERESTARTNOHAND;
472	cond_resched();
473	}
474	return i;
475	}
476
477	static int bprm_stack_limits(struct linux_binprm *bprm)
478	{
479	unsigned long limit, ptr_size;
480
481	/*
482	* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
483	* (whichever is smaller) for the argv+env strings.
484	* This ensures that:
485	* - the remaining binfmt code will not run out of stack space,
486	* - the program will have a reasonable amount of stack left
487	* to work from.
488	*/
489	limit = _STK_LIM / `4` * `3`;
490	limit = min(limit, bprm->rlim_stack.rlim_cur / `4`);
491	/*
492	* We've historically supported up to 32 pages (ARG_MAX)
493	* of argument strings even with small stacks
494	*/
495	limit = max_t(unsigned long, limit, ARG_MAX);
496	/*
497	* We must account for the size of all the argv and envp pointers to
498	* the argv and envp strings, since they will also take up space in
499	* the stack. They aren't stored until much later when we can't
500	* signal to the parent that the child has run out of stack space.
501	* Instead, calculate it here so it's possible to fail gracefully.
502	*
503	* In the case of argc = 0, make sure there is space for adding a
504	* empty string (which will bump argc to 1), to ensure confused
505	* userspace programs don't start processing from argv[1], thinking
506	* argc can never be 0, to keep them from walking envp by accident.
507	* See do_execveat_common().
508	*/
509	ptr_size = (max(bprm->argc, `1`) + bprm->envc) * sizeof(void *);
510	if (limit <= ptr_size)
511	return -E2BIG;
512	limit -= ptr_size;
513
514	bprm->argmin = bprm->p - limit;
515	return `0`;
516	}
517
518	/*
519	* 'copy_strings()' copies argument/environment strings from the old
520	* processes's memory to the new process's stack. The call to get_user_pages()
521	* ensures the destination page is created and not swapped out.
522	*/
523	static int copy_strings(int argc, struct user_arg_ptr argv,
524	struct linux_binprm *bprm)
525	{
526	struct page *kmapped_page = NULL;
527	char *kaddr = NULL;
528	unsigned long kpos = `0`;
529	int ret;
530
531	while (argc-- > `0`) {
532	const char __user *str;
533	int len;
534	unsigned long pos;
535
536	ret = -EFAULT;
537	str = get_user_arg_ptr(argv, nr: argc);
538	if (IS_ERR(ptr: str))
539	goto out;
540
541	len = strnlen_user(str, MAX_ARG_STRLEN);
542	if (!len)
543	goto out;
544
545	ret = -E2BIG;
546	if (!valid_arg_len(bprm, len))
547	goto out;
548
549	/ We're going to work our way backwards. /
550	pos = bprm->p;
551	str += len;
552	bprm->p -= len;
553	#ifdef CONFIG_MMU
554	if (bprm->p < bprm->argmin)
555	goto out;
556	#endif
557
558	while (len > `0`) {
559	int offset, bytes_to_copy;
560
561	if (fatal_signal_pending(current)) {
562	ret = -ERESTARTNOHAND;
563	goto out;
564	}
565	cond_resched();
566
567	offset = pos % PAGE_SIZE;
568	if (offset == `0`)
569	offset = PAGE_SIZE;
570
571	bytes_to_copy = offset;
572	if (bytes_to_copy > len)
573	bytes_to_copy = len;
574
575	offset -= bytes_to_copy;
576	pos -= bytes_to_copy;
577	str -= bytes_to_copy;
578	len -= bytes_to_copy;
579
580	if (!kmapped_page \|\| kpos != (pos & PAGE_MASK)) {
581	struct page *page;
582
583	page = get_arg_page(bprm, pos, write: `1`);
584	if (!page) {
585	ret = -E2BIG;
586	goto out;
587	}
588
589	if (kmapped_page) {
590	flush_dcache_page(page: kmapped_page);
591	kunmap_local(kaddr);
592	put_arg_page(page: kmapped_page);
593	}
594	kmapped_page = page;
595	kaddr = kmap_local_page(page: kmapped_page);
596	kpos = pos & PAGE_MASK;
597	flush_arg_page(bprm, pos: kpos, page: kmapped_page);
598	}
599	if (copy_from_user(to: kaddr+offset, from: str, n: bytes_to_copy)) {
600	ret = -EFAULT;
601	goto out;
602	}
603	}
604	}
605	ret = `0`;
606	out:
607	if (kmapped_page) {
608	flush_dcache_page(page: kmapped_page);
609	kunmap_local(kaddr);
610	put_arg_page(page: kmapped_page);
611	}
612	return ret;
613	}
614
615	/*
616	* Copy and argument/environment string from the kernel to the processes stack.
617	*/
618	int copy_string_kernel(const char arg, struct* linux_binprm *bprm)
619	{
620	int len = strnlen(p: arg, MAX_ARG_STRLEN) + `1` / terminating NUL /;
621	unsigned long pos = bprm->p;
622
623	if (len == `0`)
624	return -EFAULT;
625	if (!valid_arg_len(bprm, len))
626	return -E2BIG;
627
628	/ We're going to work our way backwards. /
629	arg += len;
630	bprm->p -= len;
631	if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
632	return -E2BIG;
633
634	while (len > `0`) {
635	unsigned int bytes_to_copy = min_t(unsigned int, len,
636	min_not_zero(offset_in_page(pos), PAGE_SIZE));
637	struct page *page;
638
639	pos -= bytes_to_copy;
640	arg -= bytes_to_copy;
641	len -= bytes_to_copy;
642
643	page = get_arg_page(bprm, pos, write: `1`);
644	if (!page)
645	return -E2BIG;
646	flush_arg_page(bprm, pos: pos & PAGE_MASK, page);
647	memcpy_to_page(page, offset_in_page(pos), from: arg, len: bytes_to_copy);
648	put_arg_page(page);
649	}
650
651	return `0`;
652	}
653	EXPORT_SYMBOL(copy_string_kernel);
654
655	static int copy_strings_kernel(int argc, const char *const *argv,
656	struct linux_binprm *bprm)
657	{
658	while (argc-- > `0`) {
659	int ret = copy_string_kernel(argv[argc], bprm);
660	if (ret < `0`)
661	return ret;
662	if (fatal_signal_pending(current))
663	return -ERESTARTNOHAND;
664	cond_resched();
665	}
666	return `0`;
667	}
668
669	#ifdef CONFIG_MMU
670
671	/*
672	* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
673	* the binfmt code determines where the new stack should reside, we shift it to
674	* its final location. The process proceeds as follows:
675	*
676	* 1) Use shift to calculate the new vma endpoints.
677	* 2) Extend vma to cover both the old and new ranges. This ensures the
678	* arguments passed to subsequent functions are consistent.
679	* 3) Move vma's page tables to the new range.
680	* 4) Free up any cleared pgd range.
681	* 5) Shrink the vma to cover only the new range.
682	*/
683	static int shift_arg_pages(struct vm_area_struct vma, unsigned* long shift)
684	{
685	struct mm_struct *mm = vma->vm_mm;
686	unsigned long old_start = vma->vm_start;
687	unsigned long old_end = vma->vm_end;
688	unsigned long length = old_end - old_start;
689	unsigned long new_start = old_start - shift;
690	unsigned long new_end = old_end - shift;
691	VMA_ITERATOR(vmi, mm, new_start);
692	struct vm_area_struct *next;
693	struct mmu_gather tlb;
694
695	BUG_ON(new_start > new_end);
696
697	/*
698	* ensure there are no vmas between where we want to go
699	* and where we are
700	*/
701	if (vma != vma_next(vmi: &vmi))
702	return -EFAULT;
703
704	vma_iter_prev_range(vmi: &vmi);
705	/*
706	* cover the whole range: [new_start, old_end)
707	*/
708	if (vma_expand(vmi: &vmi, vma, start: new_start, end: old_end, pgoff: vma->vm_pgoff, NULL))
709	return -ENOMEM;
710
711	/*
712	* move the page tables downwards, on failure we rely on
713	* process cleanup to remove whatever mess we made.
714	*/
715	if (length != move_page_tables(vma, old_addr: old_start,
716	new_vma: vma, new_addr: new_start, len: length, need_rmap_locks: false, for_stack: true))
717	return -ENOMEM;
718
719	lru_add_drain();
720	tlb_gather_mmu(tlb: &tlb, mm);
721	next = vma_next(vmi: &vmi);
722	if (new_end > old_start) {
723	/*
724	* when the old and new regions overlap clear from new_end.
725	*/
726	free_pgd_range(tlb: &tlb, addr: new_end, end: old_end, floor: new_end,
727	ceiling: next ? next->vm_start : USER_PGTABLES_CEILING);
728	} else {
729	/*
730	* otherwise, clean from old_start; this is done to not touch
731	* the address space in [new_end, old_start) some architectures
732	* have constraints on va-space that make this illegal (IA64) -
733	* for the others its just a little faster.
734	*/
735	free_pgd_range(tlb: &tlb, addr: old_start, end: old_end, floor: new_end,
736	ceiling: next ? next->vm_start : USER_PGTABLES_CEILING);
737	}
738	tlb_finish_mmu(tlb: &tlb);
739
740	vma_prev(vmi: &vmi);
741	/ Shrink the vma to just the new range /
742	return vma_shrink(vmi: &vmi, vma, start: new_start, end: new_end, pgoff: vma->vm_pgoff);
743	}
744
745	/*
746	* Finalizes the stack vm_area_struct. The flags and permissions are updated,
747	* the stack is optionally relocated, and some extra space is added.
748	*/
749	int setup_arg_pages(struct linux_binprm *bprm,
750	unsigned long stack_top,
751	int executable_stack)
752	{
753	unsigned long ret;
754	unsigned long stack_shift;
755	struct mm_struct *mm = current->mm;
756	struct vm_area_struct *vma = bprm->vma;
757	struct vm_area_struct *prev = NULL;
758	unsigned long vm_flags;
759	unsigned long stack_base;
760	unsigned long stack_size;
761	unsigned long stack_expand;
762	unsigned long rlim_stack;
763	struct mmu_gather tlb;
764	struct vma_iterator vmi;
765
766	#ifdef CONFIG_STACK_GROWSUP
767	/ Limit stack size /
768	stack_base = bprm->rlim_stack.rlim_max;
769
770	stack_base = calc_max_stack_size(stack_base);
771
772	/ Add space for stack randomization. /
773	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
774
775	/ Make sure we didn't let the argument array grow too large. /
776	if (vma->vm_end - vma->vm_start > stack_base)
777	return -ENOMEM;
778
779	stack_base = PAGE_ALIGN(stack_top - stack_base);
780
781	stack_shift = vma->vm_start - stack_base;
782	mm->arg_start = bprm->p - stack_shift;
783	bprm->p = vma->vm_end - stack_shift;
784	#else
785	stack_top = arch_align_stack(sp: stack_top);
786	stack_top = PAGE_ALIGN(stack_top);
787
788	if (unlikely(stack_top < mmap_min_addr) \|\|
789	unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
790	return -ENOMEM;
791
792	stack_shift = vma->vm_end - stack_top;
793
794	bprm->p -= stack_shift;
795	mm->arg_start = bprm->p;
796	#endif
797
798	if (bprm->loader)
799	bprm->loader -= stack_shift;
800	bprm->exec -= stack_shift;
801
802	if (mmap_write_lock_killable(mm))
803	return -EINTR;
804
805	vm_flags = VM_STACK_FLAGS;
806
807	/*
808	* Adjust stack execute permissions; explicitly enable for
809	* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
810	* (arch default) otherwise.
811	*/
812	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
813	vm_flags \|= VM_EXEC;
814	else if (executable_stack == EXSTACK_DISABLE_X)
815	vm_flags &= ~VM_EXEC;
816	vm_flags \|= mm->def_flags;
817	vm_flags \|= VM_STACK_INCOMPLETE_SETUP;
818
819	vma_iter_init(vmi: &vmi, mm, addr: vma->vm_start);
820
821	tlb_gather_mmu(tlb: &tlb, mm);
822	ret = mprotect_fixup(vmi: &vmi, tlb: &tlb, vma, pprev: &prev, start: vma->vm_start, end: vma->vm_end,
823	newflags: vm_flags);
824	tlb_finish_mmu(tlb: &tlb);
825
826	if (ret)
827	goto out_unlock;
828	BUG_ON(prev != vma);
829
830	if (unlikely(vm_flags & VM_EXEC)) {
831	pr_warn_once("process '%pD4' started with executable stack\n",
832	bprm->file);
833	}
834
835	/ Move stack pages down in memory. /
836	if (stack_shift) {
837	ret = shift_arg_pages(vma, shift: stack_shift);
838	if (ret)
839	goto out_unlock;
840	}
841
842	/ mprotect_fixup is overkill to remove the temporary stack flags /
843	vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);
844
845	stack_expand = `131072UL`; / randomly 324k (or 264k) pages /
846	stack_size = vma->vm_end - vma->vm_start;
847	/*
848	* Align this down to a page boundary as expand_stack
849	* will align it up.
850	*/
851	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
852
853	stack_expand = min(rlim_stack, stack_size + stack_expand);
854
855	#ifdef CONFIG_STACK_GROWSUP
856	stack_base = vma->vm_start + stack_expand;
857	#else
858	stack_base = vma->vm_end - stack_expand;
859	#endif
860	current->mm->start_stack = bprm->p;
861	ret = expand_stack_locked(vma, address: stack_base);
862	if (ret)
863	ret = -EFAULT;
864
865	out_unlock:
866	mmap_write_unlock(mm);
867	return ret;
868	}
869	EXPORT_SYMBOL(setup_arg_pages);
870
871	#else
872
873	/*
874	* Transfer the program arguments and environment from the holding pages
875	* onto the stack. The provided stack pointer is adjusted accordingly.
876	*/
877	int transfer_args_to_stack(struct linux_binprm *bprm,
878	unsigned long *sp_location)
879	{
880	unsigned long index, stop, sp;
881	int ret = `0`;
882
883	stop = bprm->p >> PAGE_SHIFT;
884	sp = *sp_location;
885
886	for (index = MAX_ARG_PAGES - `1`; index >= stop; index--) {
887	unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : `0`;
888	char *src = kmap_local_page(bprm->page[index]) + offset;
889	sp -= PAGE_SIZE - offset;
890	if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != `0`)
891	ret = -EFAULT;
892	kunmap_local(src);
893	if (ret)
894	goto out;
895	}
896
897	*sp_location = sp;
898
899	out:
900	return ret;
901	}
902	EXPORT_SYMBOL(transfer_args_to_stack);
903
904	#endif /* CONFIG_MMU */
905
906	static struct file do_open_execat(int* fd, struct filename name, int* flags)
907	{
908	struct file *file;
909	int err;
910	struct open_flags open_exec_flags = {
911	.open_flag = O_LARGEFILE \| O_RDONLY \| __FMODE_EXEC,
912	.acc_mode = MAY_EXEC,
913	.intent = LOOKUP_OPEN,
914	.lookup_flags = LOOKUP_FOLLOW,
915	};
916
917	if ((flags & ~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH)) != `0`)
918	return ERR_PTR(error: -EINVAL);
919	if (flags & AT_SYMLINK_NOFOLLOW)
920	open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
921	if (flags & AT_EMPTY_PATH)
922	open_exec_flags.lookup_flags \|= LOOKUP_EMPTY;
923
924	file = do_filp_open(dfd: fd, pathname: name, op: &open_exec_flags);
925	if (IS_ERR(ptr: file))
926	goto out;
927
928	/*
929	* may_open() has already checked for this, so it should be
930	* impossible to trip now. But we need to be extra cautious
931	* and check again at the very end too.
932	*/
933	err = -EACCES;
934	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) \|\|
935	path_noexec(&file->f_path)))
936	goto exit;
937
938	err = deny_write_access(file);
939	if (err)
940	goto exit;
941
942	out:
943	return file;
944
945	exit:
946	fput(file);
947	return ERR_PTR(error: err);
948	}
949
950	struct file open_exec(const* char *name)
951	{
952	struct filename *filename = getname_kernel(name);
953	struct file *f = ERR_CAST(ptr: filename);
954
955	if (!IS_ERR(ptr: filename)) {
956	f = do_open_execat(AT_FDCWD, name: filename, flags: `0`);
957	putname(name: filename);
958	}
959	return f;
960	}
961	EXPORT_SYMBOL(open_exec);
962
963	#if defined(CONFIG_BINFMT_FLAT) \|\| defined(CONFIG_BINFMT_ELF_FDPIC)
964	ssize_t read_code(struct file file, unsigned* long addr, loff_t pos, size_t len)
965	{
966	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
967	if (res > `0`)
968	flush_icache_user_range(addr, addr + len);
969	return res;
970	}
971	EXPORT_SYMBOL(read_code);
972	#endif
973
974	/*
975	* Maps the mm_struct mm into the current task struct.
976	* On success, this function returns with exec_update_lock
977	* held for writing.
978	*/
979	static int exec_mmap(struct mm_struct *mm)
980	{
981	struct task_struct *tsk;
982	struct mm_struct old_mm, active_mm;
983	int ret;
984
985	/ Notify parent that we're no longer interested in the old VM /
986	tsk = current;
987	old_mm = current->mm;
988	exec_mm_release(tsk, old_mm);
989
990	ret = down_write_killable(sem: &tsk->signal->exec_update_lock);
991	if (ret)
992	return ret;
993
994	if (old_mm) {
995	/*
996	* If there is a pending fatal signal perhaps a signal
997	* whose default action is to create a coredump get
998	* out and die instead of going through with the exec.
999	*/
1000	ret = mmap_read_lock_killable(mm: old_mm);
1001	if (ret) {
1002	up_write(sem: &tsk->signal->exec_update_lock);
1003	return ret;
1004	}
1005	}
1006
1007	task_lock(p: tsk);
1008	membarrier_exec_mmap(mm);
1009
1010	local_irq_disable();
1011	active_mm = tsk->active_mm;
1012	tsk->active_mm = mm;
1013	tsk->mm = mm;
1014	mm_init_cid(mm);
1015	/*
1016	* This prevents preemption while active_mm is being loaded and
1017	* it and mm are being updated, which could cause problems for
1018	* lazy tlb mm refcounting when these are updated by context
1019	* switches. Not all architectures can handle irqs off over
1020	* activate_mm yet.
1021	*/
1022	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
1023	local_irq_enable();
1024	activate_mm(active_mm, mm);
1025	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
1026	local_irq_enable();
1027	lru_gen_add_mm(mm);
1028	task_unlock(p: tsk);
1029	lru_gen_use_mm(mm);
1030	if (old_mm) {
1031	mmap_read_unlock(mm: old_mm);
1032	BUG_ON(active_mm != old_mm);
1033	setmax_mm_hiwater_rss(maxrss: &tsk->signal->maxrss, mm: old_mm);
1034	mm_update_next_owner(mm: old_mm);
1035	mmput(old_mm);
1036	return `0`;
1037	}
1038	mmdrop_lazy_tlb(mm: active_mm);
1039	return `0`;
1040	}
1041
1042	static int de_thread(struct task_struct *tsk)
1043	{
1044	struct signal_struct *sig = tsk->signal;
1045	struct sighand_struct *oldsighand = tsk->sighand;
1046	spinlock_t *lock = &oldsighand->siglock;
1047
1048	if (thread_group_empty(p: tsk))
1049	goto no_thread_group;
1050
1051	/*
1052	* Kill all other threads in the thread group.
1053	*/
1054	spin_lock_irq(lock);
1055	if ((sig->flags & SIGNAL_GROUP_EXIT) \|\| sig->group_exec_task) {
1056	/*
1057	* Another group action in progress, just
1058	* return so that the signal is processed.
1059	*/
1060	spin_unlock_irq(lock);
1061	return -EAGAIN;
1062	}
1063
1064	sig->group_exec_task = tsk;
1065	sig->notify_count = zap_other_threads(p: tsk);
1066	if (!thread_group_leader(p: tsk))
1067	sig->notify_count--;
1068
1069	while (sig->notify_count) {
1070	__set_current_state(TASK_KILLABLE);
1071	spin_unlock_irq(lock);
1072	schedule();
1073	if (__fatal_signal_pending(p: tsk))
1074	goto killed;
1075	spin_lock_irq(lock);
1076	}
1077	spin_unlock_irq(lock);
1078
1079	/*
1080	* At this point all other threads have exited, all we have to
1081	* do is to wait for the thread group leader to become inactive,
1082	* and to assume its PID:
1083	*/
1084	if (!thread_group_leader(p: tsk)) {
1085	struct task_struct *leader = tsk->group_leader;
1086
1087	for (;;) {
1088	cgroup_threadgroup_change_begin(tsk);
1089	write_lock_irq(&tasklist_lock);
1090	/*
1091	* Do this under tasklist_lock to ensure that
1092	* exit_notify() can't miss ->group_exec_task
1093	*/
1094	sig->notify_count = -`1`;
1095	if (likely(leader->exit_state))
1096	break;
1097	__set_current_state(TASK_KILLABLE);
1098	write_unlock_irq(&tasklist_lock);
1099	cgroup_threadgroup_change_end(tsk);
1100	schedule();
1101	if (__fatal_signal_pending(p: tsk))
1102	goto killed;
1103	}
1104
1105	/*
1106	* The only record we have of the real-time age of a
1107	* process, regardless of execs it's done, is start_time.
1108	* All the past CPU time is accumulated in signal_struct
1109	* from sister threads now dead. But in this non-leader
1110	* exec, nothing survives from the original leader thread,
1111	* whose birth marks the true age of this process now.
1112	* When we take on its identity by switching to its PID, we
1113	* also take its birthdate (always earlier than our own).
1114	*/
1115	tsk->start_time = leader->start_time;
1116	tsk->start_boottime = leader->start_boottime;
1117
1118	BUG_ON(!same_thread_group(leader, tsk));
1119	/*
1120	* An exec() starts a new thread group with the
1121	* TGID of the previous thread group. Rehash the
1122	* two threads with a switched PID, and release
1123	* the former thread group leader:
1124	*/
1125
1126	/ Become a process group leader with the old leader's pid.*
1127	* The old leader becomes a thread of the this thread group.
1128	*/
1129	exchange_tids(task: tsk, old: leader);
1130	transfer_pid(old: leader, new: tsk, PIDTYPE_TGID);
1131	transfer_pid(old: leader, new: tsk, PIDTYPE_PGID);
1132	transfer_pid(old: leader, new: tsk, PIDTYPE_SID);
1133
1134	list_replace_rcu(old: &leader->tasks, new: &tsk->tasks);
1135	list_replace_init(old: &leader->sibling, new: &tsk->sibling);
1136
1137	tsk->group_leader = tsk;
1138	leader->group_leader = tsk;
1139
1140	tsk->exit_signal = SIGCHLD;
1141	leader->exit_signal = -`1`;
1142
1143	BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1144	leader->exit_state = EXIT_DEAD;
1145
1146	/*
1147	* We are going to release_task()->ptrace_unlink() silently,
1148	* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1149	* the tracer won't block again waiting for this thread.
1150	*/
1151	if (unlikely(leader->ptrace))
1152	__wake_up_parent(p: leader, parent: leader->parent);
1153	write_unlock_irq(&tasklist_lock);
1154	cgroup_threadgroup_change_end(tsk);
1155
1156	release_task(p: leader);
1157	}
1158
1159	sig->group_exec_task = NULL;
1160	sig->notify_count = `0`;
1161
1162	no_thread_group:
1163	/ we have changed execution domain /
1164	tsk->exit_signal = SIGCHLD;
1165
1166	BUG_ON(!thread_group_leader(tsk));
1167	return `0`;
1168
1169	killed:
1170	/ protects against exit_notify() and __exit_signal() /
1171	read_lock(&tasklist_lock);
1172	sig->group_exec_task = NULL;
1173	sig->notify_count = `0`;
1174	read_unlock(&tasklist_lock);
1175	return -EAGAIN;
1176	}
1177
1178
1179	/*
1180	* This function makes sure the current process has its own signal table,
1181	* so that flush_signal_handlers can later reset the handlers without
1182	* disturbing other processes. (Other processes might share the signal
1183	* table via the CLONE_SIGHAND option to clone().)
1184	*/
1185	static int unshare_sighand(struct task_struct *me)
1186	{
1187	struct sighand_struct *oldsighand = me->sighand;
1188
1189	if (refcount_read(r: &oldsighand->count) != `1`) {
1190	struct sighand_struct *newsighand;
1191	/*
1192	* This ->sighand is shared with the CLONE_SIGHAND
1193	* but not CLONE_THREAD task, switch to the new one.
1194	*/
1195	newsighand = kmem_cache_alloc(cachep: sighand_cachep, GFP_KERNEL);
1196	if (!newsighand)
1197	return -ENOMEM;
1198
1199	refcount_set(r: &newsighand->count, n: `1`);
1200
1201	write_lock_irq(&tasklist_lock);
1202	spin_lock(lock: &oldsighand->siglock);
1203	memcpy(newsighand->action, oldsighand->action,
1204	sizeof(newsighand->action));
1205	rcu_assign_pointer(me->sighand, newsighand);
1206	spin_unlock(lock: &oldsighand->siglock);
1207	write_unlock_irq(&tasklist_lock);
1208
1209	__cleanup_sighand(oldsighand);
1210	}
1211	return `0`;
1212	}
1213
1214	char __get_task_comm(char* buf, size_t buf_size, struct* task_struct *tsk)
1215	{
1216	task_lock(p: tsk);
1217	/ Always NUL terminated and zero-padded /
1218	strscpy_pad(dest: buf, src: tsk->comm, count: buf_size);
1219	task_unlock(p: tsk);
1220	return buf;
1221	}
1222	EXPORT_SYMBOL_GPL(__get_task_comm);
1223
1224	/*
1225	* These functions flushes out all traces of the currently running executable
1226	* so that a new one can be started
1227	*/
1228
1229	void __set_task_comm(struct task_struct tsk, const* char *buf, bool exec)
1230	{
1231	task_lock(p: tsk);
1232	trace_task_rename(task: tsk, comm: buf);
1233	strscpy_pad(dest: tsk->comm, src: buf, count: sizeof(tsk->comm));
1234	task_unlock(p: tsk);
1235	perf_event_comm(tsk, exec);
1236	}
1237
1238	/*
1239	* Calling this is the point of no return. None of the failures will be
1240	* seen by userspace since either the process is already taking a fatal
1241	* signal (via de_thread() or coredump), or will have SEGV raised
1242	* (after exec_mmap()) by search_binary_handler (see below).
1243	*/
1244	int begin_new_exec(struct linux_binprm * bprm)
1245	{
1246	struct task_struct *me = current;
1247	int retval;
1248
1249	/ Once we are committed compute the creds /
1250	retval = bprm_creds_from_file(bprm);
1251	if (retval)
1252	return retval;
1253
1254	/*
1255	* Ensure all future errors are fatal.
1256	*/
1257	bprm->point_of_no_return = true;
1258
1259	/*
1260	* Make this the only thread in the thread group.
1261	*/
1262	retval = de_thread(tsk: me);
1263	if (retval)
1264	goto out;
1265
1266	/*
1267	* Cancel any io_uring activity across execve
1268	*/
1269	io_uring_task_cancel();
1270
1271	/ Ensure the files table is not shared. /
1272	retval = unshare_files();
1273	if (retval)
1274	goto out;
1275
1276	/*
1277	* Must be called _before_ exec_mmap() as bprm->mm is
1278	* not visible until then. Doing it here also ensures
1279	* we don't race against replace_mm_exe_file().
1280	*/
1281	retval = set_mm_exe_file(mm: bprm->mm, new_exe_file: bprm->file);
1282	if (retval)
1283	goto out;
1284
1285	/ If the binary is not readable then enforce mm->dumpable=0 /
1286	would_dump(bprm, bprm->file);
1287	if (bprm->have_execfd)
1288	would_dump(bprm, bprm->executable);
1289
1290	/*
1291	* Release all of the old mmap stuff
1292	*/
1293	acct_arg_size(bprm, pages: `0`);
1294	retval = exec_mmap(mm: bprm->mm);
1295	if (retval)
1296	goto out;
1297
1298	bprm->mm = NULL;
1299
1300	retval = exec_task_namespaces();
1301	if (retval)
1302	goto out_unlock;
1303
1304	#ifdef CONFIG_POSIX_TIMERS
1305	spin_lock_irq(lock: &me->sighand->siglock);
1306	posix_cpu_timers_exit(task: me);
1307	spin_unlock_irq(lock: &me->sighand->siglock);
1308	exit_itimers(me);
1309	flush_itimer_signals();
1310	#endif
1311
1312	/*
1313	* Make the signal table private.
1314	*/
1315	retval = unshare_sighand(me);
1316	if (retval)
1317	goto out_unlock;
1318
1319	me->flags &= ~(PF_RANDOMIZE \| PF_FORKNOEXEC \|
1320	PF_NOFREEZE \| PF_NO_SETAFFINITY);
1321	flush_thread();
1322	me->personality &= ~bprm->per_clear;
1323
1324	clear_syscall_work_syscall_user_dispatch(me);
1325
1326	/*
1327	* We have to apply CLOEXEC before we change whether the process is
1328	* dumpable (in setup_new_exec) to avoid a race with a process in userspace
1329	* trying to access the should-be-closed file descriptors of a process
1330	* undergoing exec(2).
1331	*/
1332	do_close_on_exec(me->files);
1333
1334	if (bprm->secureexec) {
1335	/ Make sure parent cannot signal privileged process. /
1336	me->pdeath_signal = `0`;
1337
1338	/*
1339	* For secureexec, reset the stack limit to sane default to
1340	* avoid bad behavior from the prior rlimits. This has to
1341	* happen before arch_pick_mmap_layout(), which examines
1342	* RLIMIT_STACK, but after the point of no return to avoid
1343	* needing to clean up the change on failure.
1344	*/
1345	if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1346	bprm->rlim_stack.rlim_cur = _STK_LIM;
1347	}
1348
1349	me->sas_ss_sp = me->sas_ss_size = `0`;
1350
1351	/*
1352	* Figure out dumpability. Note that this checking only of current
1353	* is wrong, but userspace depends on it. This should be testing
1354	* bprm->secureexec instead.
1355	*/
1356	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP \|\|
1357	!(uid_eq(current_euid(), current_uid()) &&
1358	gid_eq(current_egid(), current_gid())))
1359	set_dumpable(current->mm, value: suid_dumpable);
1360	else
1361	set_dumpable(current->mm, SUID_DUMP_USER);
1362
1363	perf_event_exec();
1364	__set_task_comm(tsk: me, buf: kbasename(path: bprm->filename), exec: true);
1365
1366	/ An exec changes our domain. We are no longer part of the thread*
1367	group /*
1368	WRITE_ONCE(me->self_exec_id, me->self_exec_id + `1`);
1369	flush_signal_handlers(me, force_default: `0`);
1370
1371	retval = set_cred_ucounts(bprm->cred);
1372	if (retval < `0`)
1373	goto out_unlock;
1374
1375	/*
1376	* install the new credentials for this executable
1377	*/
1378	security_bprm_committing_creds(bprm);
1379
1380	commit_creds(bprm->cred);
1381	bprm->cred = NULL;
1382
1383	/*
1384	* Disable monitoring for regular users
1385	* when executing setuid binaries. Must
1386	* wait until new credentials are committed
1387	* by commit_creds() above
1388	*/
1389	if (get_dumpable(mm: me->mm) != SUID_DUMP_USER)
1390	perf_event_exit_task(child: me);
1391	/*
1392	* cred_guard_mutex must be held at least to this point to prevent
1393	* ptrace_attach() from altering our determination of the task's
1394	* credentials; any time after this it may be unlocked.
1395	*/
1396	security_bprm_committed_creds(bprm);
1397
1398	/ Pass the opened binary to the interpreter. /
1399	if (bprm->have_execfd) {
1400	retval = get_unused_fd_flags(flags: `0`);
1401	if (retval < `0`)
1402	goto out_unlock;
1403	fd_install(fd: retval, file: bprm->executable);
1404	bprm->executable = NULL;
1405	bprm->execfd = retval;
1406	}
1407	return `0`;
1408
1409	out_unlock:
1410	up_write(sem: &me->signal->exec_update_lock);
1411	out:
1412	return retval;
1413	}
1414	EXPORT_SYMBOL(begin_new_exec);
1415
1416	void would_dump(struct linux_binprm bprm, struct* file *file)
1417	{
1418	struct inode *inode = file_inode(f: file);
1419	struct mnt_idmap *idmap = file_mnt_idmap(file);
1420	if (inode_permission(idmap, inode, MAY_READ) < `0`) {
1421	struct user_namespace old, user_ns;
1422	bprm->interp_flags \|= BINPRM_FLAGS_ENFORCE_NONDUMP;
1423
1424	/ Ensure mm->user_ns contains the executable /
1425	user_ns = old = bprm->mm->user_ns;
1426	while ((user_ns != &init_user_ns) &&
1427	!privileged_wrt_inode_uidgid(ns: user_ns, idmap, inode))
1428	user_ns = user_ns->parent;
1429
1430	if (old != user_ns) {
1431	bprm->mm->user_ns = get_user_ns(ns: user_ns);
1432	put_user_ns(ns: old);
1433	}
1434	}
1435	}
1436	EXPORT_SYMBOL(would_dump);
1437
1438	void setup_new_exec(struct linux_binprm * bprm)
1439	{
1440	/ Setup things that can depend upon the personality /
1441	struct task_struct *me = current;
1442
1443	arch_pick_mmap_layout(mm: me->mm, rlim_stack: &bprm->rlim_stack);
1444
1445	arch_setup_new_exec();
1446
1447	/ Set the new mm task size. We have to do that late because it may*
1448	* depend on TIF_32BIT which is only updated in flush_thread() on
1449	* some architectures like powerpc
1450	*/
1451	me->mm->task_size = TASK_SIZE;
1452	up_write(sem: &me->signal->exec_update_lock);
1453	mutex_unlock(lock: &me->signal->cred_guard_mutex);
1454	}
1455	EXPORT_SYMBOL(setup_new_exec);
1456
1457	/ Runs immediately before start_thread() takes over. /
1458	void finalize_exec(struct linux_binprm *bprm)
1459	{
1460	/ Store any stack rlimit changes before starting thread. /
1461	task_lock(current->group_leader);
1462	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1463	task_unlock(current->group_leader);
1464	}
1465	EXPORT_SYMBOL(finalize_exec);
1466
1467	/*
1468	* Prepare credentials and lock ->cred_guard_mutex.
1469	* setup_new_exec() commits the new creds and drops the lock.
1470	* Or, if exec fails before, free_bprm() should release ->cred
1471	* and unlock.
1472	*/
1473	static int prepare_bprm_creds(struct linux_binprm *bprm)
1474	{
1475	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1476	return -ERESTARTNOINTR;
1477
1478	bprm->cred = prepare_exec_creds();
1479	if (likely(bprm->cred))
1480	return `0`;
1481
1482	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1483	return -ENOMEM;
1484	}
1485
1486	static void free_bprm(struct linux_binprm *bprm)
1487	{
1488	if (bprm->mm) {
1489	acct_arg_size(bprm, pages: `0`);
1490	mmput(bprm->mm);
1491	}
1492	free_arg_pages(bprm);
1493	if (bprm->cred) {
1494	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1495	abort_creds(bprm->cred);
1496	}
1497	if (bprm->file) {
1498	allow_write_access(file: bprm->file);
1499	fput(bprm->file);
1500	}
1501	if (bprm->executable)
1502	fput(bprm->executable);
1503	/ If a binfmt changed the interp, free it. /
1504	if (bprm->interp != bprm->filename)
1505	kfree(objp: bprm->interp);
1506	kfree(objp: bprm->fdpath);
1507	kfree(objp: bprm);
1508	}
1509
1510	static struct linux_binprm alloc_bprm(int* fd, struct filename *filename)
1511	{
1512	struct linux_binprm bprm = kzalloc(size: sizeof(bprm), GFP_KERNEL);
1513	int retval = -ENOMEM;
1514	if (!bprm)
1515	goto out;
1516
1517	if (fd == AT_FDCWD \|\| filename->name[`0`] == `'/'`) {
1518	bprm->filename = filename->name;
1519	} else {
1520	if (filename->name[`0`] == `'\0'`)
1521	bprm->fdpath = kasprintf(GFP_KERNEL, fmt: "/dev/fd/%d", fd);
1522	else
1523	bprm->fdpath = kasprintf(GFP_KERNEL, fmt: "/dev/fd/%d/%s",
1524	fd, filename->name);
1525	if (!bprm->fdpath)
1526	goto out_free;
1527
1528	bprm->filename = bprm->fdpath;
1529	}
1530	bprm->interp = bprm->filename;
1531
1532	retval = bprm_mm_init(bprm);
1533	if (retval)
1534	goto out_free;
1535	return bprm;
1536
1537	out_free:
1538	free_bprm(bprm);
1539	out:
1540	return ERR_PTR(error: retval);
1541	}
1542
1543	int bprm_change_interp(const char interp, struct* linux_binprm *bprm)
1544	{
1545	/ If a binfmt changed the interp, free it first. /
1546	if (bprm->interp != bprm->filename)
1547	kfree(objp: bprm->interp);
1548	bprm->interp = kstrdup(s: interp, GFP_KERNEL);
1549	if (!bprm->interp)
1550	return -ENOMEM;
1551	return `0`;
1552	}
1553	EXPORT_SYMBOL(bprm_change_interp);
1554
1555	/*
1556	* determine how safe it is to execute the proposed program
1557	* - the caller must hold ->cred_guard_mutex to protect against
1558	* PTRACE_ATTACH or seccomp thread-sync
1559	*/
1560	static void check_unsafe_exec(struct linux_binprm *bprm)
1561	{
1562	struct task_struct p = current, t;
1563	unsigned n_fs;
1564
1565	if (p->ptrace)
1566	bprm->unsafe \|= LSM_UNSAFE_PTRACE;
1567
1568	/*
1569	* This isn't strictly necessary, but it makes it harder for LSMs to
1570	* mess up.
1571	*/
1572	if (task_no_new_privs(current))
1573	bprm->unsafe \|= LSM_UNSAFE_NO_NEW_PRIVS;
1574
1575	/*
1576	* If another task is sharing our fs, we cannot safely
1577	* suid exec because the differently privileged task
1578	* will be able to manipulate the current directory, etc.
1579	* It would be nice to force an unshare instead...
1580	*/
1581	t = p;
1582	n_fs = `1`;
1583	spin_lock(lock: &p->fs->lock);
1584	rcu_read_lock();
1585	while_each_thread(p, t) {
1586	if (t->fs == p->fs)
1587	n_fs++;
1588	}
1589	rcu_read_unlock();
1590
1591	if (p->fs->users > n_fs)
1592	bprm->unsafe \|= LSM_UNSAFE_SHARE;
1593	else
1594	p->fs->in_exec = `1`;
1595	spin_unlock(lock: &p->fs->lock);
1596	}
1597
1598	static void bprm_fill_uid(struct linux_binprm bprm, struct* file *file)
1599	{
1600	/ Handle suid and sgid on files /
1601	struct mnt_idmap *idmap;
1602	struct inode *inode = file_inode(f: file);
1603	unsigned int mode;
1604	vfsuid_t vfsuid;
1605	vfsgid_t vfsgid;
1606
1607	if (!mnt_may_suid(mnt: file->f_path.mnt))
1608	return;
1609
1610	if (task_no_new_privs(current))
1611	return;
1612
1613	mode = READ_ONCE(inode->i_mode);
1614	if (!(mode & (S_ISUID\|S_ISGID)))
1615	return;
1616
1617	idmap = file_mnt_idmap(file);
1618
1619	/ Be careful if suid/sgid is set /
1620	inode_lock(inode);
1621
1622	/ reload atomically mode/uid/gid now that lock held /
1623	mode = inode->i_mode;
1624	vfsuid = i_uid_into_vfsuid(idmap, inode);
1625	vfsgid = i_gid_into_vfsgid(idmap, inode);
1626	inode_unlock(inode);
1627
1628	/ We ignore suid/sgid if there are no mappings for them in the ns /
1629	if (!vfsuid_has_mapping(userns: bprm->cred->user_ns, vfsuid) \|\|
1630	!vfsgid_has_mapping(userns: bprm->cred->user_ns, vfsgid))
1631	return;
1632
1633	if (mode & S_ISUID) {
1634	bprm->per_clear \|= PER_CLEAR_ON_SETID;
1635	bprm->cred->euid = vfsuid_into_kuid(vfsuid);
1636	}
1637
1638	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP)) {
1639	bprm->per_clear \|= PER_CLEAR_ON_SETID;
1640	bprm->cred->egid = vfsgid_into_kgid(vfsgid);
1641	}
1642	}
1643
1644	/*
1645	* Compute brpm->cred based upon the final binary.
1646	*/
1647	static int bprm_creds_from_file(struct linux_binprm *bprm)
1648	{
1649	/ Compute creds based on which file? /
1650	struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1651
1652	bprm_fill_uid(bprm, file);
1653	return security_bprm_creds_from_file(bprm, file);
1654	}
1655
1656	/*
1657	* Fill the binprm structure from the inode.
1658	* Read the first BINPRM_BUF_SIZE bytes
1659	*
1660	* This may be called multiple times for binary chains (scripts for example).
1661	*/
1662	static int prepare_binprm(struct linux_binprm *bprm)
1663	{
1664	loff_t pos = `0`;
1665
1666	memset(bprm->buf, `0`, BINPRM_BUF_SIZE);
1667	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1668	}
1669
1670	/*
1671	* Arguments are '\0' separated strings found at the location bprm->p
1672	* points to; chop off the first by relocating brpm->p to right after
1673	* the first '\0' encountered.
1674	*/
1675	int remove_arg_zero(struct linux_binprm *bprm)
1676	{
1677	int ret = `0`;
1678	unsigned long offset;
1679	char *kaddr;
1680	struct page *page;
1681
1682	if (!bprm->argc)
1683	return `0`;
1684
1685	do {
1686	offset = bprm->p & ~PAGE_MASK;
1687	page = get_arg_page(bprm, pos: bprm->p, write: `0`);
1688	if (!page) {
1689	ret = -EFAULT;
1690	goto out;
1691	}
1692	kaddr = kmap_local_page(page);
1693
1694	for (; offset < PAGE_SIZE && kaddr[offset];
1695	offset++, bprm->p++)
1696	;
1697
1698	kunmap_local(kaddr);
1699	put_arg_page(page);
1700	} while (offset == PAGE_SIZE);
1701
1702	bprm->p++;
1703	bprm->argc--;
1704	ret = `0`;
1705
1706	out:
1707	return ret;
1708	}
1709	EXPORT_SYMBOL(remove_arg_zero);
1710
1711	#define printable(c) (((c)=='\t') \|\| ((c)=='\n') \|\| (0x20<=(c) && (c)<=0x7e))
1712	/*
1713	* cycle the list of binary formats handler, until one recognizes the image
1714	*/
1715	static int search_binary_handler(struct linux_binprm *bprm)
1716	{
1717	bool need_retry = IS_ENABLED(CONFIG_MODULES);
1718	struct linux_binfmt *fmt;
1719	int retval;
1720
1721	retval = prepare_binprm(bprm);
1722	if (retval < `0`)
1723	return retval;
1724
1725	retval = security_bprm_check(bprm);
1726	if (retval)
1727	return retval;
1728
1729	retval = -ENOENT;
1730	retry:
1731	read_lock(&binfmt_lock);
1732	list_for_each_entry(fmt, &formats, lh) {
1733	if (!try_module_get(module: fmt->module))
1734	continue;
1735	read_unlock(&binfmt_lock);
1736
1737	retval = fmt->load_binary(bprm);
1738
1739	read_lock(&binfmt_lock);
1740	put_binfmt(fmt);
1741	if (bprm->point_of_no_return \|\| (retval != -ENOEXEC)) {
1742	read_unlock(&binfmt_lock);
1743	return retval;
1744	}
1745	}
1746	read_unlock(&binfmt_lock);
1747
1748	if (need_retry) {
1749	if (printable(bprm->buf[`0`]) && printable(bprm->buf[`1`]) &&
1750	printable(bprm->buf[`2`]) && printable(bprm->buf[`3`]))
1751	return retval;
1752	if (request_module("binfmt-%04x", (ushort )(bprm->buf + `2`)) < `0`)
1753	return retval;
1754	need_retry = false;
1755	goto retry;
1756	}
1757
1758	return retval;
1759	}
1760
1761	/ binfmt handlers will call back into begin_new_exec() on success. /
1762	static int exec_binprm(struct linux_binprm *bprm)
1763	{
1764	pid_t old_pid, old_vpid;
1765	int ret, depth;
1766
1767	/ Need to fetch pid before load_binary changes it /
1768	old_pid = current->pid;
1769	rcu_read_lock();
1770	old_vpid = task_pid_nr_ns(current, ns: task_active_pid_ns(current->parent));
1771	rcu_read_unlock();
1772
1773	/ This allows 4 levels of binfmt rewrites before failing hard. /
1774	for (depth = `0`;; depth++) {
1775	struct file *exec;
1776	if (depth > `5`)
1777	return -ELOOP;
1778
1779	ret = search_binary_handler(bprm);
1780	if (ret < `0`)
1781	return ret;
1782	if (!bprm->interpreter)
1783	break;
1784
1785	exec = bprm->file;
1786	bprm->file = bprm->interpreter;
1787	bprm->interpreter = NULL;
1788
1789	allow_write_access(file: exec);
1790	if (unlikely(bprm->have_execfd)) {
1791	if (bprm->executable) {
1792	fput(exec);
1793	return -ENOEXEC;
1794	}
1795	bprm->executable = exec;
1796	} else
1797	fput(exec);
1798	}
1799
1800	audit_bprm(bprm);
1801	trace_sched_process_exec(current, old_pid, bprm);
1802	ptrace_event(PTRACE_EVENT_EXEC, message: old_vpid);
1803	proc_exec_connector(current);
1804	return `0`;
1805	}
1806
1807	/*
1808	* sys_execve() executes a new program.
1809	*/
1810	static int bprm_execve(struct linux_binprm *bprm,
1811	int fd, struct filename filename, int* flags)
1812	{
1813	struct file *file;
1814	int retval;
1815
1816	retval = prepare_bprm_creds(bprm);
1817	if (retval)
1818	return retval;
1819
1820	/*
1821	* Check for unsafe execution states before exec_binprm(), which
1822	* will call back into begin_new_exec(), into bprm_creds_from_file(),
1823	* where setuid-ness is evaluated.
1824	*/
1825	check_unsafe_exec(bprm);
1826	current->in_execve = `1`;
1827	sched_mm_cid_before_execve(current);
1828
1829	file = do_open_execat(fd, name: filename, flags);
1830	retval = PTR_ERR(ptr: file);
1831	if (IS_ERR(ptr: file))
1832	goto out_unmark;
1833
1834	sched_exec();
1835
1836	bprm->file = file;
1837	/*
1838	* Record that a name derived from an O_CLOEXEC fd will be
1839	* inaccessible after exec. This allows the code in exec to
1840	* choose to fail when the executable is not mmaped into the
1841	* interpreter and an open file descriptor is not passed to
1842	* the interpreter. This makes for a better user experience
1843	* than having the interpreter start and then immediately fail
1844	* when it finds the executable is inaccessible.
1845	*/
1846	if (bprm->fdpath && get_close_on_exec(fd))
1847	bprm->interp_flags \|= BINPRM_FLAGS_PATH_INACCESSIBLE;
1848
1849	/ Set the unchanging part of bprm->cred /
1850	retval = security_bprm_creds_for_exec(bprm);
1851	if (retval)
1852	goto out;
1853
1854	retval = exec_binprm(bprm);
1855	if (retval < `0`)
1856	goto out;
1857
1858	sched_mm_cid_after_execve(current);
1859	/ execve succeeded /
1860	current->fs->in_exec = `0`;
1861	current->in_execve = `0`;
1862	rseq_execve(current);
1863	user_events_execve(current);
1864	acct_update_integrals(current);
1865	task_numa_free(current, final: false);
1866	return retval;
1867
1868	out:
1869	/*
1870	* If past the point of no return ensure the code never
1871	* returns to the userspace process. Use an existing fatal
1872	* signal if present otherwise terminate the process with
1873	* SIGSEGV.
1874	*/
1875	if (bprm->point_of_no_return && !fatal_signal_pending(current))
1876	force_fatal_sig(SIGSEGV);
1877
1878	out_unmark:
1879	sched_mm_cid_after_execve(current);
1880	current->fs->in_exec = `0`;
1881	current->in_execve = `0`;
1882
1883	return retval;
1884	}
1885
1886	static int do_execveat_common(int fd, struct filename *filename,
1887	struct user_arg_ptr argv,
1888	struct user_arg_ptr envp,
1889	int flags)
1890	{
1891	struct linux_binprm *bprm;
1892	int retval;
1893
1894	if (IS_ERR(ptr: filename))
1895	return PTR_ERR(ptr: filename);
1896
1897	/*
1898	* We move the actual failure in case of RLIMIT_NPROC excess from
1899	* set*uid() to execve() because too many poorly written programs
1900	* don't check setuid() return code. Here we additionally recheck
1901	* whether NPROC limit is still exceeded.
1902	*/
1903	if ((current->flags & PF_NPROC_EXCEEDED) &&
1904	is_rlimit_overlimit(current_ucounts(), type: UCOUNT_RLIMIT_NPROC, max: rlimit(RLIMIT_NPROC))) {
1905	retval = -EAGAIN;
1906	goto out_ret;
1907	}
1908
1909	/ We're below the limit (still or again), so we don't want to make*
1910	* further execve() calls fail. */
1911	current->flags &= ~PF_NPROC_EXCEEDED;
1912
1913	bprm = alloc_bprm(fd, filename);
1914	if (IS_ERR(ptr: bprm)) {
1915	retval = PTR_ERR(ptr: bprm);
1916	goto out_ret;
1917	}
1918
1919	retval = count(argv, MAX_ARG_STRINGS);
1920	if (retval == `0`)
1921	pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1922	current->comm, bprm->filename);
1923	if (retval < `0`)
1924	goto out_free;
1925	bprm->argc = retval;
1926
1927	retval = count(argv: envp, MAX_ARG_STRINGS);
1928	if (retval < `0`)
1929	goto out_free;
1930	bprm->envc = retval;
1931
1932	retval = bprm_stack_limits(bprm);
1933	if (retval < `0`)
1934	goto out_free;
1935
1936	retval = copy_string_kernel(bprm->filename, bprm);
1937	if (retval < `0`)
1938	goto out_free;
1939	bprm->exec = bprm->p;
1940
1941	retval = copy_strings(argc: bprm->envc, argv: envp, bprm);
1942	if (retval < `0`)
1943	goto out_free;
1944
1945	retval = copy_strings(argc: bprm->argc, argv, bprm);
1946	if (retval < `0`)
1947	goto out_free;
1948
1949	/*
1950	* When argv is empty, add an empty string ("") as argv[0] to
1951	* ensure confused userspace programs that start processing
1952	* from argv[1] won't end up walking envp. See also
1953	* bprm_stack_limits().
1954	*/
1955	if (bprm->argc == `0`) {
1956	retval = copy_string_kernel("", bprm);
1957	if (retval < `0`)
1958	goto out_free;
1959	bprm->argc = `1`;
1960	}
1961
1962	retval = bprm_execve(bprm, fd, filename, flags);
1963	out_free:
1964	free_bprm(bprm);
1965
1966	out_ret:
1967	putname(name: filename);
1968	return retval;
1969	}
1970
1971	int kernel_execve(const char *kernel_filename,
1972	const char *const argv, const* char *const *envp)
1973	{
1974	struct filename *filename;
1975	struct linux_binprm *bprm;
1976	int fd = AT_FDCWD;
1977	int retval;
1978
1979	/ It is non-sense for kernel threads to call execve /
1980	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
1981	return -EINVAL;
1982
1983	filename = getname_kernel(kernel_filename);
1984	if (IS_ERR(ptr: filename))
1985	return PTR_ERR(ptr: filename);
1986
1987	bprm = alloc_bprm(fd, filename);
1988	if (IS_ERR(ptr: bprm)) {
1989	retval = PTR_ERR(ptr: bprm);
1990	goto out_ret;
1991	}
1992
1993	retval = count_strings_kernel(argv);
1994	if (WARN_ON_ONCE(retval == `0`))
1995	retval = -EINVAL;
1996	if (retval < `0`)
1997	goto out_free;
1998	bprm->argc = retval;
1999
2000	retval = count_strings_kernel(argv: envp);
2001	if (retval < `0`)
2002	goto out_free;
2003	bprm->envc = retval;
2004
2005	retval = bprm_stack_limits(bprm);
2006	if (retval < `0`)
2007	goto out_free;
2008
2009	retval = copy_string_kernel(bprm->filename, bprm);
2010	if (retval < `0`)
2011	goto out_free;
2012	bprm->exec = bprm->p;
2013
2014	retval = copy_strings_kernel(argc: bprm->envc, argv: envp, bprm);
2015	if (retval < `0`)
2016	goto out_free;
2017
2018	retval = copy_strings_kernel(argc: bprm->argc, argv, bprm);
2019	if (retval < `0`)
2020	goto out_free;
2021
2022	retval = bprm_execve(bprm, fd, filename, flags: `0`);
2023	out_free:
2024	free_bprm(bprm);
2025	out_ret:
2026	putname(name: filename);
2027	return retval;
2028	}
2029
2030	static int do_execve(struct filename *filename,
2031	const char __user *const __user *__argv,
2032	const char __user *const __user *__envp)
2033	{
2034	struct user_arg_ptr argv = { .ptr.native = __argv };
2035	struct user_arg_ptr envp = { .ptr.native = __envp };
2036	return do_execveat_common(AT_FDCWD, filename, argv, envp, flags: `0`);
2037	}
2038
2039	static int do_execveat(int fd, struct filename *filename,
2040	const char __user *const __user *__argv,
2041	const char __user *const __user *__envp,
2042	int flags)
2043	{
2044	struct user_arg_ptr argv = { .ptr.native = __argv };
2045	struct user_arg_ptr envp = { .ptr.native = __envp };
2046
2047	return do_execveat_common(fd, filename, argv, envp, flags);
2048	}
2049
#ifdef CONFIG_COMPAT
/*
 * Compat execve(): same as do_execve(), but the argument vectors are
 * arrays of compat_uptr_t and are marked .is_compat.
 */
static int compat_do_execve(struct filename *filename,
	const compat_uptr_t __user *__argv,
	const compat_uptr_t __user *__envp)
{
	struct user_arg_ptr argv = {
		.is_compat = true,
		.ptr.compat = __argv,
	};
	struct user_arg_ptr envp = {
		.is_compat = true,
		.ptr.compat = __envp,
	};
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Compat execveat(): same as do_execveat(), but the argument vectors are
 * arrays of compat_uptr_t and are marked .is_compat.
 */
static int compat_do_execveat(int fd, struct filename *filename,
			      const compat_uptr_t __user *__argv,
			      const compat_uptr_t __user *__envp,
			      int flags)
{
	struct user_arg_ptr argv = {
		.is_compat = true,
		.ptr.compat = __argv,
	};
	struct user_arg_ptr envp = {
		.is_compat = true,
		.ptr.compat = __envp,
	};
	return do_execveat_common(fd, filename, argv, envp, flags);
}
#endif
2082
2083	void set_binfmt(struct linux_binfmt *new)
2084	{
2085	struct mm_struct *mm = current->mm;
2086
2087	if (mm->binfmt)
2088	module_put(module: mm->binfmt->module);
2089
2090	mm->binfmt = new;
2091	if (new)
2092	__module_get(module: new->module);
2093	}
2094	EXPORT_SYMBOL(set_binfmt);
2095
2096	/*
2097	* set_dumpable stores three-value SUID_DUMP_* into mm->flags.
2098	*/
2099	void set_dumpable(struct mm_struct mm, int* value)
2100	{
2101	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
2102	return;
2103
2104	set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
2105	}
2106
2107	SYSCALL_DEFINE3(execve,
2108	const char __user *, filename,
2109	const char __user *const __user *, argv,
2110	const char __user *const __user *, envp)
2111	{
2112	return do_execve(filename: getname(filename), argv: argv, envp: envp);
2113	}
2114
2115	SYSCALL_DEFINE5(execveat,
2116	int, fd, const char __user *, filename,
2117	const char __user *const __user *, argv,
2118	const char __user *const __user *, envp,
2119	int, flags)
2120	{
2121	return do_execveat(fd,
2122	filename: getname_uflags(filename, flags),
2123	argv: argv, envp: envp, flags);
2124	}
2125
#ifdef CONFIG_COMPAT
/* Compat execve(2): same as the native entry point, via compat_do_execve(). */
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
	const compat_uptr_t __user *, argv,
	const compat_uptr_t __user *, envp)
{
	return compat_do_execve(getname(filename), argv, envp);
}

/* Compat execveat(2): same as the native entry point, via compat_do_execveat(). */
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
		       const char __user *, filename,
		       const compat_uptr_t __user *, argv,
		       const compat_uptr_t __user *, envp,
		       int,  flags)
{
	return compat_do_execveat(fd,
				  getname_uflags(filename, flags),
				  argv, envp, flags);
}
#endif
2145
#ifdef CONFIG_SYSCTL

/*
 * sysctl handler for fs.suid_dumpable: run the generic min/max integer
 * handler and, if it succeeded, re-validate the coredump configuration.
 */
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!error)
		validate_coredump_safety();
	return error;
}

/* fs.suid_dumpable: integer bounded by SYSCTL_ZERO and SYSCTL_TWO. */
static struct ctl_table fs_exec_sysctls[] = {
	{
		.procname	= "suid_dumpable",
		.data		= &suid_dumpable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax_coredump,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{ }
};

/* Register the table above under "fs" at fs_initcall time. */
static int __init init_fs_exec_sysctls(void)
{
	register_sysctl_init("fs", fs_exec_sysctls);
	return 0;
}

fs_initcall(init_fs_exec_sysctls);
#endif /* CONFIG_SYSCTL */
2179

source code of linux/fs/exec.c