exit.c source code [linux/kernel/exit.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/kernel/exit.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	#include <linux/mm.h>
9	#include <linux/slab.h>
10	#include <linux/sched/autogroup.h>
11	#include <linux/sched/mm.h>
12	#include <linux/sched/stat.h>
13	#include <linux/sched/task.h>
14	#include <linux/sched/task_stack.h>
15	#include <linux/sched/cputime.h>
16	#include <linux/interrupt.h>
17	#include <linux/module.h>
18	#include <linux/capability.h>
19	#include <linux/completion.h>
20	#include <linux/personality.h>
21	#include <linux/tty.h>
22	#include <linux/iocontext.h>
23	#include <linux/key.h>
24	#include <linux/cpu.h>
25	#include <linux/acct.h>
26	#include <linux/tsacct_kern.h>
27	#include <linux/file.h>
28	#include <linux/fdtable.h>
29	#include <linux/freezer.h>
30	#include <linux/binfmts.h>
31	#include <linux/nsproxy.h>
32	#include <linux/pid_namespace.h>
33	#include <linux/ptrace.h>
34	#include <linux/profile.h>
35	#include <linux/mount.h>
36	#include <linux/proc_fs.h>
37	#include <linux/kthread.h>
38	#include <linux/mempolicy.h>
39	#include <linux/taskstats_kern.h>
40	#include <linux/delayacct.h>
41	#include <linux/cgroup.h>
42	#include <linux/syscalls.h>
43	#include <linux/signal.h>
44	#include <linux/posix-timers.h>
45	#include <linux/cn_proc.h>
46	#include <linux/mutex.h>
47	#include <linux/futex.h>
48	#include <linux/pipe_fs_i.h>
49	#include <linux/audit.h> /* for audit_free() */
50	#include <linux/resource.h>
51	#include <linux/task_io_accounting_ops.h>
52	#include <linux/blkdev.h>
53	#include <linux/task_work.h>
54	#include <linux/fs_struct.h>
55	#include <linux/init_task.h>
56	#include <linux/perf_event.h>
57	#include <trace/events/sched.h>
58	#include <linux/hw_breakpoint.h>
59	#include <linux/oom.h>
60	#include <linux/writeback.h>
61	#include <linux/shm.h>
62	#include <linux/kcov.h>
63	#include <linux/kmsan.h>
64	#include <linux/random.h>
65	#include <linux/rcuwait.h>
66	#include <linux/compat.h>
67	#include <linux/io_uring.h>
68	#include <linux/kprobes.h>
69	#include <linux/rethook.h>
70	#include <linux/sysfs.h>
71	#include <linux/user_events.h>
72
73	#include <linux/uaccess.h>
74	#include <asm/unistd.h>
75	#include <asm/mmu_context.h>
76
77	#include "exit.h"
78
/*
 * Upper bound on the number of oopses tolerated before make_task_dead()
 * panics; a value of zero disables the check there.
 *
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 */
static unsigned int oops_limit = 10000;
85
#ifdef CONFIG_SYSCTL
/*
 * sysctl table for this file: exposes oops_limit as "oops_limit" with
 * mode 0644, parsed and printed as an unsigned int by proc_douintvec.
 */
static struct ctl_table kern_exit_table[] = {
	{
		.procname       = "oops_limit",
		.data           = &oops_limit,
		.maxlen         = sizeof(oops_limit),
		.mode           = 0644,
		.proc_handler   = proc_douintvec,
	},
	{ }
};

/* Register kern_exit_table under the "kernel" sysctl directory at late init. */
static __init int kernel_exit_sysctls_init(void)
{
	register_sysctl_init("kernel", kern_exit_table);
	return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif
105
106	static atomic_t oops_count = ATOMIC_INIT(`0`);
107
108	#ifdef CONFIG_SYSFS
109	static ssize_t oops_count_show(struct kobject kobj, struct* kobj_attribute *attr,
110	char *page)
111	{
112	return sysfs_emit(buf: page, fmt: "%d\n", atomic_read(v: &oops_count));
113	}
114
115	static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
116
117	static __init int kernel_exit_sysfs_init(void)
118	{
119	sysfs_add_file_to_group(kobj: kernel_kobj, attr: &oops_count_attr.attr, NULL);
120	return `0`;
121	}
122	late_initcall(kernel_exit_sysfs_init);
123	#endif
124
125	static void __unhash_process(struct task_struct *p, bool group_dead)
126	{
127	nr_threads--;
128	detach_pid(task: p, PIDTYPE_PID);
129	if (group_dead) {
130	detach_pid(task: p, PIDTYPE_TGID);
131	detach_pid(task: p, PIDTYPE_PGID);
132	detach_pid(task: p, PIDTYPE_SID);
133
134	list_del_rcu(entry: &p->tasks);
135	list_del_init(entry: &p->sibling);
136	__this_cpu_dec(process_counts);
137	}
138	list_del_rcu(entry: &p->thread_node);
139	}
140
141	/*
142	* This function expects the tasklist_lock write-locked.
143	*/
144	static void __exit_signal(struct task_struct *tsk)
145	{
146	struct signal_struct *sig = tsk->signal;
147	bool group_dead = thread_group_leader(p: tsk);
148	struct sighand_struct *sighand;
149	struct tty_struct *tty;
150	u64 utime, stime;
151
152	sighand = rcu_dereference_check(tsk->sighand,
153	lockdep_tasklist_lock_is_held());
154	spin_lock(lock: &sighand->siglock);
155
156	#ifdef CONFIG_POSIX_TIMERS
157	posix_cpu_timers_exit(task: tsk);
158	if (group_dead)
159	posix_cpu_timers_exit_group(task: tsk);
160	#endif
161
162	if (group_dead) {
163	tty = sig->tty;
164	sig->tty = NULL;
165	} else {
166	/*
167	* If there is any task waiting for the group exit
168	* then notify it:
169	*/
170	if (sig->notify_count > `0` && !--sig->notify_count)
171	wake_up_process(tsk: sig->group_exec_task);
172
173	if (tsk == sig->curr_target)
174	sig->curr_target = next_thread(p: tsk);
175	}
176
177	add_device_randomness(buf: (const void*) &tsk->se.sum_exec_runtime,
178	len: sizeof(unsigned long long));
179
180	/*
181	* Accumulate here the counters for all threads as they die. We could
182	* skip the group leader because it is the last user of signal_struct,
183	* but we want to avoid the race with thread_group_cputime() which can
184	* see the empty ->thread_head list.
185	*/
186	task_cputime(t: tsk, utime: &utime, stime: &stime);
187	write_seqlock(sl: &sig->stats_lock);
188	sig->utime += utime;
189	sig->stime += stime;
190	sig->gtime += task_gtime(t: tsk);
191	sig->min_flt += tsk->min_flt;
192	sig->maj_flt += tsk->maj_flt;
193	sig->nvcsw += tsk->nvcsw;
194	sig->nivcsw += tsk->nivcsw;
195	sig->inblock += task_io_get_inblock(p: tsk);
196	sig->oublock += task_io_get_oublock(p: tsk);
197	task_io_accounting_add(dst: &sig->ioac, src: &tsk->ioac);
198	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
199	sig->nr_threads--;
200	__unhash_process(p: tsk, group_dead);
201	write_sequnlock(sl: &sig->stats_lock);
202
203	/*
204	* Do this under ->siglock, we can race with another thread
205	* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
206	*/
207	flush_sigqueue(queue: &tsk->pending);
208	tsk->sighand = NULL;
209	spin_unlock(lock: &sighand->siglock);
210
211	__cleanup_sighand(sighand);
212	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
213	if (group_dead) {
214	flush_sigqueue(queue: &sig->shared_pending);
215	tty_kref_put(tty);
216	}
217	}
218
219	static void delayed_put_task_struct(struct rcu_head *rhp)
220	{
221	struct task_struct tsk = container_of(rhp, struct* task_struct, rcu);
222
223	kprobe_flush_task(tsk);
224	rethook_flush_task(tk: tsk);
225	perf_event_delayed_put(task: tsk);
226	trace_sched_process_free(p: tsk);
227	put_task_struct(t: tsk);
228	}
229
230	void put_task_struct_rcu_user(struct task_struct *task)
231	{
232	if (refcount_dec_and_test(r: &task->rcu_users))
233	call_rcu(head: &task->rcu, func: delayed_put_task_struct);
234	}
235
236	void __weak release_thread(struct task_struct *dead_task)
237	{
238	}
239
240	void release_task(struct task_struct *p)
241	{
242	struct task_struct *leader;
243	struct pid *thread_pid;
244	int zap_leader;
245	repeat:
246	/ don't need to get the RCU readlock here - the process is dead and*
247	* can't be modifying its own credentials. But shut RCU-lockdep up */
248	rcu_read_lock();
249	dec_rlimit_ucounts(task_ucounts(p), type: UCOUNT_RLIMIT_NPROC, v: `1`);
250	rcu_read_unlock();
251
252	cgroup_release(p);
253
254	write_lock_irq(&tasklist_lock);
255	ptrace_release_task(task: p);
256	thread_pid = get_pid(pid: p->thread_pid);
257	__exit_signal(tsk: p);
258
259	/*
260	* If we are the last non-leader member of the thread
261	* group, and the leader is zombie, then notify the
262	* group leader's parent process. (if it wants notification.)
263	*/
264	zap_leader = `0`;
265	leader = p->group_leader;
266	if (leader != p && thread_group_empty(p: leader)
267	&& leader->exit_state == EXIT_ZOMBIE) {
268	/*
269	* If we were the last child thread and the leader has
270	* exited already, and the leader's parent ignores SIGCHLD,
271	* then we are the one who should release the leader.
272	*/
273	zap_leader = do_notify_parent(leader, leader->exit_signal);
274	if (zap_leader)
275	leader->exit_state = EXIT_DEAD;
276	}
277
278	write_unlock_irq(&tasklist_lock);
279	seccomp_filter_release(tsk: p);
280	proc_flush_pid(thread_pid);
281	put_pid(pid: thread_pid);
282	release_thread(dead_task: p);
283	put_task_struct_rcu_user(task: p);
284
285	p = leader;
286	if (unlikely(zap_leader))
287	goto repeat;
288	}
289
290	int rcuwait_wake_up(struct rcuwait *w)
291	{
292	int ret = `0`;
293	struct task_struct *task;
294
295	rcu_read_lock();
296
297	/*
298	* Order condition vs @task, such that everything prior to the load
299	* of @task is visible. This is the condition as to why the user called
300	* rcuwait_wake() in the first place. Pairs with set_current_state()
301	* barrier (A) in rcuwait_wait_event().
302	*
303	* WAIT WAKE
304	* [S] tsk = current [S] cond = true
305	* MB (A) MB (B)
306	* [L] cond [L] tsk
307	*/
308	smp_mb(); / (B) /
309
310	task = rcu_dereference(w->task);
311	if (task)
312	ret = wake_up_process(tsk: task);
313	rcu_read_unlock();
314
315	return ret;
316	}
317	EXPORT_SYMBOL_GPL(rcuwait_wake_up);
318
319	/*
320	* Determine if a process group is "orphaned", according to the POSIX
321	* definition in 2.2.2.52. Orphaned process groups are not to be affected
322	* by terminal-generated stop signals. Newly orphaned process groups are
323	* to receive a SIGHUP and a SIGCONT.
324	*
325	* "I ask you, have you ever known what it is to be an orphan?"
326	*/
327	static int will_become_orphaned_pgrp(struct pid *pgrp,
328	struct task_struct *ignored_task)
329	{
330	struct task_struct *p;
331
332	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
333	if ((p == ignored_task) \|\|
334	(p->exit_state && thread_group_empty(p)) \|\|
335	is_global_init(tsk: p->real_parent))
336	continue;
337
338	if (task_pgrp(task: p->real_parent) != pgrp &&
339	task_session(task: p->real_parent) == task_session(task: p))
340	return `0`;
341	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
342
343	return `1`;
344	}
345
346	int is_current_pgrp_orphaned(void)
347	{
348	int retval;
349
350	read_lock(&tasklist_lock);
351	retval = will_become_orphaned_pgrp(pgrp: task_pgrp(current), NULL);
352	read_unlock(&tasklist_lock);
353
354	return retval;
355	}
356
357	static bool has_stopped_jobs(struct pid *pgrp)
358	{
359	struct task_struct *p;
360
361	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
362	if (p->signal->flags & SIGNAL_STOP_STOPPED)
363	return true;
364	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
365
366	return false;
367	}
368
369	/*
370	* Check to see if any process groups have become orphaned as
371	* a result of our exiting, and if they have any stopped jobs,
372	* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
373	*/
374	static void
375	kill_orphaned_pgrp(struct task_struct tsk, struct* task_struct *parent)
376	{
377	struct pid *pgrp = task_pgrp(task: tsk);
378	struct task_struct *ignored_task = tsk;
379
380	if (!parent)
381	/ exit: our father is in a different pgrp than*
382	* we are and we were the only connection outside.
383	*/
384	parent = tsk->real_parent;
385	else
386	/ reparent: our child is in a different pgrp than*
387	* we are, and it was the only connection outside.
388	*/
389	ignored_task = NULL;
390
391	if (task_pgrp(task: parent) != pgrp &&
392	task_session(task: parent) == task_session(task: tsk) &&
393	will_become_orphaned_pgrp(pgrp, ignored_task) &&
394	has_stopped_jobs(pgrp)) {
395	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
396	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
397	}
398	}
399
400	static void coredump_task_exit(struct task_struct *tsk)
401	{
402	struct core_state *core_state;
403
404	/*
405	* Serialize with any possible pending coredump.
406	* We must hold siglock around checking core_state
407	* and setting PF_POSTCOREDUMP. The core-inducing thread
408	* will increment ->nr_threads for each thread in the
409	* group without PF_POSTCOREDUMP set.
410	*/
411	spin_lock_irq(lock: &tsk->sighand->siglock);
412	tsk->flags \|= PF_POSTCOREDUMP;
413	core_state = tsk->signal->core_state;
414	spin_unlock_irq(lock: &tsk->sighand->siglock);
415
416	/ The vhost_worker does not particpate in coredumps /
417	if (core_state &&
418	((tsk->flags & (PF_IO_WORKER \| PF_USER_WORKER)) != PF_USER_WORKER)) {
419	struct core_thread self;
420
421	self.task = current;
422	if (self.task->flags & PF_SIGNALED)
423	self.next = xchg(&core_state->dumper.next, &self);
424	else
425	self.task = NULL;
426	/*
427	* Implies mb(), the result of xchg() must be visible
428	* to core_state->dumper.
429	*/
430	if (atomic_dec_and_test(v: &core_state->nr_threads))
431	complete(&core_state->startup);
432
433	for (;;) {
434	set_current_state(TASK_UNINTERRUPTIBLE\|TASK_FREEZABLE);
435	if (!self.task) / see coredump_finish() /
436	break;
437	schedule();
438	}
439	__set_current_state(TASK_RUNNING);
440	}
441	}
442
#ifdef CONFIG_MEMCG
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 *
 * Candidates are searched in order: current's children, current's siblings,
 * then every thread of every non-kthread process. If none uses @mm, or
 * mm_users shows no other user, mm->owner is set to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *c, *g, *p = current;

retry:
	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != p)
		return;
	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		WRITE_ONCE(mm->owner, NULL);
		return;
	}

	read_lock(&tasklist_lock);
	/*
	 * Search in the children
	 */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search in the siblings
	 */
	list_for_each_entry(c, &p->real_parent->children, sibling) {
		if (c->mm == mm)
			goto assign_new_owner;
	}

	/*
	 * Search through everything else, we should not get here often.
	 */
	for_each_process(g) {
		if (g->flags & PF_KTHREAD)
			continue;
		for_each_thread(g, c) {
			if (c->mm == mm)
				goto assign_new_owner;
			/* A thread with a different mm ends the scan of this group. */
			if (c->mm)
				break;
		}
	}
	read_unlock(&tasklist_lock);
	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	WRITE_ONCE(mm->owner, NULL);
	return;

assign_new_owner:
	BUG_ON(c == p);
	get_task_struct(c);
	/*
	 * The task_lock protects c->mm from changing.
	 * We always want mm->owner->mm == mm
	 */
	task_lock(c);
	/*
	 * Delay read_unlock() till we have the task_lock()
	 * to ensure that c does not slip away underneath us
	 */
	read_unlock(&tasklist_lock);
	if (c->mm != mm) {
		task_unlock(c);
		put_task_struct(c);
		goto retry;
	}
	WRITE_ONCE(mm->owner, c);
	lru_gen_migrate_mm(mm);
	task_unlock(c);
	put_task_struct(c);
}
#endif /* CONFIG_MEMCG */
531
532	/*
533	* Turn us into a lazy TLB process if we
534	* aren't already..
535	*/
536	static void exit_mm(void)
537	{
538	struct mm_struct *mm = current->mm;
539
540	exit_mm_release(current, mm);
541	if (!mm)
542	return;
543	mmap_read_lock(mm);
544	mmgrab_lazy_tlb(mm);
545	BUG_ON(mm != current->active_mm);
546	/ more a memory barrier than a real lock /
547	task_lock(current);
548	/*
549	* When a thread stops operating on an address space, the loop
550	* in membarrier_private_expedited() may not observe that
551	* tsk->mm, and the loop in membarrier_global_expedited() may
552	* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
553	* rq->membarrier_state, so those would not issue an IPI.
554	* Membarrier requires a memory barrier after accessing
555	* user-space memory, before clearing tsk->mm or the
556	* rq->membarrier_state.
557	*/
558	smp_mb__after_spinlock();
559	local_irq_disable();
560	current->mm = NULL;
561	membarrier_update_current_mm(NULL);
562	enter_lazy_tlb(mm, current);
563	local_irq_enable();
564	task_unlock(current);
565	mmap_read_unlock(mm);
566	mm_update_next_owner(mm);
567	mmput(mm);
568	if (test_thread_flag(TIF_MEMDIE))
569	exit_oom_victim();
570	}
571
572	static struct task_struct find_alive_thread(struct* task_struct *p)
573	{
574	struct task_struct *t;
575
576	for_each_thread(p, t) {
577	if (!(t->flags & PF_EXITING))
578	return t;
579	}
580	return NULL;
581	}
582
583	static struct task_struct find_child_reaper(struct* task_struct *father,
584	struct list_head *dead)
585	__releases(&tasklist_lock)
586	__acquires(&tasklist_lock)
587	{
588	struct pid_namespace *pid_ns = task_active_pid_ns(tsk: father);
589	struct task_struct *reaper = pid_ns->child_reaper;
590	struct task_struct p, n;
591
592	if (likely(reaper != father))
593	return reaper;
594
595	reaper = find_alive_thread(p: father);
596	if (reaper) {
597	pid_ns->child_reaper = reaper;
598	return reaper;
599	}
600
601	write_unlock_irq(&tasklist_lock);
602
603	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
604	list_del_init(entry: &p->ptrace_entry);
605	release_task(p);
606	}
607
608	zap_pid_ns_processes(pid_ns);
609	write_lock_irq(&tasklist_lock);
610
611	return father;
612	}
613
614	/*
615	* When we die, we re-parent all our children, and try to:
616	* 1. give them to another thread in our thread group, if such a member exists
617	* 2. give it to the first ancestor process which prctl'd itself as a
618	* child_subreaper for its children (like a service manager)
619	* 3. give it to the init process (PID 1) in our pid namespace
620	*/
621	static struct task_struct find_new_reaper(struct* task_struct *father,
622	struct task_struct *child_reaper)
623	{
624	struct task_struct thread, reaper;
625
626	thread = find_alive_thread(p: father);
627	if (thread)
628	return thread;
629
630	if (father->signal->has_child_subreaper) {
631	unsigned int ns_level = task_pid(task: father)->level;
632	/*
633	* Find the first ->is_child_subreaper ancestor in our pid_ns.
634	* We can't check reaper != child_reaper to ensure we do not
635	* cross the namespaces, the exiting parent could be injected
636	* by setns() + fork().
637	* We check pid->level, this is slightly more efficient than
638	* task_active_pid_ns(reaper) != task_active_pid_ns(father).
639	*/
640	for (reaper = father->real_parent;
641	task_pid(task: reaper)->level == ns_level;
642	reaper = reaper->real_parent) {
643	if (reaper == &init_task)
644	break;
645	if (!reaper->signal->is_child_subreaper)
646	continue;
647	thread = find_alive_thread(p: reaper);
648	if (thread)
649	return thread;
650	}
651	}
652
653	return child_reaper;
654	}
655
656	/*
657	* Any that need to be release_task'd are put on the @dead list.
658	*/
659	static void reparent_leader(struct task_struct father, struct* task_struct *p,
660	struct list_head *dead)
661	{
662	if (unlikely(p->exit_state == EXIT_DEAD))
663	return;
664
665	/ We don't want people slaying init. /
666	p->exit_signal = SIGCHLD;
667
668	/ If it has exited notify the new parent about this child's death. /
669	if (!p->ptrace &&
670	p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
671	if (do_notify_parent(p, p->exit_signal)) {
672	p->exit_state = EXIT_DEAD;
673	list_add(new: &p->ptrace_entry, head: dead);
674	}
675	}
676
677	kill_orphaned_pgrp(tsk: p, parent: father);
678	}
679
680	/*
681	* This does two things:
682	*
683	* A. Make init inherit all the child processes
684	* B. Check to see if any process groups have become orphaned
685	* as a result of our exiting, and if they have any stopped
686	* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
687	*/
688	static void forget_original_parent(struct task_struct *father,
689	struct list_head *dead)
690	{
691	struct task_struct p, t, *reaper;
692
693	if (unlikely(!list_empty(&father->ptraced)))
694	exit_ptrace(tracer: father, dead);
695
696	/ Can drop and reacquire tasklist_lock /
697	reaper = find_child_reaper(father, dead);
698	if (list_empty(head: &father->children))
699	return;
700
701	reaper = find_new_reaper(father, child_reaper: reaper);
702	list_for_each_entry(p, &father->children, sibling) {
703	for_each_thread(p, t) {
704	RCU_INIT_POINTER(t->real_parent, reaper);
705	BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
706	if (likely(!t->ptrace))
707	t->parent = t->real_parent;
708	if (t->pdeath_signal)
709	group_send_sig_info(sig: t->pdeath_signal,
710	SEND_SIG_NOINFO, p: t,
711	type: PIDTYPE_TGID);
712	}
713	/*
714	* If this is a threaded reparent there is no need to
715	* notify anyone anything has happened.
716	*/
717	if (!same_thread_group(p1: reaper, p2: father))
718	reparent_leader(father, p, dead);
719	}
720	list_splice_tail_init(list: &father->children, head: &reaper->children);
721	}
722
723	/*
724	* Send signals to all our closest relatives so that they know
725	* to properly mourn us..
726	*/
727	static void exit_notify(struct task_struct tsk, int* group_dead)
728	{
729	bool autoreap;
730	struct task_struct p, n;
731	LIST_HEAD(dead);
732
733	write_lock_irq(&tasklist_lock);
734	forget_original_parent(father: tsk, dead: &dead);
735
736	if (group_dead)
737	kill_orphaned_pgrp(tsk: tsk->group_leader, NULL);
738
739	tsk->exit_state = EXIT_ZOMBIE;
740	if (unlikely(tsk->ptrace)) {
741	int sig = thread_group_leader(p: tsk) &&
742	thread_group_empty(p: tsk) &&
743	!ptrace_reparented(child: tsk) ?
744	tsk->exit_signal : SIGCHLD;
745	autoreap = do_notify_parent(tsk, sig);
746	} else if (thread_group_leader(p: tsk)) {
747	autoreap = thread_group_empty(p: tsk) &&
748	do_notify_parent(tsk, tsk->exit_signal);
749	} else {
750	autoreap = true;
751	}
752
753	if (autoreap) {
754	tsk->exit_state = EXIT_DEAD;
755	list_add(new: &tsk->ptrace_entry, head: &dead);
756	}
757
758	/ mt-exec, de_thread() is waiting for group leader /
759	if (unlikely(tsk->signal->notify_count < `0`))
760	wake_up_process(tsk: tsk->signal->group_exec_task);
761	write_unlock_irq(&tasklist_lock);
762
763	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
764	list_del_init(entry: &p->ptrace_entry);
765	release_task(p);
766	}
767	}
768
#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Log a message whenever the exiting task left fewer unused stack bytes than
 * any task seen so far, and remember the new low-water mark.
 */
static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	static int lowest_to_date = THREAD_SIZE;
	unsigned long free;

	free = stack_not_used(current);

	/* Unlocked fast path; re-checked under the lock below. */
	if (free >= lowest_to_date)
		return;

	spin_lock(&low_water_lock);
	if (free < lowest_to_date) {
		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
			current->comm, task_pid_nr(current), free);
		lowest_to_date = free;
	}
	spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif
792
793	static void synchronize_group_exit(struct task_struct tsk, long* code)
794	{
795	struct sighand_struct *sighand = tsk->sighand;
796	struct signal_struct *signal = tsk->signal;
797
798	spin_lock_irq(lock: &sighand->siglock);
799	signal->quick_threads--;
800	if ((signal->quick_threads == `0`) &&
801	!(signal->flags & SIGNAL_GROUP_EXIT)) {
802	signal->flags = SIGNAL_GROUP_EXIT;
803	signal->group_exit_code = code;
804	signal->group_stop_count = `0`;
805	}
806	spin_unlock_irq(lock: &sighand->siglock);
807	}
808
809	void __noreturn do_exit(long code)
810	{
811	struct task_struct *tsk = current;
812	int group_dead;
813
814	WARN_ON(irqs_disabled());
815
816	synchronize_group_exit(tsk, code);
817
818	WARN_ON(tsk->plug);
819
820	kcov_task_exit(t: tsk);
821	kmsan_task_exit(task: tsk);
822
823	coredump_task_exit(tsk);
824	ptrace_event(PTRACE_EVENT_EXIT, message: code);
825	user_events_exit(t: tsk);
826
827	validate_creds_for_do_exit(tsk);
828
829	io_uring_files_cancel();
830	exit_signals(tsk); / sets PF_EXITING /
831
832	acct_update_integrals(tsk);
833	group_dead = atomic_dec_and_test(v: &tsk->signal->live);
834	if (group_dead) {
835	/*
836	* If the last thread of global init has exited, panic
837	* immediately to get a useable coredump.
838	*/
839	if (unlikely(is_global_init(tsk)))
840	panic(fmt: "Attempted to kill init! exitcode=0x%08x\n",
841	tsk->signal->group_exit_code ?: (int)code);
842
843	#ifdef CONFIG_POSIX_TIMERS
844	hrtimer_cancel(timer: &tsk->signal->real_timer);
845	exit_itimers(tsk);
846	#endif
847	if (tsk->mm)
848	setmax_mm_hiwater_rss(maxrss: &tsk->signal->maxrss, mm: tsk->mm);
849	}
850	acct_collect(exitcode: code, group_dead);
851	if (group_dead)
852	tty_audit_exit();
853	audit_free(task: tsk);
854
855	tsk->exit_code = code;
856	taskstats_exit(tsk, group_dead);
857
858	exit_mm();
859
860	if (group_dead)
861	acct_process();
862	trace_sched_process_exit(p: tsk);
863
864	exit_sem(tsk);
865	exit_shm(task: tsk);
866	exit_files(tsk);
867	exit_fs(tsk);
868	if (group_dead)
869	disassociate_ctty(priv: `1`);
870	exit_task_namespaces(tsk);
871	exit_task_work(task: tsk);
872	exit_thread(tsk);
873
874	/*
875	* Flush inherited counters to the parent - before the parent
876	* gets woken up by child-exit notifications.
877	*
878	* because of cgroup mode, must be called before cgroup_exit()
879	*/
880	perf_event_exit_task(child: tsk);
881
882	sched_autogroup_exit_task(p: tsk);
883	cgroup_exit(p: tsk);
884
885	/*
886	* FIXME: do that only when needed, using sched_exit tracepoint
887	*/
888	flush_ptrace_hw_breakpoint(tsk);
889
890	exit_tasks_rcu_start();
891	exit_notify(tsk, group_dead);
892	proc_exit_connector(task: tsk);
893	mpol_put_task_policy(tsk);
894	#ifdef CONFIG_FUTEX
895	if (unlikely(current->pi_state_cache))
896	kfree(current->pi_state_cache);
897	#endif
898	/*
899	* Make sure we are holding no locks:
900	*/
901	debug_check_no_locks_held();
902
903	if (tsk->io_context)
904	exit_io_context(task: tsk);
905
906	if (tsk->splice_pipe)
907	free_pipe_info(tsk->splice_pipe);
908
909	if (tsk->task_frag.page)
910	put_page(page: tsk->task_frag.page);
911
912	validate_creds_for_do_exit(tsk);
913	exit_task_stack_account(tsk);
914
915	check_stack_usage();
916	preempt_disable();
917	if (tsk->nr_dirtied)
918	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
919	exit_rcu();
920	exit_tasks_rcu_finish();
921
922	lockdep_free_task(task: tsk);
923	do_task_dead();
924	}
925
926	void __noreturn make_task_dead(int signr)
927	{
928	/*
929	* Take the task off the cpu after something catastrophic has
930	* happened.
931	*
932	* We can get here from a kernel oops, sometimes with preemption off.
933	* Start by checking for critical errors.
934	* Then fix up important state like USER_DS and preemption.
935	* Then do everything else.
936	*/
937	struct task_struct *tsk = current;
938	unsigned int limit;
939
940	if (unlikely(in_interrupt()))
941	panic(fmt: "Aiee, killing interrupt handler!");
942	if (unlikely(!tsk->pid))
943	panic(fmt: "Attempted to kill the idle task!");
944
945	if (unlikely(irqs_disabled())) {
946	pr_info("note: %s[%d] exited with irqs disabled\n",
947	current->comm, task_pid_nr(current));
948	local_irq_enable();
949	}
950	if (unlikely(in_atomic())) {
951	pr_info("note: %s[%d] exited with preempt_count %d\n",
952	current->comm, task_pid_nr(current),
953	preempt_count());
954	preempt_count_set(PREEMPT_ENABLED);
955	}
956
957	/*
958	* Every time the system oopses, if the oops happens while a reference
959	* to an object was held, the reference leaks.
960	* If the oops doesn't also leak memory, repeated oopsing can cause
961	* reference counters to wrap around (if they're not using refcount_t).
962	* This means that repeated oopsing can make unexploitable-looking bugs
963	* exploitable through repeated oopsing.
964	* To make sure this can't happen, place an upper bound on how often the
965	* kernel may oops without panic().
966	*/
967	limit = READ_ONCE(oops_limit);
968	if (atomic_inc_return(v: &oops_count) >= limit && limit)
969	panic(fmt: "Oopsed too often (kernel.oops_limit is %d)", limit);
970
971	/*
972	* We're taking recursive faults here in make_task_dead. Safest is to just
973	* leave this task alone and wait for reboot.
974	*/
975	if (unlikely(tsk->flags & PF_EXITING)) {
976	pr_alert("Fixing recursive fault but reboot is needed!\n");
977	futex_exit_recursive(tsk);
978	tsk->exit_state = EXIT_DEAD;
979	refcount_inc(r: &tsk->rcu_users);
980	do_task_dead();
981	}
982
983	do_exit(code: signr);
984	}
985
986	SYSCALL_DEFINE1(exit, int, error_code)
987	{
988	do_exit(code: (error_code&`0xff`)<<`8`);
989	}
990
991	/*
992	* Take down every thread in the group. This is called by fatal signals
993	* as well as by sys_exit_group (below).
994	*/
995	void __noreturn
996	do_group_exit(int exit_code)
997	{
998	struct signal_struct *sig = current->signal;
999
1000	if (sig->flags & SIGNAL_GROUP_EXIT)
1001	exit_code = sig->group_exit_code;
1002	else if (sig->group_exec_task)
1003	exit_code = `0`;
1004	else {
1005	struct sighand_struct *const sighand = current->sighand;
1006
1007	spin_lock_irq(lock: &sighand->siglock);
1008	if (sig->flags & SIGNAL_GROUP_EXIT)
1009	/ Another thread got here before we took the lock. /
1010	exit_code = sig->group_exit_code;
1011	else if (sig->group_exec_task)
1012	exit_code = `0`;
1013	else {
1014	sig->group_exit_code = exit_code;
1015	sig->flags = SIGNAL_GROUP_EXIT;
1016	zap_other_threads(current);
1017	}
1018	spin_unlock_irq(lock: &sighand->siglock);
1019	}
1020
1021	do_exit(code: exit_code);
1022	/ NOTREACHED /
1023	}
1024
1025	/*
1026	* this kills every thread in the thread group. Note that any externally
1027	* wait4()-ing process will get the correct exit code - even if this
1028	* thread is not the thread group leader.
1029	*/
1030	SYSCALL_DEFINE1(exit_group, int, error_code)
1031	{
1032	do_group_exit(exit_code: (error_code & `0xff`) << `8`);
1033	/ NOTREACHED /
1034	return `0`;
1035	}
1036
1037	static int eligible_pid(struct wait_opts wo, struct* task_struct *p)
1038	{
1039	return wo->wo_type == PIDTYPE_MAX \|\|
1040	task_pid_type(task: p, type: wo->wo_type) == wo->wo_pid;
1041	}
1042
1043	static int
1044	eligible_child(struct wait_opts wo, bool ptrace, struct* task_struct *p)
1045	{
1046	if (!eligible_pid(wo, p))
1047	return `0`;
1048
1049	/*
1050	* Wait for all children (clone and not) if __WALL is set or
1051	* if it is traced by us.
1052	*/
1053	if (ptrace \|\| (wo->wo_flags & __WALL))
1054	return `1`;
1055
1056	/*
1057	* Otherwise, wait for clone children only if __WCLONE is set;
1058	* otherwise, wait for non-clone children only.
1059	*
1060	* Note: a "clone" child here is one that reports to its parent
1061	* using a signal other than SIGCHLD, or a non-leader thread which
1062	* we can only see if it is traced by us.
1063	*/
1064	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1065	return `0`;
1066
1067	return `1`;
1068	}
1069
1070	/*
1071	* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1072	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1073	* the lock and this task is uninteresting. If we return nonzero, we have
1074	* released the lock and the system call should return.
1075	*/
1076	static int wait_task_zombie(struct wait_opts wo, struct* task_struct *p)
1077	{
1078	int state, status;
1079	pid_t pid = task_pid_vnr(tsk: p);
1080	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1081	struct waitid_info *infop;
1082
1083	if (!likely(wo->wo_flags & WEXITED))
1084	return `0`;
1085
1086	if (unlikely(wo->wo_flags & WNOWAIT)) {
1087	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1088	? p->signal->group_exit_code : p->exit_code;
1089	get_task_struct(t: p);
1090	read_unlock(&tasklist_lock);
1091	sched_annotate_sleep();
1092	if (wo->wo_rusage)
1093	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1094	put_task_struct(t: p);
1095	goto out_info;
1096	}
1097	/*
1098	* Move the task's state to DEAD/TRACE, only one thread can do this.
1099	*/
1100	state = (ptrace_reparented(child: p) && thread_group_leader(p)) ?
1101	EXIT_TRACE : EXIT_DEAD;
1102	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1103	return `0`;
1104	/*
1105	* We own this thread, nobody else can reap it.
1106	*/
1107	read_unlock(&tasklist_lock);
1108	sched_annotate_sleep();
1109
1110	/*
1111	* Check thread_group_leader() to exclude the traced sub-threads.
1112	*/
1113	if (state == EXIT_DEAD && thread_group_leader(p)) {
1114	struct signal_struct *sig = p->signal;
1115	struct signal_struct *psig = current->signal;
1116	unsigned long maxrss;
1117	u64 tgutime, tgstime;
1118
1119	/*
1120	* The resource counters for the group leader are in its
1121	* own task_struct. Those for dead threads in the group
1122	* are in its signal_struct, as are those for the child
1123	* processes it has previously reaped. All these
1124	* accumulate in the parent's signal_struct c* fields.
1125	*
1126	* We don't bother to take a lock here to protect these
1127	* p->signal fields because the whole thread group is dead
1128	* and nobody can change them.
1129	*
1130	* psig->stats_lock also protects us from our sub-threads
1131	* which can reap other children at the same time. Until
1132	* we change k_getrusage()-like users to rely on this lock
1133	* we have to take ->siglock as well.
1134	*
1135	* We use thread_group_cputime_adjusted() to get times for
1136	* the thread group, which consolidates times for all threads
1137	* in the group including the group leader.
1138	*/
1139	thread_group_cputime_adjusted(p, ut: &tgutime, st: &tgstime);
1140	spin_lock_irq(lock: &current->sighand->siglock);
1141	write_seqlock(sl: &psig->stats_lock);
1142	psig->cutime += tgutime + sig->cutime;
1143	psig->cstime += tgstime + sig->cstime;
1144	psig->cgtime += task_gtime(t: p) + sig->gtime + sig->cgtime;
1145	psig->cmin_flt +=
1146	p->min_flt + sig->min_flt + sig->cmin_flt;
1147	psig->cmaj_flt +=
1148	p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1149	psig->cnvcsw +=
1150	p->nvcsw + sig->nvcsw + sig->cnvcsw;
1151	psig->cnivcsw +=
1152	p->nivcsw + sig->nivcsw + sig->cnivcsw;
1153	psig->cinblock +=
1154	task_io_get_inblock(p) +
1155	sig->inblock + sig->cinblock;
1156	psig->coublock +=
1157	task_io_get_oublock(p) +
1158	sig->oublock + sig->coublock;
1159	maxrss = max(sig->maxrss, sig->cmaxrss);
1160	if (psig->cmaxrss < maxrss)
1161	psig->cmaxrss = maxrss;
1162	task_io_accounting_add(dst: &psig->ioac, src: &p->ioac);
1163	task_io_accounting_add(dst: &psig->ioac, src: &sig->ioac);
1164	write_sequnlock(sl: &psig->stats_lock);
1165	spin_unlock_irq(lock: &current->sighand->siglock);
1166	}
1167
1168	if (wo->wo_rusage)
1169	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1170	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1171	? p->signal->group_exit_code : p->exit_code;
1172	wo->wo_stat = status;
1173
1174	if (state == EXIT_TRACE) {
1175	write_lock_irq(&tasklist_lock);
1176	/ We dropped tasklist, ptracer could die and untrace /
1177	ptrace_unlink(child: p);
1178
1179	/ If parent wants a zombie, don't release it now /
1180	state = EXIT_ZOMBIE;
1181	if (do_notify_parent(p, p->exit_signal))
1182	state = EXIT_DEAD;
1183	p->exit_state = state;
1184	write_unlock_irq(&tasklist_lock);
1185	}
1186	if (state == EXIT_DEAD)
1187	release_task(p);
1188
1189	out_info:
1190	infop = wo->wo_info;
1191	if (infop) {
1192	if ((status & `0x7f`) == `0`) {
1193	infop->cause = CLD_EXITED;
1194	infop->status = status >> `8`;
1195	} else {
1196	infop->cause = (status & `0x80`) ? CLD_DUMPED : CLD_KILLED;
1197	infop->status = status & `0x7f`;
1198	}
1199	infop->pid = pid;
1200	infop->uid = uid;
1201	}
1202
1203	return pid;
1204	}
1205
1206	static int task_stopped_code(struct* task_struct *p, bool ptrace)
1207	{
1208	if (ptrace) {
1209	if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1210	return &p->exit_code;
1211	} else {
1212	if (p->signal->flags & SIGNAL_STOP_STOPPED)
1213	return &p->signal->group_exit_code;
1214	}
1215	return NULL;
1216	}
1217
1218	/**
1219	* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1220	* @wo: wait options
1221	* @ptrace: is the wait for ptrace
1222	* @p: task to wait for
1223	*
1224	* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1225	*
1226	* CONTEXT:
1227	* read_lock(&tasklist_lock), which is released if return value is
1228	* non-zero. Also, grabs and releases @p->sighand->siglock.
1229	*
1230	* RETURNS:
1231	* 0 if wait condition didn't exist and search for other wait conditions
1232	* should continue. Non-zero return, -errno on failure and @p's pid on
1233	* success, implies that tasklist_lock is released and wait condition
1234	* search should terminate.
1235	*/
1236	static int wait_task_stopped(struct wait_opts *wo,
1237	int ptrace, struct task_struct *p)
1238	{
1239	struct waitid_info *infop;
1240	int exit_code, *p_code, why;
1241	uid_t uid = `0`; / unneeded, required by compiler /
1242	pid_t pid;
1243
1244	/*
1245	* Traditionally we see ptrace'd stopped tasks regardless of options.
1246	*/
1247	if (!ptrace && !(wo->wo_flags & WUNTRACED))
1248	return `0`;
1249
1250	if (!task_stopped_code(p, ptrace))
1251	return `0`;
1252
1253	exit_code = `0`;
1254	spin_lock_irq(lock: &p->sighand->siglock);
1255
1256	p_code = task_stopped_code(p, ptrace);
1257	if (unlikely(!p_code))
1258	goto unlock_sig;
1259
1260	exit_code = *p_code;
1261	if (!exit_code)
1262	goto unlock_sig;
1263
1264	if (!unlikely(wo->wo_flags & WNOWAIT))
1265	*p_code = `0`;
1266
1267	uid = from_kuid_munged(current_user_ns(), task_uid(p));
1268	unlock_sig:
1269	spin_unlock_irq(lock: &p->sighand->siglock);
1270	if (!exit_code)
1271	return `0`;
1272
1273	/*
1274	* Now we are pretty sure this task is interesting.
1275	* Make sure it doesn't get reaped out from under us while we
1276	* give up the lock and then examine it below. We don't want to
1277	* keep holding onto the tasklist_lock while we call getrusage and
1278	* possibly take page faults for user memory.
1279	*/
1280	get_task_struct(t: p);
1281	pid = task_pid_vnr(tsk: p);
1282	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1283	read_unlock(&tasklist_lock);
1284	sched_annotate_sleep();
1285	if (wo->wo_rusage)
1286	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1287	put_task_struct(t: p);
1288
1289	if (likely(!(wo->wo_flags & WNOWAIT)))
1290	wo->wo_stat = (exit_code << `8`) \| `0x7f`;
1291
1292	infop = wo->wo_info;
1293	if (infop) {
1294	infop->cause = why;
1295	infop->status = exit_code;
1296	infop->pid = pid;
1297	infop->uid = uid;
1298	}
1299	return pid;
1300	}
1301
1302	/*
1303	* Handle do_wait work for one task in a live, non-stopped state.
1304	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1305	* the lock and this task is uninteresting. If we return nonzero, we have
1306	* released the lock and the system call should return.
1307	*/
1308	static int wait_task_continued(struct wait_opts wo, struct* task_struct *p)
1309	{
1310	struct waitid_info *infop;
1311	pid_t pid;
1312	uid_t uid;
1313
1314	if (!unlikely(wo->wo_flags & WCONTINUED))
1315	return `0`;
1316
1317	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1318	return `0`;
1319
1320	spin_lock_irq(lock: &p->sighand->siglock);
1321	/ Re-check with the lock held. /
1322	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1323	spin_unlock_irq(lock: &p->sighand->siglock);
1324	return `0`;
1325	}
1326	if (!unlikely(wo->wo_flags & WNOWAIT))
1327	p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1328	uid = from_kuid_munged(current_user_ns(), task_uid(p));
1329	spin_unlock_irq(lock: &p->sighand->siglock);
1330
1331	pid = task_pid_vnr(tsk: p);
1332	get_task_struct(t: p);
1333	read_unlock(&tasklist_lock);
1334	sched_annotate_sleep();
1335	if (wo->wo_rusage)
1336	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1337	put_task_struct(t: p);
1338
1339	infop = wo->wo_info;
1340	if (!infop) {
1341	wo->wo_stat = `0xffff`;
1342	} else {
1343	infop->cause = CLD_CONTINUED;
1344	infop->pid = pid;
1345	infop->uid = uid;
1346	infop->status = SIGCONT;
1347	}
1348	return pid;
1349	}
1350
1351	/*
1352	* Consider @p for a wait by @parent.
1353	*
1354	* -ECHILD should be in ->notask_error before the first call.
1355	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
1356	* Returns zero if the search for a child should continue;
1357	* then ->notask_error is 0 if @p is an eligible child,
1358	* or still -ECHILD.
1359	*/
1360	static int wait_consider_task(struct wait_opts wo, int* ptrace,
1361	struct task_struct *p)
1362	{
1363	/*
1364	* We can race with wait_task_zombie() from another thread.
1365	* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1366	* can't confuse the checks below.
1367	*/
1368	int exit_state = READ_ONCE(p->exit_state);
1369	int ret;
1370
1371	if (unlikely(exit_state == EXIT_DEAD))
1372	return `0`;
1373
1374	ret = eligible_child(wo, ptrace, p);
1375	if (!ret)
1376	return ret;
1377
1378	if (unlikely(exit_state == EXIT_TRACE)) {
1379	/*
1380	* ptrace == 0 means we are the natural parent. In this case
1381	* we should clear notask_error, debugger will notify us.
1382	*/
1383	if (likely(!ptrace))
1384	wo->notask_error = `0`;
1385	return `0`;
1386	}
1387
1388	if (likely(!ptrace) && unlikely(p->ptrace)) {
1389	/*
1390	* If it is traced by its real parent's group, just pretend
1391	* the caller is ptrace_do_wait() and reap this child if it
1392	* is zombie.
1393	*
1394	* This also hides group stop state from real parent; otherwise
1395	* a single stop can be reported twice as group and ptrace stop.
1396	* If a ptracer wants to distinguish these two events for its
1397	* own children it should create a separate process which takes
1398	* the role of real parent.
1399	*/
1400	if (!ptrace_reparented(child: p))
1401	ptrace = `1`;
1402	}
1403
1404	/ slay zombie? /
1405	if (exit_state == EXIT_ZOMBIE) {
1406	/ we don't reap group leaders with subthreads /
1407	if (!delay_group_leader(p)) {
1408	/*
1409	* A zombie ptracee is only visible to its ptracer.
1410	* Notification and reaping will be cascaded to the
1411	* real parent when the ptracer detaches.
1412	*/
1413	if (unlikely(ptrace) \|\| likely(!p->ptrace))
1414	return wait_task_zombie(wo, p);
1415	}
1416
1417	/*
1418	* Allow access to stopped/continued state via zombie by
1419	* falling through. Clearing of notask_error is complex.
1420	*
1421	* When !@ptrace:
1422	*
1423	* If WEXITED is set, notask_error should naturally be
1424	* cleared. If not, subset of WSTOPPED\|WCONTINUED is set,
1425	* so, if there are live subthreads, there are events to
1426	* wait for. If all subthreads are dead, it's still safe
1427	* to clear - this function will be called again in finite
1428	* amount time once all the subthreads are released and
1429	* will then return without clearing.
1430	*
1431	* When @ptrace:
1432	*
1433	* Stopped state is per-task and thus can't change once the
1434	* target task dies. Only continued and exited can happen.
1435	* Clear notask_error if WCONTINUED \| WEXITED.
1436	*/
1437	if (likely(!ptrace) \|\| (wo->wo_flags & (WCONTINUED \| WEXITED)))
1438	wo->notask_error = `0`;
1439	} else {
1440	/*
1441	* @p is alive and it's gonna stop, continue or exit, so
1442	* there always is something to wait for.
1443	*/
1444	wo->notask_error = `0`;
1445	}
1446
1447	/*
1448	* Wait for stopped. Depending on @ptrace, different stopped state
1449	* is used and the two don't interact with each other.
1450	*/
1451	ret = wait_task_stopped(wo, ptrace, p);
1452	if (ret)
1453	return ret;
1454
1455	/*
1456	* Wait for continued. There's only one continued state and the
1457	* ptracer can consume it which can confuse the real parent. Don't
1458	* use WCONTINUED from ptracer. You don't need or want it.
1459	*/
1460	return wait_task_continued(wo, p);
1461	}
1462
1463	/*
1464	* Do the work of do_wait() for one thread in the group, @tsk.
1465	*
1466	* -ECHILD should be in ->notask_error before the first call.
1467	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
1468	* Returns zero if the search for a child should continue; then
1469	* ->notask_error is 0 if there were any eligible children,
1470	* or still -ECHILD.
1471	*/
1472	static int do_wait_thread(struct wait_opts wo, struct* task_struct *tsk)
1473	{
1474	struct task_struct *p;
1475
1476	list_for_each_entry(p, &tsk->children, sibling) {
1477	int ret = wait_consider_task(wo, ptrace: `0`, p);
1478
1479	if (ret)
1480	return ret;
1481	}
1482
1483	return `0`;
1484	}
1485
1486	static int ptrace_do_wait(struct wait_opts wo, struct* task_struct *tsk)
1487	{
1488	struct task_struct *p;
1489
1490	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1491	int ret = wait_consider_task(wo, ptrace: `1`, p);
1492
1493	if (ret)
1494	return ret;
1495	}
1496
1497	return `0`;
1498	}
1499
1500	bool pid_child_should_wake(struct wait_opts wo, struct* task_struct *p)
1501	{
1502	if (!eligible_pid(wo, p))
1503	return false;
1504
1505	if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
1506	return false;
1507
1508	return true;
1509	}
1510
1511	static int child_wait_callback(wait_queue_entry_t wait, unsigned* mode,
1512	int sync, void *key)
1513	{
1514	struct wait_opts wo = container_of(wait, struct* wait_opts,
1515	child_wait);
1516	struct task_struct *p = key;
1517
1518	if (pid_child_should_wake(wo, p))
1519	return default_wake_function(wq_entry: wait, mode, flags: sync, key);
1520
1521	return `0`;
1522	}
1523
1524	void __wake_up_parent(struct task_struct p, struct* task_struct *parent)
1525	{
1526	__wake_up_sync_key(wq_head: &parent->signal->wait_chldexit,
1527	TASK_INTERRUPTIBLE, key: p);
1528	}
1529
1530	static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
1531	struct task_struct *target)
1532	{
1533	struct task_struct *parent =
1534	!ptrace ? target->real_parent : target->parent;
1535
1536	return current == parent \|\| (!(wo->wo_flags & __WNOTHREAD) &&
1537	same_thread_group(current, p2: parent));
1538	}
1539
1540	/*
1541	* Optimization for waiting on PIDTYPE_PID. No need to iterate through child
1542	* and tracee lists to find the target task.
1543	*/
1544	static int do_wait_pid(struct wait_opts *wo)
1545	{
1546	bool ptrace;
1547	struct task_struct *target;
1548	int retval;
1549
1550	ptrace = false;
1551	target = pid_task(pid: wo->wo_pid, PIDTYPE_TGID);
1552	if (target && is_effectively_child(wo, ptrace, target)) {
1553	retval = wait_consider_task(wo, ptrace, p: target);
1554	if (retval)
1555	return retval;
1556	}
1557
1558	ptrace = true;
1559	target = pid_task(pid: wo->wo_pid, PIDTYPE_PID);
1560	if (target && target->ptrace &&
1561	is_effectively_child(wo, ptrace, target)) {
1562	retval = wait_consider_task(wo, ptrace, p: target);
1563	if (retval)
1564	return retval;
1565	}
1566
1567	return `0`;
1568	}
1569
1570	long __do_wait(struct wait_opts *wo)
1571	{
1572	long retval;
1573
1574	/*
1575	* If there is nothing that can match our criteria, just get out.
1576	* We will clear ->notask_error to zero if we see any child that
1577	* might later match our criteria, even if we are not able to reap
1578	* it yet.
1579	*/
1580	wo->notask_error = -ECHILD;
1581	if ((wo->wo_type < PIDTYPE_MAX) &&
1582	(!wo->wo_pid \|\| !pid_has_task(pid: wo->wo_pid, type: wo->wo_type)))
1583	goto notask;
1584
1585	read_lock(&tasklist_lock);
1586
1587	if (wo->wo_type == PIDTYPE_PID) {
1588	retval = do_wait_pid(wo);
1589	if (retval)
1590	return retval;
1591	} else {
1592	struct task_struct *tsk = current;
1593
1594	do {
1595	retval = do_wait_thread(wo, tsk);
1596	if (retval)
1597	return retval;
1598
1599	retval = ptrace_do_wait(wo, tsk);
1600	if (retval)
1601	return retval;
1602
1603	if (wo->wo_flags & __WNOTHREAD)
1604	break;
1605	} while_each_thread(current, tsk);
1606	}
1607	read_unlock(&tasklist_lock);
1608
1609	notask:
1610	retval = wo->notask_error;
1611	if (!retval && !(wo->wo_flags & WNOHANG))
1612	return -ERESTARTSYS;
1613
1614	return retval;
1615	}
1616
1617	static long do_wait(struct wait_opts *wo)
1618	{
1619	int retval;
1620
1621	trace_sched_process_wait(pid: wo->wo_pid);
1622
1623	init_waitqueue_func_entry(wq_entry: &wo->child_wait, func: child_wait_callback);
1624	wo->child_wait.private = current;
1625	add_wait_queue(wq_head: &current->signal->wait_chldexit, wq_entry: &wo->child_wait);
1626
1627	do {
1628	set_current_state(TASK_INTERRUPTIBLE);
1629	retval = __do_wait(wo);
1630	if (retval != -ERESTARTSYS)
1631	break;
1632	if (signal_pending(current))
1633	break;
1634	schedule();
1635	} while (`1`);
1636
1637	__set_current_state(TASK_RUNNING);
1638	remove_wait_queue(wq_head: &current->signal->wait_chldexit, wq_entry: &wo->child_wait);
1639	return retval;
1640	}
1641
1642	int kernel_waitid_prepare(struct wait_opts wo, int* which, pid_t upid,
1643	struct waitid_info infop, int* options,
1644	struct rusage *ru)
1645	{
1646	unsigned int f_flags = `0`;
1647	struct pid *pid = NULL;
1648	enum pid_type type;
1649
1650	if (options & ~(WNOHANG\|WNOWAIT\|WEXITED\|WSTOPPED\|WCONTINUED\|
1651	__WNOTHREAD\|__WCLONE\|__WALL))
1652	return -EINVAL;
1653	if (!(options & (WEXITED\|WSTOPPED\|WCONTINUED)))
1654	return -EINVAL;
1655
1656	switch (which) {
1657	case P_ALL:
1658	type = PIDTYPE_MAX;
1659	break;
1660	case P_PID:
1661	type = PIDTYPE_PID;
1662	if (upid <= `0`)
1663	return -EINVAL;
1664
1665	pid = find_get_pid(nr: upid);
1666	break;
1667	case P_PGID:
1668	type = PIDTYPE_PGID;
1669	if (upid < `0`)
1670	return -EINVAL;
1671
1672	if (upid)
1673	pid = find_get_pid(nr: upid);
1674	else
1675	pid = get_task_pid(current, type: PIDTYPE_PGID);
1676	break;
1677	case P_PIDFD:
1678	type = PIDTYPE_PID;
1679	if (upid < `0`)
1680	return -EINVAL;
1681
1682	pid = pidfd_get_pid(fd: upid, flags: &f_flags);
1683	if (IS_ERR(ptr: pid))
1684	return PTR_ERR(ptr: pid);
1685
1686	break;
1687	default:
1688	return -EINVAL;
1689	}
1690
1691	wo->wo_type = type;
1692	wo->wo_pid = pid;
1693	wo->wo_flags = options;
1694	wo->wo_info = infop;
1695	wo->wo_rusage = ru;
1696	if (f_flags & O_NONBLOCK)
1697	wo->wo_flags \|= WNOHANG;
1698
1699	return `0`;
1700	}
1701
1702	static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1703	int options, struct rusage *ru)
1704	{
1705	struct wait_opts wo;
1706	long ret;
1707
1708	ret = kernel_waitid_prepare(wo: &wo, which, upid, infop, options, ru);
1709	if (ret)
1710	return ret;
1711
1712	ret = do_wait(wo: &wo);
1713	if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
1714	ret = -EAGAIN;
1715
1716	put_pid(pid: wo.wo_pid);
1717	return ret;
1718	}
1719
1720	SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1721	infop, int, options, struct rusage __user *, ru)
1722	{
1723	struct rusage r;
1724	struct waitid_info info = {.status = `0`};
1725	long err = kernel_waitid(which, upid, infop: &info, options, ru: ru ? &r : NULL);
1726	int signo = `0`;
1727
1728	if (err > `0`) {
1729	signo = SIGCHLD;
1730	err = `0`;
1731	if (ru && copy_to_user(to: ru, from: &r, n: sizeof(struct rusage)))
1732	return -EFAULT;
1733	}
1734	if (!infop)
1735	return err;
1736
1737	if (!user_write_access_begin(infop, sizeof(*infop)))
1738	return -EFAULT;
1739
1740	unsafe_put_user(signo, &infop->si_signo, Efault);
1741	unsafe_put_user(`0`, &infop->si_errno, Efault);
1742	unsafe_put_user(info.cause, &infop->si_code, Efault);
1743	unsafe_put_user(info.pid, &infop->si_pid, Efault);
1744	unsafe_put_user(info.uid, &infop->si_uid, Efault);
1745	unsafe_put_user(info.status, &infop->si_status, Efault);
1746	user_write_access_end();
1747	return err;
1748	Efault:
1749	user_write_access_end();
1750	return -EFAULT;
1751	}
1752
1753	long kernel_wait4(pid_t upid, int __user stat_addr, int* options,
1754	struct rusage *ru)
1755	{
1756	struct wait_opts wo;
1757	struct pid *pid = NULL;
1758	enum pid_type type;
1759	long ret;
1760
1761	if (options & ~(WNOHANG\|WUNTRACED\|WCONTINUED\|
1762	__WNOTHREAD\|__WCLONE\|__WALL))
1763	return -EINVAL;
1764
1765	/ -INT_MIN is not defined /
1766	if (upid == INT_MIN)
1767	return -ESRCH;
1768
1769	if (upid == -`1`)
1770	type = PIDTYPE_MAX;
1771	else if (upid < `0`) {
1772	type = PIDTYPE_PGID;
1773	pid = find_get_pid(nr: -upid);
1774	} else if (upid == `0`) {
1775	type = PIDTYPE_PGID;
1776	pid = get_task_pid(current, type: PIDTYPE_PGID);
1777	} else / upid > 0 / {
1778	type = PIDTYPE_PID;
1779	pid = find_get_pid(nr: upid);
1780	}
1781
1782	wo.wo_type = type;
1783	wo.wo_pid = pid;
1784	wo.wo_flags = options \| WEXITED;
1785	wo.wo_info = NULL;
1786	wo.wo_stat = `0`;
1787	wo.wo_rusage = ru;
1788	ret = do_wait(wo: &wo);
1789	put_pid(pid);
1790	if (ret > `0` && stat_addr && put_user(wo.wo_stat, stat_addr))
1791	ret = -EFAULT;
1792
1793	return ret;
1794	}
1795
1796	int kernel_wait(pid_t pid, int *stat)
1797	{
1798	struct wait_opts wo = {
1799	.wo_type = PIDTYPE_PID,
1800	.wo_pid = find_get_pid(nr: pid),
1801	.wo_flags = WEXITED,
1802	};
1803	int ret;
1804
1805	ret = do_wait(wo: &wo);
1806	if (ret > `0` && wo.wo_stat)
1807	*stat = wo.wo_stat;
1808	put_pid(pid: wo.wo_pid);
1809	return ret;
1810	}
1811
1812	SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1813	int, options, struct rusage __user *, ru)
1814	{
1815	struct rusage r;
1816	long err = kernel_wait4(upid, stat_addr, options, ru: ru ? &r : NULL);
1817
1818	if (err > `0`) {
1819	if (ru && copy_to_user(to: ru, from: &r, n: sizeof(struct rusage)))
1820	return -EFAULT;
1821	}
1822	return err;
1823	}
1824
1825	#ifdef __ARCH_WANT_SYS_WAITPID
1826
1827	/*
1828	* sys_waitpid() remains for compatibility. waitpid() should be
1829	* implemented by calling sys_wait4() from libc.a.
1830	*/
1831	SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user , stat_addr, int*, options)
1832	{
1833	return kernel_wait4(upid: pid, stat_addr, options, NULL);
1834	}
1835
1836	#endif
1837
1838	#ifdef CONFIG_COMPAT
1839	COMPAT_SYSCALL_DEFINE4(wait4,
1840	compat_pid_t, pid,
1841	compat_uint_t __user *, stat_addr,
1842	int, options,
1843	struct compat_rusage __user *, ru)
1844	{
1845	struct rusage r;
1846	long err = kernel_wait4(upid: pid, stat_addr, options, ru: ru ? &r : NULL);
1847	if (err > `0`) {
1848	if (ru && put_compat_rusage(&r, ru))
1849	return -EFAULT;
1850	}
1851	return err;
1852	}
1853
1854	COMPAT_SYSCALL_DEFINE5(waitid,
1855	int, which, compat_pid_t, pid,
1856	struct compat_siginfo __user , infop, int*, options,
1857	struct compat_rusage __user *, uru)
1858	{
1859	struct rusage ru;
1860	struct waitid_info info = {.status = `0`};
1861	long err = kernel_waitid(which, upid: pid, infop: &info, options, ru: uru ? &ru : NULL);
1862	int signo = `0`;
1863	if (err > `0`) {
1864	signo = SIGCHLD;
1865	err = `0`;
1866	if (uru) {
1867	/ kernel_waitid() overwrites everything in ru /
1868	if (COMPAT_USE_64BIT_TIME)
1869	err = copy_to_user(to: uru, from: &ru, n: sizeof(ru));
1870	else
1871	err = put_compat_rusage(&ru, uru);
1872	if (err)
1873	return -EFAULT;
1874	}
1875	}
1876
1877	if (!infop)
1878	return err;
1879
1880	if (!user_write_access_begin(infop, sizeof(*infop)))
1881	return -EFAULT;
1882
1883	unsafe_put_user(signo, &infop->si_signo, Efault);
1884	unsafe_put_user(`0`, &infop->si_errno, Efault);
1885	unsafe_put_user(info.cause, &infop->si_code, Efault);
1886	unsafe_put_user(info.pid, &infop->si_pid, Efault);
1887	unsafe_put_user(info.uid, &infop->si_uid, Efault);
1888	unsafe_put_user(info.status, &infop->si_status, Efault);
1889	user_write_access_end();
1890	return err;
1891	Efault:
1892	user_write_access_end();
1893	return -EFAULT;
1894	}
1895	#endif
1896
1897	/**
1898	* thread_group_exited - check that a thread group has exited
1899	* @pid: tgid of thread group to be checked.
1900	*
1901	* Test if the thread group represented by tgid has exited (all
1902	* threads are zombies, dead or completely gone).
1903	*
1904	* Return: true if the thread group has exited. false otherwise.
1905	*/
1906	bool thread_group_exited(struct pid *pid)
1907	{
1908	struct task_struct *task;
1909	bool exited;
1910
1911	rcu_read_lock();
1912	task = pid_task(pid, PIDTYPE_PID);
1913	exited = !task \|\|
1914	(READ_ONCE(task->exit_state) && thread_group_empty(p: task));
1915	rcu_read_unlock();
1916
1917	return exited;
1918	}
1919	EXPORT_SYMBOL(thread_group_exited);
1920
1921	/*
1922	* This needs to be __function_aligned as GCC implicitly makes any
1923	* implementation of abort() cold and drops alignment specified by
1924	* -falign-functions=N.
1925	*
1926	* See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
1927	*/
1928	__weak __function_aligned void abort(void)
1929	{
1930	BUG();
1931
1932	/ if that doesn't kill us, halt /
1933	panic(fmt: "Oops failed to kill thread");
1934	}
1935	EXPORT_SYMBOL(abort);
1936

source code of linux/kernel/exit.c