oom_kill.c source code [linux/mm/oom_kill.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/oom_kill.c
4	*
5	* Copyright (C) 1998,2000 Rik van Riel
6	* Thanks go out to Claus Fischer for some serious inspiration and
7	* for goading me into coding this file...
8	* Copyright (C) 2010 Google, Inc.
9	* Rewritten by David Rientjes
10	*
11	* The routines in this file are used to kill a process when
12	* we're seriously out of memory. This gets called from __alloc_pages()
13	* in mm/page_alloc.c when we really run out of memory.
14	*
15	* Since we won't call these routines often (on a well-configured
16	* machine) this file will double as a 'coding guide' and a signpost
17	* for newbie kernel hackers. It features several pointers to major
18	* kernel subsystems and hints as to where to find out what things do.
19	*/
20
21	#include <linux/oom.h>
22	#include <linux/mm.h>
23	#include <linux/err.h>
24	#include <linux/gfp.h>
25	#include <linux/sched.h>
26	#include <linux/sched/mm.h>
27	#include <linux/sched/coredump.h>
28	#include <linux/sched/task.h>
29	#include <linux/sched/debug.h>
30	#include <linux/swap.h>
31	#include <linux/syscalls.h>
32	#include <linux/timex.h>
33	#include <linux/jiffies.h>
34	#include <linux/cpuset.h>
35	#include <linux/export.h>
36	#include <linux/notifier.h>
37	#include <linux/memcontrol.h>
38	#include <linux/mempolicy.h>
39	#include <linux/security.h>
40	#include <linux/ptrace.h>
41	#include <linux/freezer.h>
42	#include <linux/ftrace.h>
43	#include <linux/ratelimit.h>
44	#include <linux/kthread.h>
45	#include <linux/init.h>
46	#include <linux/mmu_notifier.h>
47
48	#include <asm/tlb.h>
49	#include "internal.h"
50	#include "slab.h"
51
52	#define CREATE_TRACE_POINTS
53	#include <trace/events/oom.h>
54
/* OOM killer tunables; exported through vm_oom_kill_table under CONFIG_SYSCTL. */
static int sysctl_panic_on_oom;
static int sysctl_oom_kill_allocating_task;
static int sysctl_oom_dump_tasks = 1;	/* dump_header() calls dump_tasks() when non-zero */
58
59	/*
60	* Serializes oom killer invocations (out_of_memory()) from all contexts to
61	* prevent from over eager oom killing (e.g. when the oom killer is invoked
62	* from different domains).
63	*
64	* oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
65	* and mark_oom_victim
66	*/
DEFINE_MUTEX(oom_lock);
/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(oom_adj_mutex);
70
71	static inline bool is_memcg_oom(struct oom_control *oc)
72	{
73	return oc->memcg != NULL;
74	}
75
76	#ifdef CONFIG_NUMA
77	/**
78	* oom_cpuset_eligible() - check task eligibility for kill
79	* @start: task struct of which task to consider
80	* @oc: pointer to struct oom_control
81	*
82	* Task eligibility is determined by whether or not a candidate task, @tsk,
83	* shares the same mempolicy nodes as current if it is bound by such a policy
84	* and whether or not it has the same set of allowed cpuset nodes.
85	*
86	* This function is assuming oom-killer context and 'current' has triggered
87	* the oom-killer.
88	*/
89	static bool oom_cpuset_eligible(struct task_struct *start,
90	struct oom_control *oc)
91	{
92	struct task_struct *tsk;
93	bool ret = false;
94	const nodemask_t *mask = oc->nodemask;
95
96	rcu_read_lock();
97	for_each_thread(start, tsk) {
98	if (mask) {
99	/*
100	* If this is a mempolicy constrained oom, tsk's
101	* cpuset is irrelevant. Only return true if its
102	* mempolicy intersects current, otherwise it may be
103	* needlessly killed.
104	*/
105	ret = mempolicy_in_oom_domain(tsk, mask);
106	} else {
107	/*
108	* This is not a mempolicy constrained oom, so only
109	* check the mems of tsk's cpuset.
110	*/
111	ret = cpuset_mems_allowed_intersects(current, tsk2: tsk);
112	}
113	if (ret)
114	break;
115	}
116	rcu_read_unlock();
117
118	return ret;
119	}
120	#else
121	static bool oom_cpuset_eligible(struct task_struct tsk, struct* oom_control *oc)
122	{
123	return true;
124	}
125	#endif /* CONFIG_NUMA */
126
127	/*
128	* The process p may have detached its own ->mm while exiting or through
129	* kthread_use_mm(), but one or more of its subthreads may still have a valid
130	* pointer. Return p, or any of its subthreads with a valid ->mm, with
131	* task_lock() held.
132	*/
133	struct task_struct find_lock_task_mm(struct* task_struct *p)
134	{
135	struct task_struct *t;
136
137	rcu_read_lock();
138
139	for_each_thread(p, t) {
140	task_lock(p: t);
141	if (likely(t->mm))
142	goto found;
143	task_unlock(p: t);
144	}
145	t = NULL;
146	found:
147	rcu_read_unlock();
148
149	return t;
150	}
151
152	/*
153	* order == -1 means the oom kill is required by sysrq, otherwise only
154	* for display purposes.
155	*/
156	static inline bool is_sysrq_oom(struct oom_control *oc)
157	{
158	return oc->order == -`1`;
159	}
160
161	/ return true if the task is not adequate as candidate victim task. /
162	static bool oom_unkillable_task(struct task_struct *p)
163	{
164	if (is_global_init(tsk: p))
165	return true;
166	if (p->flags & PF_KTHREAD)
167	return true;
168	return false;
169	}
170
171	/*
172	* Check whether unreclaimable slab amount is greater than
173	* all user memory(LRU pages).
174	* dump_unreclaimable_slab() could help in the case that
175	* oom due to too much unreclaimable slab used by kernel.
176	*/
177	static bool should_dump_unreclaim_slab(void)
178	{
179	unsigned long nr_lru;
180
181	nr_lru = global_node_page_state(item: NR_ACTIVE_ANON) +
182	global_node_page_state(item: NR_INACTIVE_ANON) +
183	global_node_page_state(item: NR_ACTIVE_FILE) +
184	global_node_page_state(item: NR_INACTIVE_FILE) +
185	global_node_page_state(item: NR_ISOLATED_ANON) +
186	global_node_page_state(item: NR_ISOLATED_FILE) +
187	global_node_page_state(item: NR_UNEVICTABLE);
188
189	return (global_node_page_state_pages(item: NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
190	}
191
192	/**
193	* oom_badness - heuristic function to determine which candidate task to kill
194	* @p: task struct of which task we should calculate
195	* @totalpages: total present RAM allowed for page allocation
196	*
197	* The heuristic for determining which task to kill is made to be as simple and
198	* predictable as possible. The goal is to return the highest value for the
199	* task consuming the most memory to avoid subsequent oom failures.
200	*/
201	long oom_badness(struct task_struct p, unsigned* long totalpages)
202	{
203	long points;
204	long adj;
205
206	if (oom_unkillable_task(p))
207	return LONG_MIN;
208
209	p = find_lock_task_mm(p);
210	if (!p)
211	return LONG_MIN;
212
213	/*
214	* Do not even consider tasks which are explicitly marked oom
215	* unkillable or have been already oom reaped or the are in
216	* the middle of vfork
217	*/
218	adj = (long)p->signal->oom_score_adj;
219	if (adj == OOM_SCORE_ADJ_MIN \|\|
220	test_bit(MMF_OOM_SKIP, &p->mm->flags) \|\|
221	in_vfork(tsk: p)) {
222	task_unlock(p);
223	return LONG_MIN;
224	}
225
226	/*
227	* The baseline for the badness score is the proportion of RAM that each
228	* task's rss, pagetable and swap space use.
229	*/
230	points = get_mm_rss(mm: p->mm) + get_mm_counter(mm: p->mm, member: MM_SWAPENTS) +
231	mm_pgtables_bytes(mm: p->mm) / PAGE_SIZE;
232	task_unlock(p);
233
234	/ Normalize to oom_score_adj units /
235	adj *= totalpages / `1000`;
236	points += adj;
237
238	return points;
239	}
240
241	static const char * const oom_constraint_text[] = {
242	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
243	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
244	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
245	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
246	};
247
248	/*
249	* Determine the type of allocation constraint.
250	*/
251	static enum oom_constraint constrained_alloc(struct oom_control *oc)
252	{
253	struct zone *zone;
254	struct zoneref *z;
255	enum zone_type highest_zoneidx = gfp_zone(flags: oc->gfp_mask);
256	bool cpuset_limited = false;
257	int nid;
258
259	if (is_memcg_oom(oc)) {
260	oc->totalpages = mem_cgroup_get_max(memcg: oc->memcg) ?: `1`;
261	return CONSTRAINT_MEMCG;
262	}
263
264	/ Default to all available memory /
265	oc->totalpages = totalram_pages() + total_swap_pages;
266
267	if (!IS_ENABLED(CONFIG_NUMA))
268	return CONSTRAINT_NONE;
269
270	if (!oc->zonelist)
271	return CONSTRAINT_NONE;
272	/*
273	* Reach here only when __GFP_NOFAIL is used. So, we should avoid
274	* to kill current.We have to random task kill in this case.
275	* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
276	*/
277	if (oc->gfp_mask & __GFP_THISNODE)
278	return CONSTRAINT_NONE;
279
280	/*
281	* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
282	* the page allocator means a mempolicy is in effect. Cpuset policy
283	* is enforced in get_page_from_freelist().
284	*/
285	if (oc->nodemask &&
286	!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
287	oc->totalpages = total_swap_pages;
288	for_each_node_mask(nid, *oc->nodemask)
289	oc->totalpages += node_present_pages(nid);
290	return CONSTRAINT_MEMORY_POLICY;
291	}
292
293	/ Check this allocation failure is caused by cpuset's wall function /
294	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
295	highest_zoneidx, oc->nodemask)
296	if (!cpuset_zone_allowed(z: zone, gfp_mask: oc->gfp_mask))
297	cpuset_limited = true;
298
299	if (cpuset_limited) {
300	oc->totalpages = total_swap_pages;
301	for_each_node_mask(nid, cpuset_current_mems_allowed)
302	oc->totalpages += node_present_pages(nid);
303	return CONSTRAINT_CPUSET;
304	}
305	return CONSTRAINT_NONE;
306	}
307
308	static int oom_evaluate_task(struct task_struct task, void* *arg)
309	{
310	struct oom_control *oc = arg;
311	long points;
312
313	if (oom_unkillable_task(p: task))
314	goto next;
315
316	/ p may not have freeable memory in nodemask /
317	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(start: task, oc))
318	goto next;
319
320	/*
321	* This task already has access to memory reserves and is being killed.
322	* Don't allow any other task to have access to the reserves unless
323	* the task has MMF_OOM_SKIP because chances that it would release
324	* any memory is quite low.
325	*/
326	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(tsk: task)) {
327	if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
328	goto next;
329	goto abort;
330	}
331
332	/*
333	* If task is allocating a lot of memory and has been marked to be
334	* killed first if it triggers an oom, then select it.
335	*/
336	if (oom_task_origin(p: task)) {
337	points = LONG_MAX;
338	goto select;
339	}
340
341	points = oom_badness(p: task, totalpages: oc->totalpages);
342	if (points == LONG_MIN \|\| points < oc->chosen_points)
343	goto next;
344
345	select:
346	if (oc->chosen)
347	put_task_struct(t: oc->chosen);
348	get_task_struct(t: task);
349	oc->chosen = task;
350	oc->chosen_points = points;
351	next:
352	return `0`;
353	abort:
354	if (oc->chosen)
355	put_task_struct(t: oc->chosen);
356	oc->chosen = (void *)-`1UL`;
357	return `1`;
358	}
359
360	/*
361	* Simple selection loop. We choose the process with the highest number of
362	* 'points'. In case scan was aborted, oc->chosen is set to -1.
363	*/
364	static void select_bad_process(struct oom_control *oc)
365	{
366	oc->chosen_points = LONG_MIN;
367
368	if (is_memcg_oom(oc))
369	mem_cgroup_scan_tasks(memcg: oc->memcg, oom_evaluate_task, arg: oc);
370	else {
371	struct task_struct *p;
372
373	rcu_read_lock();
374	for_each_process(p)
375	if (oom_evaluate_task(task: p, arg: oc))
376	break;
377	rcu_read_unlock();
378	}
379	}
380
381	static int dump_task(struct task_struct p, void* *arg)
382	{
383	struct oom_control *oc = arg;
384	struct task_struct *task;
385
386	if (oom_unkillable_task(p))
387	return `0`;
388
389	/ p may not have freeable memory in nodemask /
390	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(start: p, oc))
391	return `0`;
392
393	task = find_lock_task_mm(p);
394	if (!task) {
395	/*
396	* All of p's threads have already detached their mm's. There's
397	* no need to report them; they can't be oom killed anyway.
398	*/
399	return `0`;
400	}
401
402	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
403	task->pid, from_kuid(&init_user_ns, task_uid(task)),
404	task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
405	mm_pgtables_bytes(task->mm),
406	get_mm_counter(task->mm, MM_SWAPENTS),
407	task->signal->oom_score_adj, task->comm);
408	task_unlock(p: task);
409
410	return `0`;
411	}
412
413	/**
414	* dump_tasks - dump current memory state of all system tasks
415	* @oc: pointer to struct oom_control
416	*
417	* Dumps the current memory state of all eligible tasks. Tasks not in the same
418	* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
419	* are not shown.
420	* State information includes task's pid, uid, tgid, vm size, rss,
421	* pgtables_bytes, swapents, oom_score_adj value, and name.
422	*/
423	static void dump_tasks(struct oom_control *oc)
424	{
425	pr_info("Tasks state (memory values in pages):\n");
426	pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
427
428	if (is_memcg_oom(oc))
429	mem_cgroup_scan_tasks(memcg: oc->memcg, dump_task, arg: oc);
430	else {
431	struct task_struct *p;
432
433	rcu_read_lock();
434	for_each_process(p)
435	dump_task(p, arg: oc);
436	rcu_read_unlock();
437	}
438	}
439
440	static void dump_oom_victim(struct oom_control oc, struct* task_struct *victim)
441	{
442	/ one line summary of the oom killer context. /
443	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
444	oom_constraint_text[oc->constraint],
445	nodemask_pr_args(oc->nodemask));
446	cpuset_print_current_mems_allowed();
447	mem_cgroup_print_oom_context(memcg: oc->memcg, p: victim);
448	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
449	from_kuid(&init_user_ns, task_uid(victim)));
450	}
451
452	static void dump_header(struct oom_control *oc)
453	{
454	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
455	current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
456	current->signal->oom_score_adj);
457	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
458	pr_warn("COMPACTION is disabled!!!\n");
459
460	dump_stack();
461	if (is_memcg_oom(oc))
462	mem_cgroup_print_oom_meminfo(memcg: oc->memcg);
463	else {
464	__show_mem(SHOW_MEM_FILTER_NODES, nodemask: oc->nodemask, max_zone_idx: gfp_zone(flags: oc->gfp_mask));
465	if (should_dump_unreclaim_slab())
466	dump_unreclaimable_slab();
467	}
468	if (sysctl_oom_dump_tasks)
469	dump_tasks(oc);
470	}
471
472	/*
473	* Number of OOM victims in flight
474	*/
475	static atomic_t oom_victims = ATOMIC_INIT(`0`);
476	static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
477
478	static bool oom_killer_disabled __read_mostly;
479
480	/*
481	* task->mm can be NULL if the task is the exited group leader. So to
482	* determine whether the task is using a particular mm, we examine all the
483	* task's threads: if one of those is using this mm then this task was also
484	* using it.
485	*/
486	bool process_shares_mm(struct task_struct p, struct* mm_struct *mm)
487	{
488	struct task_struct *t;
489
490	for_each_thread(p, t) {
491	struct mm_struct *t_mm = READ_ONCE(t->mm);
492	if (t_mm)
493	return t_mm == mm;
494	}
495	return false;
496	}
497
498	#ifdef CONFIG_MMU
499	/*
500	* OOM Reaper kernel thread which tries to reap the memory used by the OOM
501	* victim (if that is possible) to help the OOM killer to move on.
502	*/
/* The "oom_reaper" kthread started by oom_init(). */
static struct task_struct *oom_reaper_th;
/* oom_reaper() sleeps here until oom_reaper_list becomes non-NULL. */
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
/* Head of the list of queued victims, linked through ->oom_reaper_list. */
static struct task_struct *oom_reaper_list;
/* Protects oom_reaper_list. */
static DEFINE_SPINLOCK(oom_reaper_lock);
507
508	static bool __oom_reap_task_mm(struct mm_struct *mm)
509	{
510	struct vm_area_struct *vma;
511	bool ret = true;
512	VMA_ITERATOR(vmi, mm, `0`);
513
514	/*
515	* Tell all users of get_user/copy_from_user etc... that the content
516	* is no longer stable. No barriers really needed because unmapping
517	* should imply barriers already and the reader would hit a page fault
518	* if it stumbled over a reaped memory.
519	*/
520	set_bit(MMF_UNSTABLE, addr: &mm->flags);
521
522	for_each_vma(vmi, vma) {
523	if (vma->vm_flags & (VM_HUGETLB\|VM_PFNMAP))
524	continue;
525
526	/*
527	* Only anonymous pages have a good chance to be dropped
528	* without additional steps which we cannot afford as we
529	* are OOM already.
530	*
531	* We do not even care about fs backed pages because all
532	* which are reclaimable have already been reclaimed and
533	* we do not want to block exit_mmap by keeping mm ref
534	* count elevated without a good reason.
535	*/
536	if (vma_is_anonymous(vma) \|\| !(vma->vm_flags & VM_SHARED)) {
537	struct mmu_notifier_range range;
538	struct mmu_gather tlb;
539
540	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_UNMAP, flags: `0`,
541	mm, start: vma->vm_start,
542	end: vma->vm_end);
543	tlb_gather_mmu(tlb: &tlb, mm);
544	if (mmu_notifier_invalidate_range_start_nonblock(range: &range)) {
545	tlb_finish_mmu(tlb: &tlb);
546	ret = false;
547	continue;
548	}
549	unmap_page_range(tlb: &tlb, vma, addr: range.start, end: range.end, NULL);
550	mmu_notifier_invalidate_range_end(range: &range);
551	tlb_finish_mmu(tlb: &tlb);
552	}
553	}
554
555	return ret;
556	}
557
558	/*
559	* Reaps the address space of the give task.
560	*
561	* Returns true on success and false if none or part of the address space
562	* has been reclaimed and the caller should retry later.
563	*/
564	static bool oom_reap_task_mm(struct task_struct tsk, struct* mm_struct *mm)
565	{
566	bool ret = true;
567
568	if (!mmap_read_trylock(mm)) {
569	trace_skip_task_reaping(pid: tsk->pid);
570	return false;
571	}
572
573	/*
574	* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
575	* work on the mm anymore. The check for MMF_OOM_SKIP must run
576	* under mmap_lock for reading because it serializes against the
577	* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
578	*/
579	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
580	trace_skip_task_reaping(pid: tsk->pid);
581	goto out_unlock;
582	}
583
584	trace_start_task_reaping(pid: tsk->pid);
585
586	/ failed to reap part of the address space. Try again later /
587	ret = __oom_reap_task_mm(mm);
588	if (!ret)
589	goto out_finish;
590
591	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
592	task_pid_nr(tsk), tsk->comm,
593	K(get_mm_counter(mm, MM_ANONPAGES)),
594	K(get_mm_counter(mm, MM_FILEPAGES)),
595	K(get_mm_counter(mm, MM_SHMEMPAGES)));
596	out_finish:
597	trace_finish_task_reaping(pid: tsk->pid);
598	out_unlock:
599	mmap_read_unlock(mm);
600
601	return ret;
602	}
603
604	#define MAX_OOM_REAP_RETRIES 10
605	static void oom_reap_task(struct task_struct *tsk)
606	{
607	int attempts = `0`;
608	struct mm_struct *mm = tsk->signal->oom_mm;
609
610	/ Retry the mmap_read_trylock(mm) a few times /
611	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
612	schedule_timeout_idle(HZ/`10`);
613
614	if (attempts <= MAX_OOM_REAP_RETRIES \|\|
615	test_bit(MMF_OOM_SKIP, &mm->flags))
616	goto done;
617
618	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
619	task_pid_nr(tsk), tsk->comm);
620	sched_show_task(p: tsk);
621	debug_show_all_locks();
622
623	done:
624	tsk->oom_reaper_list = NULL;
625
626	/*
627	* Hide this mm from OOM killer because it has been either reaped or
628	* somebody can't call mmap_write_unlock(mm).
629	*/
630	set_bit(MMF_OOM_SKIP, addr: &mm->flags);
631
632	/ Drop a reference taken by queue_oom_reaper /
633	put_task_struct(t: tsk);
634	}
635
636	static int oom_reaper(void *unused)
637	{
638	set_freezable();
639
640	while (true) {
641	struct task_struct *tsk = NULL;
642
643	wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
644	spin_lock_irq(lock: &oom_reaper_lock);
645	if (oom_reaper_list != NULL) {
646	tsk = oom_reaper_list;
647	oom_reaper_list = tsk->oom_reaper_list;
648	}
649	spin_unlock_irq(lock: &oom_reaper_lock);
650
651	if (tsk)
652	oom_reap_task(tsk);
653	}
654
655	return `0`;
656	}
657
658	static void wake_oom_reaper(struct timer_list *timer)
659	{
660	struct task_struct tsk = container_of(timer, struct* task_struct,
661	oom_reaper_timer);
662	struct mm_struct *mm = tsk->signal->oom_mm;
663	unsigned long flags;
664
665	/ The victim managed to terminate on its own - see exit_mmap /
666	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
667	put_task_struct(t: tsk);
668	return;
669	}
670
671	spin_lock_irqsave(&oom_reaper_lock, flags);
672	tsk->oom_reaper_list = oom_reaper_list;
673	oom_reaper_list = tsk;
674	spin_unlock_irqrestore(lock: &oom_reaper_lock, flags);
675	trace_wake_reaper(pid: tsk->pid);
676	wake_up(&oom_reaper_wait);
677	}
678
679	/*
680	* Give the OOM victim time to exit naturally before invoking the oom_reaping.
681	* The timers timeout is arbitrary... the longer it is, the longer the worst
682	* case scenario for the OOM can take. If it is too small, the oom_reaper can
683	* get in the way and release resources needed by the process exit path.
684	* e.g. The futex robust list can sit in Anon\|Private memory that gets reaped
685	* before the exit path is able to wake the futex waiters.
686	*/
687	#define OOM_REAPER_DELAY (2*HZ)
688	static void queue_oom_reaper(struct task_struct *tsk)
689	{
690	/ mm is already queued? /
691	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, addr: &tsk->signal->oom_mm->flags))
692	return;
693
694	get_task_struct(t: tsk);
695	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, `0`);
696	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
697	add_timer(timer: &tsk->oom_reaper_timer);
698	}
699
#ifdef CONFIG_SYSCTL
/* Sysctl entries for the OOM killer tunables; registered by oom_init(). */
static struct ctl_table vm_oom_kill_table[] = {
	{
		.procname	= "panic_on_oom",
		.data		= &sysctl_panic_on_oom,
		.maxlen		= sizeof(sysctl_panic_on_oom),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* Bounded by SYSCTL_ZERO and SYSCTL_TWO. */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "oom_kill_allocating_task",
		.data		= &sysctl_oom_kill_allocating_task,
		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "oom_dump_tasks",
		.data		= &sysctl_oom_dump_tasks,
		.maxlen		= sizeof(sysctl_oom_dump_tasks),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};
#endif
728
729	static int __init oom_init(void)
730	{
731	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
732	#ifdef CONFIG_SYSCTL
733	register_sysctl_init("vm", vm_oom_kill_table);
734	#endif
735	return `0`;
736	}
737	subsys_initcall(oom_init)
738	#else
/* !CONFIG_MMU variant: there is no oom reaper, so queueing is a no-op. */
static inline void queue_oom_reaper(struct task_struct *tsk)
{
	/* Intentionally empty. */
}
742	#endif /* CONFIG_MMU */
743
744	/**
745	* mark_oom_victim - mark the given task as OOM victim
746	* @tsk: task to mark
747	*
748	* Has to be called with oom_lock held and never after
749	* oom has been disabled already.
750	*
751	* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
752	* under task_lock or operate on the current).
753	*/
754	static void mark_oom_victim(struct task_struct *tsk)
755	{
756	struct mm_struct *mm = tsk->mm;
757
758	WARN_ON(oom_killer_disabled);
759	/ OOM killer might race with memcg OOM /
760	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
761	return;
762
763	/ oom_mm is bound to the signal struct life time. /
764	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
765	mmgrab(mm: tsk->signal->oom_mm);
766
767	/*
768	* Make sure that the task is woken up from uninterruptible sleep
769	* if it is frozen because OOM killer wouldn't be able to free
770	* any memory and livelock. freezing_slow_path will tell the freezer
771	* that TIF_MEMDIE tasks should be ignored.
772	*/
773	__thaw_task(t: tsk);
774	atomic_inc(v: &oom_victims);
775	trace_mark_victim(pid: tsk->pid);
776	}
777
778	/**
779	* exit_oom_victim - note the exit of an OOM victim
780	*/
781	void exit_oom_victim(void)
782	{
783	clear_thread_flag(TIF_MEMDIE);
784
785	if (!atomic_dec_return(v: &oom_victims))
786	wake_up_all(&oom_victims_wait);
787	}
788
789	/**
790	* oom_killer_enable - enable OOM killer
791	*/
792	void oom_killer_enable(void)
793	{
794	oom_killer_disabled = false;
795	pr_info("OOM killer enabled.\n");
796	}
797
798	/**
799	* oom_killer_disable - disable OOM killer
800	* @timeout: maximum timeout to wait for oom victims in jiffies
801	*
802	* Forces all page allocations to fail rather than trigger OOM killer.
803	* Will block and wait until all OOM victims are killed or the given
804	* timeout expires.
805	*
806	* The function cannot be called when there are runnable user tasks because
807	* the userspace would see unexpected allocation failures as a result. Any
808	* new usage of this function should be consulted with MM people.
809	*
810	* Returns true if successful and false if the OOM killer cannot be
811	* disabled.
812	*/
813	bool oom_killer_disable(signed long timeout)
814	{
815	signed long ret;
816
817	/*
818	* Make sure to not race with an ongoing OOM killer. Check that the
819	* current is not killed (possibly due to sharing the victim's memory).
820	*/
821	if (mutex_lock_killable(&oom_lock))
822	return false;
823	oom_killer_disabled = true;
824	mutex_unlock(lock: &oom_lock);
825
826	ret = wait_event_interruptible_timeout(oom_victims_wait,
827	!atomic_read(&oom_victims), timeout);
828	if (ret <= `0`) {
829	oom_killer_enable();
830	return false;
831	}
832	pr_info("OOM killer disabled.\n");
833
834	return true;
835	}
836
837	static inline bool __task_will_free_mem(struct task_struct *task)
838	{
839	struct signal_struct *sig = task->signal;
840
841	/*
842	* A coredumping process may sleep for an extended period in
843	* coredump_task_exit(), so the oom killer cannot assume that
844	* the process will promptly exit and release memory.
845	*/
846	if (sig->core_state)
847	return false;
848
849	if (sig->flags & SIGNAL_GROUP_EXIT)
850	return true;
851
852	if (thread_group_empty(p: task) && (task->flags & PF_EXITING))
853	return true;
854
855	return false;
856	}
857
858	/*
859	* Checks whether the given task is dying or exiting and likely to
860	* release its address space. This means that all threads and processes
861	* sharing the same mm have to be killed or exiting.
862	* Caller has to make sure that task->mm is stable (hold task_lock or
863	* it operates on the current).
864	*/
865	static bool task_will_free_mem(struct task_struct *task)
866	{
867	struct mm_struct *mm = task->mm;
868	struct task_struct *p;
869	bool ret = true;
870
871	/*
872	* Skip tasks without mm because it might have passed its exit_mm and
873	* exit_oom_victim. oom_reaper could have rescued that but do not rely
874	* on that for now. We can consider find_lock_task_mm in future.
875	*/
876	if (!mm)
877	return false;
878
879	if (!__task_will_free_mem(task))
880	return false;
881
882	/*
883	* This task has already been drained by the oom reaper so there are
884	* only small chances it will free some more
885	*/
886	if (test_bit(MMF_OOM_SKIP, &mm->flags))
887	return false;
888
889	if (atomic_read(v: &mm->mm_users) <= `1`)
890	return true;
891
892	/*
893	* Make sure that all tasks which share the mm with the given tasks
894	* are dying as well to make sure that a) nobody pins its mm and
895	* b) the task is also reapable by the oom reaper.
896	*/
897	rcu_read_lock();
898	for_each_process(p) {
899	if (!process_shares_mm(p, mm))
900	continue;
901	if (same_thread_group(p1: task, p2: p))
902	continue;
903	ret = __task_will_free_mem(task: p);
904	if (!ret)
905	break;
906	}
907	rcu_read_unlock();
908
909	return ret;
910	}
911
912	static void __oom_kill_process(struct task_struct victim, const* char *message)
913	{
914	struct task_struct *p;
915	struct mm_struct *mm;
916	bool can_oom_reap = true;
917
918	p = find_lock_task_mm(p: victim);
919	if (!p) {
920	pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
921	message, task_pid_nr(victim), victim->comm);
922	put_task_struct(t: victim);
923	return;
924	} else if (victim != p) {
925	get_task_struct(t: p);
926	put_task_struct(t: victim);
927	victim = p;
928	}
929
930	/ Get a reference to safely compare mm after task_unlock(victim) /
931	mm = victim->mm;
932	mmgrab(mm);
933
934	/ Raise event before sending signal: task reaper must see this /
935	count_vm_event(item: OOM_KILL);
936	memcg_memory_event_mm(mm, event: MEMCG_OOM_KILL);
937
938	/*
939	* We should send SIGKILL before granting access to memory reserves
940	* in order to prevent the OOM victim from depleting the memory
941	* reserves from the user space under its control.
942	*/
943	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p: victim, type: PIDTYPE_TGID);
944	mark_oom_victim(tsk: victim);
945	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
946	message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
947	K(get_mm_counter(mm, MM_ANONPAGES)),
948	K(get_mm_counter(mm, MM_FILEPAGES)),
949	K(get_mm_counter(mm, MM_SHMEMPAGES)),
950	from_kuid(&init_user_ns, task_uid(victim)),
951	mm_pgtables_bytes(mm) >> `10`, victim->signal->oom_score_adj);
952	task_unlock(p: victim);
953
954	/*
955	* Kill all user processes sharing victim->mm in other thread groups, if
956	* any. They don't get access to memory reserves, though, to avoid
957	* depletion of all memory. This prevents mm->mmap_lock livelock when an
958	* oom killed thread cannot exit because it requires the semaphore and
959	* its contended by another thread trying to allocate memory itself.
960	* That thread will now get access to memory reserves since it has a
961	* pending fatal signal.
962	*/
963	rcu_read_lock();
964	for_each_process(p) {
965	if (!process_shares_mm(p, mm))
966	continue;
967	if (same_thread_group(p1: p, p2: victim))
968	continue;
969	if (is_global_init(tsk: p)) {
970	can_oom_reap = false;
971	set_bit(MMF_OOM_SKIP, addr: &mm->flags);
972	pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
973	task_pid_nr(victim), victim->comm,
974	task_pid_nr(p), p->comm);
975	continue;
976	}
977	/*
978	* No kthread_use_mm() user needs to read from the userspace so
979	* we are ok to reap it.
980	*/
981	if (unlikely(p->flags & PF_KTHREAD))
982	continue;
983	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, type: PIDTYPE_TGID);
984	}
985	rcu_read_unlock();
986
987	if (can_oom_reap)
988	queue_oom_reaper(tsk: victim);
989
990	mmdrop(mm);
991	put_task_struct(t: victim);
992	}
993
994	/*
995	* Kill provided task unless it's secured by setting
996	* oom_score_adj to OOM_SCORE_ADJ_MIN.
997	*/
998	static int oom_kill_memcg_member(struct task_struct task, void* *message)
999	{
1000	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
1001	!is_global_init(tsk: task)) {
1002	get_task_struct(t: task);
1003	__oom_kill_process(victim: task, message);
1004	}
1005	return `0`;
1006	}
1007
1008	static void oom_kill_process(struct oom_control oc, const* char *message)
1009	{
1010	struct task_struct *victim = oc->chosen;
1011	struct mem_cgroup *oom_group;
1012	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1013	DEFAULT_RATELIMIT_BURST);
1014
1015	/*
1016	* If the task is already exiting, don't alarm the sysadmin or kill
1017	* its children or threads, just give it access to memory reserves
1018	* so it can die quickly
1019	*/
1020	task_lock(p: victim);
1021	if (task_will_free_mem(task: victim)) {
1022	mark_oom_victim(tsk: victim);
1023	queue_oom_reaper(tsk: victim);
1024	task_unlock(p: victim);
1025	put_task_struct(t: victim);
1026	return;
1027	}
1028	task_unlock(p: victim);
1029
1030	if (__ratelimit(&oom_rs)) {
1031	dump_header(oc);
1032	dump_oom_victim(oc, victim);
1033	}
1034
1035	/*
1036	* Do we need to kill the entire memory cgroup?
1037	* Or even one of the ancestor memory cgroups?
1038	* Check this out before killing the victim task.
1039	*/
1040	oom_group = mem_cgroup_get_oom_group(victim, oom_domain: oc->memcg);
1041
1042	__oom_kill_process(victim, message);
1043
1044	/*
1045	* If necessary, kill all tasks in the selected memory cgroup.
1046	*/
1047	if (oom_group) {
1048	memcg_memory_event(memcg: oom_group, event: MEMCG_OOM_GROUP_KILL);
1049	mem_cgroup_print_oom_group(memcg: oom_group);
1050	mem_cgroup_scan_tasks(memcg: oom_group, oom_kill_memcg_member,
1051	arg: (void *)message);
1052	mem_cgroup_put(memcg: oom_group);
1053	}
1054	}
1055
1056	/*
1057	* Determines whether the kernel must panic because of the panic_on_oom sysctl.
1058	*/
1059	static void check_panic_on_oom(struct oom_control *oc)
1060	{
1061	if (likely(!sysctl_panic_on_oom))
1062	return;
1063	if (sysctl_panic_on_oom != `2`) {
1064	/*
1065	* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
1066	* does not panic for cpuset, mempolicy, or memcg allocation
1067	* failures.
1068	*/
1069	if (oc->constraint != CONSTRAINT_NONE)
1070	return;
1071	}
1072	/ Do not panic for oom kills triggered by sysrq /
1073	if (is_sysrq_oom(oc))
1074	return;
1075	dump_header(oc);
1076	panic(fmt: "Out of memory: %s panic_on_oom is enabled\n",
1077	sysctl_panic_on_oom == `2` ? "compulsory" : "system-wide");
1078	}
1079
1080	static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
1081
1082	int register_oom_notifier(struct notifier_block *nb)
1083	{
1084	return blocking_notifier_chain_register(nh: &oom_notify_list, nb);
1085	}
1086	EXPORT_SYMBOL_GPL(register_oom_notifier);
1087
1088	int unregister_oom_notifier(struct notifier_block *nb)
1089	{
1090	return blocking_notifier_chain_unregister(nh: &oom_notify_list, nb);
1091	}
1092	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
1093
1094	/**
1095	* out_of_memory - kill the "best" process when we run out of memory
1096	* @oc: pointer to struct oom_control
1097	*
1098	* If we run out of memory, we have the choice between either
1099	* killing a random task (bad), letting the system crash (worse)
1100	* OR try to be smart about which process to kill. Note that we
1101	* don't have to be perfect here, we just have to be good.
1102	*/
1103	bool out_of_memory(struct oom_control *oc)
1104	{
1105	unsigned long freed = `0`;
1106
1107	if (oom_killer_disabled)
1108	return false;
1109
1110	if (!is_memcg_oom(oc)) {
1111	blocking_notifier_call_chain(nh: &oom_notify_list, val: `0`, v: &freed);
1112	if (freed > `0` && !is_sysrq_oom(oc))
1113	/ Got some memory back in the last second. /
1114	return true;
1115	}
1116
1117	/*
1118	* If current has a pending SIGKILL or is exiting, then automatically
1119	* select it. The goal is to allow it to allocate so that it may
1120	* quickly exit and free its memory.
1121	*/
1122	if (task_will_free_mem(current)) {
1123	mark_oom_victim(current);
1124	queue_oom_reaper(current);
1125	return true;
1126	}
1127
1128	/*
1129	* The OOM killer does not compensate for IO-less reclaim.
1130	* But mem_cgroup_oom() has to invoke the OOM killer even
1131	* if it is a GFP_NOFS allocation.
1132	*/
1133	if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
1134	return true;
1135
1136	/*
1137	* Check if there were limitations on the allocation (only relevant for
1138	* NUMA and memcg) that may require different handling.
1139	*/
1140	oc->constraint = constrained_alloc(oc);
1141	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
1142	oc->nodemask = NULL;
1143	check_panic_on_oom(oc);
1144
1145	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1146	current->mm && !oom_unkillable_task(current) &&
1147	oom_cpuset_eligible(current, oc) &&
1148	current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
1149	get_task_struct(current);
1150	oc->chosen = current;
1151	oom_kill_process(oc, message: "Out of memory (oom_kill_allocating_task)");
1152	return true;
1153	}
1154
1155	select_bad_process(oc);
1156	/ Found nothing?!?! /
1157	if (!oc->chosen) {
1158	dump_header(oc);
1159	pr_warn("Out of memory and no killable processes...\n");
1160	/*
1161	* If we got here due to an actual allocation at the
1162	* system level, we cannot survive this and will enter
1163	* an endless loop in the allocator. Bail out now.
1164	*/
1165	if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
1166	panic(fmt: "System is deadlocked on memory\n");
1167	}
1168	if (oc->chosen && oc->chosen != (void *)-`1UL`)
1169	oom_kill_process(oc, message: !is_memcg_oom(oc) ? "Out of memory" :
1170	"Memory cgroup out of memory");
1171	return !!oc->chosen;
1172	}
1173
1174	/*
1175	* The pagefault handler calls here because some allocation has failed. We have
1176	* to take care of the memcg OOM here because this is the only safe context without
1177	* any locks held but let the oom killer triggered from the allocation context care
1178	* about the global OOM.
1179	*/
1180	void pagefault_out_of_memory(void)
1181	{
1182	static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
1183	DEFAULT_RATELIMIT_BURST);
1184
1185	if (mem_cgroup_oom_synchronize(wait: true))
1186	return;
1187
1188	if (fatal_signal_pending(current))
1189	return;
1190
1191	if (__ratelimit(&pfoom_rs))
1192	pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
1193	}
1194
1195	SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
1196	{
1197	#ifdef CONFIG_MMU
1198	struct mm_struct *mm = NULL;
1199	struct task_struct *task;
1200	struct task_struct *p;
1201	unsigned int f_flags;
1202	bool reap = false;
1203	long ret = `0`;
1204
1205	if (flags)
1206	return -EINVAL;
1207
1208	task = pidfd_get_task(pidfd, flags: &f_flags);
1209	if (IS_ERR(ptr: task))
1210	return PTR_ERR(ptr: task);
1211
1212	/*
1213	* Make sure to choose a thread which still has a reference to mm
1214	* during the group exit
1215	*/
1216	p = find_lock_task_mm(p: task);
1217	if (!p) {
1218	ret = -ESRCH;
1219	goto put_task;
1220	}
1221
1222	mm = p->mm;
1223	mmgrab(mm);
1224
1225	if (task_will_free_mem(task: p))
1226	reap = true;
1227	else {
1228	/ Error only if the work has not been done already /
1229	if (!test_bit(MMF_OOM_SKIP, &mm->flags))
1230	ret = -EINVAL;
1231	}
1232	task_unlock(p);
1233
1234	if (!reap)
1235	goto drop_mm;
1236
1237	if (mmap_read_lock_killable(mm)) {
1238	ret = -EINTR;
1239	goto drop_mm;
1240	}
1241	/*
1242	* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
1243	* possible change in exit_mmap is seen
1244	*/
1245	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
1246	ret = -EAGAIN;
1247	mmap_read_unlock(mm);
1248
1249	drop_mm:
1250	mmdrop(mm);
1251	put_task:
1252	put_task_struct(t: task);
1253	return ret;
1254	#else
1255	return -ENOSYS;
1256	#endif /* CONFIG_MMU */
1257	}
1258

source code of linux/mm/oom_kill.c