// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting.
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include <linux/psi.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
/*
 * Fragmentation score check interval for proactive compaction purposes.
 */
#define HPAGE_FRAG_CHECK_INTERVAL_MSEC	(500)

static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
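
/*
 * Worked example (editorial note, not upstream text): with order == 9 and
 * 4K pages (512-page blocks), block_start_pfn(0x12345, 9) rounds down to
 * 0x12200 and block_end_pfn(0x12345, 9) rounds up to 0x12400, i.e. the
 * inclusive start and exclusive end of the aligned block containing the pfn.
 */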

/*
 * Page order with-respect-to which proactive compaction
 * calculates external fragmentation, which is used as
 * the "fragmentation score" of a node/zone.
 */
#if defined CONFIG_TRANSPARENT_HUGEPAGE
#define COMPACTION_HPAGE_ORDER	HPAGE_PMD_ORDER
#elif defined CONFIG_HUGETLBFS
#define COMPACTION_HPAGE_ORDER	HUGETLB_PAGE_ORDER
#else
#define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
#endif

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long high_pfn = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		unsigned long pfn = page_to_pfn(page);
		list_del(&page->lru);
		__free_page(page);
		if (pfn > high_pfn)
			high_pfn = pfn;
	}

	return high_pfn;
}

static void split_map_pages(struct list_head *list)
{
	unsigned int i, order, nr_pages;
	struct page *page, *next;
	LIST_HEAD(tmp_list);

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);

		order = page_private(page);
		nr_pages = 1 << order;

		post_alloc_hook(page, order, __GFP_MOVABLE);
		if (order)
			split_page(page, order);

		for (i = 0; i < nr_pages; i++) {
			list_add(&page->lru, &tmp_list);
			page++;
		}
	}

	list_splice(&tmp_list, list);
}

#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page)
{
	const struct movable_operations *mops;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (!__PageMovable(page))
		return false;

	mops = page_movable_ops(page);
	if (mops)
		return true;

	return false;
}

void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
	page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);

void __ClearPageMovable(struct page *page)
{
	VM_BUG_ON_PAGE(!PageMovable(page), page);
	/*
	 * This page still has the type of a movable page, but it's
	 * actually not movable any more.
	 */
	page->mapping = (void *)PAGE_MAPPING_MOVABLE;
}
EXPORT_SYMBOL(__ClearPageMovable);

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. The next 1 << compact_defer_shift attempts are
 * then skipped, with compact_defer_shift capped at COMPACT_MAX_DEFER_SHIFT.
 */
static void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
static bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* Avoid possible overflow */
	if (++zone->compact_considered >= defer_limit) {
		zone->compact_considered = defer_limit;
		return false;
	}

	trace_mm_compaction_deferred(zone, order);

	return true;
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
		bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;

	trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
static bool compaction_restarting(struct zone *zone, int order)
{
	if (order < zone->compact_order_failed)
		return false;

	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
		zone->compact_considered >= 1UL << zone->compact_defer_shift;
}
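
/*
 * Illustrative sketch (editorial, not part of the upstream file): the
 * deferral counters above implement exponential backoff. Each failure
 * bumps compact_defer_shift, and compaction_deferred() skips attempts
 * until compact_considered catches up with 1 << compact_defer_shift, so
 * the number of consecutive skipped attempts grows 1, 3, 7, ... up to 63
 * at the COMPACT_MAX_DEFER_SHIFT cap. The hypothetical helper below only
 * demonstrates that arithmetic; nothing in this file calls it.
 */
static inline unsigned long compact_defer_window(unsigned int defer_shift)
{
	/* Cap the shift exactly as defer_compaction() does */
	if (defer_shift > COMPACT_MAX_DEFER_SHIFT)
		defer_shift = COMPACT_MAX_DEFER_SHIFT;

	/* Attempts skipped before compaction_deferred() lets one through */
	return (1UL << defer_shift) - 1;
}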

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

static void reset_cached_positions(struct zone *zone)
{
	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
	zone->compact_cached_free_pfn =
				pageblock_start_pfn(zone_end_pfn(zone) - 1);
}

#ifdef CONFIG_SPARSEMEM
/*
 * If the PFN falls into an offline section, return the start PFN of the
 * next online section. If the PFN falls into an online section or if
 * there is no next online section, return 0.
 */
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	unsigned long start_nr = pfn_to_section_nr(start_pfn);

	if (online_section_nr(start_nr))
		return 0;

	while (++start_nr <= __highest_present_section_nr) {
		if (online_section_nr(start_nr))
			return section_nr_to_pfn(start_nr);
	}

	return 0;
}

/*
 * If the PFN falls into an offline section, return the end PFN of the
 * next online section in reverse. If the PFN falls into an online section
 * or if there is no next online section in reverse, return 0.
 */
static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
{
	unsigned long start_nr = pfn_to_section_nr(start_pfn);

	if (!start_nr || online_section_nr(start_nr))
		return 0;

	while (start_nr-- > 0) {
		if (online_section_nr(start_nr))
			return section_nr_to_pfn(start_nr) + PAGES_PER_SECTION;
	}

	return 0;
}
#else
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	return 0;
}

static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
{
	return 0;
}
#endif

/*
 * Compound pages of >= pageblock_order should consistently be skipped until
 * released. It is always pointless to compact pages of such order (if they are
 * migratable), and the pageblocks they occupy cannot contain any free pages.
 */
static bool pageblock_skip_persistent(struct page *page)
{
	if (!PageCompound(page))
		return false;

	page = compound_head(page);

	if (compound_order(page) >= pageblock_order)
		return true;

	return false;
}

static bool
__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
							bool check_target)
{
	struct page *page = pfn_to_online_page(pfn);
	struct page *block_page;
	struct page *end_page;
	unsigned long block_pfn;

	if (!page)
		return false;
	if (zone != page_zone(page))
		return false;
	if (pageblock_skip_persistent(page))
		return false;

	/*
	 * If skip is already cleared do no further checking once the
	 * restart points have been set.
	 */
	if (check_source && check_target && !get_pageblock_skip(page))
		return true;

	/*
	 * If clearing skip for the target scanner, do not select a
	 * non-movable pageblock as the starting point.
	 */
	if (!check_source && check_target &&
	    get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
		return false;

	/* Ensure the start of the pageblock or zone is online and valid */
	block_pfn = pageblock_start_pfn(pfn);
	block_pfn = max(block_pfn, zone->zone_start_pfn);
	block_page = pfn_to_online_page(block_pfn);
	if (block_page) {
		page = block_page;
		pfn = block_pfn;
	}

	/* Ensure the end of the pageblock or zone is online and valid */
	block_pfn = pageblock_end_pfn(pfn) - 1;
	block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
	end_page = pfn_to_online_page(block_pfn);
	if (!end_page)
		return false;

	/*
	 * Only clear the hint if a sample indicates there is either a
	 * free page or an LRU page in the block. One or other condition
	 * is necessary for the block to be a migration source/target.
	 */
	do {
		if (check_source && PageLRU(page)) {
			clear_pageblock_skip(page);
			return true;
		}

		if (check_target && PageBuddy(page)) {
			clear_pageblock_skip(page);
			return true;
		}

		page += (1 << PAGE_ALLOC_COSTLY_ORDER);
	} while (page <= end_page);

	return false;
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long migrate_pfn = zone->zone_start_pfn;
	unsigned long free_pfn = zone_end_pfn(zone) - 1;
	unsigned long reset_migrate = free_pfn;
	unsigned long reset_free = migrate_pfn;
	bool source_set = false;
	bool free_set = false;

	/* Only flush if a full compaction finished recently */
	if (!zone->compact_blockskip_flush)
		return;

	zone->compact_blockskip_flush = false;

	/*
	 * Walk the zone and update pageblock skip information. The source
	 * scanner looks for PageLRU while the target scanner looks for
	 * PageBuddy. Once a restart point has been found, both PageBuddy
	 * and PageLRU are checked as the pageblock is suitable as both
	 * source and target.
	 */
	for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
					free_pfn -= pageblock_nr_pages) {
		cond_resched();

		/* Update the migrate PFN */
		if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
		    migrate_pfn < reset_migrate) {
			source_set = true;
			reset_migrate = migrate_pfn;
			zone->compact_init_migrate_pfn = reset_migrate;
			zone->compact_cached_migrate_pfn[0] = reset_migrate;
			zone->compact_cached_migrate_pfn[1] = reset_migrate;
		}

		/* Update the free PFN */
		if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
		    free_pfn > reset_free) {
			free_set = true;
			reset_free = free_pfn;
			zone->compact_init_free_pfn = reset_free;
			zone->compact_cached_free_pfn = reset_free;
		}
	}

	/* Leave no distance if no suitable block was reset */
	if (reset_migrate >= reset_free) {
		zone->compact_cached_migrate_pfn[0] = migrate_pfn;
		zone->compact_cached_migrate_pfn[1] = migrate_pfn;
		zone->compact_cached_free_pfn = free_pfn;
	}
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		__reset_isolation_suitable(zone);
	}
}

/*
 * Sets the pageblock skip bit if it was clear. Note that this is a hint,
 * as no locks are required for readers or writers. Returns true if it was
 * already set.
 */
static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
	bool skip;

	/* Do not update if skip hint is being ignored */
	if (cc->ignore_skip_hint)
		return false;

	skip = get_pageblock_skip(page);
	if (!skip && !cc->no_set_skip_hint)
		set_pageblock_skip(page);

	return skip;
}

static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
	struct zone *zone = cc->zone;

	/* Set for isolation rather than compaction */
	if (cc->no_set_skip_hint)
		return;

	pfn = pageblock_end_pfn(pfn);

	/* Update where async and sync compaction should restart */
	if (pfn > zone->compact_cached_migrate_pfn[0])
		zone->compact_cached_migrate_pfn[0] = pfn;
	if (cc->mode != MIGRATE_ASYNC &&
	    pfn > zone->compact_cached_migrate_pfn[1])
		zone->compact_cached_migrate_pfn[1] = pfn;
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long pfn)
{
	struct zone *zone = cc->zone;

	if (cc->no_set_skip_hint)
		return;

	set_pageblock_skip(page);

	if (pfn < zone->compact_cached_free_pfn)
		zone->compact_cached_free_pfn = pfn;
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static inline bool pageblock_skip_persistent(struct page *page)
{
	return false;
}

static inline void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long pfn)
{
}

static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}

static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, trylock and record if the
 * lock is contended. The lock will still be acquired but compaction will
 * abort when the current block is finished regardless of success rate.
 * Sync compaction acquires the lock.
 *
 * Always returns true which makes it easier to track lock state in callers.
 */
static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
	__acquires(lock)
{
	/* Track if the lock is contended in async mode */
	if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
		if (spin_trylock_irqsave(lock, *flags))
			return true;

		cc->contended = true;
	}

	spin_lock_irqsave(lock, *flags);
	return true;
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending.
 * Returns false when compaction can continue.
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = true;
		return true;
	}

	cond_resched();

	return false;
}
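
/*
 * Usage sketch (editorial; it mirrors the callers further down in this
 * file): the two helpers above pair up so a scanner takes the zone lock
 * lazily on the first page that needs it and periodically drops it:
 *
 *	if (!(pfn % COMPACT_CLUSTER_MAX) &&
 *	    compact_unlock_should_abort(&cc->zone->lock, flags, &locked, cc))
 *		break;		// fatal signal pending, abort the scan
 *	...
 *	if (!locked)
 *		locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc);
 */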

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				unsigned int stride,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *page;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;
	unsigned int order;

	/* Strict mode is for isolation, speed is secondary */
	if (strict)
		stride = 1;

	page = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn += stride, page += stride) {
		int isolated;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort if fatal signal
		 * pending.
		 */
		if (!(blockpfn % COMPACT_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;

		/*
		 * For compound pages such as THP and hugetlbfs, we can save
		 * potentially a lot of iterations if we skip them at once.
		 * The check is racy, but we can consider only valid values
		 * and the only danger is skipping too much.
		 */
		if (PageCompound(page)) {
			const unsigned int order = compound_order(page);

			if (blockpfn + (1UL << order) <= end_pfn) {
				blockpfn += (1UL << order) - 1;
				page += (1UL << order) - 1;
				nr_scanned += (1UL << order) - 1;
			}

			goto isolate_fail;
		}

		if (!PageBuddy(page))
			goto isolate_fail;

		/* If we already hold the lock, we can skip some rechecking. */
		if (!locked) {
			locked = compact_lock_irqsave(&cc->zone->lock,
								&flags, cc);

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, will break it into order-0 pages */
		order = buddy_order(page);
		isolated = __isolate_free_page(page, order);
		if (!isolated)
			break;
		set_page_private(page, order);

		nr_scanned += isolated - 1;
		total_isolated += isolated;
		cc->nr_freepages += isolated;
		list_add_tail(&page->lru, freelist);

		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
			blockpfn += isolated;
			break;
		}
		/* Advance to the end of split page */
		blockpfn += isolated - 1;
		page += isolated - 1;
		continue;

isolate_fail:
		if (strict)
			break;

	}

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/*
	 * Be careful to not go outside of the pageblock.
	 */
	if (unlikely(blockpfn > end_pfn))
		blockpfn = end_pfn;

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	cc->total_free_scanned += nr_scanned;
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, and cause the function
 * to undo its actions and return zero.
 *
 * Otherwise, the function returns the one-past-the-last PFN of the isolated
 * pages (which may be greater than end_pfn if the end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn += isolated,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {
		/* Protect pfn from changing by isolate_freepages_block */
		unsigned long isolate_start_pfn = pfn;

		/*
		 * pfn could pass the block_end_pfn if isolated freepage
		 * is more than pageblock order. In this case, we adjust
		 * scanning range to right one.
		 */
		if (pfn >= block_end_pfn) {
			block_start_pfn = pageblock_start_pfn(pfn);
			block_end_pfn = pageblock_end_pfn(pfn);
		}

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
					block_end_pfn, &freelist, 0, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n. (Max order
		 * page may span two pageblocks).
		 */
	}

	/* __isolate_free_page() does not map the pages */
	split_map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct compact_control *cc)
{
	pg_data_t *pgdat = cc->zone->zone_pgdat;
	bool too_many;

	unsigned long active, inactive, isolated;

	inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
			node_page_state(pgdat, NR_INACTIVE_ANON);
	active = node_page_state(pgdat, NR_ACTIVE_FILE) +
			node_page_state(pgdat, NR_ACTIVE_ANON);
	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
			node_page_state(pgdat, NR_ISOLATED_ANON);

	/*
	 * Allow GFP_NOFS to isolate past the limit set for regular
	 * compaction runs. This prevents an ABBA deadlock when other
	 * compactors have already isolated to the limit, but are
	 * blocked on filesystem locks held by the GFP_NOFS thread.
	 */
	if (cc->gfp_mask & __GFP_FS) {
		inactive >>= 3;
		active >>= 3;
	}

	too_many = isolated > (inactive + active) / 2;
	if (!too_many)
		wake_throttle_isolated(pgdat);

	return too_many;
}
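
/*
 * Worked example (editorial): with 1000 inactive and 1000 active pages on
 * the node, a regular __GFP_FS compactor counts as throttled once more than
 * (125 + 125) / 2 = 125 pages are isolated, while a GFP_NOFS caller may
 * isolate up to (1000 + 1000) / 2 = 1000 before hitting the limit.
 */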

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within the same pageblock
 * @mode:	Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
 * Returns an errno: -EAGAIN or -EINTR when e.g. the LRU is congested or a
 * signal is pending, -ENOMEM when we could not allocate a page, or 0 on
 * success. cc->migrate_pfn will contain the next pfn to scan.
 *
 * The pages are isolated on the cc->migratepages list (not required to be
 * empty), and cc->nr_migratepages is updated accordingly.
 */
static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t mode)
{
	pg_data_t *pgdat = cc->zone->zone_pgdat;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	struct lruvec *locked = NULL;
	struct folio *folio = NULL;
	struct page *page = NULL, *valid_page = NULL;
	struct address_space *mapping;
	unsigned long start_pfn = low_pfn;
	bool skip_on_failure = false;
	unsigned long next_skip_pfn = 0;
	bool skip_updated = false;
	int ret = 0;

	cc->migrate_pfn = low_pfn;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(cc))) {
		/* stop isolation if there are still pages not migrated */
		if (cc->nr_migratepages)
			return -EAGAIN;

		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return -EAGAIN;

		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);

		if (fatal_signal_pending(current))
			return -EINTR;
	}

	cond_resched();

	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
		skip_on_failure = true;
		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
	}

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {

		if (skip_on_failure && low_pfn >= next_skip_pfn) {
			/*
			 * We have isolated all migration candidates in the
			 * previous order-aligned block, and did not skip it due
			 * to failure. We should migrate the pages now and
			 * hopefully succeed compaction.
			 */
			if (nr_isolated)
				break;

			/*
			 * We failed to isolate in the previous order-aligned
			 * block. Set the new boundary to the end of the
			 * current block. Note we can't simply increase
			 * next_skip_pfn by 1 << order, as low_pfn might have
			 * been incremented by a higher number due to skipping
			 * a compound or a high-order buddy page in the
			 * previous loop iteration.
			 */
			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
		}

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort completely if
		 * a fatal signal is pending.
		 */
		if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
			if (locked) {
				unlock_page_lruvec_irqrestore(locked, flags);
				locked = NULL;
			}

			if (fatal_signal_pending(current)) {
				cc->contended = true;
				ret = -EINTR;

				goto fatal_pending;
			}

			cond_resched();
		}

		nr_scanned++;

		page = pfn_to_page(low_pfn);

		/*
		 * Check if the pageblock has already been marked skipped.
		 * Only the first PFN is checked as the caller isolates
		 * COMPACT_CLUSTER_MAX at a time so the second call must
		 * not falsely conclude that the block should be skipped.
		 */
		if (!valid_page && (pageblock_aligned(low_pfn) ||
				    low_pfn == cc->zone->zone_start_pfn)) {
			if (!isolation_suitable(cc, page)) {
				low_pfn = end_pfn;
				folio = NULL;
				goto isolate_abort;
			}
			valid_page = page;
		}

		if (PageHuge(page) && cc->alloc_contig) {
			if (locked) {
				unlock_page_lruvec_irqrestore(locked, flags);
				locked = NULL;
			}

			ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);

			/*
			 * Fail isolation in case isolate_or_dissolve_huge_page()
			 * reports an error. In case of -ENOMEM, abort right away.
			 */
			if (ret < 0) {
				/* Do not report -EBUSY down the chain */
				if (ret == -EBUSY)
					ret = 0;
				low_pfn += compound_nr(page) - 1;
				nr_scanned += compound_nr(page) - 1;
				goto isolate_fail;
			}

			if (PageHuge(page)) {
				/*
				 * Hugepage was successfully isolated and placed
				 * on the cc->migratepages list.
				 */
				folio = page_folio(page);
				low_pfn += folio_nr_pages(folio) - 1;
				goto isolate_success_no_list;
			}

			/*
			 * Ok, the hugepage was dissolved. Now these pages are
			 * Buddy and cannot be re-allocated because they are
			 * isolated. Fall-through as the check below handles
			 * Buddy pages.
			 */
		}

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = buddy_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order <= MAX_ORDER) {
				low_pfn += (1UL << freepage_order) - 1;
				nr_scanned += (1UL << freepage_order) - 1;
			}
			continue;
		}

		/*
		 * Regardless of being on LRU, compound pages such as THP and
		 * hugetlbfs are not to be compacted unless we are attempting
		 * an allocation much larger than the huge page size (eg CMA).
		 * We can potentially save a lot of iterations if we skip them
		 * at once. The check is racy, but we can consider only valid
		 * values and the only danger is skipping too much.
		 */
		if (PageCompound(page) && !cc->alloc_contig) {
			const unsigned int order = compound_order(page);

			if (likely(order <= MAX_ORDER)) {
				low_pfn += (1UL << order) - 1;
				nr_scanned += (1UL << order) - 1;
			}
			goto isolate_fail;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU and non-LRU movable pages.
		 * Skip any other type of page.
		 */
		if (!PageLRU(page)) {
			/*
			 * __PageMovable can return false positive so we need
			 * to verify it under page_lock.
			 */
			if (unlikely(__PageMovable(page)) &&
					!PageIsolated(page)) {
				if (locked) {
					unlock_page_lruvec_irqrestore(locked, flags);
					locked = NULL;
				}

				if (isolate_movable_page(page, mode)) {
					folio = page_folio(page);
					goto isolate_success;
				}
			}

			goto isolate_fail;
		}

		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		folio = folio_get_nontail_page(page);
		if (unlikely(!folio))
			goto isolate_fail;

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		mapping = folio_mapping(folio);
		if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio))
			goto isolate_fail_put;

		/*
		 * Only allow to migrate anonymous pages in GFP_NOFS context
		 * because those do not depend on fs locks.
		 */
		if (!(cc->gfp_mask & __GFP_FS) && mapping)
			goto isolate_fail_put;

		/* Only take pages on LRU: a check now makes later tests safe */
		if (!folio_test_lru(folio))
			goto isolate_fail_put;

		/* Compaction might skip unevictable pages but CMA takes them */
		if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
			goto isolate_fail_put;

		/*
		 * To minimise LRU disruption, the caller can indicate with
		 * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
		 * it will be able to migrate without blocking - clean pages
		 * for the most part. PageWriteback would require blocking.
		 */
		if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
			goto isolate_fail_put;

		if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
			bool migrate_dirty;

			/*
			 * Only folios without mappings or that have
			 * a ->migrate_folio callback are possible to
			 * migrate without blocking. However, we may
			 * be racing with truncation, which can free
			 * the mapping. Truncation holds the folio lock
			 * until after the folio is removed from the page
			 * cache so holding it ourselves is sufficient.
			 */
			if (!folio_trylock(folio))
				goto isolate_fail_put;

			mapping = folio_mapping(folio);
			migrate_dirty = !mapping ||
					mapping->a_ops->migrate_folio;
			folio_unlock(folio);
			if (!migrate_dirty)
				goto isolate_fail_put;
		}

		/* Try isolate the folio */
		if (!folio_test_clear_lru(folio))
			goto isolate_fail_put;

		lruvec = folio_lruvec(folio);

		/* If we already hold the lock, we can skip some rechecking */
		if (lruvec != locked) {
			if (locked)
				unlock_page_lruvec_irqrestore(locked, flags);

			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
			locked = lruvec;

			lruvec_memcg_debug(lruvec, folio);

			/*
			 * Try get exclusive access under lock. If marked for
			 * skip, the scan is aborted unless the current context
			 * is a rescan to reach the end of the pageblock.
			 */
			if (!skip_updated && valid_page) {
				skip_updated = true;
				if (test_and_set_skip(cc, valid_page) &&
				    !cc->finish_pageblock) {
					low_pfn = end_pfn;
					goto isolate_abort;
				}
			}

			/*
			 * The folio may have become large since the
			 * non-locked check, and it's still on the LRU.
			 */
			if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
				low_pfn += folio_nr_pages(folio) - 1;
				nr_scanned += folio_nr_pages(folio) - 1;
				folio_set_lru(folio);
				goto isolate_fail_put;
			}
		}

		/* The folio is taken off the LRU */
		if (folio_test_large(folio))
			low_pfn += folio_nr_pages(folio) - 1;

		/* Successfully isolated */
		lruvec_del_folio(lruvec, folio);
		node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));

isolate_success:
		list_add(&folio->lru, &cc->migratepages);
isolate_success_no_list:
		cc->nr_migratepages += folio_nr_pages(folio);
		nr_isolated += folio_nr_pages(folio);
		nr_scanned += folio_nr_pages(folio) - 1;

		/*
		 * Avoid isolating too much unless this block is being
		 * fully scanned (e.g. dirty/writeback pages, parallel allocation)
		 * or a lock is contended. For contention, isolate quickly to
		 * potentially remove one source of contention.
		 */
		if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
		    !cc->finish_pageblock && !cc->contended) {
			++low_pfn;
			break;
		}

		continue;

isolate_fail_put:
		/* Avoid potential deadlock in freeing page under lru_lock */
		if (locked) {
			unlock_page_lruvec_irqrestore(locked, flags);
			locked = NULL;
		}
		folio_put(folio);

isolate_fail:
		if (!skip_on_failure && ret != -ENOMEM)
			continue;

		/*
		 * We have isolated some pages, but then failed. Release them
		 * instead of migrating, as we cannot form the cc->order buddy
		 * page anyway.
		 */
		if (nr_isolated) {
			if (locked) {
				unlock_page_lruvec_irqrestore(locked, flags);
				locked = NULL;
			}
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			nr_isolated = 0;
		}

		if (low_pfn < next_skip_pfn) {
			low_pfn = next_skip_pfn - 1;
			/*
			 * The check near the loop beginning would have updated
			 * next_skip_pfn too, but this is a bit simpler.
			 */
			next_skip_pfn += 1UL << cc->order;
		}

		if (ret == -ENOMEM)
			break;
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	folio = NULL;

isolate_abort:
	if (locked)
		unlock_page_lruvec_irqrestore(locked, flags);
	if (folio) {
		folio_set_lru(folio);
		folio_put(folio);
	}

	/*
	 * Update the cached scanner pfn once the pageblock has been scanned.
	 * Pages will either be migrated in which case there is no point
	 * scanning in the near future or migration failed in which case the
	 * failure reason may persist. The block is marked for skipping if
	 * there were no pages isolated in the block or if the block is
	 * rescanned twice in a row.
	 */
	if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
		if (!cc->no_set_skip_hint && valid_page && !skip_updated)
			set_pageblock_skip(valid_page);
		update_cached_migrate(cc, low_pfn);
	}

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						 nr_scanned, nr_isolated);

fatal_pending:
	cc->total_migrate_scanned += nr_scanned;
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	cc->migrate_pfn = low_pfn;

	return ret;
}

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
 * in case we could not allocate a page, or 0.
 */
int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
			unsigned long end_pfn)
{
	unsigned long pfn, block_start_pfn, block_end_pfn;
	int ret = 0;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			continue;

		ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
						 ISOLATE_UNEVICTABLE);

		if (ret)
			break;

		if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
			break;
	}

	return ret;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

static bool suitable_migration_source(struct compact_control *cc,
							struct page *page)
{
	int block_mt;

	if (pageblock_skip_persistent(page))
		return false;

	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
		return true;

	block_mt = get_pageblock_migratetype(page);

	if (cc->migratetype == MIGRATE_MOVABLE)
		return is_migrate_movable(block_mt);
	else
		return block_mt == cc->migratetype;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
							struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page)) {
		/*
		 * We are checking page_order without zone->lock taken. But
		 * the only small danger is that we skip a potentially suitable
		 * pageblock, so it's not worth checking the order for a valid
		 * range.
		 */
		if (buddy_order_unsafe(page) >= pageblock_order)
			return false;
	}

	if (cc->ignore_block_suitable)
		return true;

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (is_migrate_movable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

static inline unsigned int
freelist_scan_limit(struct compact_control *cc)
{
	unsigned short shift = BITS_PER_LONG - 1;

	return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
}
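
/*
 * Worked example (editorial): assuming COMPACT_CLUSTER_MAX is 32 (its usual
 * value, SWAP_CLUSTER_MAX), the limit halves with each fast-search failure:
 * 33, 17, 9, 5, 3, 2, 1, ... so a repeatedly failing fast search quickly
 * costs almost nothing.
 */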

/*
 * Test whether the free scanner has reached the same or lower pageblock than
 * the migration scanner, and compaction should thus terminate.
 */
static inline bool compact_scanners_met(struct compact_control *cc)
{
	return (cc->free_pfn >> pageblock_order)
		<= (cc->migrate_pfn >> pageblock_order);
}

/*
 * Used when scanning for a suitable migration target which scans freelists
 * in reverse. Reorders the list so that the unscanned pages are scanned
 * first on the next iteration of the free scanner.
 */
static void
move_freelist_head(struct list_head *freelist, struct page *freepage)
{
	LIST_HEAD(sublist);

	if (!list_is_first(&freepage->buddy_list, freelist)) {
		list_cut_before(&sublist, freelist, &freepage->buddy_list);
		list_splice_tail(&sublist, freelist);
	}
}

/*
 * Similar to move_freelist_head except used by the migration scanner
 * when scanning forward. It's possible for these list operations to
 * move against each other if they search the free list exactly in
 * lockstep.
 */
static void
move_freelist_tail(struct list_head *freelist, struct page *freepage)
{
	LIST_HEAD(sublist);

	if (!list_is_last(&freepage->buddy_list, freelist)) {
		list_cut_position(&sublist, freelist, &freepage->buddy_list);
		list_splice_tail(&sublist, freelist);
	}
}
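
/*
 * Worked example (editorial): given a freelist A-B-C-D and freepage C,
 * move_freelist_head() cuts A-B (everything ahead of C) and splices it to
 * the tail, yielding C-D-A-B, while move_freelist_tail() cuts A-B-C (up to
 * and including C) and splices it to the tail, yielding D-A-B-C. The
 * helpers only reorder; no entry is added or removed.
 */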

static void
fast_isolate_around(struct compact_control *cc, unsigned long pfn)
{
	unsigned long start_pfn, end_pfn;
	struct page *page;

	/* Do not search around if there are enough pages already */
	if (cc->nr_freepages >= cc->nr_migratepages)
		return;

	/* Minimise scanning during async compaction */
	if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
		return;

	/* Pageblock boundaries */
	start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
	end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));

	page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
	if (!page)
		return;

	isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);

	/* Skip this pageblock in the future as it's full or nearly full */
	if (start_pfn == end_pfn && !cc->no_set_skip_hint)
		set_pageblock_skip(page);
}

/* Search orders in round-robin fashion */
static int next_search_order(struct compact_control *cc, int order)
{
	order--;
	if (order < 0)
		order = cc->order - 1;

	/* Search wrapped around? */
	if (order == cc->search_order) {
		cc->search_order--;
		if (cc->search_order < 0)
			cc->search_order = cc->order - 1;
		return -1;
	}

	return order;
}
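
/*
 * Worked example (editorial): with cc->order == 5 and cc->search_order == 2,
 * the loop below visits orders 2, 1, 0, 4, 3 and stops once the search wraps
 * back around to search_order; the wrap also decrements cc->search_order so
 * the next invocation starts one order lower.
 */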

static void fast_isolate_freepages(struct compact_control *cc)
{
	unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
	unsigned int nr_scanned = 0, total_isolated = 0;
	unsigned long low_pfn, min_pfn, highest = 0;
	unsigned long nr_isolated = 0;
	unsigned long distance;
	struct page *page = NULL;
	bool scan_start = false;
	int order;

	/* Full compaction passes in a negative order */
	if (cc->order <= 0)
		return;

	/*
	 * If starting the scan, use a deeper search and use the highest
	 * PFN found if a suitable one is not found.
	 */
	if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
		limit = pageblock_nr_pages >> 1;
		scan_start = true;
	}

	/*
	 * Preferred point is in the top quarter of the scan space but take
	 * a pfn from the top half if the search is problematic.
	 */
	distance = (cc->free_pfn - cc->migrate_pfn);
	low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
	min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));

	if (WARN_ON_ONCE(min_pfn > low_pfn))
		low_pfn = min_pfn;

	/*
	 * Search starts from the last successful isolation order or the next
	 * order to search after a previous failure
	 */
	cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);

	for (order = cc->search_order;
	     !page && order >= 0;
	     order = next_search_order(cc, order)) {
		struct free_area *area = &cc->zone->free_area[order];
		struct list_head *freelist;
		struct page *freepage;
		unsigned long flags;
		unsigned int order_scanned = 0;
		unsigned long high_pfn = 0;

		if (!area->nr_free)
			continue;

		spin_lock_irqsave(&cc->zone->lock, flags);
		freelist = &area->free_list[MIGRATE_MOVABLE];
		list_for_each_entry_reverse(freepage, freelist, buddy_list) {
			unsigned long pfn;

			order_scanned++;
			nr_scanned++;
			pfn = page_to_pfn(freepage);

			if (pfn >= highest)
				highest = max(pageblock_start_pfn(pfn),
					      cc->zone->zone_start_pfn);

			if (pfn >= low_pfn) {
				cc->fast_search_fail = 0;
				cc->search_order = order;
				page = freepage;
				break;
			}

			if (pfn >= min_pfn && pfn > high_pfn) {
				high_pfn = pfn;

				/* Shorten the scan if a candidate is found */
				limit >>= 1;
			}

			if (order_scanned >= limit)
				break;
		}

		/* Use a maximum candidate pfn if a preferred one was not found */
		if (!page && high_pfn) {
			page = pfn_to_page(high_pfn);

			/* Update freepage for the list reorder below */
			freepage = page;
		}

		/* Reorder so a future search skips recent pages */
		move_freelist_head(freelist, freepage);

		/* Isolate the page if available */
		if (page) {
			if (__isolate_free_page(page, order)) {
				set_page_private(page, order);
				nr_isolated = 1 << order;
				nr_scanned += nr_isolated - 1;
				total_isolated += nr_isolated;
				cc->nr_freepages += nr_isolated;
				list_add_tail(&page->lru, &cc->freepages);
				count_compact_events(COMPACTISOLATED, nr_isolated);
			} else {
				/* If isolation fails, abort the search */
				order = cc->search_order + 1;
				page = NULL;
			}
		}

		spin_unlock_irqrestore(&cc->zone->lock, flags);

		/* Skip fast search if enough freepages isolated */
		if (cc->nr_freepages >= cc->nr_migratepages)
			break;

		/*
		 * Smaller scan on next order so the total scan is related
		 * to freelist_scan_limit.
		 */
		if (order_scanned >= limit)
			limit = max(1U, limit >> 1);
	}

	trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn,
						   nr_scanned, total_isolated);

	if (!page) {
		cc->fast_search_fail++;
		if (scan_start) {
			/*
			 * Use the highest PFN found above min. If one was
			 * not found, be pessimistic for direct compaction
			 * and use the min mark.
			 */
			if (highest >= min_pfn) {
				page = pfn_to_page(highest);
				cc->free_pfn = highest;
			} else {
				if (cc->direct_compaction && pfn_valid(min_pfn)) {
					page = pageblock_pfn_to_page(min_pfn,
						min(pageblock_end_pfn(min_pfn),
						    zone_end_pfn(cc->zone)),
						cc->zone);
					cc->free_pfn = min_pfn;
				}
			}
		}
	}

	if (highest && highest >= cc->zone->compact_cached_free_pfn) {
		highest -= pageblock_nr_pages;
		cc->zone->compact_cached_free_pfn = highest;
	}

	cc->total_free_scanned += nr_scanned;
	if (!page)
		return;

	low_pfn = page_to_pfn(page);
	fast_isolate_around(cc, low_pfn);
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	struct zone *zone = cc->zone;
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long isolate_start_pfn; /* exact pfn we start at */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
	struct list_head *freelist = &cc->freepages;
	unsigned int stride;

	/* Try a small search of the free lists for a candidate */
	fast_isolate_freepages(cc);
	if (cc->nr_freepages)
		goto splitmap;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. For looping we also need
	 * this pfn aligned down to the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in last pageblock of a
	 * zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	isolate_start_pfn = cc->free_pfn;
	block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
	stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages,
				isolate_start_pfn = block_start_pfn) {
		unsigned long nr_isolated;

		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check resched.
		 */
		if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
			cond_resched();

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page) {
			unsigned long next_pfn;

			next_pfn = skip_offline_sections_reverse(block_start_pfn);
			if (next_pfn)
				block_start_pfn = max(next_pfn, low_pfn);

			continue;
		}

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(cc, page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/* Found a block suitable for isolating free pages from. */
		nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
					block_end_pfn, freelist, stride, false);

		/* Update the skip hint if the full pageblock was scanned */
		if (isolate_start_pfn == block_end_pfn)
			update_pageblock_skip(cc, page, block_start_pfn -
					      pageblock_nr_pages);

		/* Are enough freepages isolated? */
		if (cc->nr_freepages >= cc->nr_migratepages) {
			if (isolate_start_pfn >= block_end_pfn) {
				/*
				 * Restart at previous pageblock if more
				 * freepages can be isolated next time.
				 */
				isolate_start_pfn =
					block_start_pfn - pageblock_nr_pages;
			}
			break;
		} else if (isolate_start_pfn < block_end_pfn) {
			/*
			 * If isolation failed early, do not continue
			 * needlessly.
			 */
			break;
		}

		/* Adjust stride depending on isolation */
		if (nr_isolated) {
			stride = 1;
			continue;
		}
		stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
	}

	/*
	 * Record where the free scanner will restart next time. Either we
	 * broke from the loop and set isolate_start_pfn based on the last
	 * call to isolate_freepages_block(), or we met the migration scanner
	 * and the loop terminated due to isolate_start_pfn < low_pfn
	 */
	cc->free_pfn = isolate_start_pfn;

splitmap:
	/* __isolate_free_page() does not map the pages */
	split_map_pages(freelist);
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct folio *dst;

	if (list_empty(&cc->freepages)) {
		isolate_freepages(cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	dst = list_entry(cc->freepages.next, struct folio, lru);
	list_del(&dst->lru);
	cc->nr_freepages--;

	return dst;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct folio *dst, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&dst->lru, &cc->freepages);
	cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
/*
 * Tunable for proactive compaction. It determines how
 * aggressively the kernel should compact memory in the
 * background. It takes values in the range [0, 100].
 */
static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
static int sysctl_extfrag_threshold = 500;
static int __read_mostly sysctl_compact_memory;

static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
{
	if (cc->fast_start_pfn == ULONG_MAX)
		return;

	if (!cc->fast_start_pfn)
		cc->fast_start_pfn = pfn;

	cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
}

static inline unsigned long
reinit_migrate_pfn(struct compact_control *cc)
{
	if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
		return cc->migrate_pfn;

	cc->migrate_pfn = cc->fast_start_pfn;
	cc->fast_start_pfn = ULONG_MAX;

	return cc->migrate_pfn;
}

/*
 * Briefly search the free lists for a migration source that already has
 * some free pages to reduce the number of pages that need migration
 * before a pageblock is free.
 */
static unsigned long fast_find_migrateblock(struct compact_control *cc)
{
	unsigned int limit = freelist_scan_limit(cc);
	unsigned int nr_scanned = 0;
	unsigned long distance;
	unsigned long pfn = cc->migrate_pfn;
	unsigned long high_pfn;
	int order;
	bool found_block = false;

	/* Skip hints are relied on to avoid repeats on the fast search */
	if (cc->ignore_skip_hint)
		return pfn;

	/*
	 * If the pageblock should be finished then do not select a different
	 * pageblock.
	 */
	if (cc->finish_pageblock)
		return pfn;

	/*
	 * If the migrate_pfn is not at the start of a zone or the start
	 * of a pageblock then assume this is a continuation of a previous
	 * scan restarted due to COMPACT_CLUSTER_MAX.
	 */
	if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
		return pfn;

	/*
	 * For smaller orders, just linearly scan as the number of pages
	 * to migrate should be relatively small and does not necessarily
	 * justify freeing up a large block for a small allocation.
	 */
	if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
		return pfn;

	/*
	 * Only allow kcompactd and direct requests for movable pages to
	 * quickly clear out a MOVABLE pageblock for allocation. This
	 * reduces the risk that a large movable pageblock is freed for
	 * an unmovable/reclaimable small allocation.
	 */
	if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
		return pfn;

	/*
	 * When starting the migration scanner, pick any pageblock within the
	 * first half of the search space. Otherwise try and pick a pageblock
	 * within the first eighth to reduce the chances that a migration
	 * target later becomes a source.
	 */
	distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
	if (cc->migrate_pfn != cc->zone->zone_start_pfn)
		distance >>= 2;
	high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);

	for (order = cc->order - 1;
	     order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
	     order--) {
		struct free_area *area = &cc->zone->free_area[order];
		struct list_head *freelist;
		unsigned long flags;
		struct page *freepage;

		if (!area->nr_free)
			continue;

		spin_lock_irqsave(&cc->zone->lock, flags);
		freelist = &area->free_list[MIGRATE_MOVABLE];
		list_for_each_entry(freepage, freelist, buddy_list) {
			unsigned long free_pfn;

			if (nr_scanned++ >= limit) {
				move_freelist_tail(freelist, freepage);
				break;
			}

			free_pfn = page_to_pfn(freepage);
			if (free_pfn < high_pfn) {
				/*
				 * Avoid if skipped recently. Ideally it would
				 * move to the tail but even safe iteration of
				 * the list assumes an entry is deleted, not
				 * reordered.
				 */
				if (get_pageblock_skip(freepage))
					continue;
1932
1933 /* Reorder to so a future search skips recent pages */
1934 move_freelist_tail(freelist, freepage);
1935
1936 update_fast_start_pfn(cc, pfn: free_pfn);
1937 pfn = pageblock_start_pfn(free_pfn);
1938 if (pfn < cc->zone->zone_start_pfn)
1939 pfn = cc->zone->zone_start_pfn;
1940 cc->fast_search_fail = 0;
1941 found_block = true;
1942 break;
1943 }
1944 }
1945 spin_unlock_irqrestore(lock: &cc->zone->lock, flags);
1946 }
1947
1948 cc->total_migrate_scanned += nr_scanned;
1949
1950 /*
1951 * If fast scanning failed then use a cached entry for a page block
1952 * that had free pages as the basis for starting a linear scan.
1953 */
1954 if (!found_block) {
1955 cc->fast_search_fail++;
1956 pfn = reinit_migrate_pfn(cc);
1957 }
1958 return pfn;
1959}
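
/*
 * Worked example for the search-space limiting above (illustrative
 * numbers): with migrate_pfn at the zone start and free_pfn 1<<20 pfns
 * away, high_pfn caps the fast search to the first half (1<<19 pfns).
 * On a continuation pass, where migrate_pfn has moved past zone_start_pfn,
 * the additional ">>= 2" narrows the window to the first eighth, reducing
 * the chance that a block picked as a migration source later becomes a
 * migration target.
 */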

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
	unsigned long block_start_pfn;
	unsigned long block_end_pfn;
	unsigned long low_pfn;
	struct page *page;
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
	bool fast_find_block;

	/*
	 * Start at where we last stopped, or beginning of the zone as
	 * initialized by compact_zone(). The first failure will use
	 * the lowest PFN as the starting point for linear scanning.
	 */
	low_pfn = fast_find_migrateblock(cc);
	block_start_pfn = pageblock_start_pfn(low_pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;

	/*
	 * fast_find_migrateblock() has already ensured the pageblock is not
	 * set with a skipped flag, so to avoid the isolation_suitable check
	 * below again, check whether the fast search was successful.
	 */
	fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;

	/* Only scan within a pageblock boundary */
	block_end_pfn = pageblock_end_pfn(low_pfn);

	/*
	 * Iterate over whole pageblocks until we find the first suitable.
	 * Do not cross the free scanner.
	 */
	for (; block_end_pfn <= cc->free_pfn;
			fast_find_block = false,
			cc->migrate_pfn = low_pfn = block_end_pfn,
			block_start_pfn = block_end_pfn,
			block_end_pfn += pageblock_nr_pages) {

		/*
		 * This can potentially iterate a massively long zone with
		 * many pageblocks unsuitable, so periodically check if we
		 * need to schedule.
		 */
		if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
			cond_resched();

		page = pageblock_pfn_to_page(block_start_pfn,
						block_end_pfn, cc->zone);
		if (!page) {
			unsigned long next_pfn;

			next_pfn = skip_offline_sections(block_start_pfn);
			if (next_pfn)
				block_end_pfn = min(next_pfn, cc->free_pfn);
			continue;
		}

		/*
		 * If isolation recently failed, do not retry. Only check the
		 * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
		 * to be visited multiple times. Assume skip was checked
		 * before making it "skip" so other compaction instances do
		 * not scan the same block.
		 */
		if ((pageblock_aligned(low_pfn) ||
		     low_pfn == cc->zone->zone_start_pfn) &&
		    !fast_find_block && !isolation_suitable(cc, page))
			continue;

		/*
		 * For async direct compaction, only scan the pageblocks of the
		 * same migratetype without huge pages. Async direct compaction
		 * is optimistic to see if the minimum amount of work satisfies
		 * the allocation. The cached PFN is updated as it's possible
		 * that all remaining blocks between source and target are
		 * unsuitable and the compaction scanners fail to meet.
		 */
		if (!suitable_migration_source(cc, page)) {
			update_cached_migrate(cc, block_end_pfn);
			continue;
		}

		/* Perform the isolation */
		if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
						isolate_mode))
			return ISOLATE_ABORT;

		/*
		 * Either we isolated something and proceed with migration, or
		 * we failed and compact_zone() should decide whether to
		 * continue or not.
		 */
		break;
	}

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

/*
 * order == -1 is expected when compacting proactively via
 * 1. /proc/sys/vm/compact_memory
 * 2. /sys/devices/system/node/nodex/compact
 * 3. /proc/sys/vm/compaction_proactiveness
 */
static inline bool is_via_compact_memory(int order)
{
	return order == -1;
}

/*
 * Determine whether kswapd is (or recently was!) running on this node.
 *
 * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
 * zero it.
 */
static bool kswapd_is_running(pg_data_t *pgdat)
{
	bool running;

	pgdat_kswapd_lock(pgdat);
	running = pgdat->kswapd && task_is_running(pgdat->kswapd);
	pgdat_kswapd_unlock(pgdat);

	return running;
}

/*
 * A zone's fragmentation score is the external fragmentation with respect
 * to the COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
 */
static unsigned int fragmentation_score_zone(struct zone *zone)
{
	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
}

/*
 * A weighted zone's fragmentation score is the external fragmentation
 * with respect to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
 * returns a value in the range [0, 100].
 *
 * The scaling factor ensures that proactive compaction focuses on larger
 * zones like ZONE_NORMAL, rather than smaller, specialized zones like
 * ZONE_DMA32. For smaller zones, the score value remains close to zero,
 * and thus never exceeds the high threshold for proactive compaction.
 */
static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
	unsigned long score;

	score = zone->present_pages * fragmentation_score_zone(zone);
	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
}
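
/*
 * Worked example (illustrative numbers): a zone holding a quarter of the
 * node's present pages with a raw score of 60 contributes roughly
 * 60 * 1/4 = 15 to the node score, while a small zone such as ZONE_DMA32
 * holding 1/100 of the node contributes less than 1 even when fully
 * fragmented.
 */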

/*
 * The per-node proactive (background) compaction process is started by its
 * corresponding kcompactd thread when the node's fragmentation score
 * exceeds the high threshold. The compaction process remains active till
 * the node's score falls below the low threshold, or one of the back-off
 * conditions is met.
 */
static unsigned int fragmentation_score_node(pg_data_t *pgdat)
{
	unsigned int score = 0;
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone;

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;
		score += fragmentation_score_zone_weighted(zone);
	}

	return score;
}

static unsigned int fragmentation_score_wmark(bool low)
{
	unsigned int wmark_low;

	/*
	 * Cap the low watermark to avoid excessive compaction
	 * activity in case a user sets the proactiveness tunable
	 * close to 100 (maximum).
	 */
	wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
	return low ? wmark_low : min(wmark_low + 10, 100U);
}
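
/*
 * Worked example: with the default proactiveness of 20, wmark_low is
 * max(100 - 20, 5) = 80 and wmark_high is min(80 + 10, 100) = 90, so
 * proactive compaction starts above a node score of 90 and stops below
 * 80. At proactiveness 100 the floor keeps wmark_low at 5 (and
 * wmark_high at 15) to avoid compacting continuously.
 */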

static bool should_proactive_compact_node(pg_data_t *pgdat)
{
	int wmark_high;

	if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
		return false;

	wmark_high = fragmentation_score_wmark(false);
	return fragmentation_score_node(pgdat) > wmark_high;
}

static enum compact_result __compact_finished(struct compact_control *cc)
{
	unsigned int order;
	const int migratetype = cc->migratetype;
	int ret;

	/* Compaction run completes if the migrate and free scanner meet */
	if (compact_scanners_met(cc)) {
		/* Let the next compaction start anew. */
		reset_cached_positions(cc->zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kcompactd does not set the
		 * flag itself as the decision to clear it should be based
		 * directly on an allocation request.
		 */
		if (cc->direct_compaction)
			cc->zone->compact_blockskip_flush = true;

		if (cc->whole_zone)
			return COMPACT_COMPLETE;
		else
			return COMPACT_PARTIAL_SKIPPED;
	}

	if (cc->proactive_compaction) {
		int score, wmark_low;
		pg_data_t *pgdat;

		pgdat = cc->zone->zone_pgdat;
		if (kswapd_is_running(pgdat))
			return COMPACT_PARTIAL_SKIPPED;

		score = fragmentation_score_zone(cc->zone);
		wmark_low = fragmentation_score_wmark(true);

		if (score > wmark_low)
			ret = COMPACT_CONTINUE;
		else
			ret = COMPACT_SUCCESS;

		goto out;
	}

	if (is_via_compact_memory(cc->order))
		return COMPACT_CONTINUE;

	/*
	 * Always finish scanning a pageblock to reduce the possibility of
	 * fallbacks in the future. This is particularly important when
	 * migration source is unmovable/reclaimable but it's not worth
	 * special casing.
	 */
	if (!pageblock_aligned(cc->migrate_pfn))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	ret = COMPACT_NO_SUITABLE_PAGE;
	for (order = cc->order; order <= MAX_ORDER; order++) {
		struct free_area *area = &cc->zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!free_area_empty(area, migratetype))
			return COMPACT_SUCCESS;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
		    !free_area_empty(area, MIGRATE_CMA))
			return COMPACT_SUCCESS;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1)
			/*
			 * Movable pages are OK in any pageblock. If we are
			 * stealing for a non-movable allocation, make sure
			 * we finish compacting the current pageblock first
			 * (which is assured by the above migrate_pfn align
			 * check) so it is as free as possible and we won't
			 * have to steal another one soon.
			 */
			return COMPACT_SUCCESS;
	}

out:
	if (cc->contended || fatal_signal_pending(current))
		ret = COMPACT_CONTENDED;

	return ret;
}

static enum compact_result compact_finished(struct compact_control *cc)
{
	int ret;

	ret = __compact_finished(cc);
	trace_mm_compaction_finished(cc->zone, cc->order, ret);
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}

static bool __compaction_suitable(struct zone *zone, int order,
				  int highest_zoneidx,
				  unsigned long wmark_target)
{
	unsigned long watermark;
	/*
	 * Watermarks for order-0 must be met for compaction to be able to
	 * isolate free pages for migration targets. This means that the
	 * watermark and alloc_flags have to match, or be more pessimistic than
	 * the check in __isolate_free_page(). We don't use the direct
	 * compactor's alloc_flags, as they are not relevant for freepage
	 * isolation. We however do use the direct compactor's highest_zoneidx
	 * to skip over zones where lowmem reserves would prevent allocation
	 * even if compaction succeeds.
	 * For costly orders, we require the low watermark instead of min for
	 * compaction to proceed to increase its chances.
	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
	 * suitable migration targets.
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);
	return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
				   ALLOC_CMA, wmark_target);
}
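
/*
 * Worked example, assuming compact_gap() from mm/internal.h returns
 * 2UL << order: for an order-9 request (above PAGE_ALLOC_COSTLY_ORDER),
 * the low watermark applies and the gap adds 2UL << 9 = 1024 pages, so
 * the zone must hold low_wmark_pages(zone) + 1024 free pages before
 * compaction is considered suitable.
 */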

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 */
bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
	enum compact_result compact_result;
	bool suitable;

	suitable = __compaction_suitable(zone, order, highest_zoneidx,
					 zone_page_state(zone, NR_FREE_PAGES));
	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation. Also
	 * ignore fragindex for non-costly orders where the alternative to
	 * a successful reclaim/compaction is OOM. Fragindex and the
	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
	 * excessive compaction for costly orders, but it should not be at the
	 * expense of system stability.
	 */
	if (suitable) {
		compact_result = COMPACT_CONTINUE;
		if (order > PAGE_ALLOC_COSTLY_ORDER) {
			int fragindex = fragmentation_index(zone, order);

			if (fragindex >= 0 &&
			    fragindex <= sysctl_extfrag_threshold) {
				suitable = false;
				compact_result = COMPACT_NOT_SUITABLE_ZONE;
			}
		}
	} else {
		compact_result = COMPACT_SKIPPED;
	}

	trace_mm_compaction_suitable(zone, order, compact_result);

	return suitable;
}
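
/*
 * Worked example: with the default vm.extfrag_threshold of 500, an
 * order-5 request whose fragmentation_index() is 400 is treated as
 * failing due to lack of memory (index <= threshold), so compaction is
 * skipped in favour of reclaim; an index of 700 would let compaction
 * proceed.
 */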

bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
		int alloc_flags)
{
	struct zone *zone;
	struct zoneref *z;

	/*
	 * Make sure at least one zone would pass __compaction_suitable if we
	 * continue retrying the reclaim.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
				ac->highest_zoneidx, ac->nodemask) {
		unsigned long available;

		/*
		 * Do not consider all the reclaimable memory because we do not
		 * want to thrash just for a single high order allocation which
		 * is not even guaranteed to appear even if __compaction_suitable
		 * is happy about the watermark check.
		 */
		available = zone_reclaimable_pages(zone) / order;
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
		if (__compaction_suitable(zone, order, ac->highest_zoneidx,
					  available))
			return true;
	}

	return false;
}
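
/*
 * Worked example for the "available" estimate above (illustrative
 * numbers): for an order-3 request with 8192 reclaimable and 2048 free
 * pages, available = 8192 / 3 + 2048 = 4778 pages is used as the
 * watermark target, deliberately discounting reclaimable memory more
 * heavily as the order grows.
 */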

/*
 * Should we do compaction for the target allocation order?
 * Return COMPACT_SUCCESS if allocation for the target order can already be
 * satisfied
 * Return COMPACT_SKIPPED if compaction for the target order is likely to fail
 * Return COMPACT_CONTINUE if compaction for the target order should be run
 */
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
				 int highest_zoneidx, unsigned int alloc_flags)
{
	unsigned long watermark;

	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
			      alloc_flags))
		return COMPACT_SUCCESS;

	if (!compaction_suitable(zone, order, highest_zoneidx))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;
}

static enum compact_result
compact_zone(struct compact_control *cc, struct capture_control *capc)
{
	enum compact_result ret;
	unsigned long start_pfn = cc->zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(cc->zone);
	unsigned long last_migrated_pfn;
	const bool sync = cc->mode != MIGRATE_ASYNC;
	bool update_cached;
	unsigned int nr_succeeded = 0;

	/*
	 * These counters track activities during zone compaction. Initialize
	 * them before compacting a new zone.
	 */
	cc->total_migrate_scanned = 0;
	cc->total_free_scanned = 0;
	cc->nr_migratepages = 0;
	cc->nr_freepages = 0;
	INIT_LIST_HEAD(&cc->freepages);
	INIT_LIST_HEAD(&cc->migratepages);

	cc->migratetype = gfp_migratetype(cc->gfp_mask);

	if (!is_via_compact_memory(cc->order)) {
		ret = compaction_suit_allocation_order(cc->zone, cc->order,
						       cc->highest_zoneidx,
						       cc->alloc_flags);
		if (ret != COMPACT_CONTINUE)
			return ret;
	}

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(cc->zone, cc->order))
		__reset_isolation_suitable(cc->zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Use cached
	 * information on where the scanners should start (unless we explicitly
	 * want to compact the whole zone), but check that it is initialised
	 * by ensuring the values are within zone boundaries.
	 */
	cc->fast_start_pfn = 0;
	if (cc->whole_zone) {
		cc->migrate_pfn = start_pfn;
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
	} else {
		cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
		cc->free_pfn = cc->zone->compact_cached_free_pfn;
		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
			cc->zone->compact_cached_free_pfn = cc->free_pfn;
		}
		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
			cc->migrate_pfn = start_pfn;
			cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
			cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
		}

		if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
			cc->whole_zone = true;
	}

	last_migrated_pfn = 0;

	/*
	 * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
	 * the basis that some migrations will fail in ASYNC mode. However,
	 * if the cached PFNs match and pageblocks are skipped due to having
	 * no isolation candidates, then the sync state does not matter.
	 * Until a pageblock with isolation candidates is found, keep the
	 * cached PFNs in sync to avoid revisiting the same blocks.
	 */
	update_cached = !sync &&
		cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];

	trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);

	/* lru_add_drain_all could be expensive with involving other CPUs */
	lru_add_drain();

	while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
		int err;
		unsigned long iteration_start_pfn = cc->migrate_pfn;

		/*
		 * Avoid multiple rescans of the same pageblock which can
		 * happen if a page cannot be isolated (dirty/writeback in
		 * async mode) or if the migrated pages are being allocated
		 * before the pageblock is cleared. The first rescan will
		 * capture the entire pageblock for migration. If it fails,
		 * it'll be marked skip and scanning will proceed as normal.
		 */
		cc->finish_pageblock = false;
		if (pageblock_start_pfn(last_migrated_pfn) ==
		    pageblock_start_pfn(iteration_start_pfn)) {
			cc->finish_pageblock = true;
		}

rescan:
		switch (isolate_migratepages(cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_CONTENDED;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			if (update_cached) {
				cc->zone->compact_cached_migrate_pfn[1] =
					cc->zone->compact_cached_migrate_pfn[0];
			}

			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			update_cached = false;
			last_migrated_pfn = max(cc->zone->zone_start_pfn,
				pageblock_start_pfn(cc->migrate_pfn - 1));
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION, &nr_succeeded);

		trace_mm_compaction_migratepages(cc, nr_succeeded);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
			/*
			 * If an ASYNC or SYNC_LIGHT fails to migrate a page
			 * within the pageblock_order-aligned block and
			 * fast_find_migrateblock may be used then scan the
			 * remainder of the pageblock. This will mark the
			 * pageblock "skip" to avoid rescanning in the near
			 * future. This will isolate more pages than necessary
			 * for the request but avoid loops due to
			 * fast_find_migrateblock revisiting blocks that were
			 * recently partially scanned.
			 */
			if (!pageblock_aligned(cc->migrate_pfn) &&
			    !cc->ignore_skip_hint && !cc->finish_pageblock &&
			    (cc->mode < MIGRATE_SYNC)) {
				cc->finish_pageblock = true;

				/*
				 * Draining pcplists does not help THP if
				 * any page failed to migrate. Even after
				 * drain, the pageblock will not be free.
				 */
				if (cc->order == COMPACTION_HPAGE_ORDER)
					last_migrated_pfn = 0;

				goto rescan;
			}
		}

		/* Stop if a page has been captured */
		if (capc && capc->page) {
			ret = COMPACT_SUCCESS;
			break;
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && last_migrated_pfn) {
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (last_migrated_pfn < current_block_start) {
				lru_add_drain_cpu_zone(cc->zone);
				/* No more flushing until we migrate again */
				last_migrated_pfn = 0;
			}
		}
	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > cc->zone->compact_cached_free_pfn)
			cc->zone->compact_cached_free_pfn = free_pfn;
	}

	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);

	trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);

	VM_BUG_ON(!list_empty(&cc->freepages));
	VM_BUG_ON(!list_empty(&cc->migratepages));

	return ret;
}

static enum compact_result compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum compact_priority prio,
		unsigned int alloc_flags, int highest_zoneidx,
		struct page **capture)
{
	enum compact_result ret;
	struct compact_control cc = {
		.order = order,
		.search_order = order,
		.gfp_mask = gfp_mask,
		.zone = zone,
		.mode = (prio == COMPACT_PRIO_ASYNC) ?
					MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
		.alloc_flags = alloc_flags,
		.highest_zoneidx = highest_zoneidx,
		.direct_compaction = true,
		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
	};
	struct capture_control capc = {
		.cc = &cc,
		.page = NULL,
	};

	/*
	 * Make sure the structs are really initialized before we expose the
	 * capture control, in case we are interrupted and the interrupt handler
	 * frees a page.
	 */
	barrier();
	WRITE_ONCE(current->capture_control, &capc);

	ret = compact_zone(&cc, &capc);

	/*
	 * Make sure we hide capture control first before we read the captured
	 * page pointer, otherwise an interrupt could free and capture a page
	 * and we would leak it.
	 */
	WRITE_ONCE(current->capture_control, NULL);
	*capture = READ_ONCE(capc.page);
	/*
	 * Technically, it is also possible that compaction is skipped but
	 * the page is still captured out of luck (an IRQ came and freed the
	 * page). Returning COMPACT_SUCCESS in such cases helps in properly
	 * accounting COMPACT[STALL|FAIL] when compaction is skipped.
	 */
	if (*capture)
		ret = COMPACT_SUCCESS;

	return ret;
}
2688
2689/**
2690 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
2691 * @gfp_mask: The GFP mask of the current allocation
2692 * @order: The order of the current allocation
2693 * @alloc_flags: The allocation flags of the current allocation
2694 * @ac: The context of current allocation
2695 * @prio: Determines how hard direct compaction should try to succeed
2696 * @capture: Pointer to free page created by compaction will be stored here
2697 *
2698 * This is the main entry point for direct page compaction.
2699 */
2700enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
2701 unsigned int alloc_flags, const struct alloc_context *ac,
2702 enum compact_priority prio, struct page **capture)
2703{
2704 int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
2705 struct zoneref *z;
2706 struct zone *zone;
2707 enum compact_result rc = COMPACT_SKIPPED;
2708
2709 /*
2710 * Check if the GFP flags allow compaction - GFP_NOIO is really
2711 * tricky context because the migration might require IO
2712 */
2713 if (!may_perform_io)
2714 return COMPACT_SKIPPED;
2715
2716 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
2717
2718 /* Compact each zone in the list */
2719 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2720 ac->highest_zoneidx, ac->nodemask) {
2721 enum compact_result status;
2722
2723 if (prio > MIN_COMPACT_PRIORITY
2724 && compaction_deferred(zone, order)) {
2725 rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
2726 continue;
2727 }
2728
2729 status = compact_zone_order(zone, order, gfp_mask, prio,
2730 alloc_flags, highest_zoneidx: ac->highest_zoneidx, capture);
2731 rc = max(status, rc);
2732
2733 /* The allocation should succeed, stop compacting */
2734 if (status == COMPACT_SUCCESS) {
2735 /*
2736 * We think the allocation will succeed in this zone,
2737 * but it is not certain, hence the false. The caller
2738 * will repeat this with true if allocation indeed
2739 * succeeds in this zone.
2740 */
2741 compaction_defer_reset(zone, order, alloc_success: false);
2742
2743 break;
2744 }
2745
2746 if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
2747 status == COMPACT_PARTIAL_SKIPPED))
2748 /*
2749 * We think that allocation won't succeed in this zone
2750 * so we defer compaction there. If it ends up
2751 * succeeding after all, it will be reset.
2752 */
2753 defer_compaction(zone, order);
2754
2755 /*
2756 * We might have stopped compacting due to need_resched() in
2757 * async compaction, or due to a fatal signal detected. In that
2758 * case do not try further zones
2759 */
2760 if ((prio == COMPACT_PRIO_ASYNC && need_resched())
2761 || fatal_signal_pending(current))
2762 break;
2763 }
2764
2765 return rc;
2766}

/*
 * Compact all zones within a node until each zone's fragmentation score
 * falls within the proactive compaction thresholds (as determined by the
 * proactiveness tunable).
 *
 * It is possible that the function returns before reaching score targets
 * due to various back-off conditions, such as contention on per-node or
 * per-zone locks.
 */
static void proactive_compact_node(pg_data_t *pgdat)
{
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC_LIGHT,
		.ignore_skip_hint = true,
		.whole_zone = true,
		.gfp_mask = GFP_KERNEL,
		.proactive_compaction = true,
	};

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc.zone = zone;

		compact_zone(&cc, NULL);

		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
				     cc.total_migrate_scanned);
		count_compact_events(KCOMPACTD_FREE_SCANNED,
				     cc.total_free_scanned);
	}
}

/* Compact all zones within a node */
static void compact_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.whole_zone = true,
		.gfp_mask = GFP_KERNEL,
	};

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc.zone = zone;

		compact_zone(&cc, NULL);
	}
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
	int nid;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for_each_online_node(nid)
		compact_node(nid);
}

static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc, nid;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write && sysctl_compaction_proactiveness) {
		for_each_online_node(nid) {
			pg_data_t *pgdat = NODE_DATA(nid);

			if (pgdat->proactive_compact_trigger)
				continue;

			pgdat->proactive_compact_trigger = true;
			trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1,
							     pgdat->nr_zones - 1);
			wake_up_interruptible(&pgdat->kcompactd_wait);
		}
	}

	return 0;
}
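
/*
 * Example (root shell): writing a non-zero value immediately wakes every
 * online node's kcompactd for a proactive pass:
 *
 *	echo 30 > /proc/sys/vm/compaction_proactiveness
 */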

/*
 * This is the entry point for compacting all nodes via
 * /proc/sys/vm/compact_memory
 */
static int sysctl_compaction_handler(struct ctl_table *table, int write,
			void *buffer, size_t *length, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, length, ppos);
	if (ret)
		return ret;

	if (sysctl_compact_memory != 1)
		return -EINVAL;

	if (write)
		compact_nodes();

	return 0;
}
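
/*
 * Example (root shell): trigger a full compaction of all online nodes;
 * any value other than 1 is rejected with -EINVAL:
 *
 *	echo 1 > /proc/sys/vm/compact_memory
 */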

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t compact_store(struct device *dev,
			     struct device_attribute *attr,
			     const char *buf, size_t count)
{
	int nid = dev->id;

	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
		/* Flush pending updates to the LRU lists */
		lru_add_drain_all();

		compact_node(nid);
	}

	return count;
}
static DEVICE_ATTR_WO(compact);

int compaction_register_node(struct node *node)
{
	return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
	device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
		pgdat->proactive_compact_trigger;
}

static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
	int zoneid;
	struct zone *zone;
	enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
	enum compact_result ret;

	for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
		zone = &pgdat->node_zones[zoneid];

		if (!populated_zone(zone))
			continue;

		ret = compaction_suit_allocation_order(zone,
				pgdat->kcompactd_max_order,
				highest_zoneidx, ALLOC_WMARK_MIN);
		if (ret == COMPACT_CONTINUE)
			return true;
	}

	return false;
}

static void kcompactd_do_work(pg_data_t *pgdat)
{
	/*
	 * With no special task, compact all zones so that a page of requested
	 * order is allocatable.
	 */
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = pgdat->kcompactd_max_order,
		.search_order = pgdat->kcompactd_max_order,
		.highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
		.mode = MIGRATE_SYNC_LIGHT,
		.ignore_skip_hint = false,
		.gfp_mask = GFP_KERNEL,
	};
	enum compact_result ret;

	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
							cc.highest_zoneidx);
	count_compact_event(KCOMPACTD_WAKE);

	for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
		int status;

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		if (compaction_deferred(zone, cc.order))
			continue;

		ret = compaction_suit_allocation_order(zone,
				cc.order, zoneid, ALLOC_WMARK_MIN);
		if (ret != COMPACT_CONTINUE)
			continue;

		if (kthread_should_stop())
			return;

		cc.zone = zone;
		status = compact_zone(&cc, NULL);

		if (status == COMPACT_SUCCESS) {
			compaction_defer_reset(zone, cc.order, false);
		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
			/*
			 * Buddy pages may become stranded on pcps that could
			 * otherwise coalesce on the zone's free area for
			 * order >= cc.order. This is ratelimited by the
			 * upcoming deferral.
			 */
			drain_all_pages(zone);

			/*
			 * We use sync migration mode here, so we defer like
			 * sync direct compaction does.
			 */
			defer_compaction(zone, cc.order);
		}

		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
				     cc.total_migrate_scanned);
		count_compact_events(KCOMPACTD_FREE_SCANNED,
				     cc.total_free_scanned);
	}

	/*
	 * Regardless of success, we are done until woken up next. But remember
	 * the requested order/highest_zoneidx in case it was higher/tighter
	 * than our current ones
	 */
	if (pgdat->kcompactd_max_order <= cc.order)
		pgdat->kcompactd_max_order = 0;
	if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
		pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
}

void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	if (!order)
		return;

	if (pgdat->kcompactd_max_order < order)
		pgdat->kcompactd_max_order = order;

	if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
		pgdat->kcompactd_highest_zoneidx = highest_zoneidx;

	/*
	 * Pairs with implicit barrier in wait_event_freezable()
	 * such that wakeups are not missed.
	 */
	if (!wq_has_sleeper(&pgdat->kcompactd_wait))
		return;

	if (!kcompactd_node_suitable(pgdat))
		return;

	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
					     highest_zoneidx);
	wake_up_interruptible(&pgdat->kcompactd_wait);
}
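
/*
 * Note: wakeup_kcompactd()'s callers live in the reclaim path (see
 * mm/vmscan.c), so background compaction for the requested order can
 * follow once kswapd has freed pages.
 */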

/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 */
static int kcompactd(void *p)
{
	pg_data_t *pgdat = (pg_data_t *)p;
	struct task_struct *tsk = current;
	long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
	long timeout = default_timeout;

	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	pgdat->kcompactd_max_order = 0;
	pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;

	while (!kthread_should_stop()) {
		unsigned long pflags;

		/*
		 * Avoid the unnecessary wakeup for proactive compaction
		 * when it is disabled.
		 */
		if (!sysctl_compaction_proactiveness)
			timeout = MAX_SCHEDULE_TIMEOUT;
		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
			kcompactd_work_requested(pgdat), timeout) &&
			!pgdat->proactive_compact_trigger) {

			psi_memstall_enter(&pflags);
			kcompactd_do_work(pgdat);
			psi_memstall_leave(&pflags);
			/*
			 * Reset the timeout value. The defer timeout from
			 * proactive compaction is lost here but that is fine
			 * as the zone may have changed substantially, so
			 * carrying on with the previous defer interval is
			 * not useful.
			 */
			timeout = default_timeout;
			continue;
		}

		/*
		 * Start the proactive work with default timeout. Based
		 * on the fragmentation score, this timeout is updated.
		 */
		timeout = default_timeout;
		if (should_proactive_compact_node(pgdat)) {
			unsigned int prev_score, score;

			prev_score = fragmentation_score_node(pgdat);
			proactive_compact_node(pgdat);
			score = fragmentation_score_node(pgdat);
			/*
			 * Defer proactive compaction if the fragmentation
			 * score did not go down, i.e. no progress made.
			 */
			if (unlikely(score >= prev_score))
				timeout =
				   default_timeout << COMPACT_MAX_DEFER_SHIFT;
		}
		if (unlikely(pgdat->proactive_compact_trigger))
			pgdat->proactive_compact_trigger = false;
	}

	return 0;
}
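
/*
 * Worked back-off example: with HPAGE_FRAG_CHECK_INTERVAL_MSEC at 500
 * and assuming COMPACT_MAX_DEFER_SHIFT of 6 from
 * include/linux/compaction.h, a proactive pass that fails to lower the
 * node's fragmentation score stretches the next check from 500ms to
 * 500ms << 6 = 32 seconds.
 */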

/*
 * This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will be moved to proper cpus if cpus are hot-added.
 */
void __meminit kcompactd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	if (pgdat->kcompactd)
		return;

	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
	if (IS_ERR(pgdat->kcompactd)) {
		pr_err("Failed to start kcompactd on node %d\n", nid);
		pgdat->kcompactd = NULL;
	}
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * be holding mem_hotplug_begin/done().
 */
void __meminit kcompactd_stop(int nid)
{
	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;

	if (kcompactd) {
		kthread_stop(kcompactd);
		NODE_DATA(nid)->kcompactd = NULL;
	}
}

/*
 * It's optimal to keep kcompactd on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 */
static int kcompactd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);

		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			if (pgdat->kcompactd)
				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
	}
	return 0;
}

static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
		int write, void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret, old;

	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
		return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	old = *(int *)table->data;
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret)
		return ret;
	if (old != *(int *)table->data)
		pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
			     table->procname, current->comm,
			     task_pid_nr(current));
	return ret;
}

static struct ctl_table vm_compaction[] = {
	{
		.procname	= "compact_memory",
		.data		= &sysctl_compact_memory,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= sysctl_compaction_handler,
	},
	{
		.procname	= "compaction_proactiveness",
		.data		= &sysctl_compaction_proactiveness,
		.maxlen		= sizeof(sysctl_compaction_proactiveness),
		.mode		= 0644,
		.proc_handler	= compaction_proactiveness_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
	{
		.procname	= "extfrag_threshold",
		.data		= &sysctl_extfrag_threshold,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_THOUSAND,
	},
	{
		.procname	= "compact_unevictable_allowed",
		.data		= &sysctl_compact_unevictable_allowed,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax_warn_RT_change,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

static int __init kcompactd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"mm/compaction:online",
					kcompactd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("kcompactd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		kcompactd_run(nid);
	register_sysctl_init("vm", vm_compaction);
	return 0;
}
subsys_initcall(kcompactd_init)

#endif /* CONFIG_COMPACTION */