page_alloc.c source code [linux/mm/page_alloc.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/page_alloc.c
4	*
5	* Manages the free list, the system allocates free pages here.
6	* Note that kmalloc() lives in slab.c
7	*
8	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9	* Swap reorganised 29.12.95, Stephen Tweedie
10	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11	* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12	* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13	* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14	* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15	* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16	*/
17
18	#include <linux/stddef.h>
19	#include <linux/mm.h>
20	#include <linux/highmem.h>
21	#include <linux/interrupt.h>
22	#include <linux/jiffies.h>
23	#include <linux/compiler.h>
24	#include <linux/kernel.h>
25	#include <linux/kasan.h>
26	#include <linux/kmsan.h>
27	#include <linux/module.h>
28	#include <linux/suspend.h>
29	#include <linux/ratelimit.h>
30	#include <linux/oom.h>
31	#include <linux/topology.h>
32	#include <linux/sysctl.h>
33	#include <linux/cpu.h>
34	#include <linux/cpuset.h>
35	#include <linux/memory_hotplug.h>
36	#include <linux/nodemask.h>
37	#include <linux/vmstat.h>
38	#include <linux/fault-inject.h>
39	#include <linux/compaction.h>
40	#include <trace/events/kmem.h>
41	#include <trace/events/oom.h>
42	#include <linux/prefetch.h>
43	#include <linux/mm_inline.h>
44	#include <linux/mmu_notifier.h>
45	#include <linux/migrate.h>
46	#include <linux/sched/mm.h>
47	#include <linux/page_owner.h>
48	#include <linux/page_table_check.h>
49	#include <linux/memcontrol.h>
50	#include <linux/ftrace.h>
51	#include <linux/lockdep.h>
52	#include <linux/psi.h>
53	#include <linux/khugepaged.h>
54	#include <linux/delayacct.h>
55	#include <linux/cacheinfo.h>
56	#include <asm/div64.h>
57	#include "internal.h"
58	#include "shuffle.h"
59	#include "page_reporting.h"
60
61	/ Free Page Internal flags: for internal, non-pcp variants of free_pages(). /
62	typedef int __bitwise fpi_t;
63
64	/ No special request /
65	#define FPI_NONE ((__force fpi_t)0)
66
67	/*
68	* Skip free page reporting notification for the (possibly merged) page.
69	* This does not hinder free page reporting from grabbing the page,
70	* reporting it and marking it "reported" - it only skips notifying
71	* the free page reporting infrastructure about a newly freed page. For
72	* example, used when temporarily pulling a page from a freelist and
73	* putting it back unmodified.
74	*/
75	#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
76
77	/*
78	* Place the (possibly merged) page to the tail of the freelist. Will ignore
79	* page shuffling (relevant code - e.g., memory onlining - is expected to
80	* shuffle the whole zone).
81	*
82	* Note: No code should rely on this flag for correctness - it's purely
83	* to allow for optimizations when handing back either fresh pages
84	* (memory onlining) or untouched pages (page isolation, free page
85	* reporting).
86	*/
87	#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
88
89	/ prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields /
90	static DEFINE_MUTEX(pcp_batch_high_lock);
91	#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
92
93	#if defined(CONFIG_SMP) \|\| defined(CONFIG_PREEMPT_RT)
94	/*
95	* On SMP, spin_trylock is sufficient protection.
96	* On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
97	*/
98	#define pcp_trylock_prepare(flags) do { } while (0)
99	#define pcp_trylock_finish(flag) do { } while (0)
100	#else
101
102	/ UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. /
103	#define pcp_trylock_prepare(flags) local_irq_save(flags)
104	#define pcp_trylock_finish(flags) local_irq_restore(flags)
105	#endif
106
107	/*
108	* Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
109	* a migration causing the wrong PCP to be locked and remote memory being
110	* potentially allocated, pin the task to the CPU for the lookup+lock.
111	* preempt_disable is used on !RT because it is faster than migrate_disable.
112	* migrate_disable is used on RT because otherwise RT spinlock usage is
113	* interfered with and a high priority task cannot preempt the allocator.
114	*/
115	#ifndef CONFIG_PREEMPT_RT
116	#define pcpu_task_pin() preempt_disable()
117	#define pcpu_task_unpin() preempt_enable()
118	#else
119	#define pcpu_task_pin() migrate_disable()
120	#define pcpu_task_unpin() migrate_enable()
121	#endif
122
123	/*
124	* Generic helper to lookup and a per-cpu variable with an embedded spinlock.
125	* Return value should be used with equivalent unlock helper.
126	*/
127	#define pcpu_spin_lock(type, member, ptr) \
128	({ \
129	type *_ret; \
130	pcpu_task_pin(); \
131	_ret = this_cpu_ptr(ptr); \
132	spin_lock(&_ret->member); \
133	_ret; \
134	})
135
136	#define pcpu_spin_trylock(type, member, ptr) \
137	({ \
138	type *_ret; \
139	pcpu_task_pin(); \
140	_ret = this_cpu_ptr(ptr); \
141	if (!spin_trylock(&_ret->member)) { \
142	pcpu_task_unpin(); \
143	_ret = NULL; \
144	} \
145	_ret; \
146	})
147
148	#define pcpu_spin_unlock(member, ptr) \
149	({ \
150	spin_unlock(&ptr->member); \
151	pcpu_task_unpin(); \
152	})
153
154	/ struct per_cpu_pages specific helpers. /
155	#define pcp_spin_lock(ptr) \
156	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
157
158	#define pcp_spin_trylock(ptr) \
159	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
160
161	#define pcp_spin_unlock(ptr) \
162	pcpu_spin_unlock(lock, ptr)
163
164	#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
165	DEFINE_PER_CPU(int, numa_node);
166	EXPORT_PER_CPU_SYMBOL(numa_node);
167	#endif
168
169	DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
170
171	#ifdef CONFIG_HAVE_MEMORYLESS_NODES
172	/*
173	* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
174	* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
175	* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
176	* defined in <linux/topology.h>.
177	*/
178	DEFINE_PER_CPU(int, _numa_mem_); / Kernel "local memory" node /
179	EXPORT_PER_CPU_SYMBOL(_numa_mem_);
180	#endif
181
182	static DEFINE_MUTEX(pcpu_drain_mutex);
183
184	#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
185	volatile unsigned long latent_entropy __latent_entropy;
186	EXPORT_SYMBOL(latent_entropy);
187	#endif
188
189	/*
190	* Array of node states.
191	*/
192	nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
193	[N_POSSIBLE] = NODE_MASK_ALL,
194	[N_ONLINE] = { .bits: { [`0`] = `1UL` } },
195	#ifndef CONFIG_NUMA
196	[N_NORMAL_MEMORY] = { { [`0`] = `1UL` } },
197	#ifdef CONFIG_HIGHMEM
198	[N_HIGH_MEMORY] = { { [`0`] = `1UL` } },
199	#endif
200	[N_MEMORY] = { { [`0`] = `1UL` } },
201	[N_CPU] = { { [`0`] = `1UL` } },
202	#endif /* NUMA */
203	};
204	EXPORT_SYMBOL(node_states);
205
206	gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
207
208	/*
209	* A cached value of the page's pageblock's migratetype, used when the page is
210	* put on a pcplist. Used to avoid the pageblock migratetype lookup when
211	* freeing from pcplists in most cases, at the cost of possibly becoming stale.
212	* Also the migratetype set in the page does not necessarily match the pcplist
213	* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
214	* other index - this ensures that it will be put on the correct CMA freelist.
215	*/
216	static inline int get_pcppage_migratetype(struct page *page)
217	{
218	return page->index;
219	}
220
221	static inline void set_pcppage_migratetype(struct page page, int* migratetype)
222	{
223	page->index = migratetype;
224	}
225
226	#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
227	unsigned int pageblock_order __read_mostly;
228	#endif
229
230	static void __free_pages_ok(struct page page, unsigned* int order,
231	fpi_t fpi_flags);
232
233	/*
234	* results with 256, 32 in the lowmem_reserve sysctl:
235	* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
236	* 1G machine -> (16M dma, 784M normal, 224M high)
237	* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
238	* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
239	* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
240	*
241	* TBD: should special case ZONE_DMA32 machines here - in those we normally
242	* don't need any ZONE_NORMAL reservation
243	*/
244	static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
245	#ifdef CONFIG_ZONE_DMA
246	[ZONE_DMA] = `256`,
247	#endif
248	#ifdef CONFIG_ZONE_DMA32
249	[ZONE_DMA32] = `256`,
250	#endif
251	[ZONE_NORMAL] = `32`,
252	#ifdef CONFIG_HIGHMEM
253	[ZONE_HIGHMEM] = `0`,
254	#endif
255	[ZONE_MOVABLE] = `0`,
256	};
257
258	char * const zone_names[MAX_NR_ZONES] = {
259	#ifdef CONFIG_ZONE_DMA
260	"DMA",
261	#endif
262	#ifdef CONFIG_ZONE_DMA32
263	"DMA32",
264	#endif
265	"Normal",
266	#ifdef CONFIG_HIGHMEM
267	"HighMem",
268	#endif
269	"Movable",
270	#ifdef CONFIG_ZONE_DEVICE
271	"Device",
272	#endif
273	};
274
275	const char * const migratetype_names[MIGRATE_TYPES] = {
276	"Unmovable",
277	"Movable",
278	"Reclaimable",
279	"HighAtomic",
280	#ifdef CONFIG_CMA
281	"CMA",
282	#endif
283	#ifdef CONFIG_MEMORY_ISOLATION
284	"Isolate",
285	#endif
286	};
287
288	int min_free_kbytes = `1024`;
289	int user_min_free_kbytes = -`1`;
290	static int watermark_boost_factor __read_mostly = `15000`;
291	static int watermark_scale_factor = `10`;
292
293	/ movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from /
294	int movable_zone;
295	EXPORT_SYMBOL(movable_zone);
296
297	#if MAX_NUMNODES > 1
298	unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
299	unsigned int nr_online_nodes __read_mostly = `1`;
300	EXPORT_SYMBOL(nr_node_ids);
301	EXPORT_SYMBOL(nr_online_nodes);
302	#endif
303
304	static bool page_contains_unaccepted(struct page page, unsigned* int order);
305	static void accept_page(struct page page, unsigned* int order);
306	static bool try_to_accept_memory(struct zone zone, unsigned* int order);
307	static inline bool has_unaccepted_memory(void);
308	static bool __free_unaccepted(struct page *page);
309
310	int page_group_by_mobility_disabled __read_mostly;
311
312	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
313	/*
314	* During boot we initialize deferred pages on-demand, as needed, but once
315	* page_alloc_init_late() has finished, the deferred pages are all initialized,
316	* and we can permanently disable that path.
317	*/
318	DEFINE_STATIC_KEY_TRUE(deferred_pages);
319
320	static inline bool deferred_pages_enabled(void)
321	{
322	return static_branch_unlikely(&deferred_pages);
323	}
324
325	/*
326	* deferred_grow_zone() is __init, but it is called from
327	* get_page_from_freelist() during early boot until deferred_pages permanently
328	* disables this call. This is why we have refdata wrapper to avoid warning,
329	* and to ensure that the function body gets unloaded.
330	*/
331	static bool __ref
332	_deferred_grow_zone(struct zone zone, unsigned* int order)
333	{
334	return deferred_grow_zone(zone, order);
335	}
336	#else
337	static inline bool deferred_pages_enabled(void)
338	{
339	return false;
340	}
341	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
342
343	/ Return a pointer to the bitmap storing bits affecting a block of pages /
344	static inline unsigned long get_pageblock_bitmap(const* struct page *page,
345	unsigned long pfn)
346	{
347	#ifdef CONFIG_SPARSEMEM
348	return section_to_usemap(ms: __pfn_to_section(pfn));
349	#else
350	return page_zone(page)->pageblock_flags;
351	#endif /* CONFIG_SPARSEMEM */
352	}
353
354	static inline int pfn_to_bitidx(const struct page page, unsigned* long pfn)
355	{
356	#ifdef CONFIG_SPARSEMEM
357	pfn &= (PAGES_PER_SECTION-`1`);
358	#else
359	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
360	#endif /* CONFIG_SPARSEMEM */
361	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
362	}
363
364	/**
365	* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
366	* @page: The page within the block of interest
367	* @pfn: The target page frame number
368	* @mask: mask of bits that the caller is interested in
369	*
370	* Return: pageblock_bits flags
371	*/
372	unsigned long get_pfnblock_flags_mask(const struct page *page,
373	unsigned long pfn, unsigned long mask)
374	{
375	unsigned long *bitmap;
376	unsigned long bitidx, word_bitidx;
377	unsigned long word;
378
379	bitmap = get_pageblock_bitmap(page, pfn);
380	bitidx = pfn_to_bitidx(page, pfn);
381	word_bitidx = bitidx / BITS_PER_LONG;
382	bitidx &= (BITS_PER_LONG-`1`);
383	/*
384	* This races, without locks, with set_pfnblock_flags_mask(). Ensure
385	* a consistent read of the memory array, so that results, even though
386	* racy, are not corrupted.
387	*/
388	word = READ_ONCE(bitmap[word_bitidx]);
389	return (word >> bitidx) & mask;
390	}
391
392	static __always_inline int get_pfnblock_migratetype(const struct page *page,
393	unsigned long pfn)
394	{
395	return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
396	}
397
398	/**
399	* set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
400	* @page: The page within the block of interest
401	* @flags: The flags to set
402	* @pfn: The target page frame number
403	* @mask: mask of bits that the caller is interested in
404	*/
405	void set_pfnblock_flags_mask(struct page page, unsigned* long flags,
406	unsigned long pfn,
407	unsigned long mask)
408	{
409	unsigned long *bitmap;
410	unsigned long bitidx, word_bitidx;
411	unsigned long word;
412
413	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != `4`);
414	BUILD_BUG_ON(MIGRATE_TYPES > (`1` << PB_migratetype_bits));
415
416	bitmap = get_pageblock_bitmap(page, pfn);
417	bitidx = pfn_to_bitidx(page, pfn);
418	word_bitidx = bitidx / BITS_PER_LONG;
419	bitidx &= (BITS_PER_LONG-`1`);
420
421	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
422
423	mask <<= bitidx;
424	flags <<= bitidx;
425
426	word = READ_ONCE(bitmap[word_bitidx]);
427	do {
428	} while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) \| flags));
429	}
430
431	void set_pageblock_migratetype(struct page page, int* migratetype)
432	{
433	if (unlikely(page_group_by_mobility_disabled &&
434	migratetype < MIGRATE_PCPTYPES))
435	migratetype = MIGRATE_UNMOVABLE;
436
437	set_pfnblock_flags_mask(page, flags: (unsigned long)migratetype,
438	page_to_pfn(page), MIGRATETYPE_MASK);
439	}
440
441	#ifdef CONFIG_DEBUG_VM
442	static int page_outside_zone_boundaries(struct zone zone, struct* page *page)
443	{
444	int ret;
445	unsigned seq;
446	unsigned long pfn = page_to_pfn(page);
447	unsigned long sp, start_pfn;
448
449	do {
450	seq = zone_span_seqbegin(zone);
451	start_pfn = zone->zone_start_pfn;
452	sp = zone->spanned_pages;
453	ret = !zone_spans_pfn(zone, pfn);
454	} while (zone_span_seqretry(zone, iv: seq));
455
456	if (ret)
457	pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
458	pfn, zone_to_nid(zone), zone->name,
459	start_pfn, start_pfn + sp);
460
461	return ret;
462	}
463
464	/*
465	* Temporary debugging check for pages not lying within a given zone.
466	*/
467	static int __maybe_unused bad_range(struct zone zone, struct* page *page)
468	{
469	if (page_outside_zone_boundaries(zone, page))
470	return `1`;
471	if (zone != page_zone(page))
472	return `1`;
473
474	return `0`;
475	}
476	#else
477	static inline int __maybe_unused bad_range(struct zone zone, struct* page *page)
478	{
479	return `0`;
480	}
481	#endif
482
483	static void bad_page(struct page page, const* char *reason)
484	{
485	static unsigned long resume;
486	static unsigned long nr_shown;
487	static unsigned long nr_unshown;
488
489	/*
490	* Allow a burst of 60 reports, then keep quiet for that minute;
491	* or allow a steady drip of one report per second.
492	*/
493	if (nr_shown == `60`) {
494	if (time_before(jiffies, resume)) {
495	nr_unshown++;
496	goto out;
497	}
498	if (nr_unshown) {
499	pr_alert(
500	"BUG: Bad page state: %lu messages suppressed\n",
501	nr_unshown);
502	nr_unshown = `0`;
503	}
504	nr_shown = `0`;
505	}
506	if (nr_shown++ == `0`)
507	resume = jiffies + `60` * HZ;
508
509	pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
510	current->comm, page_to_pfn(page));
511	dump_page(page, reason);
512
513	print_modules();
514	dump_stack();
515	out:
516	/ Leave bad fields for debug, except PageBuddy could make trouble /
517	page_mapcount_reset(page); / remove PageBuddy /
518	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
519	}
520
521	static inline unsigned int order_to_pindex(int migratetype, int order)
522	{
523	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
524	if (order > PAGE_ALLOC_COSTLY_ORDER) {
525	VM_BUG_ON(order != pageblock_order);
526	return NR_LOWORDER_PCP_LISTS;
527	}
528	#else
529	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
530	#endif
531
532	return (MIGRATE_PCPTYPES * order) + migratetype;
533	}
534
535	static inline int pindex_to_order(unsigned int pindex)
536	{
537	int order = pindex / MIGRATE_PCPTYPES;
538
539	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
540	if (pindex == NR_LOWORDER_PCP_LISTS)
541	order = pageblock_order;
542	#else
543	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
544	#endif
545
546	return order;
547	}
548
549	static inline bool pcp_allowed_order(unsigned int order)
550	{
551	if (order <= PAGE_ALLOC_COSTLY_ORDER)
552	return true;
553	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
554	if (order == pageblock_order)
555	return true;
556	#endif
557	return false;
558	}
559
560	static inline void free_the_page(struct page page, unsigned* int order)
561	{
562	if (pcp_allowed_order(order)) / Via pcp? /
563	free_unref_page(page, order);
564	else
565	__free_pages_ok(page, order, FPI_NONE);
566	}
567
568	/*
569	* Higher-order pages are called "compound pages". They are structured thusly:
570	*
571	* The first PAGE_SIZE page is called the "head page" and have PG_head set.
572	*
573	* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
574	* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
575	*
576	* The first tail page's ->compound_order holds the order of allocation.
577	* This usage means that zero-order pages may not be compound.
578	*/
579
580	void prep_compound_page(struct page page, unsigned* int order)
581	{
582	int i;
583	int nr_pages = `1` << order;
584
585	__SetPageHead(page);
586	for (i = `1`; i < nr_pages; i++)
587	prep_compound_tail(head: page, tail_idx: i);
588
589	prep_compound_head(page, order);
590	}
591
592	void destroy_large_folio(struct folio *folio)
593	{
594	if (folio_test_hugetlb(folio)) {
595	free_huge_folio(folio);
596	return;
597	}
598
599	if (folio_test_large_rmappable(folio))
600	folio_undo_large_rmappable(folio);
601
602	mem_cgroup_uncharge(folio);
603	free_the_page(page: &folio->page, order: folio_order(folio));
604	}
605
606	static inline void set_buddy_order(struct page page, unsigned* int order)
607	{
608	set_page_private(page, private: order);
609	__SetPageBuddy(page);
610	}
611
612	#ifdef CONFIG_COMPACTION
613	static inline struct capture_control task_capc(struct* zone *zone)
614	{
615	struct capture_control *capc = current->capture_control;
616
617	return unlikely(capc) &&
618	!(current->flags & PF_KTHREAD) &&
619	!capc->page &&
620	capc->cc->zone == zone ? capc : NULL;
621	}
622
623	static inline bool
624	compaction_capture(struct capture_control capc, struct* page *page,
625	int order, int migratetype)
626	{
627	if (!capc \|\| order != capc->cc->order)
628	return false;
629
630	/ Do not accidentally pollute CMA or isolated regions/
631	if (is_migrate_cma(migratetype) \|\|
632	is_migrate_isolate(migratetype))
633	return false;
634
635	/*
636	* Do not let lower order allocations pollute a movable pageblock.
637	* This might let an unmovable request use a reclaimable pageblock
638	* and vice-versa but no more than normal fallback logic which can
639	* have trouble finding a high-order free page.
640	*/
641	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
642	return false;
643
644	capc->page = page;
645	return true;
646	}
647
648	#else
649	static inline struct capture_control task_capc(struct* zone *zone)
650	{
651	return NULL;
652	}
653
654	static inline bool
655	compaction_capture(struct capture_control capc, struct* page *page,
656	int order, int migratetype)
657	{
658	return false;
659	}
660	#endif /* CONFIG_COMPACTION */
661
662	/ Used for pages not on another list /
663	static inline void add_to_free_list(struct page page, struct* zone *zone,
664	unsigned int order, int migratetype)
665	{
666	struct free_area *area = &zone->free_area[order];
667
668	list_add(new: &page->buddy_list, head: &area->free_list[migratetype]);
669	area->nr_free++;
670	}
671
672	/ Used for pages not on another list /
673	static inline void add_to_free_list_tail(struct page page, struct* zone *zone,
674	unsigned int order, int migratetype)
675	{
676	struct free_area *area = &zone->free_area[order];
677
678	list_add_tail(new: &page->buddy_list, head: &area->free_list[migratetype]);
679	area->nr_free++;
680	}
681
682	/*
683	* Used for pages which are on another list. Move the pages to the tail
684	* of the list - so the moved pages won't immediately be considered for
685	* allocation again (e.g., optimization for memory onlining).
686	*/
687	static inline void move_to_free_list(struct page page, struct* zone *zone,
688	unsigned int order, int migratetype)
689	{
690	struct free_area *area = &zone->free_area[order];
691
692	list_move_tail(list: &page->buddy_list, head: &area->free_list[migratetype]);
693	}
694
695	static inline void del_page_from_free_list(struct page page, struct* zone *zone,
696	unsigned int order)
697	{
698	/ clear reported state and update reported page count /
699	if (page_reported(page))
700	__ClearPageReported(page);
701
702	list_del(entry: &page->buddy_list);
703	__ClearPageBuddy(page);
704	set_page_private(page, private: `0`);
705	zone->free_area[order].nr_free--;
706	}
707
708	static inline struct page get_page_from_free_area(struct* free_area *area,
709	int migratetype)
710	{
711	return list_first_entry_or_null(&area->free_list[migratetype],
712	struct page, buddy_list);
713	}
714
715	/*
716	* If this is not the largest possible page, check if the buddy
717	* of the next-highest order is free. If it is, it's possible
718	* that pages are being freed that will coalesce soon. In case,
719	* that is happening, add the free page to the tail of the list
720	* so it's less likely to be used soon and more likely to be merged
721	* as a higher order page
722	*/
723	static inline bool
724	buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
725	struct page page, unsigned* int order)
726	{
727	unsigned long higher_page_pfn;
728	struct page *higher_page;
729
730	if (order >= MAX_ORDER - `1`)
731	return false;
732
733	higher_page_pfn = buddy_pfn & pfn;
734	higher_page = page + (higher_page_pfn - pfn);
735
736	return find_buddy_page_pfn(page: higher_page, pfn: higher_page_pfn, order: order + `1`,
737	NULL) != NULL;
738	}
739
740	/*
741	* Freeing function for a buddy system allocator.
742	*
743	* The concept of a buddy system is to maintain direct-mapped table
744	* (containing bit values) for memory blocks of various "orders".
745	* The bottom level table contains the map for the smallest allocatable
746	* units of memory (here, pages), and each level above it describes
747	* pairs of units from the levels below, hence, "buddies".
748	* At a high level, all that happens here is marking the table entry
749	* at the bottom level available, and propagating the changes upward
750	* as necessary, plus some accounting needed to play nicely with other
751	* parts of the VM system.
752	* At each level, we keep a list of pages, which are heads of continuous
753	* free pages of length of (1 << order) and marked with PageBuddy.
754	* Page's order is recorded in page_private(page) field.
755	* So when we are allocating or freeing one, we can derive the state of the
756	* other. That is, if we allocate a small block, and both were
757	* free, the remainder of the region must be split into blocks.
758	* If a block is freed, and its buddy is also free, then this
759	* triggers coalescing into a block of larger size.
760	*
761	* -- nyc
762	*/
763
764	static inline void __free_one_page(struct page *page,
765	unsigned long pfn,
766	struct zone zone, unsigned* int order,
767	int migratetype, fpi_t fpi_flags)
768	{
769	struct capture_control *capc = task_capc(zone);
770	unsigned long buddy_pfn = `0`;
771	unsigned long combined_pfn;
772	struct page *buddy;
773	bool to_tail;
774
775	VM_BUG_ON(!zone_is_initialized(zone));
776	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
777
778	VM_BUG_ON(migratetype == -`1`);
779	if (likely(!is_migrate_isolate(migratetype)))
780	__mod_zone_freepage_state(zone, nr_pages: `1` << order, migratetype);
781
782	VM_BUG_ON_PAGE(pfn & ((`1` << order) - `1`), page);
783	VM_BUG_ON_PAGE(bad_range(zone, page), page);
784
785	while (order < MAX_ORDER) {
786	if (compaction_capture(capc, page, order, migratetype)) {
787	__mod_zone_freepage_state(zone, nr_pages: -(`1` << order),
788	migratetype);
789	return;
790	}
791
792	buddy = find_buddy_page_pfn(page, pfn, order, buddy_pfn: &buddy_pfn);
793	if (!buddy)
794	goto done_merging;
795
796	if (unlikely(order >= pageblock_order)) {
797	/*
798	* We want to prevent merge between freepages on pageblock
799	* without fallbacks and normal pageblock. Without this,
800	* pageblock isolation could cause incorrect freepage or CMA
801	* accounting or HIGHATOMIC accounting.
802	*/
803	int buddy_mt = get_pfnblock_migratetype(page: buddy, pfn: buddy_pfn);
804
805	if (migratetype != buddy_mt
806	&& (!migratetype_is_mergeable(mt: migratetype) \|\|
807	!migratetype_is_mergeable(mt: buddy_mt)))
808	goto done_merging;
809	}
810
811	/*
812	* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
813	* merge with it and move up one order.
814	*/
815	if (page_is_guard(page: buddy))
816	clear_page_guard(zone, page: buddy, order, migratetype);
817	else
818	del_page_from_free_list(page: buddy, zone, order);
819	combined_pfn = buddy_pfn & pfn;
820	page = page + (combined_pfn - pfn);
821	pfn = combined_pfn;
822	order++;
823	}
824
825	done_merging:
826	set_buddy_order(page, order);
827
828	if (fpi_flags & FPI_TO_TAIL)
829	to_tail = true;
830	else if (is_shuffle_order(order))
831	to_tail = shuffle_pick_tail();
832	else
833	to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
834
835	if (to_tail)
836	add_to_free_list_tail(page, zone, order, migratetype);
837	else
838	add_to_free_list(page, zone, order, migratetype);
839
840	/ Notify page reporting subsystem of freed page /
841	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
842	page_reporting_notify_free(order);
843	}
844
845	/**
846	* split_free_page() -- split a free page at split_pfn_offset
847	* @free_page: the original free page
848	* @order: the order of the page
849	* @split_pfn_offset: split offset within the page
850	*
851	* Return -ENOENT if the free page is changed, otherwise 0
852	*
853	* It is used when the free page crosses two pageblocks with different migratetypes
854	* at split_pfn_offset within the page. The split free page will be put into
855	* separate migratetype lists afterwards. Otherwise, the function achieves
856	* nothing.
857	*/
858	int split_free_page(struct page *free_page,
859	unsigned int order, unsigned long split_pfn_offset)
860	{
861	struct zone *zone = page_zone(page: free_page);
862	unsigned long free_page_pfn = page_to_pfn(free_page);
863	unsigned long pfn;
864	unsigned long flags;
865	int free_page_order;
866	int mt;
867	int ret = `0`;
868
869	if (split_pfn_offset == `0`)
870	return ret;
871
872	spin_lock_irqsave(&zone->lock, flags);
873
874	if (!PageBuddy(page: free_page) \|\| buddy_order(page: free_page) != order) {
875	ret = -ENOENT;
876	goto out;
877	}
878
879	mt = get_pfnblock_migratetype(page: free_page, pfn: free_page_pfn);
880	if (likely(!is_migrate_isolate(mt)))
881	__mod_zone_freepage_state(zone, nr_pages: -(`1UL` << order), migratetype: mt);
882
883	del_page_from_free_list(page: free_page, zone, order);
884	for (pfn = free_page_pfn;
885	pfn < free_page_pfn + (`1UL` << order);) {
886	int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
887
888	free_page_order = min_t(unsigned int,
889	pfn ? __ffs(pfn) : order,
890	__fls(split_pfn_offset));
891	__free_one_page(pfn_to_page(pfn), pfn, zone, order: free_page_order,
892	migratetype: mt, FPI_NONE);
893	pfn += `1UL` << free_page_order;
894	split_pfn_offset -= (`1UL` << free_page_order);
895	/ we have done the first part, now switch to second part /
896	if (split_pfn_offset == `0`)
897	split_pfn_offset = (`1UL` << order) - (pfn - free_page_pfn);
898	}
899	out:
900	spin_unlock_irqrestore(lock: &zone->lock, flags);
901	return ret;
902	}
903	/*
904	* A bad page could be due to a number of fields. Instead of multiple branches,
905	* try and check multiple fields with one check. The caller must do a detailed
906	* check if necessary.
907	*/
908	static inline bool page_expected_state(struct page *page,
909	unsigned long check_flags)
910	{
911	if (unlikely(atomic_read(&page->_mapcount) != -`1`))
912	return false;
913
914	if (unlikely((unsigned long)page->mapping \|
915	page_ref_count(page) \|
916	#ifdef CONFIG_MEMCG
917	page->memcg_data \|
918	#endif
919	(page->flags & check_flags)))
920	return false;
921
922	return true;
923	}
924
925	static const char page_bad_reason(struct* page page, unsigned* long flags)
926	{
927	const char *bad_reason = NULL;
928
929	if (unlikely(atomic_read(&page->_mapcount) != -`1`))
930	bad_reason = "nonzero mapcount";
931	if (unlikely(page->mapping != NULL))
932	bad_reason = "non-NULL mapping";
933	if (unlikely(page_ref_count(page) != `0`))
934	bad_reason = "nonzero _refcount";
935	if (unlikely(page->flags & flags)) {
936	if (flags == PAGE_FLAGS_CHECK_AT_PREP)
937	bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
938	else
939	bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
940	}
941	#ifdef CONFIG_MEMCG
942	if (unlikely(page->memcg_data))
943	bad_reason = "page still charged to cgroup";
944	#endif
945	return bad_reason;
946	}
947
948	static void free_page_is_bad_report(struct page *page)
949	{
950	bad_page(page,
951	page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
952	}
953
954	static inline bool free_page_is_bad(struct page *page)
955	{
956	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
957	return false;
958
959	/ Something has gone sideways, find it /
960	free_page_is_bad_report(page);
961	return true;
962	}
963
964	static inline bool is_check_pages_enabled(void)
965	{
966	return static_branch_unlikely(&check_pages_enabled);
967	}
968
969	static int free_tail_page_prepare(struct page head_page, struct* page *page)
970	{
971	struct folio folio = (struct* folio *)head_page;
972	int ret = `1`;
973
974	/*
975	* We rely page->lru.next never has bit 0 set, unless the page
976	* is PageTail(). Let's make sure that's true even for poisoned ->lru.
977	*/
978	BUILD_BUG_ON((unsigned long)LIST_POISON1 & `1`);
979
980	if (!is_check_pages_enabled()) {
981	ret = `0`;
982	goto out;
983	}
984	switch (page - head_page) {
985	case `1`:
986	/ the first tail page: these may be in place of ->mapping /
987	if (unlikely(folio_entire_mapcount(folio))) {
988	bad_page(page, reason: "nonzero entire_mapcount");
989	goto out;
990	}
991	if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
992	bad_page(page, reason: "nonzero nr_pages_mapped");
993	goto out;
994	}
995	if (unlikely(atomic_read(&folio->_pincount))) {
996	bad_page(page, reason: "nonzero pincount");
997	goto out;
998	}
999	break;
1000	case `2`:
1001	/*
1002	* the second tail page: ->mapping is
1003	* deferred_list.next -- ignore value.
1004	*/
1005	break;
1006	default:
1007	if (page->mapping != TAIL_MAPPING) {
1008	bad_page(page, reason: "corrupted mapping in tail page");
1009	goto out;
1010	}
1011	break;
1012	}
1013	if (unlikely(!PageTail(page))) {
1014	bad_page(page, reason: "PageTail not set");
1015	goto out;
1016	}
1017	if (unlikely(compound_head(page) != head_page)) {
1018	bad_page(page, reason: "compound_head not consistent");
1019	goto out;
1020	}
1021	ret = `0`;
1022	out:
1023	page->mapping = NULL;
1024	clear_compound_head(page);
1025	return ret;
1026	}
1027
1028	/*
1029	* Skip KASAN memory poisoning when either:
1030	*
1031	* 1. For generic KASAN: deferred memory initialization has not yet completed.
1032	* Tag-based KASAN modes skip pages freed via deferred memory initialization
1033	* using page tags instead (see below).
1034	* 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1035	* that error detection is disabled for accesses via the page address.
1036	*
1037	* Pages will have match-all tags in the following circumstances:
1038	*
1039	* 1. Pages are being initialized for the first time, including during deferred
1040	* memory init; see the call to page_kasan_tag_reset in __init_single_page.
1041	* 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
1042	* exception of pages unpoisoned by kasan_unpoison_vmalloc.
1043	* 3. The allocation was excluded from being checked due to sampling,
1044	* see the call to kasan_unpoison_pages.
1045	*
1046	* Poisoning pages during deferred memory init will greatly lengthen the
1047	* process and cause problem in large memory systems as the deferred pages
1048	* initialization is done with interrupt disabled.
1049	*
1050	* Assuming that there will be no reference to those newly initialized
1051	* pages before they are ever allocated, this should have no effect on
1052	* KASAN memory tracking as the poison will be properly inserted at page
1053	* allocation time. The only corner case is when pages are allocated by
1054	* on-demand allocation and then freed again before the deferred pages
1055	* initialization is done, but this is not likely to happen.
1056	*/
1057	static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
1058	{
1059	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
1060	return deferred_pages_enabled();
1061
1062	return page_kasan_tag(page) == `0xff`;
1063	}
1064
1065	static void kernel_init_pages(struct page page, int* numpages)
1066	{
1067	int i;
1068
1069	/ s390's use of memset() could override KASAN redzones. /
1070	kasan_disable_current();
1071	for (i = `0`; i < numpages; i++)
1072	clear_highpage_kasan_tagged(page: page + i);
1073	kasan_enable_current();
1074	}
1075
1076	static __always_inline bool free_pages_prepare(struct page *page,
1077	unsigned int order, fpi_t fpi_flags)
1078	{
1079	int bad = `0`;
1080	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
1081	bool init = want_init_on_free();
1082	bool compound = PageCompound(page);
1083
1084	VM_BUG_ON_PAGE(PageTail(page), page);
1085
1086	trace_mm_page_free(page, order);
1087	kmsan_free_page(page, order);
1088
1089	if (unlikely(PageHWPoison(page)) && !order) {
1090	/*
1091	* Do not let hwpoison pages hit pcplists/buddy
1092	* Untie memcg state and reset page's owner
1093	*/
1094	if (memcg_kmem_online() && PageMemcgKmem(page))
1095	__memcg_kmem_uncharge_page(page, order);
1096	reset_page_owner(page, order);
1097	page_table_check_free(page, order);
1098	return false;
1099	}
1100
1101	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1102
1103	/*
1104	* Check tail pages before head page information is cleared to
1105	* avoid checking PageCompound for order-0 pages.
1106	*/
1107	if (unlikely(order)) {
1108	int i;
1109
1110	if (compound)
1111	page[`1`].flags &= ~PAGE_FLAGS_SECOND;
1112	for (i = `1`; i < (`1` << order); i++) {
1113	if (compound)
1114	bad += free_tail_page_prepare(head_page: page, page: page + i);
1115	if (is_check_pages_enabled()) {
1116	if (free_page_is_bad(page: page + i)) {
1117	bad++;
1118	continue;
1119	}
1120	}
1121	(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1122	}
1123	}
1124	if (PageMappingFlags(page))
1125	page->mapping = NULL;
1126	if (memcg_kmem_online() && PageMemcgKmem(page))
1127	__memcg_kmem_uncharge_page(page, order);
1128	if (is_check_pages_enabled()) {
1129	if (free_page_is_bad(page))
1130	bad++;
1131	if (bad)
1132	return false;
1133	}
1134
1135	page_cpupid_reset_last(page);
1136	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1137	reset_page_owner(page, order);
1138	page_table_check_free(page, order);
1139
1140	if (!PageHighMem(page)) {
1141	debug_check_no_locks_freed(page_address(page),
1142	PAGE_SIZE << order);
1143	debug_check_no_obj_freed(page_address(page),
1144	PAGE_SIZE << order);
1145	}
1146
1147	kernel_poison_pages(page, numpages: `1` << order);
1148
1149	/*
1150	* As memory initialization might be integrated into KASAN,
1151	* KASAN poisoning and memory initialization code must be
1152	* kept together to avoid discrepancies in behavior.
1153	*
1154	* With hardware tag-based KASAN, memory tags must be set before the
1155	* page becomes unavailable via debug_pagealloc or arch_free_page.
1156	*/
1157	if (!skip_kasan_poison) {
1158	kasan_poison_pages(page, order, init);
1159
1160	/ Memory is already initialized if KASAN did it internally. /
1161	if (kasan_has_integrated_init())
1162	init = false;
1163	}
1164	if (init)
1165	kernel_init_pages(page, numpages: `1` << order);
1166
1167	/*
1168	* arch_free_page() can make the page's contents inaccessible. s390
1169	* does this. So nothing which can access the page's contents should
1170	* happen after this.
1171	*/
1172	arch_free_page(page, order);
1173
1174	debug_pagealloc_unmap_pages(page, numpages: `1` << order);
1175
1176	return true;
1177	}
1178
1179	/*
1180	* Frees a number of pages from the PCP lists
1181	* Assumes all pages on list are in same zone.
1182	* count is the number of pages to free.
1183	*/
1184	static void free_pcppages_bulk(struct zone zone, int* count,
1185	struct per_cpu_pages *pcp,
1186	int pindex)
1187	{
1188	unsigned long flags;
1189	unsigned int order;
1190	bool isolated_pageblocks;
1191	struct page *page;
1192
1193	/*
1194	* Ensure proper count is passed which otherwise would stuck in the
1195	* below while (list_empty(list)) loop.
1196	*/
1197	count = min(pcp->count, count);
1198
1199	/ Ensure requested pindex is drained first. /
1200	pindex = pindex - `1`;
1201
1202	spin_lock_irqsave(&zone->lock, flags);
1203	isolated_pageblocks = has_isolate_pageblock(zone);
1204
1205	while (count > `0`) {
1206	struct list_head *list;
1207	int nr_pages;
1208
1209	/ Remove pages from lists in a round-robin fashion. /
1210	do {
1211	if (++pindex > NR_PCP_LISTS - `1`)
1212	pindex = `0`;
1213	list = &pcp->lists[pindex];
1214	} while (list_empty(head: list));
1215
1216	order = pindex_to_order(pindex);
1217	nr_pages = `1` << order;
1218	do {
1219	int mt;
1220
1221	page = list_last_entry(list, struct page, pcp_list);
1222	mt = get_pcppage_migratetype(page);
1223
1224	/ must delete to avoid corrupting pcp list /
1225	list_del(entry: &page->pcp_list);
1226	count -= nr_pages;
1227	pcp->count -= nr_pages;
1228
1229	/ MIGRATE_ISOLATE page should not go to pcplists /
1230	VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1231	/ Pageblock could have been isolated meanwhile /
1232	if (unlikely(isolated_pageblocks))
1233	mt = get_pageblock_migratetype(page);
1234
1235	__free_one_page(page, page_to_pfn(page), zone, order, migratetype: mt, FPI_NONE);
1236	trace_mm_page_pcpu_drain(page, order, migratetype: mt);
1237	} while (count > `0` && !list_empty(head: list));
1238	}
1239
1240	spin_unlock_irqrestore(lock: &zone->lock, flags);
1241	}
1242
1243	static void free_one_page(struct zone *zone,
1244	struct page page, unsigned* long pfn,
1245	unsigned int order,
1246	int migratetype, fpi_t fpi_flags)
1247	{
1248	unsigned long flags;
1249
1250	spin_lock_irqsave(&zone->lock, flags);
1251	if (unlikely(has_isolate_pageblock(zone) \|\|
1252	is_migrate_isolate(migratetype))) {
1253	migratetype = get_pfnblock_migratetype(page, pfn);
1254	}
1255	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1256	spin_unlock_irqrestore(lock: &zone->lock, flags);
1257	}
1258
1259	static void __free_pages_ok(struct page page, unsigned* int order,
1260	fpi_t fpi_flags)
1261	{
1262	unsigned long flags;
1263	int migratetype;
1264	unsigned long pfn = page_to_pfn(page);
1265	struct zone *zone = page_zone(page);
1266
1267	if (!free_pages_prepare(page, order, fpi_flags))
1268	return;
1269
1270	/*
1271	* Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
1272	* is used to avoid calling get_pfnblock_migratetype() under the lock.
1273	* This will reduce the lock holding time.
1274	*/
1275	migratetype = get_pfnblock_migratetype(page, pfn);
1276
1277	spin_lock_irqsave(&zone->lock, flags);
1278	if (unlikely(has_isolate_pageblock(zone) \|\|
1279	is_migrate_isolate(migratetype))) {
1280	migratetype = get_pfnblock_migratetype(page, pfn);
1281	}
1282	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1283	spin_unlock_irqrestore(lock: &zone->lock, flags);
1284
1285	__count_vm_events(item: PGFREE, delta: `1` << order);
1286	}
1287
1288	void __free_pages_core(struct page page, unsigned* int order)
1289	{
1290	unsigned int nr_pages = `1` << order;
1291	struct page *p = page;
1292	unsigned int loop;
1293
1294	/*
1295	* When initializing the memmap, __init_single_page() sets the refcount
1296	* of all pages to 1 ("allocated"/"not free"). We have to set the
1297	* refcount of all involved pages to 0.
1298	*/
1299	prefetchw(x: p);
1300	for (loop = `0`; loop < (nr_pages - `1`); loop++, p++) {
1301	prefetchw(x: p + `1`);
1302	__ClearPageReserved(page: p);
1303	set_page_count(page: p, v: `0`);
1304	}
1305	__ClearPageReserved(page: p);
1306	set_page_count(page: p, v: `0`);
1307
1308	atomic_long_add(i: nr_pages, v: &page_zone(page)->managed_pages);
1309
1310	if (page_contains_unaccepted(page, order)) {
1311	if (order == MAX_ORDER && __free_unaccepted(page))
1312	return;
1313
1314	accept_page(page, order);
1315	}
1316
1317	/*
1318	* Bypass PCP and place fresh pages right to the tail, primarily
1319	* relevant for memory onlining.
1320	*/
1321	__free_pages_ok(page, order, FPI_TO_TAIL);
1322	}
1323
1324	/*
1325	* Check that the whole (or subset of) a pageblock given by the interval of
1326	* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1327	* with the migration of free compaction scanner.
1328	*
1329	* Return struct page pointer of start_pfn, or NULL if checks were not passed.
1330	*
1331	* It's possible on some configurations to have a setup like node0 node1 node0
1332	* i.e. it's possible that all pages within a zones range of pages do not
1333	* belong to a single zone. We assume that a border between node0 and node1
1334	* can occur within a single pageblock, but not a node0 node1 node0
1335	* interleaving within a single pageblock. It is therefore sufficient to check
1336	* the first and last page of a pageblock and avoid checking each individual
1337	* page in a pageblock.
1338	*
1339	* Note: the function may return non-NULL struct page even for a page block
1340	* which contains a memory hole (i.e. there is no physical memory for a subset
1341	* of the pfn range). For example, if the pageblock order is MAX_ORDER, which
1342	* will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
1343	* even though the start pfn is online and valid. This should be safe most of
1344	* the time because struct pages are still initialized via init_unavailable_range()
1345	* and pfn walkers shouldn't touch any physical memory range for which they do
1346	* not recognize any specific metadata in struct pages.
1347	*/
1348	struct page __pageblock_pfn_to_page(unsigned* long start_pfn,
1349	unsigned long end_pfn, struct zone *zone)
1350	{
1351	struct page *start_page;
1352	struct page *end_page;
1353
1354	/ end_pfn is one past the range we are checking /
1355	end_pfn--;
1356
1357	if (!pfn_valid(pfn: end_pfn))
1358	return NULL;
1359
1360	start_page = pfn_to_online_page(pfn: start_pfn);
1361	if (!start_page)
1362	return NULL;
1363
1364	if (page_zone(page: start_page) != zone)
1365	return NULL;
1366
1367	end_page = pfn_to_page(end_pfn);
1368
1369	/ This gives a shorter code than deriving page_zone(end_page) /
1370	if (page_zone_id(page: start_page) != page_zone_id(page: end_page))
1371	return NULL;
1372
1373	return start_page;
1374	}
1375
1376	/*
1377	* The order of subdivision here is critical for the IO subsystem.
1378	* Please do not alter this order without good reasons and regression
1379	* testing. Specifically, as large blocks of memory are subdivided,
1380	* the order in which smaller blocks are delivered depends on the order
1381	* they're subdivided in this function. This is the primary factor
1382	* influencing the order in which pages are delivered to the IO
1383	* subsystem according to empirical testing, and this is also justified
1384	* by considering the behavior of a buddy system containing a single
1385	* large block of memory acted on by a series of small allocations.
1386	* This behavior is a critical factor in sglist merging's success.
1387	*
1388	* -- nyc
1389	*/
1390	static inline void expand(struct zone zone, struct* page *page,
1391	int low, int high, int migratetype)
1392	{
1393	unsigned long size = `1` << high;
1394
1395	while (high > low) {
1396	high--;
1397	size >>= `1`;
1398	VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1399
1400	/*
1401	* Mark as guard pages (or page), that will allow to
1402	* merge back to allocator when buddy will be freed.
1403	* Corresponding page table entries will not be touched,
1404	* pages will stay not present in virtual address space
1405	*/
1406	if (set_page_guard(zone, page: &page[size], order: high, migratetype))
1407	continue;
1408
1409	add_to_free_list(page: &page[size], zone, order: high, migratetype);
1410	set_buddy_order(page: &page[size], order: high);
1411	}
1412	}
1413
1414	static void check_new_page_bad(struct page *page)
1415	{
1416	if (unlikely(page->flags & __PG_HWPOISON)) {
1417	/ Don't complain about hwpoisoned pages /
1418	page_mapcount_reset(page); / remove PageBuddy /
1419	return;
1420	}
1421
1422	bad_page(page,
1423	page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
1424	}
1425
1426	/*
1427	* This page is about to be returned from the page allocator
1428	*/
1429	static int check_new_page(struct page *page)
1430	{
1431	if (likely(page_expected_state(page,
1432	PAGE_FLAGS_CHECK_AT_PREP\|__PG_HWPOISON)))
1433	return `0`;
1434
1435	check_new_page_bad(page);
1436	return `1`;
1437	}
1438
1439	static inline bool check_new_pages(struct page page, unsigned* int order)
1440	{
1441	if (is_check_pages_enabled()) {
1442	for (int i = `0`; i < (`1` << order); i++) {
1443	struct page *p = page + i;
1444
1445	if (check_new_page(page: p))
1446	return true;
1447	}
1448	}
1449
1450	return false;
1451	}
1452
1453	static inline bool should_skip_kasan_unpoison(gfp_t flags)
1454	{
1455	/ Don't skip if a software KASAN mode is enabled. /
1456	if (IS_ENABLED(CONFIG_KASAN_GENERIC) \|\|
1457	IS_ENABLED(CONFIG_KASAN_SW_TAGS))
1458	return false;
1459
1460	/ Skip, if hardware tag-based KASAN is not enabled. /
1461	if (!kasan_hw_tags_enabled())
1462	return true;
1463
1464	/*
1465	* With hardware tag-based KASAN enabled, skip if this has been
1466	* requested via __GFP_SKIP_KASAN.
1467	*/
1468	return flags & __GFP_SKIP_KASAN;
1469	}
1470
1471	static inline bool should_skip_init(gfp_t flags)
1472	{
1473	/ Don't skip, if hardware tag-based KASAN is not enabled. /
1474	if (!kasan_hw_tags_enabled())
1475	return false;
1476
1477	/ For hardware tag-based KASAN, skip if requested. /
1478	return (flags & __GFP_SKIP_ZERO);
1479	}
1480
1481	inline void post_alloc_hook(struct page page, unsigned* int order,
1482	gfp_t gfp_flags)
1483	{
1484	bool init = !want_init_on_free() && want_init_on_alloc(flags: gfp_flags) &&
1485	!should_skip_init(flags: gfp_flags);
1486	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
1487	int i;
1488
1489	set_page_private(page, private: `0`);
1490	set_page_refcounted(page);
1491
1492	arch_alloc_page(page, order);
1493	debug_pagealloc_map_pages(page, numpages: `1` << order);
1494
1495	/*
1496	* Page unpoisoning must happen before memory initialization.
1497	* Otherwise, the poison pattern will be overwritten for __GFP_ZERO
1498	* allocations and the page unpoisoning code will complain.
1499	*/
1500	kernel_unpoison_pages(page, numpages: `1` << order);
1501
1502	/*
1503	* As memory initialization might be integrated into KASAN,
1504	* KASAN unpoisoning and memory initializion code must be
1505	* kept together to avoid discrepancies in behavior.
1506	*/
1507
1508	/*
1509	* If memory tags should be zeroed
1510	* (which happens only when memory should be initialized as well).
1511	*/
1512	if (zero_tags) {
1513	/ Initialize both memory and memory tags. /
1514	for (i = `0`; i != `1` << order; ++i)
1515	tag_clear_highpage(page: page + i);
1516
1517	/ Take note that memory was initialized by the loop above. /
1518	init = false;
1519	}
1520	if (!should_skip_kasan_unpoison(flags: gfp_flags) &&
1521	kasan_unpoison_pages(page, order, init)) {
1522	/ Take note that memory was initialized by KASAN. /
1523	if (kasan_has_integrated_init())
1524	init = false;
1525	} else {
1526	/*
1527	* If memory tags have not been set by KASAN, reset the page
1528	* tags to ensure page_address() dereferencing does not fault.
1529	*/
1530	for (i = `0`; i != `1` << order; ++i)
1531	page_kasan_tag_reset(page: page + i);
1532	}
1533	/ If memory is still not initialized, initialize it now. /
1534	if (init)
1535	kernel_init_pages(page, numpages: `1` << order);
1536
1537	set_page_owner(page, order, gfp_mask: gfp_flags);
1538	page_table_check_alloc(page, order);
1539	}
1540
1541	static void prep_new_page(struct page page, unsigned* int order, gfp_t gfp_flags,
1542	unsigned int alloc_flags)
1543	{
1544	post_alloc_hook(page, order, gfp_flags);
1545
1546	if (order && (gfp_flags & __GFP_COMP))
1547	prep_compound_page(page, order);
1548
1549	/*
1550	* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1551	* allocate the page. The expectation is that the caller is taking
1552	* steps that will free more memory. The caller should avoid the page
1553	* being used for !PFMEMALLOC purposes.
1554	*/
1555	if (alloc_flags & ALLOC_NO_WATERMARKS)
1556	set_page_pfmemalloc(page);
1557	else
1558	clear_page_pfmemalloc(page);
1559	}
1560
1561	/*
1562	* Go through the free lists for the given migratetype and remove
1563	* the smallest available page from the freelists
1564	*/
1565	static __always_inline
1566	struct page __rmqueue_smallest(struct* zone zone, unsigned* int order,
1567	int migratetype)
1568	{
1569	unsigned int current_order;
1570	struct free_area *area;
1571	struct page *page;
1572
1573	/ Find a page of the appropriate size in the preferred list /
1574	for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
1575	area = &(zone->free_area[current_order]);
1576	page = get_page_from_free_area(area, migratetype);
1577	if (!page)
1578	continue;
1579	del_page_from_free_list(page, zone, order: current_order);
1580	expand(zone, page, low: order, high: current_order, migratetype);
1581	set_pcppage_migratetype(page, migratetype);
1582	trace_mm_page_alloc_zone_locked(page, order, migratetype,
1583	percpu_refill: pcp_allowed_order(order) &&
1584	migratetype < MIGRATE_PCPTYPES);
1585	return page;
1586	}
1587
1588	return NULL;
1589	}
1590
1591
1592	/*
1593	* This array describes the order lists are fallen back to when
1594	* the free lists for the desirable migrate type are depleted
1595	*
1596	* The other migratetypes do not have fallbacks.
1597	*/
1598	static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - `1`] = {
1599	[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
1600	[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
1601	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
1602	};
1603
1604	#ifdef CONFIG_CMA
1605	static __always_inline struct page __rmqueue_cma_fallback(struct* zone *zone,
1606	unsigned int order)
1607	{
1608	return __rmqueue_smallest(zone, order, migratetype: MIGRATE_CMA);
1609	}
1610	#else
1611	static inline struct page __rmqueue_cma_fallback(struct* zone *zone,
1612	unsigned int order) { return NULL; }
1613	#endif
1614
1615	/*
1616	* Move the free pages in a range to the freelist tail of the requested type.
1617	* Note that start_page and end_pages are not aligned on a pageblock
1618	* boundary. If alignment is required, use move_freepages_block()
1619	*/
1620	static int move_freepages(struct zone *zone,
1621	unsigned long start_pfn, unsigned long end_pfn,
1622	int migratetype, int *num_movable)
1623	{
1624	struct page *page;
1625	unsigned long pfn;
1626	unsigned int order;
1627	int pages_moved = `0`;
1628
1629	for (pfn = start_pfn; pfn <= end_pfn;) {
1630	page = pfn_to_page(pfn);
1631	if (!PageBuddy(page)) {
1632	/*
1633	* We assume that pages that could be isolated for
1634	* migration are movable. But we don't actually try
1635	* isolating, as that would be expensive.
1636	*/
1637	if (num_movable &&
1638	(PageLRU(page) \|\| __PageMovable(page)))
1639	(*num_movable)++;
1640	pfn++;
1641	continue;
1642	}
1643
1644	/ Make sure we are not inadvertently changing nodes /
1645	VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1646	VM_BUG_ON_PAGE(page_zone(page) != zone, page);
1647
1648	order = buddy_order(page);
1649	move_to_free_list(page, zone, order, migratetype);
1650	pfn += `1` << order;
1651	pages_moved += `1` << order;
1652	}
1653
1654	return pages_moved;
1655	}
1656
1657	int move_freepages_block(struct zone zone, struct* page *page,
1658	int migratetype, int *num_movable)
1659	{
1660	unsigned long start_pfn, end_pfn, pfn;
1661
1662	if (num_movable)
1663	*num_movable = `0`;
1664
1665	pfn = page_to_pfn(page);
1666	start_pfn = pageblock_start_pfn(pfn);
1667	end_pfn = pageblock_end_pfn(pfn) - `1`;
1668
1669	/ Do not cross zone boundaries /
1670	if (!zone_spans_pfn(zone, pfn: start_pfn))
1671	start_pfn = pfn;
1672	if (!zone_spans_pfn(zone, pfn: end_pfn))
1673	return `0`;
1674
1675	return move_freepages(zone, start_pfn, end_pfn, migratetype,
1676	num_movable);
1677	}
1678
1679	static void change_pageblock_range(struct page *pageblock_page,
1680	int start_order, int migratetype)
1681	{
1682	int nr_pageblocks = `1` << (start_order - pageblock_order);
1683
1684	while (nr_pageblocks--) {
1685	set_pageblock_migratetype(page: pageblock_page, migratetype);
1686	pageblock_page += pageblock_nr_pages;
1687	}
1688	}
1689
1690	/*
1691	* When we are falling back to another migratetype during allocation, try to
1692	* steal extra free pages from the same pageblocks to satisfy further
1693	* allocations, instead of polluting multiple pageblocks.
1694	*
1695	* If we are stealing a relatively large buddy page, it is likely there will
1696	* be more free pages in the pageblock, so try to steal them all. For
1697	* reclaimable and unmovable allocations, we steal regardless of page size,
1698	* as fragmentation caused by those allocations polluting movable pageblocks
1699	* is worse than movable allocations stealing from unmovable and reclaimable
1700	* pageblocks.
1701	*/
1702	static bool can_steal_fallback(unsigned int order, int start_mt)
1703	{
1704	/*
1705	* Leaving this order check is intended, although there is
1706	* relaxed order check in next check. The reason is that
1707	* we can actually steal whole pageblock if this condition met,
1708	* but, below check doesn't guarantee it and that is just heuristic
1709	* so could be changed anytime.
1710	*/
1711	if (order >= pageblock_order)
1712	return true;
1713
1714	if (order >= pageblock_order / `2` \|\|
1715	start_mt == MIGRATE_RECLAIMABLE \|\|
1716	start_mt == MIGRATE_UNMOVABLE \|\|
1717	page_group_by_mobility_disabled)
1718	return true;
1719
1720	return false;
1721	}
1722
1723	static inline bool boost_watermark(struct zone *zone)
1724	{
1725	unsigned long max_boost;
1726
1727	if (!watermark_boost_factor)
1728	return false;
1729	/*
1730	* Don't bother in zones that are unlikely to produce results.
1731	* On small machines, including kdump capture kernels running
1732	* in a small area, boosting the watermark can cause an out of
1733	* memory situation immediately.
1734	*/
1735	if ((pageblock_nr_pages * `4`) > zone_managed_pages(zone))
1736	return false;
1737
1738	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
1739	watermark_boost_factor, `10000`);
1740
1741	/*
1742	* high watermark may be uninitialised if fragmentation occurs
1743	* very early in boot so do not boost. We do not fall
1744	* through and boost by pageblock_nr_pages as failing
1745	* allocations that early means that reclaim is not going
1746	* to help and it may even be impossible to reclaim the
1747	* boosted watermark resulting in a hang.
1748	*/
1749	if (!max_boost)
1750	return false;
1751
1752	max_boost = max(pageblock_nr_pages, max_boost);
1753
1754	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
1755	max_boost);
1756
1757	return true;
1758	}
1759
1760	/*
1761	* This function implements actual steal behaviour. If order is large enough,
1762	* we can steal whole pageblock. If not, we first move freepages in this
1763	* pageblock to our migratetype and determine how many already-allocated pages
1764	* are there in the pageblock with a compatible migratetype. If at least half
1765	* of pages are free or compatible, we can change migratetype of the pageblock
1766	* itself, so pages freed in the future will be put on the correct free list.
1767	*/
1768	static void steal_suitable_fallback(struct zone zone, struct* page *page,
1769	unsigned int alloc_flags, int start_type, bool whole_block)
1770	{
1771	unsigned int current_order = buddy_order(page);
1772	int free_pages, movable_pages, alike_pages;
1773	int old_block_type;
1774
1775	old_block_type = get_pageblock_migratetype(page);
1776
1777	/*
1778	* This can happen due to races and we want to prevent broken
1779	* highatomic accounting.
1780	*/
1781	if (is_migrate_highatomic(migratetype: old_block_type))
1782	goto single_page;
1783
1784	/ Take ownership for orders >= pageblock_order /
1785	if (current_order >= pageblock_order) {
1786	change_pageblock_range(pageblock_page: page, start_order: current_order, migratetype: start_type);
1787	goto single_page;
1788	}
1789
1790	/*
1791	* Boost watermarks to increase reclaim pressure to reduce the
1792	* likelihood of future fallbacks. Wake kswapd now as the node
1793	* may be balanced overall and kswapd will not wake naturally.
1794	*/
1795	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
1796	set_bit(nr: ZONE_BOOSTED_WATERMARK, addr: &zone->flags);
1797
1798	/ We are not allowed to try stealing from the whole block /
1799	if (!whole_block)
1800	goto single_page;
1801
1802	free_pages = move_freepages_block(zone, page, migratetype: start_type,
1803	num_movable: &movable_pages);
1804	/ moving whole block can fail due to zone boundary conditions /
1805	if (!free_pages)
1806	goto single_page;
1807
1808	/*
1809	* Determine how many pages are compatible with our allocation.
1810	* For movable allocation, it's the number of movable pages which
1811	* we just obtained. For other types it's a bit more tricky.
1812	*/
1813	if (start_type == MIGRATE_MOVABLE) {
1814	alike_pages = movable_pages;
1815	} else {
1816	/*
1817	* If we are falling back a RECLAIMABLE or UNMOVABLE allocation
1818	* to MOVABLE pageblock, consider all non-movable pages as
1819	* compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
1820	* vice versa, be conservative since we can't distinguish the
1821	* exact migratetype of non-movable pages.
1822	*/
1823	if (old_block_type == MIGRATE_MOVABLE)
1824	alike_pages = pageblock_nr_pages
1825	- (free_pages + movable_pages);
1826	else
1827	alike_pages = `0`;
1828	}
1829	/*
1830	* If a sufficient number of pages in the block are either free or of
1831	* compatible migratability as our allocation, claim the whole block.
1832	*/
1833	if (free_pages + alike_pages >= (`1` << (pageblock_order-`1`)) \|\|
1834	page_group_by_mobility_disabled)
1835	set_pageblock_migratetype(page, migratetype: start_type);
1836
1837	return;
1838
1839	single_page:
1840	move_to_free_list(page, zone, order: current_order, migratetype: start_type);
1841	}
1842
1843	/*
1844	* Check whether there is a suitable fallback freepage with requested order.
1845	* If only_stealable is true, this function returns fallback_mt only if
1846	* we can steal other freepages all together. This would help to reduce
1847	* fragmentation due to mixed migratetype pages in one pageblock.
1848	*/
1849	int find_suitable_fallback(struct free_area area, unsigned* int order,
1850	int migratetype, bool only_stealable, bool *can_steal)
1851	{
1852	int i;
1853	int fallback_mt;
1854
1855	if (area->nr_free == `0`)
1856	return -`1`;
1857
1858	*can_steal = false;
1859	for (i = `0`; i < MIGRATE_PCPTYPES - `1` ; i++) {
1860	fallback_mt = fallbacks[migratetype][i];
1861	if (free_area_empty(area, migratetype: fallback_mt))
1862	continue;
1863
1864	if (can_steal_fallback(order, start_mt: migratetype))
1865	*can_steal = true;
1866
1867	if (!only_stealable)
1868	return fallback_mt;
1869
1870	if (*can_steal)
1871	return fallback_mt;
1872	}
1873
1874	return -`1`;
1875	}
1876
1877	/*
1878	* Reserve a pageblock for exclusive use of high-order atomic allocations if
1879	* there are no empty page blocks that contain a page with a suitable order
1880	*/
1881	static void reserve_highatomic_pageblock(struct page page, struct* zone *zone)
1882	{
1883	int mt;
1884	unsigned long max_managed, flags;
1885
1886	/*
1887	* Limit the number reserved to 1 pageblock or roughly 1% of a zone.
1888	* Check is race-prone but harmless.
1889	*/
1890	max_managed = (zone_managed_pages(zone) / `100`) + pageblock_nr_pages;
1891	if (zone->nr_reserved_highatomic >= max_managed)
1892	return;
1893
1894	spin_lock_irqsave(&zone->lock, flags);
1895
1896	/ Recheck the nr_reserved_highatomic limit under the lock /
1897	if (zone->nr_reserved_highatomic >= max_managed)
1898	goto out_unlock;
1899
1900	/ Yoink! /
1901	mt = get_pageblock_migratetype(page);
1902	/ Only reserve normal pageblocks (i.e., they can merge with others) /
1903	if (migratetype_is_mergeable(mt)) {
1904	zone->nr_reserved_highatomic += pageblock_nr_pages;
1905	set_pageblock_migratetype(page, migratetype: MIGRATE_HIGHATOMIC);
1906	move_freepages_block(zone, page, migratetype: MIGRATE_HIGHATOMIC, NULL);
1907	}
1908
1909	out_unlock:
1910	spin_unlock_irqrestore(lock: &zone->lock, flags);
1911	}
1912
1913	/*
1914	* Used when an allocation is about to fail under memory pressure. This
1915	* potentially hurts the reliability of high-order allocations when under
1916	* intense memory pressure but failed atomic allocations should be easier
1917	* to recover from than an OOM.
1918	*
1919	* If @force is true, try to unreserve a pageblock even though highatomic
1920	* pageblock is exhausted.
1921	*/
1922	static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
1923	bool force)
1924	{
1925	struct zonelist *zonelist = ac->zonelist;
1926	unsigned long flags;
1927	struct zoneref *z;
1928	struct zone *zone;
1929	struct page *page;
1930	int order;
1931	bool ret;
1932
1933	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
1934	ac->nodemask) {
1935	/*
1936	* Preserve at least one pageblock unless memory pressure
1937	* is really high.
1938	*/
1939	if (!force && zone->nr_reserved_highatomic <=
1940	pageblock_nr_pages)
1941	continue;
1942
1943	spin_lock_irqsave(&zone->lock, flags);
1944	for (order = `0`; order <= MAX_ORDER; order++) {
1945	struct free_area *area = &(zone->free_area[order]);
1946
1947	page = get_page_from_free_area(area, migratetype: MIGRATE_HIGHATOMIC);
1948	if (!page)
1949	continue;
1950
1951	/*
1952	* In page freeing path, migratetype change is racy so
1953	* we can counter several free pages in a pageblock
1954	* in this loop although we changed the pageblock type
1955	* from highatomic to ac->migratetype. So we should
1956	* adjust the count once.
1957	*/
1958	if (is_migrate_highatomic_page(page)) {
1959	/*
1960	* It should never happen but changes to
1961	* locking could inadvertently allow a per-cpu
1962	* drain to add pages to MIGRATE_HIGHATOMIC
1963	* while unreserving so be safe and watch for
1964	* underflows.
1965	*/
1966	zone->nr_reserved_highatomic -= min(
1967	pageblock_nr_pages,
1968	zone->nr_reserved_highatomic);
1969	}
1970
1971	/*
1972	* Convert to ac->migratetype and avoid the normal
1973	* pageblock stealing heuristics. Minimally, the caller
1974	* is doing the work and needs the pages. More
1975	* importantly, if the block was always converted to
1976	* MIGRATE_UNMOVABLE or another type then the number
1977	* of pageblocks that cannot be completely freed
1978	* may increase.
1979	*/
1980	set_pageblock_migratetype(page, migratetype: ac->migratetype);
1981	ret = move_freepages_block(zone, page, migratetype: ac->migratetype,
1982	NULL);
1983	if (ret) {
1984	spin_unlock_irqrestore(lock: &zone->lock, flags);
1985	return ret;
1986	}
1987	}
1988	spin_unlock_irqrestore(lock: &zone->lock, flags);
1989	}
1990
1991	return false;
1992	}
1993
1994	/*
1995	* Try finding a free buddy page on the fallback list and put it on the free
1996	* list of requested migratetype, possibly along with other pages from the same
1997	* block, depending on fragmentation avoidance heuristics. Returns true if
1998	* fallback was found so that __rmqueue_smallest() can grab it.
1999	*
2000	* The use of signed ints for order and current_order is a deliberate
2001	* deviation from the rest of this file, to make the for loop
2002	* condition simpler.
2003	*/
2004	static __always_inline bool
2005	__rmqueue_fallback(struct zone zone, int* order, int start_migratetype,
2006	unsigned int alloc_flags)
2007	{
2008	struct free_area *area;
2009	int current_order;
2010	int min_order = order;
2011	struct page *page;
2012	int fallback_mt;
2013	bool can_steal;
2014
2015	/*
2016	* Do not steal pages from freelists belonging to other pageblocks
2017	* i.e. orders < pageblock_order. If there are no local zones free,
2018	* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2019	*/
2020	if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
2021	min_order = pageblock_order;
2022
2023	/*
2024	* Find the largest available free page in the other list. This roughly
2025	* approximates finding the pageblock with the most free pages, which
2026	* would be too costly to do exactly.
2027	*/
2028	for (current_order = MAX_ORDER; current_order >= min_order;
2029	--current_order) {
2030	area = &(zone->free_area[current_order]);
2031	fallback_mt = find_suitable_fallback(area, order: current_order,
2032	migratetype: start_migratetype, only_stealable: false, can_steal: &can_steal);
2033	if (fallback_mt == -`1`)
2034	continue;
2035
2036	/*
2037	* We cannot steal all free pages from the pageblock and the
2038	* requested migratetype is movable. In that case it's better to
2039	* steal and split the smallest available page instead of the
2040	* largest available page, because even if the next movable
2041	* allocation falls back into a different pageblock than this
2042	* one, it won't cause permanent fragmentation.
2043	*/
2044	if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2045	&& current_order > order)
2046	goto find_smallest;
2047
2048	goto do_steal;
2049	}
2050
2051	return false;
2052
2053	find_smallest:
2054	for (current_order = order; current_order <= MAX_ORDER;
2055	current_order++) {
2056	area = &(zone->free_area[current_order]);
2057	fallback_mt = find_suitable_fallback(area, order: current_order,
2058	migratetype: start_migratetype, only_stealable: false, can_steal: &can_steal);
2059	if (fallback_mt != -`1`)
2060	break;
2061	}
2062
2063	/*
2064	* This should not happen - we already found a suitable fallback
2065	* when looking for the largest page.
2066	*/
2067	VM_BUG_ON(current_order > MAX_ORDER);
2068
2069	do_steal:
2070	page = get_page_from_free_area(area, migratetype: fallback_mt);
2071
2072	steal_suitable_fallback(zone, page, alloc_flags, start_type: start_migratetype,
2073	whole_block: can_steal);
2074
2075	trace_mm_page_alloc_extfrag(page, alloc_order: order, fallback_order: current_order,
2076	alloc_migratetype: start_migratetype, fallback_migratetype: fallback_mt);
2077
2078	return true;
2079
2080	}
2081
2082	/*
2083	* Do the hard work of removing an element from the buddy allocator.
2084	* Call me with the zone->lock already held.
2085	*/
2086	static __always_inline struct page *
2087	__rmqueue(struct zone zone, unsigned* int order, int migratetype,
2088	unsigned int alloc_flags)
2089	{
2090	struct page *page;
2091
2092	if (IS_ENABLED(CONFIG_CMA)) {
2093	/*
2094	* Balance movable allocations between regular and CMA areas by
2095	* allocating from CMA when over half of the zone's free memory
2096	* is in the CMA area.
2097	*/
2098	if (alloc_flags & ALLOC_CMA &&
2099	zone_page_state(zone, item: NR_FREE_CMA_PAGES) >
2100	zone_page_state(zone, item: NR_FREE_PAGES) / `2`) {
2101	page = __rmqueue_cma_fallback(zone, order);
2102	if (page)
2103	return page;
2104	}
2105	}
2106	retry:
2107	page = __rmqueue_smallest(zone, order, migratetype);
2108	if (unlikely(!page)) {
2109	if (alloc_flags & ALLOC_CMA)
2110	page = __rmqueue_cma_fallback(zone, order);
2111
2112	if (!page && __rmqueue_fallback(zone, order, start_migratetype: migratetype,
2113	alloc_flags))
2114	goto retry;
2115	}
2116	return page;
2117	}
2118
2119	/*
2120	* Obtain a specified number of elements from the buddy allocator, all under
2121	* a single hold of the lock, for efficiency. Add them to the supplied list.
2122	* Returns the number of new pages which were placed at *list.
2123	*/
2124	static int rmqueue_bulk(struct zone zone, unsigned* int order,
2125	unsigned long count, struct list_head *list,
2126	int migratetype, unsigned int alloc_flags)
2127	{
2128	unsigned long flags;
2129	int i;
2130
2131	spin_lock_irqsave(&zone->lock, flags);
2132	for (i = `0`; i < count; ++i) {
2133	struct page *page = __rmqueue(zone, order, migratetype,
2134	alloc_flags);
2135	if (unlikely(page == NULL))
2136	break;
2137
2138	/*
2139	* Split buddy pages returned by expand() are received here in
2140	* physical page order. The page is added to the tail of
2141	* caller's list. From the callers perspective, the linked list
2142	* is ordered by page number under some conditions. This is
2143	* useful for IO devices that can forward direction from the
2144	* head, thus also in the physical page order. This is useful
2145	* for IO devices that can merge IO requests if the physical
2146	* pages are ordered properly.
2147	*/
2148	list_add_tail(new: &page->pcp_list, head: list);
2149	if (is_migrate_cma(get_pcppage_migratetype(page)))
2150	__mod_zone_page_state(zone, item: NR_FREE_CMA_PAGES,
2151	-(`1` << order));
2152	}
2153
2154	__mod_zone_page_state(zone, item: NR_FREE_PAGES, -(i << order));
2155	spin_unlock_irqrestore(lock: &zone->lock, flags);
2156
2157	return i;
2158	}
2159
2160	/*
2161	* Called from the vmstat counter updater to decay the PCP high.
2162	* Return whether there are addition works to do.
2163	*/
2164	int decay_pcp_high(struct zone zone, struct* per_cpu_pages *pcp)
2165	{
2166	int high_min, to_drain, batch;
2167	int todo = `0`;
2168
2169	high_min = READ_ONCE(pcp->high_min);
2170	batch = READ_ONCE(pcp->batch);
2171	/*
2172	* Decrease pcp->high periodically to try to free possible
2173	* idle PCP pages. And, avoid to free too many pages to
2174	* control latency. This caps pcp->high decrement too.
2175	*/
2176	if (pcp->high > high_min) {
2177	pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2178	pcp->high - (pcp->high >> `3`), high_min);
2179	if (pcp->high > high_min)
2180	todo++;
2181	}
2182
2183	to_drain = pcp->count - pcp->high;
2184	if (to_drain > `0`) {
2185	spin_lock(lock: &pcp->lock);
2186	free_pcppages_bulk(zone, count: to_drain, pcp, pindex: `0`);
2187	spin_unlock(lock: &pcp->lock);
2188	todo++;
2189	}
2190
2191	return todo;
2192	}
2193
2194	#ifdef CONFIG_NUMA
2195	/*
2196	* Called from the vmstat counter updater to drain pagesets of this
2197	* currently executing processor on remote nodes after they have
2198	* expired.
2199	*/
2200	void drain_zone_pages(struct zone zone, struct* per_cpu_pages *pcp)
2201	{
2202	int to_drain, batch;
2203
2204	batch = READ_ONCE(pcp->batch);
2205	to_drain = min(pcp->count, batch);
2206	if (to_drain > `0`) {
2207	spin_lock(lock: &pcp->lock);
2208	free_pcppages_bulk(zone, count: to_drain, pcp, pindex: `0`);
2209	spin_unlock(lock: &pcp->lock);
2210	}
2211	}
2212	#endif
2213
2214	/*
2215	* Drain pcplists of the indicated processor and zone.
2216	*/
2217	static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2218	{
2219	struct per_cpu_pages *pcp;
2220
2221	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2222	if (pcp->count) {
2223	spin_lock(lock: &pcp->lock);
2224	free_pcppages_bulk(zone, count: pcp->count, pcp, pindex: `0`);
2225	spin_unlock(lock: &pcp->lock);
2226	}
2227	}
2228
2229	/*
2230	* Drain pcplists of all zones on the indicated processor.
2231	*/
2232	static void drain_pages(unsigned int cpu)
2233	{
2234	struct zone *zone;
2235
2236	for_each_populated_zone(zone) {
2237	drain_pages_zone(cpu, zone);
2238	}
2239	}
2240
2241	/*
2242	* Spill all of this CPU's per-cpu pages back into the buddy allocator.
2243	*/
2244	void drain_local_pages(struct zone *zone)
2245	{
2246	int cpu = smp_processor_id();
2247
2248	if (zone)
2249	drain_pages_zone(cpu, zone);
2250	else
2251	drain_pages(cpu);
2252	}
2253
2254	/*
2255	* The implementation of drain_all_pages(), exposing an extra parameter to
2256	* drain on all cpus.
2257	*
2258	* drain_all_pages() is optimized to only execute on cpus where pcplists are
2259	* not empty. The check for non-emptiness can however race with a free to
2260	* pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2261	* that need the guarantee that every CPU has drained can disable the
2262	* optimizing racy check.
2263	*/
2264	static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
2265	{
2266	int cpu;
2267
2268	/*
2269	* Allocate in the BSS so we won't require allocation in
2270	* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2271	*/
2272	static cpumask_t cpus_with_pcps;
2273
2274	/*
2275	* Do not drain if one is already in progress unless it's specific to
2276	* a zone. Such callers are primarily CMA and memory hotplug and need
2277	* the drain to be complete when the call returns.
2278	*/
2279	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2280	if (!zone)
2281	return;
2282	mutex_lock(&pcpu_drain_mutex);
2283	}
2284
2285	/*
2286	* We don't care about racing with CPU hotplug event
2287	* as offline notification will cause the notified
2288	* cpu to drain that CPU pcps and on_each_cpu_mask
2289	* disables preemption as part of its processing
2290	*/
2291	for_each_online_cpu(cpu) {
2292	struct per_cpu_pages *pcp;
2293	struct zone *z;
2294	bool has_pcps = false;
2295
2296	if (force_all_cpus) {
2297	/*
2298	* The pcp.count check is racy, some callers need a
2299	* guarantee that no cpu is missed.
2300	*/
2301	has_pcps = true;
2302	} else if (zone) {
2303	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2304	if (pcp->count)
2305	has_pcps = true;
2306	} else {
2307	for_each_populated_zone(z) {
2308	pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
2309	if (pcp->count) {
2310	has_pcps = true;
2311	break;
2312	}
2313	}
2314	}
2315
2316	if (has_pcps)
2317	cpumask_set_cpu(cpu, dstp: &cpus_with_pcps);
2318	else
2319	cpumask_clear_cpu(cpu, dstp: &cpus_with_pcps);
2320	}
2321
2322	for_each_cpu(cpu, &cpus_with_pcps) {
2323	if (zone)
2324	drain_pages_zone(cpu, zone);
2325	else
2326	drain_pages(cpu);
2327	}
2328
2329	mutex_unlock(lock: &pcpu_drain_mutex);
2330	}
2331
2332	/*
2333	* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2334	*
2335	* When zone parameter is non-NULL, spill just the single zone's pages.
2336	*/
2337	void drain_all_pages(struct zone *zone)
2338	{
2339	__drain_all_pages(zone, force_all_cpus: false);
2340	}
2341
2342	static bool free_unref_page_prepare(struct page page, unsigned* long pfn,
2343	unsigned int order)
2344	{
2345	int migratetype;
2346
2347	if (!free_pages_prepare(page, order, FPI_NONE))
2348	return false;
2349
2350	migratetype = get_pfnblock_migratetype(page, pfn);
2351	set_pcppage_migratetype(page, migratetype);
2352	return true;
2353	}
2354
2355	static int nr_pcp_free(struct per_cpu_pages pcp, int* batch, int high, bool free_high)
2356	{
2357	int min_nr_free, max_nr_free;
2358
2359	/ Free as much as possible if batch freeing high-order pages. /
2360	if (unlikely(free_high))
2361	return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
2362
2363	/ Check for PCP disabled or boot pageset /
2364	if (unlikely(high < batch))
2365	return `1`;
2366
2367	/ Leave at least pcp->batch pages on the list /
2368	min_nr_free = batch;
2369	max_nr_free = high - batch;
2370
2371	/*
2372	* Increase the batch number to the number of the consecutive
2373	* freed pages to reduce zone lock contention.
2374	*/
2375	batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
2376
2377	return batch;
2378	}
2379
2380	static int nr_pcp_high(struct per_cpu_pages pcp, struct* zone *zone,
2381	int batch, bool free_high)
2382	{
2383	int high, high_min, high_max;
2384
2385	high_min = READ_ONCE(pcp->high_min);
2386	high_max = READ_ONCE(pcp->high_max);
2387	high = pcp->high = clamp(pcp->high, high_min, high_max);
2388
2389	if (unlikely(!high))
2390	return `0`;
2391
2392	if (unlikely(free_high)) {
2393	pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2394	high_min);
2395	return `0`;
2396	}
2397
2398	/*
2399	* If reclaim is active, limit the number of pages that can be
2400	* stored on pcp lists
2401	*/
2402	if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
2403	int free_count = max_t(int, pcp->free_count, batch);
2404
2405	pcp->high = max(high - free_count, high_min);
2406	return min(batch << `2`, pcp->high);
2407	}
2408
2409	if (high_min == high_max)
2410	return high;
2411
2412	if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
2413	int free_count = max_t(int, pcp->free_count, batch);
2414
2415	pcp->high = max(high - free_count, high_min);
2416	high = max(pcp->count, high_min);
2417	} else if (pcp->count >= high) {
2418	int need_high = pcp->free_count + batch;
2419
2420	/ pcp->high should be large enough to hold batch freed pages /
2421	if (pcp->high < need_high)
2422	pcp->high = clamp(need_high, high_min, high_max);
2423	}
2424
2425	return high;
2426	}
2427
2428	static void free_unref_page_commit(struct zone zone, struct* per_cpu_pages *pcp,
2429	struct page page, int* migratetype,
2430	unsigned int order)
2431	{
2432	int high, batch;
2433	int pindex;
2434	bool free_high = false;
2435
2436	/*
2437	* On freeing, reduce the number of pages that are batch allocated.
2438	* See nr_pcp_alloc() where alloc_factor is increased for subsequent
2439	* allocations.
2440	*/
2441	pcp->alloc_factor >>= `1`;
2442	__count_vm_events(item: PGFREE, delta: `1` << order);
2443	pindex = order_to_pindex(migratetype, order);
2444	list_add(new: &page->pcp_list, head: &pcp->lists[pindex]);
2445	pcp->count += `1` << order;
2446
2447	batch = READ_ONCE(pcp->batch);
2448	/*
2449	* As high-order pages other than THP's stored on PCP can contribute
2450	* to fragmentation, limit the number stored when PCP is heavily
2451	* freeing without allocation. The remainder after bulk freeing
2452	* stops will be drained from vmstat refresh context.
2453	*/
2454	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
2455	free_high = (pcp->free_count >= batch &&
2456	(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
2457	(!(pcp->flags & PCPF_FREE_HIGH_BATCH) \|\|
2458	pcp->count >= READ_ONCE(batch)));
2459	pcp->flags \|= PCPF_PREV_FREE_HIGH_ORDER;
2460	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
2461	pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
2462	}
2463	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
2464	pcp->free_count += (`1` << order);
2465	high = nr_pcp_high(pcp, zone, batch, free_high);
2466	if (pcp->count >= high) {
2467	free_pcppages_bulk(zone, count: nr_pcp_free(pcp, batch, high, free_high),
2468	pcp, pindex);
2469	if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
2470	zone_watermark_ok(z: zone, order: `0`, high_wmark_pages(zone),
2471	highest_zoneidx: ZONE_MOVABLE, alloc_flags: `0`))
2472	clear_bit(nr: ZONE_BELOW_HIGH, addr: &zone->flags);
2473	}
2474	}
2475
2476	/*
2477	* Free a pcp page
2478	*/
2479	void free_unref_page(struct page page, unsigned* int order)
2480	{
2481	unsigned long __maybe_unused UP_flags;
2482	struct per_cpu_pages *pcp;
2483	struct zone *zone;
2484	unsigned long pfn = page_to_pfn(page);
2485	int migratetype, pcpmigratetype;
2486
2487	if (!free_unref_page_prepare(page, pfn, order))
2488	return;
2489
2490	/*
2491	* We only track unmovable, reclaimable and movable on pcp lists.
2492	* Place ISOLATE pages on the isolated list because they are being
2493	* offlined but treat HIGHATOMIC and CMA as movable pages so we can
2494	* get those areas back if necessary. Otherwise, we may have to free
2495	* excessively into the page allocator
2496	*/
2497	migratetype = pcpmigratetype = get_pcppage_migratetype(page);
2498	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
2499	if (unlikely(is_migrate_isolate(migratetype))) {
2500	free_one_page(zone: page_zone(page), page, pfn, order, migratetype, FPI_NONE);
2501	return;
2502	}
2503	pcpmigratetype = MIGRATE_MOVABLE;
2504	}
2505
2506	zone = page_zone(page);
2507	pcp_trylock_prepare(UP_flags);
2508	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2509	if (pcp) {
2510	free_unref_page_commit(zone, pcp, page, migratetype: pcpmigratetype, order);
2511	pcp_spin_unlock(pcp);
2512	} else {
2513	free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
2514	}
2515	pcp_trylock_finish(UP_flags);
2516	}
2517
2518	/*
2519	* Free a list of 0-order pages
2520	*/
2521	void free_unref_page_list(struct list_head *list)
2522	{
2523	unsigned long __maybe_unused UP_flags;
2524	struct page page, next;
2525	struct per_cpu_pages *pcp = NULL;
2526	struct zone *locked_zone = NULL;
2527	int batch_count = `0`;
2528	int migratetype;
2529
2530	/ Prepare pages for freeing /
2531	list_for_each_entry_safe(page, next, list, lru) {
2532	unsigned long pfn = page_to_pfn(page);
2533	if (!free_unref_page_prepare(page, pfn, order: `0`)) {
2534	list_del(entry: &page->lru);
2535	continue;
2536	}
2537
2538	/*
2539	* Free isolated pages directly to the allocator, see
2540	* comment in free_unref_page.
2541	*/
2542	migratetype = get_pcppage_migratetype(page);
2543	if (unlikely(is_migrate_isolate(migratetype))) {
2544	list_del(entry: &page->lru);
2545	free_one_page(zone: page_zone(page), page, pfn, order: `0`, migratetype, FPI_NONE);
2546	continue;
2547	}
2548	}
2549
2550	list_for_each_entry_safe(page, next, list, lru) {
2551	struct zone *zone = page_zone(page);
2552
2553	list_del(entry: &page->lru);
2554	migratetype = get_pcppage_migratetype(page);
2555
2556	/*
2557	* Either different zone requiring a different pcp lock or
2558	* excessive lock hold times when freeing a large list of
2559	* pages.
2560	*/
2561	if (zone != locked_zone \|\| batch_count == SWAP_CLUSTER_MAX) {
2562	if (pcp) {
2563	pcp_spin_unlock(pcp);
2564	pcp_trylock_finish(UP_flags);
2565	}
2566
2567	batch_count = `0`;
2568
2569	/*
2570	* trylock is necessary as pages may be getting freed
2571	* from IRQ or SoftIRQ context after an IO completion.
2572	*/
2573	pcp_trylock_prepare(UP_flags);
2574	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2575	if (unlikely(!pcp)) {
2576	pcp_trylock_finish(UP_flags);
2577	free_one_page(zone, page, page_to_pfn(page),
2578	order: `0`, migratetype, FPI_NONE);
2579	locked_zone = NULL;
2580	continue;
2581	}
2582	locked_zone = zone;
2583	}
2584
2585	/*
2586	* Non-isolated types over MIGRATE_PCPTYPES get added
2587	* to the MIGRATE_MOVABLE pcp list.
2588	*/
2589	if (unlikely(migratetype >= MIGRATE_PCPTYPES))
2590	migratetype = MIGRATE_MOVABLE;
2591
2592	trace_mm_page_free_batched(page);
2593	free_unref_page_commit(zone, pcp, page, migratetype, order: `0`);
2594	batch_count++;
2595	}
2596
2597	if (pcp) {
2598	pcp_spin_unlock(pcp);
2599	pcp_trylock_finish(UP_flags);
2600	}
2601	}
2602
2603	/*
2604	* split_page takes a non-compound higher-order page, and splits it into
2605	* n (1<<order) sub-pages: page[0..n]
2606	* Each sub-page must be freed individually.
2607	*
2608	* Note: this is probably too low level an operation for use in drivers.
2609	* Please consult with lkml before using this in your driver.
2610	*/
2611	void split_page(struct page page, unsigned* int order)
2612	{
2613	int i;
2614
2615	VM_BUG_ON_PAGE(PageCompound(page), page);
2616	VM_BUG_ON_PAGE(!page_count(page), page);
2617
2618	for (i = `1`; i < (`1` << order); i++)
2619	set_page_refcounted(page + i);
2620	split_page_owner(page, nr: `1` << order);
2621	split_page_memcg(head: page, nr: `1` << order);
2622	}
2623	EXPORT_SYMBOL_GPL(split_page);
2624
2625	int __isolate_free_page(struct page page, unsigned* int order)
2626	{
2627	struct zone *zone = page_zone(page);
2628	int mt = get_pageblock_migratetype(page);
2629
2630	if (!is_migrate_isolate(migratetype: mt)) {
2631	unsigned long watermark;
2632	/*
2633	* Obey watermarks as if the page was being allocated. We can
2634	* emulate a high-order watermark check with a raised order-0
2635	* watermark, because we already know our high-order page
2636	* exists.
2637	*/
2638	watermark = zone->_watermark[WMARK_MIN] + (`1UL` << order);
2639	if (!zone_watermark_ok(z: zone, order: `0`, mark: watermark, highest_zoneidx: `0`, ALLOC_CMA))
2640	return `0`;
2641
2642	__mod_zone_freepage_state(zone, nr_pages: -(`1UL` << order), migratetype: mt);
2643	}
2644
2645	del_page_from_free_list(page, zone, order);
2646
2647	/*
2648	* Set the pageblock if the isolated page is at least half of a
2649	* pageblock
2650	*/
2651	if (order >= pageblock_order - `1`) {
2652	struct page *endpage = page + (`1` << order) - `1`;
2653	for (; page < endpage; page += pageblock_nr_pages) {
2654	int mt = get_pageblock_migratetype(page);
2655	/*
2656	* Only change normal pageblocks (i.e., they can merge
2657	* with others)
2658	*/
2659	if (migratetype_is_mergeable(mt))
2660	set_pageblock_migratetype(page,
2661	migratetype: MIGRATE_MOVABLE);
2662	}
2663	}
2664
2665	return `1UL` << order;
2666	}
2667
2668	/**
2669	* __putback_isolated_page - Return a now-isolated page back where we got it
2670	* @page: Page that was isolated
2671	* @order: Order of the isolated page
2672	* @mt: The page's pageblock's migratetype
2673	*
2674	* This function is meant to return a page pulled from the free lists via
2675	* __isolate_free_page back to the free lists they were pulled from.
2676	*/
2677	void __putback_isolated_page(struct page page, unsigned* int order, int mt)
2678	{
2679	struct zone *zone = page_zone(page);
2680
2681	/ zone lock should be held when this function is called /
2682	lockdep_assert_held(&zone->lock);
2683
2684	/ Return isolated page to tail of freelist. /
2685	__free_one_page(page, page_to_pfn(page), zone, order, migratetype: mt,
2686	FPI_SKIP_REPORT_NOTIFY \| FPI_TO_TAIL);
2687	}
2688
2689	/*
2690	* Update NUMA hit/miss statistics
2691	*/
2692	static inline void zone_statistics(struct zone preferred_zone, struct* zone *z,
2693	long nr_account)
2694	{
2695	#ifdef CONFIG_NUMA
2696	enum numa_stat_item local_stat = NUMA_LOCAL;
2697
2698	/ skip numa counters update if numa stats is disabled /
2699	if (!static_branch_likely(&vm_numa_stat_key))
2700	return;
2701
2702	if (zone_to_nid(zone: z) != numa_node_id())
2703	local_stat = NUMA_OTHER;
2704
2705	if (zone_to_nid(zone: z) == zone_to_nid(zone: preferred_zone))
2706	__count_numa_events(zone: z, item: NUMA_HIT, delta: nr_account);
2707	else {
2708	__count_numa_events(zone: z, item: NUMA_MISS, delta: nr_account);
2709	__count_numa_events(zone: preferred_zone, item: NUMA_FOREIGN, delta: nr_account);
2710	}
2711	__count_numa_events(zone: z, item: local_stat, delta: nr_account);
2712	#endif
2713	}
2714
2715	static __always_inline
2716	struct page rmqueue_buddy(struct* zone preferred_zone, struct* zone *zone,
2717	unsigned int order, unsigned int alloc_flags,
2718	int migratetype)
2719	{
2720	struct page *page;
2721	unsigned long flags;
2722
2723	do {
2724	page = NULL;
2725	spin_lock_irqsave(&zone->lock, flags);
2726	if (alloc_flags & ALLOC_HIGHATOMIC)
2727	page = __rmqueue_smallest(zone, order, migratetype: MIGRATE_HIGHATOMIC);
2728	if (!page) {
2729	page = __rmqueue(zone, order, migratetype, alloc_flags);
2730
2731	/*
2732	* If the allocation fails, allow OOM handling access
2733	* to HIGHATOMIC reserves as failing now is worse than
2734	* failing a high-order atomic allocation in the
2735	* future.
2736	*/
2737	if (!page && (alloc_flags & ALLOC_OOM))
2738	page = __rmqueue_smallest(zone, order, migratetype: MIGRATE_HIGHATOMIC);
2739
2740	if (!page) {
2741	spin_unlock_irqrestore(lock: &zone->lock, flags);
2742	return NULL;
2743	}
2744	}
2745	__mod_zone_freepage_state(zone, nr_pages: -(`1` << order),
2746	migratetype: get_pcppage_migratetype(page));
2747	spin_unlock_irqrestore(lock: &zone->lock, flags);
2748	} while (check_new_pages(page, order));
2749
2750	__count_zid_vm_events(PGALLOC, page_zonenum(page), `1` << order);
2751	zone_statistics(preferred_zone, z: zone, nr_account: `1`);
2752
2753	return page;
2754	}
2755
2756	static int nr_pcp_alloc(struct per_cpu_pages pcp, struct* zone zone, int* order)
2757	{
2758	int high, base_batch, batch, max_nr_alloc;
2759	int high_max, high_min;
2760
2761	base_batch = READ_ONCE(pcp->batch);
2762	high_min = READ_ONCE(pcp->high_min);
2763	high_max = READ_ONCE(pcp->high_max);
2764	high = pcp->high = clamp(pcp->high, high_min, high_max);
2765
2766	/ Check for PCP disabled or boot pageset /
2767	if (unlikely(high < base_batch))
2768	return `1`;
2769
2770	if (order)
2771	batch = base_batch;
2772	else
2773	batch = (base_batch << pcp->alloc_factor);
2774
2775	/*
2776	* If we had larger pcp->high, we could avoid to allocate from
2777	* zone.
2778	*/
2779	if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
2780	high = pcp->high = min(high + batch, high_max);
2781
2782	if (!order) {
2783	max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
2784	/*
2785	* Double the number of pages allocated each time there is
2786	* subsequent allocation of order-0 pages without any freeing.
2787	*/
2788	if (batch <= max_nr_alloc &&
2789	pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
2790	pcp->alloc_factor++;
2791	batch = min(batch, max_nr_alloc);
2792	}
2793
2794	/*
2795	* Scale batch relative to order if batch implies free pages
2796	* can be stored on the PCP. Batch can be 1 for small zones or
2797	* for boot pagesets which should never store free pages as
2798	* the pages may belong to arbitrary zones.
2799	*/
2800	if (batch > `1`)
2801	batch = max(batch >> order, `2`);
2802
2803	return batch;
2804	}
2805
2806	/ Remove page from the per-cpu list, caller must protect the list /
2807	static inline
2808	struct page __rmqueue_pcplist(struct* zone zone, unsigned* int order,
2809	int migratetype,
2810	unsigned int alloc_flags,
2811	struct per_cpu_pages *pcp,
2812	struct list_head *list)
2813	{
2814	struct page *page;
2815
2816	do {
2817	if (list_empty(head: list)) {
2818	int batch = nr_pcp_alloc(pcp, zone, order);
2819	int alloced;
2820
2821	alloced = rmqueue_bulk(zone, order,
2822	count: batch, list,
2823	migratetype, alloc_flags);
2824
2825	pcp->count += alloced << order;
2826	if (unlikely(list_empty(list)))
2827	return NULL;
2828	}
2829
2830	page = list_first_entry(list, struct page, pcp_list);
2831	list_del(entry: &page->pcp_list);
2832	pcp->count -= `1` << order;
2833	} while (check_new_pages(page, order));
2834
2835	return page;
2836	}
2837
2838	/ Lock and remove page from the per-cpu list /
2839	static struct page rmqueue_pcplist(struct* zone *preferred_zone,
2840	struct zone zone, unsigned* int order,
2841	int migratetype, unsigned int alloc_flags)
2842	{
2843	struct per_cpu_pages *pcp;
2844	struct list_head *list;
2845	struct page *page;
2846	unsigned long __maybe_unused UP_flags;
2847
2848	/ spin_trylock may fail due to a parallel drain or IRQ reentrancy. /
2849	pcp_trylock_prepare(UP_flags);
2850	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2851	if (!pcp) {
2852	pcp_trylock_finish(UP_flags);
2853	return NULL;
2854	}
2855
2856	/*
2857	* On allocation, reduce the number of pages that are batch freed.
2858	* See nr_pcp_free() where free_factor is increased for subsequent
2859	* frees.
2860	*/
2861	pcp->free_count >>= `1`;
2862	list = &pcp->lists[order_to_pindex(migratetype, order)];
2863	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
2864	pcp_spin_unlock(pcp);
2865	pcp_trylock_finish(UP_flags);
2866	if (page) {
2867	__count_zid_vm_events(PGALLOC, page_zonenum(page), `1` << order);
2868	zone_statistics(preferred_zone, z: zone, nr_account: `1`);
2869	}
2870	return page;
2871	}
2872
2873	/*
2874	* Allocate a page from the given zone.
2875	* Use pcplists for THP or "cheap" high-order allocations.
2876	*/
2877
2878	/*
2879	* Do not instrument rmqueue() with KMSAN. This function may call
2880	* __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
2881	* If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
2882	* may call rmqueue() again, which will result in a deadlock.
2883	*/
2884	__no_sanitize_memory
2885	static inline
2886	struct page rmqueue(struct* zone *preferred_zone,
2887	struct zone zone, unsigned* int order,
2888	gfp_t gfp_flags, unsigned int alloc_flags,
2889	int migratetype)
2890	{
2891	struct page *page;
2892
2893	/*
2894	* We most definitely don't want callers attempting to
2895	* allocate greater than order-1 page units with __GFP_NOFAIL.
2896	*/
2897	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > `1`));
2898
2899	if (likely(pcp_allowed_order(order))) {
2900	page = rmqueue_pcplist(preferred_zone, zone, order,
2901	migratetype, alloc_flags);
2902	if (likely(page))
2903	goto out;
2904	}
2905
2906	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
2907	migratetype);
2908
2909	out:
2910	/ Separate test+clear to avoid unnecessary atomics /
2911	if ((alloc_flags & ALLOC_KSWAPD) &&
2912	unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
2913	clear_bit(nr: ZONE_BOOSTED_WATERMARK, addr: &zone->flags);
2914	wakeup_kswapd(zone, gfp_mask: `0`, order: `0`, zone_idx(zone));
2915	}
2916
2917	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2918	return page;
2919	}
2920
2921	noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2922	{
2923	return __should_fail_alloc_page(gfp_mask, order);
2924	}
2925	ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
2926
2927	static inline long __zone_watermark_unusable_free(struct zone *z,
2928	unsigned int order, unsigned int alloc_flags)
2929	{
2930	long unusable_free = (`1` << order) - `1`;
2931
2932	/*
2933	* If the caller does not have rights to reserves below the min
2934	* watermark then subtract the high-atomic reserves. This will
2935	* over-estimate the size of the atomic reserve but it avoids a search.
2936	*/
2937	if (likely(!(alloc_flags & ALLOC_RESERVES)))
2938	unusable_free += z->nr_reserved_highatomic;
2939
2940	#ifdef CONFIG_CMA
2941	/ If allocation can't use CMA areas don't use free CMA pages /
2942	if (!(alloc_flags & ALLOC_CMA))
2943	unusable_free += zone_page_state(zone: z, item: NR_FREE_CMA_PAGES);
2944	#endif
2945	#ifdef CONFIG_UNACCEPTED_MEMORY
2946	unusable_free += zone_page_state(zone: z, item: NR_UNACCEPTED);
2947	#endif
2948
2949	return unusable_free;
2950	}
2951
2952	/*
2953	* Return true if free base pages are above 'mark'. For high-order checks it
2954	* will return true of the order-0 watermark is reached and there is at least
2955	* one free page of a suitable size. Checking now avoids taking the zone lock
2956	* to check in the allocation paths if no pages are free.
2957	*/
2958	bool __zone_watermark_ok(struct zone z, unsigned* int order, unsigned long mark,
2959	int highest_zoneidx, unsigned int alloc_flags,
2960	long free_pages)
2961	{
2962	long min = mark;
2963	int o;
2964
2965	/ free_pages may go negative - that's OK /
2966	free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
2967
2968	if (unlikely(alloc_flags & ALLOC_RESERVES)) {
2969	/*
2970	* __GFP_HIGH allows access to 50% of the min reserve as well
2971	* as OOM.
2972	*/
2973	if (alloc_flags & ALLOC_MIN_RESERVE) {
2974	min -= min / `2`;
2975
2976	/*
2977	* Non-blocking allocations (e.g. GFP_ATOMIC) can
2978	* access more reserves than just __GFP_HIGH. Other
2979	* non-blocking allocations requests such as GFP_NOWAIT
2980	* or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
2981	* access to the min reserve.
2982	*/
2983	if (alloc_flags & ALLOC_NON_BLOCK)
2984	min -= min / `4`;
2985	}
2986
2987	/*
2988	* OOM victims can try even harder than the normal reserve
2989	* users on the grounds that it's definitely going to be in
2990	* the exit path shortly and free memory. Any allocation it
2991	* makes during the free path will be small and short-lived.
2992	*/
2993	if (alloc_flags & ALLOC_OOM)
2994	min -= min / `2`;
2995	}
2996
2997	/*
2998	* Check watermarks for an order-0 allocation request. If these
2999	* are not met, then a high-order request also cannot go ahead
3000	* even if a suitable page happened to be free.
3001	*/
3002	if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
3003	return false;
3004
3005	/ If this is an order-0 request then the watermark is fine /
3006	if (!order)
3007	return true;
3008
3009	/ For a high-order request, check at least one suitable page is free /
3010	for (o = order; o <= MAX_ORDER; o++) {
3011	struct free_area *area = &z->free_area[o];
3012	int mt;
3013
3014	if (!area->nr_free)
3015	continue;
3016
3017	for (mt = `0`; mt < MIGRATE_PCPTYPES; mt++) {
3018	if (!free_area_empty(area, migratetype: mt))
3019	return true;
3020	}
3021
3022	#ifdef CONFIG_CMA
3023	if ((alloc_flags & ALLOC_CMA) &&
3024	!free_area_empty(area, migratetype: MIGRATE_CMA)) {
3025	return true;
3026	}
3027	#endif
3028	if ((alloc_flags & (ALLOC_HIGHATOMIC\|ALLOC_OOM)) &&
3029	!free_area_empty(area, migratetype: MIGRATE_HIGHATOMIC)) {
3030	return true;
3031	}
3032	}
3033	return false;
3034	}
3035
3036	bool zone_watermark_ok(struct zone z, unsigned* int order, unsigned long mark,
3037	int highest_zoneidx, unsigned int alloc_flags)
3038	{
3039	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3040	free_pages: zone_page_state(zone: z, item: NR_FREE_PAGES));
3041	}
3042
3043	static inline bool zone_watermark_fast(struct zone z, unsigned* int order,
3044	unsigned long mark, int highest_zoneidx,
3045	unsigned int alloc_flags, gfp_t gfp_mask)
3046	{
3047	long free_pages;
3048
3049	free_pages = zone_page_state(zone: z, item: NR_FREE_PAGES);
3050
3051	/*
3052	* Fast check for order-0 only. If this fails then the reserves
3053	* need to be calculated.
3054	*/
3055	if (!order) {
3056	long usable_free;
3057	long reserved;
3058
3059	usable_free = free_pages;
3060	reserved = __zone_watermark_unusable_free(z, order: `0`, alloc_flags);
3061
3062	/ reserved may over estimate high-atomic reserves. /
3063	usable_free -= min(usable_free, reserved);
3064	if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3065	return true;
3066	}
3067
3068	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3069	free_pages))
3070	return true;
3071
3072	/*
3073	* Ignore watermark boosting for __GFP_HIGH order-0 allocations
3074	* when checking the min watermark. The min watermark is the
3075	* point where boosting is ignored so that kswapd is woken up
3076	* when below the low watermark.
3077	*/
3078	if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
3079	&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3080	mark = z->_watermark[WMARK_MIN];
3081	return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3082	alloc_flags, free_pages);
3083	}
3084
3085	return false;
3086	}
3087
3088	bool zone_watermark_ok_safe(struct zone z, unsigned* int order,
3089	unsigned long mark, int highest_zoneidx)
3090	{
3091	long free_pages = zone_page_state(zone: z, item: NR_FREE_PAGES);
3092
3093	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3094	free_pages = zone_page_state_snapshot(zone: z, item: NR_FREE_PAGES);
3095
3096	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags: `0`,
3097	free_pages);
3098	}
3099
3100	#ifdef CONFIG_NUMA
3101	int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
3102
3103	static bool zone_allows_reclaim(struct zone local_zone, struct* zone *zone)
3104	{
3105	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3106	node_reclaim_distance;
3107	}
3108	#else /* CONFIG_NUMA */
3109	static bool zone_allows_reclaim(struct zone local_zone, struct* zone *zone)
3110	{
3111	return true;
3112	}
3113	#endif /* CONFIG_NUMA */
3114
3115	/*
3116	* The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3117	* fragmentation is subtle. If the preferred zone was HIGHMEM then
3118	* premature use of a lower zone may cause lowmem pressure problems that
3119	* are worse than fragmentation. If the next zone is ZONE_DMA then it is
3120	* probably too small. It only makes sense to spread allocations to avoid
3121	* fragmentation between the Normal and DMA32 zones.
3122	*/
3123	static inline unsigned int
3124	alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3125	{
3126	unsigned int alloc_flags;
3127
3128	/*
3129	* __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3130	* to save a branch.
3131	*/
3132	alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3133
3134	#ifdef CONFIG_ZONE_DMA32
3135	if (!zone)
3136	return alloc_flags;
3137
3138	if (zone_idx(zone) != ZONE_NORMAL)
3139	return alloc_flags;
3140
3141	/*
3142	* If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3143	* the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3144	* on UMA that if Normal is populated then so is DMA32.
3145	*/
3146	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != `1`);
3147	if (nr_online_nodes > `1` && !populated_zone(zone: --zone))
3148	return alloc_flags;
3149
3150	alloc_flags \|= ALLOC_NOFRAGMENT;
3151	#endif /* CONFIG_ZONE_DMA32 */
3152	return alloc_flags;
3153	}
3154
3155	/ Must be called after current_gfp_context() which can change gfp_mask /
3156	static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
3157	unsigned int alloc_flags)
3158	{
3159	#ifdef CONFIG_CMA
3160	if (gfp_migratetype(gfp_flags: gfp_mask) == MIGRATE_MOVABLE)
3161	alloc_flags \|= ALLOC_CMA;
3162	#endif
3163	return alloc_flags;
3164	}
3165
3166	/*
3167	* get_page_from_freelist goes through the zonelist trying to allocate
3168	* a page.
3169	*/
3170	static struct page *
3171	get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3172	const struct alloc_context *ac)
3173	{
3174	struct zoneref *z;
3175	struct zone *zone;
3176	struct pglist_data *last_pgdat = NULL;
3177	bool last_pgdat_dirty_ok = false;
3178	bool no_fallback;
3179
3180	retry:
3181	/*
3182	* Scan zonelist, looking for a zone with enough free.
3183	* See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
3184	*/
3185	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3186	z = ac->preferred_zoneref;
3187	for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3188	ac->nodemask) {
3189	struct page *page;
3190	unsigned long mark;
3191
3192	if (cpusets_enabled() &&
3193	(alloc_flags & ALLOC_CPUSET) &&
3194	!__cpuset_zone_allowed(z: zone, gfp_mask))
3195	continue;
3196	/*
3197	* When allocating a page cache page for writing, we
3198	* want to get it from a node that is within its dirty
3199	* limit, such that no single node holds more than its
3200	* proportional share of globally allowed dirty pages.
3201	* The dirty limits take into account the node's
3202	* lowmem reserves and high watermark so that kswapd
3203	* should be able to balance it without having to
3204	* write pages from its LRU list.
3205	*
3206	* XXX: For now, allow allocations to potentially
3207	* exceed the per-node dirty limit in the slowpath
3208	* (spread_dirty_pages unset) before going into reclaim,
3209	* which is important when on a NUMA setup the allowed
3210	* nodes are together not big enough to reach the
3211	* global limit. The proper fix for these situations
3212	* will require awareness of nodes in the
3213	* dirty-throttling and the flusher threads.
3214	*/
3215	if (ac->spread_dirty_pages) {
3216	if (last_pgdat != zone->zone_pgdat) {
3217	last_pgdat = zone->zone_pgdat;
3218	last_pgdat_dirty_ok = node_dirty_ok(pgdat: zone->zone_pgdat);
3219	}
3220
3221	if (!last_pgdat_dirty_ok)
3222	continue;
3223	}
3224
3225	if (no_fallback && nr_online_nodes > `1` &&
3226	zone != ac->preferred_zoneref->zone) {
3227	int local_nid;
3228
3229	/*
3230	* If moving to a remote node, retry but allow
3231	* fragmenting fallbacks. Locality is more important
3232	* than fragmentation avoidance.
3233	*/
3234	local_nid = zone_to_nid(zone: ac->preferred_zoneref->zone);
3235	if (zone_to_nid(zone) != local_nid) {
3236	alloc_flags &= ~ALLOC_NOFRAGMENT;
3237	goto retry;
3238	}
3239	}
3240
3241	/*
3242	* Detect whether the number of free pages is below high
3243	* watermark. If so, we will decrease pcp->high and free
3244	* PCP pages in free path to reduce the possibility of
3245	* premature page reclaiming. Detection is done here to
3246	* avoid to do that in hotter free path.
3247	*/
3248	if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
3249	goto check_alloc_wmark;
3250
3251	mark = high_wmark_pages(zone);
3252	if (zone_watermark_fast(z: zone, order, mark,
3253	highest_zoneidx: ac->highest_zoneidx, alloc_flags,
3254	gfp_mask))
3255	goto try_this_zone;
3256	else
3257	set_bit(nr: ZONE_BELOW_HIGH, addr: &zone->flags);
3258
3259	check_alloc_wmark:
3260	mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3261	if (!zone_watermark_fast(z: zone, order, mark,
3262	highest_zoneidx: ac->highest_zoneidx, alloc_flags,
3263	gfp_mask)) {
3264	int ret;
3265
3266	if (has_unaccepted_memory()) {
3267	if (try_to_accept_memory(zone, order))
3268	goto try_this_zone;
3269	}
3270
3271	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3272	/*
3273	* Watermark failed for this zone, but see if we can
3274	* grow this zone if it contains deferred pages.
3275	*/
3276	if (deferred_pages_enabled()) {
3277	if (_deferred_grow_zone(zone, order))
3278	goto try_this_zone;
3279	}
3280	#endif
3281	/ Checked here to keep the fast path fast /
3282	BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3283	if (alloc_flags & ALLOC_NO_WATERMARKS)
3284	goto try_this_zone;
3285
3286	if (!node_reclaim_enabled() \|\|
3287	!zone_allows_reclaim(local_zone: ac->preferred_zoneref->zone, zone))
3288	continue;
3289
3290	ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3291	switch (ret) {
3292	case NODE_RECLAIM_NOSCAN:
3293	/ did not scan /
3294	continue;
3295	case NODE_RECLAIM_FULL:
3296	/ scanned but unreclaimable /
3297	continue;
3298	default:
3299	/ did we reclaim enough /
3300	if (zone_watermark_ok(z: zone, order, mark,
3301	highest_zoneidx: ac->highest_zoneidx, alloc_flags))
3302	goto try_this_zone;
3303
3304	continue;
3305	}
3306	}
3307
3308	try_this_zone:
3309	page = rmqueue(preferred_zone: ac->preferred_zoneref->zone, zone, order,
3310	gfp_flags: gfp_mask, alloc_flags, migratetype: ac->migratetype);
3311	if (page) {
3312	prep_new_page(page, order, gfp_flags: gfp_mask, alloc_flags);
3313
3314	/*
3315	* If this is a high-order atomic allocation then check
3316	* if the pageblock should be reserved for the future
3317	*/
3318	if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
3319	reserve_highatomic_pageblock(page, zone);
3320
3321	return page;
3322	} else {
3323	if (has_unaccepted_memory()) {
3324	if (try_to_accept_memory(zone, order))
3325	goto try_this_zone;
3326	}
3327
3328	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3329	/ Try again if zone has deferred pages /
3330	if (deferred_pages_enabled()) {
3331	if (_deferred_grow_zone(zone, order))
3332	goto try_this_zone;
3333	}
3334	#endif
3335	}
3336	}
3337
3338	/*
3339	* It's possible on a UMA machine to get through all zones that are
3340	* fragmented. If avoiding fragmentation, reset and try again.
3341	*/
3342	if (no_fallback) {
3343	alloc_flags &= ~ALLOC_NOFRAGMENT;
3344	goto retry;
3345	}
3346
3347	return NULL;
3348	}
3349
3350	static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3351	{
3352	unsigned int filter = SHOW_MEM_FILTER_NODES;
3353
3354	/*
3355	* This documents exceptions given to allocations in certain
3356	* contexts that are allowed to allocate outside current's set
3357	* of allowed nodes.
3358	*/
3359	if (!(gfp_mask & __GFP_NOMEMALLOC))
3360	if (tsk_is_oom_victim(current) \|\|
3361	(current->flags & (PF_MEMALLOC \| PF_EXITING)))
3362	filter &= ~SHOW_MEM_FILTER_NODES;
3363	if (!in_task() \|\| !(gfp_mask & __GFP_DIRECT_RECLAIM))
3364	filter &= ~SHOW_MEM_FILTER_NODES;
3365
3366	__show_mem(flags: filter, nodemask, max_zone_idx: gfp_zone(flags: gfp_mask));
3367	}
3368
3369	void warn_alloc(gfp_t gfp_mask, nodemask_t nodemask, const* char *fmt, ...)
3370	{
3371	struct va_format vaf;
3372	va_list args;
3373	static DEFINE_RATELIMIT_STATE(nopage_rs, `10`*HZ, `1`);
3374
3375	if ((gfp_mask & __GFP_NOWARN) \|\|
3376	!__ratelimit(&nopage_rs) \|\|
3377	((gfp_mask & __GFP_DMA) && !has_managed_dma()))
3378	return;
3379
3380	va_start(args, fmt);
3381	vaf.fmt = fmt;
3382	vaf.va = &args;
3383	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3384	current->comm, &vaf, gfp_mask, &gfp_mask,
3385	nodemask_pr_args(nodemask));
3386	va_end(args);
3387
3388	cpuset_print_current_mems_allowed();
3389	pr_cont("\n");
3390	dump_stack();
3391	warn_alloc_show_mem(gfp_mask, nodemask);
3392	}
3393
3394	static inline struct page *
3395	__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3396	unsigned int alloc_flags,
3397	const struct alloc_context *ac)
3398	{
3399	struct page *page;
3400
3401	page = get_page_from_freelist(gfp_mask, order,
3402	alloc_flags: alloc_flags\|ALLOC_CPUSET, ac);
3403	/*
3404	* fallback to ignore cpuset restriction if our nodes
3405	* are depleted
3406	*/
3407	if (!page)
3408	page = get_page_from_freelist(gfp_mask, order,
3409	alloc_flags, ac);
3410
3411	return page;
3412	}
3413
3414	static inline struct page *
3415	__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3416	const struct alloc_context ac, unsigned* long *did_some_progress)
3417	{
3418	struct oom_control oc = {
3419	.zonelist = ac->zonelist,
3420	.nodemask = ac->nodemask,
3421	.memcg = NULL,
3422	.gfp_mask = gfp_mask,
3423	.order = order,
3424	};
3425	struct page *page;
3426
3427	*did_some_progress = `0`;
3428
3429	/*
3430	* Acquire the oom lock. If that fails, somebody else is
3431	* making progress for us.
3432	*/
3433	if (!mutex_trylock(lock: &oom_lock)) {
3434	*did_some_progress = `1`;
3435	schedule_timeout_uninterruptible(timeout: `1`);
3436	return NULL;
3437	}
3438
3439	/*
3440	* Go through the zonelist yet one more time, keep very high watermark
3441	* here, this is only to catch a parallel oom killing, we must fail if
3442	* we're still under heavy pressure. But make sure that this reclaim
3443	* attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3444	* allocation which will never fail due to oom_lock already held.
3445	*/
3446	page = get_page_from_freelist(gfp_mask: (gfp_mask \| __GFP_HARDWALL) &
3447	~__GFP_DIRECT_RECLAIM, order,
3448	ALLOC_WMARK_HIGH\|ALLOC_CPUSET, ac);
3449	if (page)
3450	goto out;
3451
3452	/ Coredumps can quickly deplete all memory reserves /
3453	if (current->flags & PF_DUMPCORE)
3454	goto out;
3455	/ The OOM killer will not help higher order allocs /
3456	if (order > PAGE_ALLOC_COSTLY_ORDER)
3457	goto out;
3458	/*
3459	* We have already exhausted all our reclaim opportunities without any
3460	* success so it is time to admit defeat. We will skip the OOM killer
3461	* because it is very likely that the caller has a more reasonable
3462	* fallback than shooting a random task.
3463	*
3464	* The OOM killer may not free memory on a specific node.
3465	*/
3466	if (gfp_mask & (__GFP_RETRY_MAYFAIL \| __GFP_THISNODE))
3467	goto out;
3468	/ The OOM killer does not needlessly kill tasks for lowmem /
3469	if (ac->highest_zoneidx < ZONE_NORMAL)
3470	goto out;
3471	if (pm_suspended_storage())
3472	goto out;
3473	/*
3474	* XXX: GFP_NOFS allocations should rather fail than rely on
3475	* other request to make a forward progress.
3476	* We are in an unfortunate situation where out_of_memory cannot
3477	* do much for this context but let's try it to at least get
3478	* access to memory reserved if the current task is killed (see
3479	* out_of_memory). Once filesystems are ready to handle allocation
3480	* failures more gracefully we should just bail out here.
3481	*/
3482
3483	/ Exhausted what can be done so it's blame time /
3484	if (out_of_memory(oc: &oc) \|\|
3485	WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
3486	*did_some_progress = `1`;
3487
3488	/*
3489	* Help non-failing allocations by giving them access to memory
3490	* reserves
3491	*/
3492	if (gfp_mask & __GFP_NOFAIL)
3493	page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3494	ALLOC_NO_WATERMARKS, ac);
3495	}
3496	out:
3497	mutex_unlock(lock: &oom_lock);
3498	return page;
3499	}
3500
3501	/*
3502	* Maximum number of compaction retries with a progress before OOM
3503	* killer is consider as the only way to move forward.
3504	*/
3505	#define MAX_COMPACT_RETRIES 16
3506
3507	#ifdef CONFIG_COMPACTION
3508	/ Try memory compaction for high-order allocations before reclaim /
3509	static struct page *
3510	__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3511	unsigned int alloc_flags, const struct alloc_context *ac,
3512	enum compact_priority prio, enum compact_result *compact_result)
3513	{
3514	struct page *page = NULL;
3515	unsigned long pflags;
3516	unsigned int noreclaim_flag;
3517
3518	if (!order)
3519	return NULL;
3520
3521	psi_memstall_enter(flags: &pflags);
3522	delayacct_compact_start();
3523	noreclaim_flag = memalloc_noreclaim_save();
3524
3525	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3526	prio, page: &page);
3527
3528	memalloc_noreclaim_restore(flags: noreclaim_flag);
3529	psi_memstall_leave(flags: &pflags);
3530	delayacct_compact_end();
3531
3532	if (*compact_result == COMPACT_SKIPPED)
3533	return NULL;
3534	/*
3535	* At least in one zone compaction wasn't deferred or skipped, so let's
3536	* count a compaction stall
3537	*/
3538	count_vm_event(item: COMPACTSTALL);
3539
3540	/ Prep a captured page if available /
3541	if (page)
3542	prep_new_page(page, order, gfp_flags: gfp_mask, alloc_flags);
3543
3544	/ Try get a page from the freelist if available /
3545	if (!page)
3546	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3547
3548	if (page) {
3549	struct zone *zone = page_zone(page);
3550
3551	zone->compact_blockskip_flush = false;
3552	compaction_defer_reset(zone, order, alloc_success: true);
3553	count_vm_event(item: COMPACTSUCCESS);
3554	return page;
3555	}
3556
3557	/*
3558	* It's bad if compaction run occurs and fails. The most likely reason
3559	* is that pages exist, but not enough to satisfy watermarks.
3560	*/
3561	count_vm_event(item: COMPACTFAIL);
3562
3563	cond_resched();
3564
3565	return NULL;
3566	}
3567
3568	static inline bool
3569	should_compact_retry(struct alloc_context ac, int* order, int alloc_flags,
3570	enum compact_result compact_result,
3571	enum compact_priority *compact_priority,
3572	int *compaction_retries)
3573	{
3574	int max_retries = MAX_COMPACT_RETRIES;
3575	int min_priority;
3576	bool ret = false;
3577	int retries = *compaction_retries;
3578	enum compact_priority priority = *compact_priority;
3579
3580	if (!order)
3581	return false;
3582
3583	if (fatal_signal_pending(current))
3584	return false;
3585
3586	/*
3587	* Compaction was skipped due to a lack of free order-0
3588	* migration targets. Continue if reclaim can help.
3589	*/
3590	if (compact_result == COMPACT_SKIPPED) {
3591	ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3592	goto out;
3593	}
3594
3595	/*
3596	* Compaction managed to coalesce some page blocks, but the
3597	* allocation failed presumably due to a race. Retry some.
3598	*/
3599	if (compact_result == COMPACT_SUCCESS) {
3600	/*
3601	* !costly requests are much more important than
3602	* __GFP_RETRY_MAYFAIL costly ones because they are de
3603	* facto nofail and invoke OOM killer to move on while
3604	* costly can fail and users are ready to cope with
3605	* that. 1/4 retries is rather arbitrary but we would
3606	* need much more detailed feedback from compaction to
3607	* make a better decision.
3608	*/
3609	if (order > PAGE_ALLOC_COSTLY_ORDER)
3610	max_retries /= `4`;
3611
3612	if (++(*compaction_retries) <= max_retries) {
3613	ret = true;
3614	goto out;
3615	}
3616	}
3617
3618	/*
3619	* Compaction failed. Retry with increasing priority.
3620	*/
3621	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3622	MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3623
3624	if (*compact_priority > min_priority) {
3625	(*compact_priority)--;
3626	*compaction_retries = `0`;
3627	ret = true;
3628	}
3629	out:
3630	trace_compact_retry(order, priority, result: compact_result, retries, max_retries, ret);
3631	return ret;
3632	}
3633	#else
3634	static inline struct page *
3635	__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3636	unsigned int alloc_flags, const struct alloc_context *ac,
3637	enum compact_priority prio, enum compact_result *compact_result)
3638	{
3639	*compact_result = COMPACT_SKIPPED;
3640	return NULL;
3641	}
3642
3643	static inline bool
3644	should_compact_retry(struct alloc_context ac, unsigned* int order, int alloc_flags,
3645	enum compact_result compact_result,
3646	enum compact_priority *compact_priority,
3647	int *compaction_retries)
3648	{
3649	struct zone *zone;
3650	struct zoneref *z;
3651
3652	if (!order \|\| order > PAGE_ALLOC_COSTLY_ORDER)
3653	return false;
3654
3655	/*
3656	* There are setups with compaction disabled which would prefer to loop
3657	* inside the allocator rather than hit the oom killer prematurely.
3658	* Let's give them a good hope and keep retrying while the order-0
3659	* watermarks are OK.
3660	*/
3661	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3662	ac->highest_zoneidx, ac->nodemask) {
3663	if (zone_watermark_ok(zone, `0`, min_wmark_pages(zone),
3664	ac->highest_zoneidx, alloc_flags))
3665	return true;
3666	}
3667	return false;
3668	}
3669	#endif /* CONFIG_COMPACTION */
3670
3671	#ifdef CONFIG_LOCKDEP
3672	static struct lockdep_map __fs_reclaim_map =
3673	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3674
3675	static bool __need_reclaim(gfp_t gfp_mask)
3676	{
3677	/ no reclaim without waiting on it /
3678	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3679	return false;
3680
3681	/ this guy won't enter reclaim /
3682	if (current->flags & PF_MEMALLOC)
3683	return false;
3684
3685	if (gfp_mask & __GFP_NOLOCKDEP)
3686	return false;
3687
3688	return true;
3689	}
3690
3691	void __fs_reclaim_acquire(unsigned long ip)
3692	{
3693	lock_acquire_exclusive(&__fs_reclaim_map, `0`, `0`, NULL, ip);
3694	}
3695
3696	void __fs_reclaim_release(unsigned long ip)
3697	{
3698	lock_release(lock: &__fs_reclaim_map, ip);
3699	}
3700
3701	void fs_reclaim_acquire(gfp_t gfp_mask)
3702	{
3703	gfp_mask = current_gfp_context(flags: gfp_mask);
3704
3705	if (__need_reclaim(gfp_mask)) {
3706	if (gfp_mask & __GFP_FS)
3707	__fs_reclaim_acquire(_RET_IP_);
3708
3709	#ifdef CONFIG_MMU_NOTIFIER
3710	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
3711	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
3712	#endif
3713
3714	}
3715	}
3716	EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3717
3718	void fs_reclaim_release(gfp_t gfp_mask)
3719	{
3720	gfp_mask = current_gfp_context(flags: gfp_mask);
3721
3722	if (__need_reclaim(gfp_mask)) {
3723	if (gfp_mask & __GFP_FS)
3724	__fs_reclaim_release(_RET_IP_);
3725	}
3726	}
3727	EXPORT_SYMBOL_GPL(fs_reclaim_release);
3728	#endif
3729
3730	/*
3731	* Zonelists may change due to hotplug during allocation. Detect when zonelists
3732	* have been rebuilt so allocation retries. Reader side does not lock and
3733	* retries the allocation if zonelist changes. Writer side is protected by the
3734	* embedded spin_lock.
3735	*/
3736	static DEFINE_SEQLOCK(zonelist_update_seq);
3737
3738	static unsigned int zonelist_iter_begin(void)
3739	{
3740	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3741	return read_seqbegin(sl: &zonelist_update_seq);
3742
3743	return `0`;
3744	}
3745
3746	static unsigned int check_retry_zonelist(unsigned int seq)
3747	{
3748	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3749	return read_seqretry(sl: &zonelist_update_seq, start: seq);
3750
3751	return seq;
3752	}
3753
3754	/ Perform direct synchronous page reclaim /
3755	static unsigned long
3756	__perform_reclaim(gfp_t gfp_mask, unsigned int order,
3757	const struct alloc_context *ac)
3758	{
3759	unsigned int noreclaim_flag;
3760	unsigned long progress;
3761
3762	cond_resched();
3763
3764	/ We now go into synchronous reclaim /
3765	cpuset_memory_pressure_bump();
3766	fs_reclaim_acquire(gfp_mask);
3767	noreclaim_flag = memalloc_noreclaim_save();
3768
3769	progress = try_to_free_pages(zonelist: ac->zonelist, order, gfp_mask,
3770	mask: ac->nodemask);
3771
3772	memalloc_noreclaim_restore(flags: noreclaim_flag);
3773	fs_reclaim_release(gfp_mask);
3774
3775	cond_resched();
3776
3777	return progress;
3778	}
3779
3780	/ The really slow allocator path where we enter direct reclaim /
3781	static inline struct page *
3782	__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3783	unsigned int alloc_flags, const struct alloc_context *ac,
3784	unsigned long *did_some_progress)
3785	{
3786	struct page *page = NULL;
3787	unsigned long pflags;
3788	bool drained = false;
3789
3790	psi_memstall_enter(flags: &pflags);
3791	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3792	if (unlikely(!(*did_some_progress)))
3793	goto out;
3794
3795	retry:
3796	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3797
3798	/*
3799	* If an allocation failed after direct reclaim, it could be because
3800	* pages are pinned on the per-cpu lists or in high alloc reserves.
3801	* Shrink them and try again
3802	*/
3803	if (!page && !drained) {
3804	unreserve_highatomic_pageblock(ac, force: false);
3805	drain_all_pages(NULL);
3806	drained = true;
3807	goto retry;
3808	}
3809	out:
3810	psi_memstall_leave(flags: &pflags);
3811
3812	return page;
3813	}
3814
3815	static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
3816	const struct alloc_context *ac)
3817	{
3818	struct zoneref *z;
3819	struct zone *zone;
3820	pg_data_t *last_pgdat = NULL;
3821	enum zone_type highest_zoneidx = ac->highest_zoneidx;
3822
3823	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
3824	ac->nodemask) {
3825	if (!managed_zone(zone))
3826	continue;
3827	if (last_pgdat != zone->zone_pgdat) {
3828	wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
3829	last_pgdat = zone->zone_pgdat;
3830	}
3831	}
3832	}
3833
3834	static inline unsigned int
3835	gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
3836	{
3837	unsigned int alloc_flags = ALLOC_WMARK_MIN \| ALLOC_CPUSET;
3838
3839	/*
3840	* __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
3841	* and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3842	* to save two branches.
3843	*/
3844	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
3845	BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
3846
3847	/*
3848	* The caller may dip into page reserves a bit more if the caller
3849	* cannot run direct reclaim, or if the caller has realtime scheduling
3850	* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
3851	* set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
3852	*/
3853	alloc_flags \|= (__force int)
3854	(gfp_mask & (__GFP_HIGH \| __GFP_KSWAPD_RECLAIM));
3855
3856	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
3857	/*
3858	* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3859	* if it can't schedule.
3860	*/
3861	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
3862	alloc_flags \|= ALLOC_NON_BLOCK;
3863
3864	if (order > `0`)
3865	alloc_flags \|= ALLOC_HIGHATOMIC;
3866	}
3867
3868	/*
3869	* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
3870	* GFP_ATOMIC) rather than fail, see the comment for
3871	* cpuset_node_allowed().
3872	*/
3873	if (alloc_flags & ALLOC_MIN_RESERVE)
3874	alloc_flags &= ~ALLOC_CPUSET;
3875	} else if (unlikely(rt_task(current)) && in_task())
3876	alloc_flags \|= ALLOC_MIN_RESERVE;
3877
3878	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
3879
3880	return alloc_flags;
3881	}
3882
3883	static bool oom_reserves_allowed(struct task_struct *tsk)
3884	{
3885	if (!tsk_is_oom_victim(tsk))
3886	return false;
3887
3888	/*
3889	* !MMU doesn't have oom reaper so give access to memory reserves
3890	* only to the thread with TIF_MEMDIE set
3891	*/
3892	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3893	return false;
3894
3895	return true;
3896	}
3897
3898	/*
3899	* Distinguish requests which really need access to full memory
3900	* reserves from oom victims which can live with a portion of it
3901	*/
3902	static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3903	{
3904	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3905	return `0`;
3906	if (gfp_mask & __GFP_MEMALLOC)
3907	return ALLOC_NO_WATERMARKS;
3908	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3909	return ALLOC_NO_WATERMARKS;
3910	if (!in_interrupt()) {
3911	if (current->flags & PF_MEMALLOC)
3912	return ALLOC_NO_WATERMARKS;
3913	else if (oom_reserves_allowed(current))
3914	return ALLOC_OOM;
3915	}
3916
3917	return `0`;
3918	}
3919
3920	bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3921	{
3922	return !!__gfp_pfmemalloc_flags(gfp_mask);
3923	}
3924
3925	/*
3926	* Checks whether it makes sense to retry the reclaim to make a forward progress
3927	* for the given allocation request.
3928	*
3929	* We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3930	* without success, or when we couldn't even meet the watermark if we
3931	* reclaimed all remaining pages on the LRU lists.
3932	*
3933	* Returns true if a retry is viable or false to enter the oom path.
3934	*/
3935	static inline bool
3936	should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3937	struct alloc_context ac, int* alloc_flags,
3938	bool did_some_progress, int *no_progress_loops)
3939	{
3940	struct zone *zone;
3941	struct zoneref *z;
3942	bool ret = false;
3943
3944	/*
3945	* Costly allocations might have made a progress but this doesn't mean
3946	* their order will become available due to high fragmentation so
3947	* always increment the no progress counter for them
3948	*/
3949	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3950	*no_progress_loops = `0`;
3951	else
3952	(*no_progress_loops)++;
3953
3954	/*
3955	* Make sure we converge to OOM if we cannot make any progress
3956	* several times in the row.
3957	*/
3958	if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3959	/ Before OOM, exhaust highatomic_reserve /
3960	return unreserve_highatomic_pageblock(ac, force: true);
3961	}
3962
3963	/*
3964	* Keep reclaiming pages while there is a chance this will lead
3965	* somewhere. If none of the target zones can satisfy our allocation
3966	* request even if all reclaimable pages are considered then we are
3967	* screwed and have to go OOM.
3968	*/
3969	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3970	ac->highest_zoneidx, ac->nodemask) {
3971	unsigned long available;
3972	unsigned long reclaimable;
3973	unsigned long min_wmark = min_wmark_pages(zone);
3974	bool wmark;
3975
3976	available = reclaimable = zone_reclaimable_pages(zone);
3977	available += zone_page_state_snapshot(zone, item: NR_FREE_PAGES);
3978
3979	/*
3980	* Would the allocation succeed if we reclaimed all
3981	* reclaimable pages?
3982	*/
3983	wmark = __zone_watermark_ok(z: zone, order, mark: min_wmark,
3984	highest_zoneidx: ac->highest_zoneidx, alloc_flags, free_pages: available);
3985	trace_reclaim_retry_zone(zoneref: z, order, reclaimable,
3986	available, min_wmark, no_progress_loops: *no_progress_loops, wmark_check: wmark);
3987	if (wmark) {
3988	ret = true;
3989	break;
3990	}
3991	}
3992
3993	/*
3994	* Memory allocation/reclaim might be called from a WQ context and the
3995	* current implementation of the WQ concurrency control doesn't
3996	* recognize that a particular WQ is congested if the worker thread is
3997	* looping without ever sleeping. Therefore we have to do a short sleep
3998	* here rather than calling cond_resched().
3999	*/
4000	if (current->flags & PF_WQ_WORKER)
4001	schedule_timeout_uninterruptible(timeout: `1`);
4002	else
4003	cond_resched();
4004	return ret;
4005	}
4006
4007	static inline bool
4008	check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4009	{
4010	/*
4011	* It's possible that cpuset's mems_allowed and the nodemask from
4012	* mempolicy don't intersect. This should be normally dealt with by
4013	* policy_nodemask(), but it's possible to race with cpuset update in
4014	* such a way the check therein was true, and then it became false
4015	* before we got our cpuset_mems_cookie here.
4016	* This assumes that for all allocations, ac->nodemask can come only
4017	* from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4018	* when it does not intersect with the cpuset restrictions) or the
4019	* caller can deal with a violated nodemask.
4020	*/
4021	if (cpusets_enabled() && ac->nodemask &&
4022	!cpuset_nodemask_valid_mems_allowed(nodemask: ac->nodemask)) {
4023	ac->nodemask = NULL;
4024	return true;
4025	}
4026
4027	/*
4028	* When updating a task's mems_allowed or mempolicy nodemask, it is
4029	* possible to race with parallel threads in such a way that our
4030	* allocation can fail while the mask is being updated. If we are about
4031	* to fail, check if the cpuset changed during allocation and if so,
4032	* retry.
4033	*/
4034	if (read_mems_allowed_retry(seq: cpuset_mems_cookie))
4035	return true;
4036
4037	return false;
4038	}
4039
4040	static inline struct page *
4041	__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4042	struct alloc_context *ac)
4043	{
4044	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4045	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4046	struct page *page = NULL;
4047	unsigned int alloc_flags;
4048	unsigned long did_some_progress;
4049	enum compact_priority compact_priority;
4050	enum compact_result compact_result;
4051	int compaction_retries;
4052	int no_progress_loops;
4053	unsigned int cpuset_mems_cookie;
4054	unsigned int zonelist_iter_cookie;
4055	int reserve_flags;
4056
4057	restart:
4058	compaction_retries = `0`;
4059	no_progress_loops = `0`;
4060	compact_priority = DEF_COMPACT_PRIORITY;
4061	cpuset_mems_cookie = read_mems_allowed_begin();
4062	zonelist_iter_cookie = zonelist_iter_begin();
4063
4064	/*
4065	* The fast path uses conservative alloc_flags to succeed only until
4066	* kswapd needs to be woken up, and to avoid the cost of setting up
4067	* alloc_flags precisely. So we do that now.
4068	*/
4069	alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
4070
4071	/*
4072	* We need to recalculate the starting point for the zonelist iterator
4073	* because we might have used different nodemask in the fast path, or
4074	* there was a cpuset modification and we are retrying - otherwise we
4075	* could end up iterating over non-eligible zones endlessly.
4076	*/
4077	ac->preferred_zoneref = first_zones_zonelist(zonelist: ac->zonelist,
4078	highest_zoneidx: ac->highest_zoneidx, nodes: ac->nodemask);
4079	if (!ac->preferred_zoneref->zone)
4080	goto nopage;
4081
4082	/*
4083	* Check for insane configurations where the cpuset doesn't contain
4084	* any suitable zone to satisfy the request - e.g. non-movable
4085	* GFP_HIGHUSER allocations from MOVABLE nodes only.
4086	*/
4087	if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
4088	struct zoneref *z = first_zones_zonelist(zonelist: ac->zonelist,
4089	highest_zoneidx: ac->highest_zoneidx,
4090	nodes: &cpuset_current_mems_allowed);
4091	if (!z->zone)
4092	goto nopage;
4093	}
4094
4095	if (alloc_flags & ALLOC_KSWAPD)
4096	wake_all_kswapds(order, gfp_mask, ac);
4097
4098	/*
4099	* The adjusted alloc_flags might result in immediate success, so try
4100	* that first
4101	*/
4102	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4103	if (page)
4104	goto got_pg;
4105
4106	/*
4107	* For costly allocations, try direct compaction first, as it's likely
4108	* that we have enough base pages and don't need to reclaim. For non-
4109	* movable high-order allocations, do that as well, as compaction will
4110	* try prevent permanent fragmentation by migrating from blocks of the
4111	* same migratetype.
4112	* Don't try this for allocations that are allowed to ignore
4113	* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4114	*/
4115	if (can_direct_reclaim &&
4116	(costly_order \|\|
4117	(order > `0` && ac->migratetype != MIGRATE_MOVABLE))
4118	&& !gfp_pfmemalloc_allowed(gfp_mask)) {
4119	page = __alloc_pages_direct_compact(gfp_mask, order,
4120	alloc_flags, ac,
4121	prio: INIT_COMPACT_PRIORITY,
4122	compact_result: &compact_result);
4123	if (page)
4124	goto got_pg;
4125
4126	/*
4127	* Checks for costly allocations with __GFP_NORETRY, which
4128	* includes some THP page fault allocations
4129	*/
4130	if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4131	/*
4132	* If allocating entire pageblock(s) and compaction
4133	* failed because all zones are below low watermarks
4134	* or is prohibited because it recently failed at this
4135	* order, fail immediately unless the allocator has
4136	* requested compaction and reclaim retry.
4137	*
4138	* Reclaim is
4139	* - potentially very expensive because zones are far
4140	* below their low watermarks or this is part of very
4141	* bursty high order allocations,
4142	* - not guaranteed to help because isolate_freepages()
4143	* may not iterate over freed pages as part of its
4144	* linear scan, and
4145	* - unlikely to make entire pageblocks free on its
4146	* own.
4147	*/
4148	if (compact_result == COMPACT_SKIPPED \|\|
4149	compact_result == COMPACT_DEFERRED)
4150	goto nopage;
4151
4152	/*
4153	* Looks like reclaim/compaction is worth trying, but
4154	* sync compaction could be very expensive, so keep
4155	* using async compaction.
4156	*/
4157	compact_priority = INIT_COMPACT_PRIORITY;
4158	}
4159	}
4160
4161	retry:
4162	/ Ensure kswapd doesn't accidentally go to sleep as long as we loop /
4163	if (alloc_flags & ALLOC_KSWAPD)
4164	wake_all_kswapds(order, gfp_mask, ac);
4165
4166	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4167	if (reserve_flags)
4168	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags: reserve_flags) \|
4169	(alloc_flags & ALLOC_KSWAPD);
4170
4171	/*
4172	* Reset the nodemask and zonelist iterators if memory policies can be
4173	* ignored. These allocations are high priority and system rather than
4174	* user oriented.
4175	*/
4176	if (!(alloc_flags & ALLOC_CPUSET) \|\| reserve_flags) {
4177	ac->nodemask = NULL;
4178	ac->preferred_zoneref = first_zones_zonelist(zonelist: ac->zonelist,
4179	highest_zoneidx: ac->highest_zoneidx, nodes: ac->nodemask);
4180	}
4181
4182	/ Attempt with potentially adjusted zonelist and alloc_flags /
4183	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4184	if (page)
4185	goto got_pg;
4186
4187	/ Caller is not willing to reclaim, we can't balance anything /
4188	if (!can_direct_reclaim)
4189	goto nopage;
4190
4191	/ Avoid recursion of direct reclaim /
4192	if (current->flags & PF_MEMALLOC)
4193	goto nopage;
4194
4195	/ Try direct reclaim and then allocating /
4196	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4197	did_some_progress: &did_some_progress);
4198	if (page)
4199	goto got_pg;
4200
4201	/ Try direct compaction and then allocating /
4202	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4203	prio: compact_priority, compact_result: &compact_result);
4204	if (page)
4205	goto got_pg;
4206
4207	/ Do not loop if specifically requested /
4208	if (gfp_mask & __GFP_NORETRY)
4209	goto nopage;
4210
4211	/*
4212	* Do not retry costly high order allocations unless they are
4213	* __GFP_RETRY_MAYFAIL
4214	*/
4215	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4216	goto nopage;
4217
4218	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4219	did_some_progress: did_some_progress > `0`, no_progress_loops: &no_progress_loops))
4220	goto retry;
4221
4222	/*
4223	* It doesn't make any sense to retry for the compaction if the order-0
4224	* reclaim is not able to make any progress because the current
4225	* implementation of the compaction depends on the sufficient amount
4226	* of free memory (see __compaction_suitable)
4227	*/
4228	if (did_some_progress > `0` &&
4229	should_compact_retry(ac, order, alloc_flags,
4230	compact_result, compact_priority: &compact_priority,
4231	compaction_retries: &compaction_retries))
4232	goto retry;
4233
4234
4235	/*
4236	* Deal with possible cpuset update races or zonelist updates to avoid
4237	* a unnecessary OOM kill.
4238	*/
4239	if (check_retry_cpuset(cpuset_mems_cookie, ac) \|\|
4240	check_retry_zonelist(seq: zonelist_iter_cookie))
4241	goto restart;
4242
4243	/ Reclaim has failed us, start killing things /
4244	page = __alloc_pages_may_oom(gfp_mask, order, ac, did_some_progress: &did_some_progress);
4245	if (page)
4246	goto got_pg;
4247
4248	/ Avoid allocations with no watermarks from looping endlessly /
4249	if (tsk_is_oom_victim(current) &&
4250	(alloc_flags & ALLOC_OOM \|\|
4251	(gfp_mask & __GFP_NOMEMALLOC)))
4252	goto nopage;
4253
4254	/ Retry as long as the OOM killer is making progress /
4255	if (did_some_progress) {
4256	no_progress_loops = `0`;
4257	goto retry;
4258	}
4259
4260	nopage:
4261	/*
4262	* Deal with possible cpuset update races or zonelist updates to avoid
4263	* a unnecessary OOM kill.
4264	*/
4265	if (check_retry_cpuset(cpuset_mems_cookie, ac) \|\|
4266	check_retry_zonelist(seq: zonelist_iter_cookie))
4267	goto restart;
4268
4269	/*
4270	* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4271	* we always retry
4272	*/
4273	if (gfp_mask & __GFP_NOFAIL) {
4274	/*
4275	* All existing users of the __GFP_NOFAIL are blockable, so warn
4276	* of any new users that actually require GFP_NOWAIT
4277	*/
4278	if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
4279	goto fail;
4280
4281	/*
4282	* PF_MEMALLOC request from this context is rather bizarre
4283	* because we cannot reclaim anything and only can loop waiting
4284	* for somebody to do a work for us
4285	*/
4286	WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
4287
4288	/*
4289	* non failing costly orders are a hard requirement which we
4290	* are not prepared for much so let's warn about these users
4291	* so that we can identify them and convert them to something
4292	* else.
4293	*/
4294	WARN_ON_ONCE_GFP(costly_order, gfp_mask);
4295
4296	/*
4297	* Help non-failing allocations by giving some access to memory
4298	* reserves normally used for high priority non-blocking
4299	* allocations but do not use ALLOC_NO_WATERMARKS because this
4300	* could deplete whole memory reserves which would just make
4301	* the situation worse.
4302	*/
4303	page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
4304	if (page)
4305	goto got_pg;
4306
4307	cond_resched();
4308	goto retry;
4309	}
4310	fail:
4311	warn_alloc(gfp_mask, nodemask: ac->nodemask,
4312	fmt: "page allocation failure: order:%u", order);
4313	got_pg:
4314	return page;
4315	}
4316
4317	static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4318	int preferred_nid, nodemask_t *nodemask,
4319	struct alloc_context ac, gfp_t alloc_gfp,
4320	unsigned int *alloc_flags)
4321	{
4322	ac->highest_zoneidx = gfp_zone(flags: gfp_mask);
4323	ac->zonelist = node_zonelist(nid: preferred_nid, flags: gfp_mask);
4324	ac->nodemask = nodemask;
4325	ac->migratetype = gfp_migratetype(gfp_flags: gfp_mask);
4326
4327	if (cpusets_enabled()) {
4328	*alloc_gfp \|= __GFP_HARDWALL;
4329	/*
4330	* When we are in the interrupt context, it is irrelevant
4331	* to the current task context. It means that any node ok.
4332	*/
4333	if (in_task() && !ac->nodemask)
4334	ac->nodemask = &cpuset_current_mems_allowed;
4335	else
4336	*alloc_flags \|= ALLOC_CPUSET;
4337	}
4338
4339	might_alloc(gfp_mask);
4340
4341	if (should_fail_alloc_page(gfp_mask, order))
4342	return false;
4343
4344	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags: alloc_flags);
4345
4346	/ Dirty zone balancing only done in the fast path /
4347	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4348
4349	/*
4350	* The preferred zone is used for statistics but crucially it is
4351	* also used as the starting point for the zonelist iterator. It
4352	* may get reset for allocations that ignore memory policies.
4353	*/
4354	ac->preferred_zoneref = first_zones_zonelist(zonelist: ac->zonelist,
4355	highest_zoneidx: ac->highest_zoneidx, nodes: ac->nodemask);
4356
4357	return true;
4358	}
4359
4360	/*
4361	* __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
4362	* @gfp: GFP flags for the allocation
4363	* @preferred_nid: The preferred NUMA node ID to allocate from
4364	* @nodemask: Set of nodes to allocate from, may be NULL
4365	* @nr_pages: The number of pages desired on the list or array
4366	* @page_list: Optional list to store the allocated pages
4367	* @page_array: Optional array to store the pages
4368	*
4369	* This is a batched version of the page allocator that attempts to
4370	* allocate nr_pages quickly. Pages are added to page_list if page_list
4371	* is not NULL, otherwise it is assumed that the page_array is valid.
4372	*
4373	* For lists, nr_pages is the number of pages that should be allocated.
4374	*
4375	* For arrays, only NULL elements are populated with pages and nr_pages
4376	* is the maximum number of pages that will be stored in the array.
4377	*
4378	* Returns the number of pages on the list or array.
4379	*/
4380	unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
4381	nodemask_t nodemask, int* nr_pages,
4382	struct list_head *page_list,
4383	struct page **page_array)
4384	{
4385	struct page *page;
4386	unsigned long __maybe_unused UP_flags;
4387	struct zone *zone;
4388	struct zoneref *z;
4389	struct per_cpu_pages *pcp;
4390	struct list_head *pcp_list;
4391	struct alloc_context ac;
4392	gfp_t alloc_gfp;
4393	unsigned int alloc_flags = ALLOC_WMARK_LOW;
4394	int nr_populated = `0`, nr_account = `0`;
4395
4396	/*
4397	* Skip populated array elements to determine if any pages need
4398	* to be allocated before disabling IRQs.
4399	*/
4400	while (page_array && nr_populated < nr_pages && page_array[nr_populated])
4401	nr_populated++;
4402
4403	/ No pages requested? /
4404	if (unlikely(nr_pages <= `0`))
4405	goto out;
4406
4407	/ Already populated array? /
4408	if (unlikely(page_array && nr_pages - nr_populated == `0`))
4409	goto out;
4410
4411	/ Bulk allocator does not support memcg accounting. /
4412	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
4413	goto failed;
4414
4415	/ Use the single page allocator for one page. /
4416	if (nr_pages - nr_populated == `1`)
4417	goto failed;
4418
4419	#ifdef CONFIG_PAGE_OWNER
4420	/*
4421	* PAGE_OWNER may recurse into the allocator to allocate space to
4422	* save the stack with pagesets.lock held. Releasing/reacquiring
4423	* removes much of the performance benefit of bulk allocation so
4424	* force the caller to allocate one page at a time as it'll have
4425	* similar performance to added complexity to the bulk allocator.
4426	*/
4427	if (static_branch_unlikely(&page_owner_inited))
4428	goto failed;
4429	#endif
4430
4431	/ May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. /
4432	gfp &= gfp_allowed_mask;
4433	alloc_gfp = gfp;
4434	if (!prepare_alloc_pages(gfp_mask: gfp, order: `0`, preferred_nid, nodemask, ac: &ac, alloc_gfp: &alloc_gfp, alloc_flags: &alloc_flags))
4435	goto out;
4436	gfp = alloc_gfp;
4437
4438	/ Find an allowed local zone that meets the low watermark. /
4439	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
4440	unsigned long mark;
4441
4442	if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
4443	!__cpuset_zone_allowed(z: zone, gfp_mask: gfp)) {
4444	continue;
4445	}
4446
4447	if (nr_online_nodes > `1` && zone != ac.preferred_zoneref->zone &&
4448	zone_to_nid(zone) != zone_to_nid(zone: ac.preferred_zoneref->zone)) {
4449	goto failed;
4450	}
4451
4452	mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
4453	if (zone_watermark_fast(z: zone, order: `0`, mark,
4454	highest_zoneidx: zonelist_zone_idx(zoneref: ac.preferred_zoneref),
4455	alloc_flags, gfp_mask: gfp)) {
4456	break;
4457	}
4458	}
4459
4460	/*
4461	* If there are no allowed local zones that meets the watermarks then
4462	* try to allocate a single page and reclaim if necessary.
4463	*/
4464	if (unlikely(!zone))
4465	goto failed;
4466
4467	/ spin_trylock may fail due to a parallel drain or IRQ reentrancy. /
4468	pcp_trylock_prepare(UP_flags);
4469	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
4470	if (!pcp)
4471	goto failed_irq;
4472
4473	/ Attempt the batch allocation /
4474	pcp_list = &pcp->lists[order_to_pindex(migratetype: ac.migratetype, order: `0`)];
4475	while (nr_populated < nr_pages) {
4476
4477	/ Skip existing pages /
4478	if (page_array && page_array[nr_populated]) {
4479	nr_populated++;
4480	continue;
4481	}
4482
4483	page = __rmqueue_pcplist(zone, order: `0`, migratetype: ac.migratetype, alloc_flags,
4484	pcp, list: pcp_list);
4485	if (unlikely(!page)) {
4486	/ Try and allocate at least one page /
4487	if (!nr_account) {
4488	pcp_spin_unlock(pcp);
4489	goto failed_irq;
4490	}
4491	break;
4492	}
4493	nr_account++;
4494
4495	prep_new_page(page, order: `0`, gfp_flags: gfp, alloc_flags: `0`);
4496	if (page_list)
4497	list_add(new: &page->lru, head: page_list);
4498	else
4499	page_array[nr_populated] = page;
4500	nr_populated++;
4501	}
4502
4503	pcp_spin_unlock(pcp);
4504	pcp_trylock_finish(UP_flags);
4505
4506	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
4507	zone_statistics(preferred_zone: ac.preferred_zoneref->zone, z: zone, nr_account);
4508
4509	out:
4510	return nr_populated;
4511
4512	failed_irq:
4513	pcp_trylock_finish(UP_flags);
4514
4515	failed:
4516	page = __alloc_pages(gfp, order: `0`, preferred_nid, nodemask);
4517	if (page) {
4518	if (page_list)
4519	list_add(new: &page->lru, head: page_list);
4520	else
4521	page_array[nr_populated] = page;
4522	nr_populated++;
4523	}
4524
4525	goto out;
4526	}
4527	EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
4528
4529	/*
4530	* This is the 'heart' of the zoned buddy allocator.
4531	*/
4532	struct page __alloc_pages(gfp_t gfp, unsigned* int order, int preferred_nid,
4533	nodemask_t *nodemask)
4534	{
4535	struct page *page;
4536	unsigned int alloc_flags = ALLOC_WMARK_LOW;
4537	gfp_t alloc_gfp; / The gfp_t that was actually used for allocation /
4538	struct alloc_context ac = { };
4539
4540	/*
4541	* There are several places where we assume that the order value is sane
4542	* so bail out early if the request is out of bound.
4543	*/
4544	if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
4545	return NULL;
4546
4547	gfp &= gfp_allowed_mask;
4548	/*
4549	* Apply scoped allocation constraints. This is mainly about GFP_NOFS
4550	* resp. GFP_NOIO which has to be inherited for all allocation requests
4551	* from a particular context which has been marked by
4552	* memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
4553	* movable zones are not used during allocation.
4554	*/
4555	gfp = current_gfp_context(flags: gfp);
4556	alloc_gfp = gfp;
4557	if (!prepare_alloc_pages(gfp_mask: gfp, order, preferred_nid, nodemask, ac: &ac,
4558	alloc_gfp: &alloc_gfp, alloc_flags: &alloc_flags))
4559	return NULL;
4560
4561	/*
4562	* Forbid the first pass from falling back to types that fragment
4563	* memory until all local zones are considered.
4564	*/
4565	alloc_flags \|= alloc_flags_nofragment(zone: ac.preferred_zoneref->zone, gfp_mask: gfp);
4566
4567	/ First allocation attempt /
4568	page = get_page_from_freelist(gfp_mask: alloc_gfp, order, alloc_flags, ac: &ac);
4569	if (likely(page))
4570	goto out;
4571
4572	alloc_gfp = gfp;
4573	ac.spread_dirty_pages = false;
4574
4575	/*
4576	* Restore the original nodemask if it was potentially replaced with
4577	* &cpuset_current_mems_allowed to optimize the fast-path attempt.
4578	*/
4579	ac.nodemask = nodemask;
4580
4581	page = __alloc_pages_slowpath(gfp_mask: alloc_gfp, order, ac: &ac);
4582
4583	out:
4584	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
4585	unlikely(__memcg_kmem_charge_page(page, gfp, order) != `0`)) {
4586	__free_pages(page, order);
4587	page = NULL;
4588	}
4589
4590	trace_mm_page_alloc(page, order, gfp_flags: alloc_gfp, migratetype: ac.migratetype);
4591	kmsan_alloc_page(page, order, flags: alloc_gfp);
4592
4593	return page;
4594	}
4595	EXPORT_SYMBOL(__alloc_pages);
4596
4597	struct folio __folio_alloc(gfp_t gfp, unsigned* int order, int preferred_nid,
4598	nodemask_t *nodemask)
4599	{
4600	struct page *page = __alloc_pages(gfp \| __GFP_COMP, order,
4601	preferred_nid, nodemask);
4602	return page_rmappable_folio(page);
4603	}
4604	EXPORT_SYMBOL(__folio_alloc);
4605
4606	/*
4607	* Common helper functions. Never use with __GFP_HIGHMEM because the returned
4608	* address cannot represent highmem pages. Use alloc_pages and then kmap if
4609	* you need to access high mem.
4610	*/
4611	unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4612	{
4613	struct page *page;
4614
4615	page = alloc_pages(gfp: gfp_mask & ~__GFP_HIGHMEM, order);
4616	if (!page)
4617	return `0`;
4618	return (unsigned long) page_address(page);
4619	}
4620	EXPORT_SYMBOL(__get_free_pages);
4621
4622	unsigned long get_zeroed_page(gfp_t gfp_mask)
4623	{
4624	return __get_free_page(gfp_mask \| __GFP_ZERO);
4625	}
4626	EXPORT_SYMBOL(get_zeroed_page);
4627
4628	/**
4629	* __free_pages - Free pages allocated with alloc_pages().
4630	* @page: The page pointer returned from alloc_pages().
4631	* @order: The order of the allocation.
4632	*
4633	* This function can free multi-page allocations that are not compound
4634	* pages. It does not check that the @order passed in matches that of
4635	* the allocation, so it is easy to leak memory. Freeing more memory
4636	* than was allocated will probably emit a warning.
4637	*
4638	* If the last reference to this page is speculative, it will be released
4639	* by put_page() which only frees the first page of a non-compound
4640	* allocation. To prevent the remaining pages from being leaked, we free
4641	* the subsequent pages here. If you want to use the page's reference
4642	* count to decide when to free the allocation, you should allocate a
4643	* compound page, and use put_page() instead of __free_pages().
4644	*
4645	* Context: May be called in interrupt context or while holding a normal
4646	* spinlock, but not in NMI context or while holding a raw spinlock.
4647	*/
4648	void __free_pages(struct page page, unsigned* int order)
4649	{
4650	/ get PageHead before we drop reference /
4651	int head = PageHead(page);
4652
4653	if (put_page_testzero(page))
4654	free_the_page(page, order);
4655	else if (!head)
4656	while (order-- > `0`)
4657	free_the_page(page: page + (`1` << order), order);
4658	}
4659	EXPORT_SYMBOL(__free_pages);
4660
4661	void free_pages(unsigned long addr, unsigned int order)
4662	{
4663	if (addr != `0`) {
4664	VM_BUG_ON(!virt_addr_valid((void *)addr));
4665	__free_pages(virt_to_page((void *)addr), order);
4666	}
4667	}
4668
4669	EXPORT_SYMBOL(free_pages);
4670
4671	/*
4672	* Page Fragment:
4673	* An arbitrary-length arbitrary-offset area of memory which resides
4674	* within a 0 or higher order page. Multiple fragments within that page
4675	* are individually refcounted, in the page's reference counter.
4676	*
4677	* The page_frag functions below provide a simple allocation framework for
4678	* page fragments. This is used by the network stack and network device
4679	* drivers to provide a backing region of memory for use as either an
4680	* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4681	*/
4682	static struct page __page_frag_cache_refill(struct* page_frag_cache *nc,
4683	gfp_t gfp_mask)
4684	{
4685	struct page *page = NULL;
4686	gfp_t gfp = gfp_mask;
4687
4688	#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4689	gfp_mask \|= __GFP_COMP \| __GFP_NOWARN \| __GFP_NORETRY \|
4690	__GFP_NOMEMALLOC;
4691	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4692	PAGE_FRAG_CACHE_MAX_ORDER);
4693	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4694	#endif
4695	if (unlikely(!page))
4696	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask: gfp, order: `0`);
4697
4698	nc->va = page ? page_address(page) : NULL;
4699
4700	return page;
4701	}
4702
4703	void __page_frag_cache_drain(struct page page, unsigned* int count)
4704	{
4705	VM_BUG_ON_PAGE(page_ref_count(page) == `0`, page);
4706
4707	if (page_ref_sub_and_test(page, nr: count))
4708	free_the_page(page, order: compound_order(page));
4709	}
4710	EXPORT_SYMBOL(__page_frag_cache_drain);
4711
4712	void page_frag_alloc_align(struct* page_frag_cache *nc,
4713	unsigned int fragsz, gfp_t gfp_mask,
4714	unsigned int align_mask)
4715	{
4716	unsigned int size = PAGE_SIZE;
4717	struct page *page;
4718	int offset;
4719
4720	if (unlikely(!nc->va)) {
4721	refill:
4722	page = __page_frag_cache_refill(nc, gfp_mask);
4723	if (!page)
4724	return NULL;
4725
4726	#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4727	/ if size can vary use size else just use PAGE_SIZE /
4728	size = nc->size;
4729	#endif
4730	/ Even if we own the page, we do not use atomic_set().*
4731	* This would break get_page_unless_zero() users.
4732	*/
4733	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
4734
4735	/ reset page count bias and offset to start of new frag /
4736	nc->pfmemalloc = page_is_pfmemalloc(page);
4737	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + `1`;
4738	nc->offset = size;
4739	}
4740
4741	offset = nc->offset - fragsz;
4742	if (unlikely(offset < `0`)) {
4743	page = virt_to_page(nc->va);
4744
4745	if (!page_ref_sub_and_test(page, nr: nc->pagecnt_bias))
4746	goto refill;
4747
4748	if (unlikely(nc->pfmemalloc)) {
4749	free_the_page(page, order: compound_order(page));
4750	goto refill;
4751	}
4752
4753	#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4754	/ if size can vary use size else just use PAGE_SIZE /
4755	size = nc->size;
4756	#endif
4757	/ OK, page count is 0, we can safely set it /
4758	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + `1`);
4759
4760	/ reset page count bias and offset to start of new frag /
4761	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + `1`;
4762	offset = size - fragsz;
4763	if (unlikely(offset < `0`)) {
4764	/*
4765	* The caller is trying to allocate a fragment
4766	* with fragsz > PAGE_SIZE but the cache isn't big
4767	* enough to satisfy the request, this may
4768	* happen in low memory conditions.
4769	* We don't release the cache page because
4770	* it could make memory pressure worse
4771	* so we simply return NULL here.
4772	*/
4773	return NULL;
4774	}
4775	}
4776
4777	nc->pagecnt_bias--;
4778	offset &= align_mask;
4779	nc->offset = offset;
4780
4781	return nc->va + offset;
4782	}
4783	EXPORT_SYMBOL(page_frag_alloc_align);
4784
4785	/*
4786	* Frees a page fragment allocated out of either a compound or order 0 page.
4787	*/
4788	void page_frag_free(void *addr)
4789	{
4790	struct page *page = virt_to_head_page(x: addr);
4791
4792	if (unlikely(put_page_testzero(page)))
4793	free_the_page(page, order: compound_order(page));
4794	}
4795	EXPORT_SYMBOL(page_frag_free);
4796
4797	static void make_alloc_exact(unsigned* long addr, unsigned int order,
4798	size_t size)
4799	{
4800	if (addr) {
4801	unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
4802	struct page page = virt_to_page((void* *)addr);
4803	struct page *last = page + nr;
4804
4805	split_page_owner(page, nr: `1` << order);
4806	split_page_memcg(head: page, nr: `1` << order);
4807	while (page < --last)
4808	set_page_refcounted(last);
4809
4810	last = page + (`1UL` << order);
4811	for (page += nr; page < last; page++)
4812	__free_pages_ok(page, order: `0`, FPI_TO_TAIL);
4813	}
4814	return (void *)addr;
4815	}
4816
4817	/**
4818	* alloc_pages_exact - allocate an exact number physically-contiguous pages.
4819	* @size: the number of bytes to allocate
4820	* @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4821	*
4822	* This function is similar to alloc_pages(), except that it allocates the
4823	* minimum number of pages to satisfy the request. alloc_pages() can only
4824	* allocate memory in power-of-two pages.
4825	*
4826	* This function is also limited by MAX_ORDER.
4827	*
4828	* Memory allocated by this function must be released by free_pages_exact().
4829	*
4830	* Return: pointer to the allocated area or %NULL in case of error.
4831	*/
4832	void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4833	{
4834	unsigned int order = get_order(size);
4835	unsigned long addr;
4836
4837	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP \| __GFP_HIGHMEM)))
4838	gfp_mask &= ~(__GFP_COMP \| __GFP_HIGHMEM);
4839
4840	addr = __get_free_pages(gfp_mask, order);
4841	return make_alloc_exact(addr, order, size);
4842	}
4843	EXPORT_SYMBOL(alloc_pages_exact);
4844
4845	/**
4846	* alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4847	* pages on a node.
4848	* @nid: the preferred node ID where memory should be allocated
4849	* @size: the number of bytes to allocate
4850	* @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4851	*
4852	* Like alloc_pages_exact(), but try to allocate on node nid first before falling
4853	* back.
4854	*
4855	* Return: pointer to the allocated area or %NULL in case of error.
4856	*/
4857	void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4858	{
4859	unsigned int order = get_order(size);
4860	struct page *p;
4861
4862	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP \| __GFP_HIGHMEM)))
4863	gfp_mask &= ~(__GFP_COMP \| __GFP_HIGHMEM);
4864
4865	p = alloc_pages_node(nid, gfp_mask, order);
4866	if (!p)
4867	return NULL;
4868	return make_alloc_exact(addr: (unsigned long)page_address(p), order, size);
4869	}
4870
4871	/**
4872	* free_pages_exact - release memory allocated via alloc_pages_exact()
4873	* @virt: the value returned by alloc_pages_exact.
4874	* @size: size of allocation, same value as passed to alloc_pages_exact().
4875	*
4876	* Release the memory allocated by a previous call to alloc_pages_exact.
4877	*/
4878	void free_pages_exact(void *virt, size_t size)
4879	{
4880	unsigned long addr = (unsigned long)virt;
4881	unsigned long end = addr + PAGE_ALIGN(size);
4882
4883	while (addr < end) {
4884	free_page(addr);
4885	addr += PAGE_SIZE;
4886	}
4887	}
4888	EXPORT_SYMBOL(free_pages_exact);
4889
4890	/**
4891	* nr_free_zone_pages - count number of pages beyond high watermark
4892	* @offset: The zone index of the highest zone
4893	*
4894	* nr_free_zone_pages() counts the number of pages which are beyond the
4895	* high watermark within all zones at or below a given zone index. For each
4896	* zone, the number of pages is calculated as:
4897	*
4898	* nr_free_zone_pages = managed_pages - high_pages
4899	*
4900	* Return: number of pages beyond high watermark.
4901	*/
4902	static unsigned long nr_free_zone_pages(int offset)
4903	{
4904	struct zoneref *z;
4905	struct zone *zone;
4906
4907	/ Just pick one node, since fallback list is circular /
4908	unsigned long sum = `0`;
4909
4910	struct zonelist *zonelist = node_zonelist(nid: numa_node_id(), GFP_KERNEL);
4911
4912	for_each_zone_zonelist(zone, z, zonelist, offset) {
4913	unsigned long size = zone_managed_pages(zone);
4914	unsigned long high = high_wmark_pages(zone);
4915	if (size > high)
4916	sum += size - high;
4917	}
4918
4919	return sum;
4920	}
4921
4922	/**
4923	* nr_free_buffer_pages - count number of pages beyond high watermark
4924	*
4925	* nr_free_buffer_pages() counts the number of pages which are beyond the high
4926	* watermark within ZONE_DMA and ZONE_NORMAL.
4927	*
4928	* Return: number of pages beyond high watermark within ZONE_DMA and
4929	* ZONE_NORMAL.
4930	*/
4931	unsigned long nr_free_buffer_pages(void)
4932	{
4933	return nr_free_zone_pages(offset: gfp_zone(GFP_USER));
4934	}
4935	EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4936
4937	static void zoneref_set_zone(struct zone zone, struct* zoneref *zoneref)
4938	{
4939	zoneref->zone = zone;
4940	zoneref->zone_idx = zone_idx(zone);
4941	}
4942
4943	/*
4944	* Builds allocation fallback zone lists.
4945	*
4946	* Add all populated zones of a node to the zonelist.
4947	*/
4948	static int build_zonerefs_node(pg_data_t pgdat, struct* zoneref *zonerefs)
4949	{
4950	struct zone *zone;
4951	enum zone_type zone_type = MAX_NR_ZONES;
4952	int nr_zones = `0`;
4953
4954	do {
4955	zone_type--;
4956	zone = pgdat->node_zones + zone_type;
4957	if (populated_zone(zone)) {
4958	zoneref_set_zone(zone, zoneref: &zonerefs[nr_zones++]);
4959	check_highest_zone(k: zone_type);
4960	}
4961	} while (zone_type);
4962
4963	return nr_zones;
4964	}
4965
4966	#ifdef CONFIG_NUMA
4967
4968	static int __parse_numa_zonelist_order(char *s)
4969	{
4970	/*
4971	* We used to support different zonelists modes but they turned
4972	* out to be just not useful. Let's keep the warning in place
4973	* if somebody still use the cmd line parameter so that we do
4974	* not fail it silently
4975	*/
4976	if (!(s == `'d'` \|\| s == `'D'` \|\| s == `'n'` \|\| s == `'N'`)) {
4977	pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4978	return -EINVAL;
4979	}
4980	return `0`;
4981	}
4982
4983	static char numa_zonelist_order[] = "Node";
4984	#define NUMA_ZONELIST_ORDER_LEN 16
4985	/*
4986	* sysctl handler for numa_zonelist_order
4987	*/
4988	static int numa_zonelist_order_handler(struct ctl_table table, int* write,
4989	void buffer, size_t length, loff_t *ppos)
4990	{
4991	if (write)
4992	return __parse_numa_zonelist_order(s: buffer);
4993	return proc_dostring(table, write, buffer, length, ppos);
4994	}
4995
4996	static int node_load[MAX_NUMNODES];
4997
4998	/**
4999	* find_next_best_node - find the next node that should appear in a given node's fallback list
5000	* @node: node whose fallback list we're appending
5001	* @used_node_mask: nodemask_t of already used nodes
5002	*
5003	* We use a number of factors to determine which is the next node that should
5004	* appear on a given node's fallback list. The node should not have appeared
5005	* already in @node's fallback list, and it should be the next closest node
5006	* according to the distance array (which contains arbitrary distance values
5007	* from each node to each node in the system), and should also prefer nodes
5008	* with no CPUs, since presumably they'll have very little allocation pressure
5009	* on them otherwise.
5010	*
5011	* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5012	*/
5013	int find_next_best_node(int node, nodemask_t *used_node_mask)
5014	{
5015	int n, val;
5016	int min_val = INT_MAX;
5017	int best_node = NUMA_NO_NODE;
5018
5019	/*
5020	* Use the local node if we haven't already, but for memoryless local
5021	* node, we should skip it and fall back to other nodes.
5022	*/
5023	if (!node_isset(node, *used_node_mask) && node_state(node, state: N_MEMORY)) {
5024	node_set(node, *used_node_mask);
5025	return node;
5026	}
5027
5028	for_each_node_state(n, N_MEMORY) {
5029
5030	/ Don't want a node to appear more than once /
5031	if (node_isset(n, *used_node_mask))
5032	continue;
5033
5034	/ Use the distance array to find the distance /
5035	val = node_distance(node, n);
5036
5037	/ Penalize nodes under us ("prefer the next node") /
5038	val += (n < node);
5039
5040	/ Give preference to headless and unused nodes /
5041	if (!cpumask_empty(srcp: cpumask_of_node(node: n)))
5042	val += PENALTY_FOR_NODE_WITH_CPUS;
5043
5044	/ Slight preference for less loaded node /
5045	val *= MAX_NUMNODES;
5046	val += node_load[n];
5047
5048	if (val < min_val) {
5049	min_val = val;
5050	best_node = n;
5051	}
5052	}
5053
5054	if (best_node >= `0`)
5055	node_set(best_node, *used_node_mask);
5056
5057	return best_node;
5058	}
5059
5060
5061	/*
5062	* Build zonelists ordered by node and zones within node.
5063	* This results in maximum locality--normal zone overflows into local
5064	* DMA zone, if any--but risks exhausting DMA zone.
5065	*/
5066	static void build_zonelists_in_node_order(pg_data_t pgdat, int* *node_order,
5067	unsigned nr_nodes)
5068	{
5069	struct zoneref *zonerefs;
5070	int i;
5071
5072	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5073
5074	for (i = `0`; i < nr_nodes; i++) {
5075	int nr_zones;
5076
5077	pg_data_t *node = NODE_DATA(node_order[i]);
5078
5079	nr_zones = build_zonerefs_node(pgdat: node, zonerefs);
5080	zonerefs += nr_zones;
5081	}
5082	zonerefs->zone = NULL;
5083	zonerefs->zone_idx = `0`;
5084	}
5085
5086	/*
5087	* Build gfp_thisnode zonelists
5088	*/
5089	static void build_thisnode_zonelists(pg_data_t *pgdat)
5090	{
5091	struct zoneref *zonerefs;
5092	int nr_zones;
5093
5094	zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5095	nr_zones = build_zonerefs_node(pgdat, zonerefs);
5096	zonerefs += nr_zones;
5097	zonerefs->zone = NULL;
5098	zonerefs->zone_idx = `0`;
5099	}
5100
5101	/*
5102	* Build zonelists ordered by zone and nodes within zones.
5103	* This results in conserving DMA zone[s] until all Normal memory is
5104	* exhausted, but results in overflowing to remote node while memory
5105	* may still exist in local DMA zone.
5106	*/
5107
5108	static void build_zonelists(pg_data_t *pgdat)
5109	{
5110	static int node_order[MAX_NUMNODES];
5111	int node, nr_nodes = `0`;
5112	nodemask_t used_mask = NODE_MASK_NONE;
5113	int local_node, prev_node;
5114
5115	/ NUMA-aware ordering of nodes /
5116	local_node = pgdat->node_id;
5117	prev_node = local_node;
5118
5119	memset(node_order, `0`, sizeof(node_order));
5120	while ((node = find_next_best_node(node: local_node, used_node_mask: &used_mask)) >= `0`) {
5121	/*
5122	* We don't want to pressure a particular node.
5123	* So adding penalty to the first node in same
5124	* distance group to make it round-robin.
5125	*/
5126	if (node_distance(local_node, node) !=
5127	node_distance(local_node, prev_node))
5128	node_load[node] += `1`;
5129
5130	node_order[nr_nodes++] = node;
5131	prev_node = node;
5132	}
5133
5134	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5135	build_thisnode_zonelists(pgdat);
5136	pr_info("Fallback order for Node %d: ", local_node);
5137	for (node = `0`; node < nr_nodes; node++)
5138	pr_cont("%d ", node_order[node]);
5139	pr_cont("\n");
5140	}
5141
5142	#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5143	/*
5144	* Return node id of node used for "local" allocations.
5145	* I.e., first node id of first zone in arg node's generic zonelist.
5146	* Used for initializing percpu 'numa_mem', which is used primarily
5147	* for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5148	*/
5149	int local_memory_node(int node)
5150	{
5151	struct zoneref *z;
5152
5153	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5154	gfp_zone(GFP_KERNEL),
5155	NULL);
5156	return zone_to_nid(z->zone);
5157	}
5158	#endif
5159
5160	static void setup_min_unmapped_ratio(void);
5161	static void setup_min_slab_ratio(void);
5162	#else /* CONFIG_NUMA */
5163
5164	static void build_zonelists(pg_data_t *pgdat)
5165	{
5166	int node, local_node;
5167	struct zoneref *zonerefs;
5168	int nr_zones;
5169
5170	local_node = pgdat->node_id;
5171
5172	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5173	nr_zones = build_zonerefs_node(pgdat, zonerefs);
5174	zonerefs += nr_zones;
5175
5176	/*
5177	* Now we build the zonelist so that it contains the zones
5178	* of all the other nodes.
5179	* We don't want to pressure a particular node, so when
5180	* building the zones for node N, we make sure that the
5181	* zones coming right after the local ones are those from
5182	* node N+1 (modulo N)
5183	*/
5184	for (node = local_node + `1`; node < MAX_NUMNODES; node++) {
5185	if (!node_online(node))
5186	continue;
5187	nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5188	zonerefs += nr_zones;
5189	}
5190	for (node = `0`; node < local_node; node++) {
5191	if (!node_online(node))
5192	continue;
5193	nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5194	zonerefs += nr_zones;
5195	}
5196
5197	zonerefs->zone = NULL;
5198	zonerefs->zone_idx = `0`;
5199	}
5200
5201	#endif /* CONFIG_NUMA */
5202
5203	/*
5204	* Boot pageset table. One per cpu which is going to be used for all
5205	* zones and all nodes. The parameters will be set in such a way
5206	* that an item put on a list will immediately be handed over to
5207	* the buddy list. This is safe since pageset manipulation is done
5208	* with interrupts disabled.
5209	*
5210	* The boot_pagesets must be kept even after bootup is complete for
5211	* unused processors and/or zones. They do play a role for bootstrapping
5212	* hotplugged processors.
5213	*
5214	* zoneinfo_show() and maybe other functions do
5215	* not check if the processor is online before following the pageset pointer.
5216	* Other parts of the kernel may not check if the zone is available.
5217	*/
5218	static void per_cpu_pages_init(struct per_cpu_pages pcp, struct* per_cpu_zonestat *pzstats);
5219	/ These effectively disable the pcplists in the boot pageset completely /
5220	#define BOOT_PAGESET_HIGH 0
5221	#define BOOT_PAGESET_BATCH 1
5222	static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
5223	static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
5224
5225	static void __build_all_zonelists(void *data)
5226	{
5227	int nid;
5228	int __maybe_unused cpu;
5229	pg_data_t *self = data;
5230	unsigned long flags;
5231
5232	/*
5233	* The zonelist_update_seq must be acquired with irqsave because the
5234	* reader can be invoked from IRQ with GFP_ATOMIC.
5235	*/
5236	write_seqlock_irqsave(&zonelist_update_seq, flags);
5237	/*
5238	* Also disable synchronous printk() to prevent any printk() from
5239	* trying to hold port->lock, for
5240	* tty_insert_flip_string_and_push_buffer() on other CPU might be
5241	* calling kmalloc(GFP_ATOMIC \| __GFP_NOWARN) with port->lock held.
5242	*/
5243	printk_deferred_enter();
5244
5245	#ifdef CONFIG_NUMA
5246	memset(node_load, `0`, sizeof(node_load));
5247	#endif
5248
5249	/*
5250	* This node is hotadded and no memory is yet present. So just
5251	* building zonelists is fine - no need to touch other nodes.
5252	*/
5253	if (self && !node_online(self->node_id)) {
5254	build_zonelists(pgdat: self);
5255	} else {
5256	/*
5257	* All possible nodes have pgdat preallocated
5258	* in free_area_init
5259	*/
5260	for_each_node(nid) {
5261	pg_data_t *pgdat = NODE_DATA(nid);
5262
5263	build_zonelists(pgdat);
5264	}
5265
5266	#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5267	/*
5268	* We now know the "local memory node" for each node--
5269	* i.e., the node of the first zone in the generic zonelist.
5270	* Set up numa_mem percpu variable for on-line cpus. During
5271	* boot, only the boot cpu should be on-line; we'll init the
5272	* secondary cpus' numa_mem as they come on-line. During
5273	* node/memory hotplug, we'll fixup all on-line cpus.
5274	*/
5275	for_each_online_cpu(cpu)
5276	set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5277	#endif
5278	}
5279
5280	printk_deferred_exit();
5281	write_sequnlock_irqrestore(sl: &zonelist_update_seq, flags);
5282	}
5283
5284	static noinline void __init
5285	build_all_zonelists_init(void)
5286	{
5287	int cpu;
5288
5289	__build_all_zonelists(NULL);
5290
5291	/*
5292	* Initialize the boot_pagesets that are going to be used
5293	* for bootstrapping processors. The real pagesets for
5294	* each zone will be allocated later when the per cpu
5295	* allocator is available.
5296	*
5297	* boot_pagesets are used also for bootstrapping offline
5298	* cpus if the system is already booted because the pagesets
5299	* are needed to initialize allocators on a specific cpu too.
5300	* F.e. the percpu allocator needs the page allocator which
5301	* needs the percpu allocator in order to allocate its pagesets
5302	* (a chicken-egg dilemma).
5303	*/
5304	for_each_possible_cpu(cpu)
5305	per_cpu_pages_init(pcp: &per_cpu(boot_pageset, cpu), pzstats: &per_cpu(boot_zonestats, cpu));
5306
5307	mminit_verify_zonelist();
5308	cpuset_init_current_mems_allowed();
5309	}
5310
5311	/*
5312	* unless system_state == SYSTEM_BOOTING.
5313	*
5314	* __ref due to call of __init annotated helper build_all_zonelists_init
5315	* [protected by SYSTEM_BOOTING].
5316	*/
5317	void __ref build_all_zonelists(pg_data_t *pgdat)
5318	{
5319	unsigned long vm_total_pages;
5320
5321	if (system_state == SYSTEM_BOOTING) {
5322	build_all_zonelists_init();
5323	} else {
5324	__build_all_zonelists(data: pgdat);
5325	/ cpuset refresh routine should be here /
5326	}
5327	/ Get the number of free pages beyond high watermark in all zones. /
5328	vm_total_pages = nr_free_zone_pages(offset: gfp_zone(GFP_HIGHUSER_MOVABLE));
5329	/*
5330	* Disable grouping by mobility if the number of pages in the
5331	* system is too low to allow the mechanism to work. It would be
5332	* more accurate, but expensive to check per-zone. This check is
5333	* made on memory-hotadd so a system can start with mobility
5334	* disabled and enable it later
5335	*/
5336	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5337	page_group_by_mobility_disabled = `1`;
5338	else
5339	page_group_by_mobility_disabled = `0`;
5340
5341	pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
5342	nr_online_nodes,
5343	page_group_by_mobility_disabled ? "off" : "on",
5344	vm_total_pages);
5345	#ifdef CONFIG_NUMA
5346	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5347	#endif
5348	}
5349
5350	static int zone_batchsize(struct zone *zone)
5351	{
5352	#ifdef CONFIG_MMU
5353	int batch;
5354
5355	/*
5356	* The number of pages to batch allocate is either ~0.1%
5357	* of the zone or 1MB, whichever is smaller. The batch
5358	* size is striking a balance between allocation latency
5359	* and zone lock contention.
5360	*/
5361	batch = min(zone_managed_pages(zone) >> `10`, SZ_1M / PAGE_SIZE);
5362	batch /= `4`; / We effectively = 4 below /*
5363	if (batch < `1`)
5364	batch = `1`;
5365
5366	/*
5367	* Clamp the batch to a 2^n - 1 value. Having a power
5368	* of 2 value was found to be more likely to have
5369	* suboptimal cache aliasing properties in some cases.
5370	*
5371	* For example if 2 tasks are alternately allocating
5372	* batches of pages, one task can end up with a lot
5373	* of pages of one half of the possible page colors
5374	* and the other with pages of the other colors.
5375	*/
5376	batch = rounddown_pow_of_two(batch + batch/`2`) - `1`;
5377
5378	return batch;
5379
5380	#else
5381	/ The deferral and batching of frees should be suppressed under NOMMU*
5382	* conditions.
5383	*
5384	* The problem is that NOMMU needs to be able to allocate large chunks
5385	* of contiguous memory as there's no hardware page translation to
5386	* assemble apparent contiguous memory from discontiguous pages.
5387	*
5388	* Queueing large contiguous runs of pages for batching, however,
5389	* causes the pages to actually be freed in smaller chunks. As there
5390	* can be a significant delay between the individual batches being
5391	* recycled, this leads to the once large chunks of space being
5392	* fragmented and becoming unavailable for high-order allocations.
5393	*/
5394	return `0`;
5395	#endif
5396	}
5397
5398	static int percpu_pagelist_high_fraction;
5399	static int zone_highsize(struct zone zone, int* batch, int cpu_online,
5400	int high_fraction)
5401	{
5402	#ifdef CONFIG_MMU
5403	int high;
5404	int nr_split_cpus;
5405	unsigned long total_pages;
5406
5407	if (!high_fraction) {
5408	/*
5409	* By default, the high value of the pcp is based on the zone
5410	* low watermark so that if they are full then background
5411	* reclaim will not be started prematurely.
5412	*/
5413	total_pages = low_wmark_pages(zone);
5414	} else {
5415	/*
5416	* If percpu_pagelist_high_fraction is configured, the high
5417	* value is based on a fraction of the managed pages in the
5418	* zone.
5419	*/
5420	total_pages = zone_managed_pages(zone) / high_fraction;
5421	}
5422
5423	/*
5424	* Split the high value across all online CPUs local to the zone. Note
5425	* that early in boot that CPUs may not be online yet and that during
5426	* CPU hotplug that the cpumask is not yet updated when a CPU is being
5427	* onlined. For memory nodes that have no CPUs, split the high value
5428	* across all online CPUs to mitigate the risk that reclaim is triggered
5429	* prematurely due to pages stored on pcp lists.
5430	*/
5431	nr_split_cpus = cpumask_weight(srcp: cpumask_of_node(node: zone_to_nid(zone))) + cpu_online;
5432	if (!nr_split_cpus)
5433	nr_split_cpus = num_online_cpus();
5434	high = total_pages / nr_split_cpus;
5435
5436	/*
5437	* Ensure high is at least batch*4. The multiple is based on the
5438	* historical relationship between high and batch.
5439	*/
5440	high = max(high, batch << `2`);
5441
5442	return high;
5443	#else
5444	return `0`;
5445	#endif
5446	}
5447
5448	/*
5449	* pcp->high and pcp->batch values are related and generally batch is lower
5450	* than high. They are also related to pcp->count such that count is lower
5451	* than high, and as soon as it reaches high, the pcplist is flushed.
5452	*
5453	* However, guaranteeing these relations at all times would require e.g. write
5454	* barriers here but also careful usage of read barriers at the read side, and
5455	* thus be prone to error and bad for performance. Thus the update only prevents
5456	* store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
5457	* should ensure they can cope with those fields changing asynchronously, and
5458	* fully trust only the pcp->count field on the local CPU with interrupts
5459	* disabled.
5460	*
5461	* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5462	* outside of boot time (or some other assurance that no concurrent updaters
5463	* exist).
5464	*/
5465	static void pageset_update(struct per_cpu_pages pcp, unsigned* long high_min,
5466	unsigned long high_max, unsigned long batch)
5467	{
5468	WRITE_ONCE(pcp->batch, batch);
5469	WRITE_ONCE(pcp->high_min, high_min);
5470	WRITE_ONCE(pcp->high_max, high_max);
5471	}
5472
5473	static void per_cpu_pages_init(struct per_cpu_pages pcp, struct* per_cpu_zonestat *pzstats)
5474	{
5475	int pindex;
5476
5477	memset(pcp, `0`, sizeof(*pcp));
5478	memset(pzstats, `0`, sizeof(*pzstats));
5479
5480	spin_lock_init(&pcp->lock);
5481	for (pindex = `0`; pindex < NR_PCP_LISTS; pindex++)
5482	INIT_LIST_HEAD(list: &pcp->lists[pindex]);
5483
5484	/*
5485	* Set batch and high values safe for a boot pageset. A true percpu
5486	* pageset's initialization will update them subsequently. Here we don't
5487	* need to be as careful as pageset_update() as nobody can access the
5488	* pageset yet.
5489	*/
5490	pcp->high_min = BOOT_PAGESET_HIGH;
5491	pcp->high_max = BOOT_PAGESET_HIGH;
5492	pcp->batch = BOOT_PAGESET_BATCH;
5493	pcp->free_count = `0`;
5494	}
5495
5496	static void __zone_set_pageset_high_and_batch(struct zone zone, unsigned* long high_min,
5497	unsigned long high_max, unsigned long batch)
5498	{
5499	struct per_cpu_pages *pcp;
5500	int cpu;
5501
5502	for_each_possible_cpu(cpu) {
5503	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5504	pageset_update(pcp, high_min, high_max, batch);
5505	}
5506	}
5507
5508	/*
5509	* Calculate and set new high and batch values for all per-cpu pagesets of a
5510	* zone based on the zone's size.
5511	*/
5512	static void zone_set_pageset_high_and_batch(struct zone zone, int* cpu_online)
5513	{
5514	int new_high_min, new_high_max, new_batch;
5515
5516	new_batch = max(`1`, zone_batchsize(zone));
5517	if (percpu_pagelist_high_fraction) {
5518	new_high_min = zone_highsize(zone, batch: new_batch, cpu_online,
5519	high_fraction: percpu_pagelist_high_fraction);
5520	/*
5521	* PCP high is tuned manually, disable auto-tuning via
5522	* setting high_min and high_max to the manual value.
5523	*/
5524	new_high_max = new_high_min;
5525	} else {
5526	new_high_min = zone_highsize(zone, batch: new_batch, cpu_online, high_fraction: `0`);
5527	new_high_max = zone_highsize(zone, batch: new_batch, cpu_online,
5528	MIN_PERCPU_PAGELIST_HIGH_FRACTION);
5529	}
5530
5531	if (zone->pageset_high_min == new_high_min &&
5532	zone->pageset_high_max == new_high_max &&
5533	zone->pageset_batch == new_batch)
5534	return;
5535
5536	zone->pageset_high_min = new_high_min;
5537	zone->pageset_high_max = new_high_max;
5538	zone->pageset_batch = new_batch;
5539
5540	__zone_set_pageset_high_and_batch(zone, high_min: new_high_min, high_max: new_high_max,
5541	batch: new_batch);
5542	}
5543
5544	void __meminit setup_zone_pageset(struct zone *zone)
5545	{
5546	int cpu;
5547
5548	/ Size may be 0 on !SMP && !NUMA /
5549	if (sizeof(struct per_cpu_zonestat) > `0`)
5550	zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
5551
5552	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
5553	for_each_possible_cpu(cpu) {
5554	struct per_cpu_pages *pcp;
5555	struct per_cpu_zonestat *pzstats;
5556
5557	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5558	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
5559	per_cpu_pages_init(pcp, pzstats);
5560	}
5561
5562	zone_set_pageset_high_and_batch(zone, cpu_online: `0`);
5563	}
5564
5565	/*
5566	* The zone indicated has a new number of managed_pages; batch sizes and percpu
5567	* page high values need to be recalculated.
5568	*/
5569	static void zone_pcp_update(struct zone zone, int* cpu_online)
5570	{
5571	mutex_lock(&pcp_batch_high_lock);
5572	zone_set_pageset_high_and_batch(zone, cpu_online);
5573	mutex_unlock(lock: &pcp_batch_high_lock);
5574	}
5575
5576	static void zone_pcp_update_cacheinfo(struct zone *zone)
5577	{
5578	int cpu;
5579	struct per_cpu_pages *pcp;
5580	struct cpu_cacheinfo *cci;
5581
5582	for_each_online_cpu(cpu) {
5583	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5584	cci = get_cpu_cacheinfo(cpu);
5585	/*
5586	* If data cache slice of CPU is large enough, "pcp->batch"
5587	* pages can be preserved in PCP before draining PCP for
5588	* consecutive high-order pages freeing without allocation.
5589	* This can reduce zone lock contention without hurting
5590	* cache-hot pages sharing.
5591	*/
5592	spin_lock(lock: &pcp->lock);
5593	if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > `3` * pcp->batch)
5594	pcp->flags \|= PCPF_FREE_HIGH_BATCH;
5595	else
5596	pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
5597	spin_unlock(lock: &pcp->lock);
5598	}
5599	}
5600
5601	void setup_pcp_cacheinfo(void)
5602	{
5603	struct zone *zone;
5604
5605	for_each_populated_zone(zone)
5606	zone_pcp_update_cacheinfo(zone);
5607	}
5608
5609	/*
5610	* Allocate per cpu pagesets and initialize them.
5611	* Before this call only boot pagesets were available.
5612	*/
5613	void __init setup_per_cpu_pageset(void)
5614	{
5615	struct pglist_data *pgdat;
5616	struct zone *zone;
5617	int __maybe_unused cpu;
5618
5619	for_each_populated_zone(zone)
5620	setup_zone_pageset(zone);
5621
5622	#ifdef CONFIG_NUMA
5623	/*
5624	* Unpopulated zones continue using the boot pagesets.
5625	* The numa stats for these pagesets need to be reset.
5626	* Otherwise, they will end up skewing the stats of
5627	* the nodes these zones are associated with.
5628	*/
5629	for_each_possible_cpu(cpu) {
5630	struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
5631	memset(pzstats->vm_numa_event, `0`,
5632	sizeof(pzstats->vm_numa_event));
5633	}
5634	#endif
5635
5636	for_each_online_pgdat(pgdat)
5637	pgdat->per_cpu_nodestats =
5638	alloc_percpu(struct per_cpu_nodestat);
5639	}
5640
5641	__meminit void zone_pcp_init(struct zone *zone)
5642	{
5643	/*
5644	* per cpu subsystem is not up at this point. The following code
5645	* relies on the ability of the linker to provide the
5646	* offset of a (static) per cpu variable into the per cpu area.
5647	*/
5648	zone->per_cpu_pageset = &boot_pageset;
5649	zone->per_cpu_zonestats = &boot_zonestats;
5650	zone->pageset_high_min = BOOT_PAGESET_HIGH;
5651	zone->pageset_high_max = BOOT_PAGESET_HIGH;
5652	zone->pageset_batch = BOOT_PAGESET_BATCH;
5653
5654	if (populated_zone(zone))
5655	pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
5656	zone->present_pages, zone_batchsize(zone));
5657	}
5658
5659	void adjust_managed_page_count(struct page page, long* count)
5660	{
5661	atomic_long_add(i: count, v: &page_zone(page)->managed_pages);
5662	totalram_pages_add(count);
5663	#ifdef CONFIG_HIGHMEM
5664	if (PageHighMem(page))
5665	totalhigh_pages_add(count);
5666	#endif
5667	}
5668	EXPORT_SYMBOL(adjust_managed_page_count);
5669
5670	unsigned long free_reserved_area(void start, void* end, int* poison, const char *s)
5671	{
5672	void *pos;
5673	unsigned long pages = `0`;
5674
5675	start = (void )PAGE_ALIGN((unsigned* long)start);
5676	end = (void )((unsigned* long)end & PAGE_MASK);
5677	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5678	struct page *page = virt_to_page(pos);
5679	void *direct_map_addr;
5680
5681	/*
5682	* 'direct_map_addr' might be different from 'pos'
5683	* because some architectures' virt_to_page()
5684	* work with aliases. Getting the direct map
5685	* address ensures that we get a _writeable_
5686	* alias for the memset().
5687	*/
5688	direct_map_addr = page_address(page);
5689	/*
5690	* Perform a kasan-unchecked memset() since this memory
5691	* has not been initialized.
5692	*/
5693	direct_map_addr = kasan_reset_tag(addr: direct_map_addr);
5694	if ((unsigned int)poison <= `0xFF`)
5695	memset(direct_map_addr, poison, PAGE_SIZE);
5696
5697	free_reserved_page(page);
5698	}
5699
5700	if (pages && s)
5701	pr_info("Freeing %s memory: %ldK\n", s, K(pages));
5702
5703	return pages;
5704	}
5705
5706	static int page_alloc_cpu_dead(unsigned int cpu)
5707	{
5708	struct zone *zone;
5709
5710	lru_add_drain_cpu(cpu);
5711	mlock_drain_remote(cpu);
5712	drain_pages(cpu);
5713
5714	/*
5715	* Spill the event counters of the dead processor
5716	* into the current processors event counters.
5717	* This artificially elevates the count of the current
5718	* processor.
5719	*/
5720	vm_events_fold_cpu(cpu);
5721
5722	/*
5723	* Zero the differential counters of the dead processor
5724	* so that the vm statistics are consistent.
5725	*
5726	* This is only okay since the processor is dead and cannot
5727	* race with what we are doing.
5728	*/
5729	cpu_vm_stats_fold(cpu);
5730
5731	for_each_populated_zone(zone)
5732	zone_pcp_update(zone, cpu_online: `0`);
5733
5734	return `0`;
5735	}
5736
5737	static int page_alloc_cpu_online(unsigned int cpu)
5738	{
5739	struct zone *zone;
5740
5741	for_each_populated_zone(zone)
5742	zone_pcp_update(zone, cpu_online: `1`);
5743	return `0`;
5744	}
5745
5746	void __init page_alloc_init_cpuhp(void)
5747	{
5748	int ret;
5749
5750	ret = cpuhp_setup_state_nocalls(state: CPUHP_PAGE_ALLOC,
5751	name: "mm/page_alloc:pcp",
5752	startup: page_alloc_cpu_online,
5753	teardown: page_alloc_cpu_dead);
5754	WARN_ON(ret < `0`);
5755	}
5756
5757	/*
5758	* calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5759	* or min_free_kbytes changes.
5760	*/
5761	static void calculate_totalreserve_pages(void)
5762	{
5763	struct pglist_data *pgdat;
5764	unsigned long reserve_pages = `0`;
5765	enum zone_type i, j;
5766
5767	for_each_online_pgdat(pgdat) {
5768
5769	pgdat->totalreserve_pages = `0`;
5770
5771	for (i = `0`; i < MAX_NR_ZONES; i++) {
5772	struct zone *zone = pgdat->node_zones + i;
5773	long max = `0`;
5774	unsigned long managed_pages = zone_managed_pages(zone);
5775
5776	/ Find valid and maximum lowmem_reserve in the zone /
5777	for (j = i; j < MAX_NR_ZONES; j++) {
5778	if (zone->lowmem_reserve[j] > max)
5779	max = zone->lowmem_reserve[j];
5780	}
5781
5782	/ we treat the high watermark as reserved pages. /
5783	max += high_wmark_pages(zone);
5784
5785	if (max > managed_pages)
5786	max = managed_pages;
5787
5788	pgdat->totalreserve_pages += max;
5789
5790	reserve_pages += max;
5791	}
5792	}
5793	totalreserve_pages = reserve_pages;
5794	}
5795
5796	/*
5797	* setup_per_zone_lowmem_reserve - called whenever
5798	* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5799	* has a correct pages reserved value, so an adequate number of
5800	* pages are left in the zone after a successful __alloc_pages().
5801	*/
5802	static void setup_per_zone_lowmem_reserve(void)
5803	{
5804	struct pglist_data *pgdat;
5805	enum zone_type i, j;
5806
5807	for_each_online_pgdat(pgdat) {
5808	for (i = `0`; i < MAX_NR_ZONES - `1`; i++) {
5809	struct zone *zone = &pgdat->node_zones[i];
5810	int ratio = sysctl_lowmem_reserve_ratio[i];
5811	bool clear = !ratio \|\| !zone_managed_pages(zone);
5812	unsigned long managed_pages = `0`;
5813
5814	for (j = i + `1`; j < MAX_NR_ZONES; j++) {
5815	struct zone *upper_zone = &pgdat->node_zones[j];
5816
5817	managed_pages += zone_managed_pages(upper_zone);
5818
5819	if (clear)
5820	zone->lowmem_reserve[j] = `0`;
5821	else
5822	zone->lowmem_reserve[j] = managed_pages / ratio;
5823	}
5824	}
5825	}
5826
5827	/ update totalreserve_pages /
5828	calculate_totalreserve_pages();
5829	}
5830
5831	static void __setup_per_zone_wmarks(void)
5832	{
5833	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - `10`);
5834	unsigned long lowmem_pages = `0`;
5835	struct zone *zone;
5836	unsigned long flags;
5837
5838	/ Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages /
5839	for_each_zone(zone) {
5840	if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE)
5841	lowmem_pages += zone_managed_pages(zone);
5842	}
5843
5844	for_each_zone(zone) {
5845	u64 tmp;
5846
5847	spin_lock_irqsave(&zone->lock, flags);
5848	tmp = (u64)pages_min * zone_managed_pages(zone);
5849	do_div(tmp, lowmem_pages);
5850	if (is_highmem(zone) \|\| zone_idx(zone) == ZONE_MOVABLE) {
5851	/*
5852	* __GFP_HIGH and PF_MEMALLOC allocations usually don't
5853	* need highmem and movable zones pages, so cap pages_min
5854	* to a small value here.
5855	*
5856	* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5857	* deltas control async page reclaim, and so should
5858	* not be capped for highmem and movable zones.
5859	*/
5860	unsigned long min_pages;
5861
5862	min_pages = zone_managed_pages(zone) / `1024`;
5863	min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, `128UL`);
5864	zone->_watermark[WMARK_MIN] = min_pages;
5865	} else {
5866	/*
5867	* If it's a lowmem zone, reserve a number of pages
5868	* proportionate to the zone's size.
5869	*/
5870	zone->_watermark[WMARK_MIN] = tmp;
5871	}
5872
5873	/*
5874	* Set the kswapd watermarks distance according to the
5875	* scale factor in proportion to available memory, but
5876	* ensure a minimum size on small systems.
5877	*/
5878	tmp = max_t(u64, tmp >> `2`,
5879	mult_frac(zone_managed_pages(zone),
5880	watermark_scale_factor, `10000`));
5881
5882	zone->watermark_boost = `0`;
5883	zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
5884	zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
5885	zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
5886
5887	spin_unlock_irqrestore(lock: &zone->lock, flags);
5888	}
5889
5890	/ update totalreserve_pages /
5891	calculate_totalreserve_pages();
5892	}
5893
5894	/**
5895	* setup_per_zone_wmarks - called when min_free_kbytes changes
5896	* or when memory is hot-{added\|removed}
5897	*
5898	* Ensures that the watermark[min,low,high] values for each zone are set
5899	* correctly with respect to min_free_kbytes.
5900	*/
5901	void setup_per_zone_wmarks(void)
5902	{
5903	struct zone *zone;
5904	static DEFINE_SPINLOCK(lock);
5905
5906	spin_lock(lock: &lock);
5907	__setup_per_zone_wmarks();
5908	spin_unlock(lock: &lock);
5909
5910	/*
5911	* The watermark size have changed so update the pcpu batch
5912	* and high limits or the limits may be inappropriate.
5913	*/
5914	for_each_zone(zone)
5915	zone_pcp_update(zone, cpu_online: `0`);
5916	}
5917
5918	/*
5919	* Initialise min_free_kbytes.
5920	*
5921	* For small machines we want it small (128k min). For large machines
5922	* we want it large (256MB max). But it is not linear, because network
5923	* bandwidth does not increase linearly with machine size. We use
5924	*
5925	* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5926	* min_free_kbytes = sqrt(lowmem_kbytes * 16)
5927	*
5928	* which yields
5929	*
5930	* 16MB: 512k
5931	* 32MB: 724k
5932	* 64MB: 1024k
5933	* 128MB: 1448k
5934	* 256MB: 2048k
5935	* 512MB: 2896k
5936	* 1024MB: 4096k
5937	* 2048MB: 5792k
5938	* 4096MB: 8192k
5939	* 8192MB: 11584k
5940	* 16384MB: 16384k
5941	*/
5942	void calculate_min_free_kbytes(void)
5943	{
5944	unsigned long lowmem_kbytes;
5945	int new_min_free_kbytes;
5946
5947	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> `10`);
5948	new_min_free_kbytes = int_sqrt(lowmem_kbytes * `16`);
5949
5950	if (new_min_free_kbytes > user_min_free_kbytes)
5951	min_free_kbytes = clamp(new_min_free_kbytes, `128`, `262144`);
5952	else
5953	pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5954	new_min_free_kbytes, user_min_free_kbytes);
5955
5956	}
5957
5958	int __meminit init_per_zone_wmark_min(void)
5959	{
5960	calculate_min_free_kbytes();
5961	setup_per_zone_wmarks();
5962	refresh_zone_stat_thresholds();
5963	setup_per_zone_lowmem_reserve();
5964
5965	#ifdef CONFIG_NUMA
5966	setup_min_unmapped_ratio();
5967	setup_min_slab_ratio();
5968	#endif
5969
5970	khugepaged_min_free_kbytes_update();
5971
5972	return `0`;
5973	}
5974	postcore_initcall(init_per_zone_wmark_min)
5975
5976	/*
5977	* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5978	* that we can call two helper functions whenever min_free_kbytes
5979	* changes.
5980	*/
5981	static int min_free_kbytes_sysctl_handler(struct ctl_table table, int* write,
5982	void buffer, size_t length, loff_t *ppos)
5983	{
5984	int rc;
5985
5986	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5987	if (rc)
5988	return rc;
5989
5990	if (write) {
5991	user_min_free_kbytes = min_free_kbytes;
5992	setup_per_zone_wmarks();
5993	}
5994	return `0`;
5995	}
5996
5997	static int watermark_scale_factor_sysctl_handler(struct ctl_table table, int* write,
5998	void buffer, size_t length, loff_t *ppos)
5999	{
6000	int rc;
6001
6002	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6003	if (rc)
6004	return rc;
6005
6006	if (write)
6007	setup_per_zone_wmarks();
6008
6009	return `0`;
6010	}
6011
6012	#ifdef CONFIG_NUMA
6013	static void setup_min_unmapped_ratio(void)
6014	{
6015	pg_data_t *pgdat;
6016	struct zone *zone;
6017
6018	for_each_online_pgdat(pgdat)
6019	pgdat->min_unmapped_pages = `0`;
6020
6021	for_each_zone(zone)
6022	zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
6023	sysctl_min_unmapped_ratio) / `100`;
6024	}
6025
6026
6027	static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table table, int* write,
6028	void buffer, size_t length, loff_t *ppos)
6029	{
6030	int rc;
6031
6032	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6033	if (rc)
6034	return rc;
6035
6036	setup_min_unmapped_ratio();
6037
6038	return `0`;
6039	}
6040
6041	static void setup_min_slab_ratio(void)
6042	{
6043	pg_data_t *pgdat;
6044	struct zone *zone;
6045
6046	for_each_online_pgdat(pgdat)
6047	pgdat->min_slab_pages = `0`;
6048
6049	for_each_zone(zone)
6050	zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
6051	sysctl_min_slab_ratio) / `100`;
6052	}
6053
6054	static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table table, int* write,
6055	void buffer, size_t length, loff_t *ppos)
6056	{
6057	int rc;
6058
6059	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6060	if (rc)
6061	return rc;
6062
6063	setup_min_slab_ratio();
6064
6065	return `0`;
6066	}
6067	#endif
6068
6069	/*
6070	* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6071	* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
6072	* whenever sysctl_lowmem_reserve_ratio changes.
6073	*
6074	* The reserve ratio obviously has absolutely no relation with the
6075	* minimum watermarks. The lowmem reserve ratio can only make sense
6076	* if in function of the boot time zone sizes.
6077	*/
6078	static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
6079	int write, void buffer, size_t length, loff_t *ppos)
6080	{
6081	int i;
6082
6083	proc_dointvec_minmax(table, write, buffer, length, ppos);
6084
6085	for (i = `0`; i < MAX_NR_ZONES; i++) {
6086	if (sysctl_lowmem_reserve_ratio[i] < `1`)
6087	sysctl_lowmem_reserve_ratio[i] = `0`;
6088	}
6089
6090	setup_per_zone_lowmem_reserve();
6091	return `0`;
6092	}
6093
6094	/*
6095	* percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
6096	* cpu. It is the fraction of total pages in each zone that a hot per cpu
6097	* pagelist can have before it gets flushed back to buddy allocator.
6098	*/
6099	static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
6100	int write, void buffer, size_t length, loff_t *ppos)
6101	{
6102	struct zone *zone;
6103	int old_percpu_pagelist_high_fraction;
6104	int ret;
6105
6106	mutex_lock(&pcp_batch_high_lock);
6107	old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
6108
6109	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
6110	if (!write \|\| ret < `0`)
6111	goto out;
6112
6113	/ Sanity checking to avoid pcp imbalance /
6114	if (percpu_pagelist_high_fraction &&
6115	percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
6116	percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
6117	ret = -EINVAL;
6118	goto out;
6119	}
6120
6121	/ No change? /
6122	if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
6123	goto out;
6124
6125	for_each_populated_zone(zone)
6126	zone_set_pageset_high_and_batch(zone, cpu_online: `0`);
6127	out:
6128	mutex_unlock(lock: &pcp_batch_high_lock);
6129	return ret;
6130	}
6131
6132	static struct ctl_table page_alloc_sysctl_table[] = {
6133	{
6134	.procname = "min_free_kbytes",
6135	.data = &min_free_kbytes,
6136	.maxlen = sizeof(min_free_kbytes),
6137	.mode = `0644`,
6138	.proc_handler = min_free_kbytes_sysctl_handler,
6139	.extra1 = SYSCTL_ZERO,
6140	},
6141	{
6142	.procname = "watermark_boost_factor",
6143	.data = &watermark_boost_factor,
6144	.maxlen = sizeof(watermark_boost_factor),
6145	.mode = `0644`,
6146	.proc_handler = proc_dointvec_minmax,
6147	.extra1 = SYSCTL_ZERO,
6148	},
6149	{
6150	.procname = "watermark_scale_factor",
6151	.data = &watermark_scale_factor,
6152	.maxlen = sizeof(watermark_scale_factor),
6153	.mode = `0644`,
6154	.proc_handler = watermark_scale_factor_sysctl_handler,
6155	.extra1 = SYSCTL_ONE,
6156	.extra2 = SYSCTL_THREE_THOUSAND,
6157	},
6158	{
6159	.procname = "percpu_pagelist_high_fraction",
6160	.data = &percpu_pagelist_high_fraction,
6161	.maxlen = sizeof(percpu_pagelist_high_fraction),
6162	.mode = `0644`,
6163	.proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
6164	.extra1 = SYSCTL_ZERO,
6165	},
6166	{
6167	.procname = "lowmem_reserve_ratio",
6168	.data = &sysctl_lowmem_reserve_ratio,
6169	.maxlen = sizeof(sysctl_lowmem_reserve_ratio),
6170	.mode = `0644`,
6171	.proc_handler = lowmem_reserve_ratio_sysctl_handler,
6172	},
6173	#ifdef CONFIG_NUMA
6174	{
6175	.procname = "numa_zonelist_order",
6176	.data = &numa_zonelist_order,
6177	.maxlen = NUMA_ZONELIST_ORDER_LEN,
6178	.mode = `0644`,
6179	.proc_handler = numa_zonelist_order_handler,
6180	},
6181	{
6182	.procname = "min_unmapped_ratio",
6183	.data = &sysctl_min_unmapped_ratio,
6184	.maxlen = sizeof(sysctl_min_unmapped_ratio),
6185	.mode = `0644`,
6186	.proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
6187	.extra1 = SYSCTL_ZERO,
6188	.extra2 = SYSCTL_ONE_HUNDRED,
6189	},
6190	{
6191	.procname = "min_slab_ratio",
6192	.data = &sysctl_min_slab_ratio,
6193	.maxlen = sizeof(sysctl_min_slab_ratio),
6194	.mode = `0644`,
6195	.proc_handler = sysctl_min_slab_ratio_sysctl_handler,
6196	.extra1 = SYSCTL_ZERO,
6197	.extra2 = SYSCTL_ONE_HUNDRED,
6198	},
6199	#endif
6200	{}
6201	};
6202
6203	void __init page_alloc_sysctl_init(void)
6204	{
6205	register_sysctl_init("vm", page_alloc_sysctl_table);
6206	}
6207
6208	#ifdef CONFIG_CONTIG_ALLOC
6209	/ Usage: See admin-guide/dynamic-debug-howto.rst /
6210	static void alloc_contig_dump_pages(struct list_head *page_list)
6211	{
6212	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
6213
6214	if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
6215	struct page *page;
6216
6217	dump_stack();
6218	list_for_each_entry(page, page_list, lru)
6219	dump_page(page, reason: "migration failure");
6220	}
6221	}
6222
6223	/ [start, end) must belong to a single zone. /
6224	int __alloc_contig_migrate_range(struct compact_control *cc,
6225	unsigned long start, unsigned long end)
6226	{
6227	/ This function is based on compact_zone() from compaction.c. /
6228	unsigned int nr_reclaimed;
6229	unsigned long pfn = start;
6230	unsigned int tries = `0`;
6231	int ret = `0`;
6232	struct migration_target_control mtc = {
6233	.nid = zone_to_nid(zone: cc->zone),
6234	.gfp_mask = GFP_USER \| __GFP_MOVABLE \| __GFP_RETRY_MAYFAIL,
6235	};
6236
6237	lru_cache_disable();
6238
6239	while (pfn < end \|\| !list_empty(head: &cc->migratepages)) {
6240	if (fatal_signal_pending(current)) {
6241	ret = -EINTR;
6242	break;
6243	}
6244
6245	if (list_empty(head: &cc->migratepages)) {
6246	cc->nr_migratepages = `0`;
6247	ret = isolate_migratepages_range(cc, low_pfn: pfn, end_pfn: end);
6248	if (ret && ret != -EAGAIN)
6249	break;
6250	pfn = cc->migrate_pfn;
6251	tries = `0`;
6252	} else if (++tries == `5`) {
6253	ret = -EBUSY;
6254	break;
6255	}
6256
6257	nr_reclaimed = reclaim_clean_pages_from_list(zone: cc->zone,
6258	folio_list: &cc->migratepages);
6259	cc->nr_migratepages -= nr_reclaimed;
6260
6261	ret = migrate_pages(l: &cc->migratepages, new: alloc_migration_target,
6262	NULL, private: (unsigned long)&mtc, mode: cc->mode, reason: MR_CONTIG_RANGE, NULL);
6263
6264	/*
6265	* On -ENOMEM, migrate_pages() bails out right away. It is pointless
6266	* to retry again over this error, so do the same here.
6267	*/
6268	if (ret == -ENOMEM)
6269	break;
6270	}
6271
6272	lru_cache_enable();
6273	if (ret < `0`) {
6274	if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
6275	alloc_contig_dump_pages(page_list: &cc->migratepages);
6276	putback_movable_pages(l: &cc->migratepages);
6277	return ret;
6278	}
6279	return `0`;
6280	}
6281
6282	/**
6283	* alloc_contig_range() -- tries to allocate given range of pages
6284	* @start: start PFN to allocate
6285	* @end: one-past-the-last PFN to allocate
6286	* @migratetype: migratetype of the underlying pageblocks (either
6287	* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6288	* in range must have the same migratetype and it must
6289	* be either of the two.
6290	* @gfp_mask: GFP mask to use during compaction
6291	*
6292	* The PFN range does not have to be pageblock aligned. The PFN range must
6293	* belong to a single zone.
6294	*
6295	* The first thing this routine does is attempt to MIGRATE_ISOLATE all
6296	* pageblocks in the range. Once isolated, the pageblocks should not
6297	* be modified by others.
6298	*
6299	* Return: zero on success or negative error code. On success all
6300	* pages which PFN is in [start, end) are allocated for the caller and
6301	* need to be freed with free_contig_range().
6302	*/
6303	int alloc_contig_range(unsigned long start, unsigned long end,
6304	unsigned migratetype, gfp_t gfp_mask)
6305	{
6306	unsigned long outer_start, outer_end;
6307	int order;
6308	int ret = `0`;
6309
6310	struct compact_control cc = {
6311	.nr_migratepages = `0`,
6312	.order = -`1`,
6313	.zone = page_zone(pfn_to_page(start)),
6314	.mode = MIGRATE_SYNC,
6315	.ignore_skip_hint = true,
6316	.no_set_skip_hint = true,
6317	.gfp_mask = current_gfp_context(flags: gfp_mask),
6318	.alloc_contig = true,
6319	};
6320	INIT_LIST_HEAD(list: &cc.migratepages);
6321
6322	/*
6323	* What we do here is we mark all pageblocks in range as
6324	* MIGRATE_ISOLATE. Because pageblock and max order pages may
6325	* have different sizes, and due to the way page allocator
6326	* work, start_isolate_page_range() has special handlings for this.
6327	*
6328	* Once the pageblocks are marked as MIGRATE_ISOLATE, we
6329	* migrate the pages from an unaligned range (ie. pages that
6330	* we are interested in). This will put all the pages in
6331	* range back to page allocator as MIGRATE_ISOLATE.
6332	*
6333	* When this is done, we take the pages in range from page
6334	* allocator removing them from the buddy system. This way
6335	* page allocator will never consider using them.
6336	*
6337	* This lets us mark the pageblocks back as
6338	* MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6339	* aligned range but not in the unaligned, original range are
6340	* put back to page allocator so that buddy can use them.
6341	*/
6342
6343	ret = start_isolate_page_range(start_pfn: start, end_pfn: end, migratetype, flags: `0`, gfp_flags: gfp_mask);
6344	if (ret)
6345	goto done;
6346
6347	drain_all_pages(zone: cc.zone);
6348
6349	/*
6350	* In case of -EBUSY, we'd like to know which page causes problem.
6351	* So, just fall through. test_pages_isolated() has a tracepoint
6352	* which will report the busy page.
6353	*
6354	* It is possible that busy pages could become available before
6355	* the call to test_pages_isolated, and the range will actually be
6356	* allocated. So, if we fall through be sure to clear ret so that
6357	* -EBUSY is not accidentally used or returned to caller.
6358	*/
6359	ret = __alloc_contig_migrate_range(cc: &cc, start, end);
6360	if (ret && ret != -EBUSY)
6361	goto done;
6362	ret = `0`;
6363
6364	/*
6365	* Pages from [start, end) are within a pageblock_nr_pages
6366	* aligned blocks that are marked as MIGRATE_ISOLATE. What's
6367	* more, all pages in [start, end) are free in page allocator.
6368	* What we are going to do is to allocate all pages from
6369	* [start, end) (that is remove them from page allocator).
6370	*
6371	* The only problem is that pages at the beginning and at the
6372	* end of interesting range may be not aligned with pages that
6373	* page allocator holds, ie. they can be part of higher order
6374	* pages. Because of this, we reserve the bigger range and
6375	* once this is done free the pages we are not interested in.
6376	*
6377	* We don't have to hold zone->lock here because the pages are
6378	* isolated thus they won't get removed from buddy.
6379	*/
6380
6381	order = `0`;
6382	outer_start = start;
6383	while (!PageBuddy(pfn_to_page(outer_start))) {
6384	if (++order > MAX_ORDER) {
6385	outer_start = start;
6386	break;
6387	}
6388	outer_start &= ~`0UL` << order;
6389	}
6390
6391	if (outer_start != start) {
6392	order = buddy_order(pfn_to_page(outer_start));
6393
6394	/*
6395	* outer_start page could be small order buddy page and
6396	* it doesn't include start page. Adjust outer_start
6397	* in this case to report failed page properly
6398	* on tracepoint in test_pages_isolated()
6399	*/
6400	if (outer_start + (`1UL` << order) <= start)
6401	outer_start = start;
6402	}
6403
6404	/ Make sure the range is really isolated. /
6405	if (test_pages_isolated(start_pfn: outer_start, end_pfn: end, isol_flags: `0`)) {
6406	ret = -EBUSY;
6407	goto done;
6408	}
6409
6410	/ Grab isolated pages from freelists. /
6411	outer_end = isolate_freepages_range(cc: &cc, start_pfn: outer_start, end_pfn: end);
6412	if (!outer_end) {
6413	ret = -EBUSY;
6414	goto done;
6415	}
6416
6417	/ Free head and tail (if any) /
6418	if (start != outer_start)
6419	free_contig_range(pfn: outer_start, nr_pages: start - outer_start);
6420	if (end != outer_end)
6421	free_contig_range(pfn: end, nr_pages: outer_end - end);
6422
6423	done:
6424	undo_isolate_page_range(start_pfn: start, end_pfn: end, migratetype);
6425	return ret;
6426	}
6427	EXPORT_SYMBOL(alloc_contig_range);
6428
6429	static int __alloc_contig_pages(unsigned long start_pfn,
6430	unsigned long nr_pages, gfp_t gfp_mask)
6431	{
6432	unsigned long end_pfn = start_pfn + nr_pages;
6433
6434	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
6435	gfp_mask);
6436	}
6437
6438	static bool pfn_range_valid_contig(struct zone z, unsigned* long start_pfn,
6439	unsigned long nr_pages)
6440	{
6441	unsigned long i, end_pfn = start_pfn + nr_pages;
6442	struct page *page;
6443
6444	for (i = start_pfn; i < end_pfn; i++) {
6445	page = pfn_to_online_page(pfn: i);
6446	if (!page)
6447	return false;
6448
6449	if (page_zone(page) != z)
6450	return false;
6451
6452	if (PageReserved(page))
6453	return false;
6454
6455	if (PageHuge(page))
6456	return false;
6457	}
6458	return true;
6459	}
6460
6461	static bool zone_spans_last_pfn(const struct zone *zone,
6462	unsigned long start_pfn, unsigned long nr_pages)
6463	{
6464	unsigned long last_pfn = start_pfn + nr_pages - `1`;
6465
6466	return zone_spans_pfn(zone, pfn: last_pfn);
6467	}
6468
6469	/**
6470	* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
6471	* @nr_pages: Number of contiguous pages to allocate
6472	* @gfp_mask: GFP mask to limit search and used during compaction
6473	* @nid: Target node
6474	* @nodemask: Mask for other possible nodes
6475	*
6476	* This routine is a wrapper around alloc_contig_range(). It scans over zones
6477	* on an applicable zonelist to find a contiguous pfn range which can then be
6478	* tried for allocation with alloc_contig_range(). This routine is intended
6479	* for allocation requests which can not be fulfilled with the buddy allocator.
6480	*
6481	* The allocated memory is always aligned to a page boundary. If nr_pages is a
6482	* power of two, then allocated range is also guaranteed to be aligned to same
6483	* nr_pages (e.g. 1GB request would be aligned to 1GB).
6484	*
6485	* Allocated pages can be freed with free_contig_range() or by manually calling
6486	* __free_page() on each allocated page.
6487	*
6488	* Return: pointer to contiguous pages on success, or NULL if not successful.
6489	*/
6490	struct page alloc_contig_pages(unsigned* long nr_pages, gfp_t gfp_mask,
6491	int nid, nodemask_t *nodemask)
6492	{
6493	unsigned long ret, pfn, flags;
6494	struct zonelist *zonelist;
6495	struct zone *zone;
6496	struct zoneref *z;
6497
6498	zonelist = node_zonelist(nid, flags: gfp_mask);
6499	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6500	gfp_zone(gfp_mask), nodemask) {
6501	spin_lock_irqsave(&zone->lock, flags);
6502
6503	pfn = ALIGN(zone->zone_start_pfn, nr_pages);
6504	while (zone_spans_last_pfn(zone, start_pfn: pfn, nr_pages)) {
6505	if (pfn_range_valid_contig(z: zone, start_pfn: pfn, nr_pages)) {
6506	/*
6507	* We release the zone lock here because
6508	* alloc_contig_range() will also lock the zone
6509	* at some point. If there's an allocation
6510	* spinning on this lock, it may win the race
6511	* and cause alloc_contig_range() to fail...
6512	*/
6513	spin_unlock_irqrestore(lock: &zone->lock, flags);
6514	ret = __alloc_contig_pages(start_pfn: pfn, nr_pages,
6515	gfp_mask);
6516	if (!ret)
6517	return pfn_to_page(pfn);
6518	spin_lock_irqsave(&zone->lock, flags);
6519	}
6520	pfn += nr_pages;
6521	}
6522	spin_unlock_irqrestore(lock: &zone->lock, flags);
6523	}
6524	return NULL;
6525	}
6526	#endif /* CONFIG_CONTIG_ALLOC */
6527
6528	void free_contig_range(unsigned long pfn, unsigned long nr_pages)
6529	{
6530	unsigned long count = `0`;
6531
6532	for (; nr_pages--; pfn++) {
6533	struct page *page = pfn_to_page(pfn);
6534
6535	count += page_count(page) != `1`;
6536	__free_page(page);
6537	}
6538	WARN(count != `0`, "%lu pages are still in use!\n", count);
6539	}
6540	EXPORT_SYMBOL(free_contig_range);
6541
6542	/*
6543	* Effectively disable pcplists for the zone by setting the high limit to 0
6544	* and draining all cpus. A concurrent page freeing on another CPU that's about
6545	* to put the page on pcplist will either finish before the drain and the page
6546	* will be drained, or observe the new high limit and skip the pcplist.
6547	*
6548	* Must be paired with a call to zone_pcp_enable().
6549	*/
6550	void zone_pcp_disable(struct zone *zone)
6551	{
6552	mutex_lock(&pcp_batch_high_lock);
6553	__zone_set_pageset_high_and_batch(zone, high_min: `0`, high_max: `0`, batch: `1`);
6554	__drain_all_pages(zone, force_all_cpus: true);
6555	}
6556
6557	void zone_pcp_enable(struct zone *zone)
6558	{
6559	__zone_set_pageset_high_and_batch(zone, high_min: zone->pageset_high_min,
6560	high_max: zone->pageset_high_max, batch: zone->pageset_batch);
6561	mutex_unlock(lock: &pcp_batch_high_lock);
6562	}
6563
6564	void zone_pcp_reset(struct zone *zone)
6565	{
6566	int cpu;
6567	struct per_cpu_zonestat *pzstats;
6568
6569	if (zone->per_cpu_pageset != &boot_pageset) {
6570	for_each_online_cpu(cpu) {
6571	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
6572	drain_zonestat(zone, pzstats);
6573	}
6574	free_percpu(pdata: zone->per_cpu_pageset);
6575	zone->per_cpu_pageset = &boot_pageset;
6576	if (zone->per_cpu_zonestats != &boot_zonestats) {
6577	free_percpu(pdata: zone->per_cpu_zonestats);
6578	zone->per_cpu_zonestats = &boot_zonestats;
6579	}
6580	}
6581	}
6582
6583	#ifdef CONFIG_MEMORY_HOTREMOVE
6584	/*
6585	* All pages in the range must be in a single zone, must not contain holes,
6586	* must span full sections, and must be isolated before calling this function.
6587	*/
6588	void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6589	{
6590	unsigned long pfn = start_pfn;
6591	struct page *page;
6592	struct zone *zone;
6593	unsigned int order;
6594	unsigned long flags;
6595
6596	offline_mem_sections(start_pfn: pfn, end_pfn);
6597	zone = page_zone(pfn_to_page(pfn));
6598	spin_lock_irqsave(&zone->lock, flags);
6599	while (pfn < end_pfn) {
6600	page = pfn_to_page(pfn);
6601	/*
6602	* The HWPoisoned page may be not in buddy system, and
6603	* page_count() is not 0.
6604	*/
6605	if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6606	pfn++;
6607	continue;
6608	}
6609	/*
6610	* At this point all remaining PageOffline() pages have a
6611	* reference count of 0 and can simply be skipped.
6612	*/
6613	if (PageOffline(page)) {
6614	BUG_ON(page_count(page));
6615	BUG_ON(PageBuddy(page));
6616	pfn++;
6617	continue;
6618	}
6619
6620	BUG_ON(page_count(page));
6621	BUG_ON(!PageBuddy(page));
6622	order = buddy_order(page);
6623	del_page_from_free_list(page, zone, order);
6624	pfn += (`1` << order);
6625	}
6626	spin_unlock_irqrestore(lock: &zone->lock, flags);
6627	}
6628	#endif
6629
6630	/*
6631	* This function returns a stable result only if called under zone lock.
6632	*/
6633	bool is_free_buddy_page(struct page *page)
6634	{
6635	unsigned long pfn = page_to_pfn(page);
6636	unsigned int order;
6637
6638	for (order = `0`; order <= MAX_ORDER; order++) {
6639	struct page *page_head = page - (pfn & ((`1` << order) - `1`));
6640
6641	if (PageBuddy(page: page_head) &&
6642	buddy_order_unsafe(page_head) >= order)
6643	break;
6644	}
6645
6646	return order <= MAX_ORDER;
6647	}
6648	EXPORT_SYMBOL(is_free_buddy_page);
6649
6650	#ifdef CONFIG_MEMORY_FAILURE
6651	/*
6652	* Break down a higher-order page in sub-pages, and keep our target out of
6653	* buddy allocator.
6654	*/
6655	static void break_down_buddy_pages(struct zone zone, struct* page *page,
6656	struct page target, int* low, int high,
6657	int migratetype)
6658	{
6659	unsigned long size = `1` << high;
6660	struct page *current_buddy;
6661
6662	while (high > low) {
6663	high--;
6664	size >>= `1`;
6665
6666	if (target >= &page[size]) {
6667	current_buddy = page;
6668	page = page + size;
6669	} else {
6670	current_buddy = page + size;
6671	}
6672
6673	if (set_page_guard(zone, page: current_buddy, order: high, migratetype))
6674	continue;
6675
6676	add_to_free_list(page: current_buddy, zone, order: high, migratetype);
6677	set_buddy_order(page: current_buddy, order: high);
6678	}
6679	}
6680
6681	/*
6682	* Take a page that will be marked as poisoned off the buddy allocator.
6683	*/
6684	bool take_page_off_buddy(struct page *page)
6685	{
6686	struct zone *zone = page_zone(page);
6687	unsigned long pfn = page_to_pfn(page);
6688	unsigned long flags;
6689	unsigned int order;
6690	bool ret = false;
6691
6692	spin_lock_irqsave(&zone->lock, flags);
6693	for (order = `0`; order <= MAX_ORDER; order++) {
6694	struct page *page_head = page - (pfn & ((`1` << order) - `1`));
6695	int page_order = buddy_order(page: page_head);
6696
6697	if (PageBuddy(page: page_head) && page_order >= order) {
6698	unsigned long pfn_head = page_to_pfn(page_head);
6699	int migratetype = get_pfnblock_migratetype(page: page_head,
6700	pfn: pfn_head);
6701
6702	del_page_from_free_list(page: page_head, zone, order: page_order);
6703	break_down_buddy_pages(zone, page: page_head, target: page, low: `0`,
6704	high: page_order, migratetype);
6705	SetPageHWPoisonTakenOff(page);
6706	if (!is_migrate_isolate(migratetype))
6707	__mod_zone_freepage_state(zone, nr_pages: -`1`, migratetype);
6708	ret = true;
6709	break;
6710	}
6711	if (page_count(page: page_head) > `0`)
6712	break;
6713	}
6714	spin_unlock_irqrestore(lock: &zone->lock, flags);
6715	return ret;
6716	}
6717
6718	/*
6719	* Cancel takeoff done by take_page_off_buddy().
6720	*/
6721	bool put_page_back_buddy(struct page *page)
6722	{
6723	struct zone *zone = page_zone(page);
6724	unsigned long pfn = page_to_pfn(page);
6725	unsigned long flags;
6726	int migratetype = get_pfnblock_migratetype(page, pfn);
6727	bool ret = false;
6728
6729	spin_lock_irqsave(&zone->lock, flags);
6730	if (put_page_testzero(page)) {
6731	ClearPageHWPoisonTakenOff(page);
6732	__free_one_page(page, pfn, zone, order: `0`, migratetype, FPI_NONE);
6733	if (TestClearPageHWPoison(page)) {
6734	ret = true;
6735	}
6736	}
6737	spin_unlock_irqrestore(lock: &zone->lock, flags);
6738
6739	return ret;
6740	}
6741	#endif
6742
6743	#ifdef CONFIG_ZONE_DMA
6744	bool has_managed_dma(void)
6745	{
6746	struct pglist_data *pgdat;
6747
6748	for_each_online_pgdat(pgdat) {
6749	struct zone *zone = &pgdat->node_zones[ZONE_DMA];
6750
6751	if (managed_zone(zone))
6752	return true;
6753	}
6754	return false;
6755	}
6756	#endif /* CONFIG_ZONE_DMA */
6757
6758	#ifdef CONFIG_UNACCEPTED_MEMORY
6759
6760	/ Counts number of zones with unaccepted pages. /
6761	static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
6762
6763	static bool lazy_accept = true;
6764
6765	static int __init accept_memory_parse(char *p)
6766	{
6767	if (!strcmp(p, "lazy")) {
6768	lazy_accept = true;
6769	return `0`;
6770	} else if (!strcmp(p, "eager")) {
6771	lazy_accept = false;
6772	return `0`;
6773	} else {
6774	return -EINVAL;
6775	}
6776	}
6777	early_param("accept_memory", accept_memory_parse);
6778
6779	static bool page_contains_unaccepted(struct page page, unsigned* int order)
6780	{
6781	phys_addr_t start = page_to_phys(page);
6782	phys_addr_t end = start + (PAGE_SIZE << order);
6783
6784	return range_contains_unaccepted_memory(start, end);
6785	}
6786
6787	static void accept_page(struct page page, unsigned* int order)
6788	{
6789	phys_addr_t start = page_to_phys(page);
6790
6791	accept_memory(start, end: start + (PAGE_SIZE << order));
6792	}
6793
6794	static bool try_to_accept_memory_one(struct zone *zone)
6795	{
6796	unsigned long flags;
6797	struct page *page;
6798	bool last;
6799
6800	if (list_empty(head: &zone->unaccepted_pages))
6801	return false;
6802
6803	spin_lock_irqsave(&zone->lock, flags);
6804	page = list_first_entry_or_null(&zone->unaccepted_pages,
6805	struct page, lru);
6806	if (!page) {
6807	spin_unlock_irqrestore(lock: &zone->lock, flags);
6808	return false;
6809	}
6810
6811	list_del(entry: &page->lru);
6812	last = list_empty(head: &zone->unaccepted_pages);
6813
6814	__mod_zone_freepage_state(zone, nr_pages: -MAX_ORDER_NR_PAGES, migratetype: MIGRATE_MOVABLE);
6815	__mod_zone_page_state(zone, item: NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
6816	spin_unlock_irqrestore(lock: &zone->lock, flags);
6817
6818	accept_page(page, MAX_ORDER);
6819
6820	__free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
6821
6822	if (last)
6823	static_branch_dec(&zones_with_unaccepted_pages);
6824
6825	return true;
6826	}
6827
6828	static bool try_to_accept_memory(struct zone zone, unsigned* int order)
6829	{
6830	long to_accept;
6831	int ret = false;
6832
6833	/ How much to accept to get to high watermark? /
6834	to_accept = high_wmark_pages(zone) -
6835	(zone_page_state(zone, item: NR_FREE_PAGES) -
6836	__zone_watermark_unusable_free(z: zone, order, alloc_flags: `0`));
6837
6838	/ Accept at least one page /
6839	do {
6840	if (!try_to_accept_memory_one(zone))
6841	break;
6842	ret = true;
6843	to_accept -= MAX_ORDER_NR_PAGES;
6844	} while (to_accept > `0`);
6845
6846	return ret;
6847	}
6848
6849	static inline bool has_unaccepted_memory(void)
6850	{
6851	return static_branch_unlikely(&zones_with_unaccepted_pages);
6852	}
6853
6854	static bool __free_unaccepted(struct page *page)
6855	{
6856	struct zone *zone = page_zone(page);
6857	unsigned long flags;
6858	bool first = false;
6859
6860	if (!lazy_accept)
6861	return false;
6862
6863	spin_lock_irqsave(&zone->lock, flags);
6864	first = list_empty(head: &zone->unaccepted_pages);
6865	list_add_tail(new: &page->lru, head: &zone->unaccepted_pages);
6866	__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, migratetype: MIGRATE_MOVABLE);
6867	__mod_zone_page_state(zone, item: NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
6868	spin_unlock_irqrestore(lock: &zone->lock, flags);
6869
6870	if (first)
6871	static_branch_inc(&zones_with_unaccepted_pages);
6872
6873	return true;
6874	}
6875
6876	#else
6877
6878	static bool page_contains_unaccepted(struct page page, unsigned* int order)
6879	{
6880	return false;
6881	}
6882
6883	static void accept_page(struct page page, unsigned* int order)
6884	{
6885	}
6886
6887	static bool try_to_accept_memory(struct zone zone, unsigned* int order)
6888	{
6889	return false;
6890	}
6891
6892	static inline bool has_unaccepted_memory(void)
6893	{
6894	return false;
6895	}
6896
6897	static bool __free_unaccepted(struct page *page)
6898	{
6899	BUILD_BUG();
6900	return false;
6901	}
6902
6903	#endif /* CONFIG_UNACCEPTED_MEMORY */
6904

source code of linux/mm/page_alloc.c