// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm_init.c - Memory initialisation verification and debugging
 *
 * Copyright 2008 IBM Corporation, 2008
 * Author Mel Gorman <mel@csn.ul.ie>
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/mman.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/padata.h>
#include <linux/nmi.h>
#include <linux/buffer_head.h>
#include <linux/kmemleak.h>
#include <linux/kfence.h>
#include <linux/page_ext.h>
#include <linux/pti.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/cma.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"

#include <asm/setup.h>

#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;

/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
	int nid;

	if (mminit_loglevel < MMINIT_VERIFY)
		return;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		struct zone *zone;
		struct zoneref *z;
		struct zonelist *zonelist;
		int i, listid, zoneid;

		BUILD_BUG_ON(MAX_ZONELISTS > 2);
		for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {

			/* Identify the zone and nodelist */
			zoneid = i % MAX_NR_ZONES;
			listid = i / MAX_NR_ZONES;
			zonelist = &pgdat->node_zonelists[listid];
			zone = &pgdat->node_zones[zoneid];
			if (!populated_zone(zone))
				continue;

			/* Print information about the zonelist */
			printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
				listid > 0 ? "thisnode" : "general", nid,
				zone->name);

			/* Iterate the zonelist */
			for_each_zone_zonelist(zone, z, zonelist, zoneid)
				pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
			pr_cont("\n");
		}
	}
}

void __init mminit_verify_pageflags_layout(void)
{
	int shift, width;
	unsigned long or_mask, add_mask;

	shift = BITS_PER_LONG;
	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
		SECTIONS_WIDTH,
		NODES_WIDTH,
		ZONES_WIDTH,
		LAST_CPUPID_WIDTH,
		KASAN_TAG_WIDTH,
		LRU_GEN_WIDTH,
		LRU_REFS_WIDTH,
		NR_PAGEFLAGS);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
		SECTIONS_SHIFT,
		NODES_SHIFT,
		ZONES_SHIFT,
		LAST_CPUPID_SHIFT,
		KASAN_TAG_WIDTH);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
		"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
		(unsigned long)SECTIONS_PGSHIFT,
		(unsigned long)NODES_PGSHIFT,
		(unsigned long)ZONES_PGSHIFT,
		(unsigned long)LAST_CPUPID_PGSHIFT,
		(unsigned long)KASAN_TAG_PGSHIFT);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
		"Node/Zone ID: %lu -> %lu\n",
		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
		(unsigned long)ZONEID_PGOFF);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
		"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
		shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
#ifdef NODE_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Node not in page flags");
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Last cpupid not in page flags");
#endif

	if (SECTIONS_WIDTH) {
		shift -= SECTIONS_WIDTH;
		BUG_ON(shift != SECTIONS_PGSHIFT);
	}
	if (NODES_WIDTH) {
		shift -= NODES_WIDTH;
		BUG_ON(shift != NODES_PGSHIFT);
	}
	if (ZONES_WIDTH) {
		shift -= ZONES_WIDTH;
		BUG_ON(shift != ZONES_PGSHIFT);
	}

	/* Check for bitmask overlaps */
	or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
			(NODES_MASK << NODES_PGSHIFT) |
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
			(NODES_MASK << NODES_PGSHIFT) +
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	BUG_ON(or_mask != add_mask);
}

static __init int set_mminit_loglevel(char *str)
{
	get_option(&str, &mminit_loglevel);
	return 0;
}
early_param("mminit_loglevel", set_mminit_loglevel);
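
/*
 * Usage example, assuming the mminit_level enum from mm/internal.h where
 * MMINIT_WARNING == 0, MMINIT_VERIFY == 1 and MMINIT_TRACE == 2: booting
 * with "mminit_loglevel=2" enables both the zonelist report above and
 * the MMINIT_TRACE pageflags-layout output.
 */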
#endif /* CONFIG_DEBUG_MEMORY_INIT */

struct kobject *mm_kobj;

#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;

void mm_compute_batch(int overcommit_policy)
{
	u64 memsized_batch;
	s32 nr = num_present_cpus();
	s32 batch = max_t(s32, nr*2, 32);
	unsigned long ram_pages = totalram_pages();

	/*
	 * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
	 * (total memory/#cpus), and lift it to 25% for other policies
	 * to ease possible lock contention on the percpu_counter
	 * vm_committed_as, while the max limit is INT_MAX.
	 */
	if (overcommit_policy == OVERCOMMIT_NEVER)
		memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
	else
		memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);

	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
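
/*
 * Worked example (illustrative numbers): on a 16-CPU machine with 64 GiB
 * of RAM and 4 KiB pages, ram_pages = 16777216 and batch = max(16*2, 32)
 * = 32.  With OVERCOMMIT_NEVER the memory-sized batch is 16777216/16/256
 * = 4096 pages; with any other policy it is 16777216/16/4 = 262144 pages.
 * Either value exceeds 32, so it becomes vm_committed_as_batch.
 */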

static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
					       unsigned long action, void *arg)
{
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		mm_compute_batch(sysctl_overcommit_memory);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int __init mm_compute_batch_init(void)
{
	mm_compute_batch(sysctl_overcommit_memory);
	hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
	return 0;
}

__initcall(mm_compute_batch_init);

#endif

static int __init mm_sysfs_init(void)
{
	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
	if (!mm_kobj)
		return -ENOMEM;

	return 0;
}
postcore_initcall(mm_sysfs_init);

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;

static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

static bool deferred_struct_pages __meminitdata;

static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

static int __init cmdline_parse_core(char *p, unsigned long *core,
				     unsigned long *percent)
{
	unsigned long long coremem;
	char *endptr;

	if (!p)
		return -EINVAL;

	/* Value may be a percentage of total memory, otherwise bytes */
	coremem = simple_strtoull(p, &endptr, 0);
	if (*endptr == '%') {
		/* Paranoid check for percent values greater than 100 */
		WARN_ON(coremem > 100);

		*percent = coremem;
	} else {
		coremem = memparse(p, &p);
		/* Paranoid check that UL is enough for the coremem value */
		WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);

		*core = coremem >> PAGE_SHIFT;
		*percent = 0UL;
	}
	return 0;
}
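
/*
 * Examples (assuming 4 KiB pages): "kernelcore=512M" parses as bytes, so
 * *core = SZ_512M >> PAGE_SHIFT = 131072 pages and *percent = 0, while
 * "kernelcore=30%" leaves *core untouched and sets *percent = 30.
 */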

bool mirrored_kernelcore __initdata_memblock;

/*
 * kernelcore=size sets the amount of memory for use for allocations that
 * cannot be reclaimed or migrated.
 */
static int __init cmdline_parse_kernelcore(char *p)
{
	/* parse kernelcore=mirror */
	if (parse_option_str(p, "mirror")) {
		mirrored_kernelcore = true;
		return 0;
	}

	return cmdline_parse_core(p, &required_kernelcore,
				  &required_kernelcore_percent);
}
early_param("kernelcore", cmdline_parse_kernelcore);

/*
 * movablecore=size sets the amount of memory for use for allocations that
 * can be reclaimed or migrated.
 */
static int __init cmdline_parse_movablecore(char *p)
{
	return cmdline_parse_core(p, &required_movablecore,
				  &required_movablecore_percent);
}
early_param("movablecore", cmdline_parse_movablecore);

/*
 * early_calculate_totalpages()
 * Sum pages in active regions for movable zone.
 * Populate N_MEMORY for calculating usable_nodes.
 */
static unsigned long __init early_calculate_totalpages(void)
{
	unsigned long totalpages = 0;
	unsigned long start_pfn, end_pfn;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		unsigned long pages = end_pfn - start_pfn;

		totalpages += pages;
		if (pages)
			node_set_state(nid, N_MEMORY);
	}
	return totalpages;
}

/*
 * This finds a zone that can be used for ZONE_MOVABLE pages. The
 * assumption is made that zones within a node are ordered by monotonically
 * increasing memory addresses, so that the "highest" populated zone is used.
 */
static void __init find_usable_zone_for_movable(void)
{
	int zone_index;
	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
		if (zone_index == ZONE_MOVABLE)
			continue;

		if (arch_zone_highest_possible_pfn[zone_index] >
				arch_zone_lowest_possible_pfn[zone_index])
			break;
	}

	VM_BUG_ON(zone_index == -1);
	movable_zone = zone_index;
}

/*
 * Find the PFN the Movable zone begins at in each node. Kernel memory
 * is spread evenly between nodes as long as the nodes have enough
 * memory. When they don't, some nodes will have more kernelcore than
 * others.
 */
static void __init find_zone_movable_pfns_for_nodes(void)
{
	int i, nid;
	unsigned long usable_startpfn;
	unsigned long kernelcore_node, kernelcore_remaining;
	/* save the state before borrowing the nodemask */
	nodemask_t saved_node_state = node_states[N_MEMORY];
	unsigned long totalpages = early_calculate_totalpages();
	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
	struct memblock_region *r;

	/* Need to find movable_zone earlier when movable_node is specified. */
	find_usable_zone_for_movable();

	/*
	 * If movable_node is specified, ignore kernelcore and movablecore
	 * options.
	 */
	if (movable_node_is_enabled()) {
		for_each_mem_region(r) {
			if (!memblock_is_hotpluggable(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = PFN_DOWN(r->base);
			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		goto out2;
	}

	/*
	 * If kernelcore=mirror is specified, ignore movablecore option
	 */
	if (mirrored_kernelcore) {
		bool mem_below_4gb_not_mirrored = false;

		if (!memblock_has_mirror()) {
			pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
			goto out;
		}

		for_each_mem_region(r) {
			if (memblock_is_mirror(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = memblock_region_memory_base_pfn(r);

			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
				mem_below_4gb_not_mirrored = true;
				continue;
			}

			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		if (mem_below_4gb_not_mirrored)
			pr_warn("This configuration results in unmirrored kernel memory.\n");

		goto out2;
	}

	/*
	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
	 * amount of necessary memory.
	 */
	if (required_kernelcore_percent)
		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				      10000UL;
	if (required_movablecore_percent)
		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
				       10000UL;

	/*
	 * If movablecore= was specified, calculate what size of
	 * kernelcore that corresponds so that memory usable for
	 * any allocation type is evenly spread. If both kernelcore
	 * and movablecore are specified, then the value of kernelcore
	 * will be used for required_kernelcore if it's greater than
	 * what movablecore would have allowed.
	 */
	if (required_movablecore) {
		unsigned long corepages;

		/*
		 * Round-up so that ZONE_MOVABLE is at least as large as what
		 * was requested by the user
		 */
		required_movablecore =
			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
		required_movablecore = min(totalpages, required_movablecore);
		corepages = totalpages - required_movablecore;

		required_kernelcore = max(required_kernelcore, corepages);
	}

	/*
	 * If kernelcore was not specified or kernelcore size is larger
	 * than totalpages, there is no ZONE_MOVABLE.
	 */
	if (!required_kernelcore || required_kernelcore >= totalpages)
		goto out;

	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

restart:
	/* Spread kernelcore memory as evenly as possible throughout nodes */
	kernelcore_node = required_kernelcore / usable_nodes;
	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Recalculate kernelcore_node if the division per node
		 * now exceeds what is necessary to satisfy the requested
		 * amount of memory for the kernel
		 */
		if (required_kernelcore < kernelcore_node)
			kernelcore_node = required_kernelcore / usable_nodes;

		/*
		 * As the map is walked, we track how much memory is usable
		 * by the kernel using kernelcore_remaining. When it is
		 * 0, the rest of the node is usable by ZONE_MOVABLE
		 */
		kernelcore_remaining = kernelcore_node;

		/* Go through each range of PFNs within this node */
		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			unsigned long size_pages;

			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
			if (start_pfn >= end_pfn)
				continue;

			/* Account for what is only usable for kernelcore */
			if (start_pfn < usable_startpfn) {
				unsigned long kernel_pages;
				kernel_pages = min(end_pfn, usable_startpfn)
								- start_pfn;

				kernelcore_remaining -= min(kernel_pages,
							kernelcore_remaining);
				required_kernelcore -= min(kernel_pages,
							required_kernelcore);

				/* Continue if range is now fully accounted */
				if (end_pfn <= usable_startpfn) {

					/*
					 * Push zone_movable_pfn to the end so
					 * that if we have to rebalance
					 * kernelcore across nodes, we will
					 * not double account here
					 */
					zone_movable_pfn[nid] = end_pfn;
					continue;
				}
				start_pfn = usable_startpfn;
			}

			/*
			 * The usable PFN range for ZONE_MOVABLE is from
			 * start_pfn->end_pfn. Calculate size_pages as the
			 * number of pages used as kernelcore
			 */
			size_pages = end_pfn - start_pfn;
			if (size_pages > kernelcore_remaining)
				size_pages = kernelcore_remaining;
			zone_movable_pfn[nid] = start_pfn + size_pages;

			/*
			 * Some kernelcore has been met, update counts and
			 * break if the kernelcore for this node has been
			 * satisfied
			 */
			required_kernelcore -= min(required_kernelcore,
								size_pages);
			kernelcore_remaining -= size_pages;
			if (!kernelcore_remaining)
				break;
		}
	}

	/*
	 * If there is still required_kernelcore, we do another pass with one
	 * less node in the count. This will push zone_movable_pfn[nid] further
	 * along on the nodes that still have memory until kernelcore is
	 * satisfied
	 */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > usable_nodes)
		goto restart;

out2:
	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
	for (nid = 0; nid < MAX_NUMNODES; nid++) {
		unsigned long start_pfn, end_pfn;

		zone_movable_pfn[nid] =
			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (zone_movable_pfn[nid] >= end_pfn)
			zone_movable_pfn[nid] = 0;
	}

out:
	/* restore the node_state */
	node_states[N_MEMORY] = saved_node_state;
}
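
/*
 * Worked example (illustrative, assuming 4 KiB pages and both nodes'
 * memory inside the usable zone): with two nodes of 4 GiB each and
 * "kernelcore=2G", required_kernelcore is 524288 pages and usable_nodes
 * is 2, so kernelcore_node is 262144 pages (1 GiB) per node; each node's
 * zone_movable_pfn then lands 1 GiB into the node, rounded up to
 * MAX_ORDER_NR_PAGES.
 */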

void __meminit __init_single_page(struct page *page, unsigned long pfn,
				  unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

#ifdef CONFIG_NUMA
/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * treats start/end as pfns.
 */
struct mminit_pfnnid_cache {
	unsigned long last_start;
	unsigned long last_end;
	int last_nid;
};

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 */
static int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}

int hashdist = HASHDIST_DEFAULT;

static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);

static inline void fixup_hashdist(void)
{
	if (num_node_state(N_MEMORY) == 1)
		hashdist = 0;
}
#else
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
	pgdat->first_deferred_pfn = ULONG_MAX;
}

/* Returns true if the struct page for the pfn is initialised */
static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
{
	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return false;

	return true;
}

/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	if (early_page_ext_enabled())
		return false;
	/*
	 * prev_end_pfn static that contains the end of previous zone
	 * No need to protect because called very early in boot before smp_init.
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;
	/*
	 * We start only with one section of pages, more pages are added as
	 * needed until the rest of deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
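
/*
 * For scale (assuming x86-64 SPARSEMEM defaults: 128 MiB sections and
 * 4 KiB pages, so PAGES_PER_SECTION == 32768): only about one section's
 * worth of struct pages in the node's highest zone is initialised here;
 * the first section-aligned pfn past that becomes first_deferred_pfn and
 * everything beyond it is left for parallel deferred initialisation.
 */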

static void __meminit init_reserved_page(unsigned long pfn, int nid)
{
	pg_data_t *pgdat;
	int zid;

	if (early_page_initialised(pfn, nid))
		return;

	pgdat = NODE_DATA(nid);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_spans_pfn(zone, pfn))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}

static inline bool early_page_initialised(unsigned long pfn, int nid)
{
	return true;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}

static inline void init_reserved_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void __meminit reserve_bootmem_region(phys_addr_t start,
				      phys_addr_t end, int nid)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++) {
		if (pfn_valid(start_pfn)) {
			struct page *page = pfn_to_page(start_pfn);

			init_reserved_page(start_pfn, nid);

			/* Avoid false-positive PageTail() */
			INIT_LIST_HEAD(&page->lru);

			/*
			 * no need for atomic set_bit because the struct
			 * page is not visible yet so nobody should
			 * access it yet.
			 */
			__SetPageReserved(page);
		}
	}
}

/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
	static struct memblock_region *r;

	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
			for_each_mem_region(r) {
				if (*pfn < memblock_region_memory_end_pfn(r))
					break;
			}
		}
		if (*pfn >= memblock_region_memory_base_pfn(r) &&
		    memblock_is_mirror(r)) {
			*pfn = memblock_region_memory_end_pfn(r);
			return true;
		}
	}
	return false;
}

/*
 * Only struct pages that correspond to ranges defined by memblock.memory
 * are zeroed and initialized by going through __init_single_page() during
 * memmap_init_zone_range().
 *
 * But, there could be struct pages that correspond to holes in
 * memblock.memory. This can happen because of the following reasons:
 * - physical memory bank size is not necessarily the exact multiple of the
 *   arbitrary section size
 * - early reserved memory may not be listed in memblock.memory
 * - memory layouts defined with memmap= kernel parameter may not align
 *   nicely with memmap sections
 *
 * Explicitly initialize those struct pages so that:
 * - PG_Reserved is set
 * - zone and node links point to zone and node that span the page if the
 *   hole is in the middle of a zone
 * - zone and node links point to adjacent zone/node if the hole falls on
 *   the zone boundary; the pages in such holes will be prepended to the
 *   zone/node above the hole except for the trailing pages in the last
 *   section that will be appended to the zone/node below.
 */
static void __init init_unavailable_range(unsigned long spfn,
					  unsigned long epfn,
					  int zone, int node)
{
	unsigned long pfn;
	u64 pgcnt = 0;

	for (pfn = spfn; pfn < epfn; pfn++) {
		if (!pfn_valid(pageblock_start_pfn(pfn))) {
			pfn = pageblock_end_pfn(pfn) - 1;
			continue;
		}
		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
		__SetPageReserved(pfn_to_page(pfn));
		pgcnt++;
	}

	if (pgcnt)
		pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
			node, zone_names[zone], pgcnt);
}

/*
 * Initially all pages are reserved - free ones are freed
 * up by memblock_free_all() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, unsigned long zone_end_pfn,
		enum meminit_context context,
		struct vmem_altmap *altmap, int migratetype)
{
	unsigned long pfn, end_pfn = start_pfn + size;
	struct page *page;

	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Honor reservation requested by the driver for this ZONE_DEVICE
	 * memory. We limit the total number of pages to initialize to just
	 * those that might contain the memory mapping. We will defer the
	 * ZONE_DEVICE page initialization until after we have released
	 * the hotplug lock.
	 */
	if (zone == ZONE_DEVICE) {
		if (!altmap)
			return;

		if (start_pfn == altmap->base_pfn)
			start_pfn += altmap->reserve;
		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
	}
#endif

	for (pfn = start_pfn; pfn < end_pfn; ) {
		/*
		 * There can be holes in boot-time mem_map[]s handed to this
		 * function. They do not exist on hotplugged memory.
		 */
		if (context == MEMINIT_EARLY) {
			if (overlap_memmap_init(zone, &pfn))
				continue;
			if (defer_init(nid, pfn, zone_end_pfn)) {
				deferred_struct_pages = true;
				break;
			}
		}

		page = pfn_to_page(pfn);
		__init_single_page(page, pfn, zone, nid);
		if (context == MEMINIT_HOTPLUG)
			__SetPageReserved(page);

		/*
		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
		 * such that unmovable allocations won't be scattered all
		 * over the place during system boot.
		 */
		if (pageblock_aligned(pfn)) {
			set_pageblock_migratetype(page, migratetype);
			cond_resched();
		}
		pfn++;
	}
}

static void __init memmap_init_zone_range(struct zone *zone,
					  unsigned long start_pfn,
					  unsigned long end_pfn,
					  unsigned long *hole_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);

	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);

	if (start_pfn >= end_pfn)
		return;

	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);

	if (*hole_pfn < start_pfn)
		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);

	*hole_pfn = end_pfn;
}

static void __init memmap_init(void)
{
	unsigned long start_pfn, end_pfn;
	unsigned long hole_pfn = 0;
	int i, j, zone_id = 0, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		struct pglist_data *node = NODE_DATA(nid);

		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = node->node_zones + j;

			if (!populated_zone(zone))
				continue;

			memmap_init_zone_range(zone, start_pfn, end_pfn,
					       &hole_pfn);
			zone_id = j;
		}
	}

#ifdef CONFIG_SPARSEMEM
	/*
	 * Initialize the memory map for the hole in the range [memory_end,
	 * section_end].
	 * Append the pages in this hole to the highest zone in the last
	 * node.
	 * The call to init_unavailable_range() is outside the ifdef to
	 * silence the compiler warning about zone_id set but not used;
	 * for FLATMEM it is a nop anyway.
	 */
	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
	if (hole_pfn < end_pfn)
#endif
		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}

#ifdef CONFIG_ZONE_DEVICE
static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
					  unsigned long zone_idx, int nid,
					  struct dev_pagemap *pgmap)
{

	__init_single_page(page, pfn, zone_idx, nid);

	/*
	 * Mark page reserved as it will need to wait for onlining
	 * phase for it to be fully associated with a zone.
	 *
	 * We can use the non-atomic __set_bit operation for setting
	 * the flag as we are still initializing the pages.
	 */
	__SetPageReserved(page);

	/*
	 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
	 * and zone_device_data. It is a bug if a ZONE_DEVICE page is
	 * ever freed or placed on a driver-private list.
	 */
	page->pgmap = pgmap;
	page->zone_device_data = NULL;

	/*
	 * Mark the block movable so that blocks are reserved for
	 * movable at startup. This will force kernel allocations
	 * to reserve their blocks rather than leaking throughout
	 * the address space during boot when many long-lived
	 * kernel allocations are made.
	 *
	 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
	 * because this is done early in section_activate()
	 */
	if (pageblock_aligned(pfn)) {
		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		cond_resched();
	}

	/*
	 * ZONE_DEVICE pages are released directly to the driver page allocator
	 * which will set the page count to 1 when allocating the page.
	 */
	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
	    pgmap->type == MEMORY_DEVICE_COHERENT)
		set_page_count(page, 0);
}

/*
 * With compound page geometry and when struct pages are stored in ram most
 * tail pages are reused. Consequently, the amount of unique struct pages to
 * initialize is a lot smaller than the total amount of struct pages being
 * mapped. This is a paired / mild layering violation with explicit knowledge
 * of how the sparse_vmemmap internals handle compound pages in the absence
 * of an altmap. See vmemmap_populate_compound_pages().
 */
static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
					      struct dev_pagemap *pgmap)
{
	if (!vmemmap_can_optimize(altmap, pgmap))
		return pgmap_vmemmap_nr(pgmap);

	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
}
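
/*
 * Worked example (assuming 4 KiB pages, a 64-byte struct page and
 * VMEMMAP_RESERVE_NR == 2): when the vmemmap can be optimized, only
 * 2 * (4096 / 64) = 128 unique struct pages per compound page need
 * initializing, however large pgmap_vmemmap_nr() is.
 */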

static void __ref memmap_init_compound(struct page *head,
				       unsigned long head_pfn,
				       unsigned long zone_idx, int nid,
				       struct dev_pagemap *pgmap,
				       unsigned long nr_pages)
{
	unsigned long pfn, end_pfn = head_pfn + nr_pages;
	unsigned int order = pgmap->vmemmap_shift;

	__SetPageHead(head);
	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
		prep_compound_tail(head, pfn - head_pfn);
		set_page_count(page, 0);

		/*
		 * The first tail page stores important compound page info.
		 * Call prep_compound_head() after the first tail page has
		 * been initialized, to not have the data overwritten.
		 */
		if (pfn == head_pfn + 1)
			prep_compound_head(head, order);
	}
}

void __ref memmap_init_zone_device(struct zone *zone,
				   unsigned long start_pfn,
				   unsigned long nr_pages,
				   struct dev_pagemap *pgmap)
{
	unsigned long pfn, end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
	unsigned long zone_idx = zone_idx(zone);
	unsigned long start = jiffies;
	int nid = pgdat->node_id;

	if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
		return;

	/*
	 * The call to memmap_init should have already taken care
	 * of the pages reserved for the memmap, so we can just jump to
	 * the end of that region and start processing the device pages.
	 */
	if (altmap) {
		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
		nr_pages = end_pfn - start_pfn;
	}

	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
		struct page *page = pfn_to_page(pfn);

		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);

		if (pfns_per_compound == 1)
			continue;

		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
				     compound_nr_pages(altmap, pgmap));
	}

	pr_debug("%s initialised %lu pages in %ums\n", __func__,
		 nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif

/*
 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
 * because it is sized independent of architecture. Unlike the other zones,
 * the starting point for ZONE_MOVABLE is not fixed. It may be different
 * in each node depending on the size of each node and how evenly kernelcore
 * is distributed. This helper function adjusts the zone ranges
 * provided by the architecture for a given node by using the end of the
 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 * zones within a node are in order of monotonically increasing memory
 * addresses.
 */
static void __init adjust_zone_range_for_zone_movable(int nid,
					unsigned long zone_type,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	/* Only adjust if ZONE_MOVABLE is on this node */
	if (zone_movable_pfn[nid]) {
		/* Size ZONE_MOVABLE */
		if (zone_type == ZONE_MOVABLE) {
			*zone_start_pfn = zone_movable_pfn[nid];
			*zone_end_pfn = min(node_end_pfn,
				arch_zone_highest_possible_pfn[movable_zone]);

		/* Adjust for ZONE_MOVABLE starting within this range */
		} else if (!mirrored_kernelcore &&
			   *zone_start_pfn < zone_movable_pfn[nid] &&
			   *zone_end_pfn > zone_movable_pfn[nid]) {
			*zone_end_pfn = zone_movable_pfn[nid];

		/* Check if this whole range is within ZONE_MOVABLE */
		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
			*zone_start_pfn = *zone_end_pfn;
	}
}
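
/*
 * Example (illustrative pfns): if a node spans pfns 0-1048576 and
 * zone_movable_pfn[nid] == 786432, a ZONE_NORMAL range that originally
 * ended at 1048576 is truncated to end at 786432, while ZONE_MOVABLE is
 * sized to run from 786432 to the end of the highest usable zone.
 */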

/*
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
 * then all holes in the requested range will be accounted for.
 */
unsigned long __init __absent_pages_in_range(int nid,
					     unsigned long range_start_pfn,
					     unsigned long range_end_pfn)
{
	unsigned long nr_absent = range_end_pfn - range_start_pfn;
	unsigned long start_pfn, end_pfn;
	int i;

	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
		nr_absent -= end_pfn - start_pfn;
	}
	return nr_absent;
}
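
/*
 * Example (illustrative): for a range of pfns 0-1000 where memblock
 * reports memory at 0-200 and 500-1000, the walk subtracts the 200 and
 * 500 present pages from the initial 1000, leaving 300 absent pages for
 * the hole at 200-500.
 */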

/**
 * absent_pages_in_range - Return number of page frames in holes within a range
 * @start_pfn: The start PFN to start searching for holes
 * @end_pfn: The end PFN to stop searching for holes
 *
 * Return: the number of page frames in memory holes within a range.
 */
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
					   unsigned long end_pfn)
{
	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
}

/* Return the number of page frames in holes in a zone on a node */
static unsigned long __init zone_absent_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long zone_start_pfn,
					unsigned long zone_end_pfn)
{
	unsigned long nr_absent;

	/* zone is empty, we don't have any absent pages */
	if (zone_start_pfn == zone_end_pfn)
		return 0;

	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);

	/*
	 * ZONE_MOVABLE handling.
	 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
	 * and vice versa.
	 */
	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
		unsigned long start_pfn, end_pfn;
		struct memblock_region *r;

		for_each_mem_region(r) {
			start_pfn = clamp(memblock_region_memory_base_pfn(r),
					  zone_start_pfn, zone_end_pfn);
			end_pfn = clamp(memblock_region_memory_end_pfn(r),
					zone_start_pfn, zone_end_pfn);

			if (zone_type == ZONE_MOVABLE &&
			    memblock_is_mirror(r))
				nr_absent += end_pfn - start_pfn;

			if (zone_type == ZONE_NORMAL &&
			    !memblock_is_mirror(r))
				nr_absent += end_pfn - start_pfn;
		}
	}

	return nr_absent;
}

/*
 * Return the number of pages a zone spans in a node, including holes
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
static unsigned long __init zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long node_start_pfn,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];

	/* Get the start and end of the zone */
	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
	adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
					   zone_start_pfn, zone_end_pfn);

	/* Check that this node has pages within the zone's required range */
	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
		return 0;

	/* Move the zone boundaries inside the node if necessary */
	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);

	/* Return the spanned pages */
	return *zone_end_pfn - *zone_start_pfn;
}

static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
		z->zone_start_pfn = 0;
		z->spanned_pages = 0;
		z->present_pages = 0;
#if defined(CONFIG_MEMORY_HOTPLUG)
		z->present_early_pages = 0;
#endif
	}

	pgdat->node_spanned_pages = 0;
	pgdat->node_present_pages = 0;
	pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
}

static void __init calculate_node_totalpages(struct pglist_data *pgdat,
					     unsigned long node_start_pfn,
					     unsigned long node_end_pfn)
{
	unsigned long realtotalpages = 0, totalpages = 0;
	enum zone_type i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;
		unsigned long zone_start_pfn, zone_end_pfn;
		unsigned long spanned, absent;
		unsigned long real_size;

		spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
						     node_start_pfn,
						     node_end_pfn,
						     &zone_start_pfn,
						     &zone_end_pfn);
		absent = zone_absent_pages_in_node(pgdat->node_id, i,
						   zone_start_pfn,
						   zone_end_pfn);

		real_size = spanned - absent;

		if (spanned)
			zone->zone_start_pfn = zone_start_pfn;
		else
			zone->zone_start_pfn = 0;
		zone->spanned_pages = spanned;
		zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
		zone->present_early_pages = real_size;
#endif

		totalpages += spanned;
		realtotalpages += real_size;
	}

	pgdat->node_spanned_pages = totalpages;
	pgdat->node_present_pages = realtotalpages;
	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}

static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
					     unsigned long present_pages)
{
	unsigned long pages = spanned_pages;

	/*
	 * Provide a more accurate estimation if there are holes within
	 * the zone and SPARSEMEM is in use. If there are holes within the
	 * zone, each populated memory region may cost us one or two extra
	 * memmap pages due to alignment because memmap pages for each
	 * populated regions may not be naturally aligned on page boundary.
	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
	 */
	if (spanned_pages > present_pages + (present_pages >> 4) &&
	    IS_ENABLED(CONFIG_SPARSEMEM))
		pages = present_pages;

	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
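
/*
 * Worked example (assuming 4 KiB pages and a 64-byte struct page): a
 * zone spanning 1048576 pages (4 GiB) needs 1048576 * 64 bytes = 64 MiB
 * of memmap, so this returns 16384 pages.
 */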

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;

	spin_lock_init(&ds_queue->split_queue_lock);
	INIT_LIST_HEAD(&ds_queue->split_queue);
	ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif

#ifdef CONFIG_COMPACTION
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
{
	init_waitqueue_head(&pgdat->kcompactd_wait);
}
#else
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif

static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
	int i;

	pgdat_resize_init(pgdat);
	pgdat_kswapd_lock_init(pgdat);

	pgdat_init_split_queue(pgdat);
	pgdat_init_kcompactd(pgdat);

	init_waitqueue_head(&pgdat->kswapd_wait);
	init_waitqueue_head(&pgdat->pfmemalloc_wait);

	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
		init_waitqueue_head(&pgdat->reclaim_wait[i]);

	pgdat_page_ext_init(pgdat);
	lruvec_init(&pgdat->__lruvec);
}

static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
					  unsigned long remaining_pages)
{
	atomic_long_set(&zone->managed_pages, remaining_pages);
	zone_set_nid(zone, nid);
	zone->name = zone_names[idx];
	zone->zone_pgdat = NODE_DATA(nid);
	spin_lock_init(&zone->lock);
	zone_seqlock_init(zone);
	zone_pcp_init(zone);
}

static void __meminit zone_init_free_lists(struct zone *zone)
{
	unsigned int order, t;
	for_each_migratetype_order(order, t) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
		zone->free_area[order].nr_free = 0;
	}

#ifdef CONFIG_UNACCEPTED_MEMORY
	INIT_LIST_HEAD(&zone->unaccepted_pages);
#endif
}

void __meminit init_currently_empty_zone(struct zone *zone,
					 unsigned long zone_start_pfn,
					 unsigned long size)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int zone_idx = zone_idx(zone) + 1;

	if (zone_idx > pgdat->nr_zones)
		pgdat->nr_zones = zone_idx;

	zone->zone_start_pfn = zone_start_pfn;

	mminit_dprintk(MMINIT_TRACE, "memmap_init",
			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
			pgdat->node_id,
			(unsigned long)zone_idx(zone),
			zone_start_pfn, (zone_start_pfn + size));

	zone_init_free_lists(zone);
	zone->initialized = 1;
}

#ifndef CONFIG_SPARSEMEM
/*
 * Calculate the size of the zone->blockflags rounded to an unsigned long
 * Start by making sure zonesize is a multiple of pageblock_order by rounding
 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
 * round what is now in bits to nearest long in bits, then return it in
 * bytes.
 */
static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
	usemapsize = roundup(zonesize, pageblock_nr_pages);
	usemapsize = usemapsize >> pageblock_order;
	usemapsize *= NR_PAGEBLOCK_BITS;
	usemapsize = roundup(usemapsize, BITS_PER_LONG);

	return usemapsize / BITS_PER_BYTE;
}
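
/*
 * Worked example (assuming pageblock_order == 9, so 512 pages per
 * pageblock, and NR_PAGEBLOCK_BITS == 4): an aligned zone of 1048576
 * pages covers 2048 pageblocks, needing 2048 * 4 = 8192 bits, already a
 * multiple of BITS_PER_LONG, which comes to 1024 bytes.
 */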
1433
1434static void __ref setup_usemap(struct zone *zone)
1435{
1436 unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
1437 zone->spanned_pages);
1438 zone->pageblock_flags = NULL;
1439 if (usemapsize) {
1440 zone->pageblock_flags =
1441 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
1442 zone_to_nid(zone));
1443 if (!zone->pageblock_flags)
1444 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
1445 usemapsize, zone->name, zone_to_nid(zone));
1446 }
1447}
1448#else
1449static inline void setup_usemap(struct zone *zone) {}
1450#endif /* CONFIG_SPARSEMEM */
1451
1452#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
1453
1454/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
1455void __init set_pageblock_order(void)
1456{
1457 unsigned int order = MAX_ORDER;
1458
1459 /* Check that pageblock_nr_pages has not already been setup */
1460 if (pageblock_order)
1461 return;
1462
1463 /* Don't let pageblocks exceed the maximum allocation granularity. */
1464 if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
1465 order = HUGETLB_PAGE_ORDER;
1466
1467 /*
1468 * Assume the largest contiguous order of interest is a huge page.
1469 * This value may be variable depending on boot parameters on IA64 and
1470 * powerpc.
1471 */
1472 pageblock_order = order;
1473}
1474#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1475
1476/*
1477 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
1478 * is unused as pageblock_order is set at compile-time. See
1479 * include/linux/pageblock-flags.h for the values of pageblock_order based on
1480 * the kernel config
1481 */
1482void __init set_pageblock_order(void)
1483{
1484}
1485
1486#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1487
1488/*
1489 * Set up the zone data structures
1490 * - init pgdat internals
1491 * - init all zones belonging to this node
1492 *
1493 * NOTE: this function is only called during memory hotplug
1494 */
1495#ifdef CONFIG_MEMORY_HOTPLUG
1496void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
1497{
1498 int nid = pgdat->node_id;
1499 enum zone_type z;
1500 int cpu;
1501
1502 pgdat_init_internals(pgdat);
1503
1504 if (pgdat->per_cpu_nodestats == &boot_nodestats)
1505 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
1506
1507 /*
1508 * Reset the nr_zones, order and highest_zoneidx before reuse.
1509 * Note that kswapd will init kswapd_highest_zoneidx properly
1510 * when it starts in the near future.
1511 */
1512 pgdat->nr_zones = 0;
1513 pgdat->kswapd_order = 0;
1514 pgdat->kswapd_highest_zoneidx = 0;
1515 pgdat->node_start_pfn = 0;
1516 pgdat->node_present_pages = 0;
1517
1518 for_each_online_cpu(cpu) {
1519 struct per_cpu_nodestat *p;
1520
1521 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
1522 memset(p, 0, sizeof(*p));
1523 }
1524
1525 /*
1526 * When memory is hot-added, all the memory is in offline state. So
1527 * clear all zones' present_pages and managed_pages because they will
1528 * be updated in online_pages() and offline_pages().
1529 */
1530 for (z = 0; z < MAX_NR_ZONES; z++) {
1531 struct zone *zone = pgdat->node_zones + z;
1532
1533 zone->present_pages = 0;
1534 zone_init_internals(zone, z, nid, 0);
1535 }
1536}
1537#endif
1538
1539/*
1540 * Set up the zone data structures:
1541 * - mark all pages reserved
1542 * - mark all memory queues empty
1543 * - clear the memory bitmaps
1544 *
1545 * NOTE: pgdat should get zeroed by caller.
1546 * NOTE: this function is only called during early init.
1547 */
1548static void __init free_area_init_core(struct pglist_data *pgdat)
1549{
1550 enum zone_type j;
1551 int nid = pgdat->node_id;
1552
1553 pgdat_init_internals(pgdat);
1554 pgdat->per_cpu_nodestats = &boot_nodestats;
1555
1556 for (j = 0; j < MAX_NR_ZONES; j++) {
1557 struct zone *zone = pgdat->node_zones + j;
1558 unsigned long size, freesize, memmap_pages;
1559
1560 size = zone->spanned_pages;
1561 freesize = zone->present_pages;
1562
1563 /*
1564 * Adjust freesize so that it accounts for how much memory
1565 * is used by this zone for memmap. This affects the watermark
1566 * and per-cpu initialisations
1567 */
1568 memmap_pages = calc_memmap_size(size, freesize);
1569 if (!is_highmem_idx(j)) {
1570 if (freesize >= memmap_pages) {
1571 freesize -= memmap_pages;
1572 if (memmap_pages)
1573 pr_debug(" %s zone: %lu pages used for memmap\n",
1574 zone_names[j], memmap_pages);
1575 } else
1576 pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
1577 zone_names[j], memmap_pages, freesize);
1578 }
1579
1580 /* Account for reserved pages */
1581 if (j == 0 && freesize > dma_reserve) {
1582 freesize -= dma_reserve;
1583 pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
1584 }
1585
1586 if (!is_highmem_idx(j))
1587 nr_kernel_pages += freesize;
1588 /* Charge for highmem memmap if there are enough kernel pages */
1589 else if (nr_kernel_pages > memmap_pages * 2)
1590 nr_kernel_pages -= memmap_pages;
1591 nr_all_pages += freesize;
1592
1593 /*
1594 * Set an approximate value for lowmem here, it will be adjusted
1595 * when the bootmem allocator frees pages into the buddy system.
1596 * And all highmem pages will be managed by the buddy system.
1597 */
1598 zone_init_internals(zone, j, nid, freesize);
1599
1600 if (!size)
1601 continue;
1602
1603 setup_usemap(zone);
1604 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
1605 }
1606}
1607
1608void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
1609 phys_addr_t min_addr, int nid, bool exact_nid)
1610{
1611 void *ptr;
1612
1613 if (exact_nid)
1614 ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
1615 MEMBLOCK_ALLOC_ACCESSIBLE,
1616 nid);
1617 else
1618 ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
1619 MEMBLOCK_ALLOC_ACCESSIBLE,
1620 nid);
1621
1622 if (ptr && size > 0)
1623 page_init_poison(page: ptr, size);
1624
1625 return ptr;
1626}
1627
1628#ifdef CONFIG_FLATMEM
1629static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1630{
1631 unsigned long __maybe_unused start = 0;
1632 unsigned long __maybe_unused offset = 0;
1633
1634 /* Skip empty nodes */
1635 if (!pgdat->node_spanned_pages)
1636 return;
1637
1638 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
1639 offset = pgdat->node_start_pfn - start;
1640 /* ia64 gets its own node_mem_map, before this, without bootmem */
1641 if (!pgdat->node_mem_map) {
1642 unsigned long size, end;
1643 struct page *map;
1644
1645 /*
1646 * The zone's endpoints aren't required to be MAX_ORDER
1647 * aligned but the node_mem_map endpoints must be in order
1648 * for the buddy allocator to function correctly.
1649 */
1650 end = pgdat_end_pfn(pgdat);
1651 end = ALIGN(end, MAX_ORDER_NR_PAGES);
1652 size = (end - start) * sizeof(struct page);
1653 map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
1654 pgdat->node_id, false);
1655 if (!map)
1656 panic("Failed to allocate %ld bytes for node %d memory map\n",
1657 size, pgdat->node_id);
1658 pgdat->node_mem_map = map + offset;
1659 }
1660 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
1661 __func__, pgdat->node_id, (unsigned long)pgdat,
1662 (unsigned long)pgdat->node_mem_map);
1663#ifndef CONFIG_NUMA
1664 /*
1665 * With no DISCONTIG, the global mem_map is just set as node 0's
1666 */
1667 if (pgdat == NODE_DATA(0)) {
1668 mem_map = NODE_DATA(0)->node_mem_map;
1669 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
1670 mem_map -= offset;
1671 }
1672#endif
1673}
1674#else
1675static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
1676#endif /* CONFIG_FLATMEM */
1677
1678/**
1679 * get_pfn_range_for_nid - Return the start and end page frames for a node
1680 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
1681 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
1682 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
1683 *
1684 * It returns the start and end page frame of a node based on information
1685 * provided by memblock_set_node(). If called for a node
1686 * with no available memory, the start and end PFNs will be 0.
1687 */
1688void __init get_pfn_range_for_nid(unsigned int nid,
1689 unsigned long *start_pfn, unsigned long *end_pfn)
1690{
1691 unsigned long this_start_pfn, this_end_pfn;
1692 int i;
1693
1694 *start_pfn = -1UL;
1695 *end_pfn = 0;
1696
1697 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
1698 *start_pfn = min(*start_pfn, this_start_pfn);
1699 *end_pfn = max(*end_pfn, this_end_pfn);
1700 }
1701
1702 if (*start_pfn == -1UL)
1703 *start_pfn = 0;
1704}
1705
1706static void __init free_area_init_node(int nid)
1707{
1708 pg_data_t *pgdat = NODE_DATA(nid);
1709 unsigned long start_pfn = 0;
1710 unsigned long end_pfn = 0;
1711
1712 /* pg_data_t should be reset to zero when it's allocated */
1713 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
1714
1715 get_pfn_range_for_nid(nid, start_pfn: &start_pfn, end_pfn: &end_pfn);
1716
1717 pgdat->node_id = nid;
1718 pgdat->node_start_pfn = start_pfn;
1719 pgdat->per_cpu_nodestats = NULL;
1720
1721 if (start_pfn != end_pfn) {
1722 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
1723 (u64)start_pfn << PAGE_SHIFT,
1724 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
1725
1726 calculate_node_totalpages(pgdat, node_start_pfn: start_pfn, node_end_pfn: end_pfn);
1727 } else {
1728 pr_info("Initmem setup node %d as memoryless\n", nid);
1729
1730 reset_memoryless_node_totalpages(pgdat);
1731 }
1732
1733 alloc_node_mem_map(pgdat);
1734 pgdat_set_deferred_range(pgdat);
1735
1736 free_area_init_core(pgdat);
1737 lru_gen_init_pgdat(pgdat);
1738}
1739
1740/* Any regular or high memory on that node ? */
1741static void __init check_for_memory(pg_data_t *pgdat)
1742{
1743 enum zone_type zone_type;
1744
1745 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
1746 struct zone *zone = &pgdat->node_zones[zone_type];
1747 if (populated_zone(zone)) {
1748 if (IS_ENABLED(CONFIG_HIGHMEM))
1749 node_set_state(node: pgdat->node_id, state: N_HIGH_MEMORY);
1750 if (zone_type <= ZONE_NORMAL)
1751 node_set_state(node: pgdat->node_id, state: N_NORMAL_MEMORY);
1752 break;
1753 }
1754 }
1755}
1756
1757#if MAX_NUMNODES > 1
1758/*
1759 * Figure out the number of possible node ids.
1760 */
1761void __init setup_nr_node_ids(void)
1762{
1763 unsigned int highest;
1764
1765 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
1766 nr_node_ids = highest + 1;
1767}
1768#endif
1769
1770/*
1771 * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
1772 * such cases we allow max_zone_pfn sorted in the descending order
1773 */
1774static bool arch_has_descending_max_zone_pfns(void)
1775{
1776 return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
1777}
1778
1779/**
1780 * free_area_init - Initialise all pg_data_t and zone data
1781 * @max_zone_pfn: an array of max PFNs for each zone
1782 *
1783 * This will call free_area_init_node() for each active node in the system.
1784 * Using the page ranges provided by memblock_set_node(), the size of each
1785 * zone in each node and their holes is calculated. If the maximum PFN
1786 * between two adjacent zones match, it is assumed that the zone is empty.
1787 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
1788 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
1789 * starts where the previous one ended. For example, ZONE_DMA32 starts
1790 * at arch_max_dma_pfn.
1791 */
void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
	descending = arch_has_descending_max_zone_pfns();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		if (zone == ZONE_MOVABLE)
			continue;

		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
				(u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	set_pageblock_order();

	for_each_node(nid) {
		pg_data_t *pgdat;

		if (!node_online(nid)) {
			/* Allocator not initialized yet */
			pgdat = arch_alloc_nodedata(nid);
			if (!pgdat)
				panic("Cannot allocate %zuB for node %d.\n",
				      sizeof(*pgdat), nid);
			arch_refresh_nodedata(nid, pgdat);
			free_area_init_node(nid);

			/*
			 * We do not want to confuse userspace by sysfs
			 * files/directories for node without any memory
			 * attached to it, so this node is not marked as
			 * N_MEMORY and not marked online so that no sysfs
			 * hierarchy will be created via register_one_node for
			 * it. The pgdat will get fully initialized by
			 * hotadd_init_pgdat() when memory is hotplugged into
			 * this node.
			 */
			continue;
		}

		pgdat = NODE_DATA(nid);
		free_area_init_node(nid);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_MEMORY);
		check_for_memory(pgdat);
	}

	memmap_init();

	/* disable hash distribution for systems with a single node */
	fixup_hashdist();
}
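
/*
 * A minimal caller sketch (illustrative only, modelled loosely on what an
 * arch like x86 does in its zone_sizes_init(); the names and limits below
 * are examples, not this file's API):
 *
 *	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };
 *
 *	max_zone_pfns[ZONE_DMA32] = min(max_pfn, PFN_DOWN(SZ_4G));
 *	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 *	free_area_init(max_zone_pfns);
 */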

/**
 * node_map_pfn_alignment - determine the maximum internode alignment
 *
 * This function should be called after node map is populated and sorted.
 * It calculates the maximum power of two alignment which can distinguish
 * all the nodes.
 *
 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
 * nodes are shifted by 256MiB, the return value indicates 256MiB. Note
 * that if only the last node is shifted, 1GiB is enough and this function
 * will indicate so.
 *
 * This is used to test whether pfn -> nid mapping of the chosen memory
 * model has fine enough granularity to avoid incorrect mapping for the
 * populated node map.
 *
 * Return: the determined alignment in pfn's. 0 if there is no alignment
 * requirement (single node).
 */
unsigned long __init node_map_pfn_alignment(void)
{
	unsigned long accl_mask = 0, last_end = 0;
	unsigned long start, end, mask;
	int last_nid = NUMA_NO_NODE;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
		if (!start || last_nid < 0 || last_nid == nid) {
			last_nid = nid;
			last_end = end;
			continue;
		}

		/*
		 * Start with a mask granular enough to pin-point to the
		 * start pfn and tick off bits one-by-one until it becomes
		 * too coarse to separate the current node from the last.
		 */
		mask = ~((1 << __ffs(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	/* convert mask to number of pages */
	return ~accl_mask + 1;
}
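
/*
 * Worked example (hypothetical layout): suppose node 0 covers pfns
 * [0, 0x8000) and node 1 starts at pfn 0x9000.  The walk starts with
 * mask = ~0xfff (the natural alignment of 0x9000) and keeps coarsening
 * while 0x9000 rounded down to the coarser mask still lands at or above
 * 0x8000; it stops at mask = ~0x7fff, so the result is 0x8000 pfns,
 * i.e. 128MiB granularity with 4KiB pages.
 */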

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(unsigned long pfn,
				       unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	if (!nr_pages)
		return;

	page = pfn_to_page(pfn);

	/* Free a large naturally-aligned chunk if possible */
	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
		__free_pages_core(page, MAX_ORDER);
		return;
	}

	/* Accept chunks smaller than MAX_ORDER upfront */
	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));

	for (i = 0; i < nr_pages; i++, page++, pfn++) {
		if (pageblock_aligned(pfn))
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		__free_pages_core(page, 0);
	}
}

/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);

static inline void __init pgdat_init_report_one_done(void)
{
	if (atomic_dec_and_test(&pgdat_init_n_undone))
		complete(&pgdat_init_all_done_comp);
}

/*
 * Returns true if page needs to be initialized or freed to buddy allocator.
 *
 * We check if a current MAX_ORDER block is valid by only checking the validity
 * of the head pfn.
 */
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
	if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
		return false;
	return true;
}

/*
 * Free pages to buddy allocator. Try to free aligned pages in
 * MAX_ORDER_NR_PAGES sizes.
 */
static void __init deferred_free_pages(unsigned long pfn,
				       unsigned long end_pfn)
{
	unsigned long nr_free = 0;

	for (; pfn < end_pfn; pfn++) {
		if (!deferred_pfn_valid(pfn)) {
			deferred_free_range(pfn - nr_free, nr_free);
			nr_free = 0;
		} else if (IS_MAX_ORDER_ALIGNED(pfn)) {
			deferred_free_range(pfn - nr_free, nr_free);
			nr_free = 1;
		} else {
			nr_free++;
		}
	}
	/* Free the last block of pages to allocator */
	deferred_free_range(pfn - nr_free, nr_free);
}
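
/*
 * Trace of the loop above (illustrative, assuming MAX_ORDER_NR_PAGES == 8
 * and pfns 0..11 all valid): reaching pfn 8 flushes the run [0, 8) as one
 * naturally-aligned MAX_ORDER chunk and restarts counting at pfn 8; the
 * final call after the loop flushes the remaining run [8, 12) as order-0
 * pages.
 */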

/*
 * Initialize struct pages. We minimize pfn page lookups and scheduler checks
 * by performing it only once every MAX_ORDER_NR_PAGES.
 * Return number of pages initialized.
 */
static unsigned long __init deferred_init_pages(struct zone *zone,
						unsigned long pfn,
						unsigned long end_pfn)
{
	int nid = zone_to_nid(zone);
	unsigned long nr_pages = 0;
	int zid = zone_idx(zone);
	struct page *page = NULL;

	for (; pfn < end_pfn; pfn++) {
		if (!deferred_pfn_valid(pfn)) {
			page = NULL;
			continue;
		} else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {
			page = pfn_to_page(pfn);
		} else {
			page++;
		}
		__init_single_page(page, pfn, zid, nid);
		nr_pages++;
	}
	return nr_pages;
}

/*
 * This function is meant to pre-load the iterator for the zone init.
 * Specifically it walks through the ranges until we are caught up to the
 * first_init_pfn value and exits there. If we never encounter the value we
 * return false indicating there are no valid ranges left.
 */
static bool __init
deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
				    unsigned long *spfn, unsigned long *epfn,
				    unsigned long first_init_pfn)
{
	u64 j;

	/*
	 * Start out by walking through the ranges in this zone that have
	 * already been initialized. We don't need to do anything with them
	 * so we just need to flush them out of the system.
	 */
	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
		if (*epfn <= first_init_pfn)
			continue;
		if (*spfn < first_init_pfn)
			*spfn = first_init_pfn;
		*i = j;
		return true;
	}

	return false;
}

/*
 * Initialize and free pages. We do it in two loops: first we initialize
 * struct page, then free to buddy allocator, because while we are
 * freeing pages we can access pages that are ahead (computing buddy
 * page in __free_one_page()).
 *
 * In order to try and keep some memory in the cache we have the loop
 * broken along max page order boundaries. This way we will not cause
 * any issues with the buddy page computation.
 */
static unsigned long __init
deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
		       unsigned long *end_pfn)
{
	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
	unsigned long spfn = *start_pfn, epfn = *end_pfn;
	unsigned long nr_pages = 0;
	u64 j = *i;

	/* First we loop through and initialize the page values */
	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
		unsigned long t;

		if (mo_pfn <= *start_pfn)
			break;

		t = min(mo_pfn, *end_pfn);
		nr_pages += deferred_init_pages(zone, *start_pfn, t);

		if (mo_pfn < *end_pfn) {
			*start_pfn = mo_pfn;
			break;
		}
	}

	/* Reset values and now loop through freeing pages as needed */
	swap(j, *i);

	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
		unsigned long t;

		if (mo_pfn <= spfn)
			break;

		t = min(mo_pfn, epfn);
		deferred_free_pages(spfn, t);

		if (mo_pfn <= epfn)
			break;
	}

	return nr_pages;
}

static void __init
deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
			   void *arg)
{
	unsigned long spfn, epfn;
	struct zone *zone = arg;
	u64 i;

	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);

	/*
	 * Initialize and free pages in MAX_ORDER sized increments so that we
	 * can avoid introducing any issues with the buddy allocator.
	 */
	while (spfn < end_pfn) {
		deferred_init_maxorder(&i, zone, &spfn, &epfn);
		cond_resched();
	}
}

/* An arch may override for more concurrency. */
__weak int __init
deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	return 1;
}
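
/*
 * Sketch of an arch override (illustrative; x86 takes a similar approach):
 * use one thread per CPU local to the node, falling back to a single
 * thread when the node has no local CPUs.
 *
 *	int __init deferred_page_init_max_threads(const struct cpumask *mask)
 *	{
 *		return max_t(int, cpumask_weight(mask), 1);
 *	}
 */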

/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
	pg_data_t *pgdat = data;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
	unsigned long spfn = 0, epfn = 0;
	unsigned long first_init_pfn, flags;
	unsigned long start = jiffies;
	struct zone *zone;
	int zid, max_threads;
	u64 i;

	/* Bind memory initialisation thread to a local node if possible */
	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(current, cpumask);

	pgdat_resize_lock(pgdat, &flags);
	first_init_pfn = pgdat->first_deferred_pfn;
	if (first_init_pfn == ULONG_MAX) {
		pgdat_resize_unlock(pgdat, &flags);
		pgdat_init_report_one_done();
		return 0;
	}

	/* Sanity check boundaries */
	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
	pgdat->first_deferred_pfn = ULONG_MAX;

	/*
	 * Once we unlock here, the zone cannot be grown anymore, thus if an
	 * interrupt thread must allocate this early in boot, zone must be
	 * pre-grown prior to start of deferred page initialization.
	 */
	pgdat_resize_unlock(pgdat, &flags);

	/* Only the highest zone is deferred so find it */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		zone = pgdat->node_zones + zid;
		if (first_init_pfn < zone_end_pfn(zone))
			break;
	}

	/* If the zone is empty somebody else may have cleared out the zone */
	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						 first_init_pfn))
		goto zone_empty;

	max_threads = deferred_page_init_max_threads(cpumask);

	while (spfn < epfn) {
		unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
		struct padata_mt_job job = {
			.thread_fn	= deferred_init_memmap_chunk,
			.fn_arg		= zone,
			.start		= spfn,
			.size		= epfn_align - spfn,
			.align		= PAGES_PER_SECTION,
			.min_chunk	= PAGES_PER_SECTION,
			.max_threads	= max_threads,
		};

		padata_do_multithreaded(&job);
		deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						    epfn_align);
	}
zone_empty:
	/* Sanity check that the next zone really is unpopulated */
	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));

	pr_info("node %d deferred pages initialised in %ums\n",
		pgdat->node_id, jiffies_to_msecs(jiffies - start));

	pgdat_init_report_one_done();
	return 0;
}

/*
 * If this zone has deferred pages, try to grow it by initializing enough
 * deferred pages to satisfy the allocation specified by order, rounded up to
 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
 * of SECTION_SIZE bytes by initializing struct pages in increments of
 * PAGES_PER_SECTION * sizeof(struct page) bytes.
 *
 * Return true when zone was grown, otherwise return false. We return true even
 * when we grow less than requested, to let the caller decide if there are
 * enough pages to satisfy the allocation.
 *
 * Note: We use noinline because this function is needed only during boot, and
 * it is called from a __ref function _deferred_grow_zone. This way we are
 * making sure that it is not inlined into permanent text section.
 */
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
{
	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
	pg_data_t *pgdat = zone->zone_pgdat;
	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
	unsigned long spfn, epfn, flags;
	unsigned long nr_pages = 0;
	u64 i;

	/* Only the last zone may have deferred pages */
	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
		return false;

	pgdat_resize_lock(pgdat, &flags);

	/*
	 * If someone grew this zone while we were waiting for spinlock, return
	 * true, as there might be enough pages already.
	 */
	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
		pgdat_resize_unlock(pgdat, &flags);
		return true;
	}

	/* If the zone is empty somebody else may have cleared out the zone */
	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						 first_deferred_pfn)) {
		pgdat->first_deferred_pfn = ULONG_MAX;
		pgdat_resize_unlock(pgdat, &flags);
		/* Retry only once. */
		return first_deferred_pfn != ULONG_MAX;
	}

	/*
	 * Initialize and free pages in MAX_ORDER sized increments so
	 * that we can avoid introducing any issues with the buddy
	 * allocator.
	 */
	while (spfn < epfn) {
		/* update our first deferred PFN for this section */
		first_deferred_pfn = spfn;

		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
		touch_nmi_watchdog();

		/* We should only stop along section boundaries */
		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
			continue;

		/* If our quota has been met we can stop here */
		if (nr_pages >= nr_pages_needed)
			break;
	}

	pgdat->first_deferred_pfn = spfn;
	pgdat_resize_unlock(pgdat, &flags);

	return nr_pages > 0;
}
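
/*
 * Example of the rounding above (hypothetical configuration): with 4KiB
 * pages and PAGES_PER_SECTION == 32768, an order-9 request (512 pages)
 * still initialises a full section, since ALIGN(512, 32768) == 32768.
 */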

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_CMA
/* Free the whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_pageblock_migratetype(page, MIGRATE_CMA);
	set_page_refcounted(page);
	__free_pages(page, pageblock_order);

	adjust_managed_page_count(page, pageblock_nr_pages);
	page_zone(page)->cma_pages += pageblock_nr_pages;
}
#endif

void set_zone_contiguous(struct zone *zone)
{
	unsigned long block_start_pfn = zone->zone_start_pfn;
	unsigned long block_end_pfn;

	block_end_pfn = pageblock_end_pfn(block_start_pfn);
	for (; block_start_pfn < zone_end_pfn(zone);
			block_start_pfn = block_end_pfn,
			 block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));

		if (!__pageblock_pfn_to_page(block_start_pfn,
					     block_end_pfn, zone))
			return;
		cond_resched();
	}

	/* We confirm that there is no hole */
	zone->contiguous = true;
}

void __init page_alloc_init_late(void)
{
	struct zone *zone;
	int nid;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

	/* There will be num_node_state(N_MEMORY) threads */
	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
	for_each_node_state(nid, N_MEMORY) {
		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
	}

	/* Block until all are initialised */
	wait_for_completion(&pgdat_init_all_done_comp);

	/*
	 * We initialized the rest of the deferred pages. Permanently disable
	 * on-demand struct page initialization.
	 */
	static_branch_disable(&deferred_pages);

	/* Reinit limits that are based on free pages after the kernel is up */
	files_maxfiles_init();
#endif

	buffer_init();

	/* Discard memblock private memory */
	memblock_discard();

	for_each_node_state(nid, N_MEMORY)
		shuffle_free_memory(NODE_DATA(nid));

	for_each_populated_zone(zone)
		set_zone_contiguous(zone);

	/* Initialize page ext after all struct pages are initialized. */
	if (deferred_struct_pages)
		page_ext_init();

	page_alloc_sysctl_init();
}

#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
 * Returns the number of pages that arch has reserved but
 * is not known to alloc_large_system_hash().
 */
static unsigned long __init arch_reserved_kernel_pages(void)
{
	return 0;
}
#endif

/*
 * Adaptive scale is meant to reduce sizes of hash tables on large memory
 * machines. As memory size is increased the scale is also increased but at
 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
 * quadruples the scale is increased by one, which means the size of hash table
 * only doubles, instead of quadrupling as well.
 * Because 32-bit systems cannot have the large amounts of physical memory
 * where this scaling makes sense, it is disabled on such platforms.
 */
#if __BITS_PER_LONG > 32
#define ADAPT_SCALE_BASE	(64ul << 30)
#define ADAPT_SCALE_SHIFT	2
#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
#endif
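
/*
 * Worked example (hypothetical machine sizes): relative to a 64GiB
 * baseline, a 256GiB machine has 4x the pages, so the adaptive loop in
 * alloc_large_system_hash() below bumps scale by one and the resulting
 * table is only 2x as large; a 1TiB machine (16x the pages) gets scale +2
 * and a table 4x as large.
 */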

/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long low_limit,
				     unsigned long high_limit)
{
	unsigned long long max = high_limit;
	unsigned long log2qty, size;
	void *table;
	gfp_t gfp_flags;
	bool virt;
	bool huge;

	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = nr_kernel_pages;
		numentries -= arch_reserved_kernel_pages();

		/* It isn't necessary when PAGE_SIZE >= 1MB */
		if (PAGE_SIZE < SZ_1M)
			numentries = round_up(numentries, SZ_1M / PAGE_SIZE);

#if __BITS_PER_LONG > 32
		if (!high_limit) {
			unsigned long adapt;

			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
			     adapt <<= ADAPT_SCALE_SHIFT)
				scale++;
		}
#endif

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);

		if (unlikely((numentries * bucketsize) < PAGE_SIZE))
			numentries = PAGE_SIZE / bucketsize;
	}
	numentries = roundup_pow_of_two(numentries);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}
	max = min(max, 0x80000000ULL);

	if (numentries < low_limit)
		numentries = low_limit;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);

	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
	do {
		virt = false;
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY) {
			if (flags & HASH_ZERO)
				table = memblock_alloc(size, SMP_CACHE_BYTES);
			else
				table = memblock_alloc_raw(size,
							   SMP_CACHE_BYTES);
		} else if (get_order(size) > MAX_ORDER || hashdist) {
			table = vmalloc_huge(size, gfp_flags);
			virt = true;
			if (table)
				huge = is_vm_area_hugepages(table);
		} else {
			/*
			 * If bucketsize is not a power-of-two, we may free
			 * some pages at the end of hash table which
			 * alloc_pages_exact() automatically does
			 */
			table = alloc_pages_exact(size, gfp_flags);
			kmemleak_alloc(table, size, 1, gfp_flags);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}
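
/*
 * Typical usage, modelled loosely on the dentry hash table setup in
 * fs/dcache.c (parameter values are illustrative):
 *
 *	dentry_hashtable = alloc_large_system_hash("Dentry cache",
 *					sizeof(struct hlist_bl_head),
 *					dhash_entries, 13, HASH_ZERO,
 *					&d_hash_shift, NULL, 0, 0);
 */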

/**
 * set_dma_reserve - set the specified number of pages reserved in the first zone
 * @new_dma_reserve: The number of pages to mark reserved
 *
 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
 * In the DMA zone, a significant percentage may be consumed by kernel image
 * and other unfreeable allocations which can skew the watermarks badly. This
 * function may optionally be used to account for unfreeable pages in the
 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
 * smaller per-cpu batchsize.
 */
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
	dma_reserve = new_dma_reserve;
}

void __init memblock_free_pages(struct page *page, unsigned long pfn,
				unsigned int order)
{
	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
		int nid = early_pfn_to_nid(pfn);

		if (!early_page_initialised(pfn, nid))
			return;
	}

	if (!kmsan_memblock_free_pages(page, order)) {
		/* KMSAN will take care of these pages. */
		return;
	}
	__free_pages_core(page, order);
}

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);

static bool _init_on_alloc_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);

static bool _init_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);
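
/*
 * For example, booting with "init_on_alloc=1 init_on_free=1" on the kernel
 * command line requests zero-initialisation of pages both when they are
 * allocated and when they are freed, regardless of the Kconfig defaults.
 */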

DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

/*
 * Enable static keys related to various memory debugging and hardening options.
 * Some override others, and depend on early params that are evaluated in the
 * order of appearance. So we need to first gather the full picture of what was
 * enabled, and then make decisions.
 */
static void __init mem_debugging_and_hardening_init(void)
{
	bool page_poisoning_requested = false;
	bool want_check_pages = false;

#ifdef CONFIG_PAGE_POISONING
	/*
	 * Page poisoning is debug page alloc for some arches. If
	 * either of those options are enabled, enable poisoning.
	 */
	if (page_poisoning_enabled() ||
	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
	      debug_pagealloc_enabled())) {
		static_branch_enable(&_page_poisoning_enabled);
		page_poisoning_requested = true;
		want_check_pages = true;
	}
#endif

	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
	    page_poisoning_requested) {
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
			"will take precedence over init_on_alloc and init_on_free\n");
		_init_on_alloc_enabled_early = false;
		_init_on_free_enabled_early = false;
	}

	if (_init_on_alloc_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_on_alloc);
	} else {
		static_branch_disable(&init_on_alloc);
	}

	if (_init_on_free_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_on_free);
	} else {
		static_branch_disable(&init_on_free);
	}

	if (IS_ENABLED(CONFIG_KMSAN) &&
	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");

#ifdef CONFIG_DEBUG_PAGEALLOC
	if (debug_pagealloc_enabled()) {
		want_check_pages = true;
		static_branch_enable(&_debug_pagealloc_enabled);

		if (debug_guardpage_minorder())
			static_branch_enable(&_debug_guardpage_enabled);
	}
#endif

	/*
	 * Any page debugging or hardening option also enables sanity checking
	 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
	 * enabled already.
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
		static_branch_enable(&check_pages_enabled);
}
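
/*
 * For example, booting with "page_poisoning=on init_on_free=1" ends up
 * with poisoning enabled and init_on_free disabled: the precedence logic
 * above turns off the auto-init requests once poisoning is requested.
 */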

/* Report memory auto-initialization states for this boot. */
static void __init report_meminit(void)
{
	const char *stack;

	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
		stack = "all(pattern)";
	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
		stack = "all(zero)";
	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
		stack = "byref_all(zero)";
	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
		stack = "byref(zero)";
	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
		stack = "__user(zero)";
	else
		stack = "off";

	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
		stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
		want_init_on_free() ? "on" : "off");
	if (want_init_on_free())
		pr_info("mem auto-init: clearing system memory may take some time...\n");
}

static void __init mem_init_print_info(void)
{
	unsigned long physpages, codesize, datasize, rosize, bss_size;
	unsigned long init_code_size, init_data_size;

	physpages = get_num_physpages();
	codesize = _etext - _stext;
	datasize = _edata - _sdata;
	rosize = __end_rodata - __start_rodata;
	bss_size = __bss_stop - __bss_start;
	init_data_size = __init_end - __init_begin;
	init_code_size = _einittext - _sinittext;

	/*
	 * Detect special cases and adjust section sizes accordingly:
	 * 1) .init.* may be embedded into .data sections
	 * 2) .init.text.* may be out of [__init_begin, __init_end],
	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
	 * 3) .rodata.* may be embedded into .text or .data sections.
	 */
#define adj_init_size(start, end, size, pos, adj) \
	do { \
		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
			size -= adj; \
	} while (0)

	adj_init_size(__init_begin, __init_end, init_data_size,
		     _sinittext, init_code_size);
	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);

#undef	adj_init_size

	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef	CONFIG_HIGHMEM
		", %luK highmem"
#endif
		")\n",
		K(nr_free_pages()), K(physpages),
		codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
		(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
		K(physpages - totalram_pages() - totalcma_pages),
		K(totalcma_pages)
#ifdef	CONFIG_HIGHMEM
		, K(totalhigh_pages())
#endif
	       );
}

/*
 * Set up kernel memory allocators
 */
void __init mm_core_init(void)
{
	/* Initializations relying on SMP setup */
	build_all_zonelists(NULL);
	page_alloc_init_cpuhp();

	/*
	 * page_ext requires contiguous pages, bigger than MAX_ORDER,
	 * unless SPARSEMEM is in use.
	 */
	page_ext_init_flatmem();
	mem_debugging_and_hardening_init();
	kfence_alloc_pool_and_metadata();
	report_meminit();
	kmsan_init_shadow();
	stack_depot_early_init();
	mem_init();
	mem_init_print_info();
	kmem_cache_init();
	/*
	 * page_owner must be initialized after buddy is ready, and also after
	 * slab is ready so that stack_depot_init() works properly
	 */
	page_ext_init_flatmem_late();
	kmemleak_init();
	ptlock_cache_init();
	pgtable_cache_init();
	debug_objects_mem_init();
	vmalloc_init();
	/* If page_ext init was not deferred, do it now that vmap is fully up */
	if (!deferred_struct_pages)
		page_ext_init();
	/* Should be run before the first non-init thread is created */
	init_espfix_bsp();
	/* Should be run after espfix64 is set up. */
	pti_init();
	kmsan_init_runtime();
	mm_cache_init();
}

