// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

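/*
 * Handler for the "numa=" early boot parameter. Recognized options:
 *   numa=off          - disable NUMA handling altogether
 *   numa=fake=<opts>  - hand <opts> to the NUMA emulation code
 *   numa=noacpi       - ignore the ACPI SRAT table
 *   numa=nohmat       - ignore the ACPI HMAT table
 */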
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

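/* Map a CPU to its node via the CPU's local APIC ID, if it is known. */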
int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

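/*
 * Record the cpu -> node mapping. Before the per-CPU areas exist this
 * updates the early boot-time map; afterwards it writes the real per-CPU
 * variable and the generic NUMA bookkeeping via set_cpu_numa_node().
 */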
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

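/*
 * Validate a memory block and append it to @mi. Zero-length blocks are
 * silently ignored, malformed ones are dropped with a warning, and running
 * out of NR_NODE_MEMBLKS slots is a hard error.
 */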
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data. Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

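/*
 * Allocate the NUMA distance table, sized by the highest node ID parsed so
 * far, and fill it with the LOCAL_DISTANCE/REMOTE_DISTANCE defaults. On
 * allocation failure numa_distance is poisoned so that subsequent
 * numa_set_distance() calls are ignored until numa_reset_distance().
 */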
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
					 PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

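/*
 * Return the distance between two nodes. Nodes outside the allocated
 * distance table get the LOCAL_DISTANCE/REMOTE_DISTANCE defaults.
 */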
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

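/*
 * Push the parsed NUMA layout into the core mm: tag memblock.memory with
 * node IDs, sanity-check pfn -> nid granularity and memory coverage, and
 * allocate NODE_DATA() for every node that ends up with memory.
 */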
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];
		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. to load
	 * the kernel image. We cannot prevent this anyway, so any node the
	 * kernel resides in must not be hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for the pfn -> nid
	 * mapping, check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs, as
 * the number of CPUs is not known yet. We round-robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

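/*
 * Generic NUMA init sequence: reset all NUMA state, let the platform
 * specific @init_func fill numa_nodes_parsed, numa_meminfo and the distance
 * table, then clean the result up, let NUMA emulation rewrite it if
 * configured, and register the final layout.
 */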
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * Reset memblock back to the top-down allocation direction here,
	 * because if ACPI_NUMA is configured we have parsed SRAT in
	 * init_func(). It is fine to do the reset here even if we didn't
	 * configure ACPI_NUMA, or if the ACPI NUMA init failed and fell
	 * back to the dummy NUMA init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is the dummy single-node config encompassing the whole of
 * memory; it never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and
 * no memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing memory and/or CPUs will
 * already be online and there is no need to do anything extra, even if they
 * also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the fake-node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round-robin manner in numa_init_array(),
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
	return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif	/* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
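/* Find the node that covers physical address @start in @mi, if any. */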
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

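/*
 * sort() comparator for numa_fill_memblks(): order memblk pointers by
 * ascending start address, written to avoid overflow in the comparison.
 */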
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#endif