// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

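/*
 * Handler for the "numa=" early boot parameter. Recognized options:
 *   numa=off          - disable NUMA handling altogether
 *   numa=fake=<opts>  - hand <opts> to the NUMA emulation code
 *   numa=noacpi       - ignore the ACPI SRAT table
 *   numa=nohmat       - ignore the ACPI HMAT table
 */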
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

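/* Map a CPU to its node via the CPU's local APIC ID, if it is known. */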
int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

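/*
 * Record the cpu -> node mapping. Before the per-CPU areas exist this
 * updates the early boot-time map; afterwards it writes the real per-CPU
 * variable and the generic NUMA bookkeeping via set_cpu_numa_node().
 */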
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

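/*
 * Validate a memory block and append it to @mi. Zero-length blocks are
 * silently ignored, malformed ones are dropped with a warning, and running
 * out of NR_NODE_MEMBLKS slots is a hard error.
 */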
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data. Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

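/*
 * Allocate the NUMA distance table, sized by the highest node ID parsed so
 * far, and fill it with the LOCAL_DISTANCE/REMOTE_DISTANCE defaults. On
 * allocation failure numa_distance is poisoned so that subsequent
 * numa_set_distance() calls are ignored until numa_reset_distance().
 */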
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
					 PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

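/*
 * Return the distance between two nodes. Nodes outside the allocated
 * distance table get the LOCAL_DISTANCE/REMOTE_DISTANCE defaults.
 */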
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

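/*
 * Push the parsed NUMA layout into the core mm: tag memblock.memory with
 * node IDs, sanity-check pfn -> nid granularity and memory coverage, and
 * allocate NODE_DATA() for every node that ends up with memory.
 */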
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];
		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. to load
	 * the kernel image. We cannot prevent this anyway, so any node the
	 * kernel resides in must not be hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for the pfn -> nid
	 * mapping, check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs, as
 * the number of CPUs is not known yet. We round-robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

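/*
 * Generic NUMA init sequence: reset all NUMA state, let the platform
 * specific @init_func fill numa_nodes_parsed, numa_meminfo and the distance
 * table, then clean the result up, let NUMA emulation rewrite it if
 * configured, and register the final layout.
 */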
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * Reset memblock back to the top-down allocation direction here,
	 * because if ACPI_NUMA is configured we have parsed SRAT in
	 * init_func(). It is fine to do the reset here even if we didn't
	 * configure ACPI_NUMA, or if the ACPI NUMA init failed and fell
	 * back to the dummy NUMA init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is the dummy single-node config encompassing the whole of
 * memory; it never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and
 * no memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing memory and/or CPUs will
 * already be online and there is no need to do anything extra, even if they
 * also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the fake-node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round-robin manner in numa_init_array(),
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
	return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif	/* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
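/* Find the node that covers physical address @start in @mi, if any. */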
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

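/*
 * sort() comparator for numa_fill_memblks(): order memblk pointers by
 * ascending start address, written to avoid overflow in the comparison.
 */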
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#endif