1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * NUMA emulation |
4 | */ |
5 | #include <linux/kernel.h> |
6 | #include <linux/errno.h> |
7 | #include <linux/topology.h> |
8 | #include <linux/memblock.h> |
9 | #include <asm/dma.h> |
10 | |
11 | #include "numa_internal.h" |
12 | |
13 | static int emu_nid_to_phys[MAX_NUMNODES]; |
14 | static char *emu_cmdline __initdata; |
15 | |
16 | int __init numa_emu_cmdline(char *str) |
17 | { |
18 | emu_cmdline = str; |
19 | return 0; |
20 | } |
21 | |
22 | static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) |
23 | { |
24 | int i; |
25 | |
26 | for (i = 0; i < mi->nr_blks; i++) |
27 | if (mi->blk[i].nid == nid) |
28 | return i; |
29 | return -ENOENT; |
30 | } |
31 | |
32 | static u64 __init mem_hole_size(u64 start, u64 end) |
33 | { |
34 | unsigned long start_pfn = PFN_UP(start); |
35 | unsigned long end_pfn = PFN_DOWN(end); |
36 | |
37 | if (start_pfn < end_pfn) |
38 | return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); |
39 | return 0; |
40 | } |
41 | |
42 | /* |
43 | * Sets up nid to range from @start to @end. The return value is -errno if |
44 | * something went wrong, 0 otherwise. |
45 | */ |
46 | static int __init emu_setup_memblk(struct numa_meminfo *ei, |
47 | struct numa_meminfo *pi, |
48 | int nid, int phys_blk, u64 size) |
49 | { |
50 | struct numa_memblk *eb = &ei->blk[ei->nr_blks]; |
51 | struct numa_memblk *pb = &pi->blk[phys_blk]; |
52 | |
53 | if (ei->nr_blks >= NR_NODE_MEMBLKS) { |
54 | pr_err("NUMA: Too many emulated memblks, failing emulation\n" ); |
55 | return -EINVAL; |
56 | } |
57 | |
58 | ei->nr_blks++; |
59 | eb->start = pb->start; |
60 | eb->end = pb->start + size; |
61 | eb->nid = nid; |
62 | |
63 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) |
64 | emu_nid_to_phys[nid] = pb->nid; |
65 | |
66 | pb->start += size; |
67 | if (pb->start >= pb->end) { |
68 | WARN_ON_ONCE(pb->start > pb->end); |
69 | numa_remove_memblk_from(idx: phys_blk, mi: pi); |
70 | } |
71 | |
72 | printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n" , |
73 | nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); |
74 | return 0; |
75 | } |
76 | |
77 | /* |
78 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr |
79 | * to max_addr. |
80 | * |
81 | * Returns zero on success or negative on error. |
82 | */ |
83 | static int __init split_nodes_interleave(struct numa_meminfo *ei, |
84 | struct numa_meminfo *pi, |
85 | u64 addr, u64 max_addr, int nr_nodes) |
86 | { |
87 | nodemask_t physnode_mask = numa_nodes_parsed; |
88 | u64 size; |
89 | int big; |
90 | int nid = 0; |
91 | int i, ret; |
92 | |
93 | if (nr_nodes <= 0) |
94 | return -1; |
95 | if (nr_nodes > MAX_NUMNODES) { |
96 | pr_info("numa=fake=%d too large, reducing to %d\n" , |
97 | nr_nodes, MAX_NUMNODES); |
98 | nr_nodes = MAX_NUMNODES; |
99 | } |
100 | |
101 | /* |
102 | * Calculate target node size. x86_32 freaks on __udivdi3() so do |
103 | * the division in ulong number of pages and convert back. |
104 | */ |
105 | size = max_addr - addr - mem_hole_size(start: addr, end: max_addr); |
106 | size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); |
107 | |
108 | /* |
109 | * Calculate the number of big nodes that can be allocated as a result |
110 | * of consolidating the remainder. |
111 | */ |
112 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / |
113 | FAKE_NODE_MIN_SIZE; |
114 | |
115 | size &= FAKE_NODE_MIN_HASH_MASK; |
116 | if (!size) { |
117 | pr_err("Not enough memory for each node. " |
118 | "NUMA emulation disabled.\n" ); |
119 | return -1; |
120 | } |
121 | |
122 | /* |
123 | * Continue to fill physical nodes with fake nodes until there is no |
124 | * memory left on any of them. |
125 | */ |
126 | while (!nodes_empty(physnode_mask)) { |
127 | for_each_node_mask(i, physnode_mask) { |
128 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); |
129 | u64 start, limit, end; |
130 | int phys_blk; |
131 | |
132 | phys_blk = emu_find_memblk_by_nid(nid: i, mi: pi); |
133 | if (phys_blk < 0) { |
134 | node_clear(i, physnode_mask); |
135 | continue; |
136 | } |
137 | start = pi->blk[phys_blk].start; |
138 | limit = pi->blk[phys_blk].end; |
139 | end = start + size; |
140 | |
141 | if (nid < big) |
142 | end += FAKE_NODE_MIN_SIZE; |
143 | |
144 | /* |
145 | * Continue to add memory to this fake node if its |
146 | * non-reserved memory is less than the per-node size. |
147 | */ |
148 | while (end - start - mem_hole_size(start, end) < size) { |
149 | end += FAKE_NODE_MIN_SIZE; |
150 | if (end > limit) { |
151 | end = limit; |
152 | break; |
153 | } |
154 | } |
155 | |
156 | /* |
157 | * If there won't be at least FAKE_NODE_MIN_SIZE of |
158 | * non-reserved memory in ZONE_DMA32 for the next node, |
159 | * this one must extend to the boundary. |
160 | */ |
161 | if (end < dma32_end && dma32_end - end - |
162 | mem_hole_size(start: end, end: dma32_end) < FAKE_NODE_MIN_SIZE) |
163 | end = dma32_end; |
164 | |
165 | /* |
166 | * If there won't be enough non-reserved memory for the |
167 | * next node, this one must extend to the end of the |
168 | * physical node. |
169 | */ |
170 | if (limit - end - mem_hole_size(start: end, end: limit) < size) |
171 | end = limit; |
172 | |
173 | ret = emu_setup_memblk(ei, pi, nid: nid++ % nr_nodes, |
174 | phys_blk, |
175 | min(end, limit) - start); |
176 | if (ret < 0) |
177 | return ret; |
178 | } |
179 | } |
180 | return 0; |
181 | } |
182 | |
183 | /* |
184 | * Returns the end address of a node so that there is at least `size' amount of |
185 | * non-reserved memory or `max_addr' is reached. |
186 | */ |
187 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) |
188 | { |
189 | u64 end = start + size; |
190 | |
191 | while (end - start - mem_hole_size(start, end) < size) { |
192 | end += FAKE_NODE_MIN_SIZE; |
193 | if (end > max_addr) { |
194 | end = max_addr; |
195 | break; |
196 | } |
197 | } |
198 | return end; |
199 | } |
200 | |
201 | static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) |
202 | { |
203 | unsigned long max_pfn = PHYS_PFN(max_addr); |
204 | unsigned long base_pfn = PHYS_PFN(base); |
205 | unsigned long hole_pfns = PHYS_PFN(hole); |
206 | |
207 | return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); |
208 | } |
209 | |
210 | /* |
211 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from |
212 | * `addr' to `max_addr'. |
213 | * |
214 | * Returns zero on success or negative on error. |
215 | */ |
216 | static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, |
217 | struct numa_meminfo *pi, |
218 | u64 addr, u64 max_addr, u64 size, |
219 | int nr_nodes, struct numa_memblk *pblk, |
220 | int nid) |
221 | { |
222 | nodemask_t physnode_mask = numa_nodes_parsed; |
223 | int i, ret, uniform = 0; |
224 | u64 min_size; |
225 | |
226 | if ((!size && !nr_nodes) || (nr_nodes && !pblk)) |
227 | return -1; |
228 | |
229 | /* |
230 | * In the 'uniform' case split the passed in physical node by |
231 | * nr_nodes, in the non-uniform case, ignore the passed in |
232 | * physical block and try to create nodes of at least size |
233 | * @size. |
234 | * |
235 | * In the uniform case, split the nodes strictly by physical |
236 | * capacity, i.e. ignore holes. In the non-uniform case account |
237 | * for holes and treat @size as a minimum floor. |
238 | */ |
239 | if (!nr_nodes) |
240 | nr_nodes = MAX_NUMNODES; |
241 | else { |
242 | nodes_clear(physnode_mask); |
243 | node_set(pblk->nid, physnode_mask); |
244 | uniform = 1; |
245 | } |
246 | |
247 | if (uniform) { |
248 | min_size = uniform_size(max_addr, base: addr, hole: 0, nr_nodes); |
249 | size = min_size; |
250 | } else { |
251 | /* |
252 | * The limit on emulated nodes is MAX_NUMNODES, so the |
253 | * size per node is increased accordingly if the |
254 | * requested size is too small. This creates a uniform |
255 | * distribution of node sizes across the entire machine |
256 | * (but not necessarily over physical nodes). |
257 | */ |
258 | min_size = uniform_size(max_addr, base: addr, |
259 | hole: mem_hole_size(start: addr, end: max_addr), nr_nodes); |
260 | } |
261 | min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); |
262 | if (size < min_size) { |
263 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n" , |
264 | size >> 20, min_size >> 20); |
265 | size = min_size; |
266 | } |
267 | size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); |
268 | |
269 | /* |
270 | * Fill physical nodes with fake nodes of size until there is no memory |
271 | * left on any of them. |
272 | */ |
273 | while (!nodes_empty(physnode_mask)) { |
274 | for_each_node_mask(i, physnode_mask) { |
275 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); |
276 | u64 start, limit, end; |
277 | int phys_blk; |
278 | |
279 | phys_blk = emu_find_memblk_by_nid(nid: i, mi: pi); |
280 | if (phys_blk < 0) { |
281 | node_clear(i, physnode_mask); |
282 | continue; |
283 | } |
284 | |
285 | start = pi->blk[phys_blk].start; |
286 | limit = pi->blk[phys_blk].end; |
287 | |
288 | if (uniform) |
289 | end = start + size; |
290 | else |
291 | end = find_end_of_node(start, max_addr: limit, size); |
292 | /* |
293 | * If there won't be at least FAKE_NODE_MIN_SIZE of |
294 | * non-reserved memory in ZONE_DMA32 for the next node, |
295 | * this one must extend to the boundary. |
296 | */ |
297 | if (end < dma32_end && dma32_end - end - |
298 | mem_hole_size(start: end, end: dma32_end) < FAKE_NODE_MIN_SIZE) |
299 | end = dma32_end; |
300 | |
301 | /* |
302 | * If there won't be enough non-reserved memory for the |
303 | * next node, this one must extend to the end of the |
304 | * physical node. |
305 | */ |
306 | if ((limit - end - mem_hole_size(start: end, end: limit) < size) |
307 | && !uniform) |
308 | end = limit; |
309 | |
310 | ret = emu_setup_memblk(ei, pi, nid: nid++ % MAX_NUMNODES, |
311 | phys_blk, |
312 | min(end, limit) - start); |
313 | if (ret < 0) |
314 | return ret; |
315 | } |
316 | } |
317 | return nid; |
318 | } |
319 | |
320 | static int __init split_nodes_size_interleave(struct numa_meminfo *ei, |
321 | struct numa_meminfo *pi, |
322 | u64 addr, u64 max_addr, u64 size) |
323 | { |
324 | return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, |
325 | nr_nodes: 0, NULL, nid: 0); |
326 | } |
327 | |
328 | static int __init setup_emu2phys_nid(int *dfl_phys_nid) |
329 | { |
330 | int i, max_emu_nid = 0; |
331 | |
332 | *dfl_phys_nid = NUMA_NO_NODE; |
333 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { |
334 | if (emu_nid_to_phys[i] != NUMA_NO_NODE) { |
335 | max_emu_nid = i; |
336 | if (*dfl_phys_nid == NUMA_NO_NODE) |
337 | *dfl_phys_nid = emu_nid_to_phys[i]; |
338 | } |
339 | } |
340 | |
341 | return max_emu_nid; |
342 | } |
343 | |
344 | /** |
345 | * numa_emulation - Emulate NUMA nodes |
346 | * @numa_meminfo: NUMA configuration to massage |
347 | * @numa_dist_cnt: The size of the physical NUMA distance table |
348 | * |
349 | * Emulate NUMA nodes according to the numa=fake kernel parameter. |
350 | * @numa_meminfo contains the physical memory configuration and is modified |
351 | * to reflect the emulated configuration on success. @numa_dist_cnt is |
352 | * used to determine the size of the physical distance table. |
353 | * |
354 | * On success, the following modifications are made. |
355 | * |
356 | * - @numa_meminfo is updated to reflect the emulated nodes. |
357 | * |
358 | * - __apicid_to_node[] is updated such that APIC IDs are mapped to the |
359 | * emulated nodes. |
360 | * |
361 | * - NUMA distance table is rebuilt to represent distances between emulated |
362 | * nodes. The distances are determined considering how emulated nodes |
363 | * are mapped to physical nodes and match the actual distances. |
364 | * |
365 | * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical |
366 | * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). |
367 | * |
368 | * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with |
369 | * identity mapping and no other modification is made. |
370 | */ |
371 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) |
372 | { |
373 | static struct numa_meminfo ei __initdata; |
374 | static struct numa_meminfo pi __initdata; |
375 | const u64 max_addr = PFN_PHYS(max_pfn); |
376 | u8 *phys_dist = NULL; |
377 | size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); |
378 | int max_emu_nid, dfl_phys_nid; |
379 | int i, j, ret; |
380 | |
381 | if (!emu_cmdline) |
382 | goto no_emu; |
383 | |
384 | memset(&ei, 0, sizeof(ei)); |
385 | pi = *numa_meminfo; |
386 | |
387 | for (i = 0; i < MAX_NUMNODES; i++) |
388 | emu_nid_to_phys[i] = NUMA_NO_NODE; |
389 | |
390 | /* |
391 | * If the numa=fake command-line contains a 'M' or 'G', it represents |
392 | * the fixed node size. Otherwise, if it is just a single number N, |
393 | * split the system RAM into N fake nodes. |
394 | */ |
395 | if (strchr(emu_cmdline, 'U')) { |
396 | nodemask_t physnode_mask = numa_nodes_parsed; |
397 | unsigned long n; |
398 | int nid = 0; |
399 | |
400 | n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
401 | ret = -1; |
402 | for_each_node_mask(i, physnode_mask) { |
403 | /* |
404 | * The reason we pass in blk[0] is due to |
405 | * numa_remove_memblk_from() called by |
406 | * emu_setup_memblk() will delete entry 0 |
407 | * and then move everything else up in the pi.blk |
408 | * array. Therefore we should always be looking |
409 | * at blk[0]. |
410 | */ |
411 | ret = split_nodes_size_interleave_uniform(ei: &ei, pi: &pi, |
412 | addr: pi.blk[0].start, max_addr: pi.blk[0].end, size: 0, |
413 | nr_nodes: n, pblk: &pi.blk[0], nid); |
414 | if (ret < 0) |
415 | break; |
416 | if (ret < n) { |
417 | pr_info("%s: phys: %d only got %d of %ld nodes, failing\n" , |
418 | __func__, i, ret, n); |
419 | ret = -1; |
420 | break; |
421 | } |
422 | nid = ret; |
423 | } |
424 | } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { |
425 | u64 size; |
426 | |
427 | size = memparse(ptr: emu_cmdline, retptr: &emu_cmdline); |
428 | ret = split_nodes_size_interleave(ei: &ei, pi: &pi, addr: 0, max_addr, size); |
429 | } else { |
430 | unsigned long n; |
431 | |
432 | n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
433 | ret = split_nodes_interleave(ei: &ei, pi: &pi, addr: 0, max_addr, nr_nodes: n); |
434 | } |
435 | if (*emu_cmdline == ':') |
436 | emu_cmdline++; |
437 | |
438 | if (ret < 0) |
439 | goto no_emu; |
440 | |
441 | if (numa_cleanup_meminfo(mi: &ei) < 0) { |
442 | pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n" ); |
443 | goto no_emu; |
444 | } |
445 | |
446 | /* copy the physical distance table */ |
447 | if (numa_dist_cnt) { |
448 | u64 phys; |
449 | |
450 | phys = memblock_phys_alloc_range(size: phys_size, PAGE_SIZE, start: 0, |
451 | PFN_PHYS(max_pfn_mapped)); |
452 | if (!phys) { |
453 | pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n" ); |
454 | goto no_emu; |
455 | } |
456 | phys_dist = __va(phys); |
457 | |
458 | for (i = 0; i < numa_dist_cnt; i++) |
459 | for (j = 0; j < numa_dist_cnt; j++) |
460 | phys_dist[i * numa_dist_cnt + j] = |
461 | node_distance(i, j); |
462 | } |
463 | |
464 | /* |
465 | * Determine the max emulated nid and the default phys nid to use |
466 | * for unmapped nodes. |
467 | */ |
468 | max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); |
469 | |
470 | /* commit */ |
471 | *numa_meminfo = ei; |
472 | |
473 | /* Make sure numa_nodes_parsed only contains emulated nodes */ |
474 | nodes_clear(numa_nodes_parsed); |
475 | for (i = 0; i < ARRAY_SIZE(ei.blk); i++) |
476 | if (ei.blk[i].start != ei.blk[i].end && |
477 | ei.blk[i].nid != NUMA_NO_NODE) |
478 | node_set(ei.blk[i].nid, numa_nodes_parsed); |
479 | |
480 | /* |
481 | * Transform __apicid_to_node table to use emulated nids by |
482 | * reverse-mapping phys_nid. The maps should always exist but fall |
483 | * back to zero just in case. |
484 | */ |
485 | for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { |
486 | if (__apicid_to_node[i] == NUMA_NO_NODE) |
487 | continue; |
488 | for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) |
489 | if (__apicid_to_node[i] == emu_nid_to_phys[j]) |
490 | break; |
491 | __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; |
492 | } |
493 | |
494 | /* make sure all emulated nodes are mapped to a physical node */ |
495 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) |
496 | if (emu_nid_to_phys[i] == NUMA_NO_NODE) |
497 | emu_nid_to_phys[i] = dfl_phys_nid; |
498 | |
499 | /* transform distance table */ |
500 | numa_reset_distance(); |
501 | for (i = 0; i < max_emu_nid + 1; i++) { |
502 | for (j = 0; j < max_emu_nid + 1; j++) { |
503 | int physi = emu_nid_to_phys[i]; |
504 | int physj = emu_nid_to_phys[j]; |
505 | int dist; |
506 | |
507 | if (get_option(str: &emu_cmdline, pint: &dist) == 2) |
508 | ; |
509 | else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) |
510 | dist = physi == physj ? |
511 | LOCAL_DISTANCE : REMOTE_DISTANCE; |
512 | else |
513 | dist = phys_dist[physi * numa_dist_cnt + physj]; |
514 | |
515 | numa_set_distance(from: i, to: j, distance: dist); |
516 | } |
517 | } |
518 | |
519 | /* free the copied physical distance table */ |
520 | memblock_free(ptr: phys_dist, size: phys_size); |
521 | return; |
522 | |
523 | no_emu: |
524 | /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ |
525 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) |
526 | emu_nid_to_phys[i] = i; |
527 | } |
528 | |
529 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS |
530 | void numa_add_cpu(int cpu) |
531 | { |
532 | int physnid, nid; |
533 | |
534 | nid = early_cpu_to_node(cpu); |
535 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); |
536 | |
537 | physnid = emu_nid_to_phys[nid]; |
538 | |
539 | /* |
540 | * Map the cpu to each emulated node that is allocated on the physical |
541 | * node of the cpu's apic id. |
542 | */ |
543 | for_each_online_node(nid) |
544 | if (emu_nid_to_phys[nid] == physnid) |
545 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); |
546 | } |
547 | |
548 | void numa_remove_cpu(int cpu) |
549 | { |
550 | int i; |
551 | |
552 | for_each_online_node(i) |
553 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); |
554 | } |
555 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ |
556 | static void numa_set_cpumask(int cpu, bool enable) |
557 | { |
558 | int nid, physnid; |
559 | |
560 | nid = early_cpu_to_node(cpu); |
561 | if (nid == NUMA_NO_NODE) { |
562 | /* early_cpu_to_node() already emits a warning and trace */ |
563 | return; |
564 | } |
565 | |
566 | physnid = emu_nid_to_phys[nid]; |
567 | |
568 | for_each_online_node(nid) { |
569 | if (emu_nid_to_phys[nid] != physnid) |
570 | continue; |
571 | |
572 | debug_cpumask_set_cpu(cpu, node: nid, enable); |
573 | } |
574 | } |
575 | |
576 | void numa_add_cpu(int cpu) |
577 | { |
578 | numa_set_cpumask(cpu, enable: true); |
579 | } |
580 | |
581 | void numa_remove_cpu(int cpu) |
582 | { |
583 | numa_set_cpumask(cpu, enable: false); |
584 | } |
585 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ |
586 | |