1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * pSeries NUMA support |
4 | * |
5 | * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM |
6 | */ |
7 | #define pr_fmt(fmt) "numa: " fmt |
8 | |
9 | #include <linux/threads.h> |
10 | #include <linux/memblock.h> |
11 | #include <linux/init.h> |
12 | #include <linux/mm.h> |
13 | #include <linux/mmzone.h> |
14 | #include <linux/export.h> |
15 | #include <linux/nodemask.h> |
16 | #include <linux/cpu.h> |
17 | #include <linux/notifier.h> |
18 | #include <linux/of.h> |
19 | #include <linux/of_address.h> |
20 | #include <linux/pfn.h> |
21 | #include <linux/cpuset.h> |
22 | #include <linux/node.h> |
23 | #include <linux/stop_machine.h> |
24 | #include <linux/proc_fs.h> |
25 | #include <linux/seq_file.h> |
26 | #include <linux/uaccess.h> |
27 | #include <linux/slab.h> |
28 | #include <asm/cputhreads.h> |
29 | #include <asm/sparsemem.h> |
30 | #include <asm/smp.h> |
31 | #include <asm/topology.h> |
32 | #include <asm/firmware.h> |
33 | #include <asm/paca.h> |
34 | #include <asm/hvcall.h> |
35 | #include <asm/setup.h> |
36 | #include <asm/vdso.h> |
37 | #include <asm/vphn.h> |
38 | #include <asm/drmem.h> |
39 | |
40 | static int numa_enabled = 1; |
41 | |
42 | static char *cmdline __initdata; |
43 | |
44 | int numa_cpu_lookup_table[NR_CPUS]; |
45 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; |
46 | struct pglist_data *node_data[MAX_NUMNODES]; |
47 | |
48 | EXPORT_SYMBOL(numa_cpu_lookup_table); |
49 | EXPORT_SYMBOL(node_to_cpumask_map); |
50 | EXPORT_SYMBOL(node_data); |
51 | |
52 | static int primary_domain_index; |
53 | static int n_mem_addr_cells, n_mem_size_cells; |
54 | |
55 | #define FORM0_AFFINITY 0 |
56 | #define FORM1_AFFINITY 1 |
57 | #define FORM2_AFFINITY 2 |
58 | static int affinity_form; |
59 | |
60 | #define MAX_DISTANCE_REF_POINTS 4 |
61 | static int distance_ref_points_depth; |
62 | static const __be32 *distance_ref_points; |
63 | static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; |
64 | static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { |
65 | [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } |
66 | }; |
67 | static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; |
68 | |
69 | /* |
70 | * Allocate node_to_cpumask_map based on number of available nodes |
71 | * Requires node_possible_map to be valid. |
72 | * |
73 | * Note: cpumask_of_node() is not valid until after this is done. |
74 | */ |
75 | static void __init setup_node_to_cpumask_map(void) |
76 | { |
77 | unsigned int node; |
78 | |
79 | /* setup nr_node_ids if not done yet */ |
80 | if (nr_node_ids == MAX_NUMNODES) |
81 | setup_nr_node_ids(); |
82 | |
83 | /* allocate the map */ |
84 | for_each_node(node) |
85 | alloc_bootmem_cpumask_var(mask: &node_to_cpumask_map[node]); |
86 | |
87 | /* cpumask_of_node() will now work */ |
88 | pr_debug("Node to cpumask map for %u nodes\n" , nr_node_ids); |
89 | } |
90 | |
91 | static int __init fake_numa_create_new_node(unsigned long end_pfn, |
92 | unsigned int *nid) |
93 | { |
94 | unsigned long long mem; |
95 | char *p = cmdline; |
96 | static unsigned int fake_nid; |
97 | static unsigned long long curr_boundary; |
98 | |
99 | /* |
100 | * Modify node id, iff we started creating NUMA nodes |
101 | * We want to continue from where we left of the last time |
102 | */ |
103 | if (fake_nid) |
104 | *nid = fake_nid; |
105 | /* |
106 | * In case there are no more arguments to parse, the |
107 | * node_id should be the same as the last fake node id |
108 | * (we've handled this above). |
109 | */ |
110 | if (!p) |
111 | return 0; |
112 | |
113 | mem = memparse(ptr: p, retptr: &p); |
114 | if (!mem) |
115 | return 0; |
116 | |
117 | if (mem < curr_boundary) |
118 | return 0; |
119 | |
120 | curr_boundary = mem; |
121 | |
122 | if ((end_pfn << PAGE_SHIFT) > mem) { |
123 | /* |
124 | * Skip commas and spaces |
125 | */ |
126 | while (*p == ',' || *p == ' ' || *p == '\t') |
127 | p++; |
128 | |
129 | cmdline = p; |
130 | fake_nid++; |
131 | *nid = fake_nid; |
132 | pr_debug("created new fake_node with id %d\n" , fake_nid); |
133 | return 1; |
134 | } |
135 | return 0; |
136 | } |
137 | |
138 | static void __init reset_numa_cpu_lookup_table(void) |
139 | { |
140 | unsigned int cpu; |
141 | |
142 | for_each_possible_cpu(cpu) |
143 | numa_cpu_lookup_table[cpu] = -1; |
144 | } |
145 | |
146 | void map_cpu_to_node(int cpu, int node) |
147 | { |
148 | update_numa_cpu_lookup_table(cpu, node); |
149 | |
150 | if (!(cpumask_test_cpu(cpu, cpumask: node_to_cpumask_map[node]))) { |
151 | pr_debug("adding cpu %d to node %d\n" , cpu, node); |
152 | cpumask_set_cpu(cpu, dstp: node_to_cpumask_map[node]); |
153 | } |
154 | } |
155 | |
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
/*
 * Remove @cpu from the cpumask of the node recorded for it in
 * numa_cpu_lookup_table. Warns when the CPU is not a member of that
 * node's mask, or when no node is recorded for it.
 */
void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	/*
	 * The lookup table holds -1 for CPUs that were never mapped; a
	 * negative value must not be used to index node_to_cpumask_map.
	 */
	if (node >= 0 && cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
		pr_debug("removing cpu %lu from node %d\n", cpu, node);
	} else {
		pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
169 | |
170 | static int __associativity_to_nid(const __be32 *associativity, |
171 | int max_array_sz) |
172 | { |
173 | int nid; |
174 | /* |
175 | * primary_domain_index is 1 based array index. |
176 | */ |
177 | int index = primary_domain_index - 1; |
178 | |
179 | if (!numa_enabled || index >= max_array_sz) |
180 | return NUMA_NO_NODE; |
181 | |
182 | nid = of_read_number(cell: &associativity[index], size: 1); |
183 | |
184 | /* POWER4 LPAR uses 0xffff as invalid node */ |
185 | if (nid == 0xffff || nid >= nr_node_ids) |
186 | nid = NUMA_NO_NODE; |
187 | return nid; |
188 | } |
189 | /* |
190 | * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA |
191 | * info is found. |
192 | */ |
193 | static int associativity_to_nid(const __be32 *associativity) |
194 | { |
195 | int array_sz = of_read_number(cell: associativity, size: 1); |
196 | |
197 | /* Skip the first element in the associativity array */ |
198 | return __associativity_to_nid(associativity: (associativity + 1), max_array_sz: array_sz); |
199 | } |
200 | |
201 | static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) |
202 | { |
203 | int dist; |
204 | int node1, node2; |
205 | |
206 | node1 = associativity_to_nid(associativity: cpu1_assoc); |
207 | node2 = associativity_to_nid(associativity: cpu2_assoc); |
208 | |
209 | dist = numa_distance_table[node1][node2]; |
210 | if (dist <= LOCAL_DISTANCE) |
211 | return 0; |
212 | else if (dist <= REMOTE_DISTANCE) |
213 | return 1; |
214 | else |
215 | return 2; |
216 | } |
217 | |
218 | static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) |
219 | { |
220 | int dist = 0; |
221 | |
222 | int i, index; |
223 | |
224 | for (i = 0; i < distance_ref_points_depth; i++) { |
225 | index = be32_to_cpu(distance_ref_points[i]); |
226 | if (cpu1_assoc[index] == cpu2_assoc[index]) |
227 | break; |
228 | dist++; |
229 | } |
230 | |
231 | return dist; |
232 | } |
233 | |
234 | int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) |
235 | { |
236 | /* We should not get called with FORM0 */ |
237 | VM_WARN_ON(affinity_form == FORM0_AFFINITY); |
238 | if (affinity_form == FORM1_AFFINITY) |
239 | return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); |
240 | return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); |
241 | } |
242 | |
243 | /* must hold reference to node during call */ |
244 | static const __be32 *of_get_associativity(struct device_node *dev) |
245 | { |
246 | return of_get_property(node: dev, name: "ibm,associativity" , NULL); |
247 | } |
248 | |
249 | int __node_distance(int a, int b) |
250 | { |
251 | int i; |
252 | int distance = LOCAL_DISTANCE; |
253 | |
254 | if (affinity_form == FORM2_AFFINITY) |
255 | return numa_distance_table[a][b]; |
256 | else if (affinity_form == FORM0_AFFINITY) |
257 | return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); |
258 | |
259 | for (i = 0; i < distance_ref_points_depth; i++) { |
260 | if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) |
261 | break; |
262 | |
263 | /* Double the distance for each NUMA level */ |
264 | distance *= 2; |
265 | } |
266 | |
267 | return distance; |
268 | } |
269 | EXPORT_SYMBOL(__node_distance); |
270 | |
271 | /* Returns the nid associated with the given device tree node, |
272 | * or -1 if not found. |
273 | */ |
274 | static int of_node_to_nid_single(struct device_node *device) |
275 | { |
276 | int nid = NUMA_NO_NODE; |
277 | const __be32 *tmp; |
278 | |
279 | tmp = of_get_associativity(dev: device); |
280 | if (tmp) |
281 | nid = associativity_to_nid(associativity: tmp); |
282 | return nid; |
283 | } |
284 | |
285 | /* Walk the device tree upwards, looking for an associativity id */ |
286 | int of_node_to_nid(struct device_node *device) |
287 | { |
288 | int nid = NUMA_NO_NODE; |
289 | |
290 | of_node_get(node: device); |
291 | while (device) { |
292 | nid = of_node_to_nid_single(device); |
293 | if (nid != -1) |
294 | break; |
295 | |
296 | device = of_get_next_parent(node: device); |
297 | } |
298 | of_node_put(node: device); |
299 | |
300 | return nid; |
301 | } |
302 | EXPORT_SYMBOL(of_node_to_nid); |
303 | |
304 | static void __initialize_form1_numa_distance(const __be32 *associativity, |
305 | int max_array_sz) |
306 | { |
307 | int i, nid; |
308 | |
309 | if (affinity_form != FORM1_AFFINITY) |
310 | return; |
311 | |
312 | nid = __associativity_to_nid(associativity, max_array_sz); |
313 | if (nid != NUMA_NO_NODE) { |
314 | for (i = 0; i < distance_ref_points_depth; i++) { |
315 | const __be32 *entry; |
316 | int index = be32_to_cpu(distance_ref_points[i]) - 1; |
317 | |
318 | /* |
319 | * broken hierarchy, return with broken distance table |
320 | */ |
321 | if (WARN(index >= max_array_sz, "Broken ibm,associativity property" )) |
322 | return; |
323 | |
324 | entry = &associativity[index]; |
325 | distance_lookup_table[nid][i] = of_read_number(cell: entry, size: 1); |
326 | } |
327 | } |
328 | } |
329 | |
330 | static void initialize_form1_numa_distance(const __be32 *associativity) |
331 | { |
332 | int array_sz; |
333 | |
334 | array_sz = of_read_number(cell: associativity, size: 1); |
335 | /* Skip the first element in the associativity array */ |
336 | __initialize_form1_numa_distance(associativity: associativity + 1, max_array_sz: array_sz); |
337 | } |
338 | |
339 | /* |
340 | * Used to update distance information w.r.t newly added node. |
341 | */ |
342 | void update_numa_distance(struct device_node *node) |
343 | { |
344 | int nid; |
345 | |
346 | if (affinity_form == FORM0_AFFINITY) |
347 | return; |
348 | else if (affinity_form == FORM1_AFFINITY) { |
349 | const __be32 *associativity; |
350 | |
351 | associativity = of_get_associativity(dev: node); |
352 | if (!associativity) |
353 | return; |
354 | |
355 | initialize_form1_numa_distance(associativity); |
356 | return; |
357 | } |
358 | |
359 | /* FORM2 affinity */ |
360 | nid = of_node_to_nid_single(device: node); |
361 | if (nid == NUMA_NO_NODE) |
362 | return; |
363 | |
364 | /* |
365 | * With FORM2 we expect NUMA distance of all possible NUMA |
366 | * nodes to be provided during boot. |
367 | */ |
368 | WARN(numa_distance_table[nid][nid] == -1, |
369 | "NUMA distance details for node %d not provided\n" , nid); |
370 | } |
371 | EXPORT_SYMBOL_GPL(update_numa_distance); |
372 | |
373 | /* |
374 | * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} |
375 | * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} |
376 | */ |
377 | static void __init initialize_form2_numa_distance_lookup_table(void) |
378 | { |
379 | int i, j; |
380 | struct device_node *root; |
381 | const __u8 *form2_distances; |
382 | const __be32 *numa_lookup_index; |
383 | int form2_distances_length; |
384 | int max_numa_index, distance_index; |
385 | |
386 | if (firmware_has_feature(FW_FEATURE_OPAL)) |
387 | root = of_find_node_by_path(path: "/ibm,opal" ); |
388 | else |
389 | root = of_find_node_by_path(path: "/rtas" ); |
390 | if (!root) |
391 | root = of_find_node_by_path(path: "/" ); |
392 | |
393 | numa_lookup_index = of_get_property(node: root, name: "ibm,numa-lookup-index-table" , NULL); |
394 | max_numa_index = of_read_number(cell: &numa_lookup_index[0], size: 1); |
395 | |
396 | /* first element of the array is the size and is encode-int */ |
397 | form2_distances = of_get_property(node: root, name: "ibm,numa-distance-table" , NULL); |
398 | form2_distances_length = of_read_number(cell: (const __be32 *)&form2_distances[0], size: 1); |
399 | /* Skip the size which is encoded int */ |
400 | form2_distances += sizeof(__be32); |
401 | |
402 | pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n" , |
403 | form2_distances_length, max_numa_index); |
404 | |
405 | for (i = 0; i < max_numa_index; i++) |
406 | /* +1 skip the max_numa_index in the property */ |
407 | numa_id_index_table[i] = of_read_number(cell: &numa_lookup_index[i + 1], size: 1); |
408 | |
409 | |
410 | if (form2_distances_length != max_numa_index * max_numa_index) { |
411 | WARN(1, "Wrong NUMA distance information\n" ); |
412 | form2_distances = NULL; // don't use it |
413 | } |
414 | distance_index = 0; |
415 | for (i = 0; i < max_numa_index; i++) { |
416 | for (j = 0; j < max_numa_index; j++) { |
417 | int nodeA = numa_id_index_table[i]; |
418 | int nodeB = numa_id_index_table[j]; |
419 | int dist; |
420 | |
421 | if (form2_distances) |
422 | dist = form2_distances[distance_index++]; |
423 | else if (nodeA == nodeB) |
424 | dist = LOCAL_DISTANCE; |
425 | else |
426 | dist = REMOTE_DISTANCE; |
427 | numa_distance_table[nodeA][nodeB] = dist; |
428 | pr_debug("dist[%d][%d]=%d " , nodeA, nodeB, dist); |
429 | } |
430 | } |
431 | |
432 | of_node_put(node: root); |
433 | } |
434 | |
435 | static int __init find_primary_domain_index(void) |
436 | { |
437 | int index; |
438 | struct device_node *root; |
439 | |
440 | /* |
441 | * Check for which form of affinity. |
442 | */ |
443 | if (firmware_has_feature(FW_FEATURE_OPAL)) { |
444 | affinity_form = FORM1_AFFINITY; |
445 | } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { |
446 | pr_debug("Using form 2 affinity\n" ); |
447 | affinity_form = FORM2_AFFINITY; |
448 | } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { |
449 | pr_debug("Using form 1 affinity\n" ); |
450 | affinity_form = FORM1_AFFINITY; |
451 | } else |
452 | affinity_form = FORM0_AFFINITY; |
453 | |
454 | if (firmware_has_feature(FW_FEATURE_OPAL)) |
455 | root = of_find_node_by_path(path: "/ibm,opal" ); |
456 | else |
457 | root = of_find_node_by_path(path: "/rtas" ); |
458 | if (!root) |
459 | root = of_find_node_by_path(path: "/" ); |
460 | |
461 | /* |
462 | * This property is a set of 32-bit integers, each representing |
463 | * an index into the ibm,associativity nodes. |
464 | * |
465 | * With form 0 affinity the first integer is for an SMP configuration |
466 | * (should be all 0's) and the second is for a normal NUMA |
467 | * configuration. We have only one level of NUMA. |
468 | * |
469 | * With form 1 affinity the first integer is the most significant |
470 | * NUMA boundary and the following are progressively less significant |
471 | * boundaries. There can be more than one level of NUMA. |
472 | */ |
473 | distance_ref_points = of_get_property(node: root, |
474 | name: "ibm,associativity-reference-points" , |
475 | lenp: &distance_ref_points_depth); |
476 | |
477 | if (!distance_ref_points) { |
478 | pr_debug("ibm,associativity-reference-points not found.\n" ); |
479 | goto err; |
480 | } |
481 | |
482 | distance_ref_points_depth /= sizeof(int); |
483 | if (affinity_form == FORM0_AFFINITY) { |
484 | if (distance_ref_points_depth < 2) { |
485 | pr_warn("short ibm,associativity-reference-points\n" ); |
486 | goto err; |
487 | } |
488 | |
489 | index = of_read_number(cell: &distance_ref_points[1], size: 1); |
490 | } else { |
491 | /* |
492 | * Both FORM1 and FORM2 affinity find the primary domain details |
493 | * at the same offset. |
494 | */ |
495 | index = of_read_number(cell: distance_ref_points, size: 1); |
496 | } |
497 | /* |
498 | * Warn and cap if the hardware supports more than |
499 | * MAX_DISTANCE_REF_POINTS domains. |
500 | */ |
501 | if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { |
502 | pr_warn("distance array capped at %d entries\n" , |
503 | MAX_DISTANCE_REF_POINTS); |
504 | distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; |
505 | } |
506 | |
507 | of_node_put(node: root); |
508 | return index; |
509 | |
510 | err: |
511 | of_node_put(node: root); |
512 | return -1; |
513 | } |
514 | |
515 | static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) |
516 | { |
517 | struct device_node *memory = NULL; |
518 | |
519 | memory = of_find_node_by_type(from: memory, type: "memory" ); |
520 | if (!memory) |
521 | panic(fmt: "numa.c: No memory nodes found!" ); |
522 | |
523 | *n_addr_cells = of_n_addr_cells(np: memory); |
524 | *n_size_cells = of_n_size_cells(np: memory); |
525 | of_node_put(node: memory); |
526 | } |
527 | |
528 | static unsigned long read_n_cells(int n, const __be32 **buf) |
529 | { |
530 | unsigned long result = 0; |
531 | |
532 | while (n--) { |
533 | result = (result << 32) | of_read_number(cell: *buf, size: 1); |
534 | (*buf)++; |
535 | } |
536 | return result; |
537 | } |
538 | |
539 | struct assoc_arrays { |
540 | u32 n_arrays; |
541 | u32 array_sz; |
542 | const __be32 *arrays; |
543 | }; |
544 | |
545 | /* |
546 | * Retrieve and validate the list of associativity arrays for drconf |
547 | * memory from the ibm,associativity-lookup-arrays property of the |
548 | * device tree.. |
549 | * |
550 | * The layout of the ibm,associativity-lookup-arrays property is a number N |
551 | * indicating the number of associativity arrays, followed by a number M |
552 | * indicating the size of each associativity array, followed by a list |
553 | * of N associativity arrays. |
554 | */ |
555 | static int of_get_assoc_arrays(struct assoc_arrays *aa) |
556 | { |
557 | struct device_node *memory; |
558 | const __be32 *prop; |
559 | u32 len; |
560 | |
561 | memory = of_find_node_by_path(path: "/ibm,dynamic-reconfiguration-memory" ); |
562 | if (!memory) |
563 | return -1; |
564 | |
565 | prop = of_get_property(node: memory, name: "ibm,associativity-lookup-arrays" , lenp: &len); |
566 | if (!prop || len < 2 * sizeof(unsigned int)) { |
567 | of_node_put(node: memory); |
568 | return -1; |
569 | } |
570 | |
571 | aa->n_arrays = of_read_number(cell: prop++, size: 1); |
572 | aa->array_sz = of_read_number(cell: prop++, size: 1); |
573 | |
574 | of_node_put(node: memory); |
575 | |
576 | /* Now that we know the number of arrays and size of each array, |
577 | * revalidate the size of the property read in. |
578 | */ |
579 | if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) |
580 | return -1; |
581 | |
582 | aa->arrays = prop; |
583 | return 0; |
584 | } |
585 | |
586 | static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) |
587 | { |
588 | struct assoc_arrays aa = { .arrays = NULL }; |
589 | int default_nid = NUMA_NO_NODE; |
590 | int nid = default_nid; |
591 | int rc, index; |
592 | |
593 | if ((primary_domain_index < 0) || !numa_enabled) |
594 | return default_nid; |
595 | |
596 | rc = of_get_assoc_arrays(aa: &aa); |
597 | if (rc) |
598 | return default_nid; |
599 | |
600 | if (primary_domain_index <= aa.array_sz && |
601 | !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { |
602 | const __be32 *associativity; |
603 | |
604 | index = lmb->aa_index * aa.array_sz; |
605 | associativity = &aa.arrays[index]; |
606 | nid = __associativity_to_nid(associativity, max_array_sz: aa.array_sz); |
607 | if (nid > 0 && affinity_form == FORM1_AFFINITY) { |
608 | /* |
609 | * lookup array associativity entries have |
610 | * no length of the array as the first element. |
611 | */ |
612 | __initialize_form1_numa_distance(associativity, max_array_sz: aa.array_sz); |
613 | } |
614 | } |
615 | return nid; |
616 | } |
617 | |
618 | /* |
619 | * This is like of_node_to_nid_single() for memory represented in the |
620 | * ibm,dynamic-reconfiguration-memory node. |
621 | */ |
622 | int of_drconf_to_nid_single(struct drmem_lmb *lmb) |
623 | { |
624 | struct assoc_arrays aa = { .arrays = NULL }; |
625 | int default_nid = NUMA_NO_NODE; |
626 | int nid = default_nid; |
627 | int rc, index; |
628 | |
629 | if ((primary_domain_index < 0) || !numa_enabled) |
630 | return default_nid; |
631 | |
632 | rc = of_get_assoc_arrays(aa: &aa); |
633 | if (rc) |
634 | return default_nid; |
635 | |
636 | if (primary_domain_index <= aa.array_sz && |
637 | !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { |
638 | const __be32 *associativity; |
639 | |
640 | index = lmb->aa_index * aa.array_sz; |
641 | associativity = &aa.arrays[index]; |
642 | nid = __associativity_to_nid(associativity, max_array_sz: aa.array_sz); |
643 | } |
644 | return nid; |
645 | } |
646 | |
647 | #ifdef CONFIG_PPC_SPLPAR |
648 | |
649 | static int __vphn_get_associativity(long lcpu, __be32 *associativity) |
650 | { |
651 | long rc, hwid; |
652 | |
653 | /* |
654 | * On a shared lpar, device tree will not have node associativity. |
655 | * At this time lppaca, or its __old_status field may not be |
656 | * updated. Hence kernel cannot detect if its on a shared lpar. So |
657 | * request an explicit associativity irrespective of whether the |
658 | * lpar is shared or dedicated. Use the device tree property as a |
659 | * fallback. cpu_to_phys_id is only valid between |
660 | * smp_setup_cpu_maps() and smp_setup_pacas(). |
661 | */ |
662 | if (firmware_has_feature(FW_FEATURE_VPHN)) { |
663 | if (cpu_to_phys_id) |
664 | hwid = cpu_to_phys_id[lcpu]; |
665 | else |
666 | hwid = get_hard_smp_processor_id(lcpu); |
667 | |
668 | rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); |
669 | if (rc == H_SUCCESS) |
670 | return 0; |
671 | } |
672 | |
673 | return -1; |
674 | } |
675 | |
676 | static int vphn_get_nid(long lcpu) |
677 | { |
678 | __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; |
679 | |
680 | |
681 | if (!__vphn_get_associativity(lcpu, associativity)) |
682 | return associativity_to_nid(associativity); |
683 | |
684 | return NUMA_NO_NODE; |
685 | |
686 | } |
687 | #else |
688 | |
689 | static int __vphn_get_associativity(long lcpu, __be32 *associativity) |
690 | { |
691 | return -1; |
692 | } |
693 | |
694 | static int vphn_get_nid(long unused) |
695 | { |
696 | return NUMA_NO_NODE; |
697 | } |
698 | #endif /* CONFIG_PPC_SPLPAR */ |
699 | |
700 | /* |
701 | * Figure out to which domain a cpu belongs and stick it there. |
702 | * Return the id of the domain used. |
703 | */ |
704 | static int numa_setup_cpu(unsigned long lcpu) |
705 | { |
706 | struct device_node *cpu; |
707 | int fcpu = cpu_first_thread_sibling(lcpu); |
708 | int nid = NUMA_NO_NODE; |
709 | |
710 | if (!cpu_present(cpu: lcpu)) { |
711 | set_cpu_numa_node(cpu: lcpu, first_online_node); |
712 | return first_online_node; |
713 | } |
714 | |
715 | /* |
716 | * If a valid cpu-to-node mapping is already available, use it |
717 | * directly instead of querying the firmware, since it represents |
718 | * the most recent mapping notified to us by the platform (eg: VPHN). |
719 | * Since cpu_to_node binding remains the same for all threads in the |
720 | * core. If a valid cpu-to-node mapping is already available, for |
721 | * the first thread in the core, use it. |
722 | */ |
723 | nid = numa_cpu_lookup_table[fcpu]; |
724 | if (nid >= 0) { |
725 | map_cpu_to_node(cpu: lcpu, node: nid); |
726 | return nid; |
727 | } |
728 | |
729 | nid = vphn_get_nid(unused: lcpu); |
730 | if (nid != NUMA_NO_NODE) |
731 | goto out_present; |
732 | |
733 | cpu = of_get_cpu_node(cpu: lcpu, NULL); |
734 | |
735 | if (!cpu) { |
736 | WARN_ON(1); |
737 | if (cpu_present(cpu: lcpu)) |
738 | goto out_present; |
739 | else |
740 | goto out; |
741 | } |
742 | |
743 | nid = of_node_to_nid_single(device: cpu); |
744 | of_node_put(node: cpu); |
745 | |
746 | out_present: |
747 | if (nid < 0 || !node_possible(nid)) |
748 | nid = first_online_node; |
749 | |
750 | /* |
751 | * Update for the first thread of the core. All threads of a core |
752 | * have to be part of the same node. This not only avoids querying |
753 | * for every other thread in the core, but always avoids a case |
754 | * where virtual node associativity change causes subsequent threads |
755 | * of a core to be associated with different nid. However if first |
756 | * thread is already online, expect it to have a valid mapping. |
757 | */ |
758 | if (fcpu != lcpu) { |
759 | WARN_ON(cpu_online(fcpu)); |
760 | map_cpu_to_node(cpu: fcpu, node: nid); |
761 | } |
762 | |
763 | map_cpu_to_node(cpu: lcpu, node: nid); |
764 | out: |
765 | return nid; |
766 | } |
767 | |
768 | static void verify_cpu_node_mapping(int cpu, int node) |
769 | { |
770 | int base, sibling, i; |
771 | |
772 | /* Verify that all the threads in the core belong to the same node */ |
773 | base = cpu_first_thread_sibling(cpu); |
774 | |
775 | for (i = 0; i < threads_per_core; i++) { |
776 | sibling = base + i; |
777 | |
778 | if (sibling == cpu || cpu_is_offline(sibling)) |
779 | continue; |
780 | |
781 | if (cpu_to_node(cpu: sibling) != node) { |
782 | WARN(1, "CPU thread siblings %d and %d don't belong" |
783 | " to the same node!\n" , cpu, sibling); |
784 | break; |
785 | } |
786 | } |
787 | } |
788 | |
/*
 * Assign @cpu to its node and check it against its thread siblings.
 * Must run before sched domains notifier. Always returns 0.
 */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	verify_cpu_node_mapping(cpu, numa_setup_cpu(cpu));

	return 0;
}
798 | |
/* Nothing to undo for @cpu here; always returns 0. */
static int ppc_numa_cpu_dead(unsigned int cpu)
{
	return 0;
}
803 | |
804 | /* |
805 | * Check and possibly modify a memory region to enforce the memory limit. |
806 | * |
807 | * Returns the size the region should have to enforce the memory limit. |
808 | * This will either be the original value of size, a truncated value, |
809 | * or zero. If the returned value of size is 0 the region should be |
810 | * discarded as it lies wholly above the memory limit. |
811 | */ |
812 | static unsigned long __init numa_enforce_memory_limit(unsigned long start, |
813 | unsigned long size) |
814 | { |
815 | /* |
816 | * We use memblock_end_of_DRAM() in here instead of memory_limit because |
817 | * we've already adjusted it for the limit and it takes care of |
818 | * having memory holes below the limit. Also, in the case of |
819 | * iommu_is_off, memory_limit is not set but is implicitly enforced. |
820 | */ |
821 | |
822 | if (start + size <= memblock_end_of_DRAM()) |
823 | return size; |
824 | |
825 | if (start >= memblock_end_of_DRAM()) |
826 | return 0; |
827 | |
828 | return memblock_end_of_DRAM() - start; |
829 | } |
830 | |
831 | /* |
832 | * Reads the counter for a given entry in |
833 | * linux,drconf-usable-memory property |
834 | */ |
835 | static inline int __init read_usm_ranges(const __be32 **usm) |
836 | { |
837 | /* |
838 | * For each lmb in ibm,dynamic-memory a corresponding |
839 | * entry in linux,drconf-usable-memory property contains |
840 | * a counter followed by that many (base, size) duple. |
841 | * read the counter from linux,drconf-usable-memory |
842 | */ |
843 | return read_n_cells(n: n_mem_size_cells, buf: usm); |
844 | } |
845 | |
846 | /* |
847 | * Extract NUMA information from the ibm,dynamic-reconfiguration-memory |
848 | * node. This assumes n_mem_{addr,size}_cells have been set. |
849 | */ |
850 | static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, |
851 | const __be32 **usm, |
852 | void *data) |
853 | { |
854 | unsigned int ranges, is_kexec_kdump = 0; |
855 | unsigned long base, size, sz; |
856 | int nid; |
857 | |
858 | /* |
859 | * Skip this block if the reserved bit is set in flags (0x80) |
860 | * or if the block is not assigned to this partition (0x8) |
861 | */ |
862 | if ((lmb->flags & DRCONF_MEM_RESERVED) |
863 | || !(lmb->flags & DRCONF_MEM_ASSIGNED)) |
864 | return 0; |
865 | |
866 | if (*usm) |
867 | is_kexec_kdump = 1; |
868 | |
869 | base = lmb->base_addr; |
870 | size = drmem_lmb_size(); |
871 | ranges = 1; |
872 | |
873 | if (is_kexec_kdump) { |
874 | ranges = read_usm_ranges(usm); |
875 | if (!ranges) /* there are no (base, size) duple */ |
876 | return 0; |
877 | } |
878 | |
879 | do { |
880 | if (is_kexec_kdump) { |
881 | base = read_n_cells(n: n_mem_addr_cells, buf: usm); |
882 | size = read_n_cells(n: n_mem_size_cells, buf: usm); |
883 | } |
884 | |
885 | nid = get_nid_and_numa_distance(lmb); |
886 | fake_numa_create_new_node(end_pfn: ((base + size) >> PAGE_SHIFT), |
887 | nid: &nid); |
888 | node_set_online(nid); |
889 | sz = numa_enforce_memory_limit(start: base, size); |
890 | if (sz) |
891 | memblock_set_node(base, size: sz, type: &memblock.memory, nid); |
892 | } while (--ranges); |
893 | |
894 | return 0; |
895 | } |
896 | |
897 | static int __init parse_numa_properties(void) |
898 | { |
899 | struct device_node *memory; |
900 | int default_nid = 0; |
901 | unsigned long i; |
902 | const __be32 *associativity; |
903 | |
904 | if (numa_enabled == 0) { |
905 | pr_warn("disabled by user\n" ); |
906 | return -1; |
907 | } |
908 | |
909 | primary_domain_index = find_primary_domain_index(); |
910 | |
911 | if (primary_domain_index < 0) { |
912 | /* |
913 | * if we fail to parse primary_domain_index from device tree |
914 | * mark the numa disabled, boot with numa disabled. |
915 | */ |
916 | numa_enabled = false; |
917 | return primary_domain_index; |
918 | } |
919 | |
920 | pr_debug("associativity depth for CPU/Memory: %d\n" , primary_domain_index); |
921 | |
922 | /* |
923 | * If it is FORM2 initialize the distance table here. |
924 | */ |
925 | if (affinity_form == FORM2_AFFINITY) |
926 | initialize_form2_numa_distance_lookup_table(); |
927 | |
928 | /* |
929 | * Even though we connect cpus to numa domains later in SMP |
930 | * init, we need to know the node ids now. This is because |
931 | * each node to be onlined must have NODE_DATA etc backing it. |
932 | */ |
933 | for_each_present_cpu(i) { |
934 | __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; |
935 | struct device_node *cpu; |
936 | int nid = NUMA_NO_NODE; |
937 | |
938 | memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); |
939 | |
940 | if (__vphn_get_associativity(lcpu: i, associativity: vphn_assoc) == 0) { |
941 | nid = associativity_to_nid(associativity: vphn_assoc); |
942 | initialize_form1_numa_distance(associativity: vphn_assoc); |
943 | } else { |
944 | |
945 | /* |
946 | * Don't fall back to default_nid yet -- we will plug |
947 | * cpus into nodes once the memory scan has discovered |
948 | * the topology. |
949 | */ |
950 | cpu = of_get_cpu_node(cpu: i, NULL); |
951 | BUG_ON(!cpu); |
952 | |
953 | associativity = of_get_associativity(dev: cpu); |
954 | if (associativity) { |
955 | nid = associativity_to_nid(associativity); |
956 | initialize_form1_numa_distance(associativity); |
957 | } |
958 | of_node_put(node: cpu); |
959 | } |
960 | |
961 | /* node_set_online() is an UB if 'nid' is negative */ |
962 | if (likely(nid >= 0)) |
963 | node_set_online(nid); |
964 | } |
965 | |
966 | get_n_mem_cells(n_addr_cells: &n_mem_addr_cells, n_size_cells: &n_mem_size_cells); |
967 | |
968 | for_each_node_by_type(memory, "memory" ) { |
969 | unsigned long start; |
970 | unsigned long size; |
971 | int nid; |
972 | int ranges; |
973 | const __be32 *memcell_buf; |
974 | unsigned int len; |
975 | |
976 | memcell_buf = of_get_property(node: memory, |
977 | name: "linux,usable-memory" , lenp: &len); |
978 | if (!memcell_buf || len <= 0) |
979 | memcell_buf = of_get_property(node: memory, name: "reg" , lenp: &len); |
980 | if (!memcell_buf || len <= 0) |
981 | continue; |
982 | |
983 | /* ranges in cell */ |
984 | ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); |
985 | new_range: |
986 | /* these are order-sensitive, and modify the buffer pointer */ |
987 | start = read_n_cells(n: n_mem_addr_cells, buf: &memcell_buf); |
988 | size = read_n_cells(n: n_mem_size_cells, buf: &memcell_buf); |
989 | |
990 | /* |
991 | * Assumption: either all memory nodes or none will |
992 | * have associativity properties. If none, then |
993 | * everything goes to default_nid. |
994 | */ |
995 | associativity = of_get_associativity(dev: memory); |
996 | if (associativity) { |
997 | nid = associativity_to_nid(associativity); |
998 | initialize_form1_numa_distance(associativity); |
999 | } else |
1000 | nid = default_nid; |
1001 | |
1002 | fake_numa_create_new_node(end_pfn: ((start + size) >> PAGE_SHIFT), nid: &nid); |
1003 | node_set_online(nid); |
1004 | |
1005 | size = numa_enforce_memory_limit(start, size); |
1006 | if (size) |
1007 | memblock_set_node(base: start, size, type: &memblock.memory, nid); |
1008 | |
1009 | if (--ranges) |
1010 | goto new_range; |
1011 | } |
1012 | |
1013 | /* |
1014 | * Now do the same thing for each MEMBLOCK listed in the |
1015 | * ibm,dynamic-memory property in the |
1016 | * ibm,dynamic-reconfiguration-memory node. |
1017 | */ |
1018 | memory = of_find_node_by_path(path: "/ibm,dynamic-reconfiguration-memory" ); |
1019 | if (memory) { |
1020 | walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb); |
1021 | of_node_put(node: memory); |
1022 | } |
1023 | |
1024 | return 0; |
1025 | } |
1026 | |
1027 | static void __init setup_nonnuma(void) |
1028 | { |
1029 | unsigned long top_of_ram = memblock_end_of_DRAM(); |
1030 | unsigned long total_ram = memblock_phys_mem_size(); |
1031 | unsigned long start_pfn, end_pfn; |
1032 | unsigned int nid = 0; |
1033 | int i; |
1034 | |
1035 | pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n" , top_of_ram, total_ram); |
1036 | pr_debug("Memory hole size: %ldMB\n" , (top_of_ram - total_ram) >> 20); |
1037 | |
1038 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { |
1039 | fake_numa_create_new_node(end_pfn, nid: &nid); |
1040 | memblock_set_node(PFN_PHYS(start_pfn), |
1041 | PFN_PHYS(end_pfn - start_pfn), |
1042 | type: &memblock.memory, nid); |
1043 | node_set_online(nid); |
1044 | } |
1045 | } |
1046 | |
1047 | void __init dump_numa_cpu_topology(void) |
1048 | { |
1049 | unsigned int node; |
1050 | unsigned int cpu, count; |
1051 | |
1052 | if (!numa_enabled) |
1053 | return; |
1054 | |
1055 | for_each_online_node(node) { |
1056 | pr_info("Node %d CPUs:" , node); |
1057 | |
1058 | count = 0; |
1059 | /* |
1060 | * If we used a CPU iterator here we would miss printing |
1061 | * the holes in the cpumap. |
1062 | */ |
1063 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) { |
1064 | if (cpumask_test_cpu(cpu, |
1065 | cpumask: node_to_cpumask_map[node])) { |
1066 | if (count == 0) |
1067 | pr_cont(" %u" , cpu); |
1068 | ++count; |
1069 | } else { |
1070 | if (count > 1) |
1071 | pr_cont("-%u" , cpu - 1); |
1072 | count = 0; |
1073 | } |
1074 | } |
1075 | |
1076 | if (count > 1) |
1077 | pr_cont("-%u" , nr_cpu_ids - 1); |
1078 | pr_cont("\n" ); |
1079 | } |
1080 | } |
1081 | |
1082 | /* Initialize NODE_DATA for a node on the local memory */ |
1083 | static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) |
1084 | { |
1085 | u64 spanned_pages = end_pfn - start_pfn; |
1086 | const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); |
1087 | u64 nd_pa; |
1088 | void *nd; |
1089 | int tnid; |
1090 | |
1091 | nd_pa = memblock_phys_alloc_try_nid(size: nd_size, SMP_CACHE_BYTES, nid); |
1092 | if (!nd_pa) |
1093 | panic(fmt: "Cannot allocate %zu bytes for node %d data\n" , |
1094 | nd_size, nid); |
1095 | |
1096 | nd = __va(nd_pa); |
1097 | |
1098 | /* report and initialize */ |
1099 | pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n" , |
1100 | nd_pa, nd_pa + nd_size - 1); |
1101 | tnid = early_pfn_to_nid(pfn: nd_pa >> PAGE_SHIFT); |
1102 | if (tnid != nid) |
1103 | pr_info(" NODE_DATA(%d) on node %d\n" , nid, tnid); |
1104 | |
1105 | node_data[nid] = nd; |
1106 | memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); |
1107 | NODE_DATA(nid)->node_id = nid; |
1108 | NODE_DATA(nid)->node_start_pfn = start_pfn; |
1109 | NODE_DATA(nid)->node_spanned_pages = spanned_pages; |
1110 | } |
1111 | |
1112 | static void __init find_possible_nodes(void) |
1113 | { |
1114 | struct device_node *rtas, *root; |
1115 | const __be32 *domains = NULL; |
1116 | int prop_length, max_nodes; |
1117 | u32 i; |
1118 | |
1119 | if (!numa_enabled) |
1120 | return; |
1121 | |
1122 | rtas = of_find_node_by_path(path: "/rtas" ); |
1123 | if (!rtas) |
1124 | return; |
1125 | |
1126 | /* |
1127 | * ibm,current-associativity-domains is a fairly recent property. If |
1128 | * it doesn't exist, then fallback on ibm,max-associativity-domains. |
1129 | * Current denotes what the platform can support compared to max |
1130 | * which denotes what the Hypervisor can support. |
1131 | * |
1132 | * If the LPAR is migratable, new nodes might be activated after a LPM, |
1133 | * so we should consider the max number in that case. |
1134 | */ |
1135 | root = of_find_node_by_path(path: "/" ); |
1136 | if (!of_get_property(node: root, name: "ibm,migratable-partition" , NULL)) |
1137 | domains = of_get_property(node: rtas, |
1138 | name: "ibm,current-associativity-domains" , |
1139 | lenp: &prop_length); |
1140 | of_node_put(node: root); |
1141 | if (!domains) { |
1142 | domains = of_get_property(node: rtas, name: "ibm,max-associativity-domains" , |
1143 | lenp: &prop_length); |
1144 | if (!domains) |
1145 | goto out; |
1146 | } |
1147 | |
1148 | max_nodes = of_read_number(cell: &domains[primary_domain_index], size: 1); |
1149 | pr_info("Partition configured for %d NUMA nodes.\n" , max_nodes); |
1150 | |
1151 | for (i = 0; i < max_nodes; i++) { |
1152 | if (!node_possible(i)) |
1153 | node_set(i, node_possible_map); |
1154 | } |
1155 | |
1156 | prop_length /= sizeof(int); |
1157 | if (prop_length > primary_domain_index + 2) |
1158 | coregroup_enabled = 1; |
1159 | |
1160 | out: |
1161 | of_node_put(node: rtas); |
1162 | } |
1163 | |
1164 | void __init mem_topology_setup(void) |
1165 | { |
1166 | int cpu; |
1167 | |
1168 | max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; |
1169 | min_low_pfn = MEMORY_START >> PAGE_SHIFT; |
1170 | |
1171 | /* |
1172 | * Linux/mm assumes node 0 to be online at boot. However this is not |
1173 | * true on PowerPC, where node 0 is similar to any other node, it |
1174 | * could be cpuless, memoryless node. So force node 0 to be offline |
1175 | * for now. This will prevent cpuless, memoryless node 0 showing up |
1176 | * unnecessarily as online. If a node has cpus or memory that need |
1177 | * to be online, then node will anyway be marked online. |
1178 | */ |
1179 | node_set_offline(nid: 0); |
1180 | |
1181 | if (parse_numa_properties()) |
1182 | setup_nonnuma(); |
1183 | |
1184 | /* |
1185 | * Modify the set of possible NUMA nodes to reflect information |
1186 | * available about the set of online nodes, and the set of nodes |
1187 | * that we expect to make use of for this platform's affinity |
1188 | * calculations. |
1189 | */ |
1190 | nodes_and(node_possible_map, node_possible_map, node_online_map); |
1191 | |
1192 | find_possible_nodes(); |
1193 | |
1194 | setup_node_to_cpumask_map(); |
1195 | |
1196 | reset_numa_cpu_lookup_table(); |
1197 | |
1198 | for_each_possible_cpu(cpu) { |
1199 | /* |
1200 | * Powerpc with CONFIG_NUMA always used to have a node 0, |
1201 | * even if it was memoryless or cpuless. For all cpus that |
1202 | * are possible but not present, cpu_to_node() would point |
1203 | * to node 0. To remove a cpuless, memoryless dummy node, |
1204 | * powerpc need to make sure all possible but not present |
1205 | * cpu_to_node are set to a proper node. |
1206 | */ |
1207 | numa_setup_cpu(lcpu: cpu); |
1208 | } |
1209 | } |
1210 | |
1211 | void __init initmem_init(void) |
1212 | { |
1213 | int nid; |
1214 | |
1215 | memblock_dump_all(); |
1216 | |
1217 | for_each_online_node(nid) { |
1218 | unsigned long start_pfn, end_pfn; |
1219 | |
1220 | get_pfn_range_for_nid(nid, start_pfn: &start_pfn, end_pfn: &end_pfn); |
1221 | setup_node_data(nid, start_pfn, end_pfn); |
1222 | } |
1223 | |
1224 | sparse_init(); |
1225 | |
1226 | /* |
1227 | * We need the numa_cpu_lookup_table to be accurate for all CPUs, |
1228 | * even before we online them, so that we can use cpu_to_{node,mem} |
1229 | * early in boot, cf. smp_prepare_cpus(). |
1230 | * _nocalls() + manual invocation is used because cpuhp is not yet |
1231 | * initialized for the boot CPU. |
1232 | */ |
1233 | cpuhp_setup_state_nocalls(state: CPUHP_POWER_NUMA_PREPARE, name: "powerpc/numa:prepare" , |
1234 | startup: ppc_numa_cpu_prepare, teardown: ppc_numa_cpu_dead); |
1235 | } |
1236 | |
1237 | static int __init early_numa(char *p) |
1238 | { |
1239 | if (!p) |
1240 | return 0; |
1241 | |
1242 | if (strstr(p, "off" )) |
1243 | numa_enabled = 0; |
1244 | |
1245 | p = strstr(p, "fake=" ); |
1246 | if (p) |
1247 | cmdline = p + strlen("fake=" ); |
1248 | |
1249 | return 0; |
1250 | } |
1251 | early_param("numa" , early_numa); |
1252 | |
1253 | #ifdef CONFIG_MEMORY_HOTPLUG |
1254 | /* |
1255 | * Find the node associated with a hot added memory section for |
1256 | * memory represented in the device tree by the property |
1257 | * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. |
1258 | */ |
1259 | static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) |
1260 | { |
1261 | struct drmem_lmb *lmb; |
1262 | unsigned long lmb_size; |
1263 | int nid = NUMA_NO_NODE; |
1264 | |
1265 | lmb_size = drmem_lmb_size(); |
1266 | |
1267 | for_each_drmem_lmb(lmb) { |
1268 | /* skip this block if it is reserved or not assigned to |
1269 | * this partition */ |
1270 | if ((lmb->flags & DRCONF_MEM_RESERVED) |
1271 | || !(lmb->flags & DRCONF_MEM_ASSIGNED)) |
1272 | continue; |
1273 | |
1274 | if ((scn_addr < lmb->base_addr) |
1275 | || (scn_addr >= (lmb->base_addr + lmb_size))) |
1276 | continue; |
1277 | |
1278 | nid = of_drconf_to_nid_single(lmb); |
1279 | break; |
1280 | } |
1281 | |
1282 | return nid; |
1283 | } |
1284 | |
1285 | /* |
1286 | * Find the node associated with a hot added memory section for memory |
1287 | * represented in the device tree as a node (i.e. memory@XXXX) for |
1288 | * each memblock. |
1289 | */ |
1290 | static int hot_add_node_scn_to_nid(unsigned long scn_addr) |
1291 | { |
1292 | struct device_node *memory; |
1293 | int nid = NUMA_NO_NODE; |
1294 | |
1295 | for_each_node_by_type(memory, "memory" ) { |
1296 | int i = 0; |
1297 | |
1298 | while (1) { |
1299 | struct resource res; |
1300 | |
1301 | if (of_address_to_resource(dev: memory, index: i++, r: &res)) |
1302 | break; |
1303 | |
1304 | if ((scn_addr < res.start) || (scn_addr > res.end)) |
1305 | continue; |
1306 | |
1307 | nid = of_node_to_nid_single(device: memory); |
1308 | break; |
1309 | } |
1310 | |
1311 | if (nid >= 0) |
1312 | break; |
1313 | } |
1314 | |
1315 | of_node_put(node: memory); |
1316 | |
1317 | return nid; |
1318 | } |
1319 | |
1320 | /* |
1321 | * Find the node associated with a hot added memory section. Section |
1322 | * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that |
1323 | * sections are fully contained within a single MEMBLOCK. |
1324 | */ |
1325 | int hot_add_scn_to_nid(unsigned long scn_addr) |
1326 | { |
1327 | struct device_node *memory = NULL; |
1328 | int nid; |
1329 | |
1330 | if (!numa_enabled) |
1331 | return first_online_node; |
1332 | |
1333 | memory = of_find_node_by_path(path: "/ibm,dynamic-reconfiguration-memory" ); |
1334 | if (memory) { |
1335 | nid = hot_add_drconf_scn_to_nid(scn_addr); |
1336 | of_node_put(node: memory); |
1337 | } else { |
1338 | nid = hot_add_node_scn_to_nid(scn_addr); |
1339 | } |
1340 | |
1341 | if (nid < 0 || !node_possible(nid)) |
1342 | nid = first_online_node; |
1343 | |
1344 | return nid; |
1345 | } |
1346 | |
1347 | static u64 hot_add_drconf_memory_max(void) |
1348 | { |
1349 | struct device_node *memory = NULL; |
1350 | struct device_node *dn = NULL; |
1351 | const __be64 *lrdr = NULL; |
1352 | |
1353 | dn = of_find_node_by_path(path: "/rtas" ); |
1354 | if (dn) { |
1355 | lrdr = of_get_property(node: dn, name: "ibm,lrdr-capacity" , NULL); |
1356 | of_node_put(node: dn); |
1357 | if (lrdr) |
1358 | return be64_to_cpup(p: lrdr); |
1359 | } |
1360 | |
1361 | memory = of_find_node_by_path(path: "/ibm,dynamic-reconfiguration-memory" ); |
1362 | if (memory) { |
1363 | of_node_put(node: memory); |
1364 | return drmem_lmb_memory_max(); |
1365 | } |
1366 | return 0; |
1367 | } |
1368 | |
1369 | /* |
1370 | * memory_hotplug_max - return max address of memory that may be added |
1371 | * |
1372 | * This is currently only used on systems that support drconfig memory |
1373 | * hotplug. |
1374 | */ |
1375 | u64 memory_hotplug_max(void) |
1376 | { |
1377 | return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); |
1378 | } |
1379 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
1380 | |
1381 | /* Virtual Processor Home Node (VPHN) support */ |
1382 | #ifdef CONFIG_PPC_SPLPAR |
1383 | static int topology_inited; |
1384 | |
1385 | /* |
1386 | * Retrieve the new associativity information for a virtual processor's |
1387 | * home node. |
1388 | */ |
1389 | static long vphn_get_associativity(unsigned long cpu, |
1390 | __be32 *associativity) |
1391 | { |
1392 | long rc; |
1393 | |
1394 | rc = hcall_vphn(get_hard_smp_processor_id(cpu), |
1395 | VPHN_FLAG_VCPU, associativity); |
1396 | |
1397 | switch (rc) { |
1398 | case H_SUCCESS: |
1399 | pr_debug("VPHN hcall succeeded. Reset polling...\n" ); |
1400 | goto out; |
1401 | |
1402 | case H_FUNCTION: |
1403 | pr_err_ratelimited("VPHN unsupported. Disabling polling...\n" ); |
1404 | break; |
1405 | case H_HARDWARE: |
1406 | pr_err_ratelimited("hcall_vphn() experienced a hardware fault " |
1407 | "preventing VPHN. Disabling polling...\n" ); |
1408 | break; |
1409 | case H_PARAMETER: |
1410 | pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. " |
1411 | "Disabling polling...\n" ); |
1412 | break; |
1413 | default: |
1414 | pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n" |
1415 | , rc); |
1416 | break; |
1417 | } |
1418 | out: |
1419 | return rc; |
1420 | } |
1421 | |
1422 | void find_and_update_cpu_nid(int cpu) |
1423 | { |
1424 | __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; |
1425 | int new_nid; |
1426 | |
1427 | /* Use associativity from first thread for all siblings */ |
1428 | if (vphn_get_associativity(cpu, associativity)) |
1429 | return; |
1430 | |
1431 | /* Do not have previous associativity, so find it now. */ |
1432 | new_nid = associativity_to_nid(associativity); |
1433 | |
1434 | if (new_nid < 0 || !node_possible(new_nid)) |
1435 | new_nid = first_online_node; |
1436 | else |
1437 | // Associate node <-> cpu, so cpu_up() calls |
1438 | // try_online_node() on the right node. |
1439 | set_cpu_numa_node(cpu, new_nid); |
1440 | |
1441 | pr_debug("%s:%d cpu %d nid %d\n" , __func__, __LINE__, cpu, new_nid); |
1442 | } |
1443 | |
1444 | int cpu_to_coregroup_id(int cpu) |
1445 | { |
1446 | __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; |
1447 | int index; |
1448 | |
1449 | if (cpu < 0 || cpu > nr_cpu_ids) |
1450 | return -1; |
1451 | |
1452 | if (!coregroup_enabled) |
1453 | goto out; |
1454 | |
1455 | if (!firmware_has_feature(FW_FEATURE_VPHN)) |
1456 | goto out; |
1457 | |
1458 | if (vphn_get_associativity(cpu, associativity)) |
1459 | goto out; |
1460 | |
1461 | index = of_read_number(associativity, 1); |
1462 | if (index > primary_domain_index + 1) |
1463 | return of_read_number(&associativity[index - 1], 1); |
1464 | |
1465 | out: |
1466 | return cpu_to_core_id(cpu); |
1467 | } |
1468 | |
1469 | static int topology_update_init(void) |
1470 | { |
1471 | topology_inited = 1; |
1472 | return 0; |
1473 | } |
1474 | device_initcall(topology_update_init); |
1475 | #endif /* CONFIG_PPC_SPLPAR */ |
1476 | |