// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;

static struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};
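
/*
 * Note (assuming the usual driver-core layout for virtual buses): tier
 * devices registered on this subsystem show up in sysfs as
 * /sys/devices/virtual/memory_tiering/memory_tier<N>, each carrying the
 * read-only "nodelist" attribute defined further below.
 */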

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

static bool default_dram_perf_error;
static struct node_hmem_attrs default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}
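
/*
 * Worked example of the chunking above (values assumed from
 * <linux/memory-tiers.h>, where MEMTIER_CHUNK_BITS is 10 and
 * MEMTIER_ADISTANCE_DRAM is 4 * MEMTIER_CHUNK_SIZE + MEMTIER_CHUNK_SIZE / 2):
 * the default DRAM type's adistance of 4608 rounds down to the chunk
 * boundary 4096, so it lands in the tier covering 4096..5119 and its
 * device id becomes 4096 >> 10 == 4, i.e. the tier is visible as
 * memory_tier4 under the memory_tiering subsystem.
 */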

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node, but
	 * that would require another variable in node_demotion[] to
	 * record the last selected target node, which may cause cache
	 * ping-pong as that value keeps changing.  Per-cpu data could
	 * avoid the caching issue but is more complicated.  So selecting
	 * a target node randomly seems better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
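
/*
 * Illustrative use only (not an in-tree caller): repeated calls walk
 * down the demotion hierarchy, one randomly chosen preferred target per
 * tier, until a terminal node returns NUMA_NO_NODE:
 *
 *	int nid = some_node;
 *
 *	while ((nid = next_demotion_node(nid)) != NUMA_NO_NODE)
 *		pr_debug("would demote towards node %d\n", nid);
 */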

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after.  They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

/*
 * Find an automatic demotion target for all memory
 * nodes.  Failing here is OK.  It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list.  Add all memory nodes except the selected memory
		 * tier nodelist to the skip list so that we find the best
		 * node from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask.
		 * We randomly select between nodes in the preferred mask
		 * when allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher memory
	 * tier only if that memory tier doesn't include compute.  We
	 * want to skip promotion from a memory tier if any node that is
	 * part of that memory tier has CPUs.  Once we detect such a
	 * memory tier, we consider it the top tier from which promotion
	 * is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this
			 * memtier is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each memtier, collecting the
	 * nodemask from all memory tiers below it.  This allows us to
	 * fall back demotion page allocation to a set of nodes that is
	 * closer to the preferred node selected above.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier
		 * nodes.  This removes all nodes in the current and
		 * above memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}
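
/*
 * Tying this back to Example 1 at the top of the file (nodes 0-1 are
 * CPU + DRAM, nodes 2-3 are PMEM): the DRAM tier contains CPUs, so it
 * becomes the top tier and promotion out of it is skipped;
 * node_demotion[0].preferred ends up as {2} and
 * node_demotion[1].preferred as {3} (the nearest lower-tier nodes),
 * while the DRAM tier's lower_tier_mask is {2,3} and the PMEM tier's
 * lower_tier_mask is empty.
 */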

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count.  We
	 * only take a memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier.  We achieve this by waiting for the
	 * rcu read section to finish using synchronize_rcu.  This
	 * also enables us to free the destroyed memory tier with
	 * kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the devices attached to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, attrs->read_latency, attrs->write_latency,
		attrs->read_bandwidth, attrs->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
			     const char *source)
{
	int rc = 0;

	mutex_lock(&memory_tier_lock);
	if (default_dram_perf_error) {
		rc = -EIO;
		goto out;
	}

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0) {
		rc = -EINVAL;
		goto out;
	}

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		goto out;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be
	 * the same (that is, the variation is less than 10%).  It will
	 * be used as the base to calculate the abstract distance of
	 * other memory nodes.
	 */
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info(" performance of reference DRAM node %d:\n",
			default_dram_perf_ref_nid);
		dump_hmem_attrs(&default_dram_perf, " ");
		pr_info(" performance of DRAM node %d:\n", nid);
		dump_hmem_attrs(perf, " ");
		pr_info(
" disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		rc = -EINVAL;
	}

out:
	mutex_unlock(&memory_tier_lock);
	return rc;
}

int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
{
	if (default_dram_perf_error)
		return -EIO;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	mutex_lock(&memory_tier_lock);
	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write).  The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
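	/*
	 * Worked example (illustrative numbers only): a device with twice
	 * the default DRAM latency and half its bandwidth gets
	 * MEMTIER_ADISTANCE_DRAM * 2 * 2, i.e. four times the DRAM
	 * abstract distance, and therefore lands in a slower tier.
	 */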
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);
	mutex_unlock(&memory_tier_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm().  To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has provided a result.  The prototype of the notifier
 * function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist").  If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
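
/*
 * Illustrative provider sketch (not an in-tree user; the helper
 * my_driver_owns_node() is hypothetical) following the prototype in the
 * kernel-doc above:
 *
 *	static int my_adistance_notifier(struct notifier_block *nb,
 *					 unsigned long nid, void *data)
 *	{
 *		int *adist = data;
 *
 *		if (!my_driver_owns_node(nid))
 *			return NOTIFY_OK;	 (let the next algorithm try)
 *
 *		*adist = MEMTIER_ADISTANCE_DRAM * 2;	 (slower than DRAM)
 *		return NOTIFY_STOP;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_adistance_notifier,
 *	};
 *
 *	register_mt_adistance_algorithm(&my_nb);
 */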

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provides the result, and return it via
 * @adist.  Otherwise, no algorithm can provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret, node;
	struct memory_tier *memtier;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to the
	 * default memory tier, or to a tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with the memtiers we were able to set up.
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);
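
/*
 * With the "numa" kobject created under mm_kobj below, this attribute is
 * expected to appear as /sys/kernel/mm/numa/demotion_enabled (assuming
 * mm_kobj is the usual /sys/kernel/mm).  It can be toggled with, e.g.:
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 */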

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif