// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
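
/*
 * For illustration only (not compiled here): a minimal sketch of how a
 * client might use this notifier chain. The callback receives one of the
 * MEM_* actions declared in <linux/memory.h> together with a struct
 * memory_notify describing the affected pfn range; the foo_* names below
 * are hypothetical.
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			// veto the offline if the range is still in use
 *			if (foo_range_busy(mn->start_pfn, mn->nr_pages))
 *				return notifier_from_errno(-EBUSY);
 *			break;
 *		case MEM_ONLINE:
 *			foo_track_range(mn->start_pfn, mn->nr_pages);
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_memory_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *
 *	register_memory_notifier(&foo_memory_nb);
 */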

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
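
/*
 * For illustration (figures are an example only, both values are
 * architecture- and config-dependent): on a configuration where
 * MIN_MEMORY_BLOCK_SIZE is 128 MiB (SECTION_SIZE_BITS == 27, as on x86-64)
 * and the architecture override returns a 2 GiB block size,
 * memory_dev_init() below ends up with
 * sections_per_block = (2 << 30) / (128 << 20) = 16, i.e. each memory
 * block device spans 16 sparsemem sections.
 */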

/*
 * Show the id of this memory block, i.e. the block's first physical
 * section number divided by sections_per_block.
 */

static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the whole memory block is likely to be hot-removable.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state != MEM_ONLINE)
		goto out;

	for (i = 0; i < sections_per_block; i++) {
		if (!present_section_nr(mem->start_section_nr + i))
			continue;
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

out:
	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
			      mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}
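
/*
 * For illustration, a sketch of how callers are expected to consume the
 * return value of memory_notify() (this is how mm/memory_hotplug.c uses
 * it): the aggregated notifier result is turned back into an errno with
 * notifier_to_errno(), e.g.
 *
 *	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
 *	ret = notifier_to_errno(ret);
 *	if (ret)
 *		goto failed_removal;	// a callback vetoed the transition
 */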

/*
 * The probe routines leave the pages uninitialized, just as the bootmem code
 * does. Make sure we do not access them, but instead use only information from
 * within sections.
 */
static bool pages_correctly_probed(unsigned long start_pfn)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	unsigned long section_nr_end = section_nr + sections_per_block;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (; section_nr < section_nr_end; section_nr++) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;

		if (!present_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) not present\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		} else if (!valid_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		} else if (online_section_nr(section_nr)) {
			pr_warn("section %ld pfn[%lx, %lx) is already online\n",
				section_nr, pfn, pfn + PAGES_PER_SECTION);
			return false;
		}
		pfn += PAGES_PER_SECTION;
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	int ret;

	start_pfn = section_nr_to_pfn(phys_index);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_probed(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}
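
/*
 * For orientation, a sketch of the usual call path (not new behaviour):
 * a sysfs write such as "echo online > /sys/devices/system/memory/memoryN/state"
 * ends up as
 *
 *	state_store()
 *	  -> device_online()		(driver core, takes the device lock)
 *	    -> memory_subsys_online()
 *	      -> memory_block_change_state()
 *	        -> memory_block_action()
 *	          -> online_pages()	(mm/memory_hotplug.c)
 *
 * and the offline path mirrors it via device_offline()/offline_pages().
 */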

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				mem->online_type);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * If we are called from state_store(), online_type will be
	 * set >= 0. Otherwise we were called from the device online
	 * attribute and need to set the online_type.
	 */
	if (mem->online_type < 0)
		mem->online_type = MMOP_ONLINE_KEEP;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);

	/* clear online_type */
	mem->online_type = -1;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	/* Can't offline a block with non-present sections */
	if (mem->section_count != sections_per_block)
		return -EINVAL;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret, online_type;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	if (sysfs_streq(buf, "online_kernel"))
		online_type = MMOP_ONLINE_KERNEL;
	else if (sysfs_streq(buf, "online_movable"))
		online_type = MMOP_ONLINE_MOVABLE;
	else if (sysfs_streq(buf, "online"))
		online_type = MMOP_ONLINE_KEEP;
	else if (sysfs_streq(buf, "offline"))
		online_type = MMOP_OFFLINE;
	else {
		ret = -EINVAL;
		goto err;
	}

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE_KEEP:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

err:
	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
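
/*
 * For illustration: from userspace the above maps to writes such as
 *
 *	echo online_movable > /sys/devices/system/memory/memory42/state
 *	echo offline        > /sys/devices/system/memory/memory42/state
 *
 * where "memory42" is a hypothetical block id. The accepted tokens are
 * exactly the sysfs_streq() comparisons above: "online", "online_kernel",
 * "online_movable" and "offline".
 */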

/*
 * phys_device is a bad name for this. What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or field replaceable unit (FRU),
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
		unsigned long nr_pages, int online_type,
		struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone != default_zone) {
		strcat(buf, " ");
		strcat(buf, zone->name);
	}
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long valid_start_pfn, valid_end_pfn;
	struct zone *default_zone;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone can not be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages,
					  &valid_start_pfn, &valid_end_pfn))
			return sprintf(buf, "none\n");
		start_pfn = valid_start_pfn;
		strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
	strcat(buf, default_zone->name);

	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
			default_zone);
	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
			default_zone);
out:
	strcat(buf, "\n");

	return strlen(buf);
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Block size attribute stuff
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR_RO(block_size_bytes);
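
/*
 * Note for readers: the value is printed with "%lx", i.e. in hexadecimal
 * without a leading "0x". For example, a 128 MiB block size reads back from
 * /sys/devices/system/memory/block_size_bytes as "8000000" (an illustrative
 * value; the real size depends on the architecture override of
 * memory_block_size_bytes()).
 */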

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	if (memhp_auto_online)
		return sprintf(buf, "online\n");
	else
		return sprintf(buf, "offline\n");
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	if (sysfs_streq(buf, "online"))
		memhp_auto_online = true;
	else if (sysfs_streq(buf, "offline"))
		memhp_auto_online = false;
	else
		return -EINVAL;

	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
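
/*
 * For illustration: writing "online" to
 * /sys/devices/system/memory/auto_online_blocks makes newly hot-added
 * blocks come up online without a separate state write from userspace.
 * The boot-time default of memhp_auto_online comes from
 * CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE and the memhp_default_state=
 * command line parameter, both handled outside this file.
 */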

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		goto out;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
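
/*
 * For illustration: with CONFIG_ARCH_MEMORY_PROBE, userspace can request
 * that a block-aligned physical range be added, e.g.
 *
 *	echo 0x100000000 > /sys/devices/system/memory/probe
 *
 * (a hypothetical address). The write is rejected with -EINVAL unless the
 * address is aligned to the memory block size, per the check in
 * probe_store() above.
 */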

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
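
/*
 * Note for readers: despite the local variable name, both stores above
 * parse a physical *byte* address and convert it to a pfn themselves
 * (pfn >>= PAGE_SHIFT). For illustration, injecting a soft offline of the
 * page at the hypothetical address 0x200000000 would look like
 *
 *	echo 0x200000000 > /sys/devices/system/memory/soft_offline_page
 */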

/*
 * Note that phys_device is optional. It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return to_memory_block(dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}
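
/*
 * For illustration only, a sketch of the expected calling convention: the
 * lookup returns a referenced device, so the caller is responsible for the
 * matching put_device() ("section" below is whatever mem_section the caller
 * already holds):
 *
 *	struct memory_block *mem = find_memory_block(section);
 *
 *	if (mem) {
 *		// ... inspect mem->state, mem->start_section_nr, ...
 *		put_device(&mem->dev);
 *	}
 */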

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret)
		put_device(&memory->dev);

	return ret;
}
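
/*
 * For illustration: because the subsystem's dev_name is "memory" and
 * register_memory() sets dev.id to start_section_nr / sections_per_block,
 * the device registered here shows up in sysfs as, e.g.,
 * /sys/devices/system/memory/memory0 for the block covering section 0
 * (the concrete number depends on sections_per_block).
 */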

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}

static int add_memory_block(int base_section_nr)
{
	struct memory_block *mem;
	int i, ret, section_count = 0, section_nr;

	for (i = base_section_nr;
	     i < base_section_nr + sections_per_block;
	     i++) {
		if (!present_section_nr(i))
			continue;
		if (section_count == 0)
			section_nr = i;
		section_count++;
	}

	if (section_count == 0)
		return 0;
	ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE);
	if (ret)
		return ret;
	mem->section_count = section_count;
	return 0;
}

/*
 * We need an interface for the VM to add new memory regions,
 * but without onlining them.
 */
int hotplug_memory_register(int nid, struct mem_section *section)
{
	int ret = 0;
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		put_device(&mem->dev);
	} else {
		ret = init_memory_block(&mem, section, MEM_OFFLINE);
		if (ret)
			goto out;
		mem->section_count++;
	}

out:
	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_section() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

static int remove_memory_section(unsigned long node_id,
				 struct mem_section *section, int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);

	/*
	 * Some users of the memory hotplug do not want/need memblock to
	 * track all sections. Skip over those.
	 */
	mem = find_memory_block(section);
	if (!mem)
		goto out_unlock;

	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0)
		unregister_memory(mem);
	else
		put_device(&mem->dev);

out_unlock:
	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_section(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	mutex_lock(&mem_sysfs_mutex);
	for (i = 0; i <= __highest_present_section_nr;
		i += sections_per_block) {
		err = add_memory_block(i);
		if (!ret)
			ret = err;
	}
	mutex_unlock(&mem_sysfs_mutex);

out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}