// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-hotplug memory block as a device that can be
 * onlined/offlined via sysfs.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int memhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}
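
/*
 * For example, memhp_online_type_from_str("online_kernel") returns
 * MMOP_ONLINE_KERNEL. sysfs_streq() is used instead of strcmp() so that
 * a trailing newline from userspace ("online\n") still matches.
 */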

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long base_memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return base_memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
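
/*
 * Worked example (the numbers are illustrative, not guaranteed for any
 * particular architecture): with 128 MiB sections and a 2 GiB memory
 * block size, sections_per_block = 16, so physical address 0x100000000
 * (4 GiB) maps to section 32 and to block id 32 / 16 = 2.
 */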

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local xarray to avoid a costly linear
 * search for the corresponding device on the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
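
/*
 * Client sketch (not from this file; "foo_mem_cb" and "foo_nb" are
 * hypothetical names): a driver that wants to react to memory hotplug
 * events registers a callback on this chain, e.g.:
 *
 *	static int foo_mem_cb(struct notifier_block *nb,
 *			      unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_ONLINE)
 *			pr_info("onlined %lu pages at PFN %lu\n",
 *				mn->nr_pages, mn->start_pfn);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_nb = { .notifier_call = foo_mem_cb };
 *
 *	register_memory_notifier(&foo_nb);
 */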

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

/*
 * Show the memory block ID of this block, i.e. its first section number
 * divided by sections_per_block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}
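
/*
 * For instance, reading /sys/devices/system/memory/memory32/phys_index
 * yields "00000020": the block id (32, matching the decimal in the
 * directory name) in zero-padded hex.
 */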

/*
 * Legacy interface that we cannot remove: always indicate "removable"
 * when CONFIG_MEMORY_HOTREMOVE is enabled; the per-block value carries
 * no information beyond that.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * Show the state of this memory block: online, offline, or going-offline.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/* Translate the state constant into its user-visible string. */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is safe to
 * reference sparsemem variables (sections, PAGES_PER_SECTION) here.
 */
static int
memory_block_action(unsigned long start_section_nr, unsigned long action,
		    int online_type, int nid)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	int ret;

	start_pfn = section_nr_to_pfn(start_section_nr);

	switch (action) {
	case MEM_ONLINE:
		ret = online_pages(start_pfn, nr_pages, online_type, nid);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, "%s(%ld, %ld) unknown action: %ld\n",
		     __func__, start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				  mem->online_type, mem->nid);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}

static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the
	 * online_type, we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = memhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
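
/*
 * From userspace this is the familiar sysfs sequence, e.g. (block number
 * 32 is illustrative):
 *
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 */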

/*
 * Legacy interface: report an architecture-specific "physical device"
 * id for this memory range (see arch_get_memory_phys_device() below; the
 * default implementation returns 0). The intent is to identify ranges
 * that belong to the same replaceable unit, such as a DIMM.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *default_zone;
	int len = 0;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If the block spans multiple zones, it cannot be offlined,
		 * so there is no single valid zone to report.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
					  nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif
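
/*
 * Reading valid_zones on an offline block prints the default zone first,
 * then any alternatives; a typical (illustrative) result is:
 *
 *	# cat /sys/devices/system/memory/memory32/valid_zones
 *	Normal Movable
 */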

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size, in bytes, as a hexadecimal string.
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto-online policy: controls whether newly added memory blocks
 * are onlined automatically and, if so, into which zone type.
 */
static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[memhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = memhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	memhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
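
/*
 * For example, to have every hot-added block onlined into ZONE_MOVABLE
 * automatically:
 *
 *	# echo online_movable > /sys/devices/system/memory/auto_online_blocks
 */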

/*
 * Manual memory probing: some architectures allow memory to be hot-added
 * from userspace by writing its physical start address here. Most
 * platforms discover memory through firmware and do not need this.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	/* The address must be aligned to the memory block size. */
	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
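
/*
 * Illustrative use (the address must be block-aligned, per the check
 * above; 0x40000000 is an arbitrary example):
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 */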

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory: used by the memory-failure
 * (hwpoison) handling and test infrastructure.
 */

/* Soft offline a page: migrate its contents away without killing anything. */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, as if a real memory error had been reported. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
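
/*
 * Note that despite the "pfn" variable name, the value written is a
 * physical address; the PAGE_SHIFT above converts it to a PFN. E.g.,
 * assuming 4 KiB pages:
 *
 *	# echo 0x200000 > /sys/devices/system/memory/soft_offline_page
 *
 * soft-offlines the page containing physical address 2 MiB (PFN 0x200).
 */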

/* See phys_device_show(): architectures may override this. */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired; drop it
 * with put_device() when done.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = base_memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - create the sysfs device for a memory block and cache
 * it in the memory_blocks xarray.
 */
static int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(struct memory_block **memory,
			     unsigned long block_id, unsigned long state)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	struct memory_block *mem;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(&mem, base_memory_block_id(base_section_nr),
				 MEM_ONLINE);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
		if (ret)
			break;
	}
	if (ret) {
		/* Roll back: remove the block devices created so far. */
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* Return true if the memory block is offlined, otherwise return false. */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this
 * function runs, memory cannot yet be hotplugged.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/*
	 * Validate the configured block size: it must be a power-of-two
	 * multiple of the section size.
	 */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found during boot
	 * and have been initialized.
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n",
			      __func__, ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
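
/*
 * Caller sketch (hypothetical; "count_offline_cb" is not part of this
 * file): counting the offline blocks in the first 4 GiB could look like:
 *
 *	static int count_offline_cb(struct memory_block *mem, void *arg)
 *	{
 *		if (mem->state == MEM_OFFLINE)
 *			(*(unsigned long *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_offline = 0;
 *	walk_memory_blocks(0, SZ_4G, &nr_offline, count_offline_cb);
 */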

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}