/*
 * linux/mm/memory_hotplug.c - hot-(un)plug of system RAM: adding, onlining,
 * offlining and removing memory at runtime.
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>

#include <asm/tlbflush.h>

#include "internal.h"
#include "shuffle.h"

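/*
 * memmap_on_memory: when enabled (and supported, see
 * mhp_supports_memmap_on_memory()), the vmemmap ("struct page" array) for
 * hotplugged memory is allocated from the hotplugged memory range itself.
 */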
static bool memmap_on_memory __ro_after_init;
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
#endif

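/*
 * online_policy: how to pick a zone when onlining memory without an explicit
 * zone request. "contig-zones" keeps zones contiguous; "auto-movable" onlines
 * to ZONE_MOVABLE as long as the configured MOVABLE:KERNEL ratio allows it.
 */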
enum {
	ONLINE_POLICY_CONTIG_ZONES = 0,
	ONLINE_POLICY_AUTO_MOVABLE,
};

static const char * const online_policy_to_str[] = {
	[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
	[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};

static int set_online_policy(const char *val, const struct kernel_param *kp)
{
	int ret = sysfs_match_string(online_policy_to_str, val);

	if (ret < 0)
		return ret;
	*((int *)kp->arg) = ret;
	return 0;
}

static int get_online_policy(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]);
}

static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
static const struct kernel_param_ops online_policy_ops = {
	.set = set_online_policy,
	.get = get_online_policy,
};
module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
MODULE_PARM_DESC(online_policy,
		"Set the online policy (\"contig-zones\", \"auto-movable\") "
		"Default: \"contig-zones\"");

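/*
 * auto_movable_ratio: maximum MOVABLE:KERNEL memory ratio (in percent) up to
 * which the "auto-movable" policy will keep onlining memory to ZONE_MOVABLE.
 * The default of 301 allows roughly three times the kernel-zone memory to be
 * onlined as ZONE_MOVABLE.
 */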
static unsigned int auto_movable_ratio __read_mostly = 301;
module_param(auto_movable_ratio, uint, 0644);
MODULE_PARM_DESC(auto_movable_ratio,
		"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
		"in percent for \"auto-movable\" online policy. Default: 301");

#ifdef CONFIG_NUMA
static bool auto_movable_numa_aware __read_mostly = true;
module_param(auto_movable_numa_aware, bool, 0644);
MODULE_PARM_DESC(auto_movable_numa_aware,
		"Consider numa node stats in addition to global stats in "
		"\"auto-movable\" online policy. Default: true");
#endif

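/*
 * online_page_callback points at the function used to online individual
 * pages. It defaults to generic_online_page() and can be temporarily
 * overridden via set_online_page_callback() / restore_online_page_callback().
 */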
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);

void get_online_mems(void)
{
	percpu_down_read(&mem_hotplug_lock);
}

void put_online_mems(void)
{
	percpu_up_read(&mem_hotplug_lock);
}

bool movable_node_enabled = false;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
int mhp_default_online_type = MMOP_OFFLINE;
#else
int mhp_default_online_type = MMOP_ONLINE;
#endif

static int __init setup_memhp_default_state(char *str)
{
	const int online_type = mhp_online_type_from_str(str);

	if (online_type >= 0)
		mhp_default_online_type = online_type;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

void mem_hotplug_begin(void)
{
	cpus_read_lock();
	percpu_down_write(&mem_hotplug_lock);
}

void mem_hotplug_done(void)
{
	percpu_up_write(&mem_hotplug_lock);
	cpus_read_unlock();
}

u64 max_mem_size = U64_MAX;

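/* Add this memory range to the iomem resource tree. */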
static struct resource *register_memory_resource(u64 start, u64 size,
						 const char *resource_name)
{
	struct resource *res;
	unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	if (strcmp(resource_name, "System RAM"))
		flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;

	if (!mhp_range_allowed(start, size, true))
		return ERR_PTR(-E2BIG);

	/*
	 * Make sure value parsed from 'mem=' only restricts memory adding
	 * while booting, so that memory hotplug won't be impacted. Please
	 * refer to the documentation of 'mem=' for more details.
	 */
	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
		return ERR_PTR(-E2BIG);

	/*
	 * Request ownership of the new memory range. A driver-managed
	 * resource is marked accordingly so in-kernel users can tell it
	 * apart from ordinary "System RAM".
	 */
	res = __request_region(&iomem_resource, start, size,
			       resource_name, flags);

	if (!res) {
		pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				start, start + size);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}

static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
			  const char *reason)
{
	/*
	 * Hot-add/remove must at least be sub-section aligned; without
	 * SPARSEMEM_VMEMMAP, whole sections are the smallest granularity
	 * we can operate on.
	 */
	unsigned long min_align;

	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		min_align = PAGES_PER_SUBSECTION;
	else
		min_align = PAGES_PER_SECTION;
	if (!IS_ALIGNED(pfn, min_align)
			|| !IS_ALIGNED(nr_pages, min_align)) {
		WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
		     reason, pfn, pfn + nr_pages - 1);
		return -EINVAL;
	}
	return 0;
}

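/*
 * pfn_to_online_page - return the "struct page" for an online pfn, or NULL
 * if the pfn lies in a hole, in an offline section, or is only covered by a
 * ZONE_DEVICE pagemap.
 */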
struct page *pfn_to_online_page(unsigned long pfn)
{
	unsigned long nr = pfn_to_section_nr(pfn);
	struct dev_pagemap *pgmap;
	struct mem_section *ms;

	if (nr >= NR_MEM_SECTIONS)
		return NULL;

	ms = __nr_to_section(nr);
	if (!online_section(ms))
		return NULL;

	/*
	 * With HAVE_ARCH_PFN_VALID, pfn_valid() also catches holes within
	 * the section.
	 */
	if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
		return NULL;

	if (!pfn_section_valid(ms, pfn))
		return NULL;

	if (!online_device_section(ms))
		return pfn_to_page(pfn);

	/*
	 * Slowpath: the section mixes online memory with ZONE_DEVICE
	 * memory. Make sure this pfn is not covered by a ZONE_DEVICE
	 * pagemap.
	 */
	pgmap = get_dev_pagemap(pfn, NULL);
	put_dev_pagemap(pgmap);

	/* The presence of a pgmap indicates a ZONE_DEVICE (offline) page. */
	if (pgmap)
		return NULL;

	return pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);

int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
		struct mhp_params *params)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	int err;
	struct vmem_altmap *altmap = params->altmap;

	if (WARN_ON_ONCE(!params->pgprot.pgprot))
		return -EINVAL;

	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));

	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			return -EINVAL;
		}
		altmap->alloc = 0;
	}

	err = check_pfn_span(pfn, nr_pages, "add");
	if (err)
		return err;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
		if (err)
			break;
		cond_resched();
	}
	vmemmap_populate_print_last();
	return err;
}

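/* find the smallest valid pfn in the range [start_pfn, end_pfn) */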
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(start_pfn)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(pfn)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long pfn;
	int nid = zone_to_nid(zone);

	if (zone->zone_start_pfn == start_pfn) {
		/*
		 * The removed range starts the zone: find the new smallest
		 * valid pfn and shrink zone_start_pfn and spanned_pages
		 * accordingly, or mark the zone empty if nothing remains.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn(zone));
		if (pfn) {
			zone->spanned_pages = zone_end_pfn(zone) - pfn;
			zone->zone_start_pfn = pfn;
		} else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	} else if (zone_end_pfn(zone) == end_pfn) {
		/*
		 * The removed range ends the zone: find the new biggest
		 * valid pfn and shrink spanned_pages accordingly, or mark
		 * the zone empty if nothing remains.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
		else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	}
}

static void update_pgdat_span(struct pglist_data *pgdat)
{
	unsigned long node_start_pfn = 0, node_end_pfn = 0;
	struct zone *zone;

	for (zone = pgdat->node_zones;
	     zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
		unsigned long end_pfn = zone_end_pfn(zone);

		/* Skip zones that do not span any pages. */
		if (!zone->spanned_pages)
			continue;
		if (!node_end_pfn) {
			node_start_pfn = zone->zone_start_pfn;
			node_end_pfn = end_pfn;
			continue;
		}

		if (end_pfn > node_end_pfn)
			node_end_pfn = end_pfn;
		if (zone->zone_start_pfn < node_start_pfn)
			node_start_pfn = zone->zone_start_pfn;
	}

	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}

void __ref remove_pfn_range_from_zone(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long pfn, cur_nr_pages;

	/* Poison struct pages because they are now uninitialized again. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages =
			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
		page_init_poison(pfn_to_page(pfn),
				 sizeof(struct page) * cur_nr_pages);
	}

	/*
	 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
	 * we will not try to shrink the zones - which is okay as
	 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
	 */
	if (zone_is_zone_device(zone))
		return;

	clear_zone_contiguous(zone);

	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	update_pgdat_span(pgdat);

	set_zone_contiguous(zone);
}

static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	if (WARN_ON_ONCE(!valid_section(ms)))
		return;

	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}

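/**
 * __remove_pages - remove sections of pages
 * @pfn: starting pageframe (must be section aligned)
 * @nr_pages: number of pages to remove (must be a multiple of the section size)
 * @altmap: alternative device page map, or %NULL if the default memmap was used
 *
 * Remove the section mappings and memmap for the given pfn range. Pages in
 * the range must already have been offlined / removed from their zone.
 */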
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	unsigned long map_offset = 0;

	map_offset = vmem_altmap_offset(altmap);

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, cur_nr_pages, map_offset, altmap);
		map_offset = 0;
	}
}

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void generic_online_page(struct page *page, unsigned int order)
{
	/*
	 * Make sure the pages are mapped in the direct map (relevant with
	 * debug_pagealloc) before handing them to the buddy allocator and
	 * accounting them as available RAM.
	 */
	debug_pagealloc_map_pages(page, 1 << order);
	__free_pages_core(page, order);
	totalram_pages_add(1UL << order);
}
EXPORT_SYMBOL_GPL(generic_online_page);

static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;

	/*
	 * Online the pages in MAX_ORDER - 1 aligned chunks. The callback
	 * might decide to not expose all pages to the buddy right away
	 * (e.g., expose them later); either way, all pages in the range are
	 * accounted as online afterwards.
	 */
	for (pfn = start_pfn; pfn < end_pfn;) {
		int order = min(MAX_ORDER - 1UL, __ffs(pfn));

		(*online_page_callback)(pfn_to_page(pfn), order);
		pfn += (1UL << order);
	}

	/* mark all involved sections as online */
	online_mem_sections(start_pfn, end_pfn);
}

static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;

	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid >= 0)
		node_set_state(node, N_MEMORY);
}

static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages)
{
	unsigned long old_end_pfn = zone_end_pfn(zone);

	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
}

static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
		unsigned long nr_pages)
{
	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}

static void section_taint_zone_device(unsigned long pfn)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
}

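/*
 * Associate the pfn range with the given zone, initializing the memmaps and
 * resizing the pgdat/zone data so that the zone spans the added pages. After
 * this call the pages are still offline; callers online them separately (or,
 * for ZONE_DEVICE memory, initialize them via the pgmap code).
 */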
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
				  unsigned long nr_pages,
				  struct vmem_altmap *altmap, int migratetype)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nid = pgdat->node_id;

	clear_zone_contiguous(zone);

	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);

	/*
	 * Subsection population requires care in pfn_to_online_page().
	 * Set the taint to enable the slow path detection of ZONE_DEVICE
	 * pages in an otherwise ZONE_{NORMAL,MOVABLE} section.
	 */
	if (zone_is_zone_device(zone)) {
		if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
			section_taint_zone_device(start_pfn);
		if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
			section_taint_zone_device(start_pfn + nr_pages);
	}

	/*
	 * Initialize the memmap for the range: the pages are associated with
	 * the zone but remain reserved until they are onlined.
	 */
	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
			 MEMINIT_HOTPLUG, altmap, migratetype);

	set_zone_contiguous(zone);
}

struct auto_movable_stats {
	unsigned long kernel_early_pages;
	unsigned long movable_pages;
};

static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
					    struct zone *zone)
{
	if (zone_idx(zone) == ZONE_MOVABLE) {
		stats->movable_pages += zone->present_pages;
	} else {
		stats->kernel_early_pages += zone->present_early_pages;
#ifdef CONFIG_CMA
		/*
		 * CMA pages (never on hotplugged memory) behave like
		 * ZONE_MOVABLE.
		 */
		stats->movable_pages += zone->cma_pages;
		stats->kernel_early_pages -= zone->cma_pages;
#endif
	}
}
struct auto_movable_group_stats {
	unsigned long movable_pages;
	unsigned long req_kernel_early_pages;
};

static int auto_movable_stats_account_group(struct memory_group *group,
					   void *arg)
{
	const int ratio = READ_ONCE(auto_movable_ratio);
	struct auto_movable_group_stats *stats = arg;
	long pages;

	/*
	 * A ratio of 0 disables the auto-movable policy; just avoid the
	 * division by zero below.
	 */
	if (!ratio)
		return 0;

	/*
	 * Calculate how many early kernel pages this group requires to
	 * satisfy the configured zone ratio.
	 */
	pages = group->present_movable_pages * 100 / ratio;
	pages -= group->present_kernel_pages;

	if (pages > 0)
		stats->req_kernel_early_pages += pages;
	stats->movable_pages += group->present_movable_pages;
	return 0;
}

static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
					    unsigned long nr_pages)
{
	unsigned long kernel_early_pages, movable_pages;
	struct auto_movable_group_stats group_stats = {};
	struct auto_movable_stats stats = {};
	pg_data_t *pgdat = NODE_DATA(nid);
	struct zone *zone;
	int i;

	/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
	if (nid == NUMA_NO_NODE) {
		for_each_populated_zone(zone)
			auto_movable_stats_account_zone(&stats, zone);
	} else {
		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone = pgdat->node_zones + i;
			if (populated_zone(zone))
				auto_movable_stats_account_zone(&stats, zone);
		}
	}

	kernel_early_pages = stats.kernel_early_pages;
	movable_pages = stats.movable_pages;

	/*
	 * Account the kernel pages other dynamic memory groups still
	 * require to satisfy their ratio, and ignore their movable memory
	 * when applying the ratio to us.
	 */
	walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
				   group, &group_stats);
	if (kernel_early_pages <= group_stats.req_kernel_early_pages)
		return false;
	kernel_early_pages -= group_stats.req_kernel_early_pages;
	movable_pages -= group_stats.movable_pages;

	if (group && group->is_dynamic)
		kernel_early_pages += group->present_kernel_pages;

	/*
	 * Test if we could online the given nr_pages as MOVABLE without
	 * exceeding the configured MOVABLE : KERNEL ratio.
	 */
	movable_pages += nr_pages;
	return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to the ZONE_NORMAL.
 */
static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	int zid;

	for (zid = 0; zid < ZONE_NORMAL; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_intersects(zone, start_pfn, nr_pages))
			return zone;
	}

	return &pgdat->node_zones[ZONE_NORMAL];
}

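/*
 * auto_movable_zone_for_pfn() implements the "auto-movable" online policy:
 * online memory to ZONE_MOVABLE as long as the configured MOVABLE : KERNEL
 * ratio (optionally also checked per NUMA node) still holds. For static
 * memory groups, any kernel memory in the group forces the kernel zone. For
 * dynamic memory groups, decisions are made in unit_pages granularity so
 * that all units of a device end up in the same zone.
 */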
static struct zone *auto_movable_zone_for_pfn(int nid,
					      struct memory_group *group,
					      unsigned long pfn,
					      unsigned long nr_pages)
{
	unsigned long online_pages = 0, max_pages, end_pfn;
	struct page *page;

	if (!auto_movable_ratio)
		goto kernel_zone;

	if (group && !group->is_dynamic) {
		max_pages = group->s.max_pages;
		online_pages = group->present_movable_pages;

		/* If anything is !MOVABLE, online the rest !MOVABLE. */
		if (group->present_kernel_pages)
			goto kernel_zone;
	} else if (!group || group->d.unit_pages == nr_pages) {
		max_pages = nr_pages;
	} else {
		max_pages = group->d.unit_pages;
		/*
		 * Take a look at all online sections in the current unit.
		 * We can safely assume that all pages within a section
		 * belong to the same zone, because dynamic memory groups
		 * only deal with hotplugged memory.
		 */
		pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
		end_pfn = pfn + group->d.unit_pages;
		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
			page = pfn_to_online_page(pfn);
			if (!page)
				continue;
			/* If anything is !MOVABLE, online the rest !MOVABLE. */
			if (page_zonenum(page) != ZONE_MOVABLE)
				goto kernel_zone;
			online_pages += PAGES_PER_SECTION;
		}
	}

	/*
	 * Online MOVABLE only if we could *currently* online all remaining
	 * parts MOVABLE as well; we expect to online them immediately next.
	 */
	nr_pages = max_pages - online_pages;
	if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
		goto kernel_zone;

#ifdef CONFIG_NUMA
	if (auto_movable_numa_aware &&
	    !auto_movable_can_online_movable(nid, group, nr_pages))
		goto kernel_zone;
#endif

	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
kernel_zone:
	return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
}

static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
			nr_pages);
	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);

	/*
	 * We inherit the existing zone in a simple case where zones do not
	 * overlap in the given range.
	 */
	if (in_kernel ^ in_movable)
		return (in_kernel) ? kernel_zone : movable_zone;

	/*
	 * If the range doesn't belong to any zone or two zones overlap in the
	 * given range then we use movable zone only if movable_node is
	 * enabled, because we always online to a kernel zone by default.
	 */
	return movable_node_enabled ? movable_zone : kernel_zone;
}

struct zone *zone_for_pfn_range(int online_type, int nid,
		struct memory_group *group, unsigned long start_pfn,
		unsigned long nr_pages)
{
	if (online_type == MMOP_ONLINE_KERNEL)
		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);

	if (online_type == MMOP_ONLINE_MOVABLE)
		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];

	if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
		return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);

	return default_zone_for_pfn(nid, start_pfn, nr_pages);
}

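/*
 * This function should only be called by memory_block_{online,offline},
 * and {online,offline}_pages.
 */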
void adjust_present_page_count(struct page *page, struct memory_group *group,
			       long nr_pages)
{
	struct zone *zone = page_zone(page);
	const bool movable = zone_idx(zone) == ZONE_MOVABLE;

	/*
	 * We only support onlining/offlining/adding/removing of complete
	 * memory blocks; therefore, either all is either early or hotplugged.
	 */
	if (early_section(__pfn_to_section(page_to_pfn(page))))
		zone->present_early_pages += nr_pages;
	zone->present_pages += nr_pages;
	zone->zone_pgdat->node_present_pages += nr_pages;

	if (group && movable)
		group->present_movable_pages += nr_pages;
	else if (group && !movable)
		group->present_kernel_pages += nr_pages;
}

int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
			      struct zone *zone)
{
	unsigned long end_pfn = pfn + nr_pages;
	int ret;

	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
	if (ret)
		return ret;

	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);

	/*
	 * It might be that the vmemmap_pages fully span sections. If that is
	 * the case, mark those sections online here as otherwise they will be
	 * left offline.
	 */
	if (nr_pages >= PAGES_PER_SECTION)
		online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));

	return ret;
}

void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long end_pfn = pfn + nr_pages;

	/*
	 * It might be that the vmemmap_pages fully span sections. If that is
	 * the case, mark those sections offline here as otherwise they will be
	 * left online.
	 */
	if (nr_pages >= PAGES_PER_SECTION)
		offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));

	/*
	 * The pages associated with this vmemmap have been offlined, so
	 * we can reset its state here.
	 */
	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
}

int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
		       struct zone *zone, struct memory_group *group)
{
	unsigned long flags;
	int need_zonelists_rebuild = 0;
	const int nid = zone_to_nid(zone);
	int ret;
	struct memory_notify arg;

	/*
	 * {on,off}lining is constrained to full memory sections (or more
	 * precisely to memory blocks from the user space POV).
	 * memmap_on_memory is an exception because it reserves the initial
	 * part of the physical memory space for vmemmaps. That space is
	 * pageblock aligned.
	 */
	if (WARN_ON_ONCE(!nr_pages ||
			 !IS_ALIGNED(pfn, pageblock_nr_pages) ||
			 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
		return -EINVAL;

	mem_hotplug_begin();

	/* associate pfn range with the zone we are about to online */
	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * Fixup the number of isolated pageblocks before marking the sections
	 * online, such that undo_isolate_page_range() works correctly.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
	spin_unlock_irqrestore(&zone->lock, flags);

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		setup_zone_pageset(zone);
	}

	online_pages_range(pfn, nr_pages);
	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);

	node_states_set_node(nid, &arg);
	if (need_zonelists_rebuild)
		build_all_zonelists(NULL);

	/* Basic onlining is complete, allow allocation of onlined pages. */
	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);

	/*
	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed
	 * to the tail of the freelist when undoing isolation). Shuffle the
	 * whole zone to make sure the just onlined pages are properly
	 * distributed across the freelists.
	 */
	shuffle_zone(zone);

	/* reinitialise watermarks and update pcp limits */
	init_per_zone_wmark_min();

	kswapd_run(nid);
	kcompactd_run(nid);

	writeback_set_ratelimit();

	memory_notify(MEM_ONLINE, &arg);
	mem_hotplug_done();
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	remove_pfn_range_from_zone(zone, pfn, nr_pages);
	mem_hotplug_done();
	return ret;
}

static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
	struct pglist_data *pgdat;

	/*
	 * NODE_DATA is preallocated (free_area_init()), but its internal
	 * state is not fully initialized for an offline node; reinitialize
	 * it here for the node coming online.
	 */
	pgdat = NODE_DATA(nid);

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_core_hotplug(pgdat);

	/*
	 * The node we allocated has no zone fallback lists. For avoiding
	 * to access not-initialized zonelist, build here.
	 */
	build_all_zonelists(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_managed_pages(pgdat);
	reset_node_present_pages(pgdat);

	return pgdat;
}

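/*
 * __try_online_node - online a node if offlined
 * @nid: the node ID
 * @set_node_online: Whether we want to online the node
 *
 * Returns:
 * 1 -> the node has been initialized (and, if requested, onlined)
 * 0 -> the node was already online
 * -ENOMEM -> the node could not be allocated
 */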
static int __try_online_node(int nid, bool set_node_online)
{
	pg_data_t *pgdat;
	int ret = 1;

	if (node_online(nid))
		return 0;

	pgdat = hotadd_init_pgdat(nid);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}

	if (set_node_online) {
		node_set_online(nid);
		ret = register_one_node(nid);
		BUG_ON(ret);
	}
out:
	return ret;
}

/*
 * Users of this function always want to online/register the node
 */
int try_online_node(int nid)
{
	int ret;

	mem_hotplug_begin();
	ret = __try_online_node(nid, true);
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	/* memory range must be block size aligned */
	if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
	    !IS_ALIGNED(size, memory_block_size_bytes())) {
		pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
		       memory_block_size_bytes(), start, size);
		return -EINVAL;
	}

	return 0;
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	mem->online_type = mhp_default_online_type;
	return device_online(&mem->dev);
}

bool mhp_supports_memmap_on_memory(unsigned long size)
{
	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
	unsigned long remaining_size = size - vmemmap_size;

	/*
	 * Besides having arch support and the feature enabled at runtime, a
	 * few more assumptions have to hold true:
	 *
	 * a) The hotplugged range spans exactly one memory block.
	 * b) The vmemmap pages span complete PMDs, so the altmap-allocated
	 *    memmap can be mapped using huge pages.
	 * c) The remaining memory (after subtracting the vmemmap pages) is
	 *    pageblock aligned, so the vmemmap pages and the usable memory
	 *    never share a pageblock.
	 * d) The feature is incompatible with freeing of "struct page"
	 *    backing memory of hugetlb pages.
	 */
	return memmap_on_memory &&
	       !hugetlb_free_vmemmap_enabled() &&
	       IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
	       size == memory_block_size_bytes() &&
	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
}

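/*
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */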
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
	struct vmem_altmap mhp_altmap = {};
	struct memory_group *group = NULL;
	u64 start, size;
	bool new_node = false;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	if (mhp_flags & MHP_NID_IS_MGID) {
		group = memory_group_find_by_id(nid);
		if (!group)
			return -EINVAL;
		nid = group->nid;
	}

	if (!node_possible(nid)) {
		WARN(1, "node %d was absent from the node_possible_map\n", nid);
		return -EINVAL;
	}

	mem_hotplug_begin();

	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
		if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
			memblock_flags = MEMBLOCK_DRIVER_MANAGED;
		ret = memblock_add_node(start, size, nid, memblock_flags);
		if (ret)
			goto error_mem_hotplug_end;
	}

	ret = __try_online_node(nid, false);
	if (ret < 0)
		goto error;
	new_node = ret;

	/*
	 * Self hosted memmap array: allocate the vmemmap from the hotplugged
	 * memory range itself.
	 */
	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
		if (!mhp_supports_memmap_on_memory(size)) {
			ret = -EINVAL;
			goto error;
		}
		mhp_altmap.free = PHYS_PFN(size);
		mhp_altmap.base_pfn = PHYS_PFN(start);
		params.altmap = &mhp_altmap;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &params);
	if (ret < 0)
		goto error;

	/* create memory block devices after memory was added */
	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
					  group);
	if (ret) {
		arch_remove_memory(start, size, NULL);
		goto error;
	}

	if (new_node) {
		/*
		 * If sysfs file of new node can't be created, cpu on the node
		 * can't be hot-added. There is no rollback way now.
		 * So, check by BUG_ON() to catch it reluctantly..
		 * We online node here. We can't roll back from here.
		 */
		node_set_online(nid);
		ret = __register_one_node(nid);
		BUG_ON(ret);
	}

	register_memory_blocks_under_node(nid, PFN_DOWN(start),
					  PFN_UP(start + size - 1),
					  MEMINIT_HOTPLUG);

	/* create new memmap entry */
	if (!strcmp(res->name, "System RAM"))
		firmware_map_add_hotplug(start, start + size, "System RAM");

	/* device_online() will take the lock when calling online_pages() */
	mem_hotplug_done();

	/*
	 * In case we're allowed to merge the resource, flag it and trigger
	 * merging now that adding succeeded.
	 */
	if (mhp_flags & MHP_MERGE_RESOURCE)
		merge_system_ram_resource(res);

	/* online pages if requested */
	if (mhp_default_online_type != MMOP_OFFLINE)
		walk_memory_blocks(start, size, NULL, online_memory_block);

	return ret;
error:
	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
		memblock_remove(start, size);
error_mem_hotplug_end:
	mem_hotplug_done();
	return ret;
}

/* requires device_hotplug_lock, see add_memory_resource() */
int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size, "System RAM");
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res, mhp_flags);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}

int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
	int rc;

	lock_device_hotplug();
	rc = __add_memory(nid, start, size, mhp_flags);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(add_memory);

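/*
 * Add special, driver-managed memory to the system as system RAM. Such
 * memory is not exposed via the raw firmware-provided memmap as system RAM;
 * instead it is detected and added by a driver.
 *
 * No entries are created in /sys/firmware/memmap ("raw firmware-provided
 * memory map") for this memory, and the created memory resource is flagged
 * IORESOURCE_SYSRAM_DRIVER_MANAGED so in-kernel users can special-case it.
 *
 * The resource_name (visible via /proc/iomem) has to have the format
 * "System RAM ($DRIVER)".
 */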
int add_memory_driver_managed(int nid, u64 start, u64 size,
			      const char *resource_name, mhp_t mhp_flags)
{
	struct resource *res;
	int rc;

	if (!resource_name ||
	    strstr(resource_name, "System RAM (") != resource_name ||
	    resource_name[strlen(resource_name) - 1] != ')')
		return -EINVAL;

	lock_device_hotplug();

	res = register_memory_resource(start, size, resource_name);
	if (IS_ERR(res)) {
		rc = PTR_ERR(res);
		goto out_unlock;
	}

	rc = add_memory_resource(nid, res, mhp_flags);
	if (rc < 0)
		release_memory_resource(res);

out_unlock:
	unlock_device_hotplug();
	return rc;
}
EXPORT_SYMBOL_GPL(add_memory_driver_managed);

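/*
 * Platforms should define arch_get_mappable_range() to provide the maximum
 * possible addressable physical memory range for which a linear mapping can
 * be created. mhp_get_pluggable_range() uses this range to restrict memory
 * hotplug requests that need a linear mapping. The weak default below spans
 * the entire physical address space.
 */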
struct range __weak arch_get_mappable_range(void)
{
	struct range mhp_range = {
		.start = 0UL,
		.end = -1ULL,
	};
	return mhp_range;
}

struct range mhp_get_pluggable_range(bool need_mapping)
{
	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
	struct range mhp_range;

	if (need_mapping) {
		mhp_range = arch_get_mappable_range();
		if (mhp_range.start > max_phys) {
			mhp_range.start = 0;
			mhp_range.end = 0;
		}
		mhp_range.end = min_t(u64, mhp_range.end, max_phys);
	} else {
		mhp_range.start = 0;
		mhp_range.end = max_phys;
	}
	return mhp_range;
}
EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);

bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
{
	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
	u64 end = start + size;

	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
		return true;

	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
		start, end, mhp_range.start, mhp_range.end);
	return false;
}

#ifdef CONFIG_MEMORY_HOTREMOVE

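/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * hugetlb pages, non-LRU movable pages). Will skip over most unmovable pages
 * (esp., pages that can be skipped when offlining), but bail out on
 * definitely unmovable pages.
 *
 * Returns:
 *	0 in case a movable page is found and movable_pfn was updated.
 *	-ENOENT in case no movable page was found.
 *	-EBUSY in case a definitely unmovable page was found.
 */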
static int scan_movable_pages(unsigned long start, unsigned long end,
			      unsigned long *movable_pfn)
{
	unsigned long pfn;

	for (pfn = start; pfn < end; pfn++) {
		struct page *page, *head;
		unsigned long skip;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (PageLRU(page))
			goto found;
		if (__PageMovable(page))
			goto found;

		/*
		 * PageOffline() pages that are not marked __PageMovable() and
		 * have a reference count > 0 (after MEM_GOING_OFFLINE) are
		 * definitely unmovable. If their reference count would be 0,
		 * they could at least be skipped when offlining memory.
		 */
		if (PageOffline(page) && page_count(page))
			return -EBUSY;

		if (!PageHuge(page))
			continue;
		head = compound_head(page);
		/*
		 * This test is racy as we hold no reference or lock.  The
		 * hugetlb page could have been freed and head may no longer
		 * be a hugetlb page by the time of the following check.  In
		 * such unlikely cases false positives and negatives are
		 * possible; calling code must deal with these scenarios.
		 */
		if (HPageMigratable(head))
			goto found;
		skip = compound_nr(head) - (page - head);
		pfn += skip - 1;
	}
	return -ENOENT;
found:
	*movable_pfn = pfn;
	return 0;
}

static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page, *head;
	int ret = 0;
	LIST_HEAD(source);
	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct folio *folio;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		folio = page_folio(page);
		head = &folio->page;

		if (PageHuge(page)) {
			pfn = page_to_pfn(head) + compound_nr(head) - 1;
			isolate_huge_page(head, &source);
			continue;
		} else if (PageTransHuge(page))
			pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;

		/*
		 * HWPoison pages have elevated reference counts so the
		 * migration would fail on them. It also doesn't make any
		 * sense to migrate them in the first place. Still try to
		 * unmap such a page in case it is still mapped (keep the
		 * unmap as the catch all safety net).
		 */
		if (PageHWPoison(page)) {
			if (WARN_ON(folio_test_lru(folio)))
				folio_isolate_lru(folio);
			if (folio_mapped(folio))
				try_to_unmap(folio, TTU_IGNORE_MLOCK);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can deal with pages on
		 * LRU and non-lru movable pages.
		 */
		if (PageLRU(page))
			ret = isolate_lru_page(page);
		else
			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
		if (!ret) {
			list_add_tail(&page->lru, &source);
			if (!__PageMovable(page))
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_lru(page));

		} else {
			if (__ratelimit(&migrate_rs)) {
				pr_warn("failed to isolate pfn %lx\n", pfn);
				dump_page(page, "isolation failed");
			}
		}
		put_page(page);
	}
	if (!list_empty(&source)) {
		nodemask_t nmask = node_states[N_MEMORY];
		struct migration_target_control mtc = {
			.nmask = &nmask,
			.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
		};

		/*
		 * We have checked that migration range is on a single zone so
		 * we can use the nid of the first page to all the others.
		 */
		mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));

		/*
		 * try to allocate from a different node but reuse this node
		 * if there are no other online nodes to be used (e.g. we are
		 * offlining a part of the only existing node)
		 */
		node_clear(mtc.nid, nmask);
		if (nodes_empty(nmask))
			node_set(mtc.nid, nmask);
		ret = migrate_pages(&source, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
		if (ret) {
			list_for_each_entry(page, &source, lru) {
				if (__ratelimit(&migrate_rs)) {
					pr_warn("migrating pfn %lx failed ret:%d\n",
						page_to_pfn(page), ret);
					dump_page(page, "migration failure");
				}
			}
			putback_movable_pages(&source);
		}
	}

	return ret;
}

static int __init cmdline_parse_movable_node(char *p)
{
	movable_node_enabled = true;
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);

/*
 * Check which of the node_states will be changed when we offline this range.
 */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offline is within the range
	 * [0..ZONE_NORMAL], and it is the last present memory there,
	 * the zones in that range will become empty after the offlining,
	 * thus we can determine that we need to clear the node from
	 * node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);

	/*
	 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
	 * does not apply as we don't support 32bit.
	 * Here we count the possible pages from ZONE_MOVABLE.
	 * If after having accounted all the pages, we see that the nr_pages
	 * to be offlined is over or equal to the accounted pages,
	 * we know that the node will become empty, and so, we can clear
	 * it for N_MEMORY as well.
	 */
	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;

	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid >= 0)
		node_clear_state(node, N_MEMORY);
}

static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *nr_system_ram_pages = data;

	*nr_system_ram_pages += nr_pages;
	return 0;
}

int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
			struct zone *zone, struct memory_group *group)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn, system_ram_pages = 0;
	const int node = zone_to_nid(zone);
	unsigned long flags;
	struct memory_notify arg;
	char *reason;
	int ret;

	/*
	 * {on,off}lining is constrained to full memory sections (or more
	 * precisely to memory blocks from the user space POV).
	 * memmap_on_memory is an exception because it reserves the initial
	 * part of the physical memory space for vmemmaps. That space is
	 * pageblock aligned.
	 */
	if (WARN_ON_ONCE(!nr_pages ||
			 !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
			 !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
		return -EINVAL;

	mem_hotplug_begin();

	/*
	 * Don't allow to offline memory blocks that contain holes.
	 * Consequently, memory blocks with holes can never get onlined
	 * via the hotplug path - online_pages() - as hotplugged memory has
	 * no holes. This way, we don't have to worry about memory holes,
	 * don't need pfn_valid() checks, and avoid special handling for
	 * memory blocks with holes.
	 */
	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
			      count_system_ram_pages_cb);
	if (system_ram_pages != nr_pages) {
		ret = -EINVAL;
		reason = "memory holes";
		goto failed_removal;
	}

	/*
	 * We only support offlining of memory blocks managed by a single
	 * zone, checked by calling code. This is just a sanity check that we
	 * might want to remove in the future.
	 */
	if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone ||
			 page_zone(pfn_to_page(end_pfn - 1)) != zone)) {
		ret = -EINVAL;
		reason = "multizone range";
		goto failed_removal;
	}

	/*
	 * Disable pcplists so that page isolation cannot race with freeing
	 * in a way that pages from an isolated pageblock are left on pcplists.
	 */
	zone_pcp_disable(zone);
	lru_cache_disable();

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE,
				       MEMORY_OFFLINE | REPORT_FAILURE);
	if (ret) {
		reason = "failure to isolate range";
		goto failed_removal_pcplists_disabled;
	}

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		reason = "notifier failure";
		goto failed_removal_isolated;
	}

	do {
		pfn = start_pfn;
		do {
			if (signal_pending(current)) {
				ret = -EINTR;
				reason = "signal backoff";
				goto failed_removal_isolated;
			}

			cond_resched();

			ret = scan_movable_pages(pfn, end_pfn, &pfn);
			if (!ret) {
				/*
				 * TODO: fatal migration failures should bail
				 * out
				 */
				do_migrate_range(pfn, end_pfn);
			}
		} while (!ret);

		if (ret != -ENOENT) {
			reason = "unmovable page";
			goto failed_removal_isolated;
		}

		/*
		 * Dissolve free hugepages in the memory block before doing
		 * offlining actually in order to make hugetlbfs's object
		 * counting consistent.
		 */
		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
		if (ret) {
			reason = "failure to dissolve huge pages";
			goto failed_removal_isolated;
		}

		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);

	} while (ret);

	/* Mark all sections offline and remove free pages from the buddy. */
	__offline_isolated_pages(start_pfn, end_pfn);
	pr_debug("Offlined Pages %ld\n", nr_pages);

	/*
	 * The memory sections are marked offline, and the pageblock flags
	 * effectively stale; nobody should be touching them. Fixup the number
	 * of isolated pageblocks, memory onlining will properly revert this.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
	spin_unlock_irqrestore(&zone->lock, flags);

	lru_cache_enable();
	zone_pcp_enable(zone);

	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);

	/* reinitialise watermarks and update pcp limits */
	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		build_all_zonelists(NULL);
	}

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
	mem_hotplug_done();
	return 0;

failed_removal_isolated:
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal_pcplists_disabled:
	lru_cache_enable();
	zone_pcp_enable(zone);
failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
		 reason);
	mem_hotplug_done();
	return ret;
}

static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);
	int *nid = arg;

	*nid = mem->nid;
	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = beginpa + memory_block_size_bytes() - 1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);

		return -EBUSY;
	}
	return 0;
}

static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
{
	/*
	 * Number of vmemmap pages (memmap_on_memory) allocated from this
	 * memory block, if any.
	 */
	return mem->nr_vmemmap_pages;
}

static int check_cpu_on_node(int nid)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == nid)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
{
	int nid = *(int *)arg;

	/*
	 * If a memory block belongs to multiple nodes, the stored nid is not
	 * reliable. However, such blocks are always online (e.g., cannot get
	 * offlined) and, therefore, are still spanned by the node.
	 */
	return mem->nid == nid ? -EEXIST : 0;
}

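/**
 * try_offline_node
 * @nid: the node ID
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */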
void try_offline_node(int nid)
{
	int rc;

	/*
	 * If the node still spans pages (especially ZONE_DEVICE), don't
	 * even try to offline it.
	 */
	if (node_spanned_pages(nid))
		return;

	/*
	 * Especially offline memory blocks might not be spanned by the
	 * node. They will get spanned by the node once they get onlined.
	 * However, they link to the node in sysfs and can get onlined later.
	 */
	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
	if (rc)
		return;

	if (check_cpu_on_node(nid))
		return;

	/*
	 * all memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);
}
EXPORT_SYMBOL(try_offline_node);

static int __ref try_remove_memory(u64 start, u64 size)
{
	struct vmem_altmap mhp_altmap = {};
	struct vmem_altmap *altmap = NULL;
	unsigned long nr_vmemmap_pages;
	int rc = 0, nid = NUMA_NO_NODE;

	BUG_ON(check_hotplug_memory_range(start, size));

	/*
	 * All memory blocks must be offlined before removing memory.  Check
	 * whether all memory blocks in question are offline and return error
	 * if this is not the case.
	 *
	 * While at it, determine the nid. Note that if we'd have mixed nodes,
	 * we'd only try to offline the last determined one -- which is good
	 * enough for the cases we care about.
	 */
	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
	if (rc)
		return rc;

	/*
	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
	 * the same granularity it was added - a single memory block.
	 */
	if (memmap_on_memory) {
		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
						      get_nr_vmemmap_pages_cb);
		if (nr_vmemmap_pages) {
			if (size != memory_block_size_bytes()) {
				pr_warn("Refuse to remove %#llx - %#llx,"
					"wrong granularity\n",
					start, start + size);
				return -EINVAL;
			}

			/*
			 * Pass an altmap describing the vmemmap allocation,
			 * so the arch code frees the memmap pages correctly
			 * when tearing down the range.
			 */
			mhp_altmap.alloc = nr_vmemmap_pages;
			altmap = &mhp_altmap;
		}
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	/*
	 * Memory block device removal under the device_hotplug_lock is
	 * a barrier against racing online attempts.
	 */
	remove_memory_block_devices(start, size);

	mem_hotplug_begin();

	arch_remove_memory(start, size, altmap);

	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
		memblock_phys_free(start, size);
		memblock_remove(start, size);
	}

	release_mem_region_adjustable(start, size);

	if (nid != NUMA_NO_NODE)
		try_offline_node(nid);

	mem_hotplug_done();
	return 0;
}

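/**
 * __remove_memory - Remove memory if every memory block is offline
 * @start: physical address of the region to remove
 * @size: size of the region to remove
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */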
void __remove_memory(u64 start, u64 size)
{
	/*
	 * trigger BUG() if some memory is not offlined prior to calling this
	 * function
	 */
	if (try_remove_memory(start, size))
		BUG();
}

/*
 * Remove memory if every memory block is offline, otherwise return -EBUSY
 * because some memory is still in use.
 */
int remove_memory(u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();
	rc = try_remove_memory(start, size);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);

static int try_offline_memory_block(struct memory_block *mem, void *arg)
{
	uint8_t online_type = MMOP_ONLINE_KERNEL;
	uint8_t **online_types = arg;
	struct page *page;
	int rc;

	/*
	 * Sense the online_type via the zone of the memory block. Offlining
	 * with multiple zones within one memory block will be rejected by
	 * the offlining code ... so we don't care about that.
	 */
	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
		online_type = MMOP_ONLINE_MOVABLE;

	rc = device_offline(&mem->dev);
	/*
	 * Default is MMOP_OFFLINE - change it only if offlining succeeded,
	 * so try_reonline_memory_block() can do the right thing.
	 */
	if (!rc)
		**online_types = online_type;

	(*online_types)++;
	/* Ignore if already offline. */
	return rc < 0 ? rc : 0;
}

static int try_reonline_memory_block(struct memory_block *mem, void *arg)
{
	uint8_t **online_types = arg;
	int rc;

	if (**online_types != MMOP_OFFLINE) {
		mem->online_type = **online_types;
		rc = device_online(&mem->dev);
		if (rc < 0)
			pr_warn("%s: Failed to re-online memory: %d",
				__func__, rc);
	}

	/* Continue processing all remaining memory blocks. */
	(*online_types)++;
	return 0;
}

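/*
 * Try to offline and remove memory. Might take a long time to finish in case
 * memory is still in use. Primarily useful for memory devices that logically
 * unplugged all memory (so it's no longer in use) and want to offline and
 * remove the memory blocks.
 */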
int offline_and_remove_memory(u64 start, u64 size)
{
	const unsigned long mb_count = size / memory_block_size_bytes();
	uint8_t *online_types, *tmp;
	int rc;

	if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
	    !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
		return -EINVAL;

	/*
	 * We'll remember the old online type of each memory block, so we can
	 * try to revert whatever we did when offlining one memory block fails
	 * after offlining some others succeeded.
	 */
	online_types = kmalloc_array(mb_count, sizeof(*online_types),
				     GFP_KERNEL);
	if (!online_types)
		return -ENOMEM;
	/*
	 * Initialize all states to MMOP_OFFLINE, so when we abort processing
	 * in try_offline_memory_block(), we only try to re-online the memory
	 * blocks we managed to offline.
	 */
	memset(online_types, MMOP_OFFLINE, mb_count);

	lock_device_hotplug();

	tmp = online_types;
	rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);

	/*
	 * In case we succeeded to offline all memory, remove it.
	 * This cannot fail as it cannot get onlined in the meantime.
	 */
	if (!rc) {
		rc = try_remove_memory(start, size);
		if (rc)
			pr_err("%s: Failed to remove memory: %d", __func__, rc);
	}

	/*
	 * Rollback what we did. While memory onlining might theoretically
	 * fail, this is rather unlikely and there is not much we can do.
	 */
	if (rc) {
		tmp = online_types;
		walk_memory_blocks(start, size, &tmp,
				   try_reonline_memory_block);
	}
	unlock_device_hotplug();

	kfree(online_types);
	return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */