// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>

#include <asm/tlbflush.h>

#include "internal.h"
#include "shuffle.h"


/*
 * memory_hotplug.memmap_on_memory parameter
 */
static bool memmap_on_memory __ro_after_init;
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
#endif

enum {
	ONLINE_POLICY_CONTIG_ZONES = 0,
	ONLINE_POLICY_AUTO_MOVABLE,
};

static const char * const online_policy_to_str[] = {
	[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
	[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};

static int set_online_policy(const char *val, const struct kernel_param *kp)
{
	int ret = sysfs_match_string(online_policy_to_str, val);

	if (ret < 0)
		return ret;
	*((int *)kp->arg) = ret;
	return 0;
}

static int get_online_policy(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]);
}

/*
 * memory_hotplug.online_policy: memory online policy
 *
 * "contig-zones": default, original memory online policy: pick a zone for
 *		   the pfn range that keeps the existing zone spans
 *		   contiguous
 * "auto-movable": online memory to ZONE_MOVABLE if the configuration
 *		   (auto_movable_ratio, auto_movable_numa_aware) allows
 *		   for it
 */
static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
static const struct kernel_param_ops online_policy_ops = {
	.set = set_online_policy,
	.get = get_online_policy,
};
module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
MODULE_PARM_DESC(online_policy,
		"Set the online policy (\"contig-zones\", \"auto-movable\") "
		"Default: \"contig-zones\"");

/*
 * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
 *
 * The ratio represents an absolute maximum of MOVABLE vs. KERNEL memory,
 * in percent, that the "auto-movable" online policy will allow when
 * deciding whether hotplugged memory can still go to ZONE_MOVABLE.
 */
static unsigned int auto_movable_ratio __read_mostly = 301;
module_param(auto_movable_ratio, uint, 0644);
MODULE_PARM_DESC(auto_movable_ratio,
		"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
		"in percent for \"auto-movable\" online policy. Default: 301");

/*
 * memory_hotplug.auto_movable_numa_aware: consider numa node stats
 */
#ifdef CONFIG_NUMA
static bool auto_movable_numa_aware __read_mostly = true;
module_param(auto_movable_numa_aware, bool, 0644);
MODULE_PARM_DESC(auto_movable_numa_aware,
		"Consider numa node stats in addition to global stats in "
		"\"auto-movable\" online policy. Default: true");
#endif /* CONFIG_NUMA */

/*
 * online_page_callback contains pointer to current page onlining function.
 * Initially it is generic_online_page(). If it is required it could be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() for generic callback restore.
 */
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);

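/*
 * Pin memory hotplug state in the read direction: code between
 * get_online_mems() and put_online_mems() can assume that no memory is
 * hot(un)plugged concurrently.
 */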
void get_online_mems(void)
{
	percpu_down_read(&mem_hotplug_lock);
}

void put_online_mems(void)
{
	percpu_up_read(&mem_hotplug_lock);
}

bool movable_node_enabled = false;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
int mhp_default_online_type = MMOP_OFFLINE;
#else
int mhp_default_online_type = MMOP_ONLINE;
#endif

static int __init setup_memhp_default_state(char *str)
{
	const int online_type = mhp_online_type_from_str(str);

	if (online_type >= 0)
		mhp_default_online_type = online_type;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

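/*
 * Take mem_hotplug_lock in the write direction, excluding readers, and
 * exclude CPU hotplug for the duration of the memory hot(un)plug
 * operation.
 */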
void mem_hotplug_begin(void)
{
	cpus_read_lock();
	percpu_down_write(&mem_hotplug_lock);
}

void mem_hotplug_done(void)
{
	percpu_up_write(&mem_hotplug_lock);
	cpus_read_unlock();
}

u64 max_mem_size = U64_MAX;

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size,
						 const char *resource_name)
{
	struct resource *res;
	unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	if (strcmp(resource_name, "System RAM"))
		flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;

	if (!mhp_range_allowed(start, size, true))
		return ERR_PTR(-E2BIG);

	/*
	 * Make sure value parsed from 'mem=' only restricts memory adding
	 * while booting, so that memory hotplug won't be impacted. Please
	 * refer to document of 'mem=' in kernel-parameters.txt for more
	 * details.
	 */
	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
		return ERR_PTR(-E2BIG);

	/*
	 * Request ownership of the new memory range. This might be a child
	 * of an existing resource that was present but not marked as busy.
	 */
	res = __request_region(&iomem_resource, start, size,
			       resource_name, flags);

	if (!res) {
		pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
			 start, start + size);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
		const char *reason)
{
	/*
	 * Disallow all operations smaller than a sub-section and only
	 * allow operations smaller than a section for
	 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
	 * enforces a larger memory_block_size_bytes() granularity for
	 * memory that will be marked online, so this check should only
	 * fire for direct arch_{add,remove}_memory() users outside of
	 * add_memory_resource().
	 */
	unsigned long min_align;

	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		min_align = PAGES_PER_SUBSECTION;
	else
		min_align = PAGES_PER_SECTION;
	if (!IS_ALIGNED(pfn, min_align)
			|| !IS_ALIGNED(nr_pages, min_align)) {
		WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
		     reason, pfn, pfn + nr_pages - 1);
		return -EINVAL;
	}
	return 0;
}

/*
 * Return page for the valid pfn only if the page is online. All pfn
 * walkers which rely on the fully initialized page->flags and others
 * should use this rather than pfn_valid && pfn_to_page
 */
struct page *pfn_to_online_page(unsigned long pfn)
{
	unsigned long nr = pfn_to_section_nr(pfn);
	struct dev_pagemap *pgmap;
	struct mem_section *ms;

	if (nr >= NR_MEM_SECTIONS)
		return NULL;

	ms = __nr_to_section(nr);
	if (!online_section(ms))
		return NULL;

	/*
	 * Save some code text when online_section() +
	 * pfn_section_valid() are sufficient.
	 */
	if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
		return NULL;

	if (!pfn_section_valid(ms, pfn))
		return NULL;

	if (!online_device_section(ms))
		return pfn_to_page(pfn);

	/*
	 * This section mixes online memory and ZONE_DEVICE memory at
	 * sub-section granularity. Rely on get_dev_pagemap() to tell them
	 * apart: only pfns not covered by a dev_pagemap are online.
	 */
	pgmap = get_dev_pagemap(pfn, NULL);
	put_dev_pagemap(pgmap);

	/* The presence of a pgmap indicates ZONE_DEVICE offline @pfn */
	if (pgmap)
		return NULL;

	return pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);

/*
 * Reasonably generic function for adding memory. It is expected that
 * archs that support memory hotplug will call this function after
 * deciding the zone to which to add the new pages.
 */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
		struct mhp_params *params)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	int err;
	struct vmem_altmap *altmap = params->altmap;

	if (WARN_ON_ONCE(!params->pgprot.pgprot))
		return -EINVAL;

	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));

	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			return -EINVAL;
		}
		altmap->alloc = 0;
	}

	err = check_pfn_span(pfn, nr_pages, "add");
	if (err)
		return err;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
		if (err)
			break;
		cond_resched();
	}
	vmemmap_populate_print_last();
	return err;
}

/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
					       unsigned long start_pfn,
					       unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(start_pfn)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
					      unsigned long start_pfn,
					      unsigned long end_pfn)
{
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(pfn)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long pfn;
	int nid = zone_to_nid(zone);

	if (zone->zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, it
		 * needs to shrink zone->zone_start_pfn and
		 * zone->spanned_pages. In this case, we find the second
		 * smallest valid mem_section for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn(zone));
		if (pfn) {
			zone->spanned_pages = zone_end_pfn(zone) - pfn;
			zone->zone_start_pfn = pfn;
		} else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	} else if (zone_end_pfn(zone) == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, it only
		 * needs to shrink zone->spanned_pages. In this case, we find
		 * the second biggest valid mem_section for shrinking the
		 * zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
		else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	}
}

static void update_pgdat_span(struct pglist_data *pgdat)
{
	unsigned long node_start_pfn = 0, node_end_pfn = 0;
	struct zone *zone;

	for (zone = pgdat->node_zones;
	     zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
		unsigned long end_pfn = zone_end_pfn(zone);

		/* No need to lock the zones, they can't change. */
		if (!zone->spanned_pages)
			continue;
		if (!node_end_pfn) {
			node_start_pfn = zone->zone_start_pfn;
			node_end_pfn = end_pfn;
			continue;
		}

		if (end_pfn > node_end_pfn)
			node_end_pfn = end_pfn;
		if (zone->zone_start_pfn < node_start_pfn)
			node_start_pfn = zone->zone_start_pfn;
	}

	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}

void __ref remove_pfn_range_from_zone(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long pfn, cur_nr_pages;

	/* Poison struct pages because they are now uninitialized again. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages =
			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
		page_init_poison(pfn_to_page(pfn),
				 sizeof(struct page) * cur_nr_pages);
	}

	/*
	 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
	 * we will not try to shrink the zones - which is okay as
	 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
	 */
	if (zone_is_zone_device(zone))
		return;

	clear_zone_contiguous(zone);

	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	update_pgdat_span(pgdat);

	set_zone_contiguous(zone);
}

static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	if (WARN_ON_ONCE(!valid_section(ms)))
		return;

	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}

/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	unsigned long map_offset = 0;

	map_offset = vmem_altmap_offset(altmap);

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, cur_nr_pages, map_offset, altmap);
		map_offset = 0;
	}
}

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void generic_online_page(struct page *page, unsigned int order)
{
	/*
	 * Freeing the page with debug_pagealloc enabled will try to unmap it,
	 * so we should map it first. This is better than introducing a special
	 * case in the page freeing fast path.
	 */
	debug_pagealloc_map_pages(page, 1 << order);
	__free_pages_core(page, order);
	totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(1UL << order);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);

static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;

	/*
	 * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
	 * decide to not expose all pages to the buddy (e.g., expose them
	 * later). We account all pages as being online and belonging to this
	 * zone ("present").
	 * When using memmap_on_memory, the range might not be aligned to
	 * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
	 * this and the first chunk to online will be pageblock_nr_pages.
	 */
	for (pfn = start_pfn; pfn < end_pfn;) {
		int order = min(MAX_ORDER - 1UL, __ffs(pfn));

		(*online_page_callback)(pfn_to_page(pfn), order);
		pfn += (1UL << order);
	}

	/* mark all involved sections as online */
	online_mem_sections(start_pfn, end_pfn);
}

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;
	arg->status_change_nid_high = NUMA_NO_NODE;

	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
#endif
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	if (arg->status_change_nid >= 0)
		node_set_state(node, N_MEMORY);
}

static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages)
{
	unsigned long old_end_pfn = zone_end_pfn(zone);

	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
}

static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
					 unsigned long nr_pages)
{
	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}

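/*
 * Flag a section as containing some ZONE_DEVICE memory, so that
 * pfn_to_online_page() treats its pfns with additional care.
 */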
static void section_taint_zone_device(unsigned long pfn)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
}

/*
 * Associate the pfn range with the given zone, initializing the memmaps
 * and resizing the pgdat/zone data to span the added pages. After this
 * call, all affected pages are PG_reserved.
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
				  unsigned long nr_pages,
				  struct vmem_altmap *altmap, int migratetype)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nid = pgdat->node_id;

	clear_zone_contiguous(zone);

	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);

	/*
	 * Subsection granularity allows ZONE_DEVICE memory to share sections
	 * with other memory. Taint such sections, so pfn_to_online_page()
	 * can tell apart online pages and ZONE_DEVICE pages within them.
	 */
	if (zone_is_zone_device(zone)) {
		if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
			section_taint_zone_device(start_pfn);
		if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
			section_taint_zone_device(start_pfn + nr_pages);
	}

	/*
	 * TODO now we have a visible range of pages which are not associated
	 * with their zone properly. Not nice but set_pfnblock_flags_mask
	 * expects the zone spans the pfn range. All the pages in the range
	 * are reserved so nobody should be touching them so we should be safe
	 */
	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
			 MEMINIT_HOTPLUG, altmap, migratetype);

	set_zone_contiguous(zone);
}

struct auto_movable_stats {
	unsigned long kernel_early_pages;
	unsigned long movable_pages;
};

static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
					    struct zone *zone)
{
	if (zone_idx(zone) == ZONE_MOVABLE) {
		stats->movable_pages += zone->present_pages;
	} else {
		stats->kernel_early_pages += zone->present_early_pages;
#ifdef CONFIG_CMA
		/*
		 * CMA pages (never on hotplugged memory) behave like
		 * ZONE_MOVABLE.
		 */
		stats->movable_pages += zone->cma_pages;
		stats->kernel_early_pages -= zone->cma_pages;
#endif /* CONFIG_CMA */
	}
}

struct auto_movable_group_stats {
	unsigned long movable_pages;
	unsigned long req_kernel_early_pages;
};

static int auto_movable_stats_account_group(struct memory_group *group,
					    void *arg)
{
	const int ratio = READ_ONCE(auto_movable_ratio);
	struct auto_movable_group_stats *stats = arg;
	long pages;

	/*
	 * We don't support modifying the config while the auto-movable online
	 * policy is already enabled. Just avoid the division by zero below.
	 */
	if (!ratio)
		return 0;

	/*
	 * Calculate how many early kernel pages this group requires to
	 * satisfy the configured zone ratio.
	 */
	pages = group->present_movable_pages * 100 / ratio;
	pages -= group->present_kernel_pages;

	if (pages > 0)
		stats->req_kernel_early_pages += pages;
	stats->movable_pages += group->present_movable_pages;
	return 0;
}

static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
					    unsigned long nr_pages)
{
	unsigned long kernel_early_pages, movable_pages;
	struct auto_movable_group_stats group_stats = {};
	struct auto_movable_stats stats = {};
	pg_data_t *pgdat = NODE_DATA(nid);
	struct zone *zone;
	int i;

	if (nid == NUMA_NO_NODE) {
		/* TODO: cache values */
		for_each_populated_zone(zone)
			auto_movable_stats_account_zone(&stats, zone);
	} else {
		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone = pgdat->node_zones + i;
			if (populated_zone(zone))
				auto_movable_stats_account_zone(&stats, zone);
		}
	}

	kernel_early_pages = stats.kernel_early_pages;
	movable_pages = stats.movable_pages;

	/*
	 * Kernel memory inside dynamic memory groups allows for more MOVABLE
	 * memory within the same group. Remove the effect of all but the
	 * current group from the stats.
	 */
	walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
				   group, &group_stats);
	if (kernel_early_pages <= group_stats.req_kernel_early_pages)
		return false;
	kernel_early_pages -= group_stats.req_kernel_early_pages;
	movable_pages -= group_stats.movable_pages;

	if (group && group->is_dynamic)
		kernel_early_pages += group->present_kernel_pages;

	/*
	 * Test if we could online the given number of pages to ZONE_MOVABLE
	 * and still stay in the configured ratio.
	 */
	movable_pages += nr_pages;
	return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to the ZONE_NORMAL.
 */
static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	int zid;

	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_intersects(zone, start_pfn, nr_pages))
			return zone;
	}

	return &pgdat->node_zones[ZONE_NORMAL];
}

/*
 * Determine to which zone to online memory dynamically based on user
 * configuration and system stats. We care about the following ratio:
 *
 *   MOVABLE : KERNEL
 *
 * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
 * one of the kernel zones. CMA pages inside one of the kernel zones really
 * behave like ZONE_MOVABLE, so we treat them accordingly.
 *
 * We don't allow for hotplugged memory in a KERNEL zone to increase the
 * amount of MOVABLE memory we can have, so we base the calculation on
 * early (boot) kernel memory only:
 *
 *   MOVABLE : KERNEL_EARLY
 *
 * Hotplugged kernel memory might get hotunplugged again, so it cannot
 * reliably back additional MOVABLE memory, while early kernel memory
 * usually cannot get hotunplugged again.
 *
 * Exceptions are dynamic memory groups, which allow for more MOVABLE
 * memory within the same memory group -- because in that case, there is
 * coordination within the single memory device managed by a single driver.
 *
 * We rely on "present pages" instead of "managed pages", as the latter is
 * highly unreliable and dynamic in virtualized environments (e.g., memory
 * ballooning) and does not consider boot time allocations.
 */
static struct zone *auto_movable_zone_for_pfn(int nid,
					      struct memory_group *group,
					      unsigned long pfn,
					      unsigned long nr_pages)
{
	unsigned long online_pages = 0, max_pages, end_pfn;
	struct page *page;

	if (!auto_movable_ratio)
		goto kernel_zone;

	if (group && !group->is_dynamic) {
		max_pages = group->s.max_pages;
		online_pages = group->present_movable_pages;

		/*
		 * If anything within this static memory group was onlined
		 * to a kernel zone already, it cannot be MOVABLE-only.
		 */
		if (group->present_kernel_pages)
			goto kernel_zone;
	} else if (!group || group->d.unit_pages == nr_pages) {
		max_pages = nr_pages;
	} else {
		max_pages = group->d.unit_pages;

		/*
		 * Take a look at all online sections in the unit this memory
		 * block belongs to: if any section was already onlined to a
		 * kernel zone, the whole unit stays in a kernel zone.
		 */
		pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
		end_pfn = pfn + group->d.unit_pages;
		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
			page = pfn_to_online_page(pfn);
			if (!page)
				continue;

			if (page_zonenum(page) != ZONE_MOVABLE)
				goto kernel_zone;
			online_pages += PAGES_PER_SECTION;
		}
	}

	/*
	 * Online MOVABLE if we could *currently* online all remaining parts
	 * MOVABLE. We expect to (add+) online them immediately next, so if
	 * nobody interferes, all will be MOVABLE if possible.
	 */
	nr_pages = max_pages - online_pages;
	if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
		goto kernel_zone;

#ifdef CONFIG_NUMA
	if (auto_movable_numa_aware &&
	    !auto_movable_can_online_movable(nid, group, nr_pages))
		goto kernel_zone;
#endif /* CONFIG_NUMA */

	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
kernel_zone:
	return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
}

static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
			nr_pages);
	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);

	/*
	 * We inherit the existing zone in a simple case where zones do not
	 * overlap in the given range
	 */
	if (in_kernel ^ in_movable)
		return (in_kernel) ? kernel_zone : movable_zone;

	/*
	 * If the range doesn't belong to any zone or two zones overlap in the
	 * given range then we use movable zone only if movable_node is
	 * enabled because we always online to a kernel zone by default.
	 */
	return movable_node_enabled ? movable_zone : kernel_zone;
}

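/*
 * Select the zone the given pfn range will be onlined to, based on the
 * requested online type and, for plain MMOP_ONLINE, the configured online
 * policy.
 */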
struct zone *zone_for_pfn_range(int online_type, int nid,
		struct memory_group *group, unsigned long start_pfn,
		unsigned long nr_pages)
{
	if (online_type == MMOP_ONLINE_KERNEL)
		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);

	if (online_type == MMOP_ONLINE_MOVABLE)
		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];

	if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
		return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);

	return default_zone_for_pfn(nid, start_pfn, nr_pages);
}

/*
 * This function should only be called by memory_block_{online,offline},
 * and {online,offline}_pages.
 */
void adjust_present_page_count(struct page *page, struct memory_group *group,
			       long nr_pages)
{
	struct zone *zone = page_zone(page);
	const bool movable = zone_idx(zone) == ZONE_MOVABLE;

	/*
	 * We only support onlining/offlining/adding/removing of complete
	 * memory blocks; therefore, all pages are either early or hotplugged.
	 */
	if (early_section(__pfn_to_section(page_to_pfn(page))))
		zone->present_early_pages += nr_pages;
	zone->present_pages += nr_pages;
	zone->zone_pgdat->node_present_pages += nr_pages;

	if (group && movable)
		group->present_movable_pages += nr_pages;
	else if (group && !movable)
		group->present_kernel_pages += nr_pages;
}

int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
			      struct zone *zone)
{
	unsigned long end_pfn = pfn + nr_pages;
	int ret;

	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
	if (ret)
		return ret;

	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);

	/*
	 * It might be that the vmemmap_pages fully span sections. If that is
	 * the case, mark those sections online here as otherwise they will be
	 * left offline.
	 */
	if (nr_pages >= PAGES_PER_SECTION)
		online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));

	return ret;
}

void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long end_pfn = pfn + nr_pages;

	/*
	 * It might be that the vmemmap_pages fully span sections. If that is
	 * the case, mark those sections offline here as otherwise they will be
	 * left online.
	 */
	if (nr_pages >= PAGES_PER_SECTION)
		offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));

	/*
	 * The pages associated with this vmemmap have been offlined, so
	 * we can reset its state here.
	 */
	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
}

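/*
 * Online the given contiguous pfn range in @zone: expose the pages to the
 * buddy allocator, update zone/node accounting, and start kswapd and
 * kcompactd on the node if required. The range must span complete memory
 * sections (memory blocks from the user space POV); see the
 * WARN_ON_ONCE() below.
 */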
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
		       struct zone *zone, struct memory_group *group)
{
	unsigned long flags;
	int need_zonelists_rebuild = 0;
	const int nid = zone_to_nid(zone);
	int ret;
	struct memory_notify arg;

	/*
	 * {on,off}lining is constrained to full memory sections (or more
	 * precisely to memory blocks from the user space POV).
	 * memmap_on_memory is an exception because it reserves initial part
	 * of the physical memory space for vmemmaps. That space is pageblock
	 * aligned.
	 */
	if (WARN_ON_ONCE(!nr_pages ||
			 !IS_ALIGNED(pfn, pageblock_nr_pages) ||
			 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
		return -EINVAL;

	mem_hotplug_begin();

	/* associate pfn range with the zone */
	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * Fixup the number of isolated pageblocks before marking the sections
	 * online, such that undo_isolate_page_range() works correctly.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
	spin_unlock_irqrestore(&zone->lock, flags);

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		setup_zone_pageset(zone);
	}

	online_pages_range(pfn, nr_pages);
	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);

	node_states_set_node(nid, &arg);
	if (need_zonelists_rebuild)
		build_all_zonelists(NULL);

	/* Basic onlining is complete, allow allocation of onlined pages. */
	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);

	/*
	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
	 * the tail of the freelist when undoing isolation). Shuffle the whole
	 * zone to make sure the just onlined pages are properly distributed
	 * across the whole freelist - guaranteeing shuffling is complete.
	 */
	shuffle_zone(zone);

	/* reinitialise watermarks and update pcp limits */
	init_per_zone_wmark_min();

	kswapd_run(nid);
	kcompactd_run(nid);

	writeback_set_ratelimit();

	memory_notify(MEM_ONLINE, &arg);
	mem_hotplug_done();
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	remove_pfn_range_from_zone(zone, pfn, nr_pages);
	mem_hotplug_done();
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid)
{
	struct pglist_data *pgdat;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		pgdat->per_cpu_nodestats =
			alloc_percpu(struct per_cpu_nodestat);
		arch_refresh_nodedata(nid, pgdat);
	} else {
		int cpu;

		/*
		 * Reset the nr_zones, order and highest_zoneidx before reuse.
		 * The pgdat may have stuck around from a previous hot-remove
		 * of this node.
		 */
		pgdat->nr_zones = 0;
		pgdat->kswapd_order = 0;
		pgdat->kswapd_highest_zoneidx = 0;
		for_each_online_cpu(cpu) {
			struct per_cpu_nodestat *p;

			p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
			memset(p, 0, sizeof(*p));
		}
	}

	/* we can use NODE_DATA(nid) from here */
	pgdat->node_id = nid;
	pgdat->node_start_pfn = 0;

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_core_hotplug(nid);

	/*
	 * The node we allocated has no zone fallback lists. For avoiding
	 * to access not-initialized zonelist, build here.
	 */
	build_all_zonelists(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_managed_pages(pgdat);
	reset_node_present_pages(pgdat);

	return pgdat;
}

static void rollback_node_hotadd(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	arch_refresh_nodedata(nid, NULL);
	free_percpu(pgdat->per_cpu_nodestats);
	arch_free_nodedata(pgdat);
}

/*
 * __try_online_node - online a node if offlined
 * @nid: the node ID
 * @set_node_online: Whether we want to online the node
 *
 * Called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated
 * 0 -> the node is already online
 * -ENOMEM -> the node could not be allocated
 */
static int __try_online_node(int nid, bool set_node_online)
{
	pg_data_t *pgdat;
	int ret = 1;

	if (node_online(nid))
		return 0;

	pgdat = hotadd_new_pgdat(nid);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}

	if (set_node_online) {
		node_set_online(nid);
		ret = register_one_node(nid);
		BUG_ON(ret);
	}
out:
	return ret;
}

/*
 * Users of this function always want to online/register the node
 */
int try_online_node(int nid)
{
	int ret;

	mem_hotplug_begin();
	ret = __try_online_node(nid, true);
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	/* memory range must be block size aligned */
	if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
	    !IS_ALIGNED(size, memory_block_size_bytes())) {
		pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
		       memory_block_size_bytes(), start, size);
		return -EINVAL;
	}

	return 0;
}

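/*
 * walk_memory_blocks() callback that onlines a single memory block
 * device using the configured default online type.
 */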
static int online_memory_block(struct memory_block *mem, void *arg)
{
	mem->online_type = mhp_default_online_type;
	return device_online(&mem->dev);
}

bool mhp_supports_memmap_on_memory(unsigned long size)
{
	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
	unsigned long remaining_size = size - vmemmap_size;

	/*
	 * Besides having arch support and the feature enabled at runtime, we
	 * need a few more assumptions to hold true:
	 *
	 * a) We span a single memory block: memory onlining/offlining happens
	 *    in memory block granularity. We don't want the vmemmap of online
	 *    memory blocks to reside on offline memory blocks.
	 *
	 * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
	 *    to populate memory from the altmap for unrelated parts (i.e.,
	 *    other memory blocks).
	 *
	 * c) The vmemmap pages (and thereby the pages that will be exposed to
	 *    the buddy) have to cover full pageblocks: memory
	 *    onlining/offlining code requires applicable ranges to be
	 *    pageblock-aligned, for example, to set the migratetypes properly.
	 */
	return memmap_on_memory &&
	       !hugetlb_free_vmemmap_enabled &&
	       IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
	       size == memory_block_size_bytes() &&
	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
}

/*
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
	struct vmem_altmap mhp_altmap = {};
	struct memory_group *group = NULL;
	u64 start, size;
	bool new_node = false;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	if (mhp_flags & MHP_NID_IS_MGID) {
		group = memory_group_find_by_id(nid);
		if (!group)
			return -EINVAL;
		nid = group->nid;
	}

	if (!node_possible(nid)) {
		WARN(1, "node %d was absent from the node_possible_map\n", nid);
		return -EINVAL;
	}

	mem_hotplug_begin();

	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
		memblock_add_node(start, size, nid);

	ret = __try_online_node(nid, false);
	if (ret < 0)
		goto error;
	new_node = ret;

	/*
	 * Self hosted memmap array
	 */
	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
		if (!mhp_supports_memmap_on_memory(size)) {
			ret = -EINVAL;
			goto error;
		}
		mhp_altmap.free = PHYS_PFN(size);
		mhp_altmap.base_pfn = PHYS_PFN(start);
		params.altmap = &mhp_altmap;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &params);
	if (ret < 0)
		goto error;

	/* create memory block devices after memory was added */
	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
					  group);
	if (ret) {
		arch_remove_memory(start, size, NULL);
		goto error;
	}

	if (new_node) {
		/* If sysfs file of new node can't be created, cpu on the node
		 * can't be hot-added. There is no rollback way now.
		 * So, check by BUG_ON() to catch it reluctantly..
		 * We online node here. We can't roll back from here.
		 */
		node_set_online(nid);
		ret = __register_one_node(nid);
		BUG_ON(ret);
	}

	/* link memory sections under this node.*/
	link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
			  MEMINIT_HOTPLUG);

	/* create new memmap entry */
	if (!strcmp(res->name, "System RAM"))
		firmware_map_add_hotplug(start, start + size, "System RAM");

	/* device_online() will take the lock when calling online_pages() */
	mem_hotplug_done();

	/*
	 * In case we're allowed to merge the resource, flag it and trigger
	 * merging now that adding succeeded.
	 */
	if (mhp_flags & MHP_MERGE_RESOURCE)
		merge_system_ram_resource(res);

	/* online pages if requested */
	if (mhp_default_online_type != MMOP_OFFLINE)
		walk_memory_blocks(start, size, NULL, online_memory_block);

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_node)
		rollback_node_hotadd(nid);
	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
		memblock_remove(start, size);
	mem_hotplug_done();
	return ret;
}

/* requires device_hotplug_lock, see add_memory_resource() */
int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size, "System RAM");
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res, mhp_flags);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}

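/* Like __add_memory(), but takes and drops the device_hotplug_lock itself. */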
int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
	int rc;

	lock_device_hotplug();
	rc = __add_memory(nid, start, size, mhp_flags);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(add_memory);

/*
 * Add special, driver-managed memory to the system as system RAM. Such
 * memory is not exposed via the raw firmware-provided memmap as system
 * RAM, instead, it is detected and added by a driver - during cold boot,
 * after a reboot, and after kexec.
 *
 * Reasons why this memory should not be used for the initial memmap of a
 * kexec kernel or for placing kexec images:
 * - The booting kernel is in charge of determining how this memory will be
 *   used (e.g., use persistent memory as system RAM)
 * - Coordination with a hypervisor is required before this memory
 *   can be used (e.g., inaccessible parts).
 *
 * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
 * memory map") are created. Also, the created memory resource is flagged
 * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
 * this memory as well (esp., not place kexec images onto it).
 *
 * The resource_name (visible via /proc/iomem) has to have the format
 * "System RAM ($DRIVER)".
 */
int add_memory_driver_managed(int nid, u64 start, u64 size,
			      const char *resource_name, mhp_t mhp_flags)
{
	struct resource *res;
	int rc;

	if (!resource_name ||
	    strstr(resource_name, "System RAM (") != resource_name ||
	    resource_name[strlen(resource_name) - 1] != ')')
		return -EINVAL;

	lock_device_hotplug();

	res = register_memory_resource(start, size, resource_name);
	if (IS_ERR(res)) {
		rc = PTR_ERR(res);
		goto out_unlock;
	}

	rc = add_memory_resource(nid, res, mhp_flags);
	if (rc < 0)
		release_memory_resource(res);

out_unlock:
	unlock_device_hotplug();
	return rc;
}
EXPORT_SYMBOL_GPL(add_memory_driver_managed);
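
/*
 * Example usage (hypothetical driver, for illustration only):
 *
 *	rc = add_memory_driver_managed(nid, start, size,
 *				       "System RAM (example_driver)",
 *				       MHP_MERGE_RESOURCE);
 */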

/*
 * Platforms should define arch_get_mappable_range() that provides the
 * maximum possible addressable physical memory range for which the linear
 * mapping could be created. The platform returned address range must
 * adhere to these following semantics.
 *
 * - range.start <= range.end
 * - Range includes both end points [range.start..range.end]
 *
 * There is also a fallback definition provided here, allowing the entire
 * possible physical address range in case any platform does not define
 * arch_get_mappable_range().
 */
struct range __weak arch_get_mappable_range(void)
{
	struct range mhp_range = {
		.start = 0UL,
		.end = -1ULL,
	};
	return mhp_range;
}

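/*
 * Return the physical address range within which memory may be hot-added,
 * clamped to MAX_PHYSMEM_BITS and, if @need_mapping, further restricted to
 * the range coverable by the architecture's linear mapping.
 */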
struct range mhp_get_pluggable_range(bool need_mapping)
{
	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
	struct range mhp_range;

	if (need_mapping) {
		mhp_range = arch_get_mappable_range();
		if (mhp_range.start > max_phys) {
			mhp_range.start = 0;
			mhp_range.end = 0;
		}
		mhp_range.end = min_t(u64, mhp_range.end, max_phys);
	} else {
		mhp_range.start = 0;
		mhp_range.end = max_phys;
	}
	return mhp_range;
}
EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);

bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
{
	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
	u64 end = start + size;

	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
		return true;

	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
		start, end, mhp_range.start, mhp_range.end);
	return false;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Confirm all pages in a range [start_pfn, end_pfn) belong to the same
 * zone (skipping memory holes). When they do, return the zone.
 */
struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	unsigned long pfn, sec_end_pfn;
	struct zone *zone = NULL;
	struct page *page;

	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
	     pfn < end_pfn;
	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
		/* Make sure the memory section is present first */
		if (!present_section_nr(pfn_to_section_nr(pfn)))
			continue;
		for (; pfn < sec_end_pfn && pfn < end_pfn;
		     pfn += MAX_ORDER_NR_PAGES) {
			/* Check if we got outside of the zone */
			if (zone && !zone_spans_pfn(zone, pfn))
				return NULL;
			page = pfn_to_page(pfn);
			if (zone && page_zone(page) != zone)
				return NULL;
			zone = page_zone(page);
		}
	}

	return zone;
}

/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). Will skip over most unmovable
 * pages (esp., pages that can be skipped when offlining), but bail out on
 * definitely unmovable pages.
 *
 * Returns:
 *	0 in case a movable page is found and movable_pfn was updated.
 *	-ENOENT in case no movable page was found.
 *	-EBUSY in case a definitely unmovable page was found.
 */
static int scan_movable_pages(unsigned long start, unsigned long end,
			      unsigned long *movable_pfn)
{
	unsigned long pfn;

	for (pfn = start; pfn < end; pfn++) {
		struct page *page, *head;
		unsigned long skip;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (PageLRU(page))
			goto found;
		if (__PageMovable(page))
			goto found;

		/*
		 * PageOffline() pages that are not marked __PageMovable() and
		 * have a reference count > 0 (after MEM_GOING_OFFLINE) are
		 * definitely unmovable. If their reference count would be 0,
		 * they could at least be skipped when offlining memory.
		 */
		if (PageOffline(page) && page_count(page))
			return -EBUSY;

		if (!PageHuge(page))
			continue;
		head = compound_head(page);
		/*
		 * This test is racy as we hold no reference or lock.  The
		 * hugetlb page could have been freed and head is no longer
		 * a hugetlb page before the following check.  In such unlikely
		 * cases false positives and negatives are possible.  Calling
		 * code must deal with these scenarios.
		 */
		if (HPageMigratable(head))
			goto found;
		skip = compound_nr(head) - (page - head);
		pfn += skip - 1;
	}
	return -ENOENT;
found:
	*movable_pfn = pfn;
	return 0;
}

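/*
 * Migrate all movable pages in [start_pfn, end_pfn) off the range that is
 * being offlined: isolate LRU pages, non-lru movable pages and hugetlb
 * pages, and hand them to migrate_pages().
 */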
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page, *head;
	int ret = 0;
	LIST_HEAD(source);
	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		head = compound_head(page);

		if (PageHuge(page)) {
			pfn = page_to_pfn(head) + compound_nr(head) - 1;
			isolate_huge_page(head, &source);
			continue;
		} else if (PageTransHuge(page))
			pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;

		/*
		 * HWPoison pages have elevated reference counts so the migration would
		 * fail on them. It also doesn't make any sense to migrate them in the
		 * first place. Still try to unmap such a page in case it is still mapped
		 * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
		 * the unmap as the catch all safety net).
		 */
		if (PageHWPoison(page)) {
			if (WARN_ON(PageLRU(page)))
				isolate_lru_page(page);
			if (page_mapped(page))
				try_to_unmap(page, TTU_IGNORE_MLOCK);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can deal with pages on
		 * LRU and non-lru movable pages.
		 */
		if (PageLRU(page))
			ret = isolate_lru_page(page);
		else
			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
		if (!ret) { /* Success */
			list_add_tail(&page->lru, &source);
			if (!__PageMovable(page))
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_lru(page));

		} else {
			if (__ratelimit(&migrate_rs)) {
				pr_warn("failed to isolate pfn %lx\n", pfn);
				dump_page(page, "isolation failed");
			}
		}
		put_page(page);
	}
	if (!list_empty(&source)) {
		nodemask_t nmask = node_states[N_MEMORY];
		struct migration_target_control mtc = {
			.nmask = &nmask,
			.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
		};

		/*
		 * We have checked that migration range is on a single zone so
		 * we can use the nid of the first page to all the others.
		 */
		mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));

		/*
		 * try to allocate from a different node but reuse this node
		 * if there are no other online nodes to be used (e.g. we are
		 * offlining a part of the only existing node)
		 */
		node_clear(mtc.nid, nmask);
		if (nodes_empty(nmask))
			node_set(mtc.nid, nmask);
		ret = migrate_pages(&source, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
		if (ret) {
			list_for_each_entry(page, &source, lru) {
				if (__ratelimit(&migrate_rs)) {
					pr_warn("migrating pfn %lx failed ret:%d\n",
						page_to_pfn(page), ret);
					dump_page(page, "migration failure");
				}
			}
			putback_movable_pages(&source);
		}
	}

	return ret;
}

static int __init cmdline_parse_movable_node(char *p)
{
	movable_node_enabled = true;
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);

/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;
	arg->status_change_nid_high = NUMA_NO_NODE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offline is within the range
	 * [0..ZONE_NORMAL], and it is the last present memory there,
	 * the zones in that range will become empty after the offlining,
	 * thus we can determine that we need to clear the node from
	 * node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);

#ifdef CONFIG_HIGHMEM
	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which
	 * have normal memory or high memory.
	 * Here we add the present_pages belonging to ZONE_HIGHMEM.
	 * If the zone is within the range of [0..ZONE_HIGHMEM), and
	 * the zones in that range become empty after the offlining,
	 * we need to clear the node for N_HIGH_MEMORY.
	 */
	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
#endif

	/*
	 * We have accounted the pages from [0..ZONE_NORMAL), and
	 * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
	 * as well.
	 * Here we count the possible pages from ZONE_MOVABLE.
	 * If after having accounted all the pages, we see that the nr_pages
	 * to be offlined is over or equal to the accounted pages,
	 * we know that the node will become empty, and so, we can clear
	 * it for N_MEMORY as well.
	 */
	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;

	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_clear_state(node, N_HIGH_MEMORY);

	if (arg->status_change_nid >= 0)
		node_clear_state(node, N_MEMORY);
}

static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *nr_system_ram_pages = data;

	*nr_system_ram_pages += nr_pages;
	return 0;
}

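/*
 * Offline the given contiguous pfn range: isolate the range, migrate all
 * movable pages away, dissolve free hugetlb pages, and remove the pages
 * from the buddy allocator. Like online_pages(), the range must span
 * complete memory sections (memory blocks from the user space POV).
 */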
int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
			struct memory_group *group)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn, system_ram_pages = 0;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;
	int ret, node;
	char *reason;

	/*
	 * {on,off}lining is constrained to full memory sections (or more
	 * precisely to memory blocks from the user space POV).
	 * memmap_on_memory is an exception because it reserves initial part
	 * of the physical memory space for vmemmaps. That space is pageblock
	 * aligned.
	 */
	if (WARN_ON_ONCE(!nr_pages ||
			 !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
			 !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
		return -EINVAL;

	mem_hotplug_begin();

	/*
	 * Don't allow to offline memory blocks that contain holes.
	 * Consequently, memory blocks with holes can never get onlined
	 * via the hotplug path - online_pages() - as hotplugged memory has
	 * no holes. This way, we e.g., don't have to worry about marking
	 * memory holes PG_reserved, don't need pfn_valid() checks, and just
	 * avoid using walk_system_ram_range() later.
	 */
	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
			      count_system_ram_pages_cb);
	if (system_ram_pages != nr_pages) {
		ret = -EINVAL;
		reason = "memory holes";
		goto failed_removal;
	}

	/*
	 * We only support offlining of memory blocks managed by a single
	 * zone; the range must not span multiple zones.
	 */
	zone = test_pages_in_a_zone(start_pfn, end_pfn);
	if (!zone) {
		ret = -EINVAL;
		reason = "multizone range";
		goto failed_removal;
	}
	node = zone_to_nid(zone);

	/*
	 * Disable pcplists so that page isolation cannot race with freeing
	 * in a way that pages from isolated pageblock are left on pcplists.
	 */
	zone_pcp_disable(zone);
	lru_cache_disable();

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE,
				       MEMORY_OFFLINE | REPORT_FAILURE);
	if (ret) {
		reason = "failure to isolate range";
		goto failed_removal_pcplists_disabled;
	}

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		reason = "notifier failure";
		goto failed_removal_isolated;
	}

	do {
		pfn = start_pfn;
		do {
			if (signal_pending(current)) {
				ret = -EINTR;
				reason = "signal backoff";
				goto failed_removal_isolated;
			}

			cond_resched();

			ret = scan_movable_pages(pfn, end_pfn, &pfn);
			if (!ret) {
				/*
				 * TODO: fatal migration failures should bail
				 * out
				 */
				do_migrate_range(pfn, end_pfn);
			}
		} while (!ret);

		if (ret != -ENOENT) {
			reason = "unmovable page";
			goto failed_removal_isolated;
		}

		/*
		 * Dissolve free hugepages in the memory block before doing
		 * offlining actually in order to make hugetlbfs's object
		 * counting consistent.
		 */
		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
		if (ret) {
			reason = "failure to dissolve huge pages";
			goto failed_removal_isolated;
		}

		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);

	} while (ret);

	/* Mark all sections offline and remove free pages from the buddy. */
	__offline_isolated_pages(start_pfn, end_pfn);
	pr_debug("Offlined Pages %ld\n", nr_pages);

	/*
	 * The memory sections are marked offline, and the pageblock flags
	 * effectively stale; nobody should be touching them. Fixup the number
	 * of isolated pageblocks, memory onlining will properly revert this.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
	spin_unlock_irqrestore(&zone->lock, flags);

	lru_cache_enable();
	zone_pcp_enable(zone);

	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);

	/* reinitialise watermarks and update pcp limits */
	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		build_all_zonelists(NULL);
	}

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
	mem_hotplug_done();
	return 0;

failed_removal_isolated:
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal_pcplists_disabled:
	lru_cache_enable();
	zone_pcp_enable(zone);
failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
		 reason);

	mem_hotplug_done();
	return ret;
}

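/*
 * walk_memory_blocks() callback: records the block's nid for the caller
 * and fails with -EBUSY if a memory block that is expected to be offline
 * is actually still online.
 */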
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);
	int *nid = arg;

	*nid = mem->nid;
	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = beginpa + memory_block_size_bytes() - 1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);

		return -EBUSY;
	}
	return 0;
}

static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
{
	/*
	 * If not set, continue with the next block.
	 */
	return mem->nr_vmemmap_pages;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * The cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
{
	int nid = *(int *)arg;

	/*
	 * If a memory block belongs to multiple nodes, the stored nid is not
	 * reliable. However, such blocks are always online (e.g., cannot get
	 * offlined) and, therefore, are still spanned by the node.
	 */
	return mem->nid == nid ? -EEXIST : 0;
}

/**
 * try_offline_node
 * @nid: the node ID
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int rc;

	/*
	 * If the node still spans pages (especially ZONE_DEVICE), don't
	 * offline it. A node spans memory after move_pfn_range_to_zone(),
	 * e.g., after the memory block was onlined.
	 */
	if (pgdat->node_spanned_pages)
		return;

	/*
	 * Especially offline memory blocks might not be spanned by the
	 * node. They will get spanned by the node once they get onlined.
	 * However, they link to the node in sysfs and can get onlined later.
	 */
	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
	if (rc)
		return;

	if (check_cpu_on_node(pgdat))
		return;

	/*
	 * All memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);
}
EXPORT_SYMBOL(try_offline_node);

static int __ref try_remove_memory(u64 start, u64 size)
{
	struct vmem_altmap mhp_altmap = {};
	struct vmem_altmap *altmap = NULL;
	unsigned long nr_vmemmap_pages;
	int rc = 0, nid = NUMA_NO_NODE;

	BUG_ON(check_hotplug_memory_range(start, size));

	/*
	 * All memory blocks must be offlined before removing memory. Check
	 * whether all memory blocks in question are offline and return error
	 * if this is not the case.
	 *
	 * While at it, determine the nid. Note that if we'd have mixed nodes,
	 * we'd only try to offline the last determined one -- which is good
	 * enough for the cases we care about.
	 */
	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
	if (rc)
		return rc;

	/*
	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
	 * the same granularity it was added - a single memory block.
	 */
	if (memmap_on_memory) {
		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
						      get_nr_vmemmap_pages_cb);
		if (nr_vmemmap_pages) {
			if (size != memory_block_size_bytes()) {
				pr_warn("Refuse to remove %#llx - %#llx,"
					"wrong granularity\n",
					start, start + size);
				return -EINVAL;
			}

			/*
			 * Let remove_pmd_table->free_hugepage_table do the
			 * right thing if we used vmem_altmap when hot-adding
			 * the range.
			 */
			mhp_altmap.alloc = nr_vmemmap_pages;
			altmap = &mhp_altmap;
		}
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	/*
	 * Memory block device removal under the device_hotplug_lock is
	 * a barrier against racing online attempts.
	 */
	remove_memory_block_devices(start, size);

	mem_hotplug_begin();

	arch_remove_memory(start, size, altmap);

	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
		memblock_free(start, size);
		memblock_remove(start, size);
	}

	release_mem_region_adjustable(start, size);

	if (nid != NUMA_NO_NODE)
		try_offline_node(nid);

	mem_hotplug_done();
	return 0;
}

/**
 * __remove_memory - Remove memory if every memory block is offline
 * @start: physical address of the region to remove
 * @size: size of the region to remove
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __remove_memory(u64 start, u64 size)
{
	/*
	 * trigger BUG() if some memory is not offlined prior to calling this
	 * function
	 */
	if (try_remove_memory(start, size))
		BUG();
}

/*
 * Remove memory if every memory block is offline, otherwise return -EBUSY
 * if some memory is not offline.
 */
int remove_memory(u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();
	rc = try_remove_memory(start, size);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);

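/*
 * walk_memory_blocks() callback used by offline_and_remove_memory():
 * offline a memory block device, remembering the online type that has to
 * be used to re-online it in case we have to roll back.
 */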
static int try_offline_memory_block(struct memory_block *mem, void *arg)
{
	uint8_t online_type = MMOP_ONLINE_KERNEL;
	uint8_t **online_types = arg;
	struct page *page;
	int rc;

	/*
	 * Sense the online_type via the zone of the memory block. Offlining
	 * with multiple zones within one memory block will be rejected
	 * by offlining code ... so we don't care about that.
	 */
	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
		online_type = MMOP_ONLINE_MOVABLE;

	rc = device_offline(&mem->dev);
	/*
	 * Default is MMOP_OFFLINE - change it only if offlining succeeded,
	 * so try_reonline_memory_block() can do the right thing.
	 */
	if (!rc)
		**online_types = online_type;

	(*online_types)++;
	/* Continue processing all remaining memory blocks. */
	return rc < 0 ? rc : 0;
}

static int try_reonline_memory_block(struct memory_block *mem, void *arg)
{
	uint8_t **online_types = arg;
	int rc;

	if (**online_types != MMOP_OFFLINE) {
		mem->online_type = **online_types;
		rc = device_online(&mem->dev);
		if (rc < 0)
			pr_warn("%s: Failed to re-online memory: %d",
				__func__, rc);
	}

	/* Continue processing all remaining memory blocks. */
	(*online_types)++;
	return 0;
}

/*
 * Try to offline and remove memory. Might take a long time to finish in case
 * memory is still in use. Primarily useful for memory devices that logically
 * unplugged all memory (so it's no longer in use) and want to offline + remove
 * that memory.
 */
int offline_and_remove_memory(u64 start, u64 size)
{
	const unsigned long mb_count = size / memory_block_size_bytes();
	uint8_t *online_types, *tmp;
	int rc;

	if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
	    !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
		return -EINVAL;

	/*
	 * We'll remember the old online type of each memory block, so we can
	 * try switching it back on in case offlining of one block fails.
	 */
	online_types = kmalloc_array(mb_count, sizeof(*online_types),
				     GFP_KERNEL);
	if (!online_types)
		return -ENOMEM;

	/*
	 * Initialize all states to MMOP_OFFLINE, so when we abort processing
	 * in try_offline_memory_block(), we'll skip all unprocessed blocks in
	 * try_reonline_memory_block().
	 */
	memset(online_types, MMOP_OFFLINE, mb_count);

	lock_device_hotplug();

	tmp = online_types;
	rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);

	/*
	 * In case we succeeded to offline all memory, remove it.
	 * This cannot fail as it cannot get onlined in the meantime.
	 */
	if (!rc) {
		rc = try_remove_memory(start, size);
		if (rc)
			pr_err("%s: Failed to remove memory: %d", __func__, rc);
	}

	/*
	 * Rollback what we did. While memory onlining might theoretically fail
	 * (nacked by a notifier), it barely ever happens.
	 */
	if (rc) {
		tmp = online_types;
		walk_memory_blocks(start, size, &tmp,
				   try_reonline_memory_block);
	}
	unlock_device_hotplug();

	kfree(online_types);
	return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */