// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>
#include <linux/random.h>
#include <linux/sched/sysctl.h>

#include <asm/tlbflush.h>

#include <trace/events/migrate.h>

#include "internal.h"

int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
	struct address_space *mapping;

	/*
	 * Avoid burning cycles with pages that are yet under __free_pages(),
	 * or just got freed under us.
	 *
	 * In case we 'win' a race for a movable page being freed under us and
	 * raise its refcount preventing __free_pages() from doing its job,
	 * the put_page() at the end of this block will take care of
	 * releasing this page, thus avoiding a nasty leakage.
	 */
	if (unlikely(!get_page_unless_zero(page)))
		goto out;

	/*
	 * Check PageMovable before holding a PG_lock because page's owner
	 * assumes anybody doesn't touch PG_lock of newly allocated page
	 * so unconditionally grabbing the lock ruins page's owner side.
	 */
	if (unlikely(!__PageMovable(page)))
		goto out_putpage;

	/*
	 * As movable pages are not isolated from LRU lists, concurrent
	 * compaction threads can race against page migration functions
	 * as well as race against the releasing a page.
	 *
	 * In order to avoid having an already isolated movable page
	 * being (wrongly) re-isolated while it is under migration,
	 * or to avoid attempting to isolate pages being released,
	 * lets be sure we have the page lock
	 * before proceeding with the movable page isolation steps.
	 */
	if (unlikely(!trylock_page(page)))
		goto out_putpage;

	if (!PageMovable(page) || PageIsolated(page))
		goto out_no_isolated;

	mapping = page_mapping(page);
	VM_BUG_ON_PAGE(!mapping, page);

	if (!mapping->a_ops->isolate_page(page, mode))
		goto out_no_isolated;

	/* Driver shouldn't use PG_isolated bit of page->flags */
	WARN_ON_ONCE(PageIsolated(page));
	SetPageIsolated(page);
	unlock_page(page);

	return 0;

out_no_isolated:
	unlock_page(page);
out_putpage:
	put_page(page);
out:
	return -EBUSY;
}

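/*
 * Return a previously isolated non-LRU movable page to the driver that owns
 * it, via the address_space putback_page() callback, and clear PG_isolated.
 */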
static void putback_movable_page(struct page *page)
{
	struct address_space *mapping;

	mapping = page_mapping(page);
	mapping->a_ops->putback_page(page);
	ClearPageIsolated(page);
}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 */
void putback_movable_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;

	list_for_each_entry_safe(page, page2, l, lru) {
		if (unlikely(PageHuge(page))) {
			putback_active_hugepage(page);
			continue;
		}
		list_del(&page->lru);
		/*
		 * We isolated non-lru movable page so here we can use
		 * __PageMovable because LRU page's mapping cannot have
		 * PAGE_MAPPING_MOVABLE.
		 */
		if (unlikely(__PageMovable(page))) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);
			lock_page(page);
			if (PageMovable(page))
				putback_movable_page(page);
			else
				ClearPageIsolated(page);
			unlock_page(page);
			put_page(page);
		} else {
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -thp_nr_pages(page));
			putback_lru_page(page);
		}
	}
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *old)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);

	while (page_vma_mapped_walk(&pvmw)) {
		rmap_t rmap_flags = RMAP_NONE;
		pte_t pte;
		swp_entry_t entry;
		struct page *new;
		unsigned long idx = 0;

		/* pgoff is invalid for ksm pages, but they are never large */
		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
		new = folio_page(folio, idx);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
					!folio_test_pmd_mappable(folio), folio);
			remove_migration_pmd(&pvmw, new);
			continue;
		}
#endif

		folio_get(folio);
		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
		if (pte_swp_soft_dirty(*pvmw.pte))
			pte = pte_mksoft_dirty(pte);

		/*
		 * Recheck VMA as permissions can change since migration started
		 */
		entry = pte_to_swp_entry(*pvmw.pte);
		if (is_writable_migration_entry(entry))
			pte = maybe_mkwrite(pte, vma);
		else if (pte_swp_uffd_wp(*pvmw.pte))
			pte = pte_mkuffd_wp(pte);

		if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		if (unlikely(is_device_private_page(new))) {
			if (pte_write(pte))
				entry = make_writable_device_private_entry(
							page_to_pfn(new));
			else
				entry = make_readable_device_private_entry(
							page_to_pfn(new));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*pvmw.pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*pvmw.pte))
				pte = pte_swp_mkuffd_wp(pte);
		}

#ifdef CONFIG_HUGETLB_PAGE
		if (folio_test_hugetlb(folio)) {
			unsigned int shift = huge_page_shift(hstate_vma(vma));

			pte = pte_mkhuge(pte);
			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (folio_test_anon(folio))
				hugepage_add_anon_rmap(new, vma, pvmw.address,
						       rmap_flags);
			else
				page_dup_file_rmap(new, true);
			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		} else
#endif
		{
			if (folio_test_anon(folio))
				page_add_anon_rmap(new, vma, pvmw.address,
						   rmap_flags);
			else
				page_add_file_rmap(new, vma, false);
			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		}
		if (vma->vm_flags & VM_LOCKED)
			mlock_page_drain_local();

		trace_remove_migration_pte(pvmw.address, pte_val(pte),
					   compound_order(new));

		/* No need to invalidate - it was non-present before */
		update_mmu_cache(vma, pvmw.address, pvmw.pte);
	}

	return true;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
{
	struct rmap_walk_control rwc = {
		.rmap_one = remove_migration_pte,
		.arg = src,
	};

	if (locked)
		rmap_walk_locked(dst, &rwc);
	else
		rmap_walk(dst, &rwc);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
				spinlock_t *ptl)
{
	pte_t pte;
	swp_entry_t entry;

	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	migration_entry_wait_on_locked(entry, ptep, ptl);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	spinlock_t *ptl = pte_lockptr(mm, pmd);
	pte_t *ptep = pte_offset_map(pmd, address);
	__migration_entry_wait(mm, ptep, ptl);
}

void migration_entry_wait_huge(struct vm_area_struct *vma,
		struct mm_struct *mm, pte_t *pte)
{
	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
	__migration_entry_wait(mm, pte, ptl);
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!is_pmd_migration_entry(*pmd))
		goto unlock;
	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
	return;
unlock:
	spin_unlock(ptl);
}
#endif

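/*
 * Number of references migration expects to hold on @page: one for the
 * caller's reference plus, for pagecache pages, one per subpage held by the
 * mapping and one more when private data (e.g. buffer heads) is attached.
 */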
static int expected_page_refs(struct address_space *mapping, struct page *page)
{
	int expected_count = 1;

	if (mapping)
		expected_count += compound_nr(page) + page_has_private(page);
	return expected_count;
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int extra_count)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
	struct zone *oldzone, *newzone;
	int dirty;
	int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
	long nr = folio_nr_pages(folio);

	if (!mapping) {
		/* Anonymous page without mapping */
		if (folio_ref_count(folio) != expected_count)
			return -EAGAIN;

		/* No turning back from here */
		newfolio->index = folio->index;
		newfolio->mapping = folio->mapping;
		if (folio_test_swapbacked(folio))
			__folio_set_swapbacked(newfolio);

		return MIGRATEPAGE_SUCCESS;
	}

	oldzone = folio_zone(folio);
	newzone = folio_zone(newfolio);

	xas_lock_irq(&xas);
	if (!folio_ref_freeze(folio, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the folio:
	 * no turning back from here.
	 */
	newfolio->index = folio->index;
	newfolio->mapping = folio->mapping;
	folio_ref_add(newfolio, nr); /* add cache reference */
	if (folio_test_swapbacked(folio)) {
		__folio_set_swapbacked(newfolio);
		if (folio_test_swapcache(folio)) {
			folio_set_swapcache(newfolio);
			newfolio->private = folio_get_private(folio);
		}
	} else {
		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	}

	/* Move dirty while page refs frozen and newpage not yet exposed */
	dirty = folio_test_dirty(folio);
	if (dirty) {
		folio_clear_dirty(folio);
		folio_set_dirty(newfolio);
	}

	xas_store(&xas, newfolio);

	/*
	 * Drop cache reference from old page by unfreezing
	 * to one less reference.
	 * We know this isn't the last reference.
	 */
	folio_ref_unfreeze(folio, expected_count - nr);

	xas_unlock(&xas);
	/* Leave irq disabled to prevent preemption while updating stats */

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
	 * are mapped to swap space.
	 */
	if (newzone != oldzone) {
		struct lruvec *old_lruvec, *new_lruvec;
		struct mem_cgroup *memcg;

		memcg = folio_memcg(folio);
		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
		}
#ifdef CONFIG_SWAP
		if (folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
		}
#endif
		if (dirty && mapping_can_writeback(mapping)) {
			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
		}
	}
	local_irq_enable();

	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(folio_migrate_mapping);

/*
 * The expected number of remaining references is the same
 * as that of folio_migrate_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct page *newpage, struct page *page)
{
	XA_STATE(xas, &mapping->i_pages, page_index(page));
	int expected_count;

	xas_lock_irq(&xas);
	expected_count = 2 + page_has_private(page);
	if (!page_ref_freeze(page, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	newpage->index = page->index;
	newpage->mapping = page->mapping;

	get_page(newpage);

	xas_store(&xas, newpage);

	page_ref_unfreeze(page, expected_count - 1);

	xas_unlock_irq(&xas);

	return MIGRATEPAGE_SUCCESS;
}

/*
 * Copy the flags and some other ancillary information
 */
void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
	int cpupid;

	if (folio_test_error(folio))
		folio_set_error(newfolio);
	if (folio_test_referenced(folio))
		folio_set_referenced(newfolio);
	if (folio_test_uptodate(folio))
		folio_mark_uptodate(newfolio);
	if (folio_test_clear_active(folio)) {
		VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
		folio_set_active(newfolio);
	} else if (folio_test_clear_unevictable(folio))
		folio_set_unevictable(newfolio);
	if (folio_test_workingset(folio))
		folio_set_workingset(newfolio);
	if (folio_test_checked(folio))
		folio_set_checked(newfolio);

	if (folio_test_mappedtodisk(folio))
		folio_set_mappedtodisk(newfolio);

	/* Move dirty on pages not done by folio_migrate_mapping() */
	if (folio_test_dirty(folio))
		folio_set_dirty(newfolio);

	if (folio_test_young(folio))
		folio_set_young(newfolio);
	if (folio_test_idle(folio))
		folio_set_idle(newfolio);

	/*
	 * Copy NUMA information to the new page, to prevent over-eager
	 * future migrations of this same page.
	 */
	cpupid = page_cpupid_xchg_last(&folio->page, -1);
	page_cpupid_xchg_last(&newfolio->page, cpupid);

	folio_migrate_ksm(newfolio, folio);
	/*
	 * Please do not reorder this without considering how mm/ksm.c's
	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
	 */
	if (folio_test_swapcache(folio))
		folio_clear_swapcache(folio);
	folio_clear_private(folio);

	/* page->private contains hugetlb specific flags */
	if (!folio_test_hugetlb(folio))
		folio->private = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (folio_test_writeback(newfolio))
		folio_end_writeback(newfolio);

	/*
	 * PG_readahead shares the same bit with PG_reclaim.  The above
	 * end_page_writeback() may clear PG_readahead mistakenly, so set the
	 * bit after that.
	 */
	if (folio_test_readahead(folio))
		folio_set_readahead(newfolio);

	folio_copy_owner(newfolio, folio);

	if (!folio_test_hugetlb(folio))
		mem_cgroup_migrate(folio, newfolio);
}
EXPORT_SYMBOL(folio_migrate_flags);

void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
	folio_copy(newfolio, folio);
	folio_migrate_flags(newfolio, folio);
}
EXPORT_SYMBOL(folio_migrate_copy);

/************************************************************
 *                    Migration functions
 ***********************************************************/

/*
 * Common logic to directly migrate a single LRU page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	struct folio *newfolio = page_folio(newpage);
	struct folio *folio = page_folio(page);
	int rc;

	BUG_ON(folio_test_writeback(folio));	/* Writeback must be complete */

	rc = folio_migrate_mapping(mapping, newfolio, folio, 0);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	if (mode != MIGRATE_SYNC_NO_COPY)
		folio_migrate_copy(newfolio, folio);
	else
		folio_migrate_flags(newfolio, folio);
	return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
							enum migrate_mode mode)
{
	struct buffer_head *bh = head;

	/* Simple case, sync compaction */
	if (mode != MIGRATE_ASYNC) {
		do {
			lock_buffer(bh);
			bh = bh->b_this_page;

		} while (bh != head);

		return true;
	}

	/* async case, we cannot block on lock_buffer so use trylock_buffer */
	do {
		if (!trylock_buffer(bh)) {
			/*
			 * We failed to lock the buffer and cannot stall in
			 * async migration. Release the taken locks
			 */
			struct buffer_head *failed_bh = bh;
			bh = head;
			while (bh != failed_bh) {
				unlock_buffer(bh);
				bh = bh->b_this_page;
			}
			return false;
		}

		bh = bh->b_this_page;
	} while (bh != head);
	return true;
}

static int __buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode,
		bool check_refs)
{
	struct buffer_head *bh, *head;
	int rc;
	int expected_count;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page, mode);

	/* Check whether page does not have extra refs before we do more work */
	expected_count = expected_page_refs(mapping, page);
	if (page_count(page) != expected_count)
		return -EAGAIN;

	head = page_buffers(page);
	if (!buffer_migrate_lock_buffers(head, mode))
		return -EAGAIN;

	if (check_refs) {
		bool busy;
		bool invalidated = false;

recheck_buffers:
		busy = false;
		spin_lock(&mapping->private_lock);
		bh = head;
		do {
			if (atomic_read(&bh->b_count)) {
				busy = true;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (busy) {
			if (invalidated) {
				rc = -EAGAIN;
				goto unlock_buffers;
			}
			spin_unlock(&mapping->private_lock);
			invalidate_bh_lrus();
			invalidated = true;
			goto recheck_buffers;
		}
	}

	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (rc != MIGRATEPAGE_SUCCESS)
		goto unlock_buffers;

	attach_page_private(newpage, detach_page_private(page));

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);

	rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
	if (check_refs)
		spin_unlock(&mapping->private_lock);
	bh = head;
	do {
		unlock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return rc;
}

/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist. For example attached buffer heads are accessed only under page lock.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);

/*
 * Same as above except that this variant is more careful and checks that there
 * are also no buffer head references. This function is the right one for
 * mappings where buffer heads are directly looked up and referenced (such as
 * block device mappings).
 */
int buffer_migrate_page_norefs(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return __buffer_migrate_page(mapping, newpage, page, mode, true);
}
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct folio *folio = page_folio(page);
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(folio, folio, false);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page, enum migrate_mode mode)
{
	if (PageDirty(page)) {
		/* Only writeback pages in full synchronous migration */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			return -EBUSY;
		}
		return writeout(mapping, page);
	}

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

	return migrate_page(mapping, newpage, page, mode);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_folio(struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc = -EAGAIN;
	bool is_lru = !__PageMovable(&src->page);

	VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
	VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);

	mapping = folio_mapping(src);

	if (likely(is_lru)) {
		if (!mapping)
			rc = migrate_page(mapping, &dst->page, &src->page, mode);
		else if (mapping->a_ops->migratepage)
			/*
			 * Most pages have a mapping and most filesystems
			 * provide a migratepage callback. Anonymous pages
			 * are part of swap space which also has its own
			 * migratepage callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migratepage(mapping, &dst->page,
							&src->page, mode);
		else
			rc = fallback_migrate_page(mapping, &dst->page,
							&src->page, mode);
	} else {
		/*
		 * In case of non-lru page, it could be released after
		 * isolation step. In that case, we shouldn't try migration.
		 */
		VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
		if (!folio_test_movable(src)) {
			rc = MIGRATEPAGE_SUCCESS;
			folio_clear_isolated(src);
			goto out;
		}

		rc = mapping->a_ops->migratepage(mapping, &dst->page,
						&src->page, mode);
		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				!folio_test_isolated(src));
	}

	/*
	 * When successful, old pagecache src->mapping must be cleared before
	 * src is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__PageMovable(&src->page)) {
			VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 */
			folio_clear_isolated(src);
		}

		/*
		 * Anonymous and movable src->mapping will be cleared by
		 * free_pages_prepare so don't reset it here for keeping
		 * the type to work PageAnon, for example.
		 */
		if (!folio_mapping_flags(src))
			src->mapping = NULL;

		if (likely(!folio_is_zone_device(dst)))
			flush_dcache_folio(dst);
	}
out:
	return rc;
}

static int __unmap_and_move(struct page *page, struct page *newpage,
				int force, enum migrate_mode mode)
{
	struct folio *folio = page_folio(page);
	struct folio *dst = page_folio(newpage);
	int rc = -EAGAIN;
	bool page_was_mapped = false;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = !__PageMovable(page);

	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readahead). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		lock_page(page);
	}

	if (PageWriteback(page)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much
		 */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			rc = -EBUSY;
			goto out_unlock;
		}
		if (!force)
			goto out_unlock;
		wait_on_page_writeback(page);
	}

	/*
	 * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrate a page.
	 * This get_anon_vma() delays freeing anon_vma pointer until the end
	 * of migration. File cache pages are no problem because of page_lock()
	 * File Caches may use write_page() or lock_page() in migration, then,
	 * just care Anon page here.
	 *
	 * Only page_get_anon_vma() understands the subtleties of
	 * getting a hold on an anon_vma from outside one of its mms.
	 * But if we cannot get anon_vma, then we won't need it anyway,
	 * because that implies that the anon page is no longer mapped
	 * (and cannot be remapped so long as we hold the page lock).
	 */
	if (PageAnon(page) && !PageKsm(page))
		anon_vma = page_get_anon_vma(page);

	/*
	 * Block others from accessing the new page when we get around to
	 * establishing additional references. We are usually the only one
	 * holding a reference to newpage at this point. We used to have a BUG
	 * here if trylock_page(newpage) fails, but would like to allow for
	 * cases where there might be a race with the previous use of newpage.
	 * This is much like races on refcount of oldpage: just don't BUG().
	 */
	if (unlikely(!trylock_page(newpage)))
		goto out_unlock;

	if (unlikely(!is_lru)) {
		rc = move_to_new_folio(dst, folio, mode);
		goto out_unlock_both;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_cleanup_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		VM_BUG_ON_PAGE(PageAnon(page), page);
		if (page_has_private(page)) {
			try_to_free_buffers(folio);
			goto out_unlock_both;
		}
	} else if (page_mapped(page)) {
		/* Establish migration ptes */
		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				page);
		try_to_migrate(folio, 0);
		page_was_mapped = true;
	}

	if (!page_mapped(page))
		rc = move_to_new_folio(dst, folio, mode);

	/*
	 * When successful, push newpage to LRU immediately: so that if it
	 * turns out to be an mlocked page, remove_migration_ptes() will
	 * automatically build up the correct newpage->mlock_count for it.
	 *
	 * We would like to do something similar for the old page, when
	 * unsuccessful, and other cases when a page has been temporarily
	 * isolated from the unevictable LRU: but this case is the easiest.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		lru_cache_add(newpage);
		if (page_was_mapped)
			lru_add_drain();
	}

	if (page_was_mapped)
		remove_migration_ptes(folio,
			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);

out_unlock_both:
	unlock_page(newpage);
out_unlock:
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	unlock_page(page);
out:
	/*
	 * If migration is successful, decrease refcount of the newpage,
	 * which will not free the page because new page owner increased
	 * refcounter.
	 */
	if (rc == MIGRATEPAGE_SUCCESS)
		put_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page,
				   free_page_t put_new_page,
				   unsigned long private, struct page *page,
				   int force, enum migrate_mode mode,
				   enum migrate_reason reason,
				   struct list_head *ret)
{
	int rc = MIGRATEPAGE_SUCCESS;
	struct page *newpage = NULL;

	if (!thp_migration_supported() && PageTransHuge(page))
		return -ENOSYS;

	if (page_count(page) == 1) {
		/* Page was freed from under us. So we are done. */
		ClearPageActive(page);
		ClearPageUnevictable(page);
		if (unlikely(__PageMovable(page))) {
			lock_page(page);
			if (!PageMovable(page))
				ClearPageIsolated(page);
			unlock_page(page);
		}
		goto out;
	}

	newpage = get_new_page(page, private);
	if (!newpage)
		return -ENOMEM;

	newpage->private = 0;
	rc = __unmap_and_move(page, newpage, force, mode);
	if (rc == MIGRATEPAGE_SUCCESS)
		set_page_owner_migrate_reason(newpage, reason);

out:
	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be restored.
		 */
		list_del(&page->lru);
	}

	/*
	 * If migration is successful, releases reference grabbed during
	 * isolation. Otherwise, restore the page to right list unless
	 * we want to retry.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		/*
		 * Compaction can migrate also non-LRU pages which are
		 * not accounted to NR_ISOLATED_*. They can be recognized
		 * as __PageMovable
		 */
		if (likely(!__PageMovable(page)))
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -thp_nr_pages(page));

		if (reason != MR_MEMORY_FAILURE)
			/*
			 * We release the page in page_handle_poison.
			 */
			put_page(page);
	} else {
		if (rc != -EAGAIN)
			list_add_tail(&page->lru, ret);

		if (put_new_page)
			put_new_page(newpage, private);
		else
			put_page(newpage);
	}

	return rc;
}

/*
 * Counterpart of unmap_and_move_page() for hugepage migration.
 *
 * This function doesn't wait the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then pte is replaced with migration swap entry and direct I/O code
 * will wait in the page fault for migration to complete.
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				free_page_t put_new_page, unsigned long private,
				struct page *hpage, int force,
				enum migrate_mode mode, int reason,
				struct list_head *ret)
{
	struct folio *dst, *src = page_folio(hpage);
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct page *new_hpage;
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;

	/*
	 * Migratability of hugepages depends on architectures and their size.
	 * This check is necessary because some callers of hugepage migration
	 * like soft offline and memory hotremove don't walk through page
	 * tables or check whether the hugepage is pmd-based or not before
	 * kicking migration.
	 */
	if (!hugepage_migration_supported(page_hstate(hpage))) {
		list_move_tail(&hpage->lru, ret);
		return -ENOSYS;
	}

	if (page_count(hpage) == 1) {
		/* page was freed from under us. So we are done. */
		putback_active_hugepage(hpage);
		return MIGRATEPAGE_SUCCESS;
	}

	new_hpage = get_new_page(hpage, private);
	if (!new_hpage)
		return -ENOMEM;
	dst = page_folio(new_hpage);

	if (!trylock_page(hpage)) {
		if (!force)
			goto out;
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			goto out;
		}
		lock_page(hpage);
	}

	/*
	 * Check for pages which are in the process of being freed. Without
	 * page_mapping() set, hugetlbfs specific move page routine will not
	 * be called and we could leak usage counts for subpools.
	 */
	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
		rc = -EBUSY;
		goto out_unlock;
	}

	if (PageAnon(hpage))
		anon_vma = page_get_anon_vma(hpage);

	if (unlikely(!trylock_page(new_hpage)))
		goto put_anon;

	if (page_mapped(hpage)) {
		enum ttu_flags ttu = 0;

		if (!PageAnon(hpage)) {
			/*
			 * In shared mappings, try_to_unmap could potentially
			 * call huge_pmd_unshare.  Because of this, take
			 * semaphore in write mode here and set TTU_RMAP_LOCKED
			 * to let lower levels know we have taken the lock.
			 */
			mapping = hugetlb_page_mapping_lock_write(hpage);
			if (unlikely(!mapping))
				goto unlock_put_anon;

			ttu = TTU_RMAP_LOCKED;
		}

		try_to_migrate(src, ttu);
		page_was_mapped = 1;

		if (ttu & TTU_RMAP_LOCKED)
			i_mmap_unlock_write(mapping);
	}

	if (!page_mapped(hpage))
		rc = move_to_new_folio(dst, src, mode);

	if (page_was_mapped)
		remove_migration_ptes(src,
			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);

unlock_put_anon:
	unlock_page(new_hpage);

put_anon:
	if (anon_vma)
		put_anon_vma(anon_vma);

	if (rc == MIGRATEPAGE_SUCCESS) {
		move_hugetlb_state(hpage, new_hpage, reason);
		put_new_page = NULL;
	}

out_unlock:
	unlock_page(hpage);
out:
	if (rc == MIGRATEPAGE_SUCCESS)
		putback_active_hugepage(hpage);
	else if (rc != -EAGAIN)
		list_move_tail(&hpage->lru, ret);

	/*
	 * If migration was not successful and there's a freeing callback, use
	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
	 * during isolation.
	 */
	if (put_new_page)
		put_new_page(new_hpage, private);
	else
		putback_active_hugepage(new_hpage);

	return rc;
}

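/*
 * Split a THP that could not be migrated as a whole, queueing the resulting
 * tail pages on @from so they can be retried as base pages.  On success the
 * caller's list cursor (*page2) is re-pointed so its list walk stays valid.
 */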
static inline int try_split_thp(struct page *page, struct page **page2,
				struct list_head *from)
{
	int rc = 0;

	lock_page(page);
	rc = split_huge_page_to_list(page, from);
	unlock_page(page);
	if (!rc)
		list_safe_reset_next(page, *page2, lru);

	return rc;
}

/*
 * migrate_pages - migrate the pages specified in a list, to the free pages
 *		   supplied as the target for the page migration
 *
 * @from:		The list of pages to be migrated.
 * @get_new_page:	The function used to allocate free pages to be used
 *			as the target of the page migration.
 * @put_new_page:	The function used to free target pages if migration
 *			fails, or NULL if no special handling is necessary.
 * @private:		Private data to be passed on to get_new_page()
 * @mode:		The migration mode that specifies the constraints for
 *			page migration, if any.
 * @reason:		The reason for page migration.
 * @ret_succeeded:	Set to the number of normal pages migrated successfully if
 *			the caller passes a non-NULL pointer.
 *
 * The function returns after 10 attempts or if no pages are movable any more
 * because the list has become empty or no retryable pages exist any more.
 * It is caller's responsibility to call putback_movable_pages() to return pages
 * to the LRU or free list only if ret != 0.
 *
 * Returns the number of {normal page, THP} that were not migrated, or an error code.
 * The number of THP splits will be considered as the number of non-migrated THP,
 * no matter how many subpages of the THP are migrated successfully.
 */
int migrate_pages(struct list_head *from, new_page_t get_new_page,
		free_page_t put_new_page, unsigned long private,
		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
	int retry = 1;
	int thp_retry = 1;
	int nr_failed = 0;
	int nr_failed_pages = 0;
	int nr_succeeded = 0;
	int nr_thp_succeeded = 0;
	int nr_thp_failed = 0;
	int nr_thp_split = 0;
	int pass = 0;
	bool is_thp = false;
	struct page *page;
	struct page *page2;
	int rc, nr_subpages;
	LIST_HEAD(ret_pages);
	LIST_HEAD(thp_split_pages);
	bool nosplit = (reason == MR_NUMA_MISPLACED);
	bool no_subpage_counting = false;

	trace_mm_migrate_pages_start(mode, reason);

thp_subpage_migration:
	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
		retry = 0;
		thp_retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
retry:
			/*
			 * THP statistics is based on the source huge page.
			 * Capture required information that might get lost
			 * during migration.
			 */
			is_thp = PageTransHuge(page) && !PageHuge(page);
			nr_subpages = compound_nr(page);
			cond_resched();

			if (PageHuge(page))
				rc = unmap_and_move_huge_page(get_new_page,
						put_new_page, private, page,
						pass > 2, mode, reason,
						&ret_pages);
			else
				rc = unmap_and_move(get_new_page, put_new_page,
						private, page, pass > 2, mode,
						reason, &ret_pages);
			/*
			 * The rules are:
			 *	Success: non hugetlb page will be freed, hugetlb
			 *		 page will be put back
			 *	-EAGAIN: stay on the from list
			 *	-ENOMEM: stay on the from list
			 *	Other errno: put on ret_pages list then splice to
			 *		     from list
			 */
			switch(rc) {
			/*
			 * THP migration might be unsupported or the
			 * allocation could've failed so we should
			 * retry on the same page with the THP split
			 * to base pages.
			 *
			 * Head page is retried immediately and tail
			 * pages are added to the tail of the list so
			 * we encounter them after the rest of the list
			 * is processed.
			 */
			case -ENOSYS:
				/* THP migration is unsupported */
				if (is_thp) {
					nr_thp_failed++;
					if (!try_split_thp(page, &page2, &thp_split_pages)) {
						nr_thp_split++;
						goto retry;
					}
				/* Hugetlb migration is unsupported */
				} else if (!no_subpage_counting) {
					nr_failed++;
				}

				nr_failed_pages += nr_subpages;
				break;
			case -ENOMEM:
				/*
				 * When memory is low, don't bother to try to migrate
				 * other pages, just exit.
				 * THP NUMA faulting doesn't split THP to retry.
				 */
				if (is_thp && !nosplit) {
					nr_thp_failed++;
					if (!try_split_thp(page, &page2, &thp_split_pages)) {
						nr_thp_split++;
						goto retry;
					}
				} else if (!no_subpage_counting) {
					nr_failed++;
				}

				nr_failed_pages += nr_subpages;
				/*
				 * There might be some subpages of fail-to-migrate THPs
				 * left in thp_split_pages list. Move them back to migration
				 * list so that they could be put back to the right list by
				 * the caller otherwise the page refcnt will be leaked.
				 */
				list_splice_init(&thp_split_pages, from);
				nr_thp_failed += thp_retry;
				goto out;
			case -EAGAIN:
				if (is_thp)
					thp_retry++;
				else
					retry++;
				break;
			case MIGRATEPAGE_SUCCESS:
				nr_succeeded += nr_subpages;
				if (is_thp)
					nr_thp_succeeded++;
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, etc.):
				 * unlike -EAGAIN case, the failed page is
				 * removed from migration page list and not
				 * retried in the next outer loop.
				 */
				if (is_thp)
					nr_thp_failed++;
				else if (!no_subpage_counting)
					nr_failed++;

				nr_failed_pages += nr_subpages;
				break;
			}
		}
	}
	nr_failed += retry;
	nr_thp_failed += thp_retry;
	/*
	 * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
	 * counting in this round, since all subpages of a THP are counted
	 * as 1 failure in the first round.
	 */
	if (!list_empty(&thp_split_pages)) {
		/*
		 * Move non-migrated pages (after 10 retries) to ret_pages
		 * to avoid migrating them again.
		 */
		list_splice_init(from, &ret_pages);
		list_splice_init(&thp_split_pages, from);
		no_subpage_counting = true;
		retry = 1;
		goto thp_subpage_migration;
	}

	rc = nr_failed + nr_thp_failed;
out:
	/*
	 * Put the permanent failure page back to migration list, they
	 * will be put back to the right list by the caller.
	 */
	list_splice(&ret_pages, from);

	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
			       nr_thp_failed, nr_thp_split, mode, reason);

	if (ret_succeeded)
		*ret_succeeded = nr_succeeded;

	return rc;
}

struct page *alloc_migration_target(struct page *page, unsigned long private)
{
	struct folio *folio = page_folio(page);
	struct migration_target_control *mtc;
	gfp_t gfp_mask;
	unsigned int order = 0;
	struct folio *new_folio = NULL;
	int nid;
	int zidx;

	mtc = (struct migration_target_control *)private;
	gfp_mask = mtc->gfp_mask;
	nid = mtc->nid;
	if (nid == NUMA_NO_NODE)
		nid = folio_nid(folio);

	if (folio_test_hugetlb(folio)) {
		struct hstate *h = page_hstate(&folio->page);

		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
	}

	if (folio_test_large(folio)) {
		/*
		 * clear __GFP_RECLAIM to make the migration callback
		 * consistent with regular THP allocations.
		 */
		gfp_mask &= ~__GFP_RECLAIM;
		gfp_mask |= GFP_TRANSHUGE;
		order = folio_order(folio);
	}
	zidx = zone_idx(folio_zone(folio));
	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
		gfp_mask |= __GFP_HIGHMEM;

	new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);

	return &new_folio->page;
}

#ifdef CONFIG_NUMA

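/*
 * Write @value into @nr consecutive entries of the user status array,
 * starting at index @start.
 */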
static int store_status(int __user *status, int start, int value, int nr)
{
	while (nr-- > 0) {
		if (put_user(value, status + start))
			return -EFAULT;
		start++;
	}

	return 0;
}

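/*
 * Migrate the isolated pages on @pagelist to @node; on failure the pages are
 * put back onto their original lists.
 */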
static int do_move_pages_to_node(struct mm_struct *mm,
		struct list_head *pagelist, int node)
{
	int err;
	struct migration_target_control mtc = {
		.nid = node,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	err = migrate_pages(pagelist, alloc_migration_target, NULL,
		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
	if (err)
		putback_movable_pages(pagelist);
	return err;
}

/*
 * Resolves the given address to a struct page, isolates it from the LRU and
 * puts it to the given pagelist.
 * Returns:
 *     errno - if the page cannot be found/isolated
 *     0 - when it doesn't have to be migrated because it is already on the
 *         target node
 *     1 - when it has been queued
 */
static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
		int node, struct list_head *pagelist, bool migrate_all)
{
	struct vm_area_struct *vma;
	struct page *page;
	int err;

	mmap_read_lock(mm);
	err = -EFAULT;
	vma = vma_lookup(mm, addr);
	if (!vma || !vma_migratable(vma))
		goto out;

	/* FOLL_DUMP to ignore special (like zero) pages */
	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

	err = PTR_ERR(page);
	if (IS_ERR(page))
		goto out;

	err = -ENOENT;
	if (!page)
		goto out;

	err = 0;
	if (page_to_nid(page) == node)
		goto out_putpage;

	err = -EACCES;
	if (page_mapcount(page) > 1 && !migrate_all)
		goto out_putpage;

	if (PageHuge(page)) {
		if (PageHead(page)) {
			isolate_huge_page(page, pagelist);
			err = 1;
		}
	} else {
		struct page *head;

		head = compound_head(page);
		err = isolate_lru_page(head);
		if (err)
			goto out_putpage;

		err = 1;
		list_add_tail(&head->lru, pagelist);
		mod_node_page_state(page_pgdat(head),
			NR_ISOLATED_ANON + page_is_file_lru(head),
			thp_nr_pages(head));
	}
out_putpage:
	/*
	 * Either remove the duplicate refcount from
	 * isolate_lru_page() or drop the page ref if it was
	 * not isolated.
	 */
	put_page(page);
out:
	mmap_read_unlock(mm);
	return err;
}

static int move_pages_and_store_status(struct mm_struct *mm, int node,
		struct list_head *pagelist, int __user *status,
		int start, int i, unsigned long nr_pages)
{
	int err;

	if (list_empty(pagelist))
		return 0;

	err = do_move_pages_to_node(mm, pagelist, node);
	if (err) {
		/*
		 * Positive err means the number of failed
		 * pages to migrate.  Since we are going to
		 * abort and return the number of non-migrated
		 * pages, so need to include the rest of the
		 * nr_pages that have not been attempted as
		 * well.
		 */
		if (err > 0)
			err += nr_pages - i - 1;
		return err;
	}
	return store_status(status, start, node, i - start);
}

/*
 * Migrate an array of page address onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	int current_node = NUMA_NO_NODE;
	LIST_HEAD(pagelist);
	int start, i;
	int err = 0, err1;

	lru_cache_disable();

	for (i = start = 0; i < nr_pages; i++) {
		const void __user *p;
		unsigned long addr;
		int node;

		err = -EFAULT;
		if (get_user(p, pages + i))
			goto out_flush;
		if (get_user(node, nodes + i))
			goto out_flush;
		addr = (unsigned long)untagged_addr(p);

		err = -ENODEV;
		if (node < 0 || node >= MAX_NUMNODES)
			goto out_flush;
		if (!node_state(node, N_MEMORY))
			goto out_flush;

		err = -EACCES;
		if (!node_isset(node, task_nodes))
			goto out_flush;

		if (current_node == NUMA_NO_NODE) {
			current_node = node;
			start = i;
		} else if (node != current_node) {
			err = move_pages_and_store_status(mm, current_node,
					&pagelist, status, start, i, nr_pages);
			if (err)
				goto out;
			start = i;
			current_node = node;
		}

		/*
		 * Errors in the page lookup or isolation are not fatal and we simply
		 * report them via status
		 */
		err = add_page_for_migration(mm, addr, current_node,
				&pagelist, flags & MPOL_MF_MOVE_ALL);

		if (err > 0) {
			/* The page is successfully queued for migration */
			continue;
		}

		/*
		 * The move_pages() man page does not have an -EEXIST choice, so
		 * use -EFAULT instead.
		 */
		if (err == -EEXIST)
			err = -EFAULT;

		/*
		 * If the page is already on the target node (!err), store the
		 * node, otherwise, store the err.
		 */
		err = store_status(status, i, err ? : current_node, 1);
		if (err)
			goto out_flush;

		err = move_pages_and_store_status(mm, current_node, &pagelist,
				status, start, i, nr_pages);
		if (err)
			goto out;
		current_node = NUMA_NO_NODE;
	}
out_flush:
	/* Make sure we do not overwrite the existing error */
	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
				status, start, i, nr_pages);
	if (err >= 0)
		err = err1;
out:
	lru_cache_enable();
	return err;
}

/*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	mmap_read_lock(mm);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = vma_lookup(mm, addr);
		if (!vma)
			goto set_status;

		/* FOLL_DUMP to ignore special (like zero) pages */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		if (page) {
			err = page_to_nid(page);
			put_page(page);
		} else {
			err = -ENOENT;
		}
set_status:
		*status = err;

		pages++;
		status++;
	}

	mmap_read_unlock(mm);
}

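/*
 * Fetch a chunk of 32-bit user pointers from a compat move_pages() caller and
 * widen them into native user pointers.
 */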
static int get_compat_pages_array(const void __user *chunk_pages[],
				  const void __user * __user *pages,
				  unsigned long chunk_nr)
{
	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
	compat_uptr_t p;
	int i;

	for (i = 0; i < chunk_nr; i++) {
		if (get_user(p, pages32 + i))
			return -EFAULT;
		chunk_pages[i] = compat_ptr(p);
	}

	return 0;
}

/*
 * Determine the nodes of a user array of pages and store it in
 * a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16UL
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);

		if (in_compat_syscall()) {
			if (get_compat_pages_array(chunk_pages, pages,
						   chunk_nr))
				break;
		} else {
			if (copy_from_user(chunk_pages, pages,
				      chunk_nr * sizeof(*chunk_pages)))
				break;
		}

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}

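/*
 * Resolve @pid to an mm_struct for move_pages(), checking ptrace and security
 * permissions, and fill *mem_nodes with the set of nodes the target task is
 * allowed to use.
 */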
static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
	struct task_struct *task;
	struct mm_struct *mm;

	/*
	 * There is no need to check if current process has the right to modify
	 * the specified process when they are same.
	 */
	if (!pid) {
		mmget(current->mm);
		*mem_nodes = cpuset_mems_allowed(current);
		return current->mm;
	}

	/* Find the mm_struct */
	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (!task) {
		rcu_read_unlock();
		return ERR_PTR(-ESRCH);
	}
	get_task_struct(task);

	/*
	 * Check if this process has the right to modify the specified
	 * process. Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		mm = ERR_PTR(-EPERM);
		goto out;
	}
	rcu_read_unlock();

	mm = ERR_PTR(security_task_movememory(task));
	if (IS_ERR(mm))
		goto out;
	*mem_nodes = cpuset_mems_allowed(task);
	mm = get_task_mm(task);
out:
	put_task_struct(task);
	if (!mm)
		mm = ERR_PTR(-EINVAL);
	return mm;
}

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
			     const void __user * __user *pages,
			     const int __user *nodes,
			     int __user *status, int flags)
{
	struct mm_struct *mm;
	int err;
	nodemask_t task_nodes;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	mm = find_mm_struct(pid, &task_nodes);
	if (IS_ERR(mm))
		return PTR_ERR(mm);

	if (nodes)
		err = do_pages_move(mm, task_nodes, nr_pages, pages,
				    nodes, status, flags);
	else
		err = do_pages_stat(mm, nr_pages, pages, status);

	mmput(mm);
	return err;
}

SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns true if this is a safe migration target node for misplaced NUMA
 * pages. Currently it only checks the watermarks which is crude.
 */
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
				   unsigned long nr_migrate_pages)
{
	int z;

	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!managed_zone(zone))
			continue;

		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
		if (!zone_watermark_ok(zone, 0,
				       high_wmark_pages(zone) +
				       nr_migrate_pages,
				       ZONE_MOVABLE, 0))
			continue;
		return true;
	}
	return false;
}

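/*
 * Allocate the destination page on the target node for NUMA hinting fault
 * migration, using lightweight GFP flags so the fault path never stalls in
 * reclaim.
 */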
static struct page *alloc_misplaced_dst_page(struct page *page,
					   unsigned long data)
{
	int nid = (int) data;
	int order = compound_order(page);
	gfp_t gfp = __GFP_THISNODE;
	struct folio *new;

	if (order > 0)
		gfp |= GFP_TRANSHUGE_LIGHT;
	else {
		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
			__GFP_NOWARN;
		gfp &= ~__GFP_RECLAIM;
	}
	new = __folio_alloc_node(gfp, order, nid);

	return &new->page;
}

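/*
 * Isolate a misplaced page from its LRU list before NUMA migration.  Returns
 * 1 and drops the caller's page reference if the page was isolated (isolation
 * takes its own reference); returns 0 otherwise.
 */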
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
	int nr_pages = thp_nr_pages(page);
	int order = compound_order(page);

	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);

	/* Do not migrate THP mapped by multiple processes */
	if (PageTransHuge(page) && total_mapcount(page) > 1)
		return 0;

	/* Avoid migrating to a node that is nearly full */
	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
		int z;

		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
			return 0;
		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
			if (managed_zone(pgdat->node_zones + z))
				break;
		}
		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
		return 0;
	}

	if (isolate_lru_page(page))
		return 0;

	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
			    nr_pages);

	/*
	 * Isolating the page has taken another reference, so the
	 * caller's reference can be safely dropped without the page
	 * disappearing underneath us during migration.
	 */
	put_page(page);
	return 1;
}

/*
 * Attempt to migrate a misplaced page to the specified destination
 * node. Caller is expected to have an elevated reference count on
 * the page that will be dropped by this function before returning.
 */
int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
			   int node)
{
	pg_data_t *pgdat = NODE_DATA(node);
	int isolated;
	int nr_remaining;
	unsigned int nr_succeeded;
	LIST_HEAD(migratepages);
	int nr_pages = thp_nr_pages(page);

	/*
	 * Don't migrate file pages that are mapped in multiple processes
	 * with execute permissions as they are probably shared libraries.
	 */
	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
	    (vma->vm_flags & VM_EXEC))
		goto out;

	/*
	 * Also do not migrate dirty pages as not all filesystems can move
	 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
	 */
	if (page_is_file_lru(page) && PageDirty(page))
		goto out;

	isolated = numamigrate_isolate_page(pgdat, page);
	if (!isolated)
		goto out;

	list_add(&page->lru, &migratepages);
	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
				     NULL, node, MIGRATE_ASYNC,
				     MR_NUMA_MISPLACED, &nr_succeeded);
	if (nr_remaining) {
		if (!list_empty(&migratepages)) {
			list_del(&page->lru);
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -nr_pages);
			putback_lru_page(page);
		}
		isolated = 0;
	}
	if (nr_succeeded) {
		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
					    nr_succeeded);
	}
	BUG_ON(!list_empty(&migratepages));
	return isolated;

out:
	put_page(page);
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * node_demotion[] example:
 *
 * Consider a system with two sockets.  Each socket has
 * three classes of memory attached: fast, medium and slow.
 * Each memory class is placed in its own NUMA node.  The
 * CPUs are placed in the node with the "fast" memory.  The
 * 6 NUMA nodes (0-5) might be split among the sockets like
 * this:
 *
 *	Socket A: 0, 1, 2
 *	Socket B: 3, 4, 5
 *
 * When Node 0 fills up, its memory should be migrated to
 * Node 1.  When Node 1 fills up, it should be migrated to
 * Node 2.  The migration path starts on the nodes with the
 * processors (since allocations default to this node) and
 * fast memory, progresses through medium and ends with the
 * slow memory:
 *
 *	0 -> 1 -> 2 -> stop
 *	3 -> 4 -> 5 -> stop
 *
 * This is represented in the node_demotion[] like this:
 *
 *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
 *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
 *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
 *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
 *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
 *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
 *
 * Moreover some systems may have multiple slow memory nodes.
 * Suppose a system has one socket with 3 memory nodes, node 0
 * is fast memory type, and node 1/2 both are slow memory
 * type, and the distance between fast memory node and slow
 * memory node is same. So the migration path should be:
 *
 *	0 -> 1/2 -> stop
 *
 * This is represented in the node_demotion[] like this:
 *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
 *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
 *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
 */
#define DEFAULT_DEMOTION_TARGET_NODES 15

#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
#define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1)
#else
#define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES
#endif

struct demotion_nodes {
	unsigned short nr;
	short nodes[DEMOTION_TARGET_NODES];
};

static struct demotion_nodes *node_demotion __read_mostly;

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	unsigned short target_nr, index;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.  RCU doesn't provide any
	 * compiler barriers, so the READ_ONCE() is required
	 * to avoid compiler reordering or read merging.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	target_nr = READ_ONCE(nd->nr);

	switch (target_nr) {
	case 0:
		target = NUMA_NO_NODE;
		goto out;
	case 1:
		index = 0;
		break;
	default:
		/*
		 * If there are multiple target nodes, just select one
		 * target node randomly.
		 *
		 * In addition, we can also use round-robin to select
		 * target node, but we should introduce another variable
		 * for node_demotion[] to record last selected target node,
		 * that may cause cache ping-pong due to the changing of
		 * last target node. Or we can use simple counter to
		 * record the selected target node, that may cause some
		 * ping-pong issue when multiple threads get the target
		 * node simultaneously. So we use random selection for now.
		 */
		index = get_random_int() % target_nr;
		break;
	}

	target = READ_ONCE(nd->nodes[index]);

out:
	rcu_read_unlock();
	return target;
}

/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
	int node, i;

	if (!node_demotion)
		return;

	for_each_online_node(node) {
		node_demotion[node].nr = 0;
		for (i = 0; i < DEMOTION_TARGET_NODES; i++)
			node_demotion[node].nodes[i] = NUMA_NO_NODE;
	}
}

static void disable_all_migrate_targets(void)
{
	__disable_all_migrate_targets();

	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after.  They will never see before and
	 * after state together.
	 *
	 * The before+after state together might have cycles and
	 * could cause readers to do things like loop until this
	 * function finishes.  This ensures they can only see a
	 * single "bad" read and would, for instance, only loop
	 * once.
	 */
	synchronize_rcu();
}

/*
 * Find an automatic demotion target for 'node'.
 * Failing here is OK.  It might just indicate
 * being at the end of a chain.
 */
static int establish_migrate_target(int node, nodemask_t *used,
				    int best_distance)
{
	int migration_target, index, val;
	struct demotion_nodes *nd;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	migration_target = find_next_best_node(node, used);
	if (migration_target == NUMA_NO_NODE)
		return NUMA_NO_NODE;

	/*
	 * If the node has been set a migration target node before,
	 * which means it's the best distance between them. Still
	 * check if this node can be demoted to other target nodes
	 * if they have a shorter distance.
	 */
	if (best_distance != -1) {
		val = node_distance(node, migration_target);
		if (val > best_distance)
			goto out_clear;
	}

	index = nd->nr;
	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
		      "Exceeds maximum demotion target nodes\n"))
		goto out_clear;

	nd->nodes[index] = migration_target;
	nd->nr++;

	return migration_target;
out_clear:
	node_clear(migration_target, *used);
	return NUMA_NO_NODE;
}

/*
 * When memory fills up on a node, memory contents can be
 * automatically migrated to another node instead of
 * discarded at reclaim.
 *
 * Establish a "migration path" which will start at nodes
 * with CPUs and will follow the priorities used to build the
 * page allocator zonelists.
 *
 * The difference here is that cycles must be avoided.  If
 * node0 migrates to node1, then neither node1, nor anything
 * node1 migrates to can migrate to node0. Also one node can
 * be migrated to multiple nodes if the target nodes all have
 * a same best-distance against the source node.
 *
 * This function can run simultaneously with readers of
 * node_demotion[].  However, it can not run simultaneously
 * with itself.  Exclusion is provided by memory hotplug events
 * being single-threaded.
 */
static void __set_migration_target_nodes(void)
{
	nodemask_t next_pass;
	nodemask_t this_pass;
	nodemask_t used_targets = NODE_MASK_NONE;
	int node, best_distance;

	/*
	 * Avoid any oddities like cycles that could occur
	 * from changes in the topology.  This will leave
	 * a momentary gap when migration is disabled.
	 */
	disable_all_migrate_targets();

	/*
	 * Allocations go towards CPUs, first.  Assume that
	 * the migration path starts at the nodes with CPUs.
	 */
	next_pass = node_states[N_CPU];
again:
	this_pass = next_pass;
	next_pass = NODE_MASK_NONE;
	/*
	 * To avoid cycles in the migration "graph", ensure
	 * that migration sources are not future targets by
	 * setting them in 'used_targets'.  Do this only
	 * once per pass so that multiple source nodes can
	 * share a target node.
	 *
	 * 'used_targets' will become unavailable in future
	 * passes.  This limits some opportunities for
	 * multiple source nodes to share a destination.
	 */
	nodes_or(used_targets, used_targets, this_pass);

	for_each_node_mask(node, this_pass) {
		best_distance = -1;

		/*
		 * Try to set up the migration path for the node, and the target
		 * migration nodes can be multiple, so doing a loop to find all
		 * the target nodes if they all have a best node distance.
		 */
		do {
			int target_node =
				establish_migrate_target(node, &used_targets,
							 best_distance);

			if (target_node == NUMA_NO_NODE)
				break;

			if (best_distance == -1)
				best_distance = node_distance(node, target_node);

			/*
			 * Visit targets from this pass in the next pass.
			 * Eventually, every node will have been part of
			 * a pass, and will become set in 'used_targets'.
			 */
			node_set(target_node, next_pass);
		} while (1);
	}
	/*
	 * 'next_pass' contains nodes which became migration
	 * targets in this pass.  Make additional passes until
	 * no more migration targets are available.
	 */
	if (!nodes_empty(next_pass))
		goto again;
}

/*
 * For callers that do not hold get_online_mems() already.
 */
void set_migration_target_nodes(void)
{
	get_online_mems();
	__set_migration_target_nodes();
	put_online_mems();
}

/*
 * This leaves migrate-on-reclaim transiently disabled between
 * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
 * whether reclaim-based migration is enabled or not, which
 * ensures that the user can turn reclaim-based migration at
 * any time without needing to recalculate migration targets.
 *
 * These callbacks already hold get_online_mems().  That is why
 * __set_migration_target_nodes() can be used as opposed to
 * set_migration_target_nodes().
 */
#ifdef CONFIG_MEMORY_HOTPLUG
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
						 unsigned long action, void *_arg)
{
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.  This avoids
	 * the overhead of synchronize_rcu() in most cases.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Make sure there are not transient states where
		 * an offline node is a migration target.  This
		 * will leave migration disabled until the offline
		 * completes and the MEM_OFFLINE case below runs.
		 */
		disable_all_migrate_targets();
		break;
	case MEM_OFFLINE:
	case MEM_ONLINE:
		/*
		 * Recalculate the target nodes once the node
		 * reaches its final state (online or offline).
		 */
		__set_migration_target_nodes();
		break;
	case MEM_CANCEL_OFFLINE:
		/*
		 * MEM_GOING_OFFLINE disabled all the migration
		 * targets.  Reenable them.
		 */
		__set_migration_target_nodes();
		break;
	case MEM_GOING_ONLINE:
	case MEM_CANCEL_ONLINE:
		break;
	}

	return notifier_from_errno(0);
}
#endif

void __init migrate_on_reclaim_init(void)
{
	node_demotion = kcalloc(nr_node_ids,
				sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#ifdef CONFIG_MEMORY_HOTPLUG
	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
#endif
	/*
	 * At this point, all numa nodes with memory/CPUs have their state
	 * properly set, so we can build the demotion order now.
	 * Let us hold the cpu hotplug lock just, as we could possibly have
	 * CPU hotplug events during boot.
	 */
	cpus_read_lock();
	set_migration_target_nodes();
	cpus_read_unlock();
}

bool numa_demotion_enabled = false;

#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
	       numa_demotion_enabled_store);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif /* CONFIG_NUMA */