/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project.
 */
15#include <linux/migrate.h>
16#include <linux/export.h>
17#include <linux/swap.h>
18#include <linux/swapops.h>
19#include <linux/pagemap.h>
20#include <linux/buffer_head.h>
21#include <linux/mm_inline.h>
22#include <linux/nsproxy.h>
23#include <linux/pagevec.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/topology.h>
27#include <linux/cpu.h>
28#include <linux/cpuset.h>
29#include <linux/writeback.h>
30#include <linux/mempolicy.h>
31#include <linux/vmalloc.h>
32#include <linux/security.h>
33#include <linux/memcontrol.h>
34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
39#include <linux/mmu_notifier.h>
40#include <linux/memremap.h>
41#include <linux/userfaultfd_k.h>
42#include <linux/ptrace.h>
43
44#include <asm/tlbflush.h>
45
46#define CREATE_TRACE_POINTS
47#include <trace/events/migrate.h>
48
49#include "internal.h"
50
/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
 * undesirable, use migrate_prep_local().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

/* Do the necessary work of migrate_prep but not if it involves other CPUs */
int migrate_prep_local(void)
{
	lru_add_drain();

	return 0;
}
76
77
78
79
80
81void putback_lru_pages(struct list_head *l)
82{
83 struct page *page;
84 struct page *page2;
85
86 list_for_each_entry_safe(page, page2, l, lru) {
87 list_del(&page->lru);
88 dec_zone_page_state(page, NR_ISOLATED_ANON +
89 page_is_file_cache(page));
90 putback_lru_page(page);
91 }
92}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from lru, balloon or hugetlbfs pages.
 */
101void putback_movable_pages(struct list_head *l)
102{
103 struct page *page;
104 struct page *page2;
105
106 list_for_each_entry_safe(page, page2, l, lru) {
107 if (unlikely(PageHuge(page))) {
108 putback_active_hugepage(page);
109 continue;
110 }
111 list_del(&page->lru);
112 dec_zone_page_state(page, NR_ISOLATED_ANON +
113 page_is_file_cache(page));
114 if (unlikely(isolated_balloon_page(page)))
115 balloon_page_putback(page);
116 else
117 putback_lru_page(page);
118 }
119}

/*
 * Restore a potential migration pte to a working pte entry
 */
124static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
125 unsigned long addr, void *old)
126{
127 struct mm_struct *mm = vma->vm_mm;
128 swp_entry_t entry;
129 pmd_t *pmd;
130 pte_t *ptep, pte;
131 spinlock_t *ptl;
132
133 if (unlikely(PageHuge(new))) {
134 ptep = huge_pte_offset(mm, addr);
135 if (!ptep)
136 goto out;
137 ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
138 } else {
139 pmd = mm_find_pmd(mm, addr);
140 if (!pmd)
141 goto out;
142
143 ptep = pte_offset_map(pmd, addr);
144
145
146
147
148
149
150 ptl = pte_lockptr(mm, pmd);
151 }
152
153 spin_lock(ptl);
154 pte = *ptep;
155 if (!is_swap_pte(pte))
156 goto unlock;
157
158 entry = pte_to_swp_entry(pte);
159
160 if (!is_migration_entry(entry) ||
161 migration_entry_to_page(entry) != old)
162 goto unlock;
163
164 get_page(new);
165 pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
166 if (pte_swp_soft_dirty(*ptep))
167 pte = pte_mksoft_dirty(pte);
168 if (is_write_migration_entry(entry))
169 pte = pte_mkwrite(pte);
170#ifdef CONFIG_HUGETLB_PAGE
171 if (PageHuge(new)) {
172 pte = pte_mkhuge(pte);
173 pte = arch_make_huge_pte(pte, vma, new, 0);
174 }
175#endif
176
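	/*
	 * Device private (HMM) memory is not addressable by the CPU, so map
	 * it with a special swap entry rather than a regular present pte.
	 */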
177 if (unlikely(is_zone_device_page(new)) && is_hmm_page(new)) {
178 entry = make_hmm_entry(new, pte_write(pte));
179 pte = swp_entry_to_pte(entry);
180 } else
181 flush_dcache_page(new);
182 set_pte_at(mm, addr, ptep, pte);
183
184 if (PageHuge(new)) {
185 if (PageAnon(new))
186 hugepage_add_anon_rmap(new, vma, addr);
187 else
188 page_dup_rmap(new);
189 } else if (PageAnon(new))
190 page_add_anon_rmap(new, vma, addr);
191 else
192 page_add_file_rmap(new);
193
194
195 update_mmu_cache(vma, addr, ptep);
196unlock:
197 pte_unmap_unlock(ptep, ptl);
198out:
199 return SWAP_AGAIN;
200}
201
202
203
204
205
206static void remove_migration_ptes(struct page *old, struct page *new)
207{
208 rmap_walk(new, remove_migration_pte, old);
209}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
216void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
217 spinlock_t *ptl)
218{
219 pte_t pte;
220 swp_entry_t entry;
221 struct page *page;
222
223 spin_lock(ptl);
224 pte = *ptep;
225 if (!is_swap_pte(pte))
226 goto out;
227
228 entry = pte_to_swp_entry(pte);
229 if (!is_migration_entry(entry))
230 goto out;
231
232 page = migration_entry_to_page(entry);
233
234
235
236
237
238
239
240
241 if (!get_page_unless_zero(page))
242 goto out;
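	/*
	 * ZONE_DEVICE pages also need the extra device reference taken so
	 * the device memory cannot go away while we wait on the page lock.
	 */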
243 if (is_zone_device_page(page))
244 get_zone_device_page(page);
245 pte_unmap_unlock(ptep, ptl);
246 wait_on_page_locked(page);
247 put_page(page);
248 return;
249out:
250 pte_unmap_unlock(ptep, ptl);
251}
252
253void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
254 unsigned long address)
255{
256 spinlock_t *ptl = pte_lockptr(mm, pmd);
257 pte_t *ptep = pte_offset_map(pmd, address);
258 __migration_entry_wait(mm, ptep, ptl);
259}
260
261void migration_entry_wait_huge(struct vm_area_struct *vma,
262 struct mm_struct *mm, pte_t *pte)
263{
264 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
265 __migration_entry_wait(mm, pte, ptl);
266}
267
268#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
270static bool buffer_migrate_lock_buffers(struct buffer_head *head,
271 enum migrate_mode mode)
272{
273 struct buffer_head *bh = head;
274
275
276 if (mode != MIGRATE_ASYNC) {
277 do {
278 get_bh(bh);
279 lock_buffer(bh);
280 bh = bh->b_this_page;
281
282 } while (bh != head);
283
284 return true;
285 }
286
287
288 do {
289 get_bh(bh);
290 if (!trylock_buffer(bh)) {
291
292
293
294
295 struct buffer_head *failed_bh = bh;
296 put_bh(failed_bh);
297 bh = head;
298 while (bh != failed_bh) {
299 unlock_buffer(bh);
300 put_bh(bh);
301 bh = bh->b_this_page;
302 }
303 return false;
304 }
305
306 bh = bh->b_this_page;
307 } while (bh != head);
308 return true;
309}
310#else
311static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
312 enum migrate_mode mode)
313{
314 return true;
315}
316#endif

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
326int migrate_page_move_mapping(struct address_space *mapping,
327 struct page *newpage, struct page *page,
328 struct buffer_head *head, enum migrate_mode mode,
329 int extra_count)
330{
331 int expected_count = 1 + extra_count;
332 void **pslot;
333
334
335
336
337
338
339
340 expected_count += is_zone_device_page(page);
341
342 if (!mapping) {
343
344 if (page_count(page) != expected_count)
345 return -EAGAIN;
346 return MIGRATEPAGE_SUCCESS;
347 }
348
349 spin_lock_irq(&mapping->tree_lock);
350
351 pslot = radix_tree_lookup_slot(&mapping->page_tree,
352 page_index(page));
353
354 expected_count += 1 + page_has_private(page);
355 if (page_count(page) != expected_count ||
356 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
357 spin_unlock_irq(&mapping->tree_lock);
358 return -EAGAIN;
359 }
360
361 if (!page_ref_freeze(page, expected_count)) {
362 spin_unlock_irq(&mapping->tree_lock);
363 return -EAGAIN;
364 }
365
366
367
368
369
370
371
372
373 if (mode == MIGRATE_ASYNC && head &&
374 !buffer_migrate_lock_buffers(head, mode)) {
375 page_ref_unfreeze(page, expected_count);
376 spin_unlock_irq(&mapping->tree_lock);
377 return -EAGAIN;
378 }
379
380
381
382
383 get_page(newpage);
384 if (PageSwapCache(page)) {
385 SetPageSwapCache(newpage);
386 set_page_private(newpage, page_private(page));
387 }
388
389 radix_tree_replace_slot(pslot, newpage);
390
391
392
393
394
395
396 page_ref_unfreeze(page, expected_count - 1);
397
398
399
400
401
402
403
404
405
406
407
408 __dec_zone_page_state(page, NR_FILE_PAGES);
409 __inc_zone_page_state(newpage, NR_FILE_PAGES);
410 if (!PageSwapCache(page) && PageSwapBacked(page)) {
411 __dec_zone_page_state(page, NR_SHMEM);
412 __inc_zone_page_state(newpage, NR_SHMEM);
413 }
414 spin_unlock_irq(&mapping->tree_lock);
415
416 return MIGRATEPAGE_SUCCESS;
417}

/*
 * The expected number of remaining references is the same
 * as that of migrate_page_move_mapping().
 */
423int migrate_huge_page_move_mapping(struct address_space *mapping,
424 struct page *newpage, struct page *page)
425{
426 int expected_count;
427 void **pslot;
428
429 if (!mapping) {
430 if (page_count(page) != 1)
431 return -EAGAIN;
432 return MIGRATEPAGE_SUCCESS;
433 }
434
435 spin_lock_irq(&mapping->tree_lock);
436
437 pslot = radix_tree_lookup_slot(&mapping->page_tree,
438 page_index(page));
439
440 expected_count = 2 + page_has_private(page);
441 if (page_count(page) != expected_count ||
442 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
443 spin_unlock_irq(&mapping->tree_lock);
444 return -EAGAIN;
445 }
446
447 if (!page_ref_freeze(page, expected_count)) {
448 spin_unlock_irq(&mapping->tree_lock);
449 return -EAGAIN;
450 }
451
452 get_page(newpage);
453
454 radix_tree_replace_slot(pslot, newpage);
455
456 page_ref_unfreeze(page, expected_count - 1);
457
458 spin_unlock_irq(&mapping->tree_lock);
459 return MIGRATEPAGE_SUCCESS;
460}

/*
 * Gigantic pages are so large that we do not guarantee that page++ pointer
 * arithmetic will work across the entire page. We need something more
 * specialized.
 */
467static void __copy_gigantic_page(struct page *dst, struct page *src,
468 int nr_pages)
469{
470 int i;
471 struct page *dst_base = dst;
472 struct page *src_base = src;
473
474 for (i = 0; i < nr_pages; ) {
475 cond_resched();
476 copy_highpage(dst, src);
477
478 i++;
479 dst = mem_map_next(dst, dst_base, i);
480 src = mem_map_next(src, src_base, i);
481 }
482}
483
484static void copy_huge_page(struct page *dst, struct page *src)
485{
486 int i;
487 int nr_pages;
488
489 if (PageHuge(src)) {
490
491 struct hstate *h = page_hstate(src);
492 nr_pages = pages_per_huge_page(h);
493
494 if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
495 __copy_gigantic_page(dst, src, nr_pages);
496 return;
497 }
498 } else {
499
500 BUG_ON(!PageTransHuge(src));
501 nr_pages = hpage_nr_pages(src);
502 }
503
504 for (i = 0; i < nr_pages; i++) {
505 cond_resched();
506 copy_highpage(dst + i, src + i);
507 }
508}
509
/*
 * Copy the page flags and other page state from the old page to the new page.
 */
513void migrate_page_states(struct page *newpage, struct page *page)
514{
515 int cpupid;
516
517 if (PageError(page))
518 SetPageError(newpage);
519 if (PageReferenced(page))
520 SetPageReferenced(newpage);
521 if (PageUptodate(page))
522 SetPageUptodate(newpage);
523 if (TestClearPageActive(page)) {
524 VM_BUG_ON_PAGE(PageUnevictable(page), page);
525 SetPageActive(newpage);
526 } else if (TestClearPageUnevictable(page))
527 SetPageUnevictable(newpage);
528 if (PageChecked(page))
529 SetPageChecked(newpage);
530 if (PageMappedToDisk(page))
531 SetPageMappedToDisk(newpage);
532
533 if (PageDirty(page)) {
534 clear_page_dirty_for_io(page);
535
536
537
538
539
540
541
542 if (PageSwapBacked(page))
543 SetPageDirty(newpage);
544 else
545 __set_page_dirty_nobuffers(newpage);
546 }
547
548
549
550
551
552 cpupid = page_cpupid_xchg_last(page, -1);
553 page_cpupid_xchg_last(newpage, cpupid);
554
555 mlock_migrate_page(newpage, page);
556 ksm_migrate_page(newpage, page);
557
558
559
560
561 ClearPageSwapCache(page);
562 ClearPagePrivate(page);
563 set_page_private(page, 0);
564
565
566
567
568
569 if (PageWriteback(newpage))
570 end_page_writeback(newpage);
571}
572
573void migrate_page_copy(struct page *newpage, struct page *page)
574{
575 if (PageHuge(page) || PageTransHuge(page))
576 copy_huge_page(newpage, page);
577 else
578 copy_highpage(newpage, page);
579
580 migrate_page_states(newpage, page);
581}
582
/*
 * Migration callback that always fails (returns -EIO), for address spaces
 * that do not support migrating their pages.
 */
588int fail_migrate_page(struct address_space *mapping,
589 struct page *newpage, struct page *page)
590{
591 return -EIO;
592}
593EXPORT_SYMBOL(fail_migrate_page);
594
/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
601int migrate_page(struct address_space *mapping,
602 struct page *newpage, struct page *page,
603 enum migrate_mode mode)
604{
605 int rc;
606
607 BUG_ON(PageWriteback(page));
608
609 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
610
611 if (rc != MIGRATEPAGE_SUCCESS)
612 return rc;
613
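	/*
	 * MIGRATE_SYNC_NO_COPY means the caller (typically a device driver
	 * using its own copy engine) transfers the data itself; only the
	 * page state is migrated here.
	 */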
614 if ((int)mode != MIGRATE_SYNC_NO_COPY)
615 migrate_page_copy(newpage, page);
616 else
617 migrate_page_states(newpage, page);
618 return MIGRATEPAGE_SUCCESS;
619}
620EXPORT_SYMBOL(migrate_page);
621
622#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
628int buffer_migrate_page(struct address_space *mapping,
629 struct page *newpage, struct page *page, enum migrate_mode mode)
630{
631 struct buffer_head *bh, *head;
632 int rc;
633
634 if (!page_has_buffers(page))
635 return migrate_page(mapping, newpage, page, mode);
636
637 head = page_buffers(page);
638
639 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
640
641 if (rc != MIGRATEPAGE_SUCCESS)
642 return rc;
643
644
645
646
647
648
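	/*
	 * In the async case the buffers were already locked by
	 * migrate_page_move_mapping(); in the sync case lock them now.
	 */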
649 if (mode != MIGRATE_ASYNC)
650 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
651
652 ClearPagePrivate(page);
653 set_page_private(newpage, page_private(page));
654 set_page_private(page, 0);
655 put_page(page);
656 get_page(newpage);
657
658 bh = head;
659 do {
660 set_bh_page(bh, newpage, bh_offset(bh));
661 bh = bh->b_this_page;
662
663 } while (bh != head);
664
665 SetPagePrivate(newpage);
666
667 if ((int)mode != MIGRATE_SYNC_NO_COPY)
668 migrate_page_copy(newpage, page);
669 else
670 migrate_page_states(newpage, page);
671
672 bh = head;
673 do {
674 unlock_buffer(bh);
675 put_bh(bh);
676 bh = bh->b_this_page;
677
678 } while (bh != head);
679
680 return MIGRATEPAGE_SUCCESS;
681}
682EXPORT_SYMBOL(buffer_migrate_page);
683#endif
684
/*
 * Writeback a page to clean the dirty state
 */
688static int writeout(struct address_space *mapping, struct page *page)
689{
690 struct writeback_control wbc = {
691 .sync_mode = WB_SYNC_NONE,
692 .nr_to_write = 1,
693 .range_start = 0,
694 .range_end = LLONG_MAX,
695 .for_reclaim = 1
696 };
697 int rc;
698
699 if (!mapping->a_ops->writepage)
700
701 return -EINVAL;
702
703 if (!clear_page_dirty_for_io(page))
704
705 return -EAGAIN;
706
707
708
709
710
711
712
713
714
715 remove_migration_ptes(page, page);
716
717 rc = mapping->a_ops->writepage(page, &wbc);
718
719 if (rc != AOP_WRITEPAGE_ACTIVATE)
720
721 lock_page(page);
722
723 return (rc < 0) ? -EIO : -EAGAIN;
724}
725
/*
 * Default handling if a filesystem does not provide a migration function.
 */
729static int fallback_migrate_page(struct address_space *mapping,
730 struct page *newpage, struct page *page, enum migrate_mode mode)
731{
732 if (PageDirty(page)) {
733
734 switch ((int)mode) {
735 case MIGRATE_SYNC:
736 case MIGRATE_SYNC_NO_COPY:
737 break;
738 default:
739 return -EBUSY;
740 }
741 return writeout(mapping, page);
742 }
743
744
745
746
747
748 if (page_has_private(page) &&
749 !try_to_release_page(page, GFP_KERNEL))
750 return -EAGAIN;
751
752 return migrate_page(mapping, newpage, page, mode);
753}
754
/*
 * Move a page to a newly allocated page.
 * The page is locked. All other uses of the page are gone
 * after this function is called.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
766static int move_to_new_page(struct page *newpage, struct page *page,
767 int page_was_mapped, enum migrate_mode mode)
768{
769 struct address_space *mapping;
770 int rc;
771
772
773
774
775
776
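	/*
	 * We are the only one holding a reference to the newly allocated
	 * page at this point, so this trylock cannot fail; it merely blocks
	 * others from accessing the page while we establish additional
	 * references.
	 */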
777 if (!trylock_page(newpage))
778 BUG();
779
780
781 newpage->index = page->index;
782 newpage->mapping = page->mapping;
783 if (PageSwapBacked(page))
784 SetPageSwapBacked(newpage);
785
786 mapping = page_mapping(page);
787 if (!mapping)
788 rc = migrate_page(mapping, newpage, page, mode);
789 else if (mapping->a_ops->migratepage)
790
791
792
793
794
795
796 rc = mapping->a_ops->migratepage(mapping,
797 newpage, page, mode);
798 else
799 rc = fallback_migrate_page(mapping, newpage, page, mode);
800
801 if (rc != MIGRATEPAGE_SUCCESS) {
802 newpage->mapping = NULL;
803 } else {
804 if (page_was_mapped)
805 remove_migration_ptes(page, newpage);
806 page->mapping = NULL;
807 }
808
809 unlock_page(newpage);
810
811 return rc;
812}
813
814static int __unmap_and_move(struct page *page, struct page *newpage,
815 int force, enum migrate_mode mode)
816{
817 int rc = -EAGAIN;
818 int page_was_mapped = 0;
819 struct mem_cgroup *mem;
820 struct anon_vma *anon_vma = NULL;
821
822 if (!trylock_page(page)) {
823 if (!force || mode == MIGRATE_ASYNC)
824 goto out;
825
826
827
828
829
830
831
832
833
834
835
836
837
838
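		/*
		 * Avoid blocking on the page lock from PF_MEMALLOC (direct
		 * reclaim/compaction) context, as waiting here can deadlock.
		 */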
839 if (current->flags & PF_MEMALLOC)
840 goto out;
841
842 lock_page(page);
843 }
844
845
846 mem_cgroup_prepare_migration(page, newpage, &mem);
847
848 if (PageWriteback(page)) {
849
850
851
852
853
854
855 switch ((int)mode) {
856 case MIGRATE_SYNC:
857 case MIGRATE_SYNC_NO_COPY:
858 break;
859 default:
860 rc = -EBUSY;
861 goto uncharge;
862 }
863 if (!force)
864 goto uncharge;
865 wait_on_page_writeback(page);
866 }
867
868
869
870
871
872
873
874
875 if (PageAnon(page) && !PageKsm(page)) {
876
877
878
879
880 anon_vma = page_get_anon_vma(page);
881 if (anon_vma) {
882
883
884
885 } else if (PageSwapCache(page)) {
886
887
888
889
890
891
892
893
894
895
896
897
898 } else {
899 goto uncharge;
900 }
901 }
902
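	/*
	 * Balloon pages are not on the LRU and have their own migration
	 * callback; handle them separately.
	 */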
903 if (unlikely(isolated_balloon_page(page))) {
904
905
906
907
908
909
910
911 rc = balloon_page_migrate(newpage, page, mode);
912 goto uncharge;
913 }
914
915
916
917
918
919
920
921
922
923
924
925
926
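	/*
	 * A page with no ->mapping is either a freshly added swapcache page
	 * with no rmap yet, or an orphaned page carrying only fs-private
	 * metadata; try_to_unmap() must not be called on it.
	 */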
927 if (!page->mapping) {
928 VM_BUG_ON_PAGE(PageAnon(page), page);
929 if (page_has_private(page)) {
930 try_to_free_buffers(page);
931 goto uncharge;
932 }
933 goto skip_unmap;
934 }
935
936
937 if (page_mapped(page)) {
938 try_to_unmap(page,
939 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
940 page_was_mapped = 1;
941 }
942
943skip_unmap:
944 if (!page_mapped(page))
945 rc = move_to_new_page(newpage, page, page_was_mapped, mode);
946
947 if (rc && page_was_mapped)
948 remove_migration_ptes(page, page);
949
950
951 if (anon_vma)
952 put_anon_vma(anon_vma);
953
954uncharge:
955 mem_cgroup_end_migration(mem, page, newpage,
956 rc == MIGRATEPAGE_SUCCESS);
957 unlock_page(page);
958out:
959 return rc;
960}
961
/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
966static int unmap_and_move(new_page_t get_new_page, unsigned long private,
967 struct page *page, int force, enum migrate_mode mode,
968 enum migrate_reason reason)
969{
970 int rc = 0;
971 int *result = NULL;
972 struct page *newpage = get_new_page(page, private, &result);
973
974 if (!newpage)
975 return -ENOMEM;
976
977 if (page_count(page) == 1) {
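		/* page was freed from under us, so we are done */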
978
979 goto out;
980 }
981
982 if (unlikely(PageTransHuge(page)))
983 if (unlikely(split_huge_page(page)))
984 goto out;
985
986 rc = __unmap_and_move(page, newpage, force, mode);
987
988out:
989 if (rc != -EAGAIN) {
990
991
992
993
994
995
996 list_del(&page->lru);
997 dec_zone_page_state(page, NR_ISOLATED_ANON +
998 page_is_file_cache(page));
999 if (reason != MR_MEMORY_FAILURE)
1000 putback_lru_page(page);
1001 }
1002
1003
1004
1005
1006 if (unlikely(__is_movable_balloon_page(newpage))) {
1007
1008 put_page(newpage);
1009 } else {
1010 putback_lru_page(newpage);
1011 }
1012
1013 if (result) {
1014 if (rc)
1015 *result = rc;
1016 else
1017 *result = page_to_nid(newpage);
1018 }
1019 return rc;
1020}
1021
/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function does not wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepages.
 * Hugepage I/O currently only occurs in direct I/O, where no lock is
 * held and the writeback state of all subpages is counted in the
 * reference count of the head page, so a hugepage with in-flight
 * direct I/O simply fails to migrate; there is no risk of data
 * corruption.
 */
1040static int unmap_and_move_huge_page(new_page_t get_new_page,
1041 unsigned long private, struct page *hpage,
1042 int force, enum migrate_mode mode)
1043{
1044 int rc = 0;
1045 int *result = NULL;
1046 int page_was_mapped = 0;
1047 struct page *new_hpage;
1048 struct anon_vma *anon_vma = NULL;
1049
1050
1051
1052
1053
1054
1055
1056
1057 if (!hugepage_migration_supported(page_hstate(hpage))) {
1058 putback_active_hugepage(hpage);
1059 return -ENOSYS;
1060 }
1061
1062 new_hpage = get_new_page(hpage, private, &result);
1063 if (!new_hpage)
1064 return -ENOMEM;
1065
1066 rc = -EAGAIN;
1067
1068 if (!trylock_page(hpage)) {
1069 if (!force)
1070 goto out;
1071 switch ((int)mode) {
1072 case MIGRATE_SYNC:
1073 case MIGRATE_SYNC_NO_COPY:
1074 break;
1075 default:
1076 goto out;
1077 }
1078 lock_page(hpage);
1079 }
1080
1081 if (PageAnon(hpage))
1082 anon_vma = page_get_anon_vma(hpage);
1083
1084 if (page_mapped(hpage)) {
1085 try_to_unmap(hpage,
1086 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1087 page_was_mapped = 1;
1088 }
1089
1090 if (!page_mapped(hpage))
1091 rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
1092
1093 if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
1094 remove_migration_ptes(hpage, hpage);
1095
1096 if (anon_vma)
1097 put_anon_vma(anon_vma);
1098
1099 if (!rc)
1100 hugetlb_cgroup_migrate(hpage, new_hpage);
1101
1102 unlock_page(hpage);
1103out:
1104 if (rc != -EAGAIN)
1105 putback_active_hugepage(hpage);
1106 putback_active_hugepage(new_hpage);
1107 if (result) {
1108 if (rc)
1109 *result = rc;
1110 else
1111 *result = page_to_nid(new_hpage);
1112 }
1113 return rc;
1114}

/*
 * migrate_pages - migrate the pages specified in a list, to the free pages
 *		   supplied as the target for the page migration
 *
 * @from:		The list of pages to be migrated.
 * @get_new_page:	The function used to allocate free pages to be used
 *			as the target of the page migration.
 * @private:		Private data to be passed on to get_new_page()
 * @mode:		The migration mode that specifies the constraints for
 *			page migration, if any.
 * @reason:		The reason for page migration.
 *
 * The function returns after 10 attempts or if no pages are movable any more
 * because the list has become empty or no retryable pages exist any more.
 * The caller should call putback_movable_pages() to return pages to the LRU
 * or free list only if ret != 0.
 *
 * Returns the number of pages that were not migrated, or an error code.
 */
1135int migrate_pages(struct list_head *from, new_page_t get_new_page,
1136 unsigned long private, enum migrate_mode mode, int reason)
1137{
1138 int retry = 1;
1139 int nr_failed = 0;
1140 int nr_succeeded = 0;
1141 int pass = 0;
1142 struct page *page;
1143 struct page *page2;
1144 int swapwrite = current->flags & PF_SWAPWRITE;
1145 int rc;
1146
1147 if (!swapwrite)
1148 current->flags |= PF_SWAPWRITE;
1149
1150 for(pass = 0; pass < 10 && retry; pass++) {
1151 retry = 0;
1152
1153 list_for_each_entry_safe(page, page2, from, lru) {
1154 cond_resched();
1155
1156 if (PageHuge(page))
1157 rc = unmap_and_move_huge_page(get_new_page,
1158 private, page, pass > 2, mode);
1159 else
1160 rc = unmap_and_move(get_new_page, private,
1161 page, pass > 2, mode, reason);
1162
1163 switch(rc) {
1164 case -ENOMEM:
1165 goto out;
1166 case -EAGAIN:
1167 retry++;
1168 break;
1169 case MIGRATEPAGE_SUCCESS:
1170 nr_succeeded++;
1171 break;
1172 default:
1173
1174 nr_failed++;
1175 break;
1176 }
1177 }
1178 }
1179 rc = nr_failed + retry;
1180out:
1181 if (nr_succeeded)
1182 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1183 if (nr_failed)
1184 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1185 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1186
1187 if (!swapwrite)
1188 current->flags &= ~PF_SWAPWRITE;
1189
1190 return rc;
1191}
1192
1193#ifdef CONFIG_NUMA
1194
1195
1196
1197struct page_to_node {
1198 unsigned long addr;
1199 struct page *page;
1200 int node;
1201 int status;
1202};
1203
1204static struct page *new_page_node(struct page *p, unsigned long private,
1205 int **result)
1206{
1207 struct page_to_node *pm = (struct page_to_node *)private;
1208
1209 while (pm->node != MAX_NUMNODES && pm->page != p)
1210 pm++;
1211
1212 if (pm->node == MAX_NUMNODES)
1213 return NULL;
1214
1215 *result = &pm->status;
1216
1217 if (PageHuge(p))
1218 return alloc_huge_page_node(page_hstate(compound_head(p)),
1219 pm->node);
1220 else
1221 return alloc_pages_exact_node(pm->node,
1222 GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1223}
1224
1225
1226
1227
1228
1229
1230
1231static int do_move_page_to_node_array(struct mm_struct *mm,
1232 struct page_to_node *pm,
1233 int migrate_all)
1234{
1235 int err;
1236 struct page_to_node *pp;
1237 LIST_HEAD(pagelist);
1238
1239 down_read(&mm->mmap_sem);
1240
1241
1242
1243
1244 for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1245 struct vm_area_struct *vma;
1246 struct page *page;
1247
1248 err = -EFAULT;
1249 vma = find_vma(mm, pp->addr);
1250 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1251 goto set_status;
1252
1253 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1254
1255 err = PTR_ERR(page);
1256 if (IS_ERR(page))
1257 goto set_status;
1258
1259 err = -ENOENT;
1260 if (!page)
1261 goto set_status;
1262
1263
1264 if (PageReserved(page))
1265 goto put_and_set;
1266
1267 pp->page = page;
1268 err = page_to_nid(page);
1269
1270 if (err == pp->node)
1271
1272
1273
1274 goto put_and_set;
1275
1276 err = -EACCES;
1277 if (page_mapcount(page) > 1 &&
1278 !migrate_all)
1279 goto put_and_set;
1280
1281 if (PageHuge(page)) {
1282 if (PageHead(page))
1283 isolate_huge_page(page, &pagelist);
1284 goto put_and_set;
1285 }
1286
1287 err = isolate_lru_page(page);
1288 if (!err) {
1289 list_add_tail(&page->lru, &pagelist);
1290 inc_zone_page_state(page, NR_ISOLATED_ANON +
1291 page_is_file_cache(page));
1292 }
1293put_and_set:
1294
1295
1296
1297
1298
1299 put_page(page);
1300set_status:
1301 pp->status = err;
1302 }
1303
1304 err = 0;
1305 if (!list_empty(&pagelist)) {
1306 err = migrate_pages(&pagelist, new_page_node,
1307 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1308 if (err)
1309 putback_movable_pages(&pagelist);
1310 }
1311
1312 up_read(&mm->mmap_sem);
1313 return err;
1314}
1315
1316
1317
1318
1319
1320static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1321 unsigned long nr_pages,
1322 const void __user * __user *pages,
1323 const int __user *nodes,
1324 int __user *status, int flags)
1325{
1326 struct page_to_node *pm;
1327 unsigned long chunk_nr_pages;
1328 unsigned long chunk_start;
1329 int err;
1330
1331 err = -ENOMEM;
1332 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1333 if (!pm)
1334 goto out;
1335
1336 migrate_prep();
1337
1338
1339
1340
1341
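	/*
	 * Store a chunk of page_to_node entries in one page; reserve the
	 * last slot for the MAX_NUMNODES end marker.
	 */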
1342 chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1343
1344 for (chunk_start = 0;
1345 chunk_start < nr_pages;
1346 chunk_start += chunk_nr_pages) {
1347 int j;
1348
1349 if (chunk_start + chunk_nr_pages > nr_pages)
1350 chunk_nr_pages = nr_pages - chunk_start;
1351
1352
1353 for (j = 0; j < chunk_nr_pages; j++) {
1354 const void __user *p;
1355 int node;
1356
1357 err = -EFAULT;
1358 if (get_user(p, pages + j + chunk_start))
1359 goto out_pm;
1360 pm[j].addr = (unsigned long) p;
1361
1362 if (get_user(node, nodes + j + chunk_start))
1363 goto out_pm;
1364
1365 err = -ENODEV;
1366 if (node < 0 || node >= MAX_NUMNODES)
1367 goto out_pm;
1368
1369 if (!node_state(node, N_MEMORY))
1370 goto out_pm;
1371
1372 err = -EACCES;
1373 if (!node_isset(node, task_nodes))
1374 goto out_pm;
1375
1376 pm[j].node = node;
1377 }
1378
1379
1380 pm[chunk_nr_pages].node = MAX_NUMNODES;
1381
1382
1383 err = do_move_page_to_node_array(mm, pm,
1384 flags & MPOL_MF_MOVE_ALL);
1385 if (err < 0)
1386 goto out_pm;
1387
1388
1389 for (j = 0; j < chunk_nr_pages; j++)
1390 if (put_user(pm[j].status, status + j + chunk_start)) {
1391 err = -EFAULT;
1392 goto out_pm;
1393 }
1394 }
1395 err = 0;
1396
1397out_pm:
1398 free_page((unsigned long)pm);
1399out:
1400 return err;
1401}
1402
1403
1404
1405
1406static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1407 const void __user **pages, int *status)
1408{
1409 unsigned long i;
1410
1411 down_read(&mm->mmap_sem);
1412
1413 for (i = 0; i < nr_pages; i++) {
1414 unsigned long addr = (unsigned long)(*pages);
1415 struct vm_area_struct *vma;
1416 struct page *page;
1417 int err = -EFAULT;
1418
1419 vma = find_vma(mm, addr);
1420 if (!vma || addr < vma->vm_start)
1421 goto set_status;
1422
1423 page = follow_page(vma, addr, 0);
1424
1425 err = PTR_ERR(page);
1426 if (IS_ERR(page))
1427 goto set_status;
1428
1429 err = -ENOENT;
1430
1431 if (!page || PageReserved(page))
1432 goto set_status;
1433
1434 err = page_to_nid(page);
1435set_status:
1436 *status = err;
1437
1438 pages++;
1439 status++;
1440 }
1441
1442 up_read(&mm->mmap_sem);
1443}
1444
1445
1446
1447
1448
1449static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1450 const void __user * __user *pages,
1451 int __user *status)
1452{
1453#define DO_PAGES_STAT_CHUNK_NR 16
1454 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1455 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1456
1457 while (nr_pages) {
1458 unsigned long chunk_nr;
1459
1460 chunk_nr = nr_pages;
1461 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1462 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1463
1464 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1465 break;
1466
1467 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1468
1469 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1470 break;
1471
1472 pages += chunk_nr;
1473 status += chunk_nr;
1474 nr_pages -= chunk_nr;
1475 }
1476 return nr_pages ? -EFAULT : 0;
1477}
1478
1479
1480
1481
1482
1483SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1484 const void __user * __user *, pages,
1485 const int __user *, nodes,
1486 int __user *, status, int, flags)
1487{
1488 struct task_struct *task;
1489 struct mm_struct *mm;
1490 int err;
1491 nodemask_t task_nodes;
1492
1493
1494 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1495 return -EINVAL;
1496
1497 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1498 return -EPERM;
1499
1500
1501 rcu_read_lock();
1502 task = pid ? find_task_by_vpid(pid) : current;
1503 if (!task) {
1504 rcu_read_unlock();
1505 return -ESRCH;
1506 }
1507 get_task_struct(task);
1508
1509
1510
1511
1512
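	/*
	 * Check if this process has the right to modify the specified
	 * process, using the regular ptrace access checks.
	 */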
1513 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1514 rcu_read_unlock();
1515 err = -EPERM;
1516 goto out;
1517 }
1518 rcu_read_unlock();
1519
1520 err = security_task_movememory(task);
1521 if (err)
1522 goto out;
1523
1524 task_nodes = cpuset_mems_allowed(task);
1525 mm = get_task_mm(task);
1526 put_task_struct(task);
1527
1528 if (!mm)
1529 return -EINVAL;
1530
1531 if (nodes)
1532 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1533 nodes, status, flags);
1534 else
1535 err = do_pages_stat(mm, nr_pages, pages, status);
1536
1537 mmput(mm);
1538 return err;
1539
1540out:
1541 put_task_struct(task);
1542 return err;
1543}
1544
1545
1546
1547
1548
1549
1550int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1551 const nodemask_t *from, unsigned long flags)
1552{
1553 struct vm_area_struct *vma;
1554 int err = 0;
1555
1556 for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1557 if (vma->vm_ops && vma->vm_ops->migrate) {
1558 err = vma->vm_ops->migrate(vma, to, from, flags);
1559 if (err)
1560 break;
1561 }
1562 }
1563 return err;
1564}
1565
1566#ifdef CONFIG_NUMA_BALANCING
1567
1568
1569
1570
1571static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1572 unsigned long nr_migrate_pages)
1573{
1574 int z;
1575 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1576 struct zone *zone = pgdat->node_zones + z;
1577
1578 if (!populated_zone(zone))
1579 continue;
1580
1581 if (zone->all_unreclaimable)
1582 continue;
1583
1584
1585 if (!zone_watermark_ok(zone, 0,
1586 high_wmark_pages(zone) +
1587 nr_migrate_pages,
1588 0, 0))
1589 continue;
1590 return true;
1591 }
1592 return false;
1593}
1594
1595static struct page *alloc_misplaced_dst_page(struct page *page,
1596 unsigned long data,
1597 int **result)
1598{
1599 int nid = (int) data;
1600 struct page *newpage;
1601
1602 newpage = alloc_pages_exact_node(nid,
1603 (GFP_HIGHUSER_MOVABLE |
1604 __GFP_THISNODE | __GFP_NOMEMALLOC |
1605 __GFP_NORETRY | __GFP_NOWARN) &
1606 ~GFP_IOFS, 0);
1607 if (newpage)
1608 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1609
1610 return newpage;
1611}
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622static unsigned int migrate_interval_millisecs __read_mostly = 100;
1623static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1624static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1625
1626
1627bool migrate_ratelimited(int node)
1628{
1629 pg_data_t *pgdat = NODE_DATA(node);
1630
1631 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1632 msecs_to_jiffies(pteupdate_interval_millisecs)))
1633 return false;
1634
1635 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1636 return false;
1637
1638 return true;
1639}
1640
1641
1642static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1643 unsigned long nr_pages)
1644{
1645
1646
1647
1648
1649
1650 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1651 spin_lock(&pgdat->numabalancing_migrate_lock);
1652 pgdat->numabalancing_migrate_nr_pages = 0;
1653 pgdat->numabalancing_migrate_next_window = jiffies +
1654 msecs_to_jiffies(migrate_interval_millisecs);
1655 spin_unlock(&pgdat->numabalancing_migrate_lock);
1656 }
1657 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1658 trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1659 nr_pages);
1660 return true;
1661 }
1662
1663
1664
1665
1666
1667
1668
1669 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1670 return false;
1671}
1672
1673static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1674{
1675 int page_lru;
1676
1677 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1678
1679
1680 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1681 return 0;
1682
1683 if (isolate_lru_page(page))
1684 return 0;
1685
1686
1687
1688
1689
1690
1691
1692
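	/*
	 * For a THP the expected page count is 3: one for the mapcount, one
	 * for the caller's reference and one taken by isolate_lru_page().
	 * Any extra reference means the huge page is pinned (e.g. by GUP)
	 * and cannot be migrated.
	 */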
1693 if (PageTransHuge(page) && page_count(page) != 3) {
1694 putback_lru_page(page);
1695 return 0;
1696 }
1697
1698 page_lru = page_is_file_cache(page);
1699 mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1700 hpage_nr_pages(page));
1701
1702
1703
1704
1705
1706
1707 put_page(page);
1708 return 1;
1709}
1710
1711bool pmd_trans_migrating(pmd_t pmd)
1712{
1713 struct page *page = pmd_page(pmd);
1714 return PageLocked(page);
1715}
1716
1717
1718
1719
1720
1721
1722int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1723 int node)
1724{
1725 pg_data_t *pgdat = NODE_DATA(node);
1726 int isolated;
1727 int nr_remaining;
1728 LIST_HEAD(migratepages);
1729
1730
1731
1732
1733
1734 if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1735 (vma->vm_flags & VM_EXEC))
1736 goto out;
1737
1738
1739
1740
1741
1742
1743 if (numamigrate_update_ratelimit(pgdat, 1))
1744 goto out;
1745
1746 isolated = numamigrate_isolate_page(pgdat, page);
1747 if (!isolated)
1748 goto out;
1749
1750 list_add(&page->lru, &migratepages);
1751 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1752 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1753 if (nr_remaining) {
1754 putback_lru_pages(&migratepages);
1755 isolated = 0;
1756 } else
1757 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1758 BUG_ON(!list_empty(&migratepages));
1759 return isolated;
1760
1761out:
1762 put_page(page);
1763 return 0;
1764}
1765#endif
1766
1767#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1768
1769
1770
1771
1772int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1773 struct vm_area_struct *vma,
1774 pmd_t *pmd, pmd_t entry,
1775 unsigned long address,
1776 struct page *page, int node)
1777{
1778 spinlock_t *ptl;
1779 pg_data_t *pgdat = NODE_DATA(node);
1780 int isolated = 0;
1781 struct page *new_page = NULL;
1782 struct mem_cgroup *memcg = NULL;
1783 int page_lru = page_is_file_cache(page);
1784 unsigned long mmun_start = address & HPAGE_PMD_MASK;
1785 unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
1786 pmd_t orig_entry;
1787
1788
1789
1790
1791
1792
1793 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1794 goto out_dropref;
1795
1796 new_page = alloc_pages_node(node,
1797 (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
1798 HPAGE_PMD_ORDER);
1799 if (!new_page)
1800 goto out_fail;
1801
1802 isolated = numamigrate_isolate_page(pgdat, page);
1803 if (!isolated) {
1804 put_page(new_page);
1805 goto out_fail;
1806 }
1807
1808 if (tlb_flush_pending(mm))
1809 flush_tlb_range(vma, mmun_start, mmun_end);
1810
1811
1812 __set_page_locked(new_page);
1813 SetPageSwapBacked(new_page);
1814
1815
1816 new_page->mapping = page->mapping;
1817 new_page->index = page->index;
1818 migrate_page_copy(new_page, page);
1819 WARN_ON(PageLRU(new_page));
1820
1821
1822 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1823 ptl = pmd_lock(mm, pmd);
1824 if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
1825fail_putback:
1826 spin_unlock(ptl);
1827 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1828
1829
1830 if (TestClearPageActive(new_page))
1831 SetPageActive(page);
1832 if (TestClearPageUnevictable(new_page))
1833 SetPageUnevictable(page);
1834 mlock_migrate_page(page, new_page);
1835
1836 unlock_page(new_page);
1837 put_page(new_page);
1838
1839
1840 get_page(page);
1841 putback_lru_page(page);
1842 mod_zone_page_state(page_zone(page),
1843 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1844
1845 goto out_unlock;
1846 }
1847
1848
1849
1850
1851
1852
1853
1854
1855 mem_cgroup_prepare_migration(page, new_page, &memcg);
1856
1857 init_trans_huge_mmu_gather_count(new_page);
1858
1859 orig_entry = *pmd;
1860 entry = mk_pmd(new_page, vma->vm_page_prot);
1861 entry = pmd_mkhuge(entry);
1862 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1863
1864
1865
1866
1867
1868
1869
1870
1871 flush_cache_range(vma, mmun_start, mmun_end);
1872 page_add_new_anon_rmap(new_page, vma, mmun_start);
1873 pmdp_clear_flush_notify(vma, mmun_start, pmd);
1874 set_pmd_at(mm, mmun_start, pmd, entry);
1875 flush_tlb_range(vma, mmun_start, mmun_end);
1876 update_mmu_cache_pmd(vma, address, &entry);
1877
1878 if (page_count(page) != 2) {
1879 set_pmd_at(mm, mmun_start, pmd, orig_entry);
1880 flush_tlb_range(vma, mmun_start, mmun_end);
1881 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
1882 update_mmu_cache_pmd(vma, address, &entry);
1883 page_remove_rmap(new_page);
1884 goto fail_putback;
1885 }
1886
1887 page_remove_rmap(page);
1888
1889
1890
1891
1892
1893
1894 mem_cgroup_end_migration(memcg, page, new_page, true);
1895 spin_unlock(ptl);
1896 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1897
1898 unlock_page(new_page);
1899 unlock_page(page);
1900 put_page(page);
1901 put_page(page);
1902
1903 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1904 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1905
1906 mod_zone_page_state(page_zone(page),
1907 NR_ISOLATED_ANON + page_lru,
1908 -HPAGE_PMD_NR);
1909 return isolated;
1910
1911out_fail:
1912 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1913out_dropref:
1914 ptl = pmd_lock(mm, pmd);
1915 if (pmd_same(*pmd, entry)) {
1916 entry = pmd_mknonnuma(entry);
1917 set_pmd_at(mm, mmun_start, pmd, entry);
1918 update_mmu_cache_pmd(vma, address, &entry);
1919 }
1920 spin_unlock(ptl);
1921
1922out_unlock:
1923 unlock_page(page);
1924 put_page(page);
1925 return 0;
1926}
1927#endif
1928
1929#endif
1930
1931
1932struct migrate_vma {
1933 struct vm_area_struct *vma;
1934 unsigned long *dst;
1935 unsigned long *src;
1936 unsigned long cpages;
1937 unsigned long npages;
1938 unsigned long start;
1939 unsigned long end;
1940};
1941
1942static int migrate_vma_collect_hole(unsigned long start,
1943 unsigned long end,
1944 struct mm_walk *walk)
1945{
1946 struct migrate_vma *migrate = walk->private;
1947 unsigned long addr;
1948
1949 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
1950 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
1951 migrate->dst[migrate->npages] = 0;
1952 migrate->npages++;
1953 migrate->cpages++;
1954 }
1955
1956 return 0;
1957}
1958
1959static int migrate_vma_collect_pmd(pmd_t *pmdp,
1960 unsigned long start,
1961 unsigned long end,
1962 struct mm_walk *walk)
1963{
1964 struct migrate_vma *migrate = walk->private;
1965 unsigned long addr = start, unmapped = 0;
1966 struct mm_struct *mm = walk->mm;
1967 spinlock_t *ptl;
1968 pte_t *ptep;
1969
1970 if (pmd_trans_huge(*pmdp))
1971 split_huge_page_pmd(migrate->vma, start, pmdp);
1972 if (pmd_none_or_trans_huge_or_clear_bad(pmdp))
1973 return migrate_vma_collect_hole(start, end, walk);
1974
1975 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
1976 arch_enter_lazy_mmu_mode();
1977
1978 for (; addr < end; addr += PAGE_SIZE, ptep++) {
1979 unsigned long mpfn, pfn;
1980 struct page *page;
1981 swp_entry_t entry;
1982 pte_t pte;
1983
1984 pte = *ptep;
1985 pfn = pte_pfn(pte);
1986
1987 if (pte_none(pte)) {
1988 mpfn = MIGRATE_PFN_MIGRATE;
1989 migrate->cpages++;
1990 pfn = 0;
1991 goto next;
1992 }
1993
1994 if (!pte_present(pte)) {
1995 mpfn = pfn = 0;
1996
1997 if (pte_file(pte))
1998 goto next;
1999
2000
2001
2002
2003
2004
2005 entry = pte_to_swp_entry(pte);
2006 if (!is_hmm_entry(entry))
2007 goto next;
2008
2009 page = hmm_entry_to_page(entry);
			mpfn = migrate_pfn(page_to_pfn(page)) |
				MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2012 if (is_write_hmm_entry(entry))
2013 mpfn |= MIGRATE_PFN_WRITE;
2014 } else {
2015 page = vm_normal_page(migrate->vma, addr, pte);
2016 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2017 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2018 }
2019
2020
2021 if (!page || !page->mapping || PageTransCompound(page)) {
2022 mpfn = pfn = 0;
2023 goto next;
2024 }
2025 pfn = page_to_pfn(page);
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036 get_page(page);
2037 migrate->cpages++;
2038
2039
2040
2041
2042
2043
2044 if (trylock_page(page)) {
2045 pte_t swp_pte;
2046
2047 mpfn |= MIGRATE_PFN_LOCKED;
2048 ptep_get_and_clear(mm, addr, ptep);
2049
2050
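			/* Setup special migration page table entry */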
2051 entry = make_migration_entry(page, pte_write(pte));
2052 swp_pte = swp_entry_to_pte(entry);
2053 if (pte_soft_dirty(pte))
2054 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2055 set_pte_at(mm, addr, ptep, swp_pte);
2056
2057
2058
2059
2060
2061
2062 page_remove_rmap(page);
2063 put_page(page);
2064 if (pte_present(pte))
2065 unmapped++;
2066 }
2067
2068next:
2069 migrate->dst[migrate->npages] = 0;
2070 migrate->src[migrate->npages++] = mpfn;
2071 }
2072 arch_leave_lazy_mmu_mode();
2073 pte_unmap_unlock(ptep - 1, ptl);
2074
2075
2076 if (unmapped)
2077 flush_tlb_range(migrate->vma, start, end);
2078
2079 return 0;
2080}
2081
/*
 * migrate_vma_collect() - collect pages over a range of virtual addresses
 * @migrate: migrate struct containing all migration information
 *
 * This will walk the CPU page table. For each virtual address backed by a
 * valid page, it updates the src array and takes a reference on the page, in
 * order to pin the page until we lock it and unmap it.
 */
2090static void migrate_vma_collect(struct migrate_vma *migrate)
2091{
2092 struct mm_walk mm_walk;
2093
2094 mm_walk.pgd_entry = NULL;
2095 mm_walk.pud_entry = NULL;
2096 mm_walk.pte_entry = NULL;
2097 mm_walk.private = migrate;
2098 mm_walk.hugetlb_entry = NULL;
2099 mm_walk.mm = migrate->vma->vm_mm;
2100 mm_walk.pte_hole = migrate_vma_collect_hole;
2101 mm_walk.pmd_entry = migrate_vma_collect_pmd;
2102
2103 mmu_notifier_invalidate_range_start(mm_walk.mm,
2104 migrate->start,
2105 migrate->end);
2106 walk_page_range(migrate->start, migrate->end, &mm_walk);
2107 mmu_notifier_invalidate_range_end(mm_walk.mm,
2108 migrate->start,
2109 migrate->end);
2110
2111 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2112}
2113
/*
 * migrate_vma_check_page() - check if a page is pinned or not
 * @page: struct page to check
 *
 * Pinned pages cannot be migrated. This is the same test as in
 * migrate_page_move_mapping(), except that here we allow migration of a
 * ZONE_DEVICE page.
 */
2122static bool migrate_vma_check_page(struct page *page)
2123{
2124
2125
2126
2127
2128
2129 int extra = 1;
2130
2131
2132
2133
2134
2135
2136 if (PageCompound(page))
2137 return false;
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152 if (is_hmm_page(page))
2153 return true;
2154
2155 if (is_zone_device_page(page))
2156
2157 return false;
2158
2159 if ((page_count(page) - extra) > page_mapcount(page))
2160 return false;
2161
2162 return true;
2163}
2164
/*
 * migrate_vma_prepare() - lock pages and isolate them from the lru
 * @migrate: migrate struct containing all migration information
 *
 * This locks pages that have been collected by migrate_vma_collect(). Once
 * each page is locked it is isolated from the lru (for non-device pages).
 * Pages that turn out to be pinned are restored to their page table entry
 * and dropped from the migration.
 */
2174static void migrate_vma_prepare(struct migrate_vma *migrate)
2175{
2176 const unsigned long npages = migrate->npages;
2177 const unsigned long start = migrate->start;
2178 unsigned long addr, i, restore = 0;
2179 bool allow_drain = true;
2180
2181 lru_add_drain();
2182
2183 for (i = 0; (i < npages) && migrate->cpages; i++) {
2184 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2185 bool remap = true;
2186
2187 if (!page)
2188 continue;
2189
2190 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
2191
2192
2193
2194
2195
2196
2197
2198
2199 if (!trylock_page(page)) {
2200 migrate->src[i] = 0;
2201 migrate->cpages--;
2202 put_page(page);
2203 continue;
2204 }
2205 remap = false;
2206 migrate->src[i] |= MIGRATE_PFN_LOCKED;
2207 }
2208
2209
2210 if (!is_zone_device_page(page)) {
2211 if (!PageLRU(page) && allow_drain) {
2212
2213 lru_add_drain_all();
2214 allow_drain = false;
2215 }
2216
2217 if (isolate_lru_page(page)) {
2218 if (remap) {
2219 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2220 migrate->cpages--;
2221 restore++;
2222 } else {
2223 migrate->src[i] = 0;
2224 unlock_page(page);
2225 migrate->cpages--;
2226 put_page(page);
2227 }
2228 continue;
2229 }
2230
2231
2232 put_page(page);
2233 }
2234
2235 if (!migrate_vma_check_page(page)) {
2236 if (remap) {
2237 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2238 migrate->cpages--;
2239 restore++;
2240
2241 if (!is_zone_device_page(page)) {
2242 get_page(page);
2243 putback_lru_page(page);
2244 }
2245 } else {
2246 migrate->src[i] = 0;
2247 unlock_page(page);
2248 migrate->cpages--;
2249
2250 if (!is_zone_device_page(page))
2251 putback_lru_page(page);
2252 else
2253 put_page(page);
2254 }
2255 }
2256 }
2257
2258 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
2259 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2260
2261 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2262 continue;
2263
2264 remove_migration_pte(page, migrate->vma, addr, page);
2265
2266 migrate->src[i] = 0;
2267 unlock_page(page);
2268 put_page(page);
2269 restore--;
2270 }
2271}
2272
2273
/*
 * migrate_vma_unmap() - replace page mapping with special migration pte entry
 * @migrate: migrate struct containing all migration information
 *
 * Replace the page mapping (CPU page table pte) with a special migration pte
 * entry and check again whether the page is still pinned. Pinned pages are
 * restored because we cannot migrate them.
 *
 * This is the last step before calling the device driver callback to
 * allocate destination memory and copy the contents of the original page.
 */
2284static void migrate_vma_unmap(struct migrate_vma *migrate)
2285{
2286 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2287 const unsigned long npages = migrate->npages;
2288 const unsigned long start = migrate->start;
2289 unsigned long addr, i, restore = 0;
2290
2291 for (i = 0; i < npages; i++) {
2292 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2293
2294 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2295 continue;
2296
2297 if (page_mapped(page)) {
2298 try_to_unmap(page, flags);
2299 if (page_mapped(page))
2300 goto restore;
2301 }
2302
2303 if (migrate_vma_check_page(page))
2304 continue;
2305
2306restore:
2307 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2308 migrate->cpages--;
2309 restore++;
2310 }
2311
2312 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2313 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2314
2315 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2316 continue;
2317
2318 remove_migration_ptes(page, page);
2319
2320 migrate->src[i] = 0;
2321 unlock_page(page);
2322 restore--;
2323
2324 if (is_zone_device_page(page))
2325 put_page(page);
2326 else
2327 putback_lru_page(page);
2328 }
2329}
2330
2331static void migrate_vma_insert_page(struct migrate_vma *migrate,
2332 unsigned long addr,
2333 struct page *page,
2334 unsigned long *src,
2335 unsigned long *dst)
2336{
2337 struct vm_area_struct *vma = migrate->vma;
2338 struct mm_struct *mm = vma->vm_mm;
2339 bool flush = false;
2340 spinlock_t *ptl;
2341 pgd_t *pgdp;
2342 pud_t *pudp;
2343 pmd_t *pmdp;
2344 pte_t *ptep;
2345 pte_t entry;
2346
2347
2348 if (!vma_is_anonymous(vma))
2349 goto abort;
2350
2351 pgdp = pgd_offset(mm, addr);
2352 pudp = pud_alloc(mm, pgdp, addr);
2353 if (!pudp)
2354 goto abort;
2355 pmdp = pmd_alloc(mm, pudp, addr);
2356 if (!pmdp)
2357 goto abort;
2358
2359 if (pmd_trans_unstable(pmdp))
2360 goto abort;
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372 if (__pte_alloc(mm, vma, pmdp, addr))
2373 goto abort;
2374
2375
2376 if (unlikely(pmd_trans_unstable(pmdp)))
2377 goto abort;
2378
2379 if (unlikely(anon_vma_prepare(vma)))
2380 goto abort;
2381 if (mem_cgroup_newpage_charge(page, vma->vm_mm, GFP_KERNEL))
2382 goto abort;
2383
2384
2385
2386
2387
2388
2389 __SetPageUptodate(page);
2390
2391 if (is_zone_device_page(page) && is_hmm_page(page)) {
2392 swp_entry_t swp_entry;
2393
2394 swp_entry = make_hmm_entry(page, vma->vm_flags & VM_WRITE);
2395 entry = swp_entry_to_pte(swp_entry);
2396 } else {
2397 entry = mk_pte(page, vma->vm_page_prot);
2398 if (vma->vm_flags & VM_WRITE)
2399 entry = pte_mkwrite(pte_mkdirty(entry));
2400 }
2401
2402 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2403 if (pte_present(*ptep)) {
2404 unsigned long pfn = pte_pfn(*ptep);
2405
2406 if (!is_zero_pfn(pfn)) {
2407 pte_unmap_unlock(ptep, ptl);
2408 mem_cgroup_uncharge_page(page);
2409 goto abort;
2410 }
2411 flush = true;
2412 } else if (!pte_none(*ptep)) {
2413 pte_unmap_unlock(ptep, ptl);
2414 mem_cgroup_uncharge_page(page);
2415 goto abort;
2416 }
2417
2418
2419
2420
2421
2422 if (userfaultfd_missing(vma)) {
2423 pte_unmap_unlock(ptep, ptl);
2424 mem_cgroup_uncharge_page(page);
2425 goto abort;
2426 }
2427
2428 page_add_new_anon_rmap(page, vma, addr);
2429 inc_mm_counter(mm, MM_ANONPAGES);
2430 get_page(page);
2431
2432 if (flush) {
2433 flush_cache_page(vma, addr, pte_pfn(*ptep));
2434 ptep_clear_flush_notify(vma, addr, ptep);
2435 set_pte_at_notify(mm, addr, ptep, entry);
2436 update_mmu_cache(vma, addr, ptep);
	} else {
		/* No need to invalidate - it was non-present before */
		set_pte_at(mm, addr, ptep, entry);
		update_mmu_cache(vma, addr, ptep);
	}
2442 pte_unmap_unlock(ptep, ptl);
2443 *src = MIGRATE_PFN_MIGRATE;
2444 return;
2445
2446abort:
2447 *src &= ~MIGRATE_PFN_MIGRATE;
2448}
2449
/*
 * migrate_vma_pages() - migrate meta-data from src page to dst page
 * @migrate: migrate struct containing all migration information
 *
 * This migrates struct page meta-data from the source struct page to the
 * destination struct page, and for anonymous holes it inserts the brand new
 * destination page directly into the page table. This effectively finishes
 * the migration from the source page to the destination page.
 */
2458static void migrate_vma_pages(struct migrate_vma *migrate)
2459{
2460 const unsigned long npages = migrate->npages;
2461 const unsigned long start = migrate->start;
2462 struct vm_area_struct *vma = migrate->vma;
2463 struct mm_struct *mm = vma->vm_mm;
2464 unsigned long addr, i, mmu_start;
2465 bool notified = false;
2466
2467 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2468 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2469 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2470 struct address_space *mapping;
2471 struct mem_cgroup *memcg;
2472 int r;
2473
2474 if (!newpage) {
2475 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2476 continue;
2477 }
2478
2479 if (!page) {
2480 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2481 continue;
2482 }
2483 if (!notified) {
2484 mmu_start = addr;
2485 notified = true;
2486 mmu_notifier_invalidate_range_start(mm,
2487 mmu_start,
2488 migrate->end);
2489 }
2490 migrate_vma_insert_page(migrate, addr, newpage,
2491 &migrate->src[i],
2492 &migrate->dst[i]);
2493 continue;
2494 }
2495
2496 mapping = page_mapping(page);
2497
2498 if (is_zone_device_page(newpage)) {
2499 if (is_hmm_page(newpage)) {
2500
2501
2502
2503
2504 if (mapping) {
2505 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2506 continue;
2507 }
2508 } else {
2509
2510
2511
2512
2513 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2514 continue;
2515 }
2516 }
2517
2518 newpage->index = page->index;
2519 newpage->mapping = page->mapping;
2520 if (PageSwapBacked(page))
2521 SetPageSwapBacked(newpage);
2522
2523 mem_cgroup_prepare_migration(page, newpage, &memcg);
2524 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
2525 mem_cgroup_end_migration(memcg, page, newpage,
2526 r == MIGRATEPAGE_SUCCESS);
2527 if (r != MIGRATEPAGE_SUCCESS)
2528 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2529 }
2530
2531 if (notified)
2532 mmu_notifier_invalidate_range_end(mm, mmu_start,
2533 migrate->end);
2534}
2535
/*
 * migrate_vma_finalize() - restore CPU page table entry
 * @migrate: migrate struct containing all migration information
 *
 * This replaces the special migration pte entry with either a mapping to the
 * new page if migration was successful for that page, or to the original
 * page otherwise.
 *
 * It also unlocks the pages and puts them back on the lru, or drops the
 * extra refcount for device pages.
 */
2547static void migrate_vma_finalize(struct migrate_vma *migrate)
2548{
2549 const unsigned long npages = migrate->npages;
2550 unsigned long i;
2551
2552 for (i = 0; i < npages; i++) {
2553 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2554 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2555
2556 if (!page) {
2557 if (newpage) {
2558 unlock_page(newpage);
2559 put_page(newpage);
2560 }
2561 continue;
2562 }
2563
2564 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
2565 if (newpage) {
2566 unlock_page(newpage);
2567 put_page(newpage);
2568 }
2569 newpage = page;
2570 }
2571
2572 remove_migration_ptes(page, newpage);
2573 unlock_page(page);
2574 migrate->cpages--;
2575
2576 if (is_zone_device_page(page))
2577 put_page(page);
2578 else
2579 putback_lru_page(page);
2580
2581 if (newpage != page) {
2582 unlock_page(newpage);
2583 if (is_zone_device_page(newpage))
2584 put_page(newpage);
2585 else
2586 putback_lru_page(newpage);
2587 }
2588 }
2589}
2590
2591
/*
 * migrate_vma() - migrate a range of memory inside a vma
 *
 * @ops: migration callbacks for allocating destination memory and copying
 * @vma: virtual memory area containing the range to be migrated
 * @start: start address of the range to migrate (inclusive, page aligned)
 * @end: end address of the range to migrate (exclusive, page aligned)
 * @src: array of migrate pfn entries describing the source pages
 * @dst: array of migrate pfn entries to be filled by the callback
 * @private: pointer passed back to each of the callbacks
 * Returns: 0 on success, error code otherwise
 *
 * This function tries to migrate a range of virtual addresses, using
 * callbacks to allocate and copy memory from source to destination.
 * It first collects the pages backing each address in the range and
 * stores them in the src array, then locks and unmaps them. Pages that
 * are not pinned keep the MIGRATE_PFN_MIGRATE flag set; the
 * alloc_and_copy() callback must then allocate destination memory and
 * copy the source data for those entries, filling the corresponding
 * dst entries. After struct page migration, finalize_and_map() is
 * called so the caller can update its own page tables, and finally the
 * CPU page table entries are restored, pointing to the new pages for
 * entries that migrated and to the original pages otherwise.
 *
 * A callback that fails to allocate or copy a destination page simply
 * leaves the corresponding dst entry empty; that page is then left in
 * place.
 */
2642int migrate_vma(const struct migrate_vma_ops *ops,
2643 struct vm_area_struct *vma,
2644 unsigned long start,
2645 unsigned long end,
2646 unsigned long *src,
2647 unsigned long *dst,
2648 void *private)
2649{
2650 struct migrate_vma migrate;
2651
2652
2653 start &= PAGE_MASK;
2654 end &= PAGE_MASK;
2655 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
2656 return -EINVAL;
2657 if (start < vma->vm_start || start >= vma->vm_end)
2658 return -EINVAL;
2659 if (end <= vma->vm_start || end > vma->vm_end)
2660 return -EINVAL;
2661 if (!ops || !src || !dst || start >= end)
2662 return -EINVAL;
2663
2664 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2665 migrate.src = src;
2666 migrate.dst = dst;
2667 migrate.start = start;
2668 migrate.npages = 0;
2669 migrate.cpages = 0;
2670 migrate.end = end;
2671 migrate.vma = vma;
2672
2673
2674 migrate_vma_collect(&migrate);
2675 if (!migrate.cpages)
2676 return 0;
2677
2678
2679 migrate_vma_prepare(&migrate);
2680 if (!migrate.cpages)
2681 return 0;
2682
2683
2684 migrate_vma_unmap(&migrate);
2685 if (!migrate.cpages)
2686 return 0;
2687
2688
2689
2690
2691
2692
2693
2694
2695
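	/*
	 * At this point the remaining pages are locked and unmapped, so
	 * their contents are stable and the callback can safely allocate
	 * destination memory and copy the data over. Migration of any
	 * individual page may still fail.
	 */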
2696 ops->alloc_and_copy(vma, src, dst, start, end, private);
2697
2698
2699 migrate_vma_pages(&migrate);
2700
2701 ops->finalize_and_map(vma, src, dst, start, end, private);
2702
2703
2704 migrate_vma_finalize(&migrate);
2705
2706 return 0;
2707}
2708EXPORT_SYMBOL(migrate_vma);
2709