linux/mm/migrate.c
   1/*
   2 * Memory Migration functionality - linux/mm/migrate.c
   3 *
   4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5 *
   6 * Page migration was first developed in the context of the memory hotplug
   7 * project. The main authors of the migration code are:
   8 *
   9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10 * Hirokazu Takahashi <taka@valinux.co.jp>
  11 * Dave Hansen <haveblue@us.ibm.com>
  12 * Christoph Lameter
  13 */
  14
  15#include <linux/migrate.h>
  16#include <linux/export.h>
  17#include <linux/swap.h>
  18#include <linux/swapops.h>
  19#include <linux/pagemap.h>
  20#include <linux/buffer_head.h>
  21#include <linux/mm_inline.h>
  22#include <linux/nsproxy.h>
  23#include <linux/pagevec.h>
  24#include <linux/ksm.h>
  25#include <linux/rmap.h>
  26#include <linux/topology.h>
  27#include <linux/cpu.h>
  28#include <linux/cpuset.h>
  29#include <linux/writeback.h>
  30#include <linux/mempolicy.h>
  31#include <linux/vmalloc.h>
  32#include <linux/security.h>
  33#include <linux/memcontrol.h>
  34#include <linux/syscalls.h>
  35#include <linux/hugetlb.h>
  36#include <linux/hugetlb_cgroup.h>
  37#include <linux/gfp.h>
  38#include <linux/balloon_compaction.h>
  39#include <linux/mmu_notifier.h>
  40#include <linux/memremap.h>
  41#include <linux/userfaultfd_k.h>
  42#include <linux/ptrace.h>
  43
  44#include <asm/tlbflush.h>
  45
  46#define CREATE_TRACE_POINTS
  47#include <trace/events/migrate.h>
  48
  49#include "internal.h"
  50
  51/*
  52 * migrate_prep() needs to be called before we start compiling a list of pages
  53 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
  54 * undesirable, use migrate_prep_local()
  55 */
  56int migrate_prep(void)
  57{
  58        /*
  59         * Clear the LRU lists so pages can be isolated.
  60         * Note that pages may be moved off the LRU after we have
  61         * drained them. Those pages will fail to migrate like other
  62         * pages that may be busy.
  63         */
  64        lru_add_drain_all();
  65
  66        return 0;
  67}
  68
  69/* Do the necessary work of migrate_prep but not if it involves other CPUs */
  70int migrate_prep_local(void)
  71{
  72        lru_add_drain();
  73
  74        return 0;
  75}
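
/*
 * Illustrative sketch, not part of the original file: a typical caller pairs
 * migrate_prep() with isolate_lru_page() and the helpers defined later in
 * this file, roughly as do_move_page_to_node_array() does.  The
 * example_new_page allocator is hypothetical.
 *
 *        LIST_HEAD(pagelist);
 *
 *        migrate_prep();
 *        if (!isolate_lru_page(page)) {
 *                list_add_tail(&page->lru, &pagelist);
 *                inc_zone_page_state(page, NR_ISOLATED_ANON +
 *                                    page_is_file_cache(page));
 *        }
 *        if (migrate_pages(&pagelist, example_new_page, 0,
 *                          MIGRATE_SYNC, MR_SYSCALL))
 *                putback_movable_pages(&pagelist);
 */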
  76
  77/*
  78 * Add isolated pages on the list back to the LRU under page lock
  79 * to avoid leaking evictable pages back onto unevictable list.
  80 */
  81void putback_lru_pages(struct list_head *l)
  82{
  83        struct page *page;
  84        struct page *page2;
  85
  86        list_for_each_entry_safe(page, page2, l, lru) {
  87                list_del(&page->lru);
  88                dec_zone_page_state(page, NR_ISOLATED_ANON +
  89                                page_is_file_cache(page));
  90                putback_lru_page(page);
  91        }
  92}
  93
  94/*
  95 * Put previously isolated pages back onto the appropriate lists
  96 * from where they were once taken off for compaction/migration.
  97 *
  98 * This function shall be used instead of putback_lru_pages(),
  99 * whenever the isolated pageset has been built by isolate_migratepages_range()
 100 */
 101void putback_movable_pages(struct list_head *l)
 102{
 103        struct page *page;
 104        struct page *page2;
 105
 106        list_for_each_entry_safe(page, page2, l, lru) {
 107                if (unlikely(PageHuge(page))) {
 108                        putback_active_hugepage(page);
 109                        continue;
 110                }
 111                list_del(&page->lru);
 112                dec_zone_page_state(page, NR_ISOLATED_ANON +
 113                                page_is_file_cache(page));
 114                if (unlikely(isolated_balloon_page(page)))
 115                        balloon_page_putback(page);
 116                else
 117                        putback_lru_page(page);
 118        }
 119}
 120
 121/*
 122 * Restore a potential migration pte to a working pte entry
 123 */
 124static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 125                                 unsigned long addr, void *old)
 126{
 127        struct mm_struct *mm = vma->vm_mm;
 128        swp_entry_t entry;
 129        pmd_t *pmd;
 130        pte_t *ptep, pte;
 131        spinlock_t *ptl;
 132
 133        if (unlikely(PageHuge(new))) {
 134                ptep = huge_pte_offset(mm, addr);
 135                if (!ptep)
 136                        goto out;
 137                ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
 138        } else {
 139                pmd = mm_find_pmd(mm, addr);
 140                if (!pmd)
 141                        goto out;
 142
 143                ptep = pte_offset_map(pmd, addr);
 144
 145                /*
 146                 * Peek to check is_swap_pte() before taking ptlock?  No, we
 147                 * can race mremap's move_ptes(), which skips anon_vma lock.
 148                 */
 149
 150                ptl = pte_lockptr(mm, pmd);
 151        }
 152
 153        spin_lock(ptl);
 154        pte = *ptep;
 155        if (!is_swap_pte(pte))
 156                goto unlock;
 157
 158        entry = pte_to_swp_entry(pte);
 159
 160        if (!is_migration_entry(entry) ||
 161            migration_entry_to_page(entry) != old)
 162                goto unlock;
 163
 164        get_page(new);
 165        pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
 166        if (pte_swp_soft_dirty(*ptep))
 167                pte = pte_mksoft_dirty(pte);
 168        if (is_write_migration_entry(entry))
 169                pte = pte_mkwrite(pte);
 170#ifdef CONFIG_HUGETLB_PAGE
 171        if (PageHuge(new)) {
 172                pte = pte_mkhuge(pte);
 173                pte = arch_make_huge_pte(pte, vma, new, 0);
 174        }
 175#endif
 176
 177        if (unlikely(is_zone_device_page(new)) && is_hmm_page(new)) {
 178                entry = make_hmm_entry(new, pte_write(pte));
 179                pte = swp_entry_to_pte(entry);
 180        } else
 181                flush_dcache_page(new);
 182        set_pte_at(mm, addr, ptep, pte);
 183
 184        if (PageHuge(new)) {
 185                if (PageAnon(new))
 186                        hugepage_add_anon_rmap(new, vma, addr);
 187                else
 188                        page_dup_rmap(new);
 189        } else if (PageAnon(new))
 190                page_add_anon_rmap(new, vma, addr);
 191        else
 192                page_add_file_rmap(new);
 193
 194        /* No need to invalidate - it was non-present before */
 195        update_mmu_cache(vma, addr, ptep);
 196unlock:
 197        pte_unmap_unlock(ptep, ptl);
 198out:
 199        return SWAP_AGAIN;
 200}
 201
 202/*
 203 * Get rid of all migration entries and replace them by
 204 * references to the indicated page.
 205 */
 206static void remove_migration_ptes(struct page *old, struct page *new)
 207{
 208        rmap_walk(new, remove_migration_pte, old);
 209}
 210
 211/*
 212 * Something used the pte of a page under migration. We need to
 213 * get to the page and wait until migration is finished.
 214 * When we return from this function the fault will be retried.
 215 */
 216void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 217                                spinlock_t *ptl)
 218{
 219        pte_t pte;
 220        swp_entry_t entry;
 221        struct page *page;
 222
 223        spin_lock(ptl);
 224        pte = *ptep;
 225        if (!is_swap_pte(pte))
 226                goto out;
 227
 228        entry = pte_to_swp_entry(pte);
 229        if (!is_migration_entry(entry))
 230                goto out;
 231
 232        page = migration_entry_to_page(entry);
 233
 234        /*
 235         * Once radix-tree replacement during page migration has started,
 236         * page_count *must* be zero. And we don't want to call
 237         * wait_on_page_locked() against a page without holding a reference,
 238         * so we use get_page_unless_zero() here. Even if that fails, the
 239         * page fault will simply occur again.
 240         */
 241        if (!get_page_unless_zero(page))
 242                goto out;
 243        if (is_zone_device_page(page))
 244                get_zone_device_page(page);
 245        pte_unmap_unlock(ptep, ptl);
 246        wait_on_page_locked(page);
 247        put_page(page);
 248        return;
 249out:
 250        pte_unmap_unlock(ptep, ptl);
 251}
 252
 253void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 254                                unsigned long address)
 255{
 256        spinlock_t *ptl = pte_lockptr(mm, pmd);
 257        pte_t *ptep = pte_offset_map(pmd, address);
 258        __migration_entry_wait(mm, ptep, ptl);
 259}
 260
 261void migration_entry_wait_huge(struct vm_area_struct *vma,
 262                struct mm_struct *mm, pte_t *pte)
 263{
 264        spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
 265        __migration_entry_wait(mm, pte, ptl);
 266}
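
/*
 * Illustrative sketch, not from this file: the page-fault path is the main
 * consumer of migration_entry_wait().  In do_swap_page() the pattern is
 * roughly
 *
 *        entry = pte_to_swp_entry(orig_pte);
 *        if (unlikely(non_swap_entry(entry))) {
 *                if (is_migration_entry(entry))
 *                        migration_entry_wait(mm, pmd, address);
 *                ...
 *        }
 *
 * so the faulting task sleeps on the old page's lock until migration
 * completes and then retries the fault.
 */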
 267
 268#ifdef CONFIG_BLOCK
 269/* Returns true if all buffers are successfully locked */
 270static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 271                                                        enum migrate_mode mode)
 272{
 273        struct buffer_head *bh = head;
 274
 275        /* Simple case, sync compaction */
 276        if (mode != MIGRATE_ASYNC) {
 277                do {
 278                        get_bh(bh);
 279                        lock_buffer(bh);
 280                        bh = bh->b_this_page;
 281
 282                } while (bh != head);
 283
 284                return true;
 285        }
 286
 287        /* async case, we cannot block on lock_buffer so use trylock_buffer */
 288        do {
 289                get_bh(bh);
 290                if (!trylock_buffer(bh)) {
 291                        /*
 292                         * We failed to lock the buffer and cannot stall in
 293                         * async migration. Release the taken locks
 294                         */
 295                        struct buffer_head *failed_bh = bh;
 296                        put_bh(failed_bh);
 297                        bh = head;
 298                        while (bh != failed_bh) {
 299                                unlock_buffer(bh);
 300                                put_bh(bh);
 301                                bh = bh->b_this_page;
 302                        }
 303                        return false;
 304                }
 305
 306                bh = bh->b_this_page;
 307        } while (bh != head);
 308        return true;
 309}
 310#else
 311static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
 312                                                        enum migrate_mode mode)
 313{
 314        return true;
 315}
 316#endif /* CONFIG_BLOCK */
 317
 318/*
 319 * Replace the page in the mapping.
 320 *
 321 * The number of remaining references must be:
 322 * 1 for anonymous pages without a mapping
 323 * 2 for pages with a mapping
 324 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 325 */
 326int migrate_page_move_mapping(struct address_space *mapping,
 327                struct page *newpage, struct page *page,
 328                struct buffer_head *head, enum migrate_mode mode,
 329                int extra_count)
 330{
 331        int expected_count = 1 + extra_count;
 332        void **pslot;
 333
 334        /*
 335         * ZONE_DEVICE pages have 1 refcount always held by their device
 336         *
 337         * Note that DAX memory will never reach that point as it does not have
 338         * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
 339         */
 340        expected_count += is_zone_device_page(page);
 341
 342        if (!mapping) {
 343                /* Anonymous page without mapping */
 344                if (page_count(page) != expected_count)
 345                        return -EAGAIN;
 346                return MIGRATEPAGE_SUCCESS;
 347        }
 348
 349        spin_lock_irq(&mapping->tree_lock);
 350
 351        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 352                                        page_index(page));
 353
 354        expected_count += 1 + page_has_private(page);
 355        if (page_count(page) != expected_count ||
 356                radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 357                spin_unlock_irq(&mapping->tree_lock);
 358                return -EAGAIN;
 359        }
 360
 361        if (!page_ref_freeze(page, expected_count)) {
 362                spin_unlock_irq(&mapping->tree_lock);
 363                return -EAGAIN;
 364        }
 365
 366        /*
 367         * In the async migration case of moving a page with buffers, lock the
 368         * buffers using trylock before the mapping is moved. If the mapping
 369         * were moved first and we then failed to lock the buffers, we could
 370         * not move the mapping back because of the elevated page count and
 371         * would have to block waiting for other references to be dropped.
 372         */
 373        if (mode == MIGRATE_ASYNC && head &&
 374                        !buffer_migrate_lock_buffers(head, mode)) {
 375                page_ref_unfreeze(page, expected_count);
 376                spin_unlock_irq(&mapping->tree_lock);
 377                return -EAGAIN;
 378        }
 379
 380        /*
 381         * Now we know that no one else is looking at the page.
 382         */
 383        get_page(newpage);      /* add cache reference */
 384        if (PageSwapCache(page)) {
 385                SetPageSwapCache(newpage);
 386                set_page_private(newpage, page_private(page));
 387        }
 388
 389        radix_tree_replace_slot(pslot, newpage);
 390
 391        /*
 392         * Drop cache reference from old page by unfreezing
 393         * to one less reference.
 394         * We know this isn't the last reference.
 395         */
 396        page_ref_unfreeze(page, expected_count - 1);
 397
 398        /*
 399         * If moved to a different zone then also account
 400         * the page for that zone. Other VM counters will be
 401         * taken care of when we establish references to the
 402         * new page and drop references to the old page.
 403         *
 404         * Note that anonymous pages are accounted for
 405         * via NR_FILE_PAGES and NR_ANON_PAGES if they
 406         * are mapped to swap space.
 407         */
 408        __dec_zone_page_state(page, NR_FILE_PAGES);
 409        __inc_zone_page_state(newpage, NR_FILE_PAGES);
 410        if (!PageSwapCache(page) && PageSwapBacked(page)) {
 411                __dec_zone_page_state(page, NR_SHMEM);
 412                __inc_zone_page_state(newpage, NR_SHMEM);
 413        }
 414        spin_unlock_irq(&mapping->tree_lock);
 415
 416        return MIGRATEPAGE_SUCCESS;
 417}
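
/*
 * Added note, not in the original: a worked example of the reference counts
 * checked above.  For a file-backed page with buffer heads and extra_count
 * of 0, the caller's isolation reference (1) plus the page-cache reference
 * (1) plus PagePrivate (1) gives expected_count == 3; a page with a mapping
 * but no private data gives 2; an anonymous page with no mapping carries
 * only the caller's reference, hence 1, matching the comment above
 * migrate_page_move_mapping().
 */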
 418
 419/*
 420 * The expected number of remaining references is the same as that
 421 * of migrate_page_move_mapping().
 422 */
 423int migrate_huge_page_move_mapping(struct address_space *mapping,
 424                                   struct page *newpage, struct page *page)
 425{
 426        int expected_count;
 427        void **pslot;
 428
 429        if (!mapping) {
 430                if (page_count(page) != 1)
 431                        return -EAGAIN;
 432                return MIGRATEPAGE_SUCCESS;
 433        }
 434
 435        spin_lock_irq(&mapping->tree_lock);
 436
 437        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 438                                        page_index(page));
 439
 440        expected_count = 2 + page_has_private(page);
 441        if (page_count(page) != expected_count ||
 442                radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 443                spin_unlock_irq(&mapping->tree_lock);
 444                return -EAGAIN;
 445        }
 446
 447        if (!page_ref_freeze(page, expected_count)) {
 448                spin_unlock_irq(&mapping->tree_lock);
 449                return -EAGAIN;
 450        }
 451
 452        get_page(newpage);
 453
 454        radix_tree_replace_slot(pslot, newpage);
 455
 456        page_ref_unfreeze(page, expected_count - 1);
 457
 458        spin_unlock_irq(&mapping->tree_lock);
 459        return MIGRATEPAGE_SUCCESS;
 460}
 461
 462/*
 463 * Gigantic pages are so large that we do not guarantee that page++ pointer
 464 * arithmetic will work across the entire page.  We need something more
 465 * specialized.
 466 */
 467static void __copy_gigantic_page(struct page *dst, struct page *src,
 468                                int nr_pages)
 469{
 470        int i;
 471        struct page *dst_base = dst;
 472        struct page *src_base = src;
 473
 474        for (i = 0; i < nr_pages; ) {
 475                cond_resched();
 476                copy_highpage(dst, src);
 477
 478                i++;
 479                dst = mem_map_next(dst, dst_base, i);
 480                src = mem_map_next(src, src_base, i);
 481        }
 482}
 483
 484static void copy_huge_page(struct page *dst, struct page *src)
 485{
 486        int i;
 487        int nr_pages;
 488
 489        if (PageHuge(src)) {
 490                /* hugetlbfs page */
 491                struct hstate *h = page_hstate(src);
 492                nr_pages = pages_per_huge_page(h);
 493
 494                if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
 495                        __copy_gigantic_page(dst, src, nr_pages);
 496                        return;
 497                }
 498        } else {
 499                /* thp page */
 500                BUG_ON(!PageTransHuge(src));
 501                nr_pages = hpage_nr_pages(src);
 502        }
 503
 504        for (i = 0; i < nr_pages; i++) {
 505                cond_resched();
 506                copy_highpage(dst + i, src + i);
 507        }
 508}
 509
 510/*
 511 * Copy the page to its new location
 512 */
 513void migrate_page_states(struct page *newpage, struct page *page)
 514{
 515        int cpupid;
 516
 517        if (PageError(page))
 518                SetPageError(newpage);
 519        if (PageReferenced(page))
 520                SetPageReferenced(newpage);
 521        if (PageUptodate(page))
 522                SetPageUptodate(newpage);
 523        if (TestClearPageActive(page)) {
 524                VM_BUG_ON_PAGE(PageUnevictable(page), page);
 525                SetPageActive(newpage);
 526        } else if (TestClearPageUnevictable(page))
 527                SetPageUnevictable(newpage);
 528        if (PageChecked(page))
 529                SetPageChecked(newpage);
 530        if (PageMappedToDisk(page))
 531                SetPageMappedToDisk(newpage);
 532
 533        if (PageDirty(page)) {
 534                clear_page_dirty_for_io(page);
 535                /*
 536                 * Want to mark the page and the radix tree as dirty, and
 537                 * redo the accounting that clear_page_dirty_for_io undid,
 538                 * but we can't use set_page_dirty because that function
 539                 * is actually a signal that all of the page has become dirty.
 540                 * Whereas only part of our page may be dirty.
 541                 */
 542                if (PageSwapBacked(page))
 543                        SetPageDirty(newpage);
 544                else
 545                        __set_page_dirty_nobuffers(newpage);
 546        }
 547
 548        /*
 549         * Copy NUMA information to the new page, to prevent over-eager
 550         * future migrations of this same page.
 551         */
 552        cpupid = page_cpupid_xchg_last(page, -1);
 553        page_cpupid_xchg_last(newpage, cpupid);
 554
 555        mlock_migrate_page(newpage, page);
 556        ksm_migrate_page(newpage, page);
 557        /*
 558         * Please do not reorder this without considering how mm/ksm.c's
 559         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
 560         */
 561        ClearPageSwapCache(page);
 562        ClearPagePrivate(page);
 563        set_page_private(page, 0);
 564
 565        /*
 566         * If any waiters have accumulated on the new page then
 567         * wake them up.
 568         */
 569        if (PageWriteback(newpage))
 570                end_page_writeback(newpage);
 571}
 572
 573void migrate_page_copy(struct page *newpage, struct page *page)
 574{
 575        if (PageHuge(page) || PageTransHuge(page))
 576                copy_huge_page(newpage, page);
 577        else
 578                copy_highpage(newpage, page);
 579
 580        migrate_page_states(newpage, page);
 581}
 582
 583/************************************************************
 584 *                    Migration functions
 585 ***********************************************************/
 586
 587/* Always fail migration. Used for mappings that are not movable */
 588int fail_migrate_page(struct address_space *mapping,
 589                        struct page *newpage, struct page *page)
 590{
 591        return -EIO;
 592}
 593EXPORT_SYMBOL(fail_migrate_page);
 594
 595/*
 596 * Common logic to directly migrate a single page suitable for
 597 * pages that do not use PagePrivate/PagePrivate2.
 598 *
 599 * Pages are locked upon entry and exit.
 600 */
 601int migrate_page(struct address_space *mapping,
 602                struct page *newpage, struct page *page,
 603                enum migrate_mode mode)
 604{
 605        int rc;
 606
 607        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 608
 609        rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
 610
 611        if (rc != MIGRATEPAGE_SUCCESS)
 612                return rc;
 613
 614        if ((int)mode != MIGRATE_SYNC_NO_COPY)
 615                migrate_page_copy(newpage, page);
 616        else
 617                migrate_page_states(newpage, page);
 618        return MIGRATEPAGE_SUCCESS;
 619}
 620EXPORT_SYMBOL(migrate_page);
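
/*
 * Illustrative sketch, not part of the original file: filesystems whose pages
 * carry no private data typically wire migrate_page() straight into their
 * address_space_operations, e.g.
 *
 *        static const struct address_space_operations example_aops = {
 *                .readpage       = example_readpage,
 *                .writepage      = example_writepage,
 *                .migratepage    = migrate_page,
 *        };
 *
 * The example_* names are hypothetical; .migratepage is the callback invoked
 * from move_to_new_page() below.
 */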
 621
 622#ifdef CONFIG_BLOCK
 623/*
 624 * Migration function for pages with buffers. This function can only be used
 625 * if the underlying filesystem guarantees that no other references to "page"
 626 * exist.
 627 */
 628int buffer_migrate_page(struct address_space *mapping,
 629                struct page *newpage, struct page *page, enum migrate_mode mode)
 630{
 631        struct buffer_head *bh, *head;
 632        int rc;
 633
 634        if (!page_has_buffers(page))
 635                return migrate_page(mapping, newpage, page, mode);
 636
 637        head = page_buffers(page);
 638
 639        rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
 640
 641        if (rc != MIGRATEPAGE_SUCCESS)
 642                return rc;
 643
 644        /*
 645         * In the async case, migrate_page_move_mapping locked the buffers
 646         * with an IRQ-safe spinlock held. In the sync case, the buffers
 647         * need to be locked now
 648         */
 649        if (mode != MIGRATE_ASYNC)
 650                BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 651
 652        ClearPagePrivate(page);
 653        set_page_private(newpage, page_private(page));
 654        set_page_private(page, 0);
 655        put_page(page);
 656        get_page(newpage);
 657
 658        bh = head;
 659        do {
 660                set_bh_page(bh, newpage, bh_offset(bh));
 661                bh = bh->b_this_page;
 662
 663        } while (bh != head);
 664
 665        SetPagePrivate(newpage);
 666
 667        if ((int)mode != MIGRATE_SYNC_NO_COPY)
 668                migrate_page_copy(newpage, page);
 669        else
 670                migrate_page_states(newpage, page);
 671
 672        bh = head;
 673        do {
 674                unlock_buffer(bh);
 675                put_bh(bh);
 676                bh = bh->b_this_page;
 677
 678        } while (bh != head);
 679
 680        return MIGRATEPAGE_SUCCESS;
 681}
 682EXPORT_SYMBOL(buffer_migrate_page);
 683#endif
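
/*
 * Illustrative note, not part of the original file: mappings that keep
 * buffer_head state, such as the block device page cache, point their
 * .migratepage hook at buffer_migrate_page() rather than migrate_page(), so
 * the buffers move along with the page.
 */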
 684
 685/*
 686 * Writeback a page to clean the dirty state
 687 */
 688static int writeout(struct address_space *mapping, struct page *page)
 689{
 690        struct writeback_control wbc = {
 691                .sync_mode = WB_SYNC_NONE,
 692                .nr_to_write = 1,
 693                .range_start = 0,
 694                .range_end = LLONG_MAX,
 695                .for_reclaim = 1
 696        };
 697        int rc;
 698
 699        if (!mapping->a_ops->writepage)
 700                /* No write method for the address space */
 701                return -EINVAL;
 702
 703        if (!clear_page_dirty_for_io(page))
 704                /* Someone else already triggered a write */
 705                return -EAGAIN;
 706
 707        /*
 708         * A dirty page may imply that the underlying filesystem has
 709         * the page on some queue. So the page must be clean for
 710         * migration. Writeout may mean we lose the lock and the
 711         * page state is no longer what we checked for earlier.
 712         * At this point we know that the migration attempt cannot
 713         * be successful.
 714         */
 715        remove_migration_ptes(page, page);
 716
 717        rc = mapping->a_ops->writepage(page, &wbc);
 718
 719        if (rc != AOP_WRITEPAGE_ACTIVATE)
 720                /* unlocked. Relock */
 721                lock_page(page);
 722
 723        return (rc < 0) ? -EIO : -EAGAIN;
 724}
 725
 726/*
 727 * Default handling if a filesystem does not provide a migration function.
 728 */
 729static int fallback_migrate_page(struct address_space *mapping,
 730        struct page *newpage, struct page *page, enum migrate_mode mode)
 731{
 732        if (PageDirty(page)) {
 733                /* Only writeback pages in full synchronous migration */
 734                switch ((int)mode) {
 735                case MIGRATE_SYNC:
 736                case MIGRATE_SYNC_NO_COPY:
 737                        break;
 738                default:
 739                        return -EBUSY;
 740                }
 741                return writeout(mapping, page);
 742        }
 743
 744        /*
 745         * Buffers may be managed in a filesystem specific way.
 746         * We must have no buffers or drop them.
 747         */
 748        if (page_has_private(page) &&
 749            !try_to_release_page(page, GFP_KERNEL))
 750                return -EAGAIN;
 751
 752        return migrate_page(mapping, newpage, page, mode);
 753}
 754
 755/*
 756 * Move a page to a newly allocated page
 757 * The page is locked and all ptes have been successfully removed.
 758 *
 759 * The new page will have replaced the old page if this function
 760 * is successful.
 761 *
 762 * Return value:
 763 *   < 0 - error code
 764 *  MIGRATEPAGE_SUCCESS - success
 765 */
 766static int move_to_new_page(struct page *newpage, struct page *page,
 767                                int page_was_mapped, enum migrate_mode mode)
 768{
 769        struct address_space *mapping;
 770        int rc;
 771
 772        /*
 773         * Block others from accessing the page when we get around to
 774         * establishing additional references. We are the only one
 775         * holding a reference to the new page at this point.
 776         */
 777        if (!trylock_page(newpage))
 778                BUG();
 779
 780        /* Prepare mapping for the new page. */
 781        newpage->index = page->index;
 782        newpage->mapping = page->mapping;
 783        if (PageSwapBacked(page))
 784                SetPageSwapBacked(newpage);
 785
 786        mapping = page_mapping(page);
 787        if (!mapping)
 788                rc = migrate_page(mapping, newpage, page, mode);
 789        else if (mapping->a_ops->migratepage)
 790                /*
 791                 * Most pages have a mapping and most filesystems provide a
 792                 * migratepage callback. Anonymous pages are part of swap
 793                 * space which also has its own migratepage callback. This
 794                 * is the most common path for page migration.
 795                 */
 796                rc = mapping->a_ops->migratepage(mapping,
 797                                                newpage, page, mode);
 798        else
 799                rc = fallback_migrate_page(mapping, newpage, page, mode);
 800
 801        if (rc != MIGRATEPAGE_SUCCESS) {
 802                newpage->mapping = NULL;
 803        } else {
 804                if (page_was_mapped)
 805                        remove_migration_ptes(page, newpage);
 806                page->mapping = NULL;
 807        }
 808
 809        unlock_page(newpage);
 810
 811        return rc;
 812}
 813
 814static int __unmap_and_move(struct page *page, struct page *newpage,
 815                                int force, enum migrate_mode mode)
 816{
 817        int rc = -EAGAIN;
 818        int page_was_mapped = 0;
 819        struct mem_cgroup *mem;
 820        struct anon_vma *anon_vma = NULL;
 821
 822        if (!trylock_page(page)) {
 823                if (!force || mode == MIGRATE_ASYNC)
 824                        goto out;
 825
 826                /*
 827                 * It's not safe for direct compaction to call lock_page.
 828                 * For example, during page readahead pages are added locked
 829                 * to the LRU. Later, when the IO completes the pages are
 830                 * marked uptodate and unlocked. However, the queueing
 831                 * could be merging multiple pages for one bio (e.g.
 832                 * mpage_readpages). If an allocation happens for the
 833                 * second or third page, the process can end up locking
 834                 * the same page twice and deadlocking. Rather than
 835                 * trying to be clever about what pages can be locked,
 836                 * avoid the use of lock_page for direct compaction
 837                 * altogether.
 838                 */
 839                if (current->flags & PF_MEMALLOC)
 840                        goto out;
 841
 842                lock_page(page);
 843        }
 844
 845        /* charge against new page */
 846        mem_cgroup_prepare_migration(page, newpage, &mem);
 847
 848        if (PageWriteback(page)) {
 849                /*
 850                 * Only in the case of a full synchronous migration is it
 851                 * necessary to wait for PageWriteback. In the async case,
 852                 * the retry loop is too short and in the sync-light case,
 853                 * the overhead of stalling is too much
 854                 */
 855                switch ((int)mode) {
 856                case MIGRATE_SYNC:
 857                case MIGRATE_SYNC_NO_COPY:
 858                        break;
 859                default:
 860                        rc = -EBUSY;
 861                        goto uncharge;
 862                }
 863                if (!force)
 864                        goto uncharge;
 865                wait_on_page_writeback(page);
 866        }
 867        /*
 868         * By the time try_to_unmap() is done, page->mapcount has gone down
 869         * to 0, and we could fail to notice that the anon_vma was freed while
 870         * we migrate the page. This get_anon_vma() delays freeing the anon_vma
 871         * pointer until the end of migration. File-cache pages are no problem
 872         * because they are protected by the page lock: migration uses
 873         * writepage() or lock_page() on them, so only anon pages need care here.
 874         */
 875        if (PageAnon(page) && !PageKsm(page)) {
 876                /*
 877                 * Only page_lock_anon_vma_read() understands the subtleties of
 878                 * getting a hold on an anon_vma from outside one of its mms.
 879                 */
 880                anon_vma = page_get_anon_vma(page);
 881                if (anon_vma) {
 882                        /*
 883                         * Anon page
 884                         */
 885                } else if (PageSwapCache(page)) {
 886                        /*
 887                         * We cannot be sure that the anon_vma of an unmapped
 888                         * swapcache page is safe to use because we don't
 889                         * know in advance if the VMA that this page belonged
 890                         * to still exists. If the VMA and others sharing the
 891                         * data have been freed, then the anon_vma could
 892                         * already be invalid.
 893                         *
 894                         * To avoid this possibility, swapcache pages get
 895                         * migrated but are not remapped when migration
 896                         * completes
 897                         */
 898                } else {
 899                        goto uncharge;
 900                }
 901        }
 902
 903        if (unlikely(isolated_balloon_page(page))) {
 904                /*
 905                 * A ballooned page does not need any special attention from
 906                 * physical to virtual reverse mapping procedures.
 907                 * Skip any attempt to unmap PTEs or to remap swap cache,
 908                 * in order to avoid burning cycles at rmap level, and perform
 909         * the page migration right away (protected by page lock).
 910                 */
 911                rc = balloon_page_migrate(newpage, page, mode);
 912                goto uncharge;
 913        }
 914
 915        /*
 916         * Corner case handling:
 917         * 1. When a new swap-cache page is read in, it is added to the LRU
 918         * and treated as swapcache but it has no rmap yet.
 919         * Calling try_to_unmap() against a page->mapping==NULL page will
 920         * trigger a BUG.  So handle it here.
 921         * 2. An orphaned page (see truncate_complete_page) might have
 922         * fs-private metadata. The page can be picked up due to memory
 923         * offlining.  Everywhere else except page reclaim, the page is
 924         * invisible to the vm, so the page can not be migrated.  So try to
 925         * free the metadata, so the page can be freed.
 926         */
 927        if (!page->mapping) {
 928                VM_BUG_ON_PAGE(PageAnon(page), page);
 929                if (page_has_private(page)) {
 930                        try_to_free_buffers(page);
 931                        goto uncharge;
 932                }
 933                goto skip_unmap;
 934        }
 935
 936        /* Establish migration ptes or remove ptes */
 937        if (page_mapped(page)) {
 938                try_to_unmap(page,
 939                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 940                page_was_mapped = 1;
 941        }
 942
 943skip_unmap:
 944        if (!page_mapped(page))
 945                rc = move_to_new_page(newpage, page, page_was_mapped, mode);
 946
 947        if (rc && page_was_mapped)
 948                remove_migration_ptes(page, page);
 949
 950        /* Drop an anon_vma reference if we took one */
 951        if (anon_vma)
 952                put_anon_vma(anon_vma);
 953
 954uncharge:
 955        mem_cgroup_end_migration(mem, page, newpage,
 956                                 rc == MIGRATEPAGE_SUCCESS);
 957        unlock_page(page);
 958out:
 959        return rc;
 960}
 961
 962/*
 963 * Obtain the lock on page, remove all ptes and migrate the page
 964 * to the newly allocated page in newpage.
 965 */
 966static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 967                        struct page *page, int force, enum migrate_mode mode,
 968                        enum migrate_reason reason)
 969{
 970        int rc = 0;
 971        int *result = NULL;
 972        struct page *newpage = get_new_page(page, private, &result);
 973
 974        if (!newpage)
 975                return -ENOMEM;
 976
 977        if (page_count(page) == 1) {
 978                /* page was freed from under us. So we are done. */
 979                goto out;
 980        }
 981
 982        if (unlikely(PageTransHuge(page)))
 983                if (unlikely(split_huge_page(page)))
 984                        goto out;
 985
 986        rc = __unmap_and_move(page, newpage, force, mode);
 987
 988out:
 989        if (rc != -EAGAIN) {
 990                /*
 991                 * A page that has been migrated has all references
 992                 * removed and will be freed. A page that has not been
 993                 * migrated will have kept its references and be
 994                 * restored.
 995                 */
 996                list_del(&page->lru);
 997                dec_zone_page_state(page, NR_ISOLATED_ANON +
 998                                page_is_file_cache(page));
 999                if (reason != MR_MEMORY_FAILURE)
1000                        putback_lru_page(page);
1001        }
1002        /*
1003         * Move the new page to the LRU. If migration was not successful
1004         * then this will free the page.
1005         */
1006        if (unlikely(__is_movable_balloon_page(newpage))) {
1007                /* drop our reference, page already in the balloon */
1008                put_page(newpage);
1009        } else {
1010                putback_lru_page(newpage);
1011        }
1012
1013        if (result) {
1014                if (rc)
1015                        *result = rc;
1016                else
1017                        *result = page_to_nid(newpage);
1018        }
1019        return rc;
1020}
1021
1022/*
1023 * Counterpart of unmap_and_move() for hugepage migration.
1024 *
1025 * This function doesn't wait for the completion of hugepage I/O
1026 * because there is no race between I/O and migration for hugepages.
1027 * Note that currently hugepage I/O occurs only via direct I/O
1028 * where no lock is held and PG_writeback is irrelevant, and the
1029 * writeback status of all subpages is counted in the reference
1030 * count of the head page (i.e. if all subpages of a 2MB hugepage are
1031 * under direct I/O, the reference count of the head page is 512 plus
1032 * a bit more). This means that when we try to migrate a hugepage whose
1033 * subpages are doing direct I/O, some references remain after
1034 * try_to_unmap() and hugepage migration fails without data corruption.
1035 *
1036 * There is also no race when direct I/O is issued on the page under migration,
1037 * because then pte is replaced with migration swap entry and direct I/O code
1038 * will wait in the page fault for migration to complete.
1039 */
1040static int unmap_and_move_huge_page(new_page_t get_new_page,
1041                                unsigned long private, struct page *hpage,
1042                                int force, enum migrate_mode mode)
1043{
1044        int rc = 0;
1045        int *result = NULL;
1046        int page_was_mapped = 0;
1047        struct page *new_hpage;
1048        struct anon_vma *anon_vma = NULL;
1049
1050        /*
1051         * Movability of hugepages depends on the architecture and the hugepage
1052         * size. This check is necessary because some callers of hugepage
1053         * migration, like soft offline and memory hotremove, don't walk through
1054         * page tables or check whether the hugepage is pmd-based or not before
1055         * kicking off migration.
1056         */
1057        if (!hugepage_migration_supported(page_hstate(hpage))) {
1058                putback_active_hugepage(hpage);
1059                return -ENOSYS;
1060        }
1061
1062        new_hpage = get_new_page(hpage, private, &result);
1063        if (!new_hpage)
1064                return -ENOMEM;
1065
1066        rc = -EAGAIN;
1067
1068        if (!trylock_page(hpage)) {
1069                if (!force)
1070                        goto out;
1071                switch ((int)mode) {
1072                case MIGRATE_SYNC:
1073                case MIGRATE_SYNC_NO_COPY:
1074                        break;
1075                default:
1076                        goto out;
1077                }
1078                lock_page(hpage);
1079        }
1080
1081        if (PageAnon(hpage))
1082                anon_vma = page_get_anon_vma(hpage);
1083
1084        if (page_mapped(hpage)) {
1085                try_to_unmap(hpage,
1086                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1087                page_was_mapped = 1;
1088        }
1089
1090        if (!page_mapped(hpage))
1091                rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
1092
1093        if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
1094                remove_migration_ptes(hpage, hpage);
1095
1096        if (anon_vma)
1097                put_anon_vma(anon_vma);
1098
1099        if (!rc)
1100                hugetlb_cgroup_migrate(hpage, new_hpage);
1101
1102        unlock_page(hpage);
1103out:
1104        if (rc != -EAGAIN)
1105                putback_active_hugepage(hpage);
1106        putback_active_hugepage(new_hpage);
1107        if (result) {
1108                if (rc)
1109                        *result = rc;
1110                else
1111                        *result = page_to_nid(new_hpage);
1112        }
1113        return rc;
1114}
1115
1116/*
1117 * migrate_pages - migrate the pages specified in a list, to the free pages
1118 *                 supplied as the target for the page migration
1119 *
1120 * @from:               The list of pages to be migrated.
1121 * @get_new_page:       The function used to allocate free pages to be used
1122 *                      as the target of the page migration.
1123 * @private:            Private data to be passed on to get_new_page()
1124 * @mode:               The migration mode that specifies the constraints for
1125 *                      page migration, if any.
1126 * @reason:             The reason for page migration.
1127 *
1128 * The function returns after 10 passes, or earlier once no pages are movable
1129 * any more because the list has become empty or no retryable pages remain.
1130 * The caller should call putback_lru_pages() to return the pages to the LRU
1131 * or to the free list, but only if ret != 0.
1132 *
1133 * Returns the number of pages that were not migrated, or an error code.
1134 */
1135int migrate_pages(struct list_head *from, new_page_t get_new_page,
1136                unsigned long private, enum migrate_mode mode, int reason)
1137{
1138        int retry = 1;
1139        int nr_failed = 0;
1140        int nr_succeeded = 0;
1141        int pass = 0;
1142        struct page *page;
1143        struct page *page2;
1144        int swapwrite = current->flags & PF_SWAPWRITE;
1145        int rc;
1146
1147        if (!swapwrite)
1148                current->flags |= PF_SWAPWRITE;
1149
1150        for (pass = 0; pass < 10 && retry; pass++) {
1151                retry = 0;
1152
1153                list_for_each_entry_safe(page, page2, from, lru) {
1154                        cond_resched();
1155
1156                        if (PageHuge(page))
1157                                rc = unmap_and_move_huge_page(get_new_page,
1158                                                private, page, pass > 2, mode);
1159                        else
1160                                rc = unmap_and_move(get_new_page, private,
1161                                                page, pass > 2, mode, reason);
1162
1163                        switch (rc) {
1164                        case -ENOMEM:
1165                                goto out;
1166                        case -EAGAIN:
1167                                retry++;
1168                                break;
1169                        case MIGRATEPAGE_SUCCESS:
1170                                nr_succeeded++;
1171                                break;
1172                        default:
1173                                /* Permanent failure */
1174                                nr_failed++;
1175                                break;
1176                        }
1177                }
1178        }
1179        rc = nr_failed + retry;
1180out:
1181        if (nr_succeeded)
1182                count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1183        if (nr_failed)
1184                count_vm_events(PGMIGRATE_FAIL, nr_failed);
1185        trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1186
1187        if (!swapwrite)
1188                current->flags &= ~PF_SWAPWRITE;
1189
1190        return rc;
1191}
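
/*
 * Illustrative sketch, not part of the original file: a minimal
 * migrate_pages() caller, modelled on do_move_page_to_node_array() below.
 * The example_* names are hypothetical.
 *
 *        static struct page *example_alloc(struct page *page,
 *                                          unsigned long private, int **result)
 *        {
 *                return alloc_pages_exact_node((int)private,
 *                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 *        }
 *
 *        ...
 *        err = migrate_pages(&pagelist, example_alloc, target_nid,
 *                            MIGRATE_SYNC, MR_SYSCALL);
 *        if (err)
 *                putback_movable_pages(&pagelist);
 */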
1192
1193#ifdef CONFIG_NUMA
1194/*
1195 * Move a list of individual pages
1196 */
1197struct page_to_node {
1198        unsigned long addr;
1199        struct page *page;
1200        int node;
1201        int status;
1202};
1203
1204static struct page *new_page_node(struct page *p, unsigned long private,
1205                int **result)
1206{
1207        struct page_to_node *pm = (struct page_to_node *)private;
1208
1209        while (pm->node != MAX_NUMNODES && pm->page != p)
1210                pm++;
1211
1212        if (pm->node == MAX_NUMNODES)
1213                return NULL;
1214
1215        *result = &pm->status;
1216
1217        if (PageHuge(p))
1218                return alloc_huge_page_node(page_hstate(compound_head(p)),
1219                                        pm->node);
1220        else
1221                return alloc_pages_exact_node(pm->node,
1222                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1223}
1224
1225/*
1226 * Move a set of pages as indicated in the pm array. The addr
1227 * field must be set to the virtual address of the page to be moved
1228 * and the node number must contain a valid target node.
1229 * The pm array ends with node = MAX_NUMNODES.
1230 */
1231static int do_move_page_to_node_array(struct mm_struct *mm,
1232                                      struct page_to_node *pm,
1233                                      int migrate_all)
1234{
1235        int err;
1236        struct page_to_node *pp;
1237        LIST_HEAD(pagelist);
1238
1239        down_read(&mm->mmap_sem);
1240
1241        /*
1242         * Build a list of pages to migrate
1243         */
1244        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1245                struct vm_area_struct *vma;
1246                struct page *page;
1247
1248                err = -EFAULT;
1249                vma = find_vma(mm, pp->addr);
1250                if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1251                        goto set_status;
1252
1253                page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1254
1255                err = PTR_ERR(page);
1256                if (IS_ERR(page))
1257                        goto set_status;
1258
1259                err = -ENOENT;
1260                if (!page)
1261                        goto set_status;
1262
1263                /* Use PageReserved to check for zero page */
1264                if (PageReserved(page))
1265                        goto put_and_set;
1266
1267                pp->page = page;
1268                err = page_to_nid(page);
1269
1270                if (err == pp->node)
1271                        /*
1272                         * Node already in the right place
1273                         */
1274                        goto put_and_set;
1275
1276                err = -EACCES;
1277                if (page_mapcount(page) > 1 &&
1278                                !migrate_all)
1279                        goto put_and_set;
1280
1281                if (PageHuge(page)) {
1282                        if (PageHead(page))
1283                                isolate_huge_page(page, &pagelist);
1284                        goto put_and_set;
1285                }
1286
1287                err = isolate_lru_page(page);
1288                if (!err) {
1289                        list_add_tail(&page->lru, &pagelist);
1290                        inc_zone_page_state(page, NR_ISOLATED_ANON +
1291                                            page_is_file_cache(page));
1292                }
1293put_and_set:
1294                /*
1295                 * Either remove the duplicate refcount from
1296                 * isolate_lru_page() or drop the page ref if it was
1297                 * not isolated.
1298                 */
1299                put_page(page);
1300set_status:
1301                pp->status = err;
1302        }
1303
1304        err = 0;
1305        if (!list_empty(&pagelist)) {
1306                err = migrate_pages(&pagelist, new_page_node,
1307                                (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1308                if (err)
1309                        putback_movable_pages(&pagelist);
1310        }
1311
1312        up_read(&mm->mmap_sem);
1313        return err;
1314}
1315
1316/*
1317 * Migrate an array of page addresses onto an array of nodes and fill
1318 * the corresponding array of status values.
1319 */
1320static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1321                         unsigned long nr_pages,
1322                         const void __user * __user *pages,
1323                         const int __user *nodes,
1324                         int __user *status, int flags)
1325{
1326        struct page_to_node *pm;
1327        unsigned long chunk_nr_pages;
1328        unsigned long chunk_start;
1329        int err;
1330
1331        err = -ENOMEM;
1332        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1333        if (!pm)
1334                goto out;
1335
1336        migrate_prep();
1337
1338        /*
1339         * Store a chunk of the page_to_node array in a page,
1340         * but keep the last entry free as an end marker
1341         */
1342        chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
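        /*
         * Added note, not in the original: with a 4 KiB PAGE_SIZE and a
         * 24-byte page_to_node on 64-bit (8 + 8 + 4 + 4), 4096 / 24 = 170
         * entries fit in one page, so each chunk covers up to 169 pages and
         * the last slot holds the MAX_NUMNODES end marker.
         */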
1343
1344        for (chunk_start = 0;
1345             chunk_start < nr_pages;
1346             chunk_start += chunk_nr_pages) {
1347                int j;
1348
1349                if (chunk_start + chunk_nr_pages > nr_pages)
1350                        chunk_nr_pages = nr_pages - chunk_start;
1351
1352                /* fill the chunk pm with addrs and nodes from user-space */
1353                for (j = 0; j < chunk_nr_pages; j++) {
1354                        const void __user *p;
1355                        int node;
1356
1357                        err = -EFAULT;
1358                        if (get_user(p, pages + j + chunk_start))
1359                                goto out_pm;
1360                        pm[j].addr = (unsigned long) p;
1361
1362                        if (get_user(node, nodes + j + chunk_start))
1363                                goto out_pm;
1364
1365                        err = -ENODEV;
1366                        if (node < 0 || node >= MAX_NUMNODES)
1367                                goto out_pm;
1368
1369                        if (!node_state(node, N_MEMORY))
1370                                goto out_pm;
1371
1372                        err = -EACCES;
1373                        if (!node_isset(node, task_nodes))
1374                                goto out_pm;
1375
1376                        pm[j].node = node;
1377                }
1378
1379                /* End marker for this chunk */
1380                pm[chunk_nr_pages].node = MAX_NUMNODES;
1381
1382                /* Migrate this chunk */
1383                err = do_move_page_to_node_array(mm, pm,
1384                                                 flags & MPOL_MF_MOVE_ALL);
1385                if (err < 0)
1386                        goto out_pm;
1387
1388                /* Return status information */
1389                for (j = 0; j < chunk_nr_pages; j++)
1390                        if (put_user(pm[j].status, status + j + chunk_start)) {
1391                                err = -EFAULT;
1392                                goto out_pm;
1393                        }
1394        }
1395        err = 0;
1396
1397out_pm:
1398        free_page((unsigned long)pm);
1399out:
1400        return err;
1401}
1402
1403/*
1404 * Determine the nodes of an array of pages and store them in an array of status values.
1405 */
1406static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1407                                const void __user **pages, int *status)
1408{
1409        unsigned long i;
1410
1411        down_read(&mm->mmap_sem);
1412
1413        for (i = 0; i < nr_pages; i++) {
1414                unsigned long addr = (unsigned long)(*pages);
1415                struct vm_area_struct *vma;
1416                struct page *page;
1417                int err = -EFAULT;
1418
1419                vma = find_vma(mm, addr);
1420                if (!vma || addr < vma->vm_start)
1421                        goto set_status;
1422
1423                page = follow_page(vma, addr, 0);
1424
1425                err = PTR_ERR(page);
1426                if (IS_ERR(page))
1427                        goto set_status;
1428
1429                err = -ENOENT;
1430                /* Use PageReserved to check for zero page */
1431                if (!page || PageReserved(page))
1432                        goto set_status;
1433
1434                err = page_to_nid(page);
1435set_status:
1436                *status = err;
1437
1438                pages++;
1439                status++;
1440        }
1441
1442        up_read(&mm->mmap_sem);
1443}
1444
1445/*
1446 * Determine the nodes of a user array of pages and store them in
1447 * a user array of status values.
1448 */
1449static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1450                         const void __user * __user *pages,
1451                         int __user *status)
1452{
1453#define DO_PAGES_STAT_CHUNK_NR 16
1454        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1455        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1456
1457        while (nr_pages) {
1458                unsigned long chunk_nr;
1459
1460                chunk_nr = nr_pages;
1461                if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1462                        chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1463
1464                if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1465                        break;
1466
1467                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1468
1469                if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1470                        break;
1471
1472                pages += chunk_nr;
1473                status += chunk_nr;
1474                nr_pages -= chunk_nr;
1475        }
1476        return nr_pages ? -EFAULT : 0;
1477}
1478
1479/*
1480 * Move a list of pages in the address space of the currently executing
1481 * process.
1482 */
1483SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1484                const void __user * __user *, pages,
1485                const int __user *, nodes,
1486                int __user *, status, int, flags)
1487{
1488        struct task_struct *task;
1489        struct mm_struct *mm;
1490        int err;
1491        nodemask_t task_nodes;
1492
1493        /* Check flags */
1494        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1495                return -EINVAL;
1496
1497        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1498                return -EPERM;
1499
1500        /* Find the mm_struct */
1501        rcu_read_lock();
1502        task = pid ? find_task_by_vpid(pid) : current;
1503        if (!task) {
1504                rcu_read_unlock();
1505                return -ESRCH;
1506        }
1507        get_task_struct(task);
1508
1509        /*
1510         * Check if this process has the right to modify the specified
1511         * process. Use the regular "ptrace_may_access()" checks.
1512         */
1513        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1514                rcu_read_unlock();
1515                err = -EPERM;
1516                goto out;
1517        }
1518        rcu_read_unlock();
1519
1520        err = security_task_movememory(task);
1521        if (err)
1522                goto out;
1523
1524        task_nodes = cpuset_mems_allowed(task);
1525        mm = get_task_mm(task);
1526        put_task_struct(task);
1527
1528        if (!mm)
1529                return -EINVAL;
1530
1531        if (nodes)
1532                err = do_pages_move(mm, task_nodes, nr_pages, pages,
1533                                    nodes, status, flags);
1534        else
1535                err = do_pages_stat(mm, nr_pages, pages, status);
1536
1537        mmput(mm);
1538        return err;
1539
1540out:
1541        put_task_struct(task);
1542        return err;
1543}
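/*
 * Editorial example (not part of the original source): the syscall above is
 * normally reached through libnuma's move_pages() wrapper.  A minimal sketch,
 * assuming <numaif.h> and a machine that has a node 0; with a non-NULL
 * "nodes" array the kernel takes the do_pages_move() path, and each status[]
 * entry comes back as the page's new node or a negative errno.
 */
#if 0	/* userspace sketch, not compiled as part of the kernel */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        enum { NPAGES = 2 };
        char *buf = aligned_alloc(psz, NPAGES * psz);
        void *pages[NPAGES];
        int nodes[NPAGES], status[NPAGES];
        int i;

        memset(buf, 1, NPAGES * psz);           /* make sure the pages exist */
        for (i = 0; i < NPAGES; i++) {
                pages[i] = buf + i * psz;
                nodes[i] = 0;                   /* requested destination node */
        }

        if (move_pages(0 /* calling process */, NPAGES, pages, nodes,
                       status, MPOL_MF_MOVE))
                perror("move_pages");
        for (i = 0; i < NPAGES; i++)
                printf("page %d: status %d\n", i, status[i]);
        free(buf);
        return 0;
}
#endif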
1544
1545/*
1546 * Call migration functions in the vma_ops that may prepare
1547 * memory in a vm for migration. Migration functions may perform
1548 * the migration for vmas that do not have an underlying page struct.
1549 */
1550int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1551        const nodemask_t *from, unsigned long flags)
1552{
1553        struct vm_area_struct *vma;
1554        int err = 0;
1555
1556        for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1557                if (vma->vm_ops && vma->vm_ops->migrate) {
1558                        err = vma->vm_ops->migrate(vma, to, from, flags);
1559                        if (err)
1560                                break;
1561                }
1562        }
1563        return err;
1564}
1565
1566#ifdef CONFIG_NUMA_BALANCING
1567/*
1568 * Returns true if this is a safe migration target node for misplaced NUMA
1569 * pages. Currently it only checks the watermarks, which is a crude test.
1570 */
1571static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1572                                   unsigned long nr_migrate_pages)
1573{
1574        int z;
1575        for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1576                struct zone *zone = pgdat->node_zones + z;
1577
1578                if (!populated_zone(zone))
1579                        continue;
1580
1581                if (zone->all_unreclaimable)
1582                        continue;
1583
1584                /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1585                if (!zone_watermark_ok(zone, 0,
1586                                       high_wmark_pages(zone) +
1587                                       nr_migrate_pages,
1588                                       0, 0))
1589                        continue;
1590                return true;
1591        }
1592        return false;
1593}
1594
1595static struct page *alloc_misplaced_dst_page(struct page *page,
1596                                           unsigned long data,
1597                                           int **result)
1598{
1599        int nid = (int) data;
1600        struct page *newpage;
1601
1602        newpage = alloc_pages_exact_node(nid,
1603                                         (GFP_HIGHUSER_MOVABLE |
1604                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
1605                                          __GFP_NORETRY | __GFP_NOWARN) &
1606                                         ~GFP_IOFS, 0);
1607        if (newpage)
1608                page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1609
1610        return newpage;
1611}
1612
1613/*
1614 * Page migration rate limiting control.
1615 * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
1616 * window of time. The defaults below say do not migrate more than 1280M per second.
1617 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However,
1618 * as it is faults that reset the window, pte updates will happen unconditionally
1619 * if there has not been a fault since @pteupdate_interval_millisecs after the
1620 * throttle window closed.
1621 */
1622static unsigned int migrate_interval_millisecs __read_mostly = 100;
1623static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1624static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
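/*
 * Editorial note (not part of the original source): a quick worked check of
 * the "1280M per second" figure quoted above, assuming 4K pages.
 * ratelimit_pages is 128 << (20 - PAGE_SHIFT) pages, i.e. 128MB worth of
 * pages per 100ms window.
 */
#if 0	/* illustrative only */
#include <stdio.h>

int main(void)
{
        const unsigned int page_shift = 12;             /* assumed 4K pages */
        const unsigned int window_ms = 100;             /* migrate_interval_millisecs */
        const unsigned long limit_pages = 128UL << (20 - page_shift);
        unsigned long mb_per_window = (limit_pages << page_shift) >> 20;

        /* 128MB per 100ms window scales to 1280MB per second */
        printf("%lu MB/s\n", mb_per_window * 1000 / window_ms);
        return 0;
}
#endif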
1625
1626/* Returns true if NUMA migration is currently rate limited */
1627bool migrate_ratelimited(int node)
1628{
1629        pg_data_t *pgdat = NODE_DATA(node);
1630
1631        if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1632                                msecs_to_jiffies(pteupdate_interval_millisecs)))
1633                return false;
1634
1635        if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1636                return false;
1637
1638        return true;
1639}
1640
1641/* Returns true if the node is migrate rate-limited after the update */
1642static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1643                                        unsigned long nr_pages)
1644{
1645        /*
1646         * Rate-limit the amount of data that is being migrated to a node.
1647         * Optimal placement is no good if the memory bus is saturated and
1648         * all the time is being spent migrating!
1649         */
1650        if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1651                spin_lock(&pgdat->numabalancing_migrate_lock);
1652                pgdat->numabalancing_migrate_nr_pages = 0;
1653                pgdat->numabalancing_migrate_next_window = jiffies +
1654                        msecs_to_jiffies(migrate_interval_millisecs);
1655                spin_unlock(&pgdat->numabalancing_migrate_lock);
1656        }
1657        if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1658                trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1659                                                                nr_pages);
1660                return true;
1661        }
1662
1663        /*
1664         * This is an unlocked non-atomic update so errors are possible.
1665         * The consequences are failing to migrate when we potentially should
1666         * have, which is not severe enough to warrant locking. If it is ever
1667         * a problem, it can be converted to a per-cpu counter.
1668         */
1669        pgdat->numabalancing_migrate_nr_pages += nr_pages;
1670        return false;
1671}
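/*
 * Editorial example (not part of the original source): the function above is
 * a fixed-window rate limiter keyed off jiffies.  A minimal userspace sketch
 * of the same pattern, with CLOCK_MONOTONIC standing in for jiffies, the
 * spinlock elided, and hypothetical limit/window parameters supplied by the
 * caller.
 */
#if 0	/* illustrative only */
#include <stdbool.h>
#include <time.h>

struct ratelimit {
        struct timespec window_end;     /* when the current window closes */
        unsigned long window_ms;        /* window length in milliseconds */
        unsigned long count;            /* units charged in this window */
        unsigned long limit;            /* maximum units per window */
};

/* Returns true if the caller is rate-limited after charging @units */
static bool ratelimit_update(struct ratelimit *rl, unsigned long units)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (now.tv_sec > rl->window_end.tv_sec ||
            (now.tv_sec == rl->window_end.tv_sec &&
             now.tv_nsec >= rl->window_end.tv_nsec)) {
                /* Window expired: reset the counter and open a new window */
                rl->count = 0;
                rl->window_end = now;
                rl->window_end.tv_sec += rl->window_ms / 1000;
                rl->window_end.tv_nsec += (rl->window_ms % 1000) * 1000000L;
                if (rl->window_end.tv_nsec >= 1000000000L) {
                        rl->window_end.tv_nsec -= 1000000000L;
                        rl->window_end.tv_sec++;
                }
        }
        if (rl->count > rl->limit)
                return true;            /* over budget: throttle the caller */

        rl->count += units;             /* unlocked add, mirroring the kernel code */
        return false;
}
#endif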
1672
1673static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1674{
1675        int page_lru;
1676
1677        VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1678
1679        /* Avoid migrating to a node that is nearly full */
1680        if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1681                return 0;
1682
1683        if (isolate_lru_page(page))
1684                return 0;
1685
1686        /*
1687         * migrate_misplaced_transhuge_page() skips page migration's usual
1688         * check on page_count(), so we must do it here, now that the page
1689         * has been isolated: a GUP pin, or any other pin, prevents migration.
1690         * The expected page count is 3: 1 for the page's mapcount, 1 for the
1691         * caller's pin, and 1 for the reference taken by isolate_lru_page().
1692         */
1693        if (PageTransHuge(page) && page_count(page) != 3) {
1694                putback_lru_page(page);
1695                return 0;
1696        }
1697
1698        page_lru = page_is_file_cache(page);
1699        mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1700                                hpage_nr_pages(page));
1701
1702        /*
1703         * Isolating the page has taken another reference, so the
1704         * caller's reference can be safely dropped without the page
1705         * disappearing underneath us during migration.
1706         */
1707        put_page(page);
1708        return 1;
1709}
1710
1711bool pmd_trans_migrating(pmd_t pmd)
1712{
1713        struct page *page = pmd_page(pmd);
1714        return PageLocked(page);
1715}
1716
1717/*
1718 * Attempt to migrate a misplaced page to the specified destination
1719 * node. Caller is expected to have an elevated reference count on
1720 * the page that will be dropped by this function before returning.
1721 */
1722int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1723                           int node)
1724{
1725        pg_data_t *pgdat = NODE_DATA(node);
1726        int isolated;
1727        int nr_remaining;
1728        LIST_HEAD(migratepages);
1729
1730        /*
1731         * Don't migrate file pages that are mapped in multiple processes
1732         * with execute permissions as they are probably shared libraries.
1733         */
1734        if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1735            (vma->vm_flags & VM_EXEC))
1736                goto out;
1737
1738        /*
1739         * Rate-limit the amount of data that is being migrated to a node.
1740         * Optimal placement is no good if the memory bus is saturated and
1741         * all the time is being spent migrating!
1742         */
1743        if (numamigrate_update_ratelimit(pgdat, 1))
1744                goto out;
1745
1746        isolated = numamigrate_isolate_page(pgdat, page);
1747        if (!isolated)
1748                goto out;
1749
1750        list_add(&page->lru, &migratepages);
1751        nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1752                                     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1753        if (nr_remaining) {
1754                putback_lru_pages(&migratepages);
1755                isolated = 0;
1756        } else
1757                count_vm_numa_event(NUMA_PAGE_MIGRATE);
1758        BUG_ON(!list_empty(&migratepages));
1759        return isolated;
1760
1761out:
1762        put_page(page);
1763        return 0;
1764}
1765#endif /* CONFIG_NUMA_BALANCING */
1766
1767#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1768/*
1769 * Migrates a THP to a given target node. page must be locked and is unlocked
1770 * before returning.
1771 */
1772int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1773                                struct vm_area_struct *vma,
1774                                pmd_t *pmd, pmd_t entry,
1775                                unsigned long address,
1776                                struct page *page, int node)
1777{
1778        spinlock_t *ptl;
1779        pg_data_t *pgdat = NODE_DATA(node);
1780        int isolated = 0;
1781        struct page *new_page = NULL;
1782        struct mem_cgroup *memcg = NULL;
1783        int page_lru = page_is_file_cache(page);
1784        unsigned long mmun_start = address & HPAGE_PMD_MASK;
1785        unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
1786        pmd_t orig_entry;
1787
1788        /*
1789         * Rate-limit the amount of data that is being migrated to a node.
1790         * Optimal placement is no good if the memory bus is saturated and
1791         * all the time is being spent migrating!
1792         */
1793        if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1794                goto out_dropref;
1795
1796        new_page = alloc_pages_node(node,
1797                (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
1798                HPAGE_PMD_ORDER);
1799        if (!new_page)
1800                goto out_fail;
1801
1802        isolated = numamigrate_isolate_page(pgdat, page);
1803        if (!isolated) {
1804                put_page(new_page);
1805                goto out_fail;
1806        }
1807
1808        if (tlb_flush_pending(mm))
1809                flush_tlb_range(vma, mmun_start, mmun_end);
1810
1811        /* Prepare a page as a migration target */
1812        __set_page_locked(new_page);
1813        SetPageSwapBacked(new_page);
1814
1815        /* anon mapping, we can simply copy page->mapping to the new page: */
1816        new_page->mapping = page->mapping;
1817        new_page->index = page->index;
1818        migrate_page_copy(new_page, page);
1819        WARN_ON(PageLRU(new_page));
1820
1821        /* Recheck the target PMD */
1822        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1823        ptl = pmd_lock(mm, pmd);
1824        if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
1825fail_putback:
1826                spin_unlock(ptl);
1827                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1828
1829                /* Reverse changes made by migrate_page_copy() */
1830                if (TestClearPageActive(new_page))
1831                        SetPageActive(page);
1832                if (TestClearPageUnevictable(new_page))
1833                        SetPageUnevictable(page);
1834                mlock_migrate_page(page, new_page);
1835
1836                unlock_page(new_page);
1837                put_page(new_page);             /* Free it */
1838
1839                /* Retake the caller's reference and put the page back on the LRU */
1840                get_page(page);
1841                putback_lru_page(page);
1842                mod_zone_page_state(page_zone(page),
1843                         NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1844
1845                goto out_unlock;
1846        }
1847
1848        /*
1849         * Traditional migration needs to prepare the memcg charge
1850         * transaction early to prevent the old page from being
1851         * uncharged when installing migration entries.  Here we can
1852         * save the potential rollback and start the charge transfer
1853         * only when migration is already known to end successfully.
1854         */
1855        mem_cgroup_prepare_migration(page, new_page, &memcg);
1856
1857        init_trans_huge_mmu_gather_count(new_page);
1858
1859        orig_entry = *pmd;
1860        entry = mk_pmd(new_page, vma->vm_page_prot);
1861        entry = pmd_mkhuge(entry);
1862        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1863
1864        /*
1865         * Clear the old entry under pagetable lock and establish the new PTE.
1866         * Any parallel GUP will either observe the old page blocking on the
1867         * page lock, block on the page table lock or observe the new page.
1868         * The SetPageUptodate on the new page and page_add_new_anon_rmap
1869         * guarantee the copy is visible before the pagetable update.
1870         */
1871        flush_cache_range(vma, mmun_start, mmun_end);
1872        page_add_new_anon_rmap(new_page, vma, mmun_start);
1873        pmdp_clear_flush_notify(vma, mmun_start, pmd);
1874        set_pmd_at(mm, mmun_start, pmd, entry);
1875        flush_tlb_range(vma, mmun_start, mmun_end);
1876        update_mmu_cache_pmd(vma, address, &entry);
1877
1878        if (page_count(page) != 2) {
1879                set_pmd_at(mm, mmun_start, pmd, orig_entry);
1880                flush_tlb_range(vma, mmun_start, mmun_end);
1881                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
1882                update_mmu_cache_pmd(vma, address, &entry);
1883                page_remove_rmap(new_page);
1884                goto fail_putback;
1885        }
1886
1887        page_remove_rmap(page);
1888
1889        /*
1890         * Finish the charge transaction under the page table lock to
1891         * prevent split_huge_page() from dividing up the charge
1892         * before it's fully transferred to the new page.
1893         */
1894        mem_cgroup_end_migration(memcg, page, new_page, true);
1895        spin_unlock(ptl);
1896        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1897
1898        unlock_page(new_page);
1899        unlock_page(page);
1900        put_page(page);                 /* Drop the rmap reference */
1901        put_page(page);                 /* Drop the LRU isolation reference */
1902
1903        count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1904        count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1905
1906        mod_zone_page_state(page_zone(page),
1907                        NR_ISOLATED_ANON + page_lru,
1908                        -HPAGE_PMD_NR);
1909        return isolated;
1910
1911out_fail:
1912        count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1913out_dropref:
1914        ptl = pmd_lock(mm, pmd);
1915        if (pmd_same(*pmd, entry)) {
1916                entry = pmd_mknonnuma(entry);
1917                set_pmd_at(mm, mmun_start, pmd, entry);
1918                update_mmu_cache_pmd(vma, address, &entry);
1919        }
1920        spin_unlock(ptl);
1921
1922out_unlock:
1923        unlock_page(page);
1924        put_page(page);
1925        return 0;
1926}
1927#endif /* CONFIG_NUMA_BALANCING */
1928
1929#endif /* CONFIG_NUMA */
1930
1931
1932struct migrate_vma {
1933        struct vm_area_struct   *vma;
1934        unsigned long           *dst;
1935        unsigned long           *src;
1936        unsigned long           cpages;
1937        unsigned long           npages;
1938        unsigned long           start;
1939        unsigned long           end;
1940};
1941
1942static int migrate_vma_collect_hole(unsigned long start,
1943                                    unsigned long end,
1944                                    struct mm_walk *walk)
1945{
1946        struct migrate_vma *migrate = walk->private;
1947        unsigned long addr;
1948
1949        for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
1950                migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
1951                migrate->dst[migrate->npages] = 0;
1952                migrate->npages++;
1953                migrate->cpages++;
1954        }
1955
1956        return 0;
1957}
1958
1959static int migrate_vma_collect_pmd(pmd_t *pmdp,
1960                                   unsigned long start,
1961                                   unsigned long end,
1962                                   struct mm_walk *walk)
1963{
1964        struct migrate_vma *migrate = walk->private;
1965        unsigned long addr = start, unmapped = 0;
1966        struct mm_struct *mm = walk->mm;
1967        spinlock_t *ptl;
1968        pte_t *ptep;
1969
1970        if (pmd_trans_huge(*pmdp))
1971                split_huge_page_pmd(migrate->vma, start, pmdp);
1972        if (pmd_none_or_trans_huge_or_clear_bad(pmdp))
1973                return migrate_vma_collect_hole(start, end, walk);
1974
1975        ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
1976        arch_enter_lazy_mmu_mode();
1977
1978        for (; addr < end; addr += PAGE_SIZE, ptep++) {
1979                unsigned long mpfn, pfn;
1980                struct page *page;
1981                swp_entry_t entry;
1982                pte_t pte;
1983
1984                pte = *ptep;
1985                pfn = pte_pfn(pte);
1986
1987                if (pte_none(pte)) {
1988                        mpfn = MIGRATE_PFN_MIGRATE;
1989                        migrate->cpages++;
1990                        pfn = 0;
1991                        goto next;
1992                }
1993
1994                if (!pte_present(pte)) {
1995                        mpfn = pfn = 0;
1996
1997                        if (pte_file(pte))
1998                                goto next;
1999
2000                        /*
2001                         * Only care about unaddressable device page special
2002                         * page table entries. Other special swap entries are not
2003                         * migratable, and we ignore regular swapped pages.
2004                         */
2005                        entry = pte_to_swp_entry(pte);
2006                        if (!is_hmm_entry(entry))
2007                                goto next;
2008
2009                        page = hmm_entry_to_page(entry);
2010                        mpfn = migrate_pfn(page_to_pfn(page)) |
2011                                MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2012                        if (is_write_hmm_entry(entry))
2013                                mpfn |= MIGRATE_PFN_WRITE;
2014                } else {
2015                        page = vm_normal_page(migrate->vma, addr, pte);
2016                        mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2017                        mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2018                }
2019
2020                /* FIXME support THP */
2021                if (!page || !page->mapping || PageTransCompound(page)) {
2022                        mpfn = pfn = 0;
2023                        goto next;
2024                }
2025                pfn = page_to_pfn(page);
2026
2027                /*
2028                 * By getting a reference on the page we pin it, and that blocks
2029                 * any kind of migration. A side effect is that it "freezes" the
2030                 * pte.
2031                 *
2032                 * We drop this reference after isolating the page from the lru
2033                 * for non-device pages (device pages are not on the lru and thus
2034                 * cannot be dropped from it).
2035                 */
2036                get_page(page);
2037                migrate->cpages++;
2038
2039                /*
2040                 * Optimize for the common case where page is only mapped once
2041                 * in one process. If we can lock the page, then we can safely
2042                 * set up a special migration page table entry now.
2043                 */
2044                if (trylock_page(page)) {
2045                        pte_t swp_pte;
2046
2047                        mpfn |= MIGRATE_PFN_LOCKED;
2048                        ptep_get_and_clear(mm, addr, ptep);
2049
2050                        /* Setup special migration page table entry */
2051                        entry = make_migration_entry(page, pte_write(pte));
2052                        swp_pte = swp_entry_to_pte(entry);
2053                        if (pte_soft_dirty(pte))
2054                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
2055                        set_pte_at(mm, addr, ptep, swp_pte);
2056
2057                        /*
2058                         * This is like regular unmap: we remove the rmap and
2059                         * drop page refcount. Page won't be freed, as we took
2060                         * a reference just above.
2061                         */
2062                        page_remove_rmap(page);
2063                        put_page(page);
2064                        if (pte_present(pte))
2065                                unmapped++;
2066                }
2067
2068next:
2069                migrate->dst[migrate->npages] = 0;
2070                migrate->src[migrate->npages++] = mpfn;
2071        }
2072        arch_leave_lazy_mmu_mode();
2073        pte_unmap_unlock(ptep - 1, ptl);
2074
2075        /* Only flush the TLB if we actually modified any entries */
2076        if (unmapped)
2077                flush_tlb_range(migrate->vma, start, end);
2078
2079        return 0;
2080}
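/*
 * Editorial example (not part of the original source): the src[]/dst[]
 * entries filled in above pack a pfn plus a few MIGRATE_PFN_* flag bits into
 * one unsigned long per page.  A standalone model of that encoding with
 * made-up XPFN_* flag values and shift (the real definitions live in
 * include/linux/migrate.h).
 */
#if 0	/* illustrative only */
#include <stdio.h>

#define XPFN_VALID      (1UL << 0)      /* hypothetical flag bits */
#define XPFN_MIGRATE    (1UL << 1)
#define XPFN_LOCKED     (1UL << 2)
#define XPFN_WRITE      (1UL << 3)
#define XPFN_SHIFT      6               /* pfn is stored above the flag bits */

static unsigned long xpfn_encode(unsigned long pfn, unsigned long flags)
{
        return (pfn << XPFN_SHIFT) | XPFN_VALID | flags;
}

static unsigned long xpfn_decode(unsigned long entry)
{
        return entry >> XPFN_SHIFT;
}

int main(void)
{
        unsigned long e = xpfn_encode(0x12345, XPFN_MIGRATE | XPFN_WRITE);

        printf("pfn=%#lx migrate=%d write=%d\n",
               xpfn_decode(e), !!(e & XPFN_MIGRATE), !!(e & XPFN_WRITE));
        return 0;
}
#endif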
2081
2082/*
2083 * migrate_vma_collect() - collect pages over a range of virtual addresses
2084 * @migrate: migrate struct containing all migration information
2085 *
2086 * This will walk the CPU page table. For each virtual address backed by a
2087 * valid page, it updates the src array and takes a reference on the page, in
2088 * order to pin the page until we lock it and unmap it.
2089 */
2090static void migrate_vma_collect(struct migrate_vma *migrate)
2091{
2092        struct mm_walk mm_walk;
2093
2094        mm_walk.pgd_entry = NULL;
2095        mm_walk.pud_entry = NULL;
2096        mm_walk.pte_entry = NULL;
2097        mm_walk.private = migrate;
2098        mm_walk.hugetlb_entry = NULL;
2099        mm_walk.mm = migrate->vma->vm_mm;
2100        mm_walk.pte_hole = migrate_vma_collect_hole;
2101        mm_walk.pmd_entry = migrate_vma_collect_pmd;
2102
2103        mmu_notifier_invalidate_range_start(mm_walk.mm,
2104                                            migrate->start,
2105                                            migrate->end);
2106        walk_page_range(migrate->start, migrate->end, &mm_walk);
2107        mmu_notifier_invalidate_range_end(mm_walk.mm,
2108                                          migrate->start,
2109                                          migrate->end);
2110
2111        migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2112}
2113
2114/*
2115 * migrate_vma_check_page() - check if page is pinned or not
2116 * @page: struct page to check
2117 *
2118 * Pinned pages cannot be migrated. This is the same test as in
2119 * migrate_page_move_mapping(), except that here we allow migration of a
2120 * ZONE_DEVICE page.
2121 */
2122static bool migrate_vma_check_page(struct page *page)
2123{
2124        /*
2125         * One extra ref because caller holds an extra reference, either from
2126         * isolate_lru_page() for a regular page, or migrate_vma_collect() for
2127         * a device page.
2128         */
2129        int extra = 1;
2130
2131        /*
2132         * FIXME support THP (transparent huge page); it is a bit more complex to
2133         * check them than regular pages, because they can be mapped with a pmd
2134         * or with a pte (split pte mapping).
2135         */
2136        if (PageCompound(page))
2137                return false;
2138
2139        /*
2140         * Private pages can never be pinned, as they have no valid pte and
2141         * GUP will fail for them. Yet if there is a pending migration,
2142         * a thread might try to wait on the pte migration entry and
2143         * will bump the page reference count. Sadly there is no way to
2144         * differentiate a regular pin from a migration wait, so to avoid
2145         * two racing threads (one stopping migration because the other is
2146         * waiting on the pte migration entry) entering an infinite loop
2147         * while trying to migrate back to the CPU, we always return true here.
2148         *
2149         * FIXME the proper solution is to rework migration_entry_wait() so
2150         * it does not need to take a reference on the page.
2151         */
2152        if (is_hmm_page(page))
2153                return true;
2154
2155        if (is_zone_device_page(page))
2156                /* Other ZONE_DEVICE memory types are not supported */
2157                return false;
2158
2159        if ((page_count(page) - extra) > page_mapcount(page))
2160                return false;
2161
2162        return true;
2163}
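/*
 * Editorial example (not part of the original source): the pin test above
 * boils down to "are there more references than the mappings plus our own
 * known extra references can explain?".  A standalone sketch of the same
 * predicate with plain integers standing in for the page fields.
 */
#if 0	/* illustrative only */
#include <stdbool.h>
#include <stdio.h>

static bool looks_pinned(int refcount, int mapcount, int extra_refs)
{
        /* References not accounted for by mappings or by our extra refs */
        return (refcount - extra_refs) > mapcount;
}

int main(void)
{
        /* one mapping + one migration reference: not pinned */
        printf("%d\n", looks_pinned(2, 1, 1));  /* prints 0 */
        /* one mapping + one migration reference + one GUP pin: pinned */
        printf("%d\n", looks_pinned(3, 1, 1));  /* prints 1 */
        return 0;
}
#endif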
2164
2165/*
2166 * migrate_vma_prepare() - lock pages and isolate them from the lru
2167 * @migrate: migrate struct containing all migration information
2168 *
2169 * This locks pages that have been collected by migrate_vma_collect(). Once each
2170 * page is locked it is isolated from the lru (for non-device pages). Finally,
2171 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
2172 * migrated by concurrent kernel threads.
2173 */
2174static void migrate_vma_prepare(struct migrate_vma *migrate)
2175{
2176        const unsigned long npages = migrate->npages;
2177        const unsigned long start = migrate->start;
2178        unsigned long addr, i, restore = 0;
2179        bool allow_drain = true;
2180
2181        lru_add_drain();
2182
2183        for (i = 0; (i < npages) && migrate->cpages; i++) {
2184                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2185                bool remap = true;
2186
2187                if (!page)
2188                        continue;
2189
2190                if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
2191                        /*
2192                         * Because we are migrating several pages there can be
2193                         * a deadlock between two concurrent migrations, each
2194                         * waiting on the other's page lock.
2195                         *
2196                         * Make migrate_vma() a best-effort thing and back off
2197                         * for any page we cannot lock right away.
2198                         */
2199                        if (!trylock_page(page)) {
2200                                migrate->src[i] = 0;
2201                                migrate->cpages--;
2202                                put_page(page);
2203                                continue;
2204                        }
2205                        remap = false;
2206                        migrate->src[i] |= MIGRATE_PFN_LOCKED;
2207                }
2208
2209                /* ZONE_DEVICE pages are not on LRU */
2210                if (!is_zone_device_page(page)) {
2211                        if (!PageLRU(page) && allow_drain) {
2212                                /* Drain CPU's pagevec */
2213                                lru_add_drain_all();
2214                                allow_drain = false;
2215                        }
2216
2217                        if (isolate_lru_page(page)) {
2218                                if (remap) {
2219                                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2220                                        migrate->cpages--;
2221                                        restore++;
2222                                } else {
2223                                        migrate->src[i] = 0;
2224                                        unlock_page(page);
2225                                        migrate->cpages--;
2226                                        put_page(page);
2227                                }
2228                                continue;
2229                        }
2230
2231                        /* Drop the reference we took in collect */
2232                        put_page(page);
2233                }
2234
2235                if (!migrate_vma_check_page(page)) {
2236                        if (remap) {
2237                                migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2238                                migrate->cpages--;
2239                                restore++;
2240
2241                                if (!is_zone_device_page(page)) {
2242                                        get_page(page);
2243                                        putback_lru_page(page);
2244                                }
2245                        } else {
2246                                migrate->src[i] = 0;
2247                                unlock_page(page);
2248                                migrate->cpages--;
2249
2250                                if (!is_zone_device_page(page))
2251                                        putback_lru_page(page);
2252                                else
2253                                        put_page(page);
2254                        }
2255                }
2256        }
2257
2258        for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
2259                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2260
2261                if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2262                        continue;
2263
2264                remove_migration_pte(page, migrate->vma, addr, page);
2265
2266                migrate->src[i] = 0;
2267                unlock_page(page);
2268                put_page(page);
2269                restore--;
2270        }
2271}
2272
2273/*
2274 * migrate_vma_unmap() - replace page mapping with special migration pte entry
2275 * @migrate: migrate struct containing all migration information
2276 *
2277 * Replace page mapping (CPU page table pte) with a special migration pte entry
2278 * and check again if it has been pinned. Pinned pages are restored because we
2279 * cannot migrate them.
2280 *
2281 * This is the last step before we call the device driver callback to allocate
2282 * destination memory and copy contents of original page over to new page.
2283 */
2284static void migrate_vma_unmap(struct migrate_vma *migrate)
2285{
2286        int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2287        const unsigned long npages = migrate->npages;
2288        const unsigned long start = migrate->start;
2289        unsigned long addr, i, restore = 0;
2290
2291        for (i = 0; i < npages; i++) {
2292                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2293
2294                if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2295                        continue;
2296
2297                if (page_mapped(page)) {
2298                        try_to_unmap(page, flags);
2299                        if (page_mapped(page))
2300                                goto restore;
2301                }
2302
2303                if (migrate_vma_check_page(page))
2304                        continue;
2305
2306restore:
2307                migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2308                migrate->cpages--;
2309                restore++;
2310        }
2311
2312        for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2313                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2314
2315                if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2316                        continue;
2317
2318                remove_migration_ptes(page, page);
2319
2320                migrate->src[i] = 0;
2321                unlock_page(page);
2322                restore--;
2323
2324                if (is_zone_device_page(page))
2325                        put_page(page);
2326                else
2327                        putback_lru_page(page);
2328        }
2329}
2330
2331static void migrate_vma_insert_page(struct migrate_vma *migrate,
2332                                    unsigned long addr,
2333                                    struct page *page,
2334                                    unsigned long *src,
2335                                    unsigned long *dst)
2336{
2337        struct vm_area_struct *vma = migrate->vma;
2338        struct mm_struct *mm = vma->vm_mm;
2339        bool flush = false;
2340        spinlock_t *ptl;
2341        pgd_t *pgdp;
2342        pud_t *pudp;
2343        pmd_t *pmdp;
2344        pte_t *ptep;
2345        pte_t entry;
2346
2347        /* Only allow populating anonymous memory */
2348        if (!vma_is_anonymous(vma))
2349                goto abort;
2350
2351        pgdp = pgd_offset(mm, addr);
2352        pudp = pud_alloc(mm, pgdp, addr);
2353        if (!pudp)
2354                goto abort;
2355        pmdp = pmd_alloc(mm, pudp, addr);
2356        if (!pmdp)
2357                goto abort;
2358
2359        if (pmd_trans_unstable(pmdp))
2360                goto abort;
2361
2362        /*
2363         * Use __pte_alloc() instead of pte_alloc_map().  We can't run
2364         * pte_offset_map() on pmds where a huge pmd might be created
2365         * from a different thread.
2366         *
2367         * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2368         * parallel threads are excluded by other means.
2369         *
2370         * Here we only have down_read(mmap_sem).
2371         */
2372        if (__pte_alloc(mm, vma, pmdp, addr))
2373                goto abort;
2374
2375        /* See the comment in pte_alloc_one_map() */
2376        if (unlikely(pmd_trans_unstable(pmdp)))
2377                goto abort;
2378
2379        if (unlikely(anon_vma_prepare(vma)))
2380                goto abort;
2381        if (mem_cgroup_newpage_charge(page, vma->vm_mm, GFP_KERNEL))
2382                goto abort;
2383
2384        /*
2385         * The memory barrier inside __SetPageUptodate makes sure that
2386         * preceding stores to the page contents become visible before
2387         * the set_pte_at() write.
2388         */
2389        __SetPageUptodate(page);
2390
2391        if (is_zone_device_page(page) && is_hmm_page(page)) {
2392                swp_entry_t swp_entry;
2393
2394                swp_entry = make_hmm_entry(page, vma->vm_flags & VM_WRITE);
2395                entry = swp_entry_to_pte(swp_entry);
2396        } else {
2397                entry = mk_pte(page, vma->vm_page_prot);
2398                if (vma->vm_flags & VM_WRITE)
2399                        entry = pte_mkwrite(pte_mkdirty(entry));
2400        }
2401
2402        ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2403        if (pte_present(*ptep)) {
2404                unsigned long pfn = pte_pfn(*ptep);
2405
2406                if (!is_zero_pfn(pfn)) {
2407                        pte_unmap_unlock(ptep, ptl);
2408                        mem_cgroup_uncharge_page(page);
2409                        goto abort;
2410                }
2411                flush = true;
2412        } else if (!pte_none(*ptep)) {
2413                pte_unmap_unlock(ptep, ptl);
2414                mem_cgroup_uncharge_page(page);
2415                goto abort;
2416        }
2417
2418        /*
2419         * Check for userfaultfd but do not deliver the fault. Instead,
2420         * just back off.
2421         */
2422        if (userfaultfd_missing(vma)) {
2423                pte_unmap_unlock(ptep, ptl);
2424                mem_cgroup_uncharge_page(page);
2425                goto abort;
2426        }
2427
2428        page_add_new_anon_rmap(page, vma, addr);
2429        inc_mm_counter(mm, MM_ANONPAGES);
2430        get_page(page);
2431
2432        if (flush) {
2433                flush_cache_page(vma, addr, pte_pfn(*ptep));
2434                ptep_clear_flush_notify(vma, addr, ptep);
2435                set_pte_at_notify(mm, addr, ptep, entry);
2436                update_mmu_cache(vma, addr, ptep);
2437        } else {
2438                /* No need to invalidate - it was non-present before */
2439                set_pte_at(mm, addr, ptep, entry);
2440                update_mmu_cache(vma, addr, ptep);
2441        }
2442        pte_unmap_unlock(ptep, ptl);
2443        *src = MIGRATE_PFN_MIGRATE;
2444        return;
2445
2446abort:
2447        *src &= ~MIGRATE_PFN_MIGRATE;
2448}
2449
2450/*
2451 * migrate_vma_pages() - migrate meta-data from src page to dst page
2452 * @migrate: migrate struct containing all migration information
2453 *
2454 * This migrates struct page meta-data from source struct page to destination
2455 * struct page. This effectively finishes the migration from source page to the
2456 * destination page.
2457 */
2458static void migrate_vma_pages(struct migrate_vma *migrate)
2459{
2460        const unsigned long npages = migrate->npages;
2461        const unsigned long start = migrate->start;
2462        struct vm_area_struct *vma = migrate->vma;
2463        struct mm_struct *mm = vma->vm_mm;
2464        unsigned long addr, i, mmu_start;
2465        bool notified = false;
2466
2467        for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2468                struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2469                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2470                struct address_space *mapping;
2471                struct mem_cgroup *memcg;
2472                int r;
2473
2474                if (!newpage) {
2475                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2476                        continue;
2477                }
2478
2479                if (!page) {
2480                        if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2481                                continue;
2482                        }
2483                        if (!notified) {
2484                                mmu_start = addr;
2485                                notified = true;
2486                                mmu_notifier_invalidate_range_start(mm,
2487                                                                mmu_start,
2488                                                                migrate->end);
2489                        }
2490                        migrate_vma_insert_page(migrate, addr, newpage,
2491                                                &migrate->src[i],
2492                                                &migrate->dst[i]);
2493                        continue;
2494                }
2495
2496                mapping = page_mapping(page);
2497
2498                if (is_zone_device_page(newpage)) {
2499                        if (is_hmm_page(newpage)) {
2500                                /*
2501                                 * For now we only support private anonymous memory
2502                                 * when migrating to un-addressable device memory.
2503                                 */
2504                                if (mapping) {
2505                                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2506                                        continue;
2507                                }
2508                        } else {
2509                                /*
2510                                 * Other types of ZONE_DEVICE page are not
2511                                 * supported.
2512                                 */
2513                                migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2514                                continue;
2515                        }
2516                }
2517
2518                newpage->index = page->index;
2519                newpage->mapping = page->mapping;
2520                if (PageSwapBacked(page))
2521                        SetPageSwapBacked(newpage);
2522
2523                mem_cgroup_prepare_migration(page, newpage, &memcg);
2524                r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
2525                mem_cgroup_end_migration(memcg, page, newpage,
2526                                         r == MIGRATEPAGE_SUCCESS);
2527                if (r != MIGRATEPAGE_SUCCESS)
2528                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2529        }
2530
2531        if (notified)
2532                mmu_notifier_invalidate_range_end(mm, mmu_start,
2533                                                  migrate->end);
2534}
2535
2536/*
2537 * migrate_vma_finalize() - restore CPU page table entry
2538 * @migrate: migrate struct containing all migration information
2539 *
2540 * This replaces the special migration pte entry with either a mapping to the
2541 * new page if migration was successful for that page, or to the original page
2542 * otherwise.
2543 *
2544 * This also unlocks the pages and puts them back on the lru (or, for device
2545 * pages, drops the extra refcount).
2546 */
2547static void migrate_vma_finalize(struct migrate_vma *migrate)
2548{
2549        const unsigned long npages = migrate->npages;
2550        unsigned long i;
2551
2552        for (i = 0; i < npages; i++) {
2553                struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2554                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2555
2556                if (!page) {
2557                        if (newpage) {
2558                                unlock_page(newpage);
2559                                put_page(newpage);
2560                        }
2561                        continue;
2562                }
2563
2564                if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
2565                        if (newpage) {
2566                                unlock_page(newpage);
2567                                put_page(newpage);
2568                        }
2569                        newpage = page;
2570                }
2571
2572                remove_migration_ptes(page, newpage);
2573                unlock_page(page);
2574                migrate->cpages--;
2575
2576                if (is_zone_device_page(page))
2577                        put_page(page);
2578                else
2579                        putback_lru_page(page);
2580
2581                if (newpage != page) {
2582                        unlock_page(newpage);
2583                        if (is_zone_device_page(newpage))
2584                                put_page(newpage);
2585                        else
2586                                putback_lru_page(newpage);
2587                }
2588        }
2589}
2590
2591/*
2592 * migrate_vma() - migrate a range of memory inside vma
2593 *
2594 * @ops: migration callback for allocating destination memory and copying
2595 * @vma: virtual memory area containing the range to be migrated
2596 * @start: start address of the range to migrate (inclusive)
2597 * @end: end address of the range to migrate (exclusive)
2598 * @src: array of unsigned long entries holding the source pfns (MIGRATE_PFN_* encoded)
2599 * @dst: array of unsigned long entries holding the destination pfns (MIGRATE_PFN_* encoded)
2600 * @private: pointer passed back to each of the callbacks
2601 * Returns: 0 on success, error code otherwise
2602 *
2603 * This function tries to migrate a range of virtual addresses, using
2604 * callbacks to allocate and copy memory from source to destination. First it
2605 * collects all the pages backing each virtual address in the range, saving this
2606 * inside the src array. Then it locks those pages and unmaps them. Once the pages
2607 * are locked and unmapped, it checks whether each page is pinned or not. Pages
2608 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2609 * in the corresponding src array entry. It then restores any pages that are
2610 * pinned, by remapping and unlocking those pages.
2611 *
2612 * At this point it calls the alloc_and_copy() callback. For documentation on
2613 * what is expected from that callback, see struct migrate_vma_ops comments in
2614 * include/linux/migrate.h
2615 *
2616 * After the alloc_and_copy() callback, this function goes over each entry in
2617 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2618 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2619 * then the function tries to migrate struct page information from the source
2620 * struct page to the destination struct page. If it fails to migrate the struct
2621 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2622 * array.
2623 *
2624 * At this point all successfully migrated pages have an entry in the src
2625 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2626 * array entry with MIGRATE_PFN_VALID flag set.
2627 *
2628 * It then calls the finalize_and_map() callback. See comments for "struct
2629 * migrate_vma_ops", in include/linux/migrate.h for details about
2630 * finalize_and_map() behavior.
2631 *
2632 * After the finalize_and_map() callback, for successfully migrated pages, this
2633 * function updates the CPU page table to point to new pages, otherwise it
2634 * restores the CPU page table to point to the original source pages.
2635 *
2636 * The function returns 0 after the above steps, even if no pages were migrated
2637 * (it only returns an error if any of the arguments are invalid).
2638 *
2639 * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
2640 * unsigned long entries.
2641 */
2642int migrate_vma(const struct migrate_vma_ops *ops,
2643                struct vm_area_struct *vma,
2644                unsigned long start,
2645                unsigned long end,
2646                unsigned long *src,
2647                unsigned long *dst,
2648                void *private)
2649{
2650        struct migrate_vma migrate;
2651
2652        /* Sanity check the arguments */
2653        start &= PAGE_MASK;
2654        end &= PAGE_MASK;
2655        if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
2656                return -EINVAL;
2657        if (start < vma->vm_start || start >= vma->vm_end)
2658                return -EINVAL;
2659        if (end <= vma->vm_start || end > vma->vm_end)
2660                return -EINVAL;
2661        if (!ops || !src || !dst || start >= end)
2662                return -EINVAL;
2663
2664        memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2665        migrate.src = src;
2666        migrate.dst = dst;
2667        migrate.start = start;
2668        migrate.npages = 0;
2669        migrate.cpages = 0;
2670        migrate.end = end;
2671        migrate.vma = vma;
2672
2673        /* Collect, and try to unmap source pages */
2674        migrate_vma_collect(&migrate);
2675        if (!migrate.cpages)
2676                return 0;
2677
2678        /* Lock and isolate page */
2679        migrate_vma_prepare(&migrate);
2680        if (!migrate.cpages)
2681                return 0;
2682
2683        /* Unmap pages */
2684        migrate_vma_unmap(&migrate);
2685        if (!migrate.cpages)
2686                return 0;
2687
2688        /*
2689         * At this point pages are locked and unmapped, and thus they have
2690         * stable content and can safely be copied to destination memory that
2691         * is allocated by the callback.
2692         *
2693         * Note that migration can fail in migrate_vma_struct_page() for each
2694         * Note that migration can fail in migrate_vma_pages() for each
2695         */
2696        ops->alloc_and_copy(vma, src, dst, start, end, private);
2697
2698        /* This does the real migration of struct page */
2699        migrate_vma_pages(&migrate);
2700
2701        ops->finalize_and_map(vma, src, dst, start, end, private);
2702
2703        /* Unlock and remap pages */
2704        migrate_vma_finalize(&migrate);
2705
2706        return 0;
2707}
2708EXPORT_SYMBOL(migrate_vma);
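/*
 * Editorial example (not part of the original source): a rough outline of how
 * a device driver might drive migrate_vma(), based on the callback invocations
 * made above.  The callback prototypes should be checked against struct
 * migrate_vma_ops in include/linux/migrate.h; dmirror_alloc_page() and
 * dmirror_copy() are hypothetical driver helpers, not real kernel APIs, and
 * the caller is assumed to hold mmap_sem for read.
 */
#if 0	/* driver-side sketch, heavily simplified */
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>

static void example_alloc_and_copy(struct vm_area_struct *vma,
                                   const unsigned long *src,
                                   unsigned long *dst,
                                   unsigned long start,
                                   unsigned long end,
                                   void *private)
{
        unsigned long i, npages = (end - start) >> PAGE_SHIFT;

        for (i = 0; i < npages; i++) {
                struct page *spage = migrate_pfn_to_page(src[i]);
                struct page *dpage;

                /* Skip entries the core code decided not to migrate */
                if (!(src[i] & MIGRATE_PFN_MIGRATE))
                        continue;

                dpage = dmirror_alloc_page(private);    /* hypothetical helper */
                if (!dpage)
                        continue;                       /* dst[i] stays 0: not migrated */

                if (spage)
                        dmirror_copy(dpage, spage);     /* hypothetical helper */

                lock_page(dpage);
                dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
        }
}

static void example_finalize_and_map(struct vm_area_struct *vma,
                                     const unsigned long *src,
                                     const unsigned long *dst,
                                     unsigned long start,
                                     unsigned long end,
                                     void *private)
{
        /* Update device page tables / driver bookkeeping for migrated pages */
}

static const struct migrate_vma_ops example_migrate_ops = {
        .alloc_and_copy         = example_alloc_and_copy,
        .finalize_and_map       = example_finalize_and_map,
};

static int example_migrate_range(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end,
                                 void *private)
{
        unsigned long npages = (end - start) >> PAGE_SHIFT;
        unsigned long *src, *dst;
        int ret = -ENOMEM;

        src = vzalloc(npages * sizeof(*src));
        dst = vzalloc(npages * sizeof(*dst));
        if (!src || !dst)
                goto out;

        ret = migrate_vma(&example_migrate_ops, vma, start, end,
                          src, dst, private);
out:
        vfree(src);
        vfree(dst);
        return ret;
}
#endif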
2709