linux/mm/migrate.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Memory Migration functionality - linux/mm/migrate.c
   4 *
   5 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   6 *
   7 * Page migration was first developed in the context of the memory hotplug
   8 * project. The main authors of the migration code are:
   9 *
  10 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  11 * Hirokazu Takahashi <taka@valinux.co.jp>
  12 * Dave Hansen <haveblue@us.ibm.com>
  13 * Christoph Lameter
  14 */
  15
  16#include <linux/migrate.h>
  17#include <linux/export.h>
  18#include <linux/swap.h>
  19#include <linux/swapops.h>
  20#include <linux/pagemap.h>
  21#include <linux/buffer_head.h>
  22#include <linux/mm_inline.h>
  23#include <linux/nsproxy.h>
  24#include <linux/pagevec.h>
  25#include <linux/ksm.h>
  26#include <linux/rmap.h>
  27#include <linux/topology.h>
  28#include <linux/cpu.h>
  29#include <linux/cpuset.h>
  30#include <linux/writeback.h>
  31#include <linux/mempolicy.h>
  32#include <linux/vmalloc.h>
  33#include <linux/security.h>
  34#include <linux/backing-dev.h>
  35#include <linux/compaction.h>
  36#include <linux/syscalls.h>
  37#include <linux/hugetlb.h>
  38#include <linux/hugetlb_cgroup.h>
  39#include <linux/gfp.h>
  40#include <linux/pfn_t.h>
  41#include <linux/memremap.h>
  42#include <linux/userfaultfd_k.h>
  43#include <linux/balloon_compaction.h>
  44#include <linux/mmu_notifier.h>
  45#include <linux/page_idle.h>
  46#include <linux/page_owner.h>
  47#include <linux/sched/mm.h>
  48#include <linux/ptrace.h>
  49
  50#include <asm/tlbflush.h>
  51
  52#define CREATE_TRACE_POINTS
  53#include <trace/events/migrate.h>
  54
  55#include "internal.h"
  56
  57/*
  58 * migrate_prep() needs to be called before we start compiling a list of pages
  59 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
  60 * undesirable, use migrate_prep_local()
  61 */
  62int migrate_prep(void)
  63{
  64        /*
  65         * Clear the LRU lists so pages can be isolated.
  66         * Note that pages may be moved off the LRU after we have
  67         * drained them. Those pages will fail to migrate like other
  68         * pages that may be busy.
  69         */
  70        lru_add_drain_all();
  71
  72        return 0;
  73}
  74
  75/* Do the necessary work of migrate_prep but not if it involves other CPUs */
  76int migrate_prep_local(void)
  77{
  78        lru_add_drain();
  79
  80        return 0;
  81}
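
/*
 * A minimal sketch of the calling protocol described above, assuming a
 * hypothetical caller that moves a single LRU page to node 'nid'.  The names
 * alloc_target_page() and migrate_one_page() are invented for illustration;
 * compare new_page_node()/do_move_page_to_node_array() later in this file.
 *
 *	static struct page *alloc_target_page(struct page *page,
 *					      unsigned long nid, int **result)
 *	{
 *		return __alloc_pages_node((int)nid, GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	static int migrate_one_page(struct page *page, int nid)
 *	{
 *		LIST_HEAD(pagelist);
 *		int err;
 *
 *		migrate_prep();
 *		if (isolate_lru_page(page))
 *			return -EBUSY;
 *		list_add_tail(&page->lru, &pagelist);
 *		mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
 *				page_is_file_cache(page), hpage_nr_pages(page));
 *		err = migrate_pages(&pagelist, alloc_target_page, NULL, nid,
 *				MIGRATE_SYNC, MR_SYSCALL);
 *		if (err)
 *			putback_movable_pages(&pagelist);
 *		return err;
 *	}
 */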
  82
  83int isolate_movable_page(struct page *page, isolate_mode_t mode)
  84{
  85        struct address_space *mapping;
  86
  87        /*
   88         * Avoid burning cycles with pages that are still under __free_pages(),
  89         * or just got freed under us.
  90         *
  91         * In case we 'win' a race for a movable page being freed under us and
   92         * raise its refcount, preventing __free_pages() from doing its job,
   93         * the put_page() at the end of this block will take care of
   94         * releasing this page, thus avoiding a nasty leak.
  95         */
  96        if (unlikely(!get_page_unless_zero(page)))
  97                goto out;
  98
  99        /*
  100         * Check PageMovable before taking the page lock, because the page's
  101         * owner assumes nobody touches PG_locked on a newly allocated page,
  102         * so unconditionally grabbing the lock would break that assumption.
 103         */
 104        if (unlikely(!__PageMovable(page)))
 105                goto out_putpage;
 106        /*
 107         * As movable pages are not isolated from LRU lists, concurrent
 108         * compaction threads can race against page migration functions
  109         * as well as against the release of a page.
 110         *
 111         * In order to avoid having an already isolated movable page
 112         * being (wrongly) re-isolated while it is under migration,
 113         * or to avoid attempting to isolate pages being released,
  114         * let's be sure we have the page lock
 115         * before proceeding with the movable page isolation steps.
 116         */
 117        if (unlikely(!trylock_page(page)))
 118                goto out_putpage;
 119
 120        if (!PageMovable(page) || PageIsolated(page))
 121                goto out_no_isolated;
 122
 123        mapping = page_mapping(page);
 124        VM_BUG_ON_PAGE(!mapping, page);
 125
 126        if (!mapping->a_ops->isolate_page(page, mode))
 127                goto out_no_isolated;
 128
 129        /* Driver shouldn't use PG_isolated bit of page->flags */
 130        WARN_ON_ONCE(PageIsolated(page));
 131        __SetPageIsolated(page);
 132        unlock_page(page);
 133
 134        return 0;
 135
 136out_no_isolated:
 137        unlock_page(page);
 138out_putpage:
 139        put_page(page);
 140out:
 141        return -EBUSY;
 142}
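
/*
 * A sketch of the driver-side contract assumed above: a driver makes its
 * pages visible to this path by giving them a mapping whose
 * address_space_operations provide isolate_page/migratepage/putback_page and
 * by calling __SetPageMovable() on each page.  The mydrv_* names are
 * hypothetical (the mapping handed to mydrv_make_movable() is assumed to use
 * mydrv_aops); see mm/balloon_compaction.c and mm/zsmalloc.c for in-tree
 * users of this interface.
 *
 *	static const struct address_space_operations mydrv_aops = {
 *		.isolate_page	= mydrv_isolate_page,
 *		.migratepage	= mydrv_migratepage,
 *		.putback_page	= mydrv_putback_page,
 *	};
 *
 *	static void mydrv_make_movable(struct page *page,
 *				       struct address_space *mapping)
 *	{
 *		__SetPageMovable(page, mapping);
 *	}
 */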
 143
  144/* It should be called on an isolated, locked page which is PG_movable */
 145void putback_movable_page(struct page *page)
 146{
 147        struct address_space *mapping;
 148
 149        VM_BUG_ON_PAGE(!PageLocked(page), page);
 150        VM_BUG_ON_PAGE(!PageMovable(page), page);
 151        VM_BUG_ON_PAGE(!PageIsolated(page), page);
 152
 153        mapping = page_mapping(page);
 154        mapping->a_ops->putback_page(page);
 155        __ClearPageIsolated(page);
 156}
 157
 158/*
 159 * Put previously isolated pages back onto the appropriate lists
 160 * from where they were once taken off for compaction/migration.
 161 *
 162 * This function shall be used whenever the isolated pageset has been
  163 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
 164 * and isolate_huge_page().
 165 */
 166void putback_movable_pages(struct list_head *l)
 167{
 168        struct page *page;
 169        struct page *page2;
 170
 171        list_for_each_entry_safe(page, page2, l, lru) {
 172                if (unlikely(PageHuge(page))) {
 173                        putback_active_hugepage(page);
 174                        continue;
 175                }
 176                list_del(&page->lru);
 177                /*
  178                 * We isolated a non-LRU movable page, so here we can use
  179                 * __PageMovable because an LRU page's mapping cannot have
  180                 * PAGE_MAPPING_MOVABLE set.
 181                 */
 182                if (unlikely(__PageMovable(page))) {
 183                        VM_BUG_ON_PAGE(!PageIsolated(page), page);
 184                        lock_page(page);
 185                        if (PageMovable(page))
 186                                putback_movable_page(page);
 187                        else
 188                                __ClearPageIsolated(page);
 189                        unlock_page(page);
 190                        put_page(page);
 191                } else {
 192                        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
 193                                        page_is_file_cache(page), -hpage_nr_pages(page));
 194                        putback_lru_page(page);
 195                }
 196        }
 197}
 198
 199/*
 200 * Restore a potential migration pte to a working pte entry
 201 */
 202static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 203                                 unsigned long addr, void *old)
 204{
 205        struct page_vma_mapped_walk pvmw = {
 206                .page = old,
 207                .vma = vma,
 208                .address = addr,
 209                .flags = PVMW_SYNC | PVMW_MIGRATION,
 210        };
 211        struct page *new;
 212        pte_t pte;
 213        swp_entry_t entry;
 214
 215        VM_BUG_ON_PAGE(PageTail(page), page);
 216        while (page_vma_mapped_walk(&pvmw)) {
 217                if (PageKsm(page))
 218                        new = page;
 219                else
 220                        new = page - pvmw.page->index +
 221                                linear_page_index(vma, pvmw.address);
 222
 223#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 224                /* PMD-mapped THP migration entry */
 225                if (!pvmw.pte) {
 226                        VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
 227                        remove_migration_pmd(&pvmw, new);
 228                        continue;
 229                }
 230#endif
 231
 232                get_page(new);
 233                pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
 234                if (pte_swp_soft_dirty(*pvmw.pte))
 235                        pte = pte_mksoft_dirty(pte);
 236
 237                /*
 238                 * Recheck VMA as permissions can change since migration started
 239                 */
 240                entry = pte_to_swp_entry(*pvmw.pte);
 241                if (is_write_migration_entry(entry))
 242                        pte = maybe_mkwrite(pte, vma);
 243
 244                if (unlikely(is_zone_device_page(new))) {
 245                        if (is_device_private_page(new)) {
 246                                entry = make_device_private_entry(new, pte_write(pte));
 247                                pte = swp_entry_to_pte(entry);
 248                        } else if (is_device_public_page(new)) {
 249                                pte = pte_mkdevmap(pte);
 250                                flush_dcache_page(new);
 251                        }
 252                } else
 253                        flush_dcache_page(new);
 254
 255#ifdef CONFIG_HUGETLB_PAGE
 256                if (PageHuge(new)) {
 257                        pte = pte_mkhuge(pte);
 258                        pte = arch_make_huge_pte(pte, vma, new, 0);
 259                        set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 260                        if (PageAnon(new))
 261                                hugepage_add_anon_rmap(new, vma, pvmw.address);
 262                        else
 263                                page_dup_rmap(new, true);
 264                } else
 265#endif
 266                {
 267                        set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 268
 269                        if (PageAnon(new))
 270                                page_add_anon_rmap(new, vma, pvmw.address, false);
 271                        else
 272                                page_add_file_rmap(new, false);
 273                }
 274                if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
 275                        mlock_vma_page(new);
 276
 277                /* No need to invalidate - it was non-present before */
 278                update_mmu_cache(vma, pvmw.address, pvmw.pte);
 279        }
 280
 281        return true;
 282}
 283
 284/*
 285 * Get rid of all migration entries and replace them by
 286 * references to the indicated page.
 287 */
 288void remove_migration_ptes(struct page *old, struct page *new, bool locked)
 289{
 290        struct rmap_walk_control rwc = {
 291                .rmap_one = remove_migration_pte,
 292                .arg = old,
 293        };
 294
 295        if (locked)
 296                rmap_walk_locked(new, &rwc);
 297        else
 298                rmap_walk(new, &rwc);
 299}
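
/*
 * remove_migration_ptes() is the second half of a pair: the migration
 * entries it tears down are installed by try_to_unmap() with TTU_MIGRATION,
 * roughly as __unmap_and_move() below does:
 *
 *	if (page_mapped(page)) {
 *		try_to_unmap(page,
 *			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 *		page_was_mapped = 1;
 *	}
 *	if (!page_mapped(page))
 *		rc = move_to_new_page(newpage, page, mode);
 *	if (page_was_mapped)
 *		remove_migration_ptes(page,
 *			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
 */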
 300
 301/*
 302 * Something used the pte of a page under migration. We need to
 303 * get to the page and wait until migration is finished.
 304 * When we return from this function the fault will be retried.
 305 */
 306void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 307                                spinlock_t *ptl)
 308{
 309        pte_t pte;
 310        swp_entry_t entry;
 311        struct page *page;
 312
 313        spin_lock(ptl);
 314        pte = *ptep;
 315        if (!is_swap_pte(pte))
 316                goto out;
 317
 318        entry = pte_to_swp_entry(pte);
 319        if (!is_migration_entry(entry))
 320                goto out;
 321
 322        page = migration_entry_to_page(entry);
 323
 324        /*
  325         * Once the radix-tree replacement for page migration has started,
  326         * page_count *must* be zero. And we don't want to call
  327         * wait_on_page_locked() against a page without get_page().
  328         * So we use get_page_unless_zero() here. Even if it fails, the page
  329         * fault will occur again.
 330         */
 331        if (!get_page_unless_zero(page))
 332                goto out;
 333        pte_unmap_unlock(ptep, ptl);
 334        wait_on_page_locked(page);
 335        put_page(page);
 336        return;
 337out:
 338        pte_unmap_unlock(ptep, ptl);
 339}
 340
 341void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 342                                unsigned long address)
 343{
 344        spinlock_t *ptl = pte_lockptr(mm, pmd);
 345        pte_t *ptep = pte_offset_map(pmd, address);
 346        __migration_entry_wait(mm, ptep, ptl);
 347}
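
/*
 * This is what the fault path uses when it trips over a migration entry.
 * Roughly, do_swap_page() in mm/memory.c does:
 *
 *	entry = pte_to_swp_entry(vmf->orig_pte);
 *	if (unlikely(non_swap_entry(entry))) {
 *		if (is_migration_entry(entry))
 *			migration_entry_wait(vma->vm_mm, vmf->pmd,
 *					     vmf->address);
 *		...
 *	}
 */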
 348
 349void migration_entry_wait_huge(struct vm_area_struct *vma,
 350                struct mm_struct *mm, pte_t *pte)
 351{
 352        spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
 353        __migration_entry_wait(mm, pte, ptl);
 354}
 355
 356#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 357void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 358{
 359        spinlock_t *ptl;
 360        struct page *page;
 361
 362        ptl = pmd_lock(mm, pmd);
 363        if (!is_pmd_migration_entry(*pmd))
 364                goto unlock;
 365        page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
 366        if (!get_page_unless_zero(page))
 367                goto unlock;
 368        spin_unlock(ptl);
 369        wait_on_page_locked(page);
 370        put_page(page);
 371        return;
 372unlock:
 373        spin_unlock(ptl);
 374}
 375#endif
 376
 377#ifdef CONFIG_BLOCK
 378/* Returns true if all buffers are successfully locked */
 379static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 380                                                        enum migrate_mode mode)
 381{
 382        struct buffer_head *bh = head;
 383
 384        /* Simple case, sync compaction */
 385        if (mode != MIGRATE_ASYNC) {
 386                do {
 387                        get_bh(bh);
 388                        lock_buffer(bh);
 389                        bh = bh->b_this_page;
 390
 391                } while (bh != head);
 392
 393                return true;
 394        }
 395
 396        /* async case, we cannot block on lock_buffer so use trylock_buffer */
 397        do {
 398                get_bh(bh);
 399                if (!trylock_buffer(bh)) {
 400                        /*
 401                         * We failed to lock the buffer and cannot stall in
  402                         * async migration. Release the locks taken so far.
 403                         */
 404                        struct buffer_head *failed_bh = bh;
 405                        put_bh(failed_bh);
 406                        bh = head;
 407                        while (bh != failed_bh) {
 408                                unlock_buffer(bh);
 409                                put_bh(bh);
 410                                bh = bh->b_this_page;
 411                        }
 412                        return false;
 413                }
 414
 415                bh = bh->b_this_page;
 416        } while (bh != head);
 417        return true;
 418}
 419#else
 420static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
 421                                                        enum migrate_mode mode)
 422{
 423        return true;
 424}
 425#endif /* CONFIG_BLOCK */
 426
 427/*
 428 * Replace the page in the mapping.
 429 *
 430 * The number of remaining references must be:
 431 * 1 for anonymous pages without a mapping
 432 * 2 for pages with a mapping
 433 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 434 */
 435int migrate_page_move_mapping(struct address_space *mapping,
 436                struct page *newpage, struct page *page,
 437                struct buffer_head *head, enum migrate_mode mode,
 438                int extra_count)
 439{
 440        struct zone *oldzone, *newzone;
 441        int dirty;
 442        int expected_count = 1 + extra_count;
 443        void **pslot;
 444
 445        /*
 446         * Device public or private pages have an extra refcount as they are
 447         * ZONE_DEVICE pages.
 448         */
 449        expected_count += is_device_private_page(page);
 450        expected_count += is_device_public_page(page);
 451
 452        if (!mapping) {
 453                /* Anonymous page without mapping */
 454                if (page_count(page) != expected_count)
 455                        return -EAGAIN;
 456
 457                /* No turning back from here */
 458                newpage->index = page->index;
 459                newpage->mapping = page->mapping;
 460                if (PageSwapBacked(page))
 461                        __SetPageSwapBacked(newpage);
 462
 463                return MIGRATEPAGE_SUCCESS;
 464        }
 465
 466        oldzone = page_zone(page);
 467        newzone = page_zone(newpage);
 468
 469        spin_lock_irq(&mapping->tree_lock);
 470
 471        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 472                                        page_index(page));
 473
 474        expected_count += 1 + page_has_private(page);
 475        if (page_count(page) != expected_count ||
 476                radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 477                spin_unlock_irq(&mapping->tree_lock);
 478                return -EAGAIN;
 479        }
 480
 481        if (!page_ref_freeze(page, expected_count)) {
 482                spin_unlock_irq(&mapping->tree_lock);
 483                return -EAGAIN;
 484        }
 485
 486        /*
 487         * In the async migration case of moving a page with buffers, lock the
 488         * buffers using trylock before the mapping is moved. If the mapping
  489         * were moved first and we then failed to lock the buffers, we could
  490         * not move the mapping back due to the elevated page count and would
  491         * have to block waiting on other references to be dropped.
 492         */
 493        if (mode == MIGRATE_ASYNC && head &&
 494                        !buffer_migrate_lock_buffers(head, mode)) {
 495                page_ref_unfreeze(page, expected_count);
 496                spin_unlock_irq(&mapping->tree_lock);
 497                return -EAGAIN;
 498        }
 499
 500        /*
 501         * Now we know that no one else is looking at the page:
 502         * no turning back from here.
 503         */
 504        newpage->index = page->index;
 505        newpage->mapping = page->mapping;
 506        get_page(newpage);      /* add cache reference */
 507        if (PageSwapBacked(page)) {
 508                __SetPageSwapBacked(newpage);
 509                if (PageSwapCache(page)) {
 510                        SetPageSwapCache(newpage);
 511                        set_page_private(newpage, page_private(page));
 512                }
 513        } else {
 514                VM_BUG_ON_PAGE(PageSwapCache(page), page);
 515        }
 516
 517        /* Move dirty while page refs frozen and newpage not yet exposed */
 518        dirty = PageDirty(page);
 519        if (dirty) {
 520                ClearPageDirty(page);
 521                SetPageDirty(newpage);
 522        }
 523
 524        radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
 525
 526        /*
 527         * Drop cache reference from old page by unfreezing
 528         * to one less reference.
 529         * We know this isn't the last reference.
 530         */
 531        page_ref_unfreeze(page, expected_count - 1);
 532
 533        spin_unlock(&mapping->tree_lock);
 534        /* Leave irq disabled to prevent preemption while updating stats */
 535
 536        /*
 537         * If moved to a different zone then also account
 538         * the page for that zone. Other VM counters will be
 539         * taken care of when we establish references to the
 540         * new page and drop references to the old page.
 541         *
 542         * Note that anonymous pages are accounted for
 543         * via NR_FILE_PAGES and NR_ANON_MAPPED if they
 544         * are mapped to swap space.
 545         */
 546        if (newzone != oldzone) {
 547                __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
 548                __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
 549                if (PageSwapBacked(page) && !PageSwapCache(page)) {
 550                        __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
 551                        __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
 552                }
 553                if (dirty && mapping_cap_account_dirty(mapping)) {
 554                        __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
 555                        __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
 556                        __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
 557                        __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
 558                }
 559        }
 560        local_irq_enable();
 561
 562        return MIGRATEPAGE_SUCCESS;
 563}
 564EXPORT_SYMBOL(migrate_page_move_mapping);
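
/*
 * A worked example of the reference counting above, for a dirty pagecache
 * page with buffer_heads and extra_count == 0:
 *
 *	  1 (reference held by the migration caller, taken at isolation)
 *	+ 1 (reference held by the radix-tree/page-cache slot)
 *	+ 1 (reference held via PagePrivate for the buffer_heads)
 *	= 3, matching the "3 for pages with a mapping and
 *	     PagePrivate/PagePrivate2 set" case listed above
 *	     migrate_page_move_mapping().
 */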
 565
 566/*
 567 * The expected number of remaining references is the same as that
 568 * of migrate_page_move_mapping().
 569 */
 570int migrate_huge_page_move_mapping(struct address_space *mapping,
 571                                   struct page *newpage, struct page *page)
 572{
 573        int expected_count;
 574        void **pslot;
 575
 576        spin_lock_irq(&mapping->tree_lock);
 577
 578        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 579                                        page_index(page));
 580
 581        expected_count = 2 + page_has_private(page);
 582        if (page_count(page) != expected_count ||
 583                radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 584                spin_unlock_irq(&mapping->tree_lock);
 585                return -EAGAIN;
 586        }
 587
 588        if (!page_ref_freeze(page, expected_count)) {
 589                spin_unlock_irq(&mapping->tree_lock);
 590                return -EAGAIN;
 591        }
 592
 593        newpage->index = page->index;
 594        newpage->mapping = page->mapping;
 595
 596        get_page(newpage);
 597
 598        radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
 599
 600        page_ref_unfreeze(page, expected_count - 1);
 601
 602        spin_unlock_irq(&mapping->tree_lock);
 603
 604        return MIGRATEPAGE_SUCCESS;
 605}
 606
 607/*
 608 * Gigantic pages are so large that we do not guarantee that page++ pointer
 609 * arithmetic will work across the entire page.  We need something more
 610 * specialized.
 611 */
 612static void __copy_gigantic_page(struct page *dst, struct page *src,
 613                                int nr_pages)
 614{
 615        int i;
 616        struct page *dst_base = dst;
 617        struct page *src_base = src;
 618
 619        for (i = 0; i < nr_pages; ) {
 620                cond_resched();
 621                copy_highpage(dst, src);
 622
 623                i++;
 624                dst = mem_map_next(dst, dst_base, i);
 625                src = mem_map_next(src, src_base, i);
 626        }
 627}
 628
 629static void copy_huge_page(struct page *dst, struct page *src)
 630{
 631        int i;
 632        int nr_pages;
 633
 634        if (PageHuge(src)) {
 635                /* hugetlbfs page */
 636                struct hstate *h = page_hstate(src);
 637                nr_pages = pages_per_huge_page(h);
 638
 639                if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
 640                        __copy_gigantic_page(dst, src, nr_pages);
 641                        return;
 642                }
 643        } else {
 644                /* thp page */
 645                BUG_ON(!PageTransHuge(src));
 646                nr_pages = hpage_nr_pages(src);
 647        }
 648
 649        for (i = 0; i < nr_pages; i++) {
 650                cond_resched();
 651                copy_highpage(dst + i, src + i);
 652        }
 653}
 654
 655/*
 656 * Copy the page to its new location
 657 */
 658void migrate_page_states(struct page *newpage, struct page *page)
 659{
 660        int cpupid;
 661
 662        if (PageError(page))
 663                SetPageError(newpage);
 664        if (PageReferenced(page))
 665                SetPageReferenced(newpage);
 666        if (PageUptodate(page))
 667                SetPageUptodate(newpage);
 668        if (TestClearPageActive(page)) {
 669                VM_BUG_ON_PAGE(PageUnevictable(page), page);
 670                SetPageActive(newpage);
 671        } else if (TestClearPageUnevictable(page))
 672                SetPageUnevictable(newpage);
 673        if (PageChecked(page))
 674                SetPageChecked(newpage);
 675        if (PageMappedToDisk(page))
 676                SetPageMappedToDisk(newpage);
 677
 678        /* Move dirty on pages not done by migrate_page_move_mapping() */
 679        if (PageDirty(page))
 680                SetPageDirty(newpage);
 681
 682        if (page_is_young(page))
 683                set_page_young(newpage);
 684        if (page_is_idle(page))
 685                set_page_idle(newpage);
 686
 687        /*
 688         * Copy NUMA information to the new page, to prevent over-eager
 689         * future migrations of this same page.
 690         */
 691        cpupid = page_cpupid_xchg_last(page, -1);
 692        page_cpupid_xchg_last(newpage, cpupid);
 693
 694        ksm_migrate_page(newpage, page);
 695        /*
 696         * Please do not reorder this without considering how mm/ksm.c's
 697         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
 698         */
 699        if (PageSwapCache(page))
 700                ClearPageSwapCache(page);
 701        ClearPagePrivate(page);
 702        set_page_private(page, 0);
 703
 704        /*
 705         * If any waiters have accumulated on the new page then
 706         * wake them up.
 707         */
 708        if (PageWriteback(newpage))
 709                end_page_writeback(newpage);
 710
 711        copy_page_owner(page, newpage);
 712
 713        mem_cgroup_migrate(page, newpage);
 714}
 715EXPORT_SYMBOL(migrate_page_states);
 716
 717void migrate_page_copy(struct page *newpage, struct page *page)
 718{
 719        if (PageHuge(page) || PageTransHuge(page))
 720                copy_huge_page(newpage, page);
 721        else
 722                copy_highpage(newpage, page);
 723
 724        migrate_page_states(newpage, page);
 725}
 726EXPORT_SYMBOL(migrate_page_copy);
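
/*
 * migrate_page_move_mapping(), migrate_page_states() and migrate_page_copy()
 * are exported so that a driver or filesystem can compose its own
 * ->migratepage hook from them.  A hypothetical hook with no private data to
 * transfer (mydrv_migratepage() is an invented name) would mirror
 * migrate_page() below:
 *
 *	static int mydrv_migratepage(struct address_space *mapping,
 *			struct page *newpage, struct page *page,
 *			enum migrate_mode mode)
 *	{
 *		int rc;
 *
 *		rc = migrate_page_move_mapping(mapping, newpage, page,
 *					       NULL, mode, 0);
 *		if (rc != MIGRATEPAGE_SUCCESS)
 *			return rc;
 *		migrate_page_copy(newpage, page);
 *		return MIGRATEPAGE_SUCCESS;
 *	}
 */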
 727
 728/************************************************************
 729 *                    Migration functions
 730 ***********************************************************/
 731
 732/*
 733 * Common logic to directly migrate a single LRU page suitable for
 734 * pages that do not use PagePrivate/PagePrivate2.
 735 *
 736 * Pages are locked upon entry and exit.
 737 */
 738int migrate_page(struct address_space *mapping,
 739                struct page *newpage, struct page *page,
 740                enum migrate_mode mode)
 741{
 742        int rc;
 743
 744        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 745
 746        rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
 747
 748        if (rc != MIGRATEPAGE_SUCCESS)
 749                return rc;
 750
 751        if (mode != MIGRATE_SYNC_NO_COPY)
 752                migrate_page_copy(newpage, page);
 753        else
 754                migrate_page_states(newpage, page);
 755        return MIGRATEPAGE_SUCCESS;
 756}
 757EXPORT_SYMBOL(migrate_page);
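
/*
 * Filesystems whose pages carry no private data usually just point their
 * address_space_operations at this helper.  myfs_aops is a hypothetical
 * name; shmem, among others, does exactly this:
 *
 *	static const struct address_space_operations myfs_aops = {
 *		...
 *		.migratepage	= migrate_page,
 *	};
 */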
 758
 759#ifdef CONFIG_BLOCK
 760/*
 761 * Migration function for pages with buffers. This function can only be used
 762 * if the underlying filesystem guarantees that no other references to "page"
 763 * exist.
 764 */
 765int buffer_migrate_page(struct address_space *mapping,
 766                struct page *newpage, struct page *page, enum migrate_mode mode)
 767{
 768        struct buffer_head *bh, *head;
 769        int rc;
 770
 771        if (!page_has_buffers(page))
 772                return migrate_page(mapping, newpage, page, mode);
 773
 774        head = page_buffers(page);
 775
 776        rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
 777
 778        if (rc != MIGRATEPAGE_SUCCESS)
 779                return rc;
 780
 781        /*
 782         * In the async case, migrate_page_move_mapping locked the buffers
 783         * with an IRQ-safe spinlock held. In the sync case, the buffers
 784         * need to be locked now
 785         */
 786        if (mode != MIGRATE_ASYNC)
 787                BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 788
 789        ClearPagePrivate(page);
 790        set_page_private(newpage, page_private(page));
 791        set_page_private(page, 0);
 792        put_page(page);
 793        get_page(newpage);
 794
 795        bh = head;
 796        do {
 797                set_bh_page(bh, newpage, bh_offset(bh));
 798                bh = bh->b_this_page;
 799
 800        } while (bh != head);
 801
 802        SetPagePrivate(newpage);
 803
 804        if (mode != MIGRATE_SYNC_NO_COPY)
 805                migrate_page_copy(newpage, page);
 806        else
 807                migrate_page_states(newpage, page);
 808
 809        bh = head;
 810        do {
 811                unlock_buffer(bh);
 812                put_bh(bh);
 813                bh = bh->b_this_page;
 814
 815        } while (bh != head);
 816
 817        return MIGRATEPAGE_SUCCESS;
 818}
 819EXPORT_SYMBOL(buffer_migrate_page);
 820#endif
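
/*
 * Block-backed filesystems that attach buffer_heads to their pagecache pages
 * use buffer_migrate_page() instead.  myfs_aops is a hypothetical name;
 * ext2, for example, wires it up this way:
 *
 *	static const struct address_space_operations myfs_aops = {
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 */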
 821
 822/*
 823 * Writeback a page to clean the dirty state
 824 */
 825static int writeout(struct address_space *mapping, struct page *page)
 826{
 827        struct writeback_control wbc = {
 828                .sync_mode = WB_SYNC_NONE,
 829                .nr_to_write = 1,
 830                .range_start = 0,
 831                .range_end = LLONG_MAX,
 832                .for_reclaim = 1
 833        };
 834        int rc;
 835
 836        if (!mapping->a_ops->writepage)
 837                /* No write method for the address space */
 838                return -EINVAL;
 839
 840        if (!clear_page_dirty_for_io(page))
 841                /* Someone else already triggered a write */
 842                return -EAGAIN;
 843
 844        /*
 845         * A dirty page may imply that the underlying filesystem has
 846         * the page on some queue. So the page must be clean for
  847         * migration. Writeout may mean we lose the lock and the
 848         * page state is no longer what we checked for earlier.
 849         * At this point we know that the migration attempt cannot
 850         * be successful.
 851         */
 852        remove_migration_ptes(page, page, false);
 853
 854        rc = mapping->a_ops->writepage(page, &wbc);
 855
 856        if (rc != AOP_WRITEPAGE_ACTIVATE)
 857                /* unlocked. Relock */
 858                lock_page(page);
 859
 860        return (rc < 0) ? -EIO : -EAGAIN;
 861}
 862
 863/*
 864 * Default handling if a filesystem does not provide a migration function.
 865 */
 866static int fallback_migrate_page(struct address_space *mapping,
 867        struct page *newpage, struct page *page, enum migrate_mode mode)
 868{
 869        if (PageDirty(page)) {
 870                /* Only writeback pages in full synchronous migration */
 871                switch (mode) {
 872                case MIGRATE_SYNC:
 873                case MIGRATE_SYNC_NO_COPY:
 874                        break;
 875                default:
 876                        return -EBUSY;
 877                }
 878                return writeout(mapping, page);
 879        }
 880
 881        /*
 882         * Buffers may be managed in a filesystem specific way.
 883         * We must have no buffers or drop them.
 884         */
 885        if (page_has_private(page) &&
 886            !try_to_release_page(page, GFP_KERNEL))
 887                return -EAGAIN;
 888
 889        return migrate_page(mapping, newpage, page, mode);
 890}
 891
 892/*
  893 * Move a page to a newly allocated page.
 894 * The page is locked and all ptes have been successfully removed.
 895 *
 896 * The new page will have replaced the old page if this function
 897 * is successful.
 898 *
 899 * Return value:
 900 *   < 0 - error code
 901 *  MIGRATEPAGE_SUCCESS - success
 902 */
 903static int move_to_new_page(struct page *newpage, struct page *page,
 904                                enum migrate_mode mode)
 905{
 906        struct address_space *mapping;
 907        int rc = -EAGAIN;
 908        bool is_lru = !__PageMovable(page);
 909
 910        VM_BUG_ON_PAGE(!PageLocked(page), page);
 911        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 912
 913        mapping = page_mapping(page);
 914
 915        if (likely(is_lru)) {
 916                if (!mapping)
 917                        rc = migrate_page(mapping, newpage, page, mode);
 918                else if (mapping->a_ops->migratepage)
 919                        /*
 920                         * Most pages have a mapping and most filesystems
 921                         * provide a migratepage callback. Anonymous pages
 922                         * are part of swap space which also has its own
 923                         * migratepage callback. This is the most common path
 924                         * for page migration.
 925                         */
 926                        rc = mapping->a_ops->migratepage(mapping, newpage,
 927                                                        page, mode);
 928                else
 929                        rc = fallback_migrate_page(mapping, newpage,
 930                                                        page, mode);
 931        } else {
 932                /*
  933                 * In the case of a non-LRU page, it could have been released
  934                 * after the isolation step. In that case, don't try migration.
 935                 */
 936                VM_BUG_ON_PAGE(!PageIsolated(page), page);
 937                if (!PageMovable(page)) {
 938                        rc = MIGRATEPAGE_SUCCESS;
 939                        __ClearPageIsolated(page);
 940                        goto out;
 941                }
 942
 943                rc = mapping->a_ops->migratepage(mapping, newpage,
 944                                                page, mode);
 945                WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
 946                        !PageIsolated(page));
 947        }
 948
 949        /*
 950         * When successful, old pagecache page->mapping must be cleared before
 951         * page is freed; but stats require that PageAnon be left as PageAnon.
 952         */
 953        if (rc == MIGRATEPAGE_SUCCESS) {
 954                if (__PageMovable(page)) {
 955                        VM_BUG_ON_PAGE(!PageIsolated(page), page);
 956
 957                        /*
  958                         * We clear PG_movable under the page lock so that
  959                         * no compactor can try to migrate this page.
 960                         */
 961                        __ClearPageIsolated(page);
 962                }
 963
 964                /*
  965                 * Anonymous and movable page->mapping will be cleared by
  966                 * free_pages_prepare() so don't reset it here; keeping it
  967                 * lets checks such as PageAnon() keep working.
 968                 */
 969                if (!PageMappingFlags(page))
 970                        page->mapping = NULL;
 971        }
 972out:
 973        return rc;
 974}
 975
 976static int __unmap_and_move(struct page *page, struct page *newpage,
 977                                int force, enum migrate_mode mode)
 978{
 979        int rc = -EAGAIN;
 980        int page_was_mapped = 0;
 981        struct anon_vma *anon_vma = NULL;
 982        bool is_lru = !__PageMovable(page);
 983
 984        if (!trylock_page(page)) {
 985                if (!force || mode == MIGRATE_ASYNC)
 986                        goto out;
 987
 988                /*
 989                 * It's not safe for direct compaction to call lock_page.
 990                 * For example, during page readahead pages are added locked
 991                 * to the LRU. Later, when the IO completes the pages are
 992                 * marked uptodate and unlocked. However, the queueing
 993                 * could be merging multiple pages for one bio (e.g.
 994                 * mpage_readpages). If an allocation happens for the
 995                 * second or third page, the process can end up locking
 996                 * the same page twice and deadlocking. Rather than
 997                 * trying to be clever about what pages can be locked,
 998                 * avoid the use of lock_page for direct compaction
 999                 * altogether.
1000                 */
1001                if (current->flags & PF_MEMALLOC)
1002                        goto out;
1003
1004                lock_page(page);
1005        }
1006
1007        if (PageWriteback(page)) {
1008                /*
1009                 * Only in the case of a full synchronous migration is it
1010                 * necessary to wait for PageWriteback. In the async case,
1011                 * the retry loop is too short and in the sync-light case,
1012                 * the overhead of stalling is too much
1013                 */
1014                switch (mode) {
1015                case MIGRATE_SYNC:
1016                case MIGRATE_SYNC_NO_COPY:
1017                        break;
1018                default:
1019                        rc = -EBUSY;
1020                        goto out_unlock;
1021                }
1022                if (!force)
1023                        goto out_unlock;
1024                wait_on_page_writeback(page);
1025        }
1026
1027        /*
1028         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 1029         * we cannot notice that the anon_vma is freed while we migrate a page.
 1030         * This get_anon_vma() delays freeing the anon_vma pointer until the
 1031         * end of migration. File cache pages are no problem because of the
 1032         * page lock: file caches may use writepage() or lock_page() during
 1033         * migration, so we only need to care about anonymous pages here.
1034         *
1035         * Only page_get_anon_vma() understands the subtleties of
1036         * getting a hold on an anon_vma from outside one of its mms.
1037         * But if we cannot get anon_vma, then we won't need it anyway,
1038         * because that implies that the anon page is no longer mapped
1039         * (and cannot be remapped so long as we hold the page lock).
1040         */
1041        if (PageAnon(page) && !PageKsm(page))
1042                anon_vma = page_get_anon_vma(page);
1043
1044        /*
1045         * Block others from accessing the new page when we get around to
1046         * establishing additional references. We are usually the only one
1047         * holding a reference to newpage at this point. We used to have a BUG
1048         * here if trylock_page(newpage) fails, but would like to allow for
1049         * cases where there might be a race with the previous use of newpage.
1050         * This is much like races on refcount of oldpage: just don't BUG().
1051         */
1052        if (unlikely(!trylock_page(newpage)))
1053                goto out_unlock;
1054
1055        if (unlikely(!is_lru)) {
1056                rc = move_to_new_page(newpage, page, mode);
1057                goto out_unlock_both;
1058        }
1059
1060        /*
1061         * Corner case handling:
 1062         * 1. When a new swap-cache page is read in, it is added to the LRU
1063         * and treated as swapcache but it has no rmap yet.
1064         * Calling try_to_unmap() against a page->mapping==NULL page will
1065         * trigger a BUG.  So handle it here.
1066         * 2. An orphaned page (see truncate_complete_page) might have
1067         * fs-private metadata. The page can be picked up due to memory
1068         * offlining.  Everywhere else except page reclaim, the page is
1069         * invisible to the vm, so the page can not be migrated.  So try to
1070         * free the metadata, so the page can be freed.
1071         */
1072        if (!page->mapping) {
1073                VM_BUG_ON_PAGE(PageAnon(page), page);
1074                if (page_has_private(page)) {
1075                        try_to_free_buffers(page);
1076                        goto out_unlock_both;
1077                }
1078        } else if (page_mapped(page)) {
1079                /* Establish migration ptes */
1080                VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
1081                                page);
1082                try_to_unmap(page,
1083                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1084                page_was_mapped = 1;
1085        }
1086
1087        if (!page_mapped(page))
1088                rc = move_to_new_page(newpage, page, mode);
1089
1090        if (page_was_mapped)
1091                remove_migration_ptes(page,
1092                        rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
1093
1094out_unlock_both:
1095        unlock_page(newpage);
1096out_unlock:
1097        /* Drop an anon_vma reference if we took one */
1098        if (anon_vma)
1099                put_anon_vma(anon_vma);
1100        unlock_page(page);
1101out:
1102        /*
 1103         * If migration is successful, decrease the refcount of newpage;
 1104         * this will not free the page because the new page owner holds a
 1105         * reference. If it is an LRU page, also add the page back to the
 1106         * LRU list here.
1107         */
1108        if (rc == MIGRATEPAGE_SUCCESS) {
1109                if (unlikely(__PageMovable(newpage)))
1110                        put_page(newpage);
1111                else
1112                        putback_lru_page(newpage);
1113        }
1114
1115        return rc;
1116}
1117
1118/*
 1119 * gcc 4.7 and 4.8 on arm hit an ICE when inlining unmap_and_move().  Work
1120 * around it.
1121 */
1122#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
1123#define ICE_noinline noinline
1124#else
1125#define ICE_noinline
1126#endif
1127
1128/*
1129 * Obtain the lock on page, remove all ptes and migrate the page
1130 * to the newly allocated page in newpage.
1131 */
1132static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1133                                   free_page_t put_new_page,
1134                                   unsigned long private, struct page *page,
1135                                   int force, enum migrate_mode mode,
1136                                   enum migrate_reason reason)
1137{
1138        int rc = MIGRATEPAGE_SUCCESS;
1139        int *result = NULL;
1140        struct page *newpage;
1141
1142        newpage = get_new_page(page, private, &result);
1143        if (!newpage)
1144                return -ENOMEM;
1145
1146        if (page_count(page) == 1) {
1147                /* page was freed from under us. So we are done. */
1148                ClearPageActive(page);
1149                ClearPageUnevictable(page);
1150                if (unlikely(__PageMovable(page))) {
1151                        lock_page(page);
1152                        if (!PageMovable(page))
1153                                __ClearPageIsolated(page);
1154                        unlock_page(page);
1155                }
1156                if (put_new_page)
1157                        put_new_page(newpage, private);
1158                else
1159                        put_page(newpage);
1160                goto out;
1161        }
1162
1163        if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
1164                lock_page(page);
1165                rc = split_huge_page(page);
1166                unlock_page(page);
1167                if (rc)
1168                        goto out;
1169        }
1170
1171        rc = __unmap_and_move(page, newpage, force, mode);
1172        if (rc == MIGRATEPAGE_SUCCESS)
1173                set_page_owner_migrate_reason(newpage, reason);
1174
1175out:
1176        if (rc != -EAGAIN) {
1177                /*
1178                 * A page that has been migrated has all references
1179                 * removed and will be freed. A page that has not been
 1180                 * migrated will have kept its references and be
1181                 * restored.
1182                 */
1183                list_del(&page->lru);
1184
1185                /*
 1186                 * Compaction can also migrate non-LRU pages, which are
 1187                 * not accounted in NR_ISOLATED_*. They can be recognized
 1188                 * by __PageMovable().
1189                 */
1190                if (likely(!__PageMovable(page)))
1191                        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1192                                        page_is_file_cache(page), -hpage_nr_pages(page));
1193        }
1194
1195        /*
 1196         * If migration is successful, release the reference grabbed during
 1197         * isolation. Otherwise, restore the page to the right list unless
 1198         * we want to retry.
1199         */
1200        if (rc == MIGRATEPAGE_SUCCESS) {
1201                put_page(page);
1202                if (reason == MR_MEMORY_FAILURE) {
1203                        /*
1204                         * Set PG_HWPoison on just freed page
1205                         * intentionally. Although it's rather weird,
1206                         * it's how HWPoison flag works at the moment.
1207                         */
1208                        if (!test_set_page_hwpoison(page))
1209                                num_poisoned_pages_inc();
1210                }
1211        } else {
1212                if (rc != -EAGAIN) {
1213                        if (likely(!__PageMovable(page))) {
1214                                putback_lru_page(page);
1215                                goto put_new;
1216                        }
1217
1218                        lock_page(page);
1219                        if (PageMovable(page))
1220                                putback_movable_page(page);
1221                        else
1222                                __ClearPageIsolated(page);
1223                        unlock_page(page);
1224                        put_page(page);
1225                }
1226put_new:
1227                if (put_new_page)
1228                        put_new_page(newpage, private);
1229                else
1230                        put_page(newpage);
1231        }
1232
1233        if (result) {
1234                if (rc)
1235                        *result = rc;
1236                else
1237                        *result = page_to_nid(newpage);
1238        }
1239        return rc;
1240}
1241
1242/*
 1243 * Counterpart of unmap_and_move() for hugepage migration.
 1244 *
 1245 * This function doesn't wait for the completion of hugepage I/O
 1246 * because there is no race between I/O and migration for hugepages.
 1247 * Note that currently hugepage I/O occurs only in direct I/O
 1248 * where no lock is held and PG_writeback is irrelevant,
 1249 * and the writeback status of all subpages is counted in the reference
 1250 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 1251 * under direct I/O, the reference count of the head page is 512 and a bit more.)
 1252 * This means that when we try to migrate a hugepage whose subpages are
 1253 * doing direct I/O, some references remain after try_to_unmap() and
 1254 * hugepage migration fails without data corruption.
 1255 *
 1256 * There is also no race when direct I/O is issued on a page under migration,
 1257 * because then the pte is replaced with a migration swap entry and the
 1258 * direct I/O code will wait in the page fault for migration to complete.
1259 */
1260static int unmap_and_move_huge_page(new_page_t get_new_page,
1261                                free_page_t put_new_page, unsigned long private,
1262                                struct page *hpage, int force,
1263                                enum migrate_mode mode, int reason)
1264{
1265        int rc = -EAGAIN;
1266        int *result = NULL;
1267        int page_was_mapped = 0;
1268        struct page *new_hpage;
1269        struct anon_vma *anon_vma = NULL;
1270
1271        /*
 1272         * Movability of hugepages depends on the architecture and hugepage size.
1273         * This check is necessary because some callers of hugepage migration
1274         * like soft offline and memory hotremove don't walk through page
1275         * tables or check whether the hugepage is pmd-based or not before
1276         * kicking migration.
1277         */
1278        if (!hugepage_migration_supported(page_hstate(hpage))) {
1279                putback_active_hugepage(hpage);
1280                return -ENOSYS;
1281        }
1282
1283        new_hpage = get_new_page(hpage, private, &result);
1284        if (!new_hpage)
1285                return -ENOMEM;
1286
1287        if (!trylock_page(hpage)) {
1288                if (!force)
1289                        goto out;
1290                switch (mode) {
1291                case MIGRATE_SYNC:
1292                case MIGRATE_SYNC_NO_COPY:
1293                        break;
1294                default:
1295                        goto out;
1296                }
1297                lock_page(hpage);
1298        }
1299
1300        if (PageAnon(hpage))
1301                anon_vma = page_get_anon_vma(hpage);
1302
1303        if (unlikely(!trylock_page(new_hpage)))
1304                goto put_anon;
1305
1306        if (page_mapped(hpage)) {
1307                try_to_unmap(hpage,
1308                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1309                page_was_mapped = 1;
1310        }
1311
1312        if (!page_mapped(hpage))
1313                rc = move_to_new_page(new_hpage, hpage, mode);
1314
1315        if (page_was_mapped)
1316                remove_migration_ptes(hpage,
1317                        rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
1318
1319        unlock_page(new_hpage);
1320
1321put_anon:
1322        if (anon_vma)
1323                put_anon_vma(anon_vma);
1324
1325        if (rc == MIGRATEPAGE_SUCCESS) {
1326                hugetlb_cgroup_migrate(hpage, new_hpage);
1327                put_new_page = NULL;
1328                set_page_owner_migrate_reason(new_hpage, reason);
1329        }
1330
1331        unlock_page(hpage);
1332out:
1333        if (rc != -EAGAIN)
1334                putback_active_hugepage(hpage);
1335        if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
1336                num_poisoned_pages_inc();
1337
1338        /*
1339         * If migration was not successful and there's a freeing callback, use
 1340         * it.  Otherwise, putback_active_hugepage() will drop the reference
 1341         * grabbed during isolation.
1342         */
1343        if (put_new_page)
1344                put_new_page(new_hpage, private);
1345        else
1346                putback_active_hugepage(new_hpage);
1347
1348        if (result) {
1349                if (rc)
1350                        *result = rc;
1351                else
1352                        *result = page_to_nid(new_hpage);
1353        }
1354        return rc;
1355}
1356
1357/*
1358 * migrate_pages - migrate the pages specified in a list, to the free pages
1359 *                 supplied as the target for the page migration
1360 *
1361 * @from:               The list of pages to be migrated.
1362 * @get_new_page:       The function used to allocate free pages to be used
1363 *                      as the target of the page migration.
1364 * @put_new_page:       The function used to free target pages if migration
1365 *                      fails, or NULL if no special handling is necessary.
1366 * @private:            Private data to be passed on to get_new_page()
1367 * @mode:               The migration mode that specifies the constraints for
1368 *                      page migration, if any.
1369 * @reason:             The reason for page migration.
1370 *
 1371 * The function returns after 10 attempts or when there are no more pages to
 1372 * migrate because the list has become empty or no retryable pages remain.
1373 * The caller should call putback_movable_pages() to return pages to the LRU
1374 * or free list only if ret != 0.
1375 *
1376 * Returns the number of pages that were not migrated, or an error code.
1377 */
1378int migrate_pages(struct list_head *from, new_page_t get_new_page,
1379                free_page_t put_new_page, unsigned long private,
1380                enum migrate_mode mode, int reason)
1381{
1382        int retry = 1;
1383        int nr_failed = 0;
1384        int nr_succeeded = 0;
1385        int pass = 0;
1386        struct page *page;
1387        struct page *page2;
1388        int swapwrite = current->flags & PF_SWAPWRITE;
1389        int rc;
1390
1391        if (!swapwrite)
1392                current->flags |= PF_SWAPWRITE;
1393
1394        for(pass = 0; pass < 10 && retry; pass++) {
1395                retry = 0;
1396
1397                list_for_each_entry_safe(page, page2, from, lru) {
1398                        cond_resched();
1399
1400                        if (PageHuge(page))
1401                                rc = unmap_and_move_huge_page(get_new_page,
1402                                                put_new_page, private, page,
1403                                                pass > 2, mode, reason);
1404                        else
1405                                rc = unmap_and_move(get_new_page, put_new_page,
1406                                                private, page, pass > 2, mode,
1407                                                reason);
1408
1409                        switch(rc) {
1410                        case -ENOMEM:
1411                                nr_failed++;
1412                                goto out;
1413                        case -EAGAIN:
1414                                retry++;
1415                                break;
1416                        case MIGRATEPAGE_SUCCESS:
1417                                nr_succeeded++;
1418                                break;
1419                        default:
1420                                /*
1421                                 * Permanent failure (-EBUSY, -ENOSYS, etc.):
 1422                                 * unlike the -EAGAIN case, the failed page is
1423                                 * removed from migration page list and not
1424                                 * retried in the next outer loop.
1425                                 */
1426                                nr_failed++;
1427                                break;
1428                        }
1429                }
1430        }
1431        nr_failed += retry;
1432        rc = nr_failed;
1433out:
1434        if (nr_succeeded)
1435                count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1436        if (nr_failed)
1437                count_vm_events(PGMIGRATE_FAIL, nr_failed);
1438        trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1439
1440        if (!swapwrite)
1441                current->flags &= ~PF_SWAPWRITE;
1442
1443        return rc;
1444}
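
/*
 * The shape of the two callbacks documented above.  Both sketches are
 * hypothetical (my_new_page/my_put_new_page are invented names);
 * new_page_node() below is a real new_page_t implementation, and passing
 * NULL for free_page_t, as most callers do, lets the unused target page be
 * dropped with a plain put_page() (see unmap_and_move() above).
 *
 *	static struct page *my_new_page(struct page *page,
 *			unsigned long private, int **result)
 *	{
 *		// 'private' is whatever was passed to migrate_pages();
 *		// here it is interpreted as a target node id.
 *		return __alloc_pages_node((int)private,
 *					  GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	static void my_put_new_page(struct page *page, unsigned long private)
 *	{
 *		put_page(page);		// target page was not used
 *	}
 */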
1445
1446#ifdef CONFIG_NUMA
1447/*
1448 * Move a list of individual pages
1449 */
1450struct page_to_node {
1451        unsigned long addr;
1452        struct page *page;
1453        int node;
1454        int status;
1455};
1456
1457static struct page *new_page_node(struct page *p, unsigned long private,
1458                int **result)
1459{
1460        struct page_to_node *pm = (struct page_to_node *)private;
1461
1462        while (pm->node != MAX_NUMNODES && pm->page != p)
1463                pm++;
1464
1465        if (pm->node == MAX_NUMNODES)
1466                return NULL;
1467
1468        *result = &pm->status;
1469
1470        if (PageHuge(p))
1471                return alloc_huge_page_node(page_hstate(compound_head(p)),
1472                                        pm->node);
1473        else if (thp_migration_supported() && PageTransHuge(p)) {
1474                struct page *thp;
1475
1476                thp = alloc_pages_node(pm->node,
1477                        (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
1478                        HPAGE_PMD_ORDER);
1479                if (!thp)
1480                        return NULL;
1481                prep_transhuge_page(thp);
1482                return thp;
1483        } else
1484                return __alloc_pages_node(pm->node,
1485                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1486}
1487
1488/*
1489 * Move a set of pages as indicated in the pm array. The addr
1490 * field must be set to the virtual address of the page to be moved
1491 * and the node number must contain a valid target node.
1492 * The pm array ends with node = MAX_NUMNODES.
1493 */
1494static int do_move_page_to_node_array(struct mm_struct *mm,
1495                                      struct page_to_node *pm,
1496                                      int migrate_all)
1497{
1498        int err;
1499        struct page_to_node *pp;
1500        LIST_HEAD(pagelist);
1501
1502        down_read(&mm->mmap_sem);
1503
1504        /*
1505         * Build a list of pages to migrate
1506         */
1507        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1508                struct vm_area_struct *vma;
1509                struct page *page;
1510                struct page *head;
1511                unsigned int follflags;
1512
1513                err = -EFAULT;
1514                vma = find_vma(mm, pp->addr);
1515                if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1516                        goto set_status;
1517
1518                /* FOLL_DUMP to ignore special (like zero) pages */
1519                follflags = FOLL_GET | FOLL_DUMP;
1520                if (!thp_migration_supported())
1521                        follflags |= FOLL_SPLIT;
1522                page = follow_page(vma, pp->addr, follflags);
1523
1524                err = PTR_ERR(page);
1525                if (IS_ERR(page))
1526                        goto set_status;
1527
1528                err = -ENOENT;
1529                if (!page)
1530                        goto set_status;
1531
1532                err = page_to_nid(page);
1533
1534                if (err == pp->node)
1535                        /*
1536                         * Node already in the right place
1537                         */
1538                        goto put_and_set;
1539
1540                err = -EACCES;
1541                if (page_mapcount(page) > 1 &&
1542                                !migrate_all)
1543                        goto put_and_set;
1544
1545                if (PageHuge(page)) {
1546                        if (PageHead(page)) {
1547                                isolate_huge_page(page, &pagelist);
1548                                err = 0;
1549                                pp->page = page;
1550                        }
1551                        goto put_and_set;
1552                }
1553
1554                pp->page = compound_head(page);
1555                head = compound_head(page);
1556                err = isolate_lru_page(head);
1557                if (!err) {
1558                        list_add_tail(&head->lru, &pagelist);
1559                        mod_node_page_state(page_pgdat(head),
1560                                NR_ISOLATED_ANON + page_is_file_cache(head),
1561                                hpage_nr_pages(head));
1562                }
1563put_and_set:
1564                /*
1565                 * Either remove the duplicate refcount from
1566                 * isolate_lru_page() or drop the page ref if it was
1567                 * not isolated.
1568                 */
1569                put_page(page);
1570set_status:
1571                pp->status = err;
1572        }
1573
1574        err = 0;
1575        if (!list_empty(&pagelist)) {
1576                err = migrate_pages(&pagelist, new_page_node, NULL,
1577                                (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1578                if (err)
1579                        putback_movable_pages(&pagelist);
1580        }
1581
1582        up_read(&mm->mmap_sem);
1583        return err;
1584}
1585
1586/*
1587 * Migrate an array of page addresses onto an array of nodes and fill
1588 * in the corresponding array of status.
1589 */
1590static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1591                         unsigned long nr_pages,
1592                         const void __user * __user *pages,
1593                         const int __user *nodes,
1594                         int __user *status, int flags)
1595{
1596        struct page_to_node *pm;
1597        unsigned long chunk_nr_pages;
1598        unsigned long chunk_start;
1599        int err;
1600
1601        err = -ENOMEM;
1602        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1603        if (!pm)
1604                goto out;
1605
1606        migrate_prep();
1607
1608        /*
1609         * Store a chunk of page_to_node array in a page,
1610         * but keep the last one as a marker
1611         */
1612        chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
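        /*
         * Added note (worked example, assuming a 64-bit build with 4KB
         * pages): sizeof(struct page_to_node) is 24 bytes (8 + 8 + 4 + 4),
         * so 4096 / 24 = 170 entries fit in the scratch page and
         * chunk_nr_pages becomes 169, with the last slot reserved for the
         * MAX_NUMNODES end marker stored below.
         */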
1613
1614        for (chunk_start = 0;
1615             chunk_start < nr_pages;
1616             chunk_start += chunk_nr_pages) {
1617                int j;
1618
1619                if (chunk_start + chunk_nr_pages > nr_pages)
1620                        chunk_nr_pages = nr_pages - chunk_start;
1621
1622                /* fill the chunk pm with addrs and nodes from user-space */
1623                for (j = 0; j < chunk_nr_pages; j++) {
1624                        const void __user *p;
1625                        int node;
1626
1627                        err = -EFAULT;
1628                        if (get_user(p, pages + j + chunk_start))
1629                                goto out_pm;
1630                        pm[j].addr = (unsigned long) p;
1631
1632                        if (get_user(node, nodes + j + chunk_start))
1633                                goto out_pm;
1634
1635                        err = -ENODEV;
1636                        if (node < 0 || node >= MAX_NUMNODES)
1637                                goto out_pm;
1638
1639                        if (!node_state(node, N_MEMORY))
1640                                goto out_pm;
1641
1642                        err = -EACCES;
1643                        if (!node_isset(node, task_nodes))
1644                                goto out_pm;
1645
1646                        pm[j].node = node;
1647                }
1648
1649                /* End marker for this chunk */
1650                pm[chunk_nr_pages].node = MAX_NUMNODES;
1651
1652                /* Migrate this chunk */
1653                err = do_move_page_to_node_array(mm, pm,
1654                                                 flags & MPOL_MF_MOVE_ALL);
1655                if (err < 0)
1656                        goto out_pm;
1657
1658                /* Return status information */
1659                for (j = 0; j < chunk_nr_pages; j++)
1660                        if (put_user(pm[j].status, status + j + chunk_start)) {
1661                                err = -EFAULT;
1662                                goto out_pm;
1663                        }
1664        }
1665        err = 0;
1666
1667out_pm:
1668        free_page((unsigned long)pm);
1669out:
1670        return err;
1671}
1672
1673/*
1674 * Determine the nodes of an array of pages and store them in an array of status.
1675 */
1676static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1677                                const void __user **pages, int *status)
1678{
1679        unsigned long i;
1680
1681        down_read(&mm->mmap_sem);
1682
1683        for (i = 0; i < nr_pages; i++) {
1684                unsigned long addr = (unsigned long)(*pages);
1685                struct vm_area_struct *vma;
1686                struct page *page;
1687                int err = -EFAULT;
1688
1689                vma = find_vma(mm, addr);
1690                if (!vma || addr < vma->vm_start)
1691                        goto set_status;
1692
1693                /* FOLL_DUMP to ignore special (like zero) pages */
1694                page = follow_page(vma, addr, FOLL_DUMP);
1695
1696                err = PTR_ERR(page);
1697                if (IS_ERR(page))
1698                        goto set_status;
1699
1700                err = page ? page_to_nid(page) : -ENOENT;
1701set_status:
1702                *status = err;
1703
1704                pages++;
1705                status++;
1706        }
1707
1708        up_read(&mm->mmap_sem);
1709}
1710
1711/*
1712 * Determine the nodes of a user array of pages and store them in
1713 * a user array of status.
1714 */
1715static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1716                         const void __user * __user *pages,
1717                         int __user *status)
1718{
1719#define DO_PAGES_STAT_CHUNK_NR 16
1720        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1721        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1722
1723        while (nr_pages) {
1724                unsigned long chunk_nr;
1725
1726                chunk_nr = nr_pages;
1727                if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1728                        chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1729
1730                if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1731                        break;
1732
1733                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1734
1735                if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1736                        break;
1737
1738                pages += chunk_nr;
1739                status += chunk_nr;
1740                nr_pages -= chunk_nr;
1741        }
1742        return nr_pages ? -EFAULT : 0;
1743}
1744
1745/*
1746 * Move a list of pages in the address space of the currently executing
1747 * process.
1748 */
1749SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1750                const void __user * __user *, pages,
1751                const int __user *, nodes,
1752                int __user *, status, int, flags)
1753{
1754        struct task_struct *task;
1755        struct mm_struct *mm;
1756        int err;
1757        nodemask_t task_nodes;
1758
1759        /* Check flags */
1760        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1761                return -EINVAL;
1762
1763        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1764                return -EPERM;
1765
1766        /* Find the mm_struct */
1767        rcu_read_lock();
1768        task = pid ? find_task_by_vpid(pid) : current;
1769        if (!task) {
1770                rcu_read_unlock();
1771                return -ESRCH;
1772        }
1773        get_task_struct(task);
1774
1775        /*
1776         * Check if this process has the right to modify the specified
1777         * process. Use the regular "ptrace_may_access()" checks.
1778         */
1779        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1780                rcu_read_unlock();
1781                err = -EPERM;
1782                goto out;
1783        }
1784        rcu_read_unlock();
1785
1786        err = security_task_movememory(task);
1787        if (err)
1788                goto out;
1789
1790        task_nodes = cpuset_mems_allowed(task);
1791        mm = get_task_mm(task);
1792        put_task_struct(task);
1793
1794        if (!mm)
1795                return -EINVAL;
1796
1797        if (nodes)
1798                err = do_pages_move(mm, task_nodes, nr_pages, pages,
1799                                    nodes, status, flags);
1800        else
1801                err = do_pages_stat(mm, nr_pages, pages, status);
1802
1803        mmput(mm);
1804        return err;
1805
1806out:
1807        put_task_struct(task);
1808        return err;
1809}
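/*
 * Illustrative userspace sketch (not part of this file): calling the
 * move_pages() syscall above through the libnuma wrapper declared in
 * <numaif.h> (build with -lnuma). example_move_one() and the chosen target
 * node are made up for the example.
 */
#if 0	/* userspace example, never compiled into the kernel */
#include <numaif.h>
#include <stdio.h>

static int example_move_one(void *page_addr, int target_node)
{
	void *pages[1]  = { page_addr };
	int nodes[1]    = { target_node };
	int status[1]   = { -1 };

	/* pid 0 means the calling process; MPOL_MF_MOVE only moves pages
	 * mapped by this process alone (see the mapcount check above). */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == -1) {
		perror("move_pages");
		return -1;
	}

	/* status[0] now holds the node the page resides on, or a negative
	 * errno such as -EACCES or -ENOENT filled in by do_pages_move()
	 * above. Passing NULL for the nodes array only queries placement,
	 * taking the do_pages_stat() path instead. */
	printf("page now on node %d\n", status[0]);
	return 0;
}
#endif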
1810
1811#ifdef CONFIG_NUMA_BALANCING
1812/*
1813 * Returns true if this is a safe migration target node for misplaced NUMA
1814 * pages. Currently it only checks the watermarks, which is crude.
1815 */
1816static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1817                                   unsigned long nr_migrate_pages)
1818{
1819        int z;
1820
1821        for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1822                struct zone *zone = pgdat->node_zones + z;
1823
1824                if (!populated_zone(zone))
1825                        continue;
1826
1827                /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
1828                if (!zone_watermark_ok(zone, 0,
1829                                       high_wmark_pages(zone) +
1830                                       nr_migrate_pages,
1831                                       0, 0))
1832                        continue;
1833                return true;
1834        }
1835        return false;
1836}
1837
1838static struct page *alloc_misplaced_dst_page(struct page *page,
1839                                           unsigned long data,
1840                                           int **result)
1841{
1842        int nid = (int) data;
1843        struct page *newpage;
1844
1845        newpage = __alloc_pages_node(nid,
1846                                         (GFP_HIGHUSER_MOVABLE |
1847                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
1848                                          __GFP_NORETRY | __GFP_NOWARN) &
1849                                         ~__GFP_RECLAIM, 0);
1850
1851        return newpage;
1852}
1853
1854/*
1855 * page migration rate limiting control.
1856 * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
1857 * window of time. Default here says do not migrate more than 1280M per second.
1858 */
1859static unsigned int migrate_interval_millisecs __read_mostly = 100;
1860static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
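/*
 * Added note (worked example, assuming PAGE_SHIFT == 12, i.e. 4KB pages):
 * 128 << (20 - 12) = 32768 pages = 128MB allowed per 100ms window, which
 * is the 1280M per second default quoted above.
 */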
1861
1862/* Returns true if the node is migrate rate-limited after the update */
1863static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1864                                        unsigned long nr_pages)
1865{
1866        /*
1867         * Rate-limit the amount of data that is being migrated to a node.
1868         * Optimal placement is no good if the memory bus is saturated and
1869         * all the time is being spent migrating!
1870         */
1871        if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1872                spin_lock(&pgdat->numabalancing_migrate_lock);
1873                pgdat->numabalancing_migrate_nr_pages = 0;
1874                pgdat->numabalancing_migrate_next_window = jiffies +
1875                        msecs_to_jiffies(migrate_interval_millisecs);
1876                spin_unlock(&pgdat->numabalancing_migrate_lock);
1877        }
1878        if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1879                trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1880                                                                nr_pages);
1881                return true;
1882        }
1883
1884        /*
1885         * This is an unlocked non-atomic update so errors are possible.
1886         * The consequences are failing to migrate when we potentially should
1887         * have, which is not severe enough to warrant locking. If it is ever
1888         * a problem, it can be converted to a per-cpu counter.
1889         */
1890        pgdat->numabalancing_migrate_nr_pages += nr_pages;
1891        return false;
1892}
1893
1894static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1895{
1896        int page_lru;
1897
1898        VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1899
1900        /* Avoid migrating to a node that is nearly full */
1901        if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1902                return 0;
1903
1904        if (isolate_lru_page(page))
1905                return 0;
1906
1907        /*
1908         * migrate_misplaced_transhuge_page() skips page migration's usual
1909         * check on page_count(), so we must do it here, now that the page
1910         * has been isolated: a GUP pin, or any other pin, prevents migration.
1911         * The expected page count is 3: 1 for page's mapcount and 1 for the
1912         * caller's pin and 1 for the reference taken by isolate_lru_page().
1913         */
1914        if (PageTransHuge(page) && page_count(page) != 3) {
1915                putback_lru_page(page);
1916                return 0;
1917        }
1918
1919        page_lru = page_is_file_cache(page);
1920        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1921                                hpage_nr_pages(page));
1922
1923        /*
1924         * Isolating the page has taken another reference, so the
1925         * caller's reference can be safely dropped without the page
1926         * disappearing underneath us during migration.
1927         */
1928        put_page(page);
1929        return 1;
1930}
1931
1932bool pmd_trans_migrating(pmd_t pmd)
1933{
1934        struct page *page = pmd_page(pmd);
1935        return PageLocked(page);
1936}
1937
1938/*
1939 * Attempt to migrate a misplaced page to the specified destination
1940 * node. Caller is expected to have an elevated reference count on
1941 * the page that will be dropped by this function before returning.
1942 */
1943int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1944                           int node)
1945{
1946        pg_data_t *pgdat = NODE_DATA(node);
1947        int isolated;
1948        int nr_remaining;
1949        LIST_HEAD(migratepages);
1950
1951        /*
1952         * Don't migrate file pages that are mapped in multiple processes
1953         * with execute permissions as they are probably shared libraries.
1954         */
1955        if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1956            (vma->vm_flags & VM_EXEC))
1957                goto out;
1958
1959        /*
1960         * Rate-limit the amount of data that is being migrated to a node.
1961         * Optimal placement is no good if the memory bus is saturated and
1962         * all the time is being spent migrating!
1963         */
1964        if (numamigrate_update_ratelimit(pgdat, 1))
1965                goto out;
1966
1967        isolated = numamigrate_isolate_page(pgdat, page);
1968        if (!isolated)
1969                goto out;
1970
1971        list_add(&page->lru, &migratepages);
1972        nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1973                                     NULL, node, MIGRATE_ASYNC,
1974                                     MR_NUMA_MISPLACED);
1975        if (nr_remaining) {
1976                if (!list_empty(&migratepages)) {
1977                        list_del(&page->lru);
1978                        dec_node_page_state(page, NR_ISOLATED_ANON +
1979                                        page_is_file_cache(page));
1980                        putback_lru_page(page);
1981                }
1982                isolated = 0;
1983        } else
1984                count_vm_numa_event(NUMA_PAGE_MIGRATE);
1985        BUG_ON(!list_empty(&migratepages));
1986        return isolated;
1987
1988out:
1989        put_page(page);
1990        return 0;
1991}
1992#endif /* CONFIG_NUMA_BALANCING */
1993
1994#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1995/*
1996 * Migrates a THP to a given target node. page must be locked and is unlocked
1997 * before returning.
1998 */
1999int migrate_misplaced_transhuge_page(struct mm_struct *mm,
2000                                struct vm_area_struct *vma,
2001                                pmd_t *pmd, pmd_t entry,
2002                                unsigned long address,
2003                                struct page *page, int node)
2004{
2005        spinlock_t *ptl;
2006        pg_data_t *pgdat = NODE_DATA(node);
2007        int isolated = 0;
2008        struct page *new_page = NULL;
2009        int page_lru = page_is_file_cache(page);
2010        unsigned long mmun_start = address & HPAGE_PMD_MASK;
2011        unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2012
2013        /*
2014         * Rate-limit the amount of data that is being migrated to a node.
2015         * Optimal placement is no good if the memory bus is saturated and
2016         * all the time is being spent migrating!
2017         */
2018        if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
2019                goto out_dropref;
2020
2021        new_page = alloc_pages_node(node,
2022                (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
2023                HPAGE_PMD_ORDER);
2024        if (!new_page)
2025                goto out_fail;
2026        prep_transhuge_page(new_page);
2027
2028        isolated = numamigrate_isolate_page(pgdat, page);
2029        if (!isolated) {
2030                put_page(new_page);
2031                goto out_fail;
2032        }
2033
2034        /* Prepare a page as a migration target */
2035        __SetPageLocked(new_page);
2036        if (PageSwapBacked(page))
2037                __SetPageSwapBacked(new_page);
2038
2039        /* anon mapping, we can simply copy page->mapping to the new page: */
2040        new_page->mapping = page->mapping;
2041        new_page->index = page->index;
2042        migrate_page_copy(new_page, page);
2043        WARN_ON(PageLRU(new_page));
2044
2045        /* Recheck the target PMD */
2046        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2047        ptl = pmd_lock(mm, pmd);
2048        if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
2049                spin_unlock(ptl);
2050                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2051
2052                /* Reverse changes made by migrate_page_copy() */
2053                if (TestClearPageActive(new_page))
2054                        SetPageActive(page);
2055                if (TestClearPageUnevictable(new_page))
2056                        SetPageUnevictable(page);
2057
2058                unlock_page(new_page);
2059                put_page(new_page);             /* Free it */
2060
2061                /* Retake the callers reference and putback on LRU */
2062                get_page(page);
2063                putback_lru_page(page);
2064                mod_node_page_state(page_pgdat(page),
2065                         NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
2066
2067                goto out_unlock;
2068        }
2069
2070        entry = mk_huge_pmd(new_page, vma->vm_page_prot);
2071        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
2072
2073        /*
2074         * Clear the old entry under pagetable lock and establish the new PTE.
2075         * Any parallel GUP will either observe the old page blocking on the
2076         * page lock, block on the page table lock or observe the new page.
2077         * The SetPageUptodate on the new page and page_add_new_anon_rmap
2078         * guarantee the copy is visible before the pagetable update.
2079         */
2080        flush_cache_range(vma, mmun_start, mmun_end);
2081        page_add_anon_rmap(new_page, vma, mmun_start, true);
2082        pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
2083        set_pmd_at(mm, mmun_start, pmd, entry);
2084        update_mmu_cache_pmd(vma, address, &entry);
2085
2086        page_ref_unfreeze(page, 2);
2087        mlock_migrate_page(new_page, page);
2088        page_remove_rmap(page, true);
2089        set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
2090
2091        spin_unlock(ptl);
2092        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2093
2094        /* Take an "isolate" reference and put new page on the LRU. */
2095        get_page(new_page);
2096        putback_lru_page(new_page);
2097
2098        unlock_page(new_page);
2099        unlock_page(page);
2100        put_page(page);                 /* Drop the rmap reference */
2101        put_page(page);                 /* Drop the LRU isolation reference */
2102
2103        count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
2104        count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
2105
2106        mod_node_page_state(page_pgdat(page),
2107                        NR_ISOLATED_ANON + page_lru,
2108                        -HPAGE_PMD_NR);
2109        return isolated;
2110
2111out_fail:
2112        count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
2113out_dropref:
2114        ptl = pmd_lock(mm, pmd);
2115        if (pmd_same(*pmd, entry)) {
2116                entry = pmd_modify(entry, vma->vm_page_prot);
2117                set_pmd_at(mm, mmun_start, pmd, entry);
2118                update_mmu_cache_pmd(vma, address, &entry);
2119        }
2120        spin_unlock(ptl);
2121
2122out_unlock:
2123        unlock_page(page);
2124        put_page(page);
2125        return 0;
2126}
2127#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE */
2128
2129#endif /* CONFIG_NUMA */
2130
2131#if defined(CONFIG_MIGRATE_VMA_HELPER)
2132struct migrate_vma {
2133        struct vm_area_struct   *vma;
2134        unsigned long           *dst;
2135        unsigned long           *src;
2136        unsigned long           cpages;
2137        unsigned long           npages;
2138        unsigned long           start;
2139        unsigned long           end;
2140};
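/*
 * Added note: each src[]/dst[] entry above is an encoded "migrate pfn",
 * not a raw pfn. See migrate_pfn(), migrate_pfn_to_page() and the
 * MIGRATE_PFN_* flags (VALID, MIGRATE, LOCKED, WRITE, DEVICE, ...) in
 * include/linux/migrate.h: the pfn is shifted up and the low bits carry
 * the per-page state used by the collect/prepare/unmap steps below.
 */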
2141
2142static int migrate_vma_collect_hole(unsigned long start,
2143                                    unsigned long end,
2144                                    struct mm_walk *walk)
2145{
2146        struct migrate_vma *migrate = walk->private;
2147        unsigned long addr;
2148
2149        for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2150                migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
2151                migrate->dst[migrate->npages] = 0;
2152                migrate->npages++;
2153                migrate->cpages++;
2154        }
2155
2156        return 0;
2157}
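/*
 * Added note: holes (no pmd or an empty pte) are still reported with
 * MIGRATE_PFN_MIGRATE but without a valid pfn, so the driver may supply a
 * destination page for them; migrate_vma_pages() then fills the hole via
 * migrate_vma_insert_page() instead of migrating an existing page.
 */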
2158
2159static int migrate_vma_collect_skip(unsigned long start,
2160                                    unsigned long end,
2161                                    struct mm_walk *walk)
2162{
2163        struct migrate_vma *migrate = walk->private;
2164        unsigned long addr;
2165
2166        for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2167                migrate->dst[migrate->npages] = 0;
2168                migrate->src[migrate->npages++] = 0;
2169        }
2170
2171        return 0;
2172}
2173
2174static int migrate_vma_collect_pmd(pmd_t *pmdp,
2175                                   unsigned long start,
2176                                   unsigned long end,
2177                                   struct mm_walk *walk)
2178{
2179        struct migrate_vma *migrate = walk->private;
2180        struct vm_area_struct *vma = walk->vma;
2181        struct mm_struct *mm = vma->vm_mm;
2182        unsigned long addr = start, unmapped = 0;
2183        spinlock_t *ptl;
2184        pte_t *ptep;
2185
2186again:
2187        if (pmd_none(*pmdp))
2188                return migrate_vma_collect_hole(start, end, walk);
2189
2190        if (pmd_trans_huge(*pmdp)) {
2191                struct page *page;
2192
2193                ptl = pmd_lock(mm, pmdp);
2194                if (unlikely(!pmd_trans_huge(*pmdp))) {
2195                        spin_unlock(ptl);
2196                        goto again;
2197                }
2198
2199                page = pmd_page(*pmdp);
2200                if (is_huge_zero_page(page)) {
2201                        spin_unlock(ptl);
2202                        split_huge_pmd(vma, pmdp, addr);
2203                        if (pmd_trans_unstable(pmdp))
2204                                return migrate_vma_collect_skip(start, end,
2205                                                                walk);
2206                } else {
2207                        int ret;
2208
2209                        get_page(page);
2210                        spin_unlock(ptl);
2211                        if (unlikely(!trylock_page(page)))
2212                                return migrate_vma_collect_skip(start, end,
2213                                                                walk);
2214                        ret = split_huge_page(page);
2215                        unlock_page(page);
2216                        put_page(page);
2217                        if (ret)
2218                                return migrate_vma_collect_skip(start, end,
2219                                                                walk);
2220                        if (pmd_none(*pmdp))
2221                                return migrate_vma_collect_hole(start, end,
2222                                                                walk);
2223                }
2224        }
2225
2226        if (unlikely(pmd_bad(*pmdp)))
2227                return migrate_vma_collect_skip(start, end, walk);
2228
2229        ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2230        arch_enter_lazy_mmu_mode();
2231
2232        for (; addr < end; addr += PAGE_SIZE, ptep++) {
2233                unsigned long mpfn, pfn;
2234                struct page *page;
2235                swp_entry_t entry;
2236                pte_t pte;
2237
2238                pte = *ptep;
2239                pfn = pte_pfn(pte);
2240
2241                if (pte_none(pte)) {
2242                        mpfn = MIGRATE_PFN_MIGRATE;
2243                        migrate->cpages++;
2244                        pfn = 0;
2245                        goto next;
2246                }
2247
2248                if (!pte_present(pte)) {
2249                        mpfn = pfn = 0;
2250
2251                        /*
2252                         * Only care about the special page table entries of
2253                         * unaddressable device pages. Other special swap entries
2254                         * are not migratable, and we ignore regular swapped pages.
2255                         */
2256                        entry = pte_to_swp_entry(pte);
2257                        if (!is_device_private_entry(entry))
2258                                goto next;
2259
2260                        page = device_private_entry_to_page(entry);
2261                        mpfn = migrate_pfn(page_to_pfn(page))|
2262                                MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2263                        if (is_write_device_private_entry(entry))
2264                                mpfn |= MIGRATE_PFN_WRITE;
2265                } else {
2266                        if (is_zero_pfn(pfn)) {
2267                                mpfn = MIGRATE_PFN_MIGRATE;
2268                                migrate->cpages++;
2269                                pfn = 0;
2270                                goto next;
2271                        }
2272                        page = _vm_normal_page(migrate->vma, addr, pte, true);
2273                        mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2274                        mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2275                }
2276
2277                /* FIXME support THP */
2278                if (!page || !page->mapping || PageTransCompound(page)) {
2279                        mpfn = pfn = 0;
2280                        goto next;
2281                }
2282                pfn = page_to_pfn(page);
2283
2284                /*
2285                 * By getting a reference on the page we pin it and that blocks
2286                 * any kind of migration. Side effect is that it "freezes" the
2287                 * pte.
2288                 *
2289                 * We drop this reference after isolating the page from the lru
2290                 * for non-device pages (device pages are not on the lru and thus
2291                 * cannot be isolated from it, so their reference is kept).
2292                 */
2293                get_page(page);
2294                migrate->cpages++;
2295
2296                /*
2297                 * Optimize for the common case where page is only mapped once
2298                 * in one process. If we can lock the page, then we can safely
2299                 * set up a special migration page table entry now.
2300                 */
2301                if (trylock_page(page)) {
2302                        pte_t swp_pte;
2303
2304                        mpfn |= MIGRATE_PFN_LOCKED;
2305                        ptep_get_and_clear(mm, addr, ptep);
2306
2307                        /* Setup special migration page table entry */
2308                        entry = make_migration_entry(page, pte_write(pte));
2309                        swp_pte = swp_entry_to_pte(entry);
2310                        if (pte_soft_dirty(pte))
2311                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
2312                        set_pte_at(mm, addr, ptep, swp_pte);
2313
2314                        /*
2315                         * This is like regular unmap: we remove the rmap and
2316                         * drop page refcount. Page won't be freed, as we took
2317                         * a reference just above.
2318                         */
2319                        page_remove_rmap(page, false);
2320                        put_page(page);
2321
2322                        if (pte_present(pte))
2323                                unmapped++;
2324                }
2325
2326next:
2327                migrate->dst[migrate->npages] = 0;
2328                migrate->src[migrate->npages++] = mpfn;
2329        }
2330        arch_leave_lazy_mmu_mode();
2331        pte_unmap_unlock(ptep - 1, ptl);
2332
2333        /* Only flush the TLB if we actually modified any entries */
2334        if (unmapped)
2335                flush_tlb_range(walk->vma, start, end);
2336
2337        return 0;
2338}
2339
2340/*
2341 * migrate_vma_collect() - collect pages over a range of virtual addresses
2342 * @migrate: migrate struct containing all migration information
2343 *
2344 * This will walk the CPU page table. For each virtual address backed by a
2345 * valid page, it updates the src array and takes a reference on the page, in
2346 * order to pin the page until we lock it and unmap it.
2347 */
2348static void migrate_vma_collect(struct migrate_vma *migrate)
2349{
2350        struct mm_walk mm_walk;
2351
2352        mm_walk.pmd_entry = migrate_vma_collect_pmd;
2353        mm_walk.pte_entry = NULL;
2354        mm_walk.pte_hole = migrate_vma_collect_hole;
2355        mm_walk.hugetlb_entry = NULL;
2356        mm_walk.test_walk = NULL;
2357        mm_walk.vma = migrate->vma;
2358        mm_walk.mm = migrate->vma->vm_mm;
2359        mm_walk.private = migrate;
2360
2361        mmu_notifier_invalidate_range_start(mm_walk.mm,
2362                                            migrate->start,
2363                                            migrate->end);
2364        walk_page_range(migrate->start, migrate->end, &mm_walk);
2365        mmu_notifier_invalidate_range_end(mm_walk.mm,
2366                                          migrate->start,
2367                                          migrate->end);
2368
2369        migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2370}
2371
2372/*
2373 * migrate_vma_check_page() - check if page is pinned or not
2374 * @page: struct page to check
2375 *
2376 * Pinned pages cannot be migrated. This is the same test as in
2377 * migrate_page_move_mapping(), except that here we allow migration of a
2378 * ZONE_DEVICE page.
2379 */
2380static bool migrate_vma_check_page(struct page *page)
2381{
2382        /*
2383         * One extra ref because caller holds an extra reference, either from
2384         * isolate_lru_page() for a regular page, or migrate_vma_collect() for
2385         * a device page.
2386         */
2387        int extra = 1;
2388
2389        /*
2390         * FIXME support THP (transparent huge page); it is a bit more complex to
2391         * check them than regular pages, because they can be mapped with a pmd
2392         * or with a pte (split pte mapping).
2393         */
2394        if (PageCompound(page))
2395                return false;
2396
2397        /* Pages from ZONE_DEVICE have one extra reference */
2398        if (is_zone_device_page(page)) {
2399                /*
2400                 * Private pages can never be pinned as they have no valid pte and
2401                 * GUP will fail for those. Yet if there is a pending migration, a
2402                 * thread might try to wait on the pte migration entry and will bump
2403                 * the page reference count. Sadly there is no way to differentiate
2404                 * a regular pin from a migration wait. Hence, to avoid two racing
2405                 * threads that are trying to migrate back to the CPU entering an
2406                 * infinite loop (one stopping migration because the other is waiting
2407                 * on the pte migration entry), we always return true here.
2408                 *
2409                 * FIXME proper solution is to rework migration_entry_wait() so
2410                 * it does not need to take a reference on page.
2411                 */
2412                if (is_device_private_page(page))
2413                        return true;
2414
2415                /*
2416                 * Only allow device public pages to be migrated and account for
2417                 * the extra reference count implied by ZONE_DEVICE pages.
2418                 */
2419                if (!is_device_public_page(page))
2420                        return false;
2421                extra++;
2422        }
2423
2424        /* For file backed pages */
2425        if (page_mapping(page))
2426                extra += 1 + page_has_private(page);
2427
2428        if ((page_count(page) - extra) > page_mapcount(page))
2429                return false;
2430
2431        return true;
2432}
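/*
 * Added note (worked example): for a private anonymous page mapped in a
 * single process, page_mapcount() is 1 and page_count() is 2 (the mapping
 * plus the reference taken during collect/isolate), so with extra == 1 the
 * check above passes. Any additional pin, e.g. from GUP, raises
 * page_count() further and the page is reported as pinned.
 */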
2433
2434/*
2435 * migrate_vma_prepare() - lock pages and isolate them from the lru
2436 * @migrate: migrate struct containing all migration information
2437 *
2438 * This locks pages that have been collected by migrate_vma_collect(). Once each
2439 * page is locked it is isolated from the lru (for non-device pages). Finally,
2440 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
2441 * migrated by concurrent kernel threads.
2442 */
2443static void migrate_vma_prepare(struct migrate_vma *migrate)
2444{
2445        const unsigned long npages = migrate->npages;
2446        const unsigned long start = migrate->start;
2447        unsigned long addr, i, restore = 0;
2448        bool allow_drain = true;
2449
2450        lru_add_drain();
2451
2452        for (i = 0; (i < npages) && migrate->cpages; i++) {
2453                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2454                bool remap = true;
2455
2456                if (!page)
2457                        continue;
2458
2459                if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
2460                        /*
2461                         * Because we are migrating several pages there can be
2462                         * a deadlock between two concurrent migrations where
2463                         * each is waiting on the other's page lock.
2464                         *
2465                         * Make migrate_vma() a best effort thing and back off
2466                         * for any page we cannot lock right away.
2467                         */
2468                        if (!trylock_page(page)) {
2469                                migrate->src[i] = 0;
2470                                migrate->cpages--;
2471                                put_page(page);
2472                                continue;
2473                        }
2474                        remap = false;
2475                        migrate->src[i] |= MIGRATE_PFN_LOCKED;
2476                }
2477
2478                /* ZONE_DEVICE pages are not on LRU */
2479                if (!is_zone_device_page(page)) {
2480                        if (!PageLRU(page) && allow_drain) {
2481                                /* Drain CPU's pagevec */
2482                                lru_add_drain_all();
2483                                allow_drain = false;
2484                        }
2485
2486                        if (isolate_lru_page(page)) {
2487                                if (remap) {
2488                                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2489                                        migrate->cpages--;
2490                                        restore++;
2491                                } else {
2492                                        migrate->src[i] = 0;
2493                                        unlock_page(page);
2494                                        migrate->cpages--;
2495                                        put_page(page);
2496                                }
2497                                continue;
2498                        }
2499
2500                        /* Drop the reference we took in collect */
2501                        put_page(page);
2502                }
2503
2504                if (!migrate_vma_check_page(page)) {
2505                        if (remap) {
2506                                migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2507                                migrate->cpages--;
2508                                restore++;
2509
2510                                if (!is_zone_device_page(page)) {
2511                                        get_page(page);
2512                                        putback_lru_page(page);
2513                                }
2514                        } else {
2515                                migrate->src[i] = 0;
2516                                unlock_page(page);
2517                                migrate->cpages--;
2518
2519                                if (!is_zone_device_page(page))
2520                                        putback_lru_page(page);
2521                                else
2522                                        put_page(page);
2523                        }
2524                }
2525        }
2526
2527        for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
2528                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2529
2530                if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2531                        continue;
2532
2533                remove_migration_pte(page, migrate->vma, addr, page);
2534
2535                migrate->src[i] = 0;
2536                unlock_page(page);
2537                put_page(page);
2538                restore--;
2539        }
2540}
2541
2542/*
2543 * migrate_vma_unmap() - replace page mapping with special migration pte entry
2544 * @migrate: migrate struct containing all migration information
2545 *
2546 * Replace page mapping (CPU page table pte) with a special migration pte entry
2547 * and check again if it has been pinned. Pinned pages are restored because we
2548 * cannot migrate them.
2549 *
2550 * This is the last step before we call the device driver callback to allocate
2551 * destination memory and copy contents of original page over to new page.
2552 */
2553static void migrate_vma_unmap(struct migrate_vma *migrate)
2554{
2555        int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2556        const unsigned long npages = migrate->npages;
2557        const unsigned long start = migrate->start;
2558        unsigned long addr, i, restore = 0;
2559
2560        for (i = 0; i < npages; i++) {
2561                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2562
2563                if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2564                        continue;
2565
2566                if (page_mapped(page)) {
2567                        try_to_unmap(page, flags);
2568                        if (page_mapped(page))
2569                                goto restore;
2570                }
2571
2572                if (migrate_vma_check_page(page))
2573                        continue;
2574
2575restore:
2576                migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2577                migrate->cpages--;
2578                restore++;
2579        }
2580
2581        for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2582                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2583
2584                if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2585                        continue;
2586
2587                remove_migration_ptes(page, page, false);
2588
2589                migrate->src[i] = 0;
2590                unlock_page(page);
2591                restore--;
2592
2593                if (is_zone_device_page(page))
2594                        put_page(page);
2595                else
2596                        putback_lru_page(page);
2597        }
2598}
2599
2600static void migrate_vma_insert_page(struct migrate_vma *migrate,
2601                                    unsigned long addr,
2602                                    struct page *page,
2603                                    unsigned long *src,
2604                                    unsigned long *dst)
2605{
2606        struct vm_area_struct *vma = migrate->vma;
2607        struct mm_struct *mm = vma->vm_mm;
2608        struct mem_cgroup *memcg;
2609        bool flush = false;
2610        spinlock_t *ptl;
2611        pte_t entry;
2612        pgd_t *pgdp;
2613        p4d_t *p4dp;
2614        pud_t *pudp;
2615        pmd_t *pmdp;
2616        pte_t *ptep;
2617
2618        /* Only allow populating anonymous memory */
2619        if (!vma_is_anonymous(vma))
2620                goto abort;
2621
2622        pgdp = pgd_offset(mm, addr);
2623        p4dp = p4d_alloc(mm, pgdp, addr);
2624        if (!p4dp)
2625                goto abort;
2626        pudp = pud_alloc(mm, p4dp, addr);
2627        if (!pudp)
2628                goto abort;
2629        pmdp = pmd_alloc(mm, pudp, addr);
2630        if (!pmdp)
2631                goto abort;
2632
2633        if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
2634                goto abort;
2635
2636        /*
2637         * Use pte_alloc() instead of pte_alloc_map().  We can't run
2638         * pte_offset_map() on pmds where a huge pmd might be created
2639         * from a different thread.
2640         *
2641         * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2642         * parallel threads are excluded by other means.
2643         *
2644         * Here we only have down_read(mmap_sem).
2645         */
2646        if (pte_alloc(mm, pmdp, addr))
2647                goto abort;
2648
2649        /* See the comment in pte_alloc_one_map() */
2650        if (unlikely(pmd_trans_unstable(pmdp)))
2651                goto abort;
2652
2653        if (unlikely(anon_vma_prepare(vma)))
2654                goto abort;
2655        if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2656                goto abort;
2657
2658        /*
2659         * The memory barrier inside __SetPageUptodate makes sure that
2660         * preceding stores to the page contents become visible before
2661         * the set_pte_at() write.
2662         */
2663        __SetPageUptodate(page);
2664
2665        if (is_zone_device_page(page)) {
2666                if (is_device_private_page(page)) {
2667                        swp_entry_t swp_entry;
2668
2669                        swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
2670                        entry = swp_entry_to_pte(swp_entry);
2671                } else if (is_device_public_page(page)) {
2672                        entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
2673                        if (vma->vm_flags & VM_WRITE)
2674                                entry = pte_mkwrite(pte_mkdirty(entry));
2675                        entry = pte_mkdevmap(entry);
2676                }
2677        } else {
2678                entry = mk_pte(page, vma->vm_page_prot);
2679                if (vma->vm_flags & VM_WRITE)
2680                        entry = pte_mkwrite(pte_mkdirty(entry));
2681        }
2682
2683        ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2684
2685        if (pte_present(*ptep)) {
2686                unsigned long pfn = pte_pfn(*ptep);
2687
2688                if (!is_zero_pfn(pfn)) {
2689                        pte_unmap_unlock(ptep, ptl);
2690                        mem_cgroup_cancel_charge(page, memcg, false);
2691                        goto abort;
2692                }
2693                flush = true;
2694        } else if (!pte_none(*ptep)) {
2695                pte_unmap_unlock(ptep, ptl);
2696                mem_cgroup_cancel_charge(page, memcg, false);
2697                goto abort;
2698        }
2699
2700        /*
2701         * Check for userfaultfd but do not deliver the fault. Instead,
2702         * just back off.
2703         */
2704        if (userfaultfd_missing(vma)) {
2705                pte_unmap_unlock(ptep, ptl);
2706                mem_cgroup_cancel_charge(page, memcg, false);
2707                goto abort;
2708        }
2709
2710        inc_mm_counter(mm, MM_ANONPAGES);
2711        page_add_new_anon_rmap(page, vma, addr, false);
2712        mem_cgroup_commit_charge(page, memcg, false, false);
2713        if (!is_zone_device_page(page))
2714                lru_cache_add_active_or_unevictable(page, vma);
2715        get_page(page);
2716
2717        if (flush) {
2718                flush_cache_page(vma, addr, pte_pfn(*ptep));
2719                ptep_clear_flush_notify(vma, addr, ptep);
2720                set_pte_at_notify(mm, addr, ptep, entry);
2721                update_mmu_cache(vma, addr, ptep);
2722        } else {
2723                /* No need to invalidate - it was non-present before */
2724                set_pte_at(mm, addr, ptep, entry);
2725                update_mmu_cache(vma, addr, ptep);
2726        }
2727
2728        pte_unmap_unlock(ptep, ptl);
2729        *src = MIGRATE_PFN_MIGRATE;
2730        return;
2731
2732abort:
2733        *src &= ~MIGRATE_PFN_MIGRATE;
2734}
2735
2736/*
2737 * migrate_vma_pages() - migrate meta-data from src page to dst page
2738 * @migrate: migrate struct containing all migration information
2739 *
2740 * This migrates struct page meta-data from source struct page to destination
2741 * struct page. This effectively finishes the migration from source page to the
2742 * destination page.
2743 */
2744static void migrate_vma_pages(struct migrate_vma *migrate)
2745{
2746        const unsigned long npages = migrate->npages;
2747        const unsigned long start = migrate->start;
2748        struct vm_area_struct *vma = migrate->vma;
2749        struct mm_struct *mm = vma->vm_mm;
2750        unsigned long addr, i, mmu_start;
2751        bool notified = false;
2752
2753        for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2754                struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2755                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2756                struct address_space *mapping;
2757                int r;
2758
2759                if (!newpage) {
2760                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2761                        continue;
2762                }
2763
2764                if (!page) {
2765                        if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2766                                continue;
2767                        }
2768                        if (!notified) {
2769                                mmu_start = addr;
2770                                notified = true;
2771                                mmu_notifier_invalidate_range_start(mm,
2772                                                                mmu_start,
2773                                                                migrate->end);
2774                        }
2775                        migrate_vma_insert_page(migrate, addr, newpage,
2776                                                &migrate->src[i],
2777                                                &migrate->dst[i]);
2778                        continue;
2779                }
2780
2781                mapping = page_mapping(page);
2782
2783                if (is_zone_device_page(newpage)) {
2784                        if (is_device_private_page(newpage)) {
2785                                /*
2786                                 * For now only support private anonymous when
2787                                 * migrating to un-addressable device memory.
2788                                 */
2789                                if (mapping) {
2790                                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2791                                        continue;
2792                                }
2793                        } else if (!is_device_public_page(newpage)) {
2794                                /*
2795                                 * Other types of ZONE_DEVICE page are not
2796                                 * supported.
2797                                 */
2798                                migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2799                                continue;
2800                        }
2801                }
2802
2803                r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
2804                if (r != MIGRATEPAGE_SUCCESS)
2805                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2806        }
2807
2808        if (notified)
2809                mmu_notifier_invalidate_range_end(mm, mmu_start,
2810                                                  migrate->end);
2811}
2812
2813/*
2814 * migrate_vma_finalize() - restore CPU page table entry
2815 * @migrate: migrate struct containing all migration information
2816 *
2817 * This replaces the special migration pte entry with either a mapping to the
2818 * new page if migration was successful for that page, or to the original page
2819 * otherwise.
2820 *
2821 * This also unlocks the pages and puts them back on the lru (or, for device
2822 * pages, drops the extra refcount).
2823 */
2824static void migrate_vma_finalize(struct migrate_vma *migrate)
2825{
2826        const unsigned long npages = migrate->npages;
2827        unsigned long i;
2828
2829        for (i = 0; i < npages; i++) {
2830                struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2831                struct page *page = migrate_pfn_to_page(migrate->src[i]);
2832
2833                if (!page) {
2834                        if (newpage) {
2835                                unlock_page(newpage);
2836                                put_page(newpage);
2837                        }
2838                        continue;
2839                }
2840
2841                if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
2842                        if (newpage) {
2843                                unlock_page(newpage);
2844                                put_page(newpage);
2845                        }
2846                        newpage = page;
2847                }
2848
2849                remove_migration_ptes(page, newpage, false);
2850                unlock_page(page);
2851                migrate->cpages--;
2852
2853                if (is_zone_device_page(page))
2854                        put_page(page);
2855                else
2856                        putback_lru_page(page);
2857
2858                if (newpage != page) {
2859                        unlock_page(newpage);
2860                        if (is_zone_device_page(newpage))
2861                                put_page(newpage);
2862                        else
2863                                putback_lru_page(newpage);
2864                }
2865        }
2866}
2867
2868/*
2869 * migrate_vma() - migrate a range of memory inside vma
2870 *
2871 * @ops: migration callback for allocating destination memory and copying
2872 * @vma: virtual memory area containing the range to be migrated
2873 * @start: start address of the range to migrate (inclusive)
2874 * @end: end address of the range to migrate (exclusive)
2875 * @src: array of unsigned long migrate pfn entries for the source pages
2876 * @dst: array of unsigned long migrate pfn entries for the destination pages
2877 * @private: pointer passed back to each of the callback
2878 * Returns: 0 on success, error code otherwise
2879 *
2880 * This function tries to migrate a virtual address range of memory, using
2881 * callbacks to allocate and copy memory from source to destination. First it
2882 * collects all the pages backing each virtual address in the range, saving this
2883 * inside the src array. Then it locks those pages and unmaps them. Once the pages
2884 * are locked and unmapped, it checks whether each page is pinned or not. Pages
2885 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2886 * in the corresponding src array entry. It then restores any pages that are
2887 * pinned, by remapping and unlocking those pages.
2888 *
2889 * At this point it calls the alloc_and_copy() callback. For documentation on
2890 * what is expected from that callback, see struct migrate_vma_ops comments in
2891 * include/linux/migrate.h
2892 *
2893 * After the alloc_and_copy() callback, this function goes over each entry in
2894 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2895 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2896 * then the function tries to migrate struct page information from the source
2897 * struct page to the destination struct page. If it fails to migrate the struct
2898 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2899 * array.
2900 *
2901 * At this point all successfully migrated pages have an entry in the src
2902 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2903 * array entry with MIGRATE_PFN_VALID flag set.
2904 *
2905 * It then calls the finalize_and_map() callback. See comments for "struct
2906 * migrate_vma_ops", in include/linux/migrate.h for details about
2907 * finalize_and_map() behavior.
2908 *
2909 * After the finalize_and_map() callback, for successfully migrated pages, this
2910 * function updates the CPU page table to point to new pages, otherwise it
2911 * restores the CPU page table to point to the original source pages.
2912 *
2913 * The function returns 0 after the above steps, even if no pages were
2914 * migrated (it only returns an error if any of the arguments are invalid).
2915 *
2916 * Both the src and dst arrays must be big enough for (end - start) >> PAGE_SHIFT
2917 * unsigned long entries. An illustrative usage sketch follows the function below.
2918 */
2919int migrate_vma(const struct migrate_vma_ops *ops,
2920                struct vm_area_struct *vma,
2921                unsigned long start,
2922                unsigned long end,
2923                unsigned long *src,
2924                unsigned long *dst,
2925                void *private)
2926{
2927        struct migrate_vma migrate;
2928
2929        /* Sanity check the arguments */
2930        start &= PAGE_MASK;
2931        end &= PAGE_MASK;
2932        if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
2933                return -EINVAL;
2934        if (start < vma->vm_start || start >= vma->vm_end)
2935                return -EINVAL;
2936        if (end <= vma->vm_start || end > vma->vm_end)
2937                return -EINVAL;
2938        if (!ops || !src || !dst || start >= end)
2939                return -EINVAL;
2940
2941        memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2942        migrate.src = src;
2943        migrate.dst = dst;
2944        migrate.start = start;
2945        migrate.npages = 0;
2946        migrate.cpages = 0;
2947        migrate.end = end;
2948        migrate.vma = vma;
2949
2950        /* Collect, and try to unmap source pages */
2951        migrate_vma_collect(&migrate);
2952        if (!migrate.cpages)
2953                return 0;
2954
2955        /* Lock and isolate page */
2956        migrate_vma_prepare(&migrate);
2957        if (!migrate.cpages)
2958                return 0;
2959
2960        /* Unmap pages */
2961        migrate_vma_unmap(&migrate);
2962        if (!migrate.cpages)
2963                return 0;
2964
2965        /*
2966         * At this point pages are locked and unmapped, and thus they have
2967         * stable content and can safely be copied to destination memory that
2968         * is allocated by the callback.
2969         *
2970         * Note that migration of an individual page can still fail later,
2971         * in migrate_vma_pages().
2972         */
2973        ops->alloc_and_copy(vma, src, dst, start, end, private);
2974
2975        /* This does the real migration of struct page */
2976        migrate_vma_pages(&migrate);
2977
2978        ops->finalize_and_map(vma, src, dst, start, end, private);
2979
2980        /* Unlock and remap pages */
2981        migrate_vma_finalize(&migrate);
2982
2983        return 0;
2984}
2985EXPORT_SYMBOL(migrate_vma);
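
/*
 * Illustrative usage sketch only, not part of the kernel: a minimal,
 * hypothetical caller of migrate_vma(). The example_* names are made up for
 * illustration; a real device driver would allocate device memory in its
 * alloc_and_copy() callback and update its own page tables in
 * finalize_and_map(). Callback signatures follow struct migrate_vma_ops as
 * declared in include/linux/migrate.h.
 */
static void example_alloc_and_copy(struct vm_area_struct *vma,
                                   const unsigned long *src,
                                   unsigned long *dst,
                                   unsigned long start,
                                   unsigned long end,
                                   void *private)
{
        unsigned long addr, i;

        for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
                struct page *spage = migrate_pfn_to_page(src[i]);
                struct page *dpage;

                dst[i] = 0;
                /* Skip entries that cannot (or need not) be migrated. */
                if (!(src[i] & MIGRATE_PFN_MIGRATE))
                        continue;

                /*
                 * A real driver would allocate device memory here; this
                 * sketch simply uses an ordinary movable page.
                 */
                dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
                if (!dpage)
                        continue;

                /* Copy the data now; migrate_vma_pages() will not copy it. */
                if (spage)
                        copy_highpage(dpage, spage);
                else
                        clear_highpage(dpage);

                lock_page(dpage);
                dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
        }
}

static void example_finalize_and_map(struct vm_area_struct *vma,
                                     const unsigned long *src,
                                     const unsigned long *dst,
                                     unsigned long start,
                                     unsigned long end,
                                     void *private)
{
        /*
         * Nothing to do for this sketch. A device driver would update its
         * own mappings here for the entries that still have
         * MIGRATE_PFN_MIGRATE set, i.e. the pages that did migrate.
         */
}

static const struct migrate_vma_ops example_migrate_ops = {
        .alloc_and_copy         = example_alloc_and_copy,
        .finalize_and_map       = example_finalize_and_map,
};

/* Migrate a single page; the caller must hold mmap_sem. */
static int __maybe_unused example_migrate_one_page(struct vm_area_struct *vma,
                                                   unsigned long addr)
{
        unsigned long src = 0, dst = 0;

        addr &= PAGE_MASK;
        return migrate_vma(&example_migrate_ops, vma, addr, addr + PAGE_SIZE,
                           &src, &dst, NULL);
}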
2986#endif /* defined(MIGRATE_VMA_HELPER) */
2987