linux/mm/migrate.c
   1/*
   2 * Memory Migration functionality - linux/mm/migrate.c
   3 *
   4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5 *
   6 * Page migration was first developed in the context of the memory hotplug
   7 * project. The main authors of the migration code are:
   8 *
   9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10 * Hirokazu Takahashi <taka@valinux.co.jp>
  11 * Dave Hansen <haveblue@us.ibm.com>
  12 * Christoph Lameter
  13 */
  14
  15#include <linux/migrate.h>
  16#include <linux/module.h>
  17#include <linux/swap.h>
  18#include <linux/swapops.h>
  19#include <linux/pagemap.h>
  20#include <linux/buffer_head.h>
  21#include <linux/mm_inline.h>
  22#include <linux/nsproxy.h>
  23#include <linux/pagevec.h>
  24#include <linux/rmap.h>
  25#include <linux/topology.h>
  26#include <linux/cpu.h>
  27#include <linux/cpuset.h>
  28#include <linux/writeback.h>
  29#include <linux/mempolicy.h>
  30#include <linux/vmalloc.h>
  31#include <linux/security.h>
  32#include <linux/memcontrol.h>
  33#include <linux/syscalls.h>
  34
  35#include "internal.h"
  36
  37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  38
  39/*
  40 * migrate_prep() needs to be called before we start compiling a list of pages
  41 * to be migrated using isolate_lru_page().
  42 */
  43int migrate_prep(void)
  44{
  45        /*
  46         * Clear the LRU lists so pages can be isolated.
  47         * Note that pages may be moved off the LRU after we have
  48         * drained them. Those pages will fail to migrate like other
  49         * pages that may be busy.
  50         */
  51        lru_add_drain_all();
  52
  53        return 0;
  54}
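
/*
 * Illustrative sketch (not part of this file): a typical caller first runs
 * migrate_prep(), then isolates each candidate page and hands the resulting
 * list to migrate_pages(). The allocation callback named here is
 * hypothetical; real callers (mempolicy, memory hotplug, sys_move_pages)
 * supply their own new_page_t implementations.
 *
 *	LIST_HEAD(pagelist);
 *
 *	migrate_prep();
 *	if (!isolate_lru_page(page))
 *		list_add_tail(&page->lru, &pagelist);
 *	if (!list_empty(&pagelist))
 *		migrate_pages(&pagelist, example_new_page, 0);
 */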
  55
  56/*
  57 * Add isolated pages on the list back to the LRU under page lock
   58 * to avoid leaking evictable pages back onto the unevictable list.
  59 *
  60 * returns the number of pages put back.
  61 */
  62int putback_lru_pages(struct list_head *l)
  63{
  64        struct page *page;
  65        struct page *page2;
  66        int count = 0;
  67
  68        list_for_each_entry_safe(page, page2, l, lru) {
  69                list_del(&page->lru);
  70                dec_zone_page_state(page, NR_ISOLATED_ANON +
  71                                page_is_file_cache(page));
  72                putback_lru_page(page);
  73                count++;
  74        }
  75        return count;
  76}
  77
  78/*
  79 * Restore a potential migration pte to a working pte entry
  80 */
  81static void remove_migration_pte(struct vm_area_struct *vma,
  82                struct page *old, struct page *new)
  83{
  84        struct mm_struct *mm = vma->vm_mm;
  85        swp_entry_t entry;
  86        pgd_t *pgd;
  87        pud_t *pud;
  88        pmd_t *pmd;
  89        pte_t *ptep, pte;
  90        spinlock_t *ptl;
  91        unsigned long addr = page_address_in_vma(new, vma);
  92
  93        if (addr == -EFAULT)
  94                return;
  95
  96        pgd = pgd_offset(mm, addr);
  97        if (!pgd_present(*pgd))
  98                return;
  99
 100        pud = pud_offset(pgd, addr);
 101        if (!pud_present(*pud))
 102                return;
 103
 104        pmd = pmd_offset(pud, addr);
 105        if (!pmd_present(*pmd))
 106                return;
 107
 108        ptep = pte_offset_map(pmd, addr);
 109
 110        if (!is_swap_pte(*ptep)) {
 111                pte_unmap(ptep);
 112                return;
 113        }
 114
 115        ptl = pte_lockptr(mm, pmd);
 116        spin_lock(ptl);
 117        pte = *ptep;
 118        if (!is_swap_pte(pte))
 119                goto out;
 120
 121        entry = pte_to_swp_entry(pte);
 122
 123        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 124                goto out;
 125
 126        get_page(new);
 127        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 128        if (is_write_migration_entry(entry))
 129                pte = pte_mkwrite(pte);
 130        flush_cache_page(vma, addr, pte_pfn(pte));
 131        set_pte_at(mm, addr, ptep, pte);
 132
 133        if (PageAnon(new))
 134                page_add_anon_rmap(new, vma, addr);
 135        else
 136                page_add_file_rmap(new);
 137
 138        /* No need to invalidate - it was non-present before */
 139        update_mmu_cache(vma, addr, pte);
 140
 141out:
 142        pte_unmap_unlock(ptep, ptl);
 143}
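
/*
 * For reference, a simplified sketch of the counterpart operation: when
 * try_to_unmap() runs in TTU_MIGRATION mode it replaces each mapping pte
 * with a migration swap entry that records the old page and whether the
 * mapping was writable, roughly:
 *
 *	swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, address, ptep, swp_entry_to_pte(entry));
 *
 * remove_migration_pte() above undoes exactly this, but points the pte at
 * the new page instead.
 */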
 144
 145/*
  146 * Note that remove_file_migration_ptes() will only work on regular mappings;
  147 * nonlinear mappings do not use migration entries.
 148 */
 149static void remove_file_migration_ptes(struct page *old, struct page *new)
 150{
 151        struct vm_area_struct *vma;
 152        struct address_space *mapping = new->mapping;
 153        struct prio_tree_iter iter;
 154        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 155
 156        if (!mapping)
 157                return;
 158
 159        spin_lock(&mapping->i_mmap_lock);
 160
 161        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 162                remove_migration_pte(vma, old, new);
 163
 164        spin_unlock(&mapping->i_mmap_lock);
 165}
 166
 167/*
 168 * Must hold mmap_sem lock on at least one of the vmas containing
 169 * the page so that the anon_vma cannot vanish.
 170 */
 171static void remove_anon_migration_ptes(struct page *old, struct page *new)
 172{
 173        struct anon_vma *anon_vma;
 174        struct vm_area_struct *vma;
 175        unsigned long mapping;
 176
 177        mapping = (unsigned long)new->mapping;
 178
 179        if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 180                return;
 181
 182        /*
  183         * We hold mmap_sem, so there is no need to call page_lock_anon_vma().
 184         */
 185        anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
 186        spin_lock(&anon_vma->lock);
 187
 188        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 189                remove_migration_pte(vma, old, new);
 190
 191        spin_unlock(&anon_vma->lock);
 192}
 193
 194/*
 195 * Get rid of all migration entries and replace them by
 196 * references to the indicated page.
 197 */
 198static void remove_migration_ptes(struct page *old, struct page *new)
 199{
 200        if (PageAnon(new))
 201                remove_anon_migration_ptes(old, new);
 202        else
 203                remove_file_migration_ptes(old, new);
 204}
 205
 206/*
 207 * Something used the pte of a page under migration. We need to
 208 * get to the page and wait until migration is finished.
 209 * When we return from this function the fault will be retried.
 210 *
 211 * This function is called from do_swap_page().
 212 */
 213void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 214                                unsigned long address)
 215{
 216        pte_t *ptep, pte;
 217        spinlock_t *ptl;
 218        swp_entry_t entry;
 219        struct page *page;
 220
 221        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 222        pte = *ptep;
 223        if (!is_swap_pte(pte))
 224                goto out;
 225
 226        entry = pte_to_swp_entry(pte);
 227        if (!is_migration_entry(entry))
 228                goto out;
 229
 230        page = migration_entry_to_page(entry);
 231
  232        /*
  233         * Once the radix-tree replacement step of page migration has
  234         * started, page_count *must* be zero, and we must not call
  235         * wait_on_page_locked() against a page we hold no reference to.
  236         * So use get_page_unless_zero() here. Even if it fails, the
  237         * fault will simply be retried.
  238         */
 239        if (!get_page_unless_zero(page))
 240                goto out;
 241        pte_unmap_unlock(ptep, ptl);
 242        wait_on_page_locked(page);
 243        put_page(page);
 244        return;
 245out:
 246        pte_unmap_unlock(ptep, ptl);
 247}
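
/*
 * For context, a condensed and simplified sketch of how the fault path
 * reaches the function above from do_swap_page(), after which the fault
 * is simply retried:
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(mm, pmd, address);
 *		return 0;
 *	}
 */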
 248
 249/*
 250 * Replace the page in the mapping.
 251 *
 252 * The number of remaining references must be:
 253 * 1 for anonymous pages without a mapping
 254 * 2 for pages with a mapping
 255 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 256 */
 257static int migrate_page_move_mapping(struct address_space *mapping,
 258                struct page *newpage, struct page *page)
 259{
 260        int expected_count;
 261        void **pslot;
 262
 263        if (!mapping) {
 264                /* Anonymous page without mapping */
 265                if (page_count(page) != 1)
 266                        return -EAGAIN;
 267                return 0;
 268        }
 269
 270        spin_lock_irq(&mapping->tree_lock);
 271
 272        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 273                                        page_index(page));
 274
 275        expected_count = 2 + page_has_private(page);
 276        if (page_count(page) != expected_count ||
 277                        (struct page *)radix_tree_deref_slot(pslot) != page) {
 278                spin_unlock_irq(&mapping->tree_lock);
 279                return -EAGAIN;
 280        }
 281
 282        if (!page_freeze_refs(page, expected_count)) {
 283                spin_unlock_irq(&mapping->tree_lock);
 284                return -EAGAIN;
 285        }
 286
 287        /*
 288         * Now we know that no one else is looking at the page.
 289         */
 290        get_page(newpage);      /* add cache reference */
 291        if (PageSwapCache(page)) {
 292                SetPageSwapCache(newpage);
 293                set_page_private(newpage, page_private(page));
 294        }
 295
 296        radix_tree_replace_slot(pslot, newpage);
 297
 298        page_unfreeze_refs(page, expected_count);
 299        /*
 300         * Drop cache reference from old page.
 301         * We know this isn't the last reference.
 302         */
 303        __put_page(page);
 304
 305        /*
 306         * If moved to a different zone then also account
 307         * the page for that zone. Other VM counters will be
 308         * taken care of when we establish references to the
 309         * new page and drop references to the old page.
 310         *
 311         * Note that anonymous pages are accounted for
 312         * via NR_FILE_PAGES and NR_ANON_PAGES if they
 313         * are mapped to swap space.
 314         */
 315        __dec_zone_page_state(page, NR_FILE_PAGES);
 316        __inc_zone_page_state(newpage, NR_FILE_PAGES);
 317        if (PageSwapBacked(page)) {
 318                __dec_zone_page_state(page, NR_SHMEM);
 319                __inc_zone_page_state(newpage, NR_SHMEM);
 320        }
 321        spin_unlock_irq(&mapping->tree_lock);
 322
 323        return 0;
 324}
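
/*
 * Worked example of the reference arithmetic above (illustrative only):
 * an isolated page-cache page with buffer heads is referenced by the
 * radix-tree slot, by the isolating caller and, via PagePrivate, by its
 * buffers, so expected_count is 2 + 1 = 3. Any additional reference, e.g.
 * a concurrent find_get_page(), makes page_freeze_refs() fail and the
 * migration attempt is retried with -EAGAIN.
 */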
 325
 326/*
 327 * Copy the page to its new location
 328 */
 329static void migrate_page_copy(struct page *newpage, struct page *page)
 330{
 331        int anon;
 332
 333        copy_highpage(newpage, page);
 334
 335        if (PageError(page))
 336                SetPageError(newpage);
 337        if (PageReferenced(page))
 338                SetPageReferenced(newpage);
 339        if (PageUptodate(page))
 340                SetPageUptodate(newpage);
 341        if (TestClearPageActive(page)) {
 342                VM_BUG_ON(PageUnevictable(page));
 343                SetPageActive(newpage);
 344        } else
 345                unevictable_migrate_page(newpage, page);
 346        if (PageChecked(page))
 347                SetPageChecked(newpage);
 348        if (PageMappedToDisk(page))
 349                SetPageMappedToDisk(newpage);
 350
 351        if (PageDirty(page)) {
 352                clear_page_dirty_for_io(page);
 353                /*
 354                 * Want to mark the page and the radix tree as dirty, and
 355                 * redo the accounting that clear_page_dirty_for_io undid,
 356                 * but we can't use set_page_dirty because that function
  357                 * is actually a signal that all of the page has become dirty,
  358                 * whereas only part of our page may be dirty.
 359                 */
 360                __set_page_dirty_nobuffers(newpage);
 361        }
 362
 363        mlock_migrate_page(newpage, page);
 364
 365        ClearPageSwapCache(page);
 366        ClearPagePrivate(page);
 367        set_page_private(page, 0);
 368        /* page->mapping contains a flag for PageAnon() */
 369        anon = PageAnon(page);
 370        page->mapping = NULL;
 371
 372        /*
 373         * If any waiters have accumulated on the new page then
 374         * wake them up.
 375         */
 376        if (PageWriteback(newpage))
 377                end_page_writeback(newpage);
 378}
 379
 380/************************************************************
 381 *                    Migration functions
 382 ***********************************************************/
 383
 384/* Always fail migration. Used for mappings that are not movable */
 385int fail_migrate_page(struct address_space *mapping,
 386                        struct page *newpage, struct page *page)
 387{
 388        return -EIO;
 389}
 390EXPORT_SYMBOL(fail_migrate_page);
 391
 392/*
 393 * Common logic to directly migrate a single page suitable for
 394 * pages that do not use PagePrivate/PagePrivate2.
 395 *
 396 * Pages are locked upon entry and exit.
 397 */
 398int migrate_page(struct address_space *mapping,
 399                struct page *newpage, struct page *page)
 400{
 401        int rc;
 402
 403        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 404
 405        rc = migrate_page_move_mapping(mapping, newpage, page);
 406
 407        if (rc)
 408                return rc;
 409
 410        migrate_page_copy(newpage, page);
 411        return 0;
 412}
 413EXPORT_SYMBOL(migrate_page);
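
/*
 * Illustrative sketch: a filesystem whose pages carry no private state can
 * usually just point its address_space_operations at migrate_page. The
 * structure below is hypothetical and only shows the relevant fields:
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.migratepage	= migrate_page,
 *	};
 */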
 414
 415#ifdef CONFIG_BLOCK
 416/*
 417 * Migration function for pages with buffers. This function can only be used
 418 * if the underlying filesystem guarantees that no other references to "page"
 419 * exist.
 420 */
 421int buffer_migrate_page(struct address_space *mapping,
 422                struct page *newpage, struct page *page)
 423{
 424        struct buffer_head *bh, *head;
 425        int rc;
 426
 427        if (!page_has_buffers(page))
 428                return migrate_page(mapping, newpage, page);
 429
 430        head = page_buffers(page);
 431
 432        rc = migrate_page_move_mapping(mapping, newpage, page);
 433
 434        if (rc)
 435                return rc;
 436
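        /*
         * Pin and lock every buffer head so none of them can go away or be
         * used for I/O while we switch them over to the new page.
         */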
 437        bh = head;
 438        do {
 439                get_bh(bh);
 440                lock_buffer(bh);
 441                bh = bh->b_this_page;
 442
 443        } while (bh != head);
 444
 445        ClearPagePrivate(page);
 446        set_page_private(newpage, page_private(page));
 447        set_page_private(page, 0);
 448        put_page(page);
 449        get_page(newpage);
 450
 451        bh = head;
 452        do {
 453                set_bh_page(bh, newpage, bh_offset(bh));
 454                bh = bh->b_this_page;
 455
 456        } while (bh != head);
 457
 458        SetPagePrivate(newpage);
 459
 460        migrate_page_copy(newpage, page);
 461
 462        bh = head;
 463        do {
 464                unlock_buffer(bh);
 465                put_bh(bh);
 466                bh = bh->b_this_page;
 467
 468        } while (bh != head);
 469
 470        return 0;
 471}
 472EXPORT_SYMBOL(buffer_migrate_page);
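
/*
 * Illustrative sketch: a block-device backed mapping, whose pages carry
 * buffer_heads, would use buffer_migrate_page so the buffers are switched
 * over together with the page; the aops below is hypothetical:
 *
 *	static const struct address_space_operations example_blkdev_aops = {
 *		.writepage	= example_writepage,
 *		.migratepage	= buffer_migrate_page,
 *	};
 */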
 473#endif
 474
 475/*
 476 * Writeback a page to clean the dirty state
 477 */
 478static int writeout(struct address_space *mapping, struct page *page)
 479{
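        /*
         * Ask for a single, non-blocking, reclaim-style writeout of just
         * this page; we only need its dirty state written back, not a full
         * sync of the mapping.
         */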
 480        struct writeback_control wbc = {
 481                .sync_mode = WB_SYNC_NONE,
 482                .nr_to_write = 1,
 483                .range_start = 0,
 484                .range_end = LLONG_MAX,
 485                .nonblocking = 1,
 486                .for_reclaim = 1
 487        };
 488        int rc;
 489
 490        if (!mapping->a_ops->writepage)
 491                /* No write method for the address space */
 492                return -EINVAL;
 493
 494        if (!clear_page_dirty_for_io(page))
 495                /* Someone else already triggered a write */
 496                return -EAGAIN;
 497
 498        /*
 499         * A dirty page may imply that the underlying filesystem has
 500         * the page on some queue. So the page must be clean for
  501         * migration. Writeout may mean we lose the lock and the
 502         * page state is no longer what we checked for earlier.
 503         * At this point we know that the migration attempt cannot
 504         * be successful.
 505         */
 506        remove_migration_ptes(page, page);
 507
 508        rc = mapping->a_ops->writepage(page, &wbc);
 509
 510        if (rc != AOP_WRITEPAGE_ACTIVATE)
 511                /* unlocked. Relock */
 512                lock_page(page);
 513
 514        return (rc < 0) ? -EIO : -EAGAIN;
 515}
 516
 517/*
 518 * Default handling if a filesystem does not provide a migration function.
 519 */
 520static int fallback_migrate_page(struct address_space *mapping,
 521        struct page *newpage, struct page *page)
 522{
 523        if (PageDirty(page))
 524                return writeout(mapping, page);
 525
 526        /*
 527         * Buffers may be managed in a filesystem specific way.
 528         * We must have no buffers or drop them.
 529         */
 530        if (page_has_private(page) &&
 531            !try_to_release_page(page, GFP_KERNEL))
 532                return -EAGAIN;
 533
 534        return migrate_page(mapping, newpage, page);
 535}
 536
 537/*
 538 * Move a page to a newly allocated page
 539 * The page is locked and all ptes have been successfully removed.
 540 *
 541 * The new page will have replaced the old page if this function
 542 * is successful.
 543 *
 544 * Return value:
 545 *   < 0 - error code
 546 *  == 0 - success
 547 */
 548static int move_to_new_page(struct page *newpage, struct page *page)
 549{
 550        struct address_space *mapping;
 551        int rc;
 552
 553        /*
 554         * Block others from accessing the page when we get around to
 555         * establishing additional references. We are the only one
 556         * holding a reference to the new page at this point.
 557         */
 558        if (!trylock_page(newpage))
 559                BUG();
 560
  561        /* Prepare mapping for the new page. */
 562        newpage->index = page->index;
 563        newpage->mapping = page->mapping;
 564        if (PageSwapBacked(page))
 565                SetPageSwapBacked(newpage);
 566
 567        mapping = page_mapping(page);
 568        if (!mapping)
 569                rc = migrate_page(mapping, newpage, page);
 570        else if (mapping->a_ops->migratepage)
 571                /*
 572                 * Most pages have a mapping and most filesystems
 573                 * should provide a migration function. Anonymous
 574                 * pages are part of swap space which also has its
 575                 * own migration function. This is the most common
 576                 * path for page migration.
 577                 */
 578                rc = mapping->a_ops->migratepage(mapping,
 579                                                newpage, page);
 580        else
 581                rc = fallback_migrate_page(mapping, newpage, page);
 582
 583        if (!rc) {
 584                remove_migration_ptes(page, newpage);
 585        } else
 586                newpage->mapping = NULL;
 587
 588        unlock_page(newpage);
 589
 590        return rc;
 591}
 592
 593/*
 594 * Obtain the lock on page, remove all ptes and migrate the page
 595 * to the newly allocated page in newpage.
 596 */
 597static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 598                        struct page *page, int force)
 599{
 600        int rc = 0;
 601        int *result = NULL;
 602        struct page *newpage = get_new_page(page, private, &result);
 603        int rcu_locked = 0;
 604        int charge = 0;
 605        struct mem_cgroup *mem = NULL;
 606
 607        if (!newpage)
 608                return -ENOMEM;
 609
 610        if (page_count(page) == 1) {
 611                /* page was freed from under us. So we are done. */
 612                goto move_newpage;
 613        }
 614
 615        /* prepare cgroup just returns 0 or -ENOMEM */
 616        rc = -EAGAIN;
 617
 618        if (!trylock_page(page)) {
 619                if (!force)
 620                        goto move_newpage;
 621                lock_page(page);
 622        }
 623
 624        /* charge against new page */
 625        charge = mem_cgroup_prepare_migration(page, &mem);
 626        if (charge == -ENOMEM) {
 627                rc = -ENOMEM;
 628                goto unlock;
 629        }
 630        BUG_ON(charge);
 631
 632        if (PageWriteback(page)) {
 633                if (!force)
 634                        goto uncharge;
 635                wait_on_page_writeback(page);
 636        }
 637        /*
  638         * try_to_unmap() drops page->mapcount to 0 here, so we could not
  639         * otherwise notice the anon_vma being freed while we migrate the
  640         * page. Taking rcu_read_lock() delays freeing of the anon_vma until
  641         * the end of migration. File cache pages are not a problem because
  642         * they are protected by the page lock (and writeback) during
  643         * migration, so only anonymous pages need this special care.
 644         */
 645        if (PageAnon(page)) {
 646                rcu_read_lock();
 647                rcu_locked = 1;
 648        }
 649
 650        /*
 651         * Corner case handling:
  652         * 1. When a new swap-cache page is read in, it is added to the LRU
 653         * and treated as swapcache but it has no rmap yet.
 654         * Calling try_to_unmap() against a page->mapping==NULL page will
 655         * trigger a BUG.  So handle it here.
 656         * 2. An orphaned page (see truncate_complete_page) might have
 657         * fs-private metadata. The page can be picked up due to memory
 658         * offlining.  Everywhere else except page reclaim, the page is
 659         * invisible to the vm, so the page can not be migrated.  So try to
 660         * free the metadata, so the page can be freed.
 661         */
 662        if (!page->mapping) {
 663                if (!PageAnon(page) && page_has_private(page)) {
 664                        /*
 665                         * Go direct to try_to_free_buffers() here because
 666                         * a) that's what try_to_release_page() would do anyway
 667                         * b) we may be under rcu_read_lock() here, so we can't
 668                         *    use GFP_KERNEL which is what try_to_release_page()
 669                         *    needs to be effective.
 670                         */
 671                        try_to_free_buffers(page);
 672                        goto rcu_unlock;
 673                }
 674                goto skip_unmap;
 675        }
 676
 677        /* Establish migration ptes or remove ptes */
 678        try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 679
 680skip_unmap:
 681        if (!page_mapped(page))
 682                rc = move_to_new_page(newpage, page);
 683
 684        if (rc)
 685                remove_migration_ptes(page, page);
 686rcu_unlock:
 687        if (rcu_locked)
 688                rcu_read_unlock();
 689uncharge:
 690        if (!charge)
 691                mem_cgroup_end_migration(mem, page, newpage);
 692unlock:
 693        unlock_page(page);
 694
 695        if (rc != -EAGAIN) {
 696                /*
 697                 * A page that has been migrated has all references
 698                 * removed and will be freed. A page that has not been
  699                 * migrated will have kept its references and will be
 700                 * restored.
 701                 */
 702                list_del(&page->lru);
 703                dec_zone_page_state(page, NR_ISOLATED_ANON +
 704                                page_is_file_cache(page));
 705                putback_lru_page(page);
 706        }
 707
 708move_newpage:
 709
 710        /*
 711         * Move the new page to the LRU. If migration was not successful
 712         * then this will free the page.
 713         */
 714        putback_lru_page(newpage);
 715
 716        if (result) {
 717                if (rc)
 718                        *result = rc;
 719                else
 720                        *result = page_to_nid(newpage);
 721        }
 722        return rc;
 723}
 724
 725/*
 726 * migrate_pages
 727 *
  728 * The function takes a list of pages to migrate and a callback that,
  729 * given a page to be migrated and the private data, determines the
  730 * target of the move and allocates the new page.
  731 *
  732 * The function returns after 10 passes, or earlier when no pages are
  733 * movable anymore because the list has become empty or no retryable
  734 * pages exist anymore. All pages will be returned to the LRU or
  735 * freed.
 736 *
 737 * Return: Number of pages not migrated or error code.
 738 */
 739int migrate_pages(struct list_head *from,
 740                new_page_t get_new_page, unsigned long private)
 741{
 742        int retry = 1;
 743        int nr_failed = 0;
 744        int pass = 0;
 745        struct page *page;
 746        struct page *page2;
 747        int swapwrite = current->flags & PF_SWAPWRITE;
 748        int rc;
 749        unsigned long flags;
 750
 751        local_irq_save(flags);
 752        list_for_each_entry(page, from, lru)
 753                __inc_zone_page_state(page, NR_ISOLATED_ANON +
 754                                page_is_file_cache(page));
 755        local_irq_restore(flags);
 756
 757        if (!swapwrite)
 758                current->flags |= PF_SWAPWRITE;
 759
 760        for(pass = 0; pass < 10 && retry; pass++) {
 761                retry = 0;
 762
 763                list_for_each_entry_safe(page, page2, from, lru) {
 764                        cond_resched();
 765
 766                        rc = unmap_and_move(get_new_page, private,
 767                                                page, pass > 2);
 768
 769                        switch(rc) {
 770                        case -ENOMEM:
 771                                goto out;
 772                        case -EAGAIN:
 773                                retry++;
 774                                break;
 775                        case 0:
 776                                break;
 777                        default:
 778                                /* Permanent failure */
 779                                nr_failed++;
 780                                break;
 781                        }
 782                }
 783        }
 784        rc = 0;
 785out:
 786        if (!swapwrite)
 787                current->flags &= ~PF_SWAPWRITE;
 788
 789        putback_lru_pages(from);
 790
 791        if (rc)
 792                return rc;
 793
 794        return nr_failed + retry;
 795}
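
/*
 * Illustrative sketch of a minimal allocation callback for migrate_pages().
 * The new_page_t type is declared in <linux/migrate.h>; the function name
 * and GFP choice below are hypothetical:
 *
 *	static struct page *example_new_page(struct page *page,
 *					unsigned long private, int **result)
 *	{
 *		return alloc_page(GFP_HIGHUSER_MOVABLE);
 *	}
 *
 *	nr_failed = migrate_pages(&pagelist, example_new_page, 0);
 */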
 796
 797#ifdef CONFIG_NUMA
 798/*
 799 * Move a list of individual pages
 800 */
 801struct page_to_node {
 802        unsigned long addr;
 803        struct page *page;
 804        int node;
 805        int status;
 806};
 807
 808static struct page *new_page_node(struct page *p, unsigned long private,
 809                int **result)
 810{
 811        struct page_to_node *pm = (struct page_to_node *)private;
 812
 813        while (pm->node != MAX_NUMNODES && pm->page != p)
 814                pm++;
 815
 816        if (pm->node == MAX_NUMNODES)
 817                return NULL;
 818
 819        *result = &pm->status;
 820
 821        return alloc_pages_exact_node(pm->node,
 822                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 823}
 824
 825/*
 826 * Move a set of pages as indicated in the pm array. The addr
 827 * field must be set to the virtual address of the page to be moved
 828 * and the node number must contain a valid target node.
 829 * The pm array ends with node = MAX_NUMNODES.
 830 */
 831static int do_move_page_to_node_array(struct mm_struct *mm,
 832                                      struct page_to_node *pm,
 833                                      int migrate_all)
 834{
 835        int err;
 836        struct page_to_node *pp;
 837        LIST_HEAD(pagelist);
 838
 839        down_read(&mm->mmap_sem);
 840
 841        /*
 842         * Build a list of pages to migrate
 843         */
 844        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
 845                struct vm_area_struct *vma;
 846                struct page *page;
 847
 848                err = -EFAULT;
 849                vma = find_vma(mm, pp->addr);
 850                if (!vma || !vma_migratable(vma))
 851                        goto set_status;
 852
 853                page = follow_page(vma, pp->addr, FOLL_GET);
 854
 855                err = PTR_ERR(page);
 856                if (IS_ERR(page))
 857                        goto set_status;
 858
 859                err = -ENOENT;
 860                if (!page)
 861                        goto set_status;
 862
 863                if (PageReserved(page))         /* Check for zero page */
 864                        goto put_and_set;
 865
 866                pp->page = page;
 867                err = page_to_nid(page);
 868
 869                if (err == pp->node)
 870                        /*
 871                         * Node already in the right place
 872                         */
 873                        goto put_and_set;
 874
 875                err = -EACCES;
 876                if (page_mapcount(page) > 1 &&
 877                                !migrate_all)
 878                        goto put_and_set;
 879
 880                err = isolate_lru_page(page);
 881                if (!err)
 882                        list_add_tail(&page->lru, &pagelist);
 883put_and_set:
 884                /*
 885                 * Either remove the duplicate refcount from
 886                 * isolate_lru_page() or drop the page ref if it was
 887                 * not isolated.
 888                 */
 889                put_page(page);
 890set_status:
 891                pp->status = err;
 892        }
 893
 894        err = 0;
 895        if (!list_empty(&pagelist))
 896                err = migrate_pages(&pagelist, new_page_node,
 897                                (unsigned long)pm);
 898
 899        up_read(&mm->mmap_sem);
 900        return err;
 901}
 902
 903/*
  904 * Migrate an array of page addresses onto an array of nodes and fill
  905 * in the corresponding array of status values.
 906 */
 907static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
 908                         unsigned long nr_pages,
 909                         const void __user * __user *pages,
 910                         const int __user *nodes,
 911                         int __user *status, int flags)
 912{
 913        struct page_to_node *pm;
 914        nodemask_t task_nodes;
 915        unsigned long chunk_nr_pages;
 916        unsigned long chunk_start;
 917        int err;
 918
 919        task_nodes = cpuset_mems_allowed(task);
 920
 921        err = -ENOMEM;
 922        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
 923        if (!pm)
 924                goto out;
 925
 926        migrate_prep();
 927
 928        /*
 929         * Store a chunk of page_to_node array in a page,
 930         * but keep the last one as a marker
 931         */
 932        chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
 933
 934        for (chunk_start = 0;
 935             chunk_start < nr_pages;
 936             chunk_start += chunk_nr_pages) {
 937                int j;
 938
 939                if (chunk_start + chunk_nr_pages > nr_pages)
 940                        chunk_nr_pages = nr_pages - chunk_start;
 941
 942                /* fill the chunk pm with addrs and nodes from user-space */
 943                for (j = 0; j < chunk_nr_pages; j++) {
 944                        const void __user *p;
 945                        int node;
 946
 947                        err = -EFAULT;
 948                        if (get_user(p, pages + j + chunk_start))
 949                                goto out_pm;
 950                        pm[j].addr = (unsigned long) p;
 951
 952                        if (get_user(node, nodes + j + chunk_start))
 953                                goto out_pm;
 954
 955                        err = -ENODEV;
 956                        if (!node_state(node, N_HIGH_MEMORY))
 957                                goto out_pm;
 958
 959                        err = -EACCES;
 960                        if (!node_isset(node, task_nodes))
 961                                goto out_pm;
 962
 963                        pm[j].node = node;
 964                }
 965
 966                /* End marker for this chunk */
 967                pm[chunk_nr_pages].node = MAX_NUMNODES;
 968
 969                /* Migrate this chunk */
 970                err = do_move_page_to_node_array(mm, pm,
 971                                                 flags & MPOL_MF_MOVE_ALL);
 972                if (err < 0)
 973                        goto out_pm;
 974
 975                /* Return status information */
 976                for (j = 0; j < chunk_nr_pages; j++)
 977                        if (put_user(pm[j].status, status + j + chunk_start)) {
 978                                err = -EFAULT;
 979                                goto out_pm;
 980                        }
 981        }
 982        err = 0;
 983
 984out_pm:
 985        free_page((unsigned long)pm);
 986out:
 987        return err;
 988}
 989
 990/*
  991 * Determine the nodes of an array of pages and store them in an array of status values.
 992 */
 993static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 994                                const void __user **pages, int *status)
 995{
 996        unsigned long i;
 997
 998        down_read(&mm->mmap_sem);
 999
1000        for (i = 0; i < nr_pages; i++) {
1001                unsigned long addr = (unsigned long)(*pages);
1002                struct vm_area_struct *vma;
1003                struct page *page;
1004                int err = -EFAULT;
1005
1006                vma = find_vma(mm, addr);
1007                if (!vma)
1008                        goto set_status;
1009
1010                page = follow_page(vma, addr, 0);
1011
1012                err = PTR_ERR(page);
1013                if (IS_ERR(page))
1014                        goto set_status;
1015
1016                err = -ENOENT;
1017                /* Use PageReserved to check for zero page */
1018                if (!page || PageReserved(page))
1019                        goto set_status;
1020
1021                err = page_to_nid(page);
1022set_status:
1023                *status = err;
1024
1025                pages++;
1026                status++;
1027        }
1028
1029        up_read(&mm->mmap_sem);
1030}
1031
1032/*
 1033 * Determine the nodes of a user array of pages and store them in
 1034 * a user array of status values.
1035 */
1036static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1037                         const void __user * __user *pages,
1038                         int __user *status)
1039{
1040#define DO_PAGES_STAT_CHUNK_NR 16
1041        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1042        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1043        unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1044        int err;
1045
1046        for (i = 0; i < nr_pages; i += chunk_nr) {
1047                if (chunk_nr + i > nr_pages)
1048                        chunk_nr = nr_pages - i;
1049
1050                err = copy_from_user(chunk_pages, &pages[i],
1051                                     chunk_nr * sizeof(*chunk_pages));
1052                if (err) {
1053                        err = -EFAULT;
1054                        goto out;
1055                }
1056
1057                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1058
1059                err = copy_to_user(&status[i], chunk_status,
1060                                   chunk_nr * sizeof(*chunk_status));
1061                if (err) {
1062                        err = -EFAULT;
1063                        goto out;
1064                }
1065        }
1066        err = 0;
1067
1068out:
1069        return err;
1070}
1071
1072/*
 1073 * Move a list of pages in the address space of the process identified
 1074 * by pid (or of the current process if pid is 0).
1075 */
1076SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1077                const void __user * __user *, pages,
1078                const int __user *, nodes,
1079                int __user *, status, int, flags)
1080{
1081        const struct cred *cred = current_cred(), *tcred;
1082        struct task_struct *task;
1083        struct mm_struct *mm;
1084        int err;
1085
1086        /* Check flags */
1087        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1088                return -EINVAL;
1089
1090        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1091                return -EPERM;
1092
1093        /* Find the mm_struct */
1094        read_lock(&tasklist_lock);
1095        task = pid ? find_task_by_vpid(pid) : current;
1096        if (!task) {
1097                read_unlock(&tasklist_lock);
1098                return -ESRCH;
1099        }
1100        mm = get_task_mm(task);
1101        read_unlock(&tasklist_lock);
1102
1103        if (!mm)
1104                return -EINVAL;
1105
1106        /*
1107         * Check if this process has the right to modify the specified
1108         * process. The right exists if the process has administrative
1109         * capabilities, superuser privileges or the same
1110         * userid as the target process.
1111         */
1112        rcu_read_lock();
1113        tcred = __task_cred(task);
1114        if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1115            cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1116            !capable(CAP_SYS_NICE)) {
1117                rcu_read_unlock();
1118                err = -EPERM;
1119                goto out;
1120        }
1121        rcu_read_unlock();
1122
1123        err = security_task_movememory(task);
1124        if (err)
1125                goto out;
1126
1127        if (nodes) {
1128                err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1129                                    flags);
1130        } else {
1131                err = do_pages_stat(mm, nr_pages, pages, status);
1132        }
1133
1134out:
1135        mmput(mm);
1136        return err;
1137}
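
/*
 * Illustrative userspace sketch of the syscall above, using the libnuma
 * wrapper declared in <numaif.h> (address, node number and count are
 * hypothetical):
 *
 *	void *pages[1] = { addr };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
 *		printf("status[0] = %d (target node or negative errno)\n",
 *			status[0]);
 */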
1138
1139/*
1140 * Call migration functions in the vma_ops that may prepare
 1141 * memory in a vm for migration. Migration functions may perform
1142 * the migration for vmas that do not have an underlying page struct.
1143 */
1144int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1145        const nodemask_t *from, unsigned long flags)
1146{
1147        struct vm_area_struct *vma;
1148        int err = 0;
1149
1150        for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1151                if (vma->vm_ops && vma->vm_ops->migrate) {
1152                        err = vma->vm_ops->migrate(vma, to, from, flags);
1153                        if (err)
1154                                break;
1155                }
1156        }
1157        return err;
1158}
1159#endif
1160