linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case node -1 here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
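/*
 * Editor's sketch (not part of mempolicy.c): how the four policies described
 * above are typically requested from userspace through set_mempolicy() and
 * mbind(), using the wrappers declared in libnuma's <numaif.h> (link with
 * -lnuma).  Nodes 0 and 1 are assumed to exist; a real program would query
 * the topology first.
 */
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>

static void example_policies(void)
{
	unsigned long interleave_mask = (1UL << 0) | (1UL << 1);  /* nodes 0,1 */
	unsigned long bind_mask = 1UL << 1;                       /* node 1   */
	size_t len = 4UL << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Process policy: interleave future allocations over nodes 0 and 1. */
	if (set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
			  sizeof(interleave_mask) * 8))
		perror("set_mempolicy");

	/*
	 * VMA policy: restrict this mapping to node 1.  On a fault in
	 * [buf, buf + len) it takes priority over the process policy.
	 */
	if (buf != MAP_FAILED &&
	    mbind(buf, len, MPOL_BIND, &bind_mask, sizeof(bind_mask) * 8, 0))
		perror("mbind");

	/* Back to the default policy: allocate on the local node first. */
	set_mempolicy(MPOL_DEFAULT, NULL, 0);
}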
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66   could replace all the switch()es with a mempolicy_ops structure.
  67*/
  68
  69#include <linux/mempolicy.h>
  70#include <linux/mm.h>
  71#include <linux/highmem.h>
  72#include <linux/hugetlb.h>
  73#include <linux/kernel.h>
  74#include <linux/sched.h>
  75#include <linux/nodemask.h>
  76#include <linux/cpuset.h>
  77#include <linux/gfp.h>
  78#include <linux/slab.h>
  79#include <linux/string.h>
  80#include <linux/module.h>
  81#include <linux/nsproxy.h>
  82#include <linux/interrupt.h>
  83#include <linux/init.h>
  84#include <linux/compat.h>
  85#include <linux/swap.h>
  86#include <linux/seq_file.h>
  87#include <linux/proc_fs.h>
  88#include <linux/migrate.h>
  89#include <linux/rmap.h>
  90#include <linux/security.h>
  91#include <linux/syscalls.h>
  92
  93#include <asm/tlbflush.h>
  94#include <asm/uaccess.h>
  95
  96/* Internal flags */
  97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101static struct kmem_cache *policy_cache;
 102static struct kmem_cache *sn_cache;
 103
  104/* Highest zone. A specific allocation for a zone below that is not
 105   policied. */
 106enum zone_type policy_zone = 0;
 107
 108struct mempolicy default_policy = {
 109        .refcnt = ATOMIC_INIT(1), /* never free it */
 110        .policy = MPOL_DEFAULT,
 111};
 112
 113static void mpol_rebind_policy(struct mempolicy *pol,
 114                               const nodemask_t *newmask);
 115
 116/* Do sanity checking on a policy */
 117static int mpol_check_policy(int mode, nodemask_t *nodes)
 118{
 119        int empty = nodes_empty(*nodes);
 120
 121        switch (mode) {
 122        case MPOL_DEFAULT:
 123                if (!empty)
 124                        return -EINVAL;
 125                break;
 126        case MPOL_BIND:
 127        case MPOL_INTERLEAVE:
 128                /* Preferred will only use the first bit, but allow
 129                   more for now. */
 130                if (empty)
 131                        return -EINVAL;
 132                break;
 133        }
 134        return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
 135}
 136
 137/* Generate a custom zonelist for the BIND policy. */
 138static struct zonelist *bind_zonelist(nodemask_t *nodes)
 139{
 140        struct zonelist *zl;
 141        int num, max, nd;
 142        enum zone_type k;
 143
 144        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 145        max++;                  /* space for zlcache_ptr (see mmzone.h) */
 146        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 147        if (!zl)
 148                return ERR_PTR(-ENOMEM);
 149        zl->zlcache_ptr = NULL;
 150        num = 0;
 151        /* First put in the highest zones from all nodes, then all the next 
 152           lower zones etc. Avoid empty zones because the memory allocator
 153           doesn't like them. If you implement node hot removal you
 154           have to fix that. */
 155        k = MAX_NR_ZONES - 1;
 156        while (1) {
 157                for_each_node_mask(nd, *nodes) { 
 158                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
 159                        if (z->present_pages > 0) 
 160                                zl->zones[num++] = z;
 161                }
 162                if (k == 0)
 163                        break;
 164                k--;
 165        }
 166        if (num == 0) {
 167                kfree(zl);
 168                return ERR_PTR(-EINVAL);
 169        }
 170        zl->zones[num] = NULL;
 171        return zl;
 172}
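/*
 * Editor's sketch (not part of mempolicy.c): the ordering bind_zonelist()
 * builds above, modelled as a standalone loop.  For nodes {0,2} and three
 * zone types it prints every node's highest zone first, then the next lower
 * zone, and so on - the "highest zones from all nodes" ordering the comment
 * describes.  Zone indices here are illustrative only.
 */
#include <stdio.h>

static void model_bind_order(const int nodes[], int nnodes, int nr_zones)
{
	int k, i;

	for (k = nr_zones - 1; k >= 0; k--)
		for (i = 0; i < nnodes; i++)
			printf("(node %d, zone %d) ", nodes[i], k);
	printf("\n");
}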
 173
 174/* Create a new policy */
 175static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 176{
 177        struct mempolicy *policy;
 178
 179        pr_debug("setting mode %d nodes[0] %lx\n",
 180                 mode, nodes ? nodes_addr(*nodes)[0] : -1);
 181
 182        if (mode == MPOL_DEFAULT)
 183                return NULL;
 184        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 185        if (!policy)
 186                return ERR_PTR(-ENOMEM);
 187        atomic_set(&policy->refcnt, 1);
 188        switch (mode) {
 189        case MPOL_INTERLEAVE:
 190                policy->v.nodes = *nodes;
 191                nodes_and(policy->v.nodes, policy->v.nodes,
 192                                        node_states[N_HIGH_MEMORY]);
 193                if (nodes_weight(policy->v.nodes) == 0) {
 194                        kmem_cache_free(policy_cache, policy);
 195                        return ERR_PTR(-EINVAL);
 196                }
 197                break;
 198        case MPOL_PREFERRED:
 199                policy->v.preferred_node = first_node(*nodes);
 200                if (policy->v.preferred_node >= MAX_NUMNODES)
 201                        policy->v.preferred_node = -1;
 202                break;
 203        case MPOL_BIND:
 204                policy->v.zonelist = bind_zonelist(nodes);
 205                if (IS_ERR(policy->v.zonelist)) {
 206                        void *error_code = policy->v.zonelist;
 207                        kmem_cache_free(policy_cache, policy);
 208                        return error_code;
 209                }
 210                break;
 211        }
 212        policy->policy = mode;
 213        policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 214        return policy;
 215}
 216
 217static void gather_stats(struct page *, void *, int pte_dirty);
 218static void migrate_page_add(struct page *page, struct list_head *pagelist,
 219                                unsigned long flags);
 220
 221/* Scan through pages checking if pages follow certain conditions. */
 222static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 223                unsigned long addr, unsigned long end,
 224                const nodemask_t *nodes, unsigned long flags,
 225                void *private)
 226{
 227        pte_t *orig_pte;
 228        pte_t *pte;
 229        spinlock_t *ptl;
 230
 231        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 232        do {
 233                struct page *page;
 234                int nid;
 235
 236                if (!pte_present(*pte))
 237                        continue;
 238                page = vm_normal_page(vma, addr, *pte);
 239                if (!page)
 240                        continue;
 241                /*
 242                 * The check for PageReserved here is important to avoid
 243                 * handling zero pages and other pages that may have been
 244                 * marked special by the system.
 245                 *
 246                 * If the PageReserved would not be checked here then f.e.
 247                 * the location of the zero page could have an influence
 248                 * on MPOL_MF_STRICT, zero pages would be counted for
 249                 * the per node stats, and there would be useless attempts
 250                 * to put zero pages on the migration list.
 251                 */
 252                if (PageReserved(page))
 253                        continue;
 254                nid = page_to_nid(page);
 255                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 256                        continue;
 257
 258                if (flags & MPOL_MF_STATS)
 259                        gather_stats(page, private, pte_dirty(*pte));
 260                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 261                        migrate_page_add(page, private, flags);
 262                else
 263                        break;
 264        } while (pte++, addr += PAGE_SIZE, addr != end);
 265        pte_unmap_unlock(orig_pte, ptl);
 266        return addr != end;
 267}
 268
 269static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 270                unsigned long addr, unsigned long end,
 271                const nodemask_t *nodes, unsigned long flags,
 272                void *private)
 273{
 274        pmd_t *pmd;
 275        unsigned long next;
 276
 277        pmd = pmd_offset(pud, addr);
 278        do {
 279                next = pmd_addr_end(addr, end);
 280                if (pmd_none_or_clear_bad(pmd))
 281                        continue;
 282                if (check_pte_range(vma, pmd, addr, next, nodes,
 283                                    flags, private))
 284                        return -EIO;
 285        } while (pmd++, addr = next, addr != end);
 286        return 0;
 287}
 288
 289static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 290                unsigned long addr, unsigned long end,
 291                const nodemask_t *nodes, unsigned long flags,
 292                void *private)
 293{
 294        pud_t *pud;
 295        unsigned long next;
 296
 297        pud = pud_offset(pgd, addr);
 298        do {
 299                next = pud_addr_end(addr, end);
 300                if (pud_none_or_clear_bad(pud))
 301                        continue;
 302                if (check_pmd_range(vma, pud, addr, next, nodes,
 303                                    flags, private))
 304                        return -EIO;
 305        } while (pud++, addr = next, addr != end);
 306        return 0;
 307}
 308
 309static inline int check_pgd_range(struct vm_area_struct *vma,
 310                unsigned long addr, unsigned long end,
 311                const nodemask_t *nodes, unsigned long flags,
 312                void *private)
 313{
 314        pgd_t *pgd;
 315        unsigned long next;
 316
 317        pgd = pgd_offset(vma->vm_mm, addr);
 318        do {
 319                next = pgd_addr_end(addr, end);
 320                if (pgd_none_or_clear_bad(pgd))
 321                        continue;
 322                if (check_pud_range(vma, pgd, addr, next, nodes,
 323                                    flags, private))
 324                        return -EIO;
 325        } while (pgd++, addr = next, addr != end);
 326        return 0;
 327}
 328
 329/*
 330 * Check if all pages in a range are on a set of nodes.
 331 * If pagelist != NULL then isolate pages from the LRU and
 332 * put them on the pagelist.
 333 */
 334static struct vm_area_struct *
 335check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 336                const nodemask_t *nodes, unsigned long flags, void *private)
 337{
 338        int err;
 339        struct vm_area_struct *first, *vma, *prev;
 340
 341        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 342
 343                err = migrate_prep();
 344                if (err)
 345                        return ERR_PTR(err);
 346        }
 347
 348        first = find_vma(mm, start);
 349        if (!first)
 350                return ERR_PTR(-EFAULT);
 351        prev = NULL;
 352        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 353                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 354                        if (!vma->vm_next && vma->vm_end < end)
 355                                return ERR_PTR(-EFAULT);
 356                        if (prev && prev->vm_end < vma->vm_start)
 357                                return ERR_PTR(-EFAULT);
 358                }
 359                if (!is_vm_hugetlb_page(vma) &&
 360                    ((flags & MPOL_MF_STRICT) ||
 361                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 362                                vma_migratable(vma)))) {
 363                        unsigned long endvma = vma->vm_end;
 364
 365                        if (endvma > end)
 366                                endvma = end;
 367                        if (vma->vm_start > start)
 368                                start = vma->vm_start;
 369                        err = check_pgd_range(vma, start, endvma, nodes,
 370                                                flags, private);
 371                        if (err) {
 372                                first = ERR_PTR(err);
 373                                break;
 374                        }
 375                }
 376                prev = vma;
 377        }
 378        return first;
 379}
 380
 381/* Apply policy to a single VMA */
 382static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 383{
 384        int err = 0;
 385        struct mempolicy *old = vma->vm_policy;
 386
 387        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 388                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 389                 vma->vm_ops, vma->vm_file,
 390                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 391
 392        if (vma->vm_ops && vma->vm_ops->set_policy)
 393                err = vma->vm_ops->set_policy(vma, new);
 394        if (!err) {
 395                mpol_get(new);
 396                vma->vm_policy = new;
 397                mpol_free(old);
 398        }
 399        return err;
 400}
 401
 402/* Step 2: apply policy to a range and do splits. */
 403static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 404                       unsigned long end, struct mempolicy *new)
 405{
 406        struct vm_area_struct *next;
 407        int err;
 408
 409        err = 0;
 410        for (; vma && vma->vm_start < end; vma = next) {
 411                next = vma->vm_next;
 412                if (vma->vm_start < start)
 413                        err = split_vma(vma->vm_mm, vma, start, 1);
 414                if (!err && vma->vm_end > end)
 415                        err = split_vma(vma->vm_mm, vma, end, 0);
 416                if (!err)
 417                        err = policy_vma(vma, new);
 418                if (err)
 419                        break;
 420        }
 421        return err;
 422}
 423
 424static int contextualize_policy(int mode, nodemask_t *nodes)
 425{
 426        if (!nodes)
 427                return 0;
 428
 429        cpuset_update_task_memory_state();
 430        if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 431                return -EINVAL;
 432        return mpol_check_policy(mode, nodes);
 433}
 434
 435
 436/*
 437 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 438 * mempolicy.  Allows more rapid checking of this (combined perhaps
 439 * with other PF_* flag bits) on memory allocation hot code paths.
 440 *
 441 * If called from outside this file, the task 'p' should -only- be
 442 * a newly forked child not yet visible on the task list, because
 443 * manipulating the task flags of a visible task is not safe.
 444 *
 445 * The above limitation is why this routine has the funny name
 446 * mpol_fix_fork_child_flag().
 447 *
 448 * It is also safe to call this with a task pointer of current,
 449 * which the static wrapper mpol_set_task_struct_flag() does,
 450 * for use within this file.
 451 */
 452
 453void mpol_fix_fork_child_flag(struct task_struct *p)
 454{
 455        if (p->mempolicy)
 456                p->flags |= PF_MEMPOLICY;
 457        else
 458                p->flags &= ~PF_MEMPOLICY;
 459}
 460
 461static void mpol_set_task_struct_flag(void)
 462{
 463        mpol_fix_fork_child_flag(current);
 464}
 465
 466/* Set the process memory policy */
 467static long do_set_mempolicy(int mode, nodemask_t *nodes)
 468{
 469        struct mempolicy *new;
 470
 471        if (contextualize_policy(mode, nodes))
 472                return -EINVAL;
 473        new = mpol_new(mode, nodes);
 474        if (IS_ERR(new))
 475                return PTR_ERR(new);
 476        mpol_free(current->mempolicy);
 477        current->mempolicy = new;
 478        mpol_set_task_struct_flag();
 479        if (new && new->policy == MPOL_INTERLEAVE)
 480                current->il_next = first_node(new->v.nodes);
 481        return 0;
 482}
 483
 484/* Fill a zone bitmap for a policy */
 485static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 486{
 487        int i;
 488
 489        nodes_clear(*nodes);
 490        switch (p->policy) {
 491        case MPOL_BIND:
 492                for (i = 0; p->v.zonelist->zones[i]; i++)
 493                        node_set(zone_to_nid(p->v.zonelist->zones[i]),
 494                                *nodes);
 495                break;
 496        case MPOL_DEFAULT:
 497                break;
 498        case MPOL_INTERLEAVE:
 499                *nodes = p->v.nodes;
 500                break;
 501        case MPOL_PREFERRED:
 502                /* or use current node instead of memory_map? */
 503                if (p->v.preferred_node < 0)
 504                        *nodes = node_states[N_HIGH_MEMORY];
 505                else
 506                        node_set(p->v.preferred_node, *nodes);
 507                break;
 508        default:
 509                BUG();
 510        }
 511}
 512
 513static int lookup_node(struct mm_struct *mm, unsigned long addr)
 514{
 515        struct page *p;
 516        int err;
 517
 518        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 519        if (err >= 0) {
 520                err = page_to_nid(p);
 521                put_page(p);
 522        }
 523        return err;
 524}
 525
 526/* Retrieve NUMA policy */
 527static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 528                             unsigned long addr, unsigned long flags)
 529{
 530        int err;
 531        struct mm_struct *mm = current->mm;
 532        struct vm_area_struct *vma = NULL;
 533        struct mempolicy *pol = current->mempolicy;
 534
 535        cpuset_update_task_memory_state();
 536        if (flags &
 537                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 538                return -EINVAL;
 539
 540        if (flags & MPOL_F_MEMS_ALLOWED) {
 541                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 542                        return -EINVAL;
 543                *policy = 0;    /* just so it's initialized */
 544                *nmask  = cpuset_current_mems_allowed;
 545                return 0;
 546        }
 547
 548        if (flags & MPOL_F_ADDR) {
 549                down_read(&mm->mmap_sem);
 550                vma = find_vma_intersection(mm, addr, addr+1);
 551                if (!vma) {
 552                        up_read(&mm->mmap_sem);
 553                        return -EFAULT;
 554                }
 555                if (vma->vm_ops && vma->vm_ops->get_policy)
 556                        pol = vma->vm_ops->get_policy(vma, addr);
 557                else
 558                        pol = vma->vm_policy;
 559        } else if (addr)
 560                return -EINVAL;
 561
 562        if (!pol)
 563                pol = &default_policy;
 564
 565        if (flags & MPOL_F_NODE) {
 566                if (flags & MPOL_F_ADDR) {
 567                        err = lookup_node(mm, addr);
 568                        if (err < 0)
 569                                goto out;
 570                        *policy = err;
 571                } else if (pol == current->mempolicy &&
 572                                pol->policy == MPOL_INTERLEAVE) {
 573                        *policy = current->il_next;
 574                } else {
 575                        err = -EINVAL;
 576                        goto out;
 577                }
 578        } else
 579                *policy = pol->policy;
 580
 581        if (vma) {
 582                up_read(&current->mm->mmap_sem);
 583                vma = NULL;
 584        }
 585
 586        err = 0;
 587        if (nmask)
 588                get_zonemask(pol, nmask);
 589
 590 out:
 591        if (vma)
 592                up_read(&current->mm->mmap_sem);
 593        return err;
 594}
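/*
 * Editor's sketch (not part of mempolicy.c): the flag combinations accepted
 * by do_get_mempolicy() above, as seen from userspace via the get_mempolicy()
 * wrapper in libnuma's <numaif.h>.  'addr' is assumed to be a valid,
 * faulted-in address in the caller's address space, and the mask buffer is
 * assumed large enough for the kernel's MAX_NUMNODES.
 */
#include <numaif.h>
#include <stdio.h>

static void query_policy(void *addr)
{
	int mode, node;
	unsigned long mask[16] = { 0 };          /* room for up to 1024 node bits */
	unsigned long maxnode = sizeof(mask) * 8;

	/* Process policy and its nodemask. */
	if (get_mempolicy(&mode, mask, maxnode, NULL, 0) == 0)
		printf("process policy %d, first mask word %lx\n", mode, mask[0]);

	/* Which node currently backs the page at 'addr'? */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
		printf("page at %p is on node %d\n", addr, node);

	/* Nodes this task may allocate from (cpuset mems_allowed). */
	if (get_mempolicy(NULL, mask, maxnode, NULL, MPOL_F_MEMS_ALLOWED) == 0)
		printf("mems_allowed first word %lx\n", mask[0]);
}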
 595
 596#ifdef CONFIG_MIGRATION
 597/*
 598 * page migration
 599 */
 600static void migrate_page_add(struct page *page, struct list_head *pagelist,
 601                                unsigned long flags)
 602{
 603        /*
 604         * Avoid migrating a page that is shared with others.
 605         */
 606        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 607                isolate_lru_page(page, pagelist);
 608}
 609
 610static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 611{
 612        return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 613}
 614
 615/*
 616 * Migrate pages from one node to a target node.
 617 * Returns error or the number of pages not migrated.
 618 */
 619static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 620                           int flags)
 621{
 622        nodemask_t nmask;
 623        LIST_HEAD(pagelist);
 624        int err = 0;
 625
 626        nodes_clear(nmask);
 627        node_set(source, nmask);
 628
 629        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 630                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 631
 632        if (!list_empty(&pagelist))
 633                err = migrate_pages(&pagelist, new_node_page, dest);
 634
 635        return err;
 636}
 637
 638/*
 639 * Move pages between the two nodesets so as to preserve the physical
 640 * layout as much as possible.
 641 *
  642 * Returns the number of pages that could not be moved.
 643 */
 644int do_migrate_pages(struct mm_struct *mm,
 645        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 646{
 647        LIST_HEAD(pagelist);
 648        int busy = 0;
 649        int err = 0;
 650        nodemask_t tmp;
 651
 652        down_read(&mm->mmap_sem);
 653
 654        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 655        if (err)
 656                goto out;
 657
 658/*
 659 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 660 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 661 * bit in 'tmp', and return that <source, dest> pair for migration.
 662 * The pair of nodemasks 'to' and 'from' define the map.
 663 *
 664 * If no pair of bits is found that way, fallback to picking some
 665 * pair of 'source' and 'dest' bits that are not the same.  If the
 666 * 'source' and 'dest' bits are the same, this represents a node
 667 * that will be migrating to itself, so no pages need move.
 668 *
 669 * If no bits are left in 'tmp', or if all remaining bits left
 670 * in 'tmp' correspond to the same bit in 'to', return false
 671 * (nothing left to migrate).
 672 *
 673 * This lets us pick a pair of nodes to migrate between, such that
 674 * if possible the dest node is not already occupied by some other
 675 * source node, minimizing the risk of overloading the memory on a
 676 * node that would happen if we migrated incoming memory to a node
 677 * before migrating outgoing memory source that same node.
 678 *
 679 * A single scan of tmp is sufficient.  As we go, we remember the
 680 * most recent <s, d> pair that moved (s != d).  If we find a pair
 681 * that not only moved, but what's better, moved to an empty slot
 682 * (d is not set in tmp), then we break out then, with that pair.
  683 * Otherwise when we finish scanning tmp, we at least have the
 684 * most recent <s, d> pair that moved.  If we get all the way through
 685 * the scan of tmp without finding any node that moved, much less
 686 * moved to an empty node, then there is nothing left worth migrating.
 687 */
 688
 689        tmp = *from_nodes;
 690        while (!nodes_empty(tmp)) {
 691                int s,d;
 692                int source = -1;
 693                int dest = 0;
 694
 695                for_each_node_mask(s, tmp) {
 696                        d = node_remap(s, *from_nodes, *to_nodes);
 697                        if (s == d)
 698                                continue;
 699
 700                        source = s;     /* Node moved. Memorize */
 701                        dest = d;
 702
 703                        /* dest not in remaining from nodes? */
 704                        if (!node_isset(dest, tmp))
 705                                break;
 706                }
 707                if (source == -1)
 708                        break;
 709
 710                node_clear(source, tmp);
 711                err = migrate_to_node(mm, source, dest, flags);
 712                if (err > 0)
 713                        busy += err;
 714                if (err < 0)
 715                        break;
 716        }
 717out:
 718        up_read(&mm->mmap_sem);
 719        if (err < 0)
 720                return err;
 721        return busy;
 722
 723}
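/*
 * Editor's sketch (not part of mempolicy.c): a simplified userspace model of
 * the <source, dest> pair selection loop above, using a single unsigned long
 * as the node mask and a caller-supplied remap[] table in place of
 * node_remap().  It only illustrates the ordering argument made in the big
 * comment: prefer a source whose destination is no longer pending, so we do
 * not pile incoming pages onto a node that still has pages to move out.
 */
#include <stdio.h>

static void model_migrate(unsigned long from, const int remap[], int nbits)
{
	unsigned long tmp = from;

	while (tmp) {
		int s, source = -1, dest = 0;

		for (s = 0; s < nbits; s++) {
			int d;

			if (!(tmp & (1UL << s)))
				continue;
			d = remap[s];
			if (s == d)
				continue;	/* would move to itself: nothing to do */

			source = s;		/* remember most recent moving pair */
			dest = d;
			if (!(tmp & (1UL << d)))
				break;		/* dest already drained: best pick */
		}
		if (source == -1)
			break;			/* nothing left worth migrating */

		tmp &= ~(1UL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
}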
 724
 725/*
 726 * Allocate a new page for page migration based on vma policy.
 727 * Start assuming that page is mapped by vma pointed to by @private.
 728 * Search forward from there, if not.  N.B., this assumes that the
 729 * list of pages handed to migrate_pages()--which is how we get here--
 730 * is in virtual address order.
 731 */
 732static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 733{
 734        struct vm_area_struct *vma = (struct vm_area_struct *)private;
 735        unsigned long uninitialized_var(address);
 736
 737        while (vma) {
 738                address = page_address_in_vma(page, vma);
 739                if (address != -EFAULT)
 740                        break;
 741                vma = vma->vm_next;
 742        }
 743
 744        /*
 745         * if !vma, alloc_page_vma() will use task or system default policy
 746         */
 747        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 748}
 749#else
 750
 751static void migrate_page_add(struct page *page, struct list_head *pagelist,
 752                                unsigned long flags)
 753{
 754}
 755
 756int do_migrate_pages(struct mm_struct *mm,
 757        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 758{
 759        return -ENOSYS;
 760}
 761
 762static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 763{
 764        return NULL;
 765}
 766#endif
 767
 768static long do_mbind(unsigned long start, unsigned long len,
 769                     unsigned long mode, nodemask_t *nmask,
 770                     unsigned long flags)
 771{
 772        struct vm_area_struct *vma;
 773        struct mm_struct *mm = current->mm;
 774        struct mempolicy *new;
 775        unsigned long end;
 776        int err;
 777        LIST_HEAD(pagelist);
 778
 779        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 780                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 781            || mode > MPOL_MAX)
 782                return -EINVAL;
 783        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 784                return -EPERM;
 785
 786        if (start & ~PAGE_MASK)
 787                return -EINVAL;
 788
 789        if (mode == MPOL_DEFAULT)
 790                flags &= ~MPOL_MF_STRICT;
 791
 792        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 793        end = start + len;
 794
 795        if (end < start)
 796                return -EINVAL;
 797        if (end == start)
 798                return 0;
 799
 800        if (mpol_check_policy(mode, nmask))
 801                return -EINVAL;
 802
 803        new = mpol_new(mode, nmask);
 804        if (IS_ERR(new))
 805                return PTR_ERR(new);
 806
 807        /*
 808         * If we are using the default policy then operation
 809         * on discontinuous address spaces is okay after all
 810         */
 811        if (!new)
 812                flags |= MPOL_MF_DISCONTIG_OK;
 813
 814        pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 815                 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 816
 817        down_write(&mm->mmap_sem);
 818        vma = check_range(mm, start, end, nmask,
 819                          flags | MPOL_MF_INVERT, &pagelist);
 820
 821        err = PTR_ERR(vma);
 822        if (!IS_ERR(vma)) {
 823                int nr_failed = 0;
 824
 825                err = mbind_range(vma, start, end, new);
 826
 827                if (!list_empty(&pagelist))
 828                        nr_failed = migrate_pages(&pagelist, new_vma_page,
 829                                                (unsigned long)vma);
 830
 831                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 832                        err = -EIO;
 833        }
 834
 835        up_write(&mm->mmap_sem);
 836        mpol_free(new);
 837        return err;
 838}
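/*
 * Editor's sketch (not part of mempolicy.c): exercising the migration path of
 * do_mbind() above from userspace - rebind an existing mapping to node 0 and
 * ask for misplaced pages to be moved, failing strictly if some cannot be.
 * 'buf' and 'len' are assumed to describe an existing private mapping; the
 * mbind() wrapper and MPOL_MF_* flags come from libnuma's <numaif.h>.
 */
#include <numaif.h>
#include <stdio.h>

static void rebind_and_move(void *buf, unsigned long len)
{
	unsigned long mask = 1UL << 0;	/* node 0 */

	if (mbind(buf, len, MPOL_BIND, &mask, sizeof(mask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind(MPOL_MF_MOVE | MPOL_MF_STRICT)");
}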
 839
 840/*
 841 * User space interface with variable sized bitmaps for nodelists.
 842 */
 843
 844/* Copy a node mask from user space. */
 845static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 846                     unsigned long maxnode)
 847{
 848        unsigned long k;
 849        unsigned long nlongs;
 850        unsigned long endmask;
 851
 852        --maxnode;
 853        nodes_clear(*nodes);
 854        if (maxnode == 0 || !nmask)
 855                return 0;
 856        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 857                return -EINVAL;
 858
 859        nlongs = BITS_TO_LONGS(maxnode);
 860        if ((maxnode % BITS_PER_LONG) == 0)
 861                endmask = ~0UL;
 862        else
 863                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 864
  865        /* When the user specified more nodes than supported, just check
  866           that the unsupported part is all zero. */
 867        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 868                if (nlongs > PAGE_SIZE/sizeof(long))
 869                        return -EINVAL;
 870                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 871                        unsigned long t;
 872                        if (get_user(t, nmask + k))
 873                                return -EFAULT;
 874                        if (k == nlongs - 1) {
 875                                if (t & endmask)
 876                                        return -EINVAL;
 877                        } else if (t)
 878                                return -EINVAL;
 879                }
 880                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 881                endmask = ~0UL;
 882        }
 883
 884        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 885                return -EFAULT;
 886        nodes_addr(*nodes)[nlongs-1] &= endmask;
 887        return 0;
 888}
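/*
 * Editor's sketch (not part of mempolicy.c): the endmask computed in
 * get_nodes() above keeps only the bits of the final word that lie below
 * maxnode (after the initial decrement), so a caller cannot sneak in node
 * bits at or beyond the size it declared.  A standalone model, taking the
 * already-decremented bit count as get_nodes() does:
 */
static unsigned long model_endmask(unsigned long maxnode)
{
	const unsigned long bits_per_long = 8 * sizeof(unsigned long);

	if ((maxnode % bits_per_long) == 0)
		return ~0UL;			/* final word used in full */
	return (1UL << (maxnode % bits_per_long)) - 1;
}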
 889
 890/* Copy a kernel node mask to user space */
 891static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 892                              nodemask_t *nodes)
 893{
 894        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 895        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 896
 897        if (copy > nbytes) {
 898                if (copy > PAGE_SIZE)
 899                        return -EINVAL;
 900                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 901                        return -EFAULT;
 902                copy = nbytes;
 903        }
 904        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 905}
 906
 907asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 908                        unsigned long mode,
 909                        unsigned long __user *nmask, unsigned long maxnode,
 910                        unsigned flags)
 911{
 912        nodemask_t nodes;
 913        int err;
 914
 915        err = get_nodes(&nodes, nmask, maxnode);
 916        if (err)
 917                return err;
 918#ifdef CONFIG_CPUSETS
 919        /* Restrict the nodes to the allowed nodes in the cpuset */
 920        nodes_and(nodes, nodes, current->mems_allowed);
 921#endif
 922        return do_mbind(start, len, mode, &nodes, flags);
 923}
 924
 925/* Set the process memory policy */
 926asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 927                unsigned long maxnode)
 928{
 929        int err;
 930        nodemask_t nodes;
 931
 932        if (mode < 0 || mode > MPOL_MAX)
 933                return -EINVAL;
 934        err = get_nodes(&nodes, nmask, maxnode);
 935        if (err)
 936                return err;
 937        return do_set_mempolicy(mode, &nodes);
 938}
 939
 940asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 941                const unsigned long __user *old_nodes,
 942                const unsigned long __user *new_nodes)
 943{
 944        struct mm_struct *mm;
 945        struct task_struct *task;
 946        nodemask_t old;
 947        nodemask_t new;
 948        nodemask_t task_nodes;
 949        int err;
 950
 951        err = get_nodes(&old, old_nodes, maxnode);
 952        if (err)
 953                return err;
 954
 955        err = get_nodes(&new, new_nodes, maxnode);
 956        if (err)
 957                return err;
 958
 959        /* Find the mm_struct */
 960        read_lock(&tasklist_lock);
 961        task = pid ? find_task_by_vpid(pid) : current;
 962        if (!task) {
 963                read_unlock(&tasklist_lock);
 964                return -ESRCH;
 965        }
 966        mm = get_task_mm(task);
 967        read_unlock(&tasklist_lock);
 968
 969        if (!mm)
 970                return -EINVAL;
 971
 972        /*
 973         * Check if this process has the right to modify the specified
 974         * process. The right exists if the process has administrative
 975         * capabilities, superuser privileges or the same
 976         * userid as the target process.
 977         */
 978        if ((current->euid != task->suid) && (current->euid != task->uid) &&
 979            (current->uid != task->suid) && (current->uid != task->uid) &&
 980            !capable(CAP_SYS_NICE)) {
 981                err = -EPERM;
 982                goto out;
 983        }
 984
 985        task_nodes = cpuset_mems_allowed(task);
 986        /* Is the user allowed to access the target nodes? */
 987        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 988                err = -EPERM;
 989                goto out;
 990        }
 991
 992        if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
 993                err = -EINVAL;
 994                goto out;
 995        }
 996
 997        err = security_task_movememory(task);
 998        if (err)
 999                goto out;
1000
1001        err = do_migrate_pages(mm, &old, &new,
1002                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1003out:
1004        mmput(mm);
1005        return err;
1006}
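/*
 * Editor's sketch (not part of mempolicy.c): moving another process's pages
 * from node 0 to node 1 with the migrate_pages() wrapper from libnuma's
 * <numaif.h>, which ends up in sys_migrate_pages() above and is subject to
 * the uid/capability and cpuset checks shown there.  'pid' is assumed to be
 * a process owned by the caller.
 */
#include <sys/types.h>
#include <numaif.h>
#include <stdio.h>

static void move_everything(pid_t pid)
{
	unsigned long old_nodes = 1UL << 0;	/* from node 0 */
	unsigned long new_nodes = 1UL << 1;	/* to node 1   */
	long left;

	left = migrate_pages(pid, sizeof(old_nodes) * 8, &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else if (left > 0)
		printf("%ld pages could not be moved\n", left);
}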
1007
1008
1009/* Retrieve NUMA policy */
1010asmlinkage long sys_get_mempolicy(int __user *policy,
1011                                unsigned long __user *nmask,
1012                                unsigned long maxnode,
1013                                unsigned long addr, unsigned long flags)
1014{
1015        int err;
1016        int uninitialized_var(pval);
1017        nodemask_t nodes;
1018
1019        if (nmask != NULL && maxnode < MAX_NUMNODES)
1020                return -EINVAL;
1021
1022        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1023
1024        if (err)
1025                return err;
1026
1027        if (policy && put_user(pval, policy))
1028                return -EFAULT;
1029
1030        if (nmask)
1031                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1032
1033        return err;
1034}
1035
1036#ifdef CONFIG_COMPAT
1037
1038asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1039                                     compat_ulong_t __user *nmask,
1040                                     compat_ulong_t maxnode,
1041                                     compat_ulong_t addr, compat_ulong_t flags)
1042{
1043        long err;
1044        unsigned long __user *nm = NULL;
1045        unsigned long nr_bits, alloc_size;
1046        DECLARE_BITMAP(bm, MAX_NUMNODES);
1047
1048        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1049        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1050
1051        if (nmask)
1052                nm = compat_alloc_user_space(alloc_size);
1053
1054        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1055
1056        if (!err && nmask) {
1057                err = copy_from_user(bm, nm, alloc_size);
1058                /* ensure entire bitmap is zeroed */
1059                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1060                err |= compat_put_bitmap(nmask, bm, nr_bits);
1061        }
1062
1063        return err;
1064}
1065
1066asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1067                                     compat_ulong_t maxnode)
1068{
1069        long err = 0;
1070        unsigned long __user *nm = NULL;
1071        unsigned long nr_bits, alloc_size;
1072        DECLARE_BITMAP(bm, MAX_NUMNODES);
1073
1074        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1075        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1076
1077        if (nmask) {
1078                err = compat_get_bitmap(bm, nmask, nr_bits);
1079                nm = compat_alloc_user_space(alloc_size);
1080                err |= copy_to_user(nm, bm, alloc_size);
1081        }
1082
1083        if (err)
1084                return -EFAULT;
1085
1086        return sys_set_mempolicy(mode, nm, nr_bits+1);
1087}
1088
1089asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1090                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1091                             compat_ulong_t maxnode, compat_ulong_t flags)
1092{
1093        long err = 0;
1094        unsigned long __user *nm = NULL;
1095        unsigned long nr_bits, alloc_size;
1096        nodemask_t bm;
1097
1098        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1099        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1100
1101        if (nmask) {
1102                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1103                nm = compat_alloc_user_space(alloc_size);
1104                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1105        }
1106
1107        if (err)
1108                return -EFAULT;
1109
1110        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1111}
1112
1113#endif
1114
1115/*
1116 * get_vma_policy(@task, @vma, @addr)
1117 * @task - task for fallback if vma policy == default
1118 * @vma   - virtual memory area whose policy is sought
1119 * @addr  - address in @vma for shared policy lookup
1120 *
1121 * Returns effective policy for a VMA at specified address.
1122 * Falls back to @task or system default policy, as necessary.
1123 * Returned policy has extra reference count if shared, vma,
1124 * or some other task's policy [show_numa_maps() can pass
1125 * @task != current].  It is the caller's responsibility to
1126 * free the reference in these cases.
1127 */
1128static struct mempolicy * get_vma_policy(struct task_struct *task,
1129                struct vm_area_struct *vma, unsigned long addr)
1130{
1131        struct mempolicy *pol = task->mempolicy;
1132        int shared_pol = 0;
1133
1134        if (vma) {
1135                if (vma->vm_ops && vma->vm_ops->get_policy) {
1136                        pol = vma->vm_ops->get_policy(vma, addr);
1137                        shared_pol = 1; /* if pol non-NULL, add ref below */
1138                } else if (vma->vm_policy &&
1139                                vma->vm_policy->policy != MPOL_DEFAULT)
1140                        pol = vma->vm_policy;
1141        }
1142        if (!pol)
1143                pol = &default_policy;
1144        else if (!shared_pol && pol != current->mempolicy)
1145                mpol_get(pol);  /* vma or other task's policy */
1146        return pol;
1147}
1148
1149/* Return a zonelist representing a mempolicy */
1150static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1151{
1152        int nd;
1153
1154        switch (policy->policy) {
1155        case MPOL_PREFERRED:
1156                nd = policy->v.preferred_node;
1157                if (nd < 0)
1158                        nd = numa_node_id();
1159                break;
1160        case MPOL_BIND:
1161                /* Lower zones don't get a policy applied */
1162                /* Careful: current->mems_allowed might have moved */
1163                if (gfp_zone(gfp) >= policy_zone)
1164                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1165                                return policy->v.zonelist;
1166                /*FALL THROUGH*/
1167        case MPOL_INTERLEAVE: /* should not happen */
1168        case MPOL_DEFAULT:
1169                nd = numa_node_id();
1170                break;
1171        default:
1172                nd = 0;
1173                BUG();
1174        }
1175        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1176}
1177
1178/* Do dynamic interleaving for a process */
1179static unsigned interleave_nodes(struct mempolicy *policy)
1180{
1181        unsigned nid, next;
1182        struct task_struct *me = current;
1183
1184        nid = me->il_next;
1185        next = next_node(nid, policy->v.nodes);
1186        if (next >= MAX_NUMNODES)
1187                next = first_node(policy->v.nodes);
1188        me->il_next = next;
1189        return nid;
1190}
1191
1192/*
1193 * Depending on the memory policy provide a node from which to allocate the
1194 * next slab entry.
1195 */
1196unsigned slab_node(struct mempolicy *policy)
1197{
1198        int pol = policy ? policy->policy : MPOL_DEFAULT;
1199
1200        switch (pol) {
1201        case MPOL_INTERLEAVE:
1202                return interleave_nodes(policy);
1203
1204        case MPOL_BIND:
1205                /*
1206                 * Follow bind policy behavior and start allocation at the
1207                 * first node.
1208                 */
1209                return zone_to_nid(policy->v.zonelist->zones[0]);
1210
1211        case MPOL_PREFERRED:
1212                if (policy->v.preferred_node >= 0)
1213                        return policy->v.preferred_node;
1214                /* Fall through */
1215
1216        default:
1217                return numa_node_id();
1218        }
1219}
1220
1221/* Do static interleaving for a VMA with known offset. */
1222static unsigned offset_il_node(struct mempolicy *pol,
1223                struct vm_area_struct *vma, unsigned long off)
1224{
1225        unsigned nnodes = nodes_weight(pol->v.nodes);
1226        unsigned target = (unsigned)off % nnodes;
1227        int c;
1228        int nid = -1;
1229
1230        c = 0;
1231        do {
1232                nid = next_node(nid, pol->v.nodes);
1233                c++;
1234        } while (c <= target);
1235        return nid;
1236}
1237
1238/* Determine a node number for interleave */
1239static inline unsigned interleave_nid(struct mempolicy *pol,
1240                 struct vm_area_struct *vma, unsigned long addr, int shift)
1241{
1242        if (vma) {
1243                unsigned long off;
1244
1245                /*
1246                 * for small pages, there is no difference between
1247                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1248                 * for huge pages, since vm_pgoff is in units of small
1249                 * pages, we need to shift off the always 0 bits to get
1250                 * a useful offset.
1251                 */
1252                BUG_ON(shift < PAGE_SHIFT);
1253                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1254                off += (addr - vma->vm_start) >> shift;
1255                return offset_il_node(pol, vma, off);
1256        } else
1257                return interleave_nodes(pol);
1258}
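/*
 * Editor's sketch (not part of mempolicy.c): offset_il_node() above selects
 * the (off % weight)-th set bit of the interleave nodemask.  A standalone
 * model over a single unsigned long mask; __builtin_popcountl stands in for
 * nodes_weight() and the inner loop for next_node().  The mask is assumed
 * to be non-empty, as MPOL_INTERLEAVE guarantees for the real nodemask.
 */
static int model_offset_il_node(unsigned long mask, unsigned long off)
{
	unsigned int target = off % (unsigned int)__builtin_popcountl(mask);
	unsigned int c = 0;
	int nid = -1;

	do {
		do {
			nid++;
		} while (!(mask & (1UL << nid)));	/* next_node() */
		c++;
	} while (c <= target);
	return nid;
}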
1259
1260#ifdef CONFIG_HUGETLBFS
1261/*
1262 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1263 * @vma = virtual memory area whose policy is sought
1264 * @addr = address in @vma for shared policy lookup and interleave policy
1265 * @gfp_flags = for requested zone
1266 * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1267 *
1268 * Returns a zonelist suitable for a huge page allocation.
1269 * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1270 * If it is also a policy for which get_vma_policy() returns an extra
1271 * reference, we must hold that reference until after allocation.
1272 * In that case, return policy via @mpol so hugetlb allocation can drop
1273 * the reference.  For non-'BIND referenced policies, we can/do drop the
1274 * reference here, so the caller doesn't need to know about the special case
1275 * for default and current task policy.
1276 */
1277struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1278                                gfp_t gfp_flags, struct mempolicy **mpol)
1279{
1280        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1281        struct zonelist *zl;
1282
1283        *mpol = NULL;           /* probably no unref needed */
1284        if (pol->policy == MPOL_INTERLEAVE) {
1285                unsigned nid;
1286
1287                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1288                __mpol_free(pol);               /* finished with pol */
1289                return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1290        }
1291
1292        zl = zonelist_policy(GFP_HIGHUSER, pol);
1293        if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1294                if (pol->policy != MPOL_BIND)
1295                        __mpol_free(pol);       /* finished with pol */
1296                else
1297                        *mpol = pol;    /* unref needed after allocation */
1298        }
1299        return zl;
1300}
1301#endif
1302
1303/* Allocate a page in interleaved policy.
1304   Own path because it needs to do special accounting. */
1305static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1306                                        unsigned nid)
1307{
1308        struct zonelist *zl;
1309        struct page *page;
1310
1311        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1312        page = __alloc_pages(gfp, order, zl);
1313        if (page && page_zone(page) == zl->zones[0])
1314                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1315        return page;
1316}
1317
1318/**
1319 *      alloc_page_vma  - Allocate a page for a VMA.
1320 *
1321 *      @gfp:
1322 *      %GFP_USER    user allocation.
1323 *      %GFP_KERNEL  kernel allocations,
1324 *      %GFP_HIGHMEM highmem/user allocations,
1325 *      %GFP_FS      allocation should not call back into a file system.
1326 *      %GFP_ATOMIC  don't sleep.
1327 *
1328 *      @vma:  Pointer to VMA or NULL if not available.
1329 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1330 *
1331 *      This function allocates a page from the kernel page pool and applies
1332 *      a NUMA policy associated with the VMA or the current process.
1333 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1334 *      mm_struct of the VMA to prevent it from going away. Should be used for
1335 *      all allocations for pages that will be mapped into
1336 *      user space. Returns NULL when no page can be allocated.
1337 *
 1338 *      Should be called with the mmap_sem of the vma's mm held.
1339 */
1340struct page *
1341alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1342{
1343        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1344        struct zonelist *zl;
1345
1346        cpuset_update_task_memory_state();
1347
1348        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1349                unsigned nid;
1350
1351                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1352                return alloc_page_interleave(gfp, 0, nid);
1353        }
1354        zl = zonelist_policy(gfp, pol);
1355        if (pol != &default_policy && pol != current->mempolicy) {
1356                /*
1357                 * slow path: ref counted policy -- shared or vma
1358                 */
1359                struct page *page =  __alloc_pages(gfp, 0, zl);
1360                __mpol_free(pol);
1361                return page;
1362        }
1363        /*
1364         * fast path:  default or task policy
1365         */
1366        return __alloc_pages(gfp, 0, zl);
1367}
1368
1369/**
1370 *      alloc_pages_current - Allocate pages.
1371 *
1372 *      @gfp:
1373 *              %GFP_USER   user allocation,
1374 *              %GFP_KERNEL kernel allocation,
1375 *              %GFP_HIGHMEM highmem allocation,
1376 *              %GFP_FS     don't call back into a file system.
1377 *              %GFP_ATOMIC don't sleep.
1378 *      @order: Power of two of allocation size in pages. 0 is a single page.
1379 *
1380 *      Allocate a page from the kernel page pool.  When not in
 1381 *      interrupt context, the current process' NUMA policy is applied.
1382 *      Returns NULL when no page can be allocated.
1383 *
1384 *      Don't call cpuset_update_task_memory_state() unless
1385 *      1) it's ok to take cpuset_sem (can WAIT), and
1386 *      2) allocating for current task (not interrupt).
1387 */
1388struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1389{
1390        struct mempolicy *pol = current->mempolicy;
1391
1392        if ((gfp & __GFP_WAIT) && !in_interrupt())
1393                cpuset_update_task_memory_state();
1394        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1395                pol = &default_policy;
1396        if (pol->policy == MPOL_INTERLEAVE)
1397                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1398        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1399}
1400EXPORT_SYMBOL(alloc_pages_current);
1401
1402/*
1403 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 1404 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1405 * with the mems_allowed returned by cpuset_mems_allowed().  This
1406 * keeps mempolicies cpuset relative after its cpuset moves.  See
1407 * further kernel/cpuset.c update_nodemask().
1408 */
1409
1410/* Slow path of a mempolicy copy */
1411struct mempolicy *__mpol_copy(struct mempolicy *old)
1412{
1413        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1414
1415        if (!new)
1416                return ERR_PTR(-ENOMEM);
1417        if (current_cpuset_is_being_rebound()) {
1418                nodemask_t mems = cpuset_mems_allowed(current);
1419                mpol_rebind_policy(old, &mems);
1420        }
1421        *new = *old;
1422        atomic_set(&new->refcnt, 1);
1423        if (new->policy == MPOL_BIND) {
1424                int sz = ksize(old->v.zonelist);
1425                new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1426                if (!new->v.zonelist) {
1427                        kmem_cache_free(policy_cache, new);
1428                        return ERR_PTR(-ENOMEM);
1429                }
1430        }
1431        return new;
1432}
1433
1434/* Slow path of a mempolicy comparison */
1435int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1436{
1437        if (!a || !b)
1438                return 0;
1439        if (a->policy != b->policy)
1440                return 0;
1441        switch (a->policy) {
1442        case MPOL_DEFAULT:
1443                return 1;
1444        case MPOL_INTERLEAVE:
1445                return nodes_equal(a->v.nodes, b->v.nodes);
1446        case MPOL_PREFERRED:
1447                return a->v.preferred_node == b->v.preferred_node;
1448        case MPOL_BIND: {
1449                int i;
1450                for (i = 0; a->v.zonelist->zones[i]; i++)
1451                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1452                                return 0;
1453                return b->v.zonelist->zones[i] == NULL;
1454        }
1455        default:
1456                BUG();
1457                return 0;
1458        }
1459}
1460
1461/* Slow path of a mpol destructor. */
1462void __mpol_free(struct mempolicy *p)
1463{
1464        if (!atomic_dec_and_test(&p->refcnt))
1465                return;
1466        if (p->policy == MPOL_BIND)
1467                kfree(p->v.zonelist);
1468        p->policy = MPOL_DEFAULT;
1469        kmem_cache_free(policy_cache, p);
1470}
1471
1472/*
1473 * Shared memory backing store policy support.
1474 *
1475 * Remember policies even when nobody has shared memory mapped.
1476 * The policies are kept in Red-Black tree linked from the inode.
1477 * They are protected by the sp->lock spinlock, which should be held
1478 * for any accesses to the tree.
1479 */
1480
1481/* lookup first element intersecting start-end */
1482/* Caller holds sp->lock */
1483static struct sp_node *
1484sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1485{
1486        struct rb_node *n = sp->root.rb_node;
1487
1488        while (n) {
1489                struct sp_node *p = rb_entry(n, struct sp_node, nd);
1490
1491                if (start >= p->end)
1492                        n = n->rb_right;
1493                else if (end <= p->start)
1494                        n = n->rb_left;
1495                else
1496                        break;
1497        }
1498        if (!n)
1499                return NULL;
1500        for (;;) {
1501                struct sp_node *w = NULL;
1502                struct rb_node *prev = rb_prev(n);
1503                if (!prev)
1504                        break;
1505                w = rb_entry(prev, struct sp_node, nd);
1506                if (w->end <= start)
1507                        break;
1508                n = prev;
1509        }
1510        return rb_entry(n, struct sp_node, nd);
1511}
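
/*
 * Worked example (illustrative): with ranges [2,5), [5,7) and [7,9) in
 * the tree, sp_lookup(sp, 4, 8) may first descend to any of the three
 * intersecting nodes; the backward walk above then settles on [2,5),
 * the lowest range still intersecting [4,8), which is what is returned.
 */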
1512
1513/* Insert a new shared policy into the tree. */
1514/* Caller holds sp->lock */
1515static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1516{
1517        struct rb_node **p = &sp->root.rb_node;
1518        struct rb_node *parent = NULL;
1519        struct sp_node *nd;
1520
1521        while (*p) {
1522                parent = *p;
1523                nd = rb_entry(parent, struct sp_node, nd);
1524                if (new->start < nd->start)
1525                        p = &(*p)->rb_left;
1526                else if (new->end > nd->end)
1527                        p = &(*p)->rb_right;
1528                else
1529                        BUG();
1530        }
1531        rb_link_node(&new->nd, parent, p);
1532        rb_insert_color(&new->nd, &sp->root);
1533        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1534                 new->policy ? new->policy->policy : 0);
1535}
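
/*
 * The tree only ever holds disjoint ranges, so a new node compares
 * strictly left or right of every node on its search path; hitting the
 * BUG() above means a caller inserted an overlapping range without
 * going through shared_policy_replace().
 */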
1536
1537/* Find shared policy intersecting idx */
1538struct mempolicy *
1539mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1540{
1541        struct mempolicy *pol = NULL;
1542        struct sp_node *sn;
1543
1544        if (!sp->root.rb_node)
1545                return NULL;
1546        spin_lock(&sp->lock);
1547        sn = sp_lookup(sp, idx, idx+1);
1548        if (sn) {
1549                mpol_get(sn->policy);
1550                pol = sn->policy;
1551        }
1552        spin_unlock(&sp->lock);
1553        return pol;
1554}
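
/*
 * Usage sketch (hedged, not part of this file): tmpfs consults the
 * shared tree from its get_policy hook, roughly
 *
 *	pol = mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, idx);
 *
 * where SHMEM_I() is the tmpfs inode helper assumed here.  The lookup
 * takes a reference on the returned policy, so the caller is expected
 * to drop it again (see the unref in show_numa_map() below).
 */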
1555
1556static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1557{
1558        pr_debug("deleting %lx-%lx\n", n->start, n->end);
1559        rb_erase(&n->nd, &sp->root);
1560        mpol_free(n->policy);
1561        kmem_cache_free(sn_cache, n);
1562}
1563
1564static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1565                                struct mempolicy *pol)
1566{
1567        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1568
1569        if (!n)
1570                return NULL;
1571        n->start = start;
1572        n->end = end;
1573        mpol_get(pol);
1574        n->policy = pol;
1575        return n;
1576}
1577
1578/* Replace a policy range. */
1579static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1580                                 unsigned long end, struct sp_node *new)
1581{
1582        struct sp_node *n, *new2 = NULL;
1583
1584restart:
1585        spin_lock(&sp->lock);
1586        n = sp_lookup(sp, start, end);
1587        /* Take care of old policies in the same range. */
1588        while (n && n->start < end) {
1589                struct rb_node *next = rb_next(&n->nd);
1590                if (n->start >= start) {
1591                        if (n->end <= end)
1592                                sp_delete(sp, n);
1593                        else
1594                                n->start = end;
1595                } else {
1596                        /* Old policy spanning whole new range. */
1597                        if (n->end > end) {
1598                                if (!new2) {
1599                                        spin_unlock(&sp->lock);
1600                                        new2 = sp_alloc(end, n->end, n->policy);
1601                                        if (!new2)
1602                                                return -ENOMEM;
1603                                        goto restart;
1604                                }
1605                                n->end = start;
1606                                sp_insert(sp, new2);
1607                                new2 = NULL;
1608                                break;
1609                        } else
1610                                n->end = start;
1611                }
1612                if (!next)
1613                        break;
1614                n = rb_entry(next, struct sp_node, nd);
1615        }
1616        if (new)
1617                sp_insert(sp, new);
1618        spin_unlock(&sp->lock);
1619        if (new2) {
1620                mpol_free(new2->policy);
1621                kmem_cache_free(sn_cache, new2);
1622        }
1623        return 0;
1624}
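
/*
 * Worked example (illustrative): if the tree holds [0,10) with policy A
 * and [3,6) with policy B is installed, the loop above clips the old
 * node down to [0,3), inserts a second node new2 covering [6,10) with
 * policy A, and the final sp_insert() adds [3,6) with policy B, leaving
 * three disjoint ranges.
 */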
1625
1626void mpol_shared_policy_init(struct shared_policy *info, int policy,
1627                                nodemask_t *policy_nodes)
1628{
1629        info->root = RB_ROOT;
1630        spin_lock_init(&info->lock);
1631
1632        if (policy != MPOL_DEFAULT) {
1633                struct mempolicy *newpol;
1634
1635                /* Falls back to MPOL_DEFAULT on any error */
1636                newpol = mpol_new(policy, policy_nodes);
1637                if (!IS_ERR(newpol)) {
1638                        /* Create pseudo-vma that contains just the policy */
1639                        struct vm_area_struct pvma;
1640
1641                        memset(&pvma, 0, sizeof(struct vm_area_struct));
1642                        /* Policy covers entire file */
1643                        pvma.vm_end = TASK_SIZE;
1644                        mpol_set_shared_policy(info, &pvma, newpol);
1645                        mpol_free(newpol);
1646                }
1647        }
1648}
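
/*
 * The pseudo-vma above covers offsets 0 through TASK_SIZE, so the
 * policy installed here effectively covers every page of the object.
 * tmpfs, for instance, uses this when creating inodes on a filesystem
 * mounted with an mpol= option (see mm/shmem.c).
 */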
1649
1650int mpol_set_shared_policy(struct shared_policy *info,
1651                        struct vm_area_struct *vma, struct mempolicy *npol)
1652{
1653        int err;
1654        struct sp_node *new = NULL;
1655        unsigned long sz = vma_pages(vma);
1656
1657        pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1658                 vma->vm_pgoff,
1659                 sz, npol ? npol->policy : -1,
1660                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1661
1662        if (npol) {
1663                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1664                if (!new)
1665                        return -ENOMEM;
1666        }
1667        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1668        if (err && new)
1669                kmem_cache_free(sn_cache, new);
1670        return err;
1671}
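
/*
 * The range stored above is expressed in page offsets into the backing
 * object ([vm_pgoff, vm_pgoff + size)), not in virtual addresses, so
 * the policy follows the object and applies however and wherever it is
 * mapped later.
 */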
1672
1673/* Free a backing policy store on inode delete. */
1674void mpol_free_shared_policy(struct shared_policy *p)
1675{
1676        struct sp_node *n;
1677        struct rb_node *next;
1678
1679        if (!p->root.rb_node)
1680                return;
1681        spin_lock(&p->lock);
1682        next = rb_first(&p->root);
1683        while (next) {
1684                n = rb_entry(next, struct sp_node, nd);
1685                next = rb_next(&n->nd);
1686                rb_erase(&n->nd, &p->root);
1687                mpol_free(n->policy);
1688                kmem_cache_free(sn_cache, n);
1689        }
1690        spin_unlock(&p->lock);
1691}
1692
1693/* assumes fs == KERNEL_DS */
1694void __init numa_policy_init(void)
1695{
1696        nodemask_t interleave_nodes;
1697        unsigned long largest = 0;
1698        int nid, prefer = 0;
1699
1700        policy_cache = kmem_cache_create("numa_policy",
1701                                         sizeof(struct mempolicy),
1702                                         0, SLAB_PANIC, NULL);
1703
1704        sn_cache = kmem_cache_create("shared_policy_node",
1705                                     sizeof(struct sp_node),
1706                                     0, SLAB_PANIC, NULL);
1707
1708        /*
1709         * Set interleaving policy for system init. Interleaving is only
1710         * enabled across suitably sized nodes (default is >= 16MB); we
1711         * fall back to the largest node if they are all smaller.
1712         */
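        /*
         * With 4KB pages (PAGE_SHIFT == 12) the 16MB cut-off below works
         * out to 4096 present pages per node: 4096 << 12 == 16 << 20.
         */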
1713        nodes_clear(interleave_nodes);
1714        for_each_node_state(nid, N_HIGH_MEMORY) {
1715                unsigned long total_pages = node_present_pages(nid);
1716
1717                /* Preserve the largest node */
1718                if (largest < total_pages) {
1719                        largest = total_pages;
1720                        prefer = nid;
1721                }
1722
1723                /* Interleave this node? */
1724                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1725                        node_set(nid, interleave_nodes);
1726        }
1727
1728        /* All too small, use the largest */
1729        if (unlikely(nodes_empty(interleave_nodes)))
1730                node_set(prefer, interleave_nodes);
1731
1732        if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1733                printk(KERN_ERR "numa_policy_init: interleaving failed\n");
1734}
1735
1736/* Reset policy of current process to default */
1737void numa_default_policy(void)
1738{
1739        do_set_mempolicy(MPOL_DEFAULT, NULL);
1740}
1741
1742/* Migrate a policy to a different set of nodes */
1743static void mpol_rebind_policy(struct mempolicy *pol,
1744                               const nodemask_t *newmask)
1745{
1746        nodemask_t *mpolmask;
1747        nodemask_t tmp;
1748
1749        if (!pol)
1750                return;
1751        mpolmask = &pol->cpuset_mems_allowed;
1752        if (nodes_equal(*mpolmask, *newmask))
1753                return;
1754
1755        switch (pol->policy) {
1756        case MPOL_DEFAULT:
1757                break;
1758        case MPOL_INTERLEAVE:
1759                nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1760                pol->v.nodes = tmp;
1761                *mpolmask = *newmask;
1762                current->il_next = node_remap(current->il_next,
1763                                                *mpolmask, *newmask);
1764                break;
1765        case MPOL_PREFERRED:
1766                pol->v.preferred_node = node_remap(pol->v.preferred_node,
1767                                                *mpolmask, *newmask);
1768                *mpolmask = *newmask;
1769                break;
1770        case MPOL_BIND: {
1771                nodemask_t nodes;
1772                struct zone **z;
1773                struct zonelist *zonelist;
1774
1775                nodes_clear(nodes);
1776                for (z = pol->v.zonelist->zones; *z; z++)
1777                        node_set(zone_to_nid(*z), nodes);
1778                nodes_remap(tmp, nodes, *mpolmask, *newmask);
1779                nodes = tmp;
1780
1781                zonelist = bind_zonelist(&nodes);
1782
1783                /* If no mem, bind_zonelist() fails and we keep the old zonelist.
1784                 * If that old zonelist has no remaining mems_allowed nodes,
1785                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1786                 */
1787
1788                if (!IS_ERR(zonelist)) {
1789                        /* Good - got mem - substitute new zonelist */
1790                        kfree(pol->v.zonelist);
1791                        pol->v.zonelist = zonelist;
1792                }
1793                *mpolmask = *newmask;
1794                break;
1795        }
1796        default:
1797                BUG();
1798                break;
1799        }
1800}
1801
1802/*
1803 * Wrapper for mpol_rebind_policy() that just requires a task
1804 * pointer, and updates the task's mempolicy.
1805 */
1806
1807void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1808{
1809        mpol_rebind_policy(tsk->mempolicy, new);
1810}
1811
1812/*
1813 * Rebind each vma in mm to new nodemask.
1814 *
1815 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1816 */
1817
1818void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1819{
1820        struct vm_area_struct *vma;
1821
1822        down_write(&mm->mmap_sem);
1823        for (vma = mm->mmap; vma; vma = vma->vm_next)
1824                mpol_rebind_policy(vma->vm_policy, new);
1825        up_write(&mm->mmap_sem);
1826}
1827
1828/*
1829 * Display pages allocated per node and memory policy via /proc.
1830 */
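/*
 * A line of /proc/<pid>/numa_maps produced below looks roughly like
 * (illustrative values):
 *
 *	00400000 default file=/bin/cat mapped=3 active=2 N0=2 N1=1
 *
 * i.e. the vma start address, the policy string from mpol_to_str(), an
 * optional file/heap/stack tag, per-vma page counters and finally the
 * number of pages found on each node.
 */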
1831
1832static const char * const policy_types[] =
1833        { "default", "prefer", "bind", "interleave" };
1834
1835/*
1836 * Convert a mempolicy into a string.
1837 * Returns the number of characters in buffer (if positive)
1838 * or an error (negative)
1839 */
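/*
 * Example results (illustrative): "default", "prefer=1", "bind=0,2" and
 * "interleave=0-3", i.e. the policy name followed by "=<nodelist>"
 * whenever the corresponding nodemask is non-empty.
 */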
1840static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1841{
1842        char *p = buffer;
1843        int l;
1844        nodemask_t nodes;
1845        int mode = pol ? pol->policy : MPOL_DEFAULT;
1846
1847        switch (mode) {
1848        case MPOL_DEFAULT:
1849                nodes_clear(nodes);
1850                break;
1851
1852        case MPOL_PREFERRED:
1853                nodes_clear(nodes);
1854                node_set(pol->v.preferred_node, nodes);
1855                break;
1856
1857        case MPOL_BIND:
1858                get_zonemask(pol, &nodes);
1859                break;
1860
1861        case MPOL_INTERLEAVE:
1862                nodes = pol->v.nodes;
1863                break;
1864
1865        default:
1866                BUG();
1867                return -EFAULT;
1868        }
1869
1870        l = strlen(policy_types[mode]);
1871        if (buffer + maxlen < p + l + 1)
1872                return -ENOSPC;
1873
1874        strcpy(p, policy_types[mode]);
1875        p += l;
1876
1877        if (!nodes_empty(nodes)) {
1878                if (buffer + maxlen < p + 2)
1879                        return -ENOSPC;
1880                *p++ = '=';
1881                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1882        }
1883        return p - buffer;
1884}
1885
1886struct numa_maps {
1887        unsigned long pages;
1888        unsigned long anon;
1889        unsigned long active;
1890        unsigned long writeback;
1891        unsigned long mapcount_max;
1892        unsigned long dirty;
1893        unsigned long swapcache;
1894        unsigned long node[MAX_NUMNODES];
1895};
1896
1897static void gather_stats(struct page *page, void *private, int pte_dirty)
1898{
1899        struct numa_maps *md = private;
1900        int count = page_mapcount(page);
1901
1902        md->pages++;
1903        if (pte_dirty || PageDirty(page))
1904                md->dirty++;
1905
1906        if (PageSwapCache(page))
1907                md->swapcache++;
1908
1909        if (PageActive(page))
1910                md->active++;
1911
1912        if (PageWriteback(page))
1913                md->writeback++;
1914
1915        if (PageAnon(page))
1916                md->anon++;
1917
1918        if (count > md->mapcount_max)
1919                md->mapcount_max = count;
1920
1921        md->node[page_to_nid(page)]++;
1922}
1923
1924#ifdef CONFIG_HUGETLB_PAGE
1925static void check_huge_range(struct vm_area_struct *vma,
1926                unsigned long start, unsigned long end,
1927                struct numa_maps *md)
1928{
1929        unsigned long addr;
1930        struct page *page;
1931
1932        for (addr = start; addr < end; addr += HPAGE_SIZE) {
1933                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1934                pte_t pte;
1935
1936                if (!ptep)
1937                        continue;
1938
1939                pte = *ptep;
1940                if (pte_none(pte))
1941                        continue;
1942
1943                page = pte_page(pte);
1944                if (!page)
1945                        continue;
1946
1947                gather_stats(page, md, pte_dirty(*ptep));
1948        }
1949}
1950#else
1951static inline void check_huge_range(struct vm_area_struct *vma,
1952                unsigned long start, unsigned long end,
1953                struct numa_maps *md)
1954{
1955}
1956#endif
1957
1958int show_numa_map(struct seq_file *m, void *v)
1959{
1960        struct proc_maps_private *priv = m->private;
1961        struct vm_area_struct *vma = v;
1962        struct numa_maps *md;
1963        struct file *file = vma->vm_file;
1964        struct mm_struct *mm = vma->vm_mm;
1965        struct mempolicy *pol;
1966        int n;
1967        char buffer[50];
1968
1969        if (!mm)
1970                return 0;
1971
1972        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1973        if (!md)
1974                return 0;
1975
1976        pol = get_vma_policy(priv->task, vma, vma->vm_start);
1977        mpol_to_str(buffer, sizeof(buffer), pol);
1978        /*
1979         * unref shared or other task's mempolicy
1980         */
1981        if (pol != &default_policy && pol != current->mempolicy)
1982                __mpol_free(pol);
1983
1984        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1985
1986        if (file) {
1987                seq_printf(m, " file=");
1988                seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1989        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1990                seq_printf(m, " heap");
1991        } else if (vma->vm_start <= mm->start_stack &&
1992                        vma->vm_end >= mm->start_stack) {
1993                seq_printf(m, " stack");
1994        }
1995
1996        if (is_vm_hugetlb_page(vma)) {
1997                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1998                seq_printf(m, " huge");
1999        } else {
2000                check_pgd_range(vma, vma->vm_start, vma->vm_end,
2001                        &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2002        }
2003
2004        if (!md->pages)
2005                goto out;
2006
2007        if (md->anon)
2008                seq_printf(m, " anon=%lu", md->anon);
2009
2010        if (md->dirty)
2011                seq_printf(m, " dirty=%lu", md->dirty);
2012
2013        if (md->pages != md->anon && md->pages != md->dirty)
2014                seq_printf(m, " mapped=%lu", md->pages);
2015
2016        if (md->mapcount_max > 1)
2017                seq_printf(m, " mapmax=%lu", md->mapcount_max);
2018
2019        if (md->swapcache)
2020                seq_printf(m, " swapcache=%lu", md->swapcache);
2021
2022        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2023                seq_printf(m, " active=%lu", md->active);
2024
2025        if (md->writeback)
2026                seq_printf(m, " writeback=%lu", md->writeback);
2027
2028        for_each_node_state(n, N_HIGH_MEMORY)
2029                if (md->node[n])
2030                        seq_printf(m, " N%d=%lu", n, md->node[n]);
2031out:
2032        seq_putc(m, '\n');
2033        kfree(md);
2034
2035        if (m->count < m->size)
2036                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2037        return 0;
2038}
2039