linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case node -1 here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
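/*
 * Illustrative user-space sketch (not part of this file): the policies
 * described above are installed with set_mempolicy(2) for the process
 * policy and mbind(2) for a VMA policy.  The snippet assumes the
 * <numaif.h> syscall wrappers shipped with libnuma, a single-word
 * nodemask and a machine that has nodes 0 and 1; maxnode is passed as
 * the number of bits in the mask plus one, matching get_nodes() below.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	static void numa_hint_example(void)
 *	{
 *		unsigned long both = (1UL << 0) | (1UL << 1);
 *		unsigned long node0 = 1UL << 0;
 *		size_t len = 1 << 20;
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		set_mempolicy(MPOL_INTERLEAVE, &both, 8 * sizeof(both) + 1);
 *		mbind(p, len, MPOL_BIND, &node0, 8 * sizeof(node0) + 1,
 *		      MPOL_MF_MOVE | MPOL_MF_STRICT);
 *	}
 *
 * The first call interleaves all future allocations of the task over
 * nodes 0 and 1; the second restricts the mapping at p to node 0 and
 * migrates any pages already placed elsewhere, do_mbind() returning
 * -EIO under MPOL_MF_STRICT if some of them cannot be moved.
 */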
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always grateful for that.
  66*/
  67
  68#include <linux/mempolicy.h>
  69#include <linux/mm.h>
  70#include <linux/highmem.h>
  71#include <linux/hugetlb.h>
  72#include <linux/kernel.h>
  73#include <linux/sched.h>
  74#include <linux/nodemask.h>
  75#include <linux/cpuset.h>
  76#include <linux/gfp.h>
  77#include <linux/slab.h>
  78#include <linux/string.h>
  79#include <linux/module.h>
  80#include <linux/nsproxy.h>
  81#include <linux/interrupt.h>
  82#include <linux/init.h>
  83#include <linux/compat.h>
  84#include <linux/swap.h>
  85#include <linux/seq_file.h>
  86#include <linux/proc_fs.h>
  87#include <linux/migrate.h>
  88#include <linux/rmap.h>
  89#include <linux/security.h>
  90#include <linux/syscalls.h>
  91#include <linux/ctype.h>
  92
  93#include <asm/tlbflush.h>
  94#include <asm/uaccess.h>
  95
  96#include "internal.h"
  97
  98/* Internal flags */
  99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 101#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 102
 103static struct kmem_cache *policy_cache;
 104static struct kmem_cache *sn_cache;
 105
  106/* Highest zone. A specific allocation for a zone below that is not
 107   policied. */
 108enum zone_type policy_zone = 0;
 109
 110/*
 111 * run-time system-wide default policy => local allocation
 112 */
 113struct mempolicy default_policy = {
 114        .refcnt = ATOMIC_INIT(1), /* never free it */
 115        .mode = MPOL_PREFERRED,
 116        .flags = MPOL_F_LOCAL,
 117};
 118
 119static const struct mempolicy_operations {
 120        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 121        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 122} mpol_ops[MPOL_MAX];
 123
 124/* Check that the nodemask contains at least one populated zone */
 125static int is_valid_nodemask(const nodemask_t *nodemask)
 126{
 127        int nd, k;
 128
 129        /* Check that there is something useful in this mask */
 130        k = policy_zone;
 131
 132        for_each_node_mask(nd, *nodemask) {
 133                struct zone *z;
 134
 135                for (k = 0; k <= policy_zone; k++) {
 136                        z = &NODE_DATA(nd)->node_zones[k];
 137                        if (z->present_pages > 0)
 138                                return 1;
 139                }
 140        }
 141
 142        return 0;
 143}
 144
 145static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 146{
 147        return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 148}
 149
 150static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 151                                   const nodemask_t *rel)
 152{
 153        nodemask_t tmp;
 154        nodes_fold(tmp, *orig, nodes_weight(*rel));
 155        nodes_onto(*ret, tmp, *rel);
 156}
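/*
 * Worked example (masks chosen for illustration): a relative nodemask
 * of {0,2} applied against mems_allowed = {4,5,6} is first folded
 * modulo nodes_weight(rel) = 3, then mapped onto the set bits of rel:
 *
 *	nodes_fold(tmp, {0,2}, 3)		-> tmp = {0,2}
 *	nodes_onto(ret, {0,2}, {4,5,6})		-> ret = {4,6}
 *
 * i.e. relative node n means "the n-th node this task is allowed to
 * use", wrapping around when n exceeds the number of allowed nodes.
 */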
 157
 158static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 159{
 160        if (nodes_empty(*nodes))
 161                return -EINVAL;
 162        pol->v.nodes = *nodes;
 163        return 0;
 164}
 165
 166static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 167{
 168        if (!nodes)
 169                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 170        else if (nodes_empty(*nodes))
 171                return -EINVAL;                 /*  no allowed nodes */
 172        else
 173                pol->v.preferred_node = first_node(*nodes);
 174        return 0;
 175}
 176
 177static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 178{
 179        if (!is_valid_nodemask(nodes))
 180                return -EINVAL;
 181        pol->v.nodes = *nodes;
 182        return 0;
 183}
 184
 185/*
 186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 187 * any, for the new policy.  mpol_new() has already validated the nodes
 188 * parameter with respect to the policy mode and flags.  But, we need to
 189 * handle an empty nodemask with MPOL_PREFERRED here.
 190 *
 191 * Must be called holding task's alloc_lock to protect task's mems_allowed
 192 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 193 */
 194static int mpol_set_nodemask(struct mempolicy *pol,
 195                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 196{
 197        int ret;
 198
 199        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 200        if (pol == NULL)
 201                return 0;
 202        /* Check N_HIGH_MEMORY */
 203        nodes_and(nsc->mask1,
 204                  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 205
 206        VM_BUG_ON(!nodes);
 207        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 208                nodes = NULL;   /* explicit local allocation */
 209        else {
 210                if (pol->flags & MPOL_F_RELATIVE_NODES)
  211                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 212                else
 213                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 214
 215                if (mpol_store_user_nodemask(pol))
 216                        pol->w.user_nodemask = *nodes;
 217                else
 218                        pol->w.cpuset_mems_allowed =
 219                                                cpuset_current_mems_allowed;
 220        }
 221
 222        if (nodes)
 223                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 224        else
 225                ret = mpol_ops[pol->mode].create(pol, NULL);
 226        return ret;
 227}
 228
 229/*
  230 * This function just creates a new policy, does some checks and simple
 231 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 232 */
 233static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 234                                  nodemask_t *nodes)
 235{
 236        struct mempolicy *policy;
 237
 238        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 239                 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 240
 241        if (mode == MPOL_DEFAULT) {
 242                if (nodes && !nodes_empty(*nodes))
 243                        return ERR_PTR(-EINVAL);
 244                return NULL;    /* simply delete any existing policy */
 245        }
 246        VM_BUG_ON(!nodes);
 247
 248        /*
 249         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 250         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 251         * All other modes require a valid pointer to a non-empty nodemask.
 252         */
 253        if (mode == MPOL_PREFERRED) {
 254                if (nodes_empty(*nodes)) {
 255                        if (((flags & MPOL_F_STATIC_NODES) ||
 256                             (flags & MPOL_F_RELATIVE_NODES)))
 257                                return ERR_PTR(-EINVAL);
 258                }
 259        } else if (nodes_empty(*nodes))
 260                return ERR_PTR(-EINVAL);
 261        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 262        if (!policy)
 263                return ERR_PTR(-ENOMEM);
 264        atomic_set(&policy->refcnt, 1);
 265        policy->mode = mode;
 266        policy->flags = flags;
 267
 268        return policy;
 269}
 270
 271/* Slow path of a mpol destructor. */
 272void __mpol_put(struct mempolicy *p)
 273{
 274        if (!atomic_dec_and_test(&p->refcnt))
 275                return;
 276        kmem_cache_free(policy_cache, p);
 277}
 278
 279static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 280{
 281}
 282
 283static void mpol_rebind_nodemask(struct mempolicy *pol,
 284                                 const nodemask_t *nodes)
 285{
 286        nodemask_t tmp;
 287
 288        if (pol->flags & MPOL_F_STATIC_NODES)
 289                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 290        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 291                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 292        else {
 293                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 294                            *nodes);
 295                pol->w.cpuset_mems_allowed = *nodes;
 296        }
 297
 298        pol->v.nodes = tmp;
 299        if (!node_isset(current->il_next, tmp)) {
 300                current->il_next = next_node(current->il_next, tmp);
 301                if (current->il_next >= MAX_NUMNODES)
 302                        current->il_next = first_node(tmp);
 303                if (current->il_next >= MAX_NUMNODES)
 304                        current->il_next = numa_node_id();
 305        }
 306}
 307
 308static void mpol_rebind_preferred(struct mempolicy *pol,
 309                                  const nodemask_t *nodes)
 310{
 311        nodemask_t tmp;
 312
 313        if (pol->flags & MPOL_F_STATIC_NODES) {
 314                int node = first_node(pol->w.user_nodemask);
 315
 316                if (node_isset(node, *nodes)) {
 317                        pol->v.preferred_node = node;
 318                        pol->flags &= ~MPOL_F_LOCAL;
 319                } else
 320                        pol->flags |= MPOL_F_LOCAL;
 321        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 322                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 323                pol->v.preferred_node = first_node(tmp);
 324        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 325                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 326                                                   pol->w.cpuset_mems_allowed,
 327                                                   *nodes);
 328                pol->w.cpuset_mems_allowed = *nodes;
 329        }
 330}
 331
 332/* Migrate a policy to a different set of nodes */
 333static void mpol_rebind_policy(struct mempolicy *pol,
 334                               const nodemask_t *newmask)
 335{
 336        if (!pol)
 337                return;
 338        if (!mpol_store_user_nodemask(pol) &&
 339            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 340                return;
 341        mpol_ops[pol->mode].rebind(pol, newmask);
 342}
 343
 344/*
 345 * Wrapper for mpol_rebind_policy() that just requires task
 346 * pointer, and updates task mempolicy.
 347 *
 348 * Called with task's alloc_lock held.
 349 */
 350
 351void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 352{
 353        mpol_rebind_policy(tsk->mempolicy, new);
 354}
 355
 356/*
 357 * Rebind each vma in mm to new nodemask.
 358 *
 359 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 360 */
 361
 362void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 363{
 364        struct vm_area_struct *vma;
 365
 366        down_write(&mm->mmap_sem);
 367        for (vma = mm->mmap; vma; vma = vma->vm_next)
 368                mpol_rebind_policy(vma->vm_policy, new);
 369        up_write(&mm->mmap_sem);
 370}
 371
 372static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 373        [MPOL_DEFAULT] = {
 374                .rebind = mpol_rebind_default,
 375        },
 376        [MPOL_INTERLEAVE] = {
 377                .create = mpol_new_interleave,
 378                .rebind = mpol_rebind_nodemask,
 379        },
 380        [MPOL_PREFERRED] = {
 381                .create = mpol_new_preferred,
 382                .rebind = mpol_rebind_preferred,
 383        },
 384        [MPOL_BIND] = {
 385                .create = mpol_new_bind,
 386                .rebind = mpol_rebind_nodemask,
 387        },
 388};
 389
 390static void gather_stats(struct page *, void *, int pte_dirty);
 391static void migrate_page_add(struct page *page, struct list_head *pagelist,
 392                                unsigned long flags);
 393
  394/* Scan a pte range, checking each mapped page against the given nodemask. */
 395static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 396                unsigned long addr, unsigned long end,
 397                const nodemask_t *nodes, unsigned long flags,
 398                void *private)
 399{
 400        pte_t *orig_pte;
 401        pte_t *pte;
 402        spinlock_t *ptl;
 403
 404        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 405        do {
 406                struct page *page;
 407                int nid;
 408
 409                if (!pte_present(*pte))
 410                        continue;
 411                page = vm_normal_page(vma, addr, *pte);
 412                if (!page)
 413                        continue;
 414                /*
 415                 * The check for PageReserved here is important to avoid
 416                 * handling zero pages and other pages that may have been
 417                 * marked special by the system.
 418                 *
  419                 * If PageReserved were not checked here then e.g.
 420                 * the location of the zero page could have an influence
 421                 * on MPOL_MF_STRICT, zero pages would be counted for
 422                 * the per node stats, and there would be useless attempts
 423                 * to put zero pages on the migration list.
 424                 */
 425                if (PageReserved(page))
 426                        continue;
 427                nid = page_to_nid(page);
 428                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 429                        continue;
 430
 431                if (flags & MPOL_MF_STATS)
 432                        gather_stats(page, private, pte_dirty(*pte));
 433                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 434                        migrate_page_add(page, private, flags);
 435                else
 436                        break;
 437        } while (pte++, addr += PAGE_SIZE, addr != end);
 438        pte_unmap_unlock(orig_pte, ptl);
 439        return addr != end;
 440}
 441
 442static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 443                unsigned long addr, unsigned long end,
 444                const nodemask_t *nodes, unsigned long flags,
 445                void *private)
 446{
 447        pmd_t *pmd;
 448        unsigned long next;
 449
 450        pmd = pmd_offset(pud, addr);
 451        do {
 452                next = pmd_addr_end(addr, end);
 453                if (pmd_none_or_clear_bad(pmd))
 454                        continue;
 455                if (check_pte_range(vma, pmd, addr, next, nodes,
 456                                    flags, private))
 457                        return -EIO;
 458        } while (pmd++, addr = next, addr != end);
 459        return 0;
 460}
 461
 462static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 463                unsigned long addr, unsigned long end,
 464                const nodemask_t *nodes, unsigned long flags,
 465                void *private)
 466{
 467        pud_t *pud;
 468        unsigned long next;
 469
 470        pud = pud_offset(pgd, addr);
 471        do {
 472                next = pud_addr_end(addr, end);
 473                if (pud_none_or_clear_bad(pud))
 474                        continue;
 475                if (check_pmd_range(vma, pud, addr, next, nodes,
 476                                    flags, private))
 477                        return -EIO;
 478        } while (pud++, addr = next, addr != end);
 479        return 0;
 480}
 481
 482static inline int check_pgd_range(struct vm_area_struct *vma,
 483                unsigned long addr, unsigned long end,
 484                const nodemask_t *nodes, unsigned long flags,
 485                void *private)
 486{
 487        pgd_t *pgd;
 488        unsigned long next;
 489
 490        pgd = pgd_offset(vma->vm_mm, addr);
 491        do {
 492                next = pgd_addr_end(addr, end);
 493                if (pgd_none_or_clear_bad(pgd))
 494                        continue;
 495                if (check_pud_range(vma, pgd, addr, next, nodes,
 496                                    flags, private))
 497                        return -EIO;
 498        } while (pgd++, addr = next, addr != end);
 499        return 0;
 500}
 501
 502/*
 503 * Check if all pages in a range are on a set of nodes.
 504 * If pagelist != NULL then isolate pages from the LRU and
 505 * put them on the pagelist.
 506 */
 507static struct vm_area_struct *
 508check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 509                const nodemask_t *nodes, unsigned long flags, void *private)
 510{
 511        int err;
 512        struct vm_area_struct *first, *vma, *prev;
 513
 514
 515        first = find_vma(mm, start);
 516        if (!first)
 517                return ERR_PTR(-EFAULT);
 518        prev = NULL;
 519        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 520                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 521                        if (!vma->vm_next && vma->vm_end < end)
 522                                return ERR_PTR(-EFAULT);
 523                        if (prev && prev->vm_end < vma->vm_start)
 524                                return ERR_PTR(-EFAULT);
 525                }
 526                if (!is_vm_hugetlb_page(vma) &&
 527                    ((flags & MPOL_MF_STRICT) ||
 528                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 529                                vma_migratable(vma)))) {
 530                        unsigned long endvma = vma->vm_end;
 531
 532                        if (endvma > end)
 533                                endvma = end;
 534                        if (vma->vm_start > start)
 535                                start = vma->vm_start;
 536                        err = check_pgd_range(vma, start, endvma, nodes,
 537                                                flags, private);
 538                        if (err) {
 539                                first = ERR_PTR(err);
 540                                break;
 541                        }
 542                }
 543                prev = vma;
 544        }
 545        return first;
 546}
 547
 548/* Apply policy to a single VMA */
 549static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 550{
 551        int err = 0;
 552        struct mempolicy *old = vma->vm_policy;
 553
 554        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 555                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 556                 vma->vm_ops, vma->vm_file,
 557                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 558
 559        if (vma->vm_ops && vma->vm_ops->set_policy)
 560                err = vma->vm_ops->set_policy(vma, new);
 561        if (!err) {
 562                mpol_get(new);
 563                vma->vm_policy = new;
 564                mpol_put(old);
 565        }
 566        return err;
 567}
 568
 569/* Step 2: apply policy to a range and do splits. */
 570static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 571                       unsigned long end, struct mempolicy *new)
 572{
 573        struct vm_area_struct *next;
 574        int err;
 575
 576        err = 0;
 577        for (; vma && vma->vm_start < end; vma = next) {
 578                next = vma->vm_next;
 579                if (vma->vm_start < start)
 580                        err = split_vma(vma->vm_mm, vma, start, 1);
 581                if (!err && vma->vm_end > end)
 582                        err = split_vma(vma->vm_mm, vma, end, 0);
 583                if (!err)
 584                        err = policy_vma(vma, new);
 585                if (err)
 586                        break;
 587        }
 588        return err;
 589}
 590
 591/*
 592 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 593 * mempolicy.  Allows more rapid checking of this (combined perhaps
 594 * with other PF_* flag bits) on memory allocation hot code paths.
 595 *
 596 * If called from outside this file, the task 'p' should -only- be
 597 * a newly forked child not yet visible on the task list, because
 598 * manipulating the task flags of a visible task is not safe.
 599 *
 600 * The above limitation is why this routine has the funny name
 601 * mpol_fix_fork_child_flag().
 602 *
 603 * It is also safe to call this with a task pointer of current,
 604 * which the static wrapper mpol_set_task_struct_flag() does,
 605 * for use within this file.
 606 */
 607
 608void mpol_fix_fork_child_flag(struct task_struct *p)
 609{
 610        if (p->mempolicy)
 611                p->flags |= PF_MEMPOLICY;
 612        else
 613                p->flags &= ~PF_MEMPOLICY;
 614}
 615
 616static void mpol_set_task_struct_flag(void)
 617{
 618        mpol_fix_fork_child_flag(current);
 619}
 620
 621/* Set the process memory policy */
 622static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 623                             nodemask_t *nodes)
 624{
 625        struct mempolicy *new, *old;
 626        struct mm_struct *mm = current->mm;
 627        NODEMASK_SCRATCH(scratch);
 628        int ret;
 629
 630        if (!scratch)
 631                return -ENOMEM;
 632
 633        new = mpol_new(mode, flags, nodes);
 634        if (IS_ERR(new)) {
 635                ret = PTR_ERR(new);
 636                goto out;
 637        }
 638        /*
 639         * prevent changing our mempolicy while show_numa_maps()
 640         * is using it.
 641         * Note:  do_set_mempolicy() can be called at init time
 642         * with no 'mm'.
 643         */
 644        if (mm)
 645                down_write(&mm->mmap_sem);
 646        task_lock(current);
 647        ret = mpol_set_nodemask(new, nodes, scratch);
 648        if (ret) {
 649                task_unlock(current);
 650                if (mm)
 651                        up_write(&mm->mmap_sem);
 652                mpol_put(new);
 653                goto out;
 654        }
 655        old = current->mempolicy;
 656        current->mempolicy = new;
 657        mpol_set_task_struct_flag();
 658        if (new && new->mode == MPOL_INTERLEAVE &&
 659            nodes_weight(new->v.nodes))
 660                current->il_next = first_node(new->v.nodes);
 661        task_unlock(current);
 662        if (mm)
 663                up_write(&mm->mmap_sem);
 664
 665        mpol_put(old);
 666        ret = 0;
 667out:
 668        NODEMASK_SCRATCH_FREE(scratch);
 669        return ret;
 670}
 671
 672/*
 673 * Return nodemask for policy for get_mempolicy() query
 674 *
 675 * Called with task's alloc_lock held
 676 */
 677static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 678{
 679        nodes_clear(*nodes);
 680        if (p == &default_policy)
 681                return;
 682
 683        switch (p->mode) {
 684        case MPOL_BIND:
 685                /* Fall through */
 686        case MPOL_INTERLEAVE:
 687                *nodes = p->v.nodes;
 688                break;
 689        case MPOL_PREFERRED:
 690                if (!(p->flags & MPOL_F_LOCAL))
 691                        node_set(p->v.preferred_node, *nodes);
 692                /* else return empty node mask for local allocation */
 693                break;
 694        default:
 695                BUG();
 696        }
 697}
 698
 699static int lookup_node(struct mm_struct *mm, unsigned long addr)
 700{
 701        struct page *p;
 702        int err;
 703
 704        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 705        if (err >= 0) {
 706                err = page_to_nid(p);
 707                put_page(p);
 708        }
 709        return err;
 710}
 711
 712/* Retrieve NUMA policy */
 713static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 714                             unsigned long addr, unsigned long flags)
 715{
 716        int err;
 717        struct mm_struct *mm = current->mm;
 718        struct vm_area_struct *vma = NULL;
 719        struct mempolicy *pol = current->mempolicy;
 720
 721        if (flags &
 722                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 723                return -EINVAL;
 724
 725        if (flags & MPOL_F_MEMS_ALLOWED) {
 726                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 727                        return -EINVAL;
 728                *policy = 0;    /* just so it's initialized */
 729                task_lock(current);
 730                *nmask  = cpuset_current_mems_allowed;
 731                task_unlock(current);
 732                return 0;
 733        }
 734
 735        if (flags & MPOL_F_ADDR) {
 736                /*
 737                 * Do NOT fall back to task policy if the
 738                 * vma/shared policy at addr is NULL.  We
 739                 * want to return MPOL_DEFAULT in this case.
 740                 */
 741                down_read(&mm->mmap_sem);
 742                vma = find_vma_intersection(mm, addr, addr+1);
 743                if (!vma) {
 744                        up_read(&mm->mmap_sem);
 745                        return -EFAULT;
 746                }
 747                if (vma->vm_ops && vma->vm_ops->get_policy)
 748                        pol = vma->vm_ops->get_policy(vma, addr);
 749                else
 750                        pol = vma->vm_policy;
 751        } else if (addr)
 752                return -EINVAL;
 753
 754        if (!pol)
 755                pol = &default_policy;  /* indicates default behavior */
 756
 757        if (flags & MPOL_F_NODE) {
 758                if (flags & MPOL_F_ADDR) {
 759                        err = lookup_node(mm, addr);
 760                        if (err < 0)
 761                                goto out;
 762                        *policy = err;
 763                } else if (pol == current->mempolicy &&
 764                                pol->mode == MPOL_INTERLEAVE) {
 765                        *policy = current->il_next;
 766                } else {
 767                        err = -EINVAL;
 768                        goto out;
 769                }
 770        } else {
 771                *policy = pol == &default_policy ? MPOL_DEFAULT :
 772                                                pol->mode;
 773                /*
 774                 * Internal mempolicy flags must be masked off before exposing
 775                 * the policy to userspace.
 776                 */
 777                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 778        }
 779
 780        if (vma) {
 781                up_read(&current->mm->mmap_sem);
 782                vma = NULL;
 783        }
 784
 785        err = 0;
 786        if (nmask) {
 787                task_lock(current);
 788                get_policy_nodemask(pol, nmask);
 789                task_unlock(current);
 790        }
 791
 792 out:
 793        mpol_cond_put(pol);
 794        if (vma)
 795                up_read(&current->mm->mmap_sem);
 796        return err;
 797}
 798
 799#ifdef CONFIG_MIGRATION
 800/*
 801 * page migration
 802 */
 803static void migrate_page_add(struct page *page, struct list_head *pagelist,
 804                                unsigned long flags)
 805{
 806        /*
 807         * Avoid migrating a page that is shared with others.
 808         */
 809        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 810                if (!isolate_lru_page(page)) {
 811                        list_add_tail(&page->lru, pagelist);
 812                }
 813        }
 814}
 815
 816static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 817{
 818        return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 819}
 820
 821/*
 822 * Migrate pages from one node to a target node.
 823 * Returns error or the number of pages not migrated.
 824 */
 825static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 826                           int flags)
 827{
 828        nodemask_t nmask;
 829        LIST_HEAD(pagelist);
 830        int err = 0;
 831
 832        nodes_clear(nmask);
 833        node_set(source, nmask);
 834
 835        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 836                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 837
 838        if (!list_empty(&pagelist))
 839                err = migrate_pages(&pagelist, new_node_page, dest);
 840
 841        return err;
 842}
 843
 844/*
 845 * Move pages between the two nodesets so as to preserve the physical
 846 * layout as much as possible.
 847 *
  848 * Returns the number of pages that could not be moved.
 849 */
 850int do_migrate_pages(struct mm_struct *mm,
 851        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 852{
 853        int busy = 0;
 854        int err;
 855        nodemask_t tmp;
 856
 857        err = migrate_prep();
 858        if (err)
 859                return err;
 860
 861        down_read(&mm->mmap_sem);
 862
 863        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 864        if (err)
 865                goto out;
 866
 867/*
 868 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 869 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 870 * bit in 'tmp', and return that <source, dest> pair for migration.
 871 * The pair of nodemasks 'to' and 'from' define the map.
 872 *
 873 * If no pair of bits is found that way, fallback to picking some
 874 * pair of 'source' and 'dest' bits that are not the same.  If the
 875 * 'source' and 'dest' bits are the same, this represents a node
 876 * that will be migrating to itself, so no pages need move.
 877 *
 878 * If no bits are left in 'tmp', or if all remaining bits left
 879 * in 'tmp' correspond to the same bit in 'to', return false
 880 * (nothing left to migrate).
 881 *
 882 * This lets us pick a pair of nodes to migrate between, such that
 883 * if possible the dest node is not already occupied by some other
 884 * source node, minimizing the risk of overloading the memory on a
 885 * node that would happen if we migrated incoming memory to a node
  886 * before migrating the outgoing memory sourced from that same node.
 887 *
 888 * A single scan of tmp is sufficient.  As we go, we remember the
 889 * most recent <s, d> pair that moved (s != d).  If we find a pair
 890 * that not only moved, but what's better, moved to an empty slot
  891 * that not only moved, but, better yet, moved to an empty slot
  892 * (d is not set in tmp), then we break out early with that pair.
  893 * Otherwise, when we finish scanning tmp, we at least have the
 894 * the scan of tmp without finding any node that moved, much less
 895 * moved to an empty node, then there is nothing left worth migrating.
 896 */
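/*
 * Worked example (hypothetical masks): from_nodes = {0,1} and
 * to_nodes = {1,2} remap as 0 -> 1 and 1 -> 2.  The first scan keeps
 * <0,1> as the most recent moving pair but breaks out at <1,2>,
 * because node 2 is not in tmp, so node 1 is drained onto node 2
 * first; only then, with tmp = {0}, does the next pass migrate node 0
 * onto the now-vacated node 1.
 */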
 897
 898        tmp = *from_nodes;
 899        while (!nodes_empty(tmp)) {
 900                int s,d;
 901                int source = -1;
 902                int dest = 0;
 903
 904                for_each_node_mask(s, tmp) {
 905                        d = node_remap(s, *from_nodes, *to_nodes);
 906                        if (s == d)
 907                                continue;
 908
 909                        source = s;     /* Node moved. Memorize */
 910                        dest = d;
 911
 912                        /* dest not in remaining from nodes? */
 913                        if (!node_isset(dest, tmp))
 914                                break;
 915                }
 916                if (source == -1)
 917                        break;
 918
 919                node_clear(source, tmp);
 920                err = migrate_to_node(mm, source, dest, flags);
 921                if (err > 0)
 922                        busy += err;
 923                if (err < 0)
 924                        break;
 925        }
 926out:
 927        up_read(&mm->mmap_sem);
 928        if (err < 0)
 929                return err;
 930        return busy;
 931
 932}
 933
 934/*
 935 * Allocate a new page for page migration based on vma policy.
 936 * Start assuming that page is mapped by vma pointed to by @private.
 937 * Search forward from there, if not.  N.B., this assumes that the
 938 * list of pages handed to migrate_pages()--which is how we get here--
 939 * is in virtual address order.
 940 */
 941static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 942{
 943        struct vm_area_struct *vma = (struct vm_area_struct *)private;
 944        unsigned long uninitialized_var(address);
 945
 946        while (vma) {
 947                address = page_address_in_vma(page, vma);
 948                if (address != -EFAULT)
 949                        break;
 950                vma = vma->vm_next;
 951        }
 952
 953        /*
 954         * if !vma, alloc_page_vma() will use task or system default policy
 955         */
 956        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 957}
 958#else
 959
 960static void migrate_page_add(struct page *page, struct list_head *pagelist,
 961                                unsigned long flags)
 962{
 963}
 964
 965int do_migrate_pages(struct mm_struct *mm,
 966        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 967{
 968        return -ENOSYS;
 969}
 970
 971static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 972{
 973        return NULL;
 974}
 975#endif
 976
 977static long do_mbind(unsigned long start, unsigned long len,
 978                     unsigned short mode, unsigned short mode_flags,
 979                     nodemask_t *nmask, unsigned long flags)
 980{
 981        struct vm_area_struct *vma;
 982        struct mm_struct *mm = current->mm;
 983        struct mempolicy *new;
 984        unsigned long end;
 985        int err;
 986        LIST_HEAD(pagelist);
 987
 988        if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 989                                     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 990                return -EINVAL;
 991        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 992                return -EPERM;
 993
 994        if (start & ~PAGE_MASK)
 995                return -EINVAL;
 996
 997        if (mode == MPOL_DEFAULT)
 998                flags &= ~MPOL_MF_STRICT;
 999
1000        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1001        end = start + len;
1002
1003        if (end < start)
1004                return -EINVAL;
1005        if (end == start)
1006                return 0;
1007
1008        new = mpol_new(mode, mode_flags, nmask);
1009        if (IS_ERR(new))
1010                return PTR_ERR(new);
1011
1012        /*
1013         * If we are using the default policy then operation
1014         * on discontinuous address spaces is okay after all
1015         */
1016        if (!new)
1017                flags |= MPOL_MF_DISCONTIG_OK;
1018
1019        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1020                 start, start + len, mode, mode_flags,
1021                 nmask ? nodes_addr(*nmask)[0] : -1);
1022
1023        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1024
1025                err = migrate_prep();
1026                if (err)
1027                        goto mpol_out;
1028        }
1029        {
1030                NODEMASK_SCRATCH(scratch);
1031                if (scratch) {
1032                        down_write(&mm->mmap_sem);
1033                        task_lock(current);
1034                        err = mpol_set_nodemask(new, nmask, scratch);
1035                        task_unlock(current);
1036                        if (err)
1037                                up_write(&mm->mmap_sem);
1038                } else
1039                        err = -ENOMEM;
1040                NODEMASK_SCRATCH_FREE(scratch);
1041        }
1042        if (err)
1043                goto mpol_out;
1044
1045        vma = check_range(mm, start, end, nmask,
1046                          flags | MPOL_MF_INVERT, &pagelist);
1047
1048        err = PTR_ERR(vma);
1049        if (!IS_ERR(vma)) {
1050                int nr_failed = 0;
1051
1052                err = mbind_range(vma, start, end, new);
1053
1054                if (!list_empty(&pagelist))
1055                        nr_failed = migrate_pages(&pagelist, new_vma_page,
1056                                                (unsigned long)vma);
1057
1058                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1059                        err = -EIO;
1060        } else
1061                putback_lru_pages(&pagelist);
1062
1063        up_write(&mm->mmap_sem);
1064 mpol_out:
1065        mpol_put(new);
1066        return err;
1067}
1068
1069/*
1070 * User space interface with variable sized bitmaps for nodelists.
1071 */
1072
1073/* Copy a node mask from user space. */
1074static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1075                     unsigned long maxnode)
1076{
1077        unsigned long k;
1078        unsigned long nlongs;
1079        unsigned long endmask;
1080
1081        --maxnode;
1082        nodes_clear(*nodes);
1083        if (maxnode == 0 || !nmask)
1084                return 0;
1085        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1086                return -EINVAL;
1087
1088        nlongs = BITS_TO_LONGS(maxnode);
1089        if ((maxnode % BITS_PER_LONG) == 0)
1090                endmask = ~0UL;
1091        else
1092                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1093
 1094        /* When the user specified more nodes than supported, just check
 1095           if the unsupported part is all zero. */
1096        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1097                if (nlongs > PAGE_SIZE/sizeof(long))
1098                        return -EINVAL;
1099                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1100                        unsigned long t;
1101                        if (get_user(t, nmask + k))
1102                                return -EFAULT;
1103                        if (k == nlongs - 1) {
1104                                if (t & endmask)
1105                                        return -EINVAL;
1106                        } else if (t)
1107                                return -EINVAL;
1108                }
1109                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1110                endmask = ~0UL;
1111        }
1112
1113        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1114                return -EFAULT;
1115        nodes_addr(*nodes)[nlongs-1] &= endmask;
1116        return 0;
1117}
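/*
 * Worked example (64-bit longs assumed): a caller passing a single
 * unsigned long with maxnode = 65 ends up with maxnode = 64 after the
 * decrement, so nlongs = 1 and endmask = ~0UL and the whole word is
 * used.  With maxnode = 9 only bits 0-7 survive: nlongs = 1 and
 * endmask = 0xff masks off everything above node 7.
 */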
1118
1119/* Copy a kernel node mask to user space */
1120static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1121                              nodemask_t *nodes)
1122{
1123        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1124        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1125
1126        if (copy > nbytes) {
1127                if (copy > PAGE_SIZE)
1128                        return -EINVAL;
1129                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1130                        return -EFAULT;
1131                copy = nbytes;
1132        }
1133        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1134}
1135
1136SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1137                unsigned long, mode, unsigned long __user *, nmask,
1138                unsigned long, maxnode, unsigned, flags)
1139{
1140        nodemask_t nodes;
1141        int err;
1142        unsigned short mode_flags;
1143
1144        mode_flags = mode & MPOL_MODE_FLAGS;
1145        mode &= ~MPOL_MODE_FLAGS;
1146        if (mode >= MPOL_MAX)
1147                return -EINVAL;
1148        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1149            (mode_flags & MPOL_F_RELATIVE_NODES))
1150                return -EINVAL;
1151        err = get_nodes(&nodes, nmask, maxnode);
1152        if (err)
1153                return err;
1154        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1155}
1156
1157/* Set the process memory policy */
1158SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1159                unsigned long, maxnode)
1160{
1161        int err;
1162        nodemask_t nodes;
1163        unsigned short flags;
1164
1165        flags = mode & MPOL_MODE_FLAGS;
1166        mode &= ~MPOL_MODE_FLAGS;
1167        if ((unsigned int)mode >= MPOL_MAX)
1168                return -EINVAL;
1169        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1170                return -EINVAL;
1171        err = get_nodes(&nodes, nmask, maxnode);
1172        if (err)
1173                return err;
1174        return do_set_mempolicy(mode, flags, &nodes);
1175}
1176
1177SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1178                const unsigned long __user *, old_nodes,
1179                const unsigned long __user *, new_nodes)
1180{
1181        const struct cred *cred = current_cred(), *tcred;
1182        struct mm_struct *mm;
1183        struct task_struct *task;
1184        nodemask_t old;
1185        nodemask_t new;
1186        nodemask_t task_nodes;
1187        int err;
1188
1189        err = get_nodes(&old, old_nodes, maxnode);
1190        if (err)
1191                return err;
1192
1193        err = get_nodes(&new, new_nodes, maxnode);
1194        if (err)
1195                return err;
1196
1197        /* Find the mm_struct */
1198        read_lock(&tasklist_lock);
1199        task = pid ? find_task_by_vpid(pid) : current;
1200        if (!task) {
1201                read_unlock(&tasklist_lock);
1202                return -ESRCH;
1203        }
1204        mm = get_task_mm(task);
1205        read_unlock(&tasklist_lock);
1206
1207        if (!mm)
1208                return -EINVAL;
1209
1210        /*
1211         * Check if this process has the right to modify the specified
1212         * process. The right exists if the process has administrative
1213         * capabilities, superuser privileges or the same
1214         * userid as the target process.
1215         */
1216        rcu_read_lock();
1217        tcred = __task_cred(task);
1218        if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1219            cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1220            !capable(CAP_SYS_NICE)) {
1221                rcu_read_unlock();
1222                err = -EPERM;
1223                goto out;
1224        }
1225        rcu_read_unlock();
1226
1227        task_nodes = cpuset_mems_allowed(task);
1228        /* Is the user allowed to access the target nodes? */
1229        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1230                err = -EPERM;
1231                goto out;
1232        }
1233
1234        if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1235                err = -EINVAL;
1236                goto out;
1237        }
1238
1239        err = security_task_movememory(task);
1240        if (err)
1241                goto out;
1242
1243        err = do_migrate_pages(mm, &old, &new,
1244                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1245out:
1246        mmput(mm);
1247        return err;
1248}
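/*
 * Illustrative user-space sketch (assumes the <numaif.h> wrapper from
 * libnuma and a hypothetical pid): move every page of process 1234
 * that currently sits on node 0 over to node 2, subject to the
 * permission and cpuset checks above.
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 2;
 *	long left = migrate_pages(1234, 8 * sizeof(from) + 1, &from, &to);
 *
 * A positive return value is the number of pages that could not be
 * moved; -1 with errno set indicates failure.
 */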
1249
1250
1251/* Retrieve NUMA policy */
1252SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1253                unsigned long __user *, nmask, unsigned long, maxnode,
1254                unsigned long, addr, unsigned long, flags)
1255{
1256        int err;
1257        int uninitialized_var(pval);
1258        nodemask_t nodes;
1259
1260        if (nmask != NULL && maxnode < MAX_NUMNODES)
1261                return -EINVAL;
1262
1263        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1264
1265        if (err)
1266                return err;
1267
1268        if (policy && put_user(pval, policy))
1269                return -EFAULT;
1270
1271        if (nmask)
1272                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1273
1274        return err;
1275}
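/*
 * Illustrative user-space sketch (assumes the <numaif.h> wrapper):
 * combining MPOL_F_NODE | MPOL_F_ADDR turns get_mempolicy(2) into a
 * "which node is this page on?" query, served by lookup_node() above.
 * some_address stands for any address mapped in the caller.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, some_address,
 *			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page lives on node %d\n", node);
 *
 * The page is faulted in if necessary, since lookup_node() goes
 * through get_user_pages().
 */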
1276
1277#ifdef CONFIG_COMPAT
1278
1279asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1280                                     compat_ulong_t __user *nmask,
1281                                     compat_ulong_t maxnode,
1282                                     compat_ulong_t addr, compat_ulong_t flags)
1283{
1284        long err;
1285        unsigned long __user *nm = NULL;
1286        unsigned long nr_bits, alloc_size;
1287        DECLARE_BITMAP(bm, MAX_NUMNODES);
1288
1289        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1290        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1291
1292        if (nmask)
1293                nm = compat_alloc_user_space(alloc_size);
1294
1295        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1296
1297        if (!err && nmask) {
1298                err = copy_from_user(bm, nm, alloc_size);
1299                /* ensure entire bitmap is zeroed */
1300                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1301                err |= compat_put_bitmap(nmask, bm, nr_bits);
1302        }
1303
1304        return err;
1305}
1306
1307asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1308                                     compat_ulong_t maxnode)
1309{
1310        long err = 0;
1311        unsigned long __user *nm = NULL;
1312        unsigned long nr_bits, alloc_size;
1313        DECLARE_BITMAP(bm, MAX_NUMNODES);
1314
1315        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1316        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1317
1318        if (nmask) {
1319                err = compat_get_bitmap(bm, nmask, nr_bits);
1320                nm = compat_alloc_user_space(alloc_size);
1321                err |= copy_to_user(nm, bm, alloc_size);
1322        }
1323
1324        if (err)
1325                return -EFAULT;
1326
1327        return sys_set_mempolicy(mode, nm, nr_bits+1);
1328}
1329
1330asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1331                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1332                             compat_ulong_t maxnode, compat_ulong_t flags)
1333{
1334        long err = 0;
1335        unsigned long __user *nm = NULL;
1336        unsigned long nr_bits, alloc_size;
1337        nodemask_t bm;
1338
1339        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1340        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1341
1342        if (nmask) {
1343                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1344                nm = compat_alloc_user_space(alloc_size);
1345                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1346        }
1347
1348        if (err)
1349                return -EFAULT;
1350
1351        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1352}
1353
1354#endif
1355
1356/*
1357 * get_vma_policy(@task, @vma, @addr)
1358 * @task - task for fallback if vma policy == default
1359 * @vma   - virtual memory area whose policy is sought
1360 * @addr  - address in @vma for shared policy lookup
1361 *
1362 * Returns effective policy for a VMA at specified address.
1363 * Falls back to @task or system default policy, as necessary.
1364 * Current or other task's task mempolicy and non-shared vma policies
1365 * are protected by the task's mmap_sem, which must be held for read by
1366 * the caller.
1367 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1368 * count--added by the get_policy() vm_op, as appropriate--to protect against
1369 * freeing by another task.  It is the caller's responsibility to free the
1370 * extra reference for shared policies.
1371 */
1372static struct mempolicy *get_vma_policy(struct task_struct *task,
1373                struct vm_area_struct *vma, unsigned long addr)
1374{
1375        struct mempolicy *pol = task->mempolicy;
1376
1377        if (vma) {
1378                if (vma->vm_ops && vma->vm_ops->get_policy) {
1379                        struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1380                                                                        addr);
1381                        if (vpol)
1382                                pol = vpol;
1383                } else if (vma->vm_policy)
1384                        pol = vma->vm_policy;
1385        }
1386        if (!pol)
1387                pol = &default_policy;
1388        return pol;
1389}
1390
1391/*
1392 * Return a nodemask representing a mempolicy for filtering nodes for
1393 * page allocation
1394 */
1395static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1396{
1397        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1398        if (unlikely(policy->mode == MPOL_BIND) &&
1399                        gfp_zone(gfp) >= policy_zone &&
1400                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1401                return &policy->v.nodes;
1402
1403        return NULL;
1404}
1405
1406/* Return a zonelist indicated by gfp for node representing a mempolicy */
1407static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1408{
1409        int nd = numa_node_id();
1410
1411        switch (policy->mode) {
1412        case MPOL_PREFERRED:
1413                if (!(policy->flags & MPOL_F_LOCAL))
1414                        nd = policy->v.preferred_node;
1415                break;
1416        case MPOL_BIND:
1417                /*
1418                 * Normally, MPOL_BIND allocations are node-local within the
1419                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1420                 * current node is part of the mask, we use the zonelist for
 1421                 * current node is not part of the mask, we use the zonelist for
1422                 */
1423                if (unlikely(gfp & __GFP_THISNODE) &&
1424                                unlikely(!node_isset(nd, policy->v.nodes)))
1425                        nd = first_node(policy->v.nodes);
1426                break;
1427        case MPOL_INTERLEAVE: /* should not happen */
1428                break;
1429        default:
1430                BUG();
1431        }
1432        return node_zonelist(nd, gfp);
1433}
1434
1435/* Do dynamic interleaving for a process */
1436static unsigned interleave_nodes(struct mempolicy *policy)
1437{
1438        unsigned nid, next;
1439        struct task_struct *me = current;
1440
1441        nid = me->il_next;
1442        next = next_node(nid, policy->v.nodes);
1443        if (next >= MAX_NUMNODES)
1444                next = first_node(policy->v.nodes);
1445        if (next < MAX_NUMNODES)
1446                me->il_next = next;
1447        return nid;
1448}
1449
1450/*
 1451 * Depending on the memory policy, provide a node from which to allocate the
 1452 * next slab entry.
 1453 * @policy must be protected from freeing by the caller.  If @policy is
 1454 * the current task's mempolicy, this protection is implicit, as only the
 1455 * task can change its policy.  The system default policy requires no
1456 * such protection.
1457 */
1458unsigned slab_node(struct mempolicy *policy)
1459{
1460        if (!policy || policy->flags & MPOL_F_LOCAL)
1461                return numa_node_id();
1462
1463        switch (policy->mode) {
1464        case MPOL_PREFERRED:
1465                /*
1466                 * handled MPOL_F_LOCAL above
1467                 */
1468                return policy->v.preferred_node;
1469
1470        case MPOL_INTERLEAVE:
1471                return interleave_nodes(policy);
1472
1473        case MPOL_BIND: {
1474                /*
1475                 * Follow bind policy behavior and start allocation at the
1476                 * first node.
1477                 */
1478                struct zonelist *zonelist;
1479                struct zone *zone;
1480                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1481                zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1482                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1483                                                        &policy->v.nodes,
1484                                                        &zone);
1485                return zone->node;
1486        }
1487
1488        default:
1489                BUG();
1490        }
1491}
1492
1493/* Do static interleaving for a VMA with known offset. */
1494static unsigned offset_il_node(struct mempolicy *pol,
1495                struct vm_area_struct *vma, unsigned long off)
1496{
1497        unsigned nnodes = nodes_weight(pol->v.nodes);
1498        unsigned target;
1499        int c;
1500        int nid = -1;
1501
1502        if (!nnodes)
1503                return numa_node_id();
1504        target = (unsigned int)off % nnodes;
1505        c = 0;
1506        do {
1507                nid = next_node(nid, pol->v.nodes);
1508                c++;
1509        } while (c <= target);
1510        return nid;
1511}
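/*
 * Worked example (hypothetical policy): with pol->v.nodes = {1,3,5}
 * and off = 7, nnodes = 3 and target = 7 % 3 = 1, so the loop visits
 * node 1 (c = 1) and stops after node 3 (c = 2 > target).  The page at
 * that offset therefore always lands on node 3, no matter which CPU
 * faults it in.
 */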
1512
1513/* Determine a node number for interleave */
1514static inline unsigned interleave_nid(struct mempolicy *pol,
1515                 struct vm_area_struct *vma, unsigned long addr, int shift)
1516{
1517        if (vma) {
1518                unsigned long off;
1519
1520                /*
1521                 * for small pages, there is no difference between
1522                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1523                 * for huge pages, since vm_pgoff is in units of small
1524                 * pages, we need to shift off the always 0 bits to get
1525                 * a useful offset.
1526                 */
1527                BUG_ON(shift < PAGE_SHIFT);
1528                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1529                off += (addr - vma->vm_start) >> shift;
1530                return offset_il_node(pol, vma, off);
1531        } else
1532                return interleave_nodes(pol);
1533}
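
/*
 * Worked example (illustrative, assuming 4KB base pages and 2MB huge
 * pages, i.e. PAGE_SHIFT == 12 and shift == 21): a hugetlbfs VMA with
 * vm_pgoff == 1024 small pages starts 2 huge pages into the object,
 * so off begins as 1024 >> 9 = 2; a fault 6MB into the VMA adds
 * (6MB >> 21) = 3, giving off = 5, which is then handed to
 * offset_il_node() above.
 */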
1534
1535#ifdef CONFIG_HUGETLBFS
1536/*
1537 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1538 * @vma = virtual memory area whose policy is sought
1539 * @addr = address in @vma for shared policy lookup and interleave policy
1540 * @gfp_flags = for requested zone
1541 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1542 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1543 *
1544 * Returns a zonelist suitable for a huge page allocation and a pointer
1545 * to the struct mempolicy for conditional unref after allocation.
1546 * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
1547 * @nodemask for filtering the zonelist.
1548 */
1549struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1550                                gfp_t gfp_flags, struct mempolicy **mpol,
1551                                nodemask_t **nodemask)
1552{
1553        struct zonelist *zl;
1554
1555        *mpol = get_vma_policy(current, vma, addr);
1556        *nodemask = NULL;       /* assume !MPOL_BIND */
1557
1558        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1559                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1560                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1561        } else {
1562                zl = policy_zonelist(gfp_flags, *mpol);
1563                if ((*mpol)->mode == MPOL_BIND)
1564                        *nodemask = &(*mpol)->v.nodes;
1565        }
1566        return zl;
1567}
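
/*
 * A minimal sketch of the expected calling pattern (hugetlb-style,
 * illustrative only; the surrounding fault path and zonelist walk are
 * not shown):
 *
 *        struct mempolicy *mpol;
 *        nodemask_t *nodemask;
 *        struct zonelist *zl;
 *
 *        zl = huge_zonelist(vma, addr, GFP_HIGHUSER, &mpol, &nodemask);
 *        ... walk zl, restricted by *nodemask when it is non-NULL ...
 *        mpol_cond_put(mpol);
 *
 * The final mpol_cond_put() pairs with the conditional reference taken
 * by get_vma_policy() above and is a no-op for non-shared policies.
 */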
1568#endif
1569
1570/* Allocate a page in interleaved policy.
1571   Own path because it needs to do special accounting. */
1572static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1573                                        unsigned nid)
1574{
1575        struct zonelist *zl;
1576        struct page *page;
1577
1578        zl = node_zonelist(nid, gfp);
1579        page = __alloc_pages(gfp, order, zl);
1580        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1581                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1582        return page;
1583}
1584
1585/**
1586 *      alloc_page_vma  - Allocate a page for a VMA.
1587 *
1588 *      @gfp:
1589 *      %GFP_USER    user allocation,
1590 *      %GFP_KERNEL  kernel allocations,
1591 *      %GFP_HIGHMEM highmem/user allocations,
1592 *      %GFP_FS      allocation should not call back into a file system,
1593 *      %GFP_ATOMIC  don't sleep.
1594 *
1595 *      @vma:  Pointer to VMA or NULL if not available.
1596 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1597 *
1598 *      This function allocates a page from the kernel page pool and applies
1599 *      a NUMA policy associated with the VMA or the current process.
1600 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1601 *      mm_struct of the VMA to prevent it from going away. Should be used for
1602 *      all allocations for pages that will be mapped into
1603 *      user space. Returns NULL when no page can be allocated.
1604 *
1605 *      Should be called with the mmap_sem of the vma held.
1606 */
1607struct page *
1608alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1609{
1610        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1611        struct zonelist *zl;
1612
1613        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1614                unsigned nid;
1615
1616                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1617                mpol_cond_put(pol);
1618                return alloc_page_interleave(gfp, 0, nid);
1619        }
1620        zl = policy_zonelist(gfp, pol);
1621        if (unlikely(mpol_needs_cond_ref(pol))) {
1622                /*
1623                 * slow path: ref counted shared policy
1624                 */
1625                struct page *page =  __alloc_pages_nodemask(gfp, 0,
1626                                                zl, policy_nodemask(gfp, pol));
1627                __mpol_put(pol);
1628                return page;
1629        }
1630        /*
1631         * fast path:  default or task policy
1632         */
1633        return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1634}
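
/*
 * A minimal sketch of the intended use from a fault path (illustrative
 * only; the surrounding handler and its error handling are not shown).
 * The caller already holds down_read() on mm->mmap_sem:
 *
 *        struct page *page;
 *
 *        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *        if (!page)
 *                return VM_FAULT_OOM;
 *
 * GFP_HIGHUSER_MOVABLE is the usual choice for pages that will be
 * mapped into user space; the VMA policy (or, failing that, the task
 * policy) selects the node.
 */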
1635
1636/**
1637 *      alloc_pages_current - Allocate pages.
1638 *
1639 *      @gfp:
1640 *              %GFP_USER   user allocation,
1641 *              %GFP_KERNEL kernel allocation,
1642 *              %GFP_HIGHMEM highmem allocation,
1643 *              %GFP_FS     don't call back into a file system.
1644 *              %GFP_ATOMIC don't sleep.
1645 *      @order: Power of two of allocation size in pages. 0 is a single page.
1646 *
1647 *      Allocate a page from the kernel page pool.  When not in
1648 *      interrupt context, apply the current process' NUMA policy.
1649 *      Returns NULL when no page can be allocated.
1650 *
1651 *      Don't call cpuset_update_task_memory_state() unless
1652 *      1) it's ok to take cpuset_sem (can WAIT), and
1653 *      2) allocating for current task (not interrupt).
1654 */
1655struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1656{
1657        struct mempolicy *pol = current->mempolicy;
1658
1659        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1660                pol = &default_policy;
1661
1662        /*
1663         * No reference counting needed for current->mempolicy
1664         * nor system default_policy
1665         */
1666        if (pol->mode == MPOL_INTERLEAVE)
1667                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1668        return __alloc_pages_nodemask(gfp, order,
1669                        policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1670}
1671EXPORT_SYMBOL(alloc_pages_current);
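
/*
 * Note that on CONFIG_NUMA kernels the generic alloc_pages() in
 * include/linux/gfp.h resolves to alloc_pages_current(), so an
 * ordinary allocation such as
 *
 *        struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 * (an order-2, four page block) already honours the calling task's
 * policy, while the same call from interrupt context, or with
 * __GFP_THISNODE set, falls back to default_policy above.
 */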
1672
1673/*
1674 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1675 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1676 * with the mems_allowed returned by cpuset_mems_allowed().  This
1677 * keeps mempolicies cpuset-relative after their cpuset moves.  See
1678 * also update_nodemask() in kernel/cpuset.c.
1679 */
1680
1681/* Slow path of a mempolicy duplicate */
1682struct mempolicy *__mpol_dup(struct mempolicy *old)
1683{
1684        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1685
1686        if (!new)
1687                return ERR_PTR(-ENOMEM);
1688        if (current_cpuset_is_being_rebound()) {
1689                nodemask_t mems = cpuset_mems_allowed(current);
1690                mpol_rebind_policy(old, &mems);
1691        }
1692        *new = *old;
1693        atomic_set(&new->refcnt, 1);
1694        return new;
1695}
1696
1697/*
1698 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1699 * eliminate the MPOL_F_* flags that require conditional ref and
1700 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1701 * after return.  Use the returned value.
1702 *
1703 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1704 * policy lookup, even if the policy needs/has extra ref on lookup.
1705 * shmem_readahead needs this.
1706 */
1707struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1708                                                struct mempolicy *frompol)
1709{
1710        if (!mpol_needs_cond_ref(frompol))
1711                return frompol;
1712
1713        *tompol = *frompol;
1714        tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1715        __mpol_put(frompol);
1716        return tompol;
1717}
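
/*
 * A minimal sketch of the shmem-style use this helper exists for
 * (illustrative only; "info" and "idx" stand for a shmem inode's
 * shared policy and a page index):
 *
 *        struct mempolicy mpol, *spol;
 *
 *        spol = mpol_cond_copy(&mpol,
 *                        mpol_shared_policy_lookup(&info->policy, idx));
 *
 * spol can then back several allocations (a whole readahead window)
 * without holding a reference; mpol_cond_copy() is the inline wrapper
 * around this function.
 */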
1718
1719static int mpol_match_intent(const struct mempolicy *a,
1720                             const struct mempolicy *b)
1721{
1722        if (a->flags != b->flags)
1723                return 0;
1724        if (!mpol_store_user_nodemask(a))
1725                return 1;
1726        return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1727}
1728
1729/* Slow path of a mempolicy comparison */
1730int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1731{
1732        if (!a || !b)
1733                return 0;
1734        if (a->mode != b->mode)
1735                return 0;
1736        if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1737                return 0;
1738        switch (a->mode) {
1739        case MPOL_BIND:
1740                /* Fall through */
1741        case MPOL_INTERLEAVE:
1742                return nodes_equal(a->v.nodes, b->v.nodes);
1743        case MPOL_PREFERRED:
1744                return a->v.preferred_node == b->v.preferred_node &&
1745                        a->flags == b->flags;
1746        default:
1747                BUG();
1748                return 0;
1749        }
1750}
1751
1752/*
1753 * Shared memory backing store policy support.
1754 *
1755 * Remember policies even when nobody has shared memory mapped.
1756 * The policies are kept in Red-Black tree linked from the inode.
1757 * They are protected by the sp->lock spinlock, which should be held
1758 * for any accesses to the tree.
1759 */
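
/*
 * Illustrative tree contents (not real data): after two mbind()-style
 * calls on a shared object the tree might hold
 *
 *        [0, 16)   ->  MPOL_INTERLEAVE over nodes 0-3
 *        [16, 24)  ->  MPOL_BIND to node 1
 *
 * so sp_lookup(sp, 20, 21) below returns the [16, 24) node, while a
 * lookup outside both ranges returns NULL and the caller falls back
 * to the task or system default policy.
 */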
1760
1761/* lookup first element intersecting start-end */
1762/* Caller holds sp->lock */
1763static struct sp_node *
1764sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1765{
1766        struct rb_node *n = sp->root.rb_node;
1767
1768        while (n) {
1769                struct sp_node *p = rb_entry(n, struct sp_node, nd);
1770
1771                if (start >= p->end)
1772                        n = n->rb_right;
1773                else if (end <= p->start)
1774                        n = n->rb_left;
1775                else
1776                        break;
1777        }
1778        if (!n)
1779                return NULL;
1780        for (;;) {
1781                struct sp_node *w = NULL;
1782                struct rb_node *prev = rb_prev(n);
1783                if (!prev)
1784                        break;
1785                w = rb_entry(prev, struct sp_node, nd);
1786                if (w->end <= start)
1787                        break;
1788                n = prev;
1789        }
1790        return rb_entry(n, struct sp_node, nd);
1791}
1792
1793/* Insert a new shared policy into the list. */
1794/* Caller holds sp->lock */
1795static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1796{
1797        struct rb_node **p = &sp->root.rb_node;
1798        struct rb_node *parent = NULL;
1799        struct sp_node *nd;
1800
1801        while (*p) {
1802                parent = *p;
1803                nd = rb_entry(parent, struct sp_node, nd);
1804                if (new->start < nd->start)
1805                        p = &(*p)->rb_left;
1806                else if (new->end > nd->end)
1807                        p = &(*p)->rb_right;
1808                else
1809                        BUG();
1810        }
1811        rb_link_node(&new->nd, parent, p);
1812        rb_insert_color(&new->nd, &sp->root);
1813        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1814                 new->policy ? new->policy->mode : 0);
1815}
1816
1817/* Find shared policy intersecting idx */
1818struct mempolicy *
1819mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1820{
1821        struct mempolicy *pol = NULL;
1822        struct sp_node *sn;
1823
1824        if (!sp->root.rb_node)
1825                return NULL;
1826        spin_lock(&sp->lock);
1827        sn = sp_lookup(sp, idx, idx+1);
1828        if (sn) {
1829                mpol_get(sn->policy);
1830                pol = sn->policy;
1831        }
1832        spin_unlock(&sp->lock);
1833        return pol;
1834}
1835
1836static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1837{
1838        pr_debug("deleting %lx-%lx\n", n->start, n->end);
1839        rb_erase(&n->nd, &sp->root);
1840        mpol_put(n->policy);
1841        kmem_cache_free(sn_cache, n);
1842}
1843
1844static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1845                                struct mempolicy *pol)
1846{
1847        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1848
1849        if (!n)
1850                return NULL;
1851        n->start = start;
1852        n->end = end;
1853        mpol_get(pol);
1854        pol->flags |= MPOL_F_SHARED;    /* for unref */
1855        n->policy = pol;
1856        return n;
1857}
1858
1859/* Replace a policy range. */
1860static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1861                                 unsigned long end, struct sp_node *new)
1862{
1863        struct sp_node *n, *new2 = NULL;
1864
1865restart:
1866        spin_lock(&sp->lock);
1867        n = sp_lookup(sp, start, end);
1868        /* Take care of old policies in the same range. */
1869        while (n && n->start < end) {
1870                struct rb_node *next = rb_next(&n->nd);
1871                if (n->start >= start) {
1872                        if (n->end <= end)
1873                                sp_delete(sp, n);
1874                        else
1875                                n->start = end;
1876                } else {
1877                        /* Old policy spanning whole new range. */
1878                        if (n->end > end) {
1879                                if (!new2) {
1880                                        spin_unlock(&sp->lock);
1881                                        new2 = sp_alloc(end, n->end, n->policy);
1882                                        if (!new2)
1883                                                return -ENOMEM;
1884                                        goto restart;
1885                                }
1886                                n->end = start;
1887                                sp_insert(sp, new2);
1888                                new2 = NULL;
1889                                break;
1890                        } else
1891                                n->end = start;
1892                }
1893                if (!next)
1894                        break;
1895                n = rb_entry(next, struct sp_node, nd);
1896        }
1897        if (new)
1898                sp_insert(sp, new);
1899        spin_unlock(&sp->lock);
1900        if (new2) {
1901                mpol_put(new2->policy);
1902                kmem_cache_free(sn_cache, new2);
1903        }
1904        return 0;
1905}
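
/*
 * Worked example (illustrative): if the tree holds a single range
 * [0, 100) with policy A and the new node covers [40, 60) with policy
 * B, the loop above takes the "old policy spanning whole new range"
 * branch: the old node is trimmed to [0, 40), a second node [60, 100)
 * still carrying A is inserted via new2, and finally [40, 60) with B
 * is inserted, so the three ranges tile the original span without
 * overlap.
 */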
1906
1907/**
1908 * mpol_shared_policy_init - initialize shared policy for inode
1909 * @sp: pointer to inode shared policy
1910 * @mpol:  struct mempolicy to install
1911 *
1912 * Install non-NULL @mpol in inode's shared policy rb-tree.
1913 * On entry, the current task has a reference on a non-NULL @mpol.
1914 * This must be released on exit.
1915 * This is called from get_inode(), so we can use GFP_KERNEL.
1916 */
1917void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1918{
1919        int ret;
1920
1921        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1922        spin_lock_init(&sp->lock);
1923
1924        if (mpol) {
1925                struct vm_area_struct pvma;
1926                struct mempolicy *new;
1927                NODEMASK_SCRATCH(scratch);
1928
                if (!scratch) {
                        mpol_put(mpol); /* drop our ref on sb mpol */
                        return;
                }
1931                /* contextualize the tmpfs mount point mempolicy */
1932                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1933                if (IS_ERR(new)) {
1934                        mpol_put(mpol); /* drop our ref on sb mpol */
1935                        NODEMASK_SCRATCH_FREE(scratch);
1936                        return;         /* no valid nodemask intersection */
1937                }
1938
1939                task_lock(current);
1940                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1941                task_unlock(current);
1942                mpol_put(mpol); /* drop our ref on sb mpol */
1943                if (ret) {
1944                        NODEMASK_SCRATCH_FREE(scratch);
1945                        mpol_put(new);
1946                        return;
1947                }
1948
1949                /* Create pseudo-vma that contains just the policy */
1950                memset(&pvma, 0, sizeof(struct vm_area_struct));
1951                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1952                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1953                mpol_put(new);                  /* drop initial ref */
1954                NODEMASK_SCRATCH_FREE(scratch);
1955        }
1956}
1957
1958int mpol_set_shared_policy(struct shared_policy *info,
1959                        struct vm_area_struct *vma, struct mempolicy *npol)
1960{
1961        int err;
1962        struct sp_node *new = NULL;
1963        unsigned long sz = vma_pages(vma);
1964
1965        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1966                 vma->vm_pgoff,
1967                 sz, npol ? npol->mode : -1,
1968                 npol ? npol->flags : -1,
1969                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1970
1971        if (npol) {
1972                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1973                if (!new)
1974                        return -ENOMEM;
1975        }
1976        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1977        if (err && new)
1978                kmem_cache_free(sn_cache, new);
1979        return err;
1980}
1981
1982/* Free a backing policy store on inode delete. */
1983void mpol_free_shared_policy(struct shared_policy *p)
1984{
1985        struct sp_node *n;
1986        struct rb_node *next;
1987
1988        if (!p->root.rb_node)
1989                return;
1990        spin_lock(&p->lock);
1991        next = rb_first(&p->root);
1992        while (next) {
1993                n = rb_entry(next, struct sp_node, nd);
1994                next = rb_next(&n->nd);
1995                rb_erase(&n->nd, &p->root);
1996                mpol_put(n->policy);
1997                kmem_cache_free(sn_cache, n);
1998        }
1999        spin_unlock(&p->lock);
2000}
2001
2002/* assumes fs == KERNEL_DS */
2003void __init numa_policy_init(void)
2004{
2005        nodemask_t interleave_nodes;
2006        unsigned long largest = 0;
2007        int nid, prefer = 0;
2008
2009        policy_cache = kmem_cache_create("numa_policy",
2010                                         sizeof(struct mempolicy),
2011                                         0, SLAB_PANIC, NULL);
2012
2013        sn_cache = kmem_cache_create("shared_policy_node",
2014                                     sizeof(struct sp_node),
2015                                     0, SLAB_PANIC, NULL);
2016
2017        /*
2018         * Set interleaving policy for system init. Interleaving is only
2019         * enabled across suitably sized nodes (default is >= 16MB), or
2020         * fall back to the largest node if they're all smaller.
2021         */
2022        nodes_clear(interleave_nodes);
2023        for_each_node_state(nid, N_HIGH_MEMORY) {
2024                unsigned long total_pages = node_present_pages(nid);
2025
2026                /* Preserve the largest node */
2027                if (largest < total_pages) {
2028                        largest = total_pages;
2029                        prefer = nid;
2030                }
2031
2032                /* Interleave this node? */
2033                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2034                        node_set(nid, interleave_nodes);
2035        }
2036
2037        /* All too small, use the largest */
2038        if (unlikely(nodes_empty(interleave_nodes)))
2039                node_set(prefer, interleave_nodes);
2040
2041        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2042                printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2043}
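
/*
 * Worked example (illustrative): on a machine whose nodes have 8MB,
 * 64MB and 512MB of present memory, only the last two pass the
 * (total_pages << PAGE_SHIFT) >= 16MB test, so init runs with
 * MPOL_INTERLEAVE over those two nodes; if every node were under
 * 16MB, the interleave set would fall back to just the largest node.
 */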
2044
2045/* Reset policy of current process to default */
2046void numa_default_policy(void)
2047{
2048        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2049}
2050
2051/*
2052 * Parse and format mempolicy from/to strings
2053 */
2054
2055/*
2056 * "local" is a pseudo-policy:  MPOL_PREFERRED with the MPOL_F_LOCAL flag
2057 * Used only for mpol_parse_str() and mpol_to_str()
2058 */
2059#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2060static const char * const policy_types[] =
2061        { "default", "prefer", "bind", "interleave", "local" };
2062
2063
2064#ifdef CONFIG_TMPFS
2065/**
2066 * mpol_parse_str - parse string to mempolicy
2067 * @str:  string containing mempolicy to parse
2068 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2069 * @no_context:  flag whether to "contextualize" the mempolicy
2070 *
2071 * Format of input:
2072 *      <mode>[=<flags>][:<nodelist>]
2073 *
2074 * if @no_context is true, save the input nodemask in w.user_nodemask in
2075 * the returned mempolicy.  This will be used to "clone" the mempolicy in
2076 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2077 * mount option.  Note that if 'static' or 'relative' mode flags were
2078 * specified, the input nodemask will already have been saved.  Saving
2079 * it again is redundant, but safe.
2080 *
2081 * On success, returns 0, else 1
2082 */
2083int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2084{
2085        struct mempolicy *new = NULL;
2086        unsigned short uninitialized_var(mode);
2087        unsigned short uninitialized_var(mode_flags);
2088        nodemask_t nodes;
2089        char *nodelist = strchr(str, ':');
2090        char *flags = strchr(str, '=');
2091        int i;
2092        int err = 1;
2093
2094        if (nodelist) {
2095                /* NUL-terminate mode or flags string */
2096                *nodelist++ = '\0';
2097                if (nodelist_parse(nodelist, nodes))
2098                        goto out;
2099                if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2100                        goto out;
2101        } else
2102                nodes_clear(nodes);
2103
2104        if (flags)
2105                *flags++ = '\0';        /* terminate mode string */
2106
2107        for (i = 0; i <= MPOL_LOCAL; i++) {
2108                if (!strcmp(str, policy_types[i])) {
2109                        mode = i;
2110                        break;
2111                }
2112        }
2113        if (i > MPOL_LOCAL)
2114                goto out;
2115
2116        switch (mode) {
2117        case MPOL_PREFERRED:
2118                /*
2119                 * Insist on a nodelist of one node only
2120                 */
2121                if (nodelist) {
2122                        char *rest = nodelist;
2123                        while (isdigit(*rest))
2124                                rest++;
2125                        if (!*rest)
2126                                err = 0;
2127                }
2128                break;
2129        case MPOL_INTERLEAVE:
2130                /*
2131                 * Default to online nodes with memory if no nodelist
2132                 */
2133                if (!nodelist)
2134                        nodes = node_states[N_HIGH_MEMORY];
2135                err = 0;
2136                break;
        case MPOL_LOCAL:
                /* Don't allow a nodelist;  mpol_new() checks flags */
                if (nodelist)
                        goto out;
                mode = MPOL_PREFERRED;
                err = 0;
                break;
        case MPOL_BIND:
                /* Insist on a nodelist;  mpol_new() enforces a non-empty mask */
                if (nodelist)
                        err = 0;
                break;
        case MPOL_DEFAULT:
                /* Insist on an empty nodelist;  the default policy is a NULL *mpol */
                if (!nodelist)
                        err = 0;
                goto out;
        }
2151
2152        mode_flags = 0;
2153        if (flags) {
2154                /*
2155                 * Currently, we only support two mutually exclusive
2156                 * mode flags.
2157                 */
2158                if (!strcmp(flags, "static"))
2159                        mode_flags |= MPOL_F_STATIC_NODES;
2160                else if (!strcmp(flags, "relative"))
2161                        mode_flags |= MPOL_F_RELATIVE_NODES;
2162                else
2163                        err = 1;
2164        }
2165
2166        new = mpol_new(mode, mode_flags, &nodes);
2167        if (IS_ERR(new))
2168                err = 1;
2169        else {
2170                int ret;
2171                NODEMASK_SCRATCH(scratch);
2172                if (scratch) {
2173                        task_lock(current);
2174                        ret = mpol_set_nodemask(new, &nodes, scratch);
2175                        task_unlock(current);
2176                } else
2177                        ret = -ENOMEM;
2178                NODEMASK_SCRATCH_FREE(scratch);
2179                if (ret) {
2180                        err = 1;
2181                        mpol_put(new);
2182                } else if (no_context) {
2183                        /* save for contextualization */
2184                        new->w.user_nodemask = nodes;
2185                }
2186        }
2187
2188out:
2189        /* Restore string for error message */
2190        if (nodelist)
2191                *--nodelist = ':';
2192        if (flags)
2193                *--flags = '=';
2194        if (!err)
2195                *mpol = new;
2196        return err;
2197}
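
/*
 * Illustrative inputs in the format documented above (e.g. from a
 * tmpfs "mpol=" mount option):
 *
 *        "interleave:0-3"      MPOL_INTERLEAVE over nodes 0-3
 *        "prefer=static:1"     MPOL_PREFERRED of node 1, MPOL_F_STATIC_NODES
 *        "bind:1,3"            MPOL_BIND to nodes 1 and 3
 *        "local"               MPOL_PREFERRED with MPOL_F_LOCAL
 *
 * A caller sketch, with no_context = 1 as tmpfs uses at mount time
 * ("value" being a hypothetical option string):
 *
 *        struct mempolicy *mpol;
 *
 *        if (mpol_parse_str(value, &mpol, 1))
 *                return -EINVAL;
 */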
2198#endif /* CONFIG_TMPFS */
2199
2200/**
2201 * mpol_to_str - format a mempolicy structure for printing
2202 * @buffer:  to contain formatted mempolicy string
2203 * @maxlen:  length of @buffer
2204 * @pol:  pointer to mempolicy to be formatted
2205 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2206 *
2207 * Convert a mempolicy into a string.
2208 * Returns the number of characters in buffer (if positive)
2209 * or an error (negative)
2210 */
2211int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2212{
2213        char *p = buffer;
2214        int l;
2215        nodemask_t nodes;
2216        unsigned short mode;
2217        unsigned short flags = pol ? pol->flags : 0;
2218
2219        /*
2220         * Sanity check:  room for longest mode, flag and some nodes
2221         */
2222        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2223
2224        if (!pol || pol == &default_policy)
2225                mode = MPOL_DEFAULT;
2226        else
2227                mode = pol->mode;
2228
2229        switch (mode) {
2230        case MPOL_DEFAULT:
2231                nodes_clear(nodes);
2232                break;
2233
2234        case MPOL_PREFERRED:
2235                nodes_clear(nodes);
2236                if (flags & MPOL_F_LOCAL)
2237                        mode = MPOL_LOCAL;      /* pseudo-policy */
2238                else
2239                        node_set(pol->v.preferred_node, nodes);
2240                break;
2241
2242        case MPOL_BIND:
2243                /* Fall through */
2244        case MPOL_INTERLEAVE:
2245                if (no_context)
2246                        nodes = pol->w.user_nodemask;
2247                else
2248                        nodes = pol->v.nodes;
2249                break;
2250
2251        default:
2252                BUG();
2253        }
2254
2255        l = strlen(policy_types[mode]);
2256        if (buffer + maxlen < p + l + 1)
2257                return -ENOSPC;
2258
2259        strcpy(p, policy_types[mode]);
2260        p += l;
2261
2262        if (flags & MPOL_MODE_FLAGS) {
2263                if (buffer + maxlen < p + 2)
2264                        return -ENOSPC;
2265                *p++ = '=';
2266
2267                /*
2268                 * Currently, the only defined flags are mutually exclusive
2269                 */
2270                if (flags & MPOL_F_STATIC_NODES)
2271                        p += snprintf(p, buffer + maxlen - p, "static");
2272                else if (flags & MPOL_F_RELATIVE_NODES)
2273                        p += snprintf(p, buffer + maxlen - p, "relative");
2274        }
2275
2276        if (!nodes_empty(nodes)) {
2277                if (buffer + maxlen < p + 2)
2278                        return -ENOSPC;
2279                *p++ = ':';
2280                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2281        }
2282        return p - buffer;
2283}
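
/*
 * Illustrative outputs (the inverse of the parsing format above): the
 * system default policy prints as "default", an interleave policy over
 * nodes 0-3 carrying MPOL_F_RELATIVE_NODES prints as
 * "interleave=relative:0-3", and MPOL_PREFERRED with MPOL_F_LOCAL
 * prints simply as "local".
 */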
2284
2285struct numa_maps {
2286        unsigned long pages;
2287        unsigned long anon;
2288        unsigned long active;
2289        unsigned long writeback;
2290        unsigned long mapcount_max;
2291        unsigned long dirty;
2292        unsigned long swapcache;
2293        unsigned long node[MAX_NUMNODES];
2294};
2295
2296static void gather_stats(struct page *page, void *private, int pte_dirty)
2297{
2298        struct numa_maps *md = private;
2299        int count = page_mapcount(page);
2300
2301        md->pages++;
2302        if (pte_dirty || PageDirty(page))
2303                md->dirty++;
2304
2305        if (PageSwapCache(page))
2306                md->swapcache++;
2307
2308        if (PageActive(page) || PageUnevictable(page))
2309                md->active++;
2310
2311        if (PageWriteback(page))
2312                md->writeback++;
2313
2314        if (PageAnon(page))
2315                md->anon++;
2316
2317        if (count > md->mapcount_max)
2318                md->mapcount_max = count;
2319
2320        md->node[page_to_nid(page)]++;
2321}
2322
2323#ifdef CONFIG_HUGETLB_PAGE
2324static void check_huge_range(struct vm_area_struct *vma,
2325                unsigned long start, unsigned long end,
2326                struct numa_maps *md)
2327{
2328        unsigned long addr;
2329        struct page *page;
2330        struct hstate *h = hstate_vma(vma);
2331        unsigned long sz = huge_page_size(h);
2332
2333        for (addr = start; addr < end; addr += sz) {
2334                pte_t *ptep = huge_pte_offset(vma->vm_mm,
2335                                                addr & huge_page_mask(h));
2336                pte_t pte;
2337
2338                if (!ptep)
2339                        continue;
2340
2341                pte = *ptep;
2342                if (pte_none(pte))
2343                        continue;
2344
2345                page = pte_page(pte);
2346                if (!page)
2347                        continue;
2348
2349                gather_stats(page, md, pte_dirty(*ptep));
2350        }
2351}
2352#else
2353static inline void check_huge_range(struct vm_area_struct *vma,
2354                unsigned long start, unsigned long end,
2355                struct numa_maps *md)
2356{
2357}
2358#endif
2359
2360/*
2361 * Display pages allocated per node and memory policy via /proc.
2362 */
2363int show_numa_map(struct seq_file *m, void *v)
2364{
2365        struct proc_maps_private *priv = m->private;
2366        struct vm_area_struct *vma = v;
2367        struct numa_maps *md;
2368        struct file *file = vma->vm_file;
2369        struct mm_struct *mm = vma->vm_mm;
2370        struct mempolicy *pol;
2371        int n;
2372        char buffer[50];
2373
2374        if (!mm)
2375                return 0;
2376
2377        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2378        if (!md)
2379                return 0;
2380
2381        pol = get_vma_policy(priv->task, vma, vma->vm_start);
2382        mpol_to_str(buffer, sizeof(buffer), pol, 0);
2383        mpol_cond_put(pol);
2384
2385        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2386
2387        if (file) {
2388                seq_printf(m, " file=");
2389                seq_path(m, &file->f_path, "\n\t= ");
2390        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2391                seq_printf(m, " heap");
2392        } else if (vma->vm_start <= mm->start_stack &&
2393                        vma->vm_end >= mm->start_stack) {
2394                seq_printf(m, " stack");
2395        }
2396
2397        if (is_vm_hugetlb_page(vma)) {
2398                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2399                seq_printf(m, " huge");
2400        } else {
2401                check_pgd_range(vma, vma->vm_start, vma->vm_end,
2402                        &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2403        }
2404
2405        if (!md->pages)
2406                goto out;
2407
2408        if (md->anon)
2409                seq_printf(m, " anon=%lu", md->anon);
2410
2411        if (md->dirty)
2412                seq_printf(m, " dirty=%lu", md->dirty);
2413
2414        if (md->pages != md->anon && md->pages != md->dirty)
2415                seq_printf(m, " mapped=%lu", md->pages);
2416
2417        if (md->mapcount_max > 1)
2418                seq_printf(m, " mapmax=%lu", md->mapcount_max);
2419
2420        if (md->swapcache)
2421                seq_printf(m, " swapcache=%lu", md->swapcache);
2422
2423        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2424                seq_printf(m, " active=%lu", md->active);
2425
2426        if (md->writeback)
2427                seq_printf(m, " writeback=%lu", md->writeback);
2428
2429        for_each_node_state(n, N_HIGH_MEMORY)
2430                if (md->node[n])
2431                        seq_printf(m, " N%d=%lu", n, md->node[n]);
2432out:
2433        seq_putc(m, '\n');
2434        kfree(md);
2435
2436        if (m->count < m->size)
2437                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2438        return 0;
2439}
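
/*
 * Illustrative /proc/PID/numa_maps line produced by the code above
 * (the values are made up):
 *
 *        00400000 default file=/bin/cat mapped=9 mapmax=2 N0=5 N1=4
 *
 * i.e. the VMA start address, the policy string from mpol_to_str(),
 * the backing object (or "heap"/"stack"), the per-VMA counters, and
 * the per-node page counts gathered above.
 */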
2440