linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
    8 * NUMA policy allows the user to give hints about the node(s) on which
    9 * memory should be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process
   20 *                counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
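/*
 * Illustrative userspace sketch (an editor's example, not part of this file):
 * one way the process policy described above might be selected with
 * set_mempolicy(2).  Assumes libnuma's <numaif.h> for the MPOL_* constants
 * and syscall wrappers, and a machine with at least two memory nodes.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *		// Interleave this process' future allocations over nodes 0-1.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8 + 1))
 *			perror("set_mempolicy");
 *
 *		// Revert to the default local-node policy.
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *		return 0;
 *	}
 */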
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel does not always handle that gracefully.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/nodemask.h>
  77#include <linux/cpuset.h>
  78#include <linux/slab.h>
  79#include <linux/string.h>
  80#include <linux/export.h>
  81#include <linux/nsproxy.h>
  82#include <linux/interrupt.h>
  83#include <linux/init.h>
  84#include <linux/compat.h>
  85#include <linux/swap.h>
  86#include <linux/seq_file.h>
  87#include <linux/proc_fs.h>
  88#include <linux/migrate.h>
  89#include <linux/ksm.h>
  90#include <linux/rmap.h>
  91#include <linux/security.h>
  92#include <linux/syscalls.h>
  93#include <linux/ctype.h>
  94#include <linux/mm_inline.h>
  95#include <linux/mmu_notifier.h>
  96#include <linux/printk.h>
  97
  98#include <asm/tlbflush.h>
  99#include <asm/uaccess.h>
 100#include <linux/random.h>
 101
 102#include "internal.h"
 103
 104/* Internal flags */
 105#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 106#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 107
 108static struct kmem_cache *policy_cache;
 109static struct kmem_cache *sn_cache;
 110
  111/* Highest zone. A specific allocation for a zone below that is not
 112   policied. */
 113enum zone_type policy_zone = 0;
 114
 115/*
 116 * run-time system-wide default policy => local allocation
 117 */
 118static struct mempolicy default_policy = {
 119        .refcnt = ATOMIC_INIT(1), /* never free it */
 120        .mode = MPOL_PREFERRED,
 121        .flags = MPOL_F_LOCAL,
 122};
 123
 124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126struct mempolicy *get_task_policy(struct task_struct *p)
 127{
 128        struct mempolicy *pol = p->mempolicy;
 129        int node;
 130
 131        if (pol)
 132                return pol;
 133
 134        node = numa_node_id();
 135        if (node != NUMA_NO_NODE) {
 136                pol = &preferred_node_policy[node];
 137                /* preferred_node_policy is not initialised early in boot */
 138                if (pol->mode)
 139                        return pol;
 140        }
 141
 142        return &default_policy;
 143}
 144
 145static const struct mempolicy_operations {
 146        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 147        /*
  148         * If the read-side task has no lock to protect task->mempolicy, the
  149         * write-side task rebinds task->mempolicy in two steps. The first step
  150         * sets all the newly allowed nodes, and the second step clears all the
  151         * disallowed nodes. This way we never end up with no node left to
  152         * allocate a page from.
  153         * If the read side holds a lock protecting task->mempolicy, we rebind
  154         * directly.
 155         *
 156         * step:
 157         *      MPOL_REBIND_ONCE - do rebind work at once
  158         *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 159         *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 160         */
 161        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 162                        enum mpol_rebind_step step);
 163} mpol_ops[MPOL_MAX];
 164
 165static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 166{
 167        return pol->flags & MPOL_MODE_FLAGS;
 168}
 169
 170static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 171                                   const nodemask_t *rel)
 172{
 173        nodemask_t tmp;
 174        nodes_fold(tmp, *orig, nodes_weight(*rel));
 175        nodes_onto(*ret, tmp, *rel);
 176}
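/*
 * Worked example for the helper above (hypothetical values), assuming the
 * usual nodes_fold()/nodes_onto() semantics: with a relative user nodemask
 * orig = {0,2} and an allowed set rel = {4,5,6,7} (weight 4), folding leaves
 * orig unchanged (all bits < 4) and nodes_onto() maps bit n onto the n-th set
 * bit of rel, so ret = {4,6}.  A bit such as 5 in orig would first be folded
 * modulo 4 onto bit 1 and then land on node 5.
 */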
 177
 178static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 179{
 180        if (nodes_empty(*nodes))
 181                return -EINVAL;
 182        pol->v.nodes = *nodes;
 183        return 0;
 184}
 185
 186static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (!nodes)
 189                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 190        else if (nodes_empty(*nodes))
 191                return -EINVAL;                 /*  no allowed nodes */
 192        else
 193                pol->v.preferred_node = first_node(*nodes);
 194        return 0;
 195}
 196
 197static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 198{
 199        if (nodes_empty(*nodes))
 200                return -EINVAL;
 201        pol->v.nodes = *nodes;
 202        return 0;
 203}
 204
 205/*
 206 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 207 * any, for the new policy.  mpol_new() has already validated the nodes
 208 * parameter with respect to the policy mode and flags.  But, we need to
 209 * handle an empty nodemask with MPOL_PREFERRED here.
 210 *
 211 * Must be called holding task's alloc_lock to protect task's mems_allowed
 212 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 213 */
 214static int mpol_set_nodemask(struct mempolicy *pol,
 215                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 216{
 217        int ret;
 218
 219        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 220        if (pol == NULL)
 221                return 0;
 222        /* Check N_MEMORY */
 223        nodes_and(nsc->mask1,
 224                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 225
 226        VM_BUG_ON(!nodes);
 227        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 228                nodes = NULL;   /* explicit local allocation */
 229        else {
 230                if (pol->flags & MPOL_F_RELATIVE_NODES)
 231                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 232                else
 233                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 234
 235                if (mpol_store_user_nodemask(pol))
 236                        pol->w.user_nodemask = *nodes;
 237                else
 238                        pol->w.cpuset_mems_allowed =
 239                                                cpuset_current_mems_allowed;
 240        }
 241
 242        if (nodes)
 243                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 244        else
 245                ret = mpol_ops[pol->mode].create(pol, NULL);
 246        return ret;
 247}
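/*
 * Example of the contextualization above (hypothetical values): a task whose
 * cpuset allows nodes {0,1,2} asking for MPOL_BIND over {2,3} ends up with
 * nsc->mask2 = {2}; with MPOL_F_RELATIVE_NODES the request is instead remapped
 * relative to the allowed set by mpol_relative_nodemask().  The original user
 * mask is remembered in w.user_nodemask only when one of the MPOL_F_*_NODES
 * flags was supplied.
 */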
 248
 249/*
  250 * This function just creates a new policy, does some checking and simple
 251 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 252 */
 253static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 254                                  nodemask_t *nodes)
 255{
 256        struct mempolicy *policy;
 257
 258        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 259                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 260
 261        if (mode == MPOL_DEFAULT) {
 262                if (nodes && !nodes_empty(*nodes))
 263                        return ERR_PTR(-EINVAL);
 264                return NULL;
 265        }
 266        VM_BUG_ON(!nodes);
 267
 268        /*
 269         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 270         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 271         * All other modes require a valid pointer to a non-empty nodemask.
 272         */
 273        if (mode == MPOL_PREFERRED) {
 274                if (nodes_empty(*nodes)) {
 275                        if (((flags & MPOL_F_STATIC_NODES) ||
 276                             (flags & MPOL_F_RELATIVE_NODES)))
 277                                return ERR_PTR(-EINVAL);
 278                }
 279        } else if (mode == MPOL_LOCAL) {
 280                if (!nodes_empty(*nodes))
 281                        return ERR_PTR(-EINVAL);
 282                mode = MPOL_PREFERRED;
 283        } else if (nodes_empty(*nodes))
 284                return ERR_PTR(-EINVAL);
 285        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 286        if (!policy)
 287                return ERR_PTR(-ENOMEM);
 288        atomic_set(&policy->refcnt, 1);
 289        policy->mode = mode;
 290        policy->flags = flags;
 291
 292        return policy;
 293}
 294
 295/* Slow path of a mpol destructor. */
 296void __mpol_put(struct mempolicy *p)
 297{
 298        if (!atomic_dec_and_test(&p->refcnt))
 299                return;
 300        kmem_cache_free(policy_cache, p);
 301}
 302
 303static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 304                                enum mpol_rebind_step step)
 305{
 306}
 307
 308/*
 309 * step:
 310 *      MPOL_REBIND_ONCE  - do rebind work at once
  311 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 312 *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 313 */
 314static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 315                                 enum mpol_rebind_step step)
 316{
 317        nodemask_t tmp;
 318
 319        if (pol->flags & MPOL_F_STATIC_NODES)
 320                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 321        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 322                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 323        else {
 324                /*
 325                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 326                 * result
 327                 */
 328                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 329                        nodes_remap(tmp, pol->v.nodes,
 330                                        pol->w.cpuset_mems_allowed, *nodes);
 331                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 332                } else if (step == MPOL_REBIND_STEP2) {
 333                        tmp = pol->w.cpuset_mems_allowed;
 334                        pol->w.cpuset_mems_allowed = *nodes;
 335                } else
 336                        BUG();
 337        }
 338
 339        if (nodes_empty(tmp))
 340                tmp = *nodes;
 341
 342        if (step == MPOL_REBIND_STEP1)
 343                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 344        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 345                pol->v.nodes = tmp;
 346        else
 347                BUG();
 348
 349        if (!node_isset(current->il_next, tmp)) {
 350                current->il_next = next_node(current->il_next, tmp);
 351                if (current->il_next >= MAX_NUMNODES)
 352                        current->il_next = first_node(tmp);
 353                if (current->il_next >= MAX_NUMNODES)
 354                        current->il_next = numa_node_id();
 355        }
 356}
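/*
 * Rebind example (hypothetical cpuset change): an MPOL_INTERLEAVE policy over
 * {0,1} created while the cpuset allowed {0,1,2,3}.  If the cpuset moves to
 * {4,5,6,7}, the default path (no MPOL_F_*_NODES flags) remaps the policy to
 * {4,5}; with MPOL_F_STATIC_NODES the new mask is user_nodemask & new, which
 * here is empty, so the code above falls back to the whole new mask
 * {4,5,6,7}.
 */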
 357
 358static void mpol_rebind_preferred(struct mempolicy *pol,
 359                                  const nodemask_t *nodes,
 360                                  enum mpol_rebind_step step)
 361{
 362        nodemask_t tmp;
 363
 364        if (pol->flags & MPOL_F_STATIC_NODES) {
 365                int node = first_node(pol->w.user_nodemask);
 366
 367                if (node_isset(node, *nodes)) {
 368                        pol->v.preferred_node = node;
 369                        pol->flags &= ~MPOL_F_LOCAL;
 370                } else
 371                        pol->flags |= MPOL_F_LOCAL;
 372        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 373                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 374                pol->v.preferred_node = first_node(tmp);
 375        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 376                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 377                                                   pol->w.cpuset_mems_allowed,
 378                                                   *nodes);
 379                pol->w.cpuset_mems_allowed = *nodes;
 380        }
 381}
 382
 383/*
 384 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 385 *
  386 * If the read-side task has no lock to protect task->mempolicy, the
  387 * write-side task rebinds task->mempolicy in two steps. The first step
  388 * sets all the newly allowed nodes, and the second step clears all the
  389 * disallowed nodes. This way we never end up with no node left to
  390 * allocate a page from.
  391 * If the read side holds a lock protecting task->mempolicy, we rebind
  392 * directly.
 393 *
 394 * step:
 395 *      MPOL_REBIND_ONCE  - do rebind work at once
  396 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 397 *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 398 */
 399static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 400                                enum mpol_rebind_step step)
 401{
 402        if (!pol)
 403                return;
 404        if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 405            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 406                return;
 407
 408        if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 409                return;
 410
 411        if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 412                BUG();
 413
 414        if (step == MPOL_REBIND_STEP1)
 415                pol->flags |= MPOL_F_REBINDING;
 416        else if (step == MPOL_REBIND_STEP2)
 417                pol->flags &= ~MPOL_F_REBINDING;
 418        else if (step >= MPOL_REBIND_NSTEP)
 419                BUG();
 420
 421        mpol_ops[pol->mode].rebind(pol, newmask, step);
 422}
 423
 424/*
 425 * Wrapper for mpol_rebind_policy() that just requires task
 426 * pointer, and updates task mempolicy.
 427 *
 428 * Called with task's alloc_lock held.
 429 */
 430
 431void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 432                        enum mpol_rebind_step step)
 433{
 434        mpol_rebind_policy(tsk->mempolicy, new, step);
 435}
 436
 437/*
 438 * Rebind each vma in mm to new nodemask.
 439 *
 440 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 441 */
 442
 443void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 444{
 445        struct vm_area_struct *vma;
 446
 447        down_write(&mm->mmap_sem);
 448        for (vma = mm->mmap; vma; vma = vma->vm_next)
 449                mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 450        up_write(&mm->mmap_sem);
 451}
 452
 453static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 454        [MPOL_DEFAULT] = {
 455                .rebind = mpol_rebind_default,
 456        },
 457        [MPOL_INTERLEAVE] = {
 458                .create = mpol_new_interleave,
 459                .rebind = mpol_rebind_nodemask,
 460        },
 461        [MPOL_PREFERRED] = {
 462                .create = mpol_new_preferred,
 463                .rebind = mpol_rebind_preferred,
 464        },
 465        [MPOL_BIND] = {
 466                .create = mpol_new_bind,
 467                .rebind = mpol_rebind_nodemask,
 468        },
 469};
 470
 471static void migrate_page_add(struct page *page, struct list_head *pagelist,
 472                                unsigned long flags);
 473
 474struct queue_pages {
 475        struct list_head *pagelist;
 476        unsigned long flags;
 477        nodemask_t *nmask;
 478        struct vm_area_struct *prev;
 479};
 480
 481/*
  482 * Scan through the ptes in a pmd range, checking whether each page meets
  483 * the given conditions, and move it to the pagelist if it does.
 484 */
 485static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 486                        unsigned long end, struct mm_walk *walk)
 487{
 488        struct vm_area_struct *vma = walk->vma;
 489        struct page *page;
 490        struct queue_pages *qp = walk->private;
 491        unsigned long flags = qp->flags;
 492        int nid;
 493        pte_t *pte;
 494        spinlock_t *ptl;
 495
 496        split_huge_page_pmd(vma, addr, pmd);
 497        if (pmd_trans_unstable(pmd))
 498                return 0;
 499
 500        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 501        for (; addr != end; pte++, addr += PAGE_SIZE) {
 502                if (!pte_present(*pte))
 503                        continue;
 504                page = vm_normal_page(vma, addr, *pte);
 505                if (!page)
 506                        continue;
 507                /*
 508                 * vm_normal_page() filters out zero pages, but there might
 509                 * still be PageReserved pages to skip, perhaps in a VDSO.
 510                 */
 511                if (PageReserved(page))
 512                        continue;
 513                nid = page_to_nid(page);
 514                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 515                        continue;
 516
 517                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 518                        migrate_page_add(page, qp->pagelist, flags);
 519        }
 520        pte_unmap_unlock(pte - 1, ptl);
 521        cond_resched();
 522        return 0;
 523}
 524
 525static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 526                               unsigned long addr, unsigned long end,
 527                               struct mm_walk *walk)
 528{
 529#ifdef CONFIG_HUGETLB_PAGE
 530        struct queue_pages *qp = walk->private;
 531        unsigned long flags = qp->flags;
 532        int nid;
 533        struct page *page;
 534        spinlock_t *ptl;
 535        pte_t entry;
 536
 537        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 538        entry = huge_ptep_get(pte);
 539        if (!pte_present(entry))
 540                goto unlock;
 541        page = pte_page(entry);
 542        nid = page_to_nid(page);
 543        if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 544                goto unlock;
 545        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 546        if (flags & (MPOL_MF_MOVE_ALL) ||
 547            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 548                isolate_huge_page(page, qp->pagelist);
 549unlock:
 550        spin_unlock(ptl);
 551#else
 552        BUG();
 553#endif
 554        return 0;
 555}
 556
 557#ifdef CONFIG_NUMA_BALANCING
 558/*
  559 * This is used to mark a range of virtual addresses as inaccessible.
 560 * These are later cleared by a NUMA hinting fault. Depending on these
 561 * faults, pages may be migrated for better NUMA placement.
 562 *
 563 * This is assuming that NUMA faults are handled using PROT_NONE. If
 564 * an architecture makes a different choice, it will need further
 565 * changes to the core.
 566 */
 567unsigned long change_prot_numa(struct vm_area_struct *vma,
 568                        unsigned long addr, unsigned long end)
 569{
 570        int nr_updated;
 571
 572        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 573        if (nr_updated)
 574                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 575
 576        return nr_updated;
 577}
 578#else
 579static unsigned long change_prot_numa(struct vm_area_struct *vma,
 580                        unsigned long addr, unsigned long end)
 581{
 582        return 0;
 583}
 584#endif /* CONFIG_NUMA_BALANCING */
 585
 586static int queue_pages_test_walk(unsigned long start, unsigned long end,
 587                                struct mm_walk *walk)
 588{
 589        struct vm_area_struct *vma = walk->vma;
 590        struct queue_pages *qp = walk->private;
 591        unsigned long endvma = vma->vm_end;
 592        unsigned long flags = qp->flags;
 593
 594        if (vma->vm_flags & VM_PFNMAP)
 595                return 1;
 596
 597        if (endvma > end)
 598                endvma = end;
 599        if (vma->vm_start > start)
 600                start = vma->vm_start;
 601
 602        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 603                if (!vma->vm_next && vma->vm_end < end)
 604                        return -EFAULT;
 605                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 606                        return -EFAULT;
 607        }
 608
 609        qp->prev = vma;
 613
 614        if (flags & MPOL_MF_LAZY) {
 615                /* Similar to task_numa_work, skip inaccessible VMAs */
 616                if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
 617                        change_prot_numa(vma, start, endvma);
 618                return 1;
 619        }
 620
 621        if ((flags & MPOL_MF_STRICT) ||
 622            ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 623             vma_migratable(vma)))
 624                /* queue pages from current vma */
 625                return 0;
 626        return 1;
 627}
 628
 629/*
 630 * Walk through page tables and collect pages to be migrated.
 631 *
  632 * If pages found in a given range are on the set of nodes determined by
  633 * @nodes and @flags, they are isolated and queued onto the pagelist passed
  634 * in via @pagelist.
 635 */
 636static int
 637queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 638                nodemask_t *nodes, unsigned long flags,
 639                struct list_head *pagelist)
 640{
 641        struct queue_pages qp = {
 642                .pagelist = pagelist,
 643                .flags = flags,
 644                .nmask = nodes,
 645                .prev = NULL,
 646        };
 647        struct mm_walk queue_pages_walk = {
 648                .hugetlb_entry = queue_pages_hugetlb,
 649                .pmd_entry = queue_pages_pte_range,
 650                .test_walk = queue_pages_test_walk,
 651                .mm = mm,
 652                .private = &qp,
 653        };
 654
 655        return walk_page_range(start, end, &queue_pages_walk);
 656}
 657
 658/*
 659 * Apply policy to a single VMA
 660 * This must be called with the mmap_sem held for writing.
 661 */
 662static int vma_replace_policy(struct vm_area_struct *vma,
 663                                                struct mempolicy *pol)
 664{
 665        int err;
 666        struct mempolicy *old;
 667        struct mempolicy *new;
 668
 669        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 670                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 671                 vma->vm_ops, vma->vm_file,
 672                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 673
 674        new = mpol_dup(pol);
 675        if (IS_ERR(new))
 676                return PTR_ERR(new);
 677
 678        if (vma->vm_ops && vma->vm_ops->set_policy) {
 679                err = vma->vm_ops->set_policy(vma, new);
 680                if (err)
 681                        goto err_out;
 682        }
 683
 684        old = vma->vm_policy;
 685        vma->vm_policy = new; /* protected by mmap_sem */
 686        mpol_put(old);
 687
 688        return 0;
 689 err_out:
 690        mpol_put(new);
 691        return err;
 692}
 693
 694/* Step 2: apply policy to a range and do splits. */
 695static int mbind_range(struct mm_struct *mm, unsigned long start,
 696                       unsigned long end, struct mempolicy *new_pol)
 697{
 698        struct vm_area_struct *next;
 699        struct vm_area_struct *prev;
 700        struct vm_area_struct *vma;
 701        int err = 0;
 702        pgoff_t pgoff;
 703        unsigned long vmstart;
 704        unsigned long vmend;
 705
 706        vma = find_vma(mm, start);
 707        if (!vma || vma->vm_start > start)
 708                return -EFAULT;
 709
 710        prev = vma->vm_prev;
 711        if (start > vma->vm_start)
 712                prev = vma;
 713
 714        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 715                next = vma->vm_next;
 716                vmstart = max(start, vma->vm_start);
 717                vmend   = min(end, vma->vm_end);
 718
 719                if (mpol_equal(vma_policy(vma), new_pol))
 720                        continue;
 721
 722                pgoff = vma->vm_pgoff +
 723                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 724                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 725                                  vma->anon_vma, vma->vm_file, pgoff,
 726                                  new_pol);
 727                if (prev) {
 728                        vma = prev;
 729                        next = vma->vm_next;
 730                        if (mpol_equal(vma_policy(vma), new_pol))
 731                                continue;
 732                        /* vma_merge() joined vma && vma->next, case 8 */
 733                        goto replace;
 734                }
 735                if (vma->vm_start != vmstart) {
 736                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 737                        if (err)
 738                                goto out;
 739                }
 740                if (vma->vm_end != vmend) {
 741                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 742                        if (err)
 743                                goto out;
 744                }
 745 replace:
 746                err = vma_replace_policy(vma, new_pol);
 747                if (err)
 748                        goto out;
 749        }
 750
 751 out:
 752        return err;
 753}
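/*
 * Splitting example (hypothetical layout): if an existing VMA covers
 * [0x1000, 0x5000) and mbind() is applied to [0x2000, 0x4000) with a policy
 * that cannot be merged into a neighbour, split_vma() is called twice so that
 * only the middle VMA [0x2000, 0x4000) receives new_pol; the outer pieces
 * keep their previous policy.
 */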
 754
 755/* Set the process memory policy */
 756static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 757                             nodemask_t *nodes)
 758{
 759        struct mempolicy *new, *old;
 760        NODEMASK_SCRATCH(scratch);
 761        int ret;
 762
 763        if (!scratch)
 764                return -ENOMEM;
 765
 766        new = mpol_new(mode, flags, nodes);
 767        if (IS_ERR(new)) {
 768                ret = PTR_ERR(new);
 769                goto out;
 770        }
 771
 772        task_lock(current);
 773        ret = mpol_set_nodemask(new, nodes, scratch);
 774        if (ret) {
 775                task_unlock(current);
 776                mpol_put(new);
 777                goto out;
 778        }
 779        old = current->mempolicy;
 780        current->mempolicy = new;
 781        if (new && new->mode == MPOL_INTERLEAVE &&
 782            nodes_weight(new->v.nodes))
 783                current->il_next = first_node(new->v.nodes);
 784        task_unlock(current);
 785        mpol_put(old);
 786        ret = 0;
 787out:
 788        NODEMASK_SCRATCH_FREE(scratch);
 789        return ret;
 790}
 791
 792/*
 793 * Return nodemask for policy for get_mempolicy() query
 794 *
 795 * Called with task's alloc_lock held
 796 */
 797static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 798{
 799        nodes_clear(*nodes);
 800        if (p == &default_policy)
 801                return;
 802
 803        switch (p->mode) {
 804        case MPOL_BIND:
 805                /* Fall through */
 806        case MPOL_INTERLEAVE:
 807                *nodes = p->v.nodes;
 808                break;
 809        case MPOL_PREFERRED:
 810                if (!(p->flags & MPOL_F_LOCAL))
 811                        node_set(p->v.preferred_node, *nodes);
 812                /* else return empty node mask for local allocation */
 813                break;
 814        default:
 815                BUG();
 816        }
 817}
 818
 819static int lookup_node(struct mm_struct *mm, unsigned long addr)
 820{
 821        struct page *p;
 822        int err;
 823
 824        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 825        if (err >= 0) {
 826                err = page_to_nid(p);
 827                put_page(p);
 828        }
 829        return err;
 830}
 831
 832/* Retrieve NUMA policy */
 833static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 834                             unsigned long addr, unsigned long flags)
 835{
 836        int err;
 837        struct mm_struct *mm = current->mm;
 838        struct vm_area_struct *vma = NULL;
 839        struct mempolicy *pol = current->mempolicy;
 840
 841        if (flags &
 842                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 843                return -EINVAL;
 844
 845        if (flags & MPOL_F_MEMS_ALLOWED) {
 846                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 847                        return -EINVAL;
 848                *policy = 0;    /* just so it's initialized */
 849                task_lock(current);
 850                *nmask  = cpuset_current_mems_allowed;
 851                task_unlock(current);
 852                return 0;
 853        }
 854
 855        if (flags & MPOL_F_ADDR) {
 856                /*
 857                 * Do NOT fall back to task policy if the
 858                 * vma/shared policy at addr is NULL.  We
 859                 * want to return MPOL_DEFAULT in this case.
 860                 */
 861                down_read(&mm->mmap_sem);
 862                vma = find_vma_intersection(mm, addr, addr+1);
 863                if (!vma) {
 864                        up_read(&mm->mmap_sem);
 865                        return -EFAULT;
 866                }
 867                if (vma->vm_ops && vma->vm_ops->get_policy)
 868                        pol = vma->vm_ops->get_policy(vma, addr);
 869                else
 870                        pol = vma->vm_policy;
 871        } else if (addr)
 872                return -EINVAL;
 873
 874        if (!pol)
 875                pol = &default_policy;  /* indicates default behavior */
 876
 877        if (flags & MPOL_F_NODE) {
 878                if (flags & MPOL_F_ADDR) {
 879                        err = lookup_node(mm, addr);
 880                        if (err < 0)
 881                                goto out;
 882                        *policy = err;
 883                } else if (pol == current->mempolicy &&
 884                                pol->mode == MPOL_INTERLEAVE) {
 885                        *policy = current->il_next;
 886                } else {
 887                        err = -EINVAL;
 888                        goto out;
 889                }
 890        } else {
 891                *policy = pol == &default_policy ? MPOL_DEFAULT :
 892                                                pol->mode;
 893                /*
 894                 * Internal mempolicy flags must be masked off before exposing
 895                 * the policy to userspace.
 896                 */
 897                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 898        }
 899
 900        if (vma) {
 901                up_read(&current->mm->mmap_sem);
 902                vma = NULL;
 903        }
 904
 905        err = 0;
 906        if (nmask) {
 907                if (mpol_store_user_nodemask(pol)) {
 908                        *nmask = pol->w.user_nodemask;
 909                } else {
 910                        task_lock(current);
 911                        get_policy_nodemask(pol, nmask);
 912                        task_unlock(current);
 913                }
 914        }
 915
 916 out:
 917        mpol_cond_put(pol);
 918        if (vma)
 919                up_read(&current->mm->mmap_sem);
 920        return err;
 921}
 922
 923#ifdef CONFIG_MIGRATION
 924/*
 925 * page migration
 926 */
 927static void migrate_page_add(struct page *page, struct list_head *pagelist,
 928                                unsigned long flags)
 929{
 930        /*
 931         * Avoid migrating a page that is shared with others.
 932         */
 933        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 934                if (!isolate_lru_page(page)) {
 935                        list_add_tail(&page->lru, pagelist);
 936                        inc_zone_page_state(page, NR_ISOLATED_ANON +
 937                                            page_is_file_cache(page));
 938                }
 939        }
 940}
 941
 942static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 943{
 944        if (PageHuge(page))
 945                return alloc_huge_page_node(page_hstate(compound_head(page)),
 946                                        node);
 947        else
 948                return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
 949                                                    __GFP_THISNODE, 0);
 950}
 951
 952/*
 953 * Migrate pages from one node to a target node.
 954 * Returns error or the number of pages not migrated.
 955 */
 956static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 957                           int flags)
 958{
 959        nodemask_t nmask;
 960        LIST_HEAD(pagelist);
 961        int err = 0;
 962
 963        nodes_clear(nmask);
 964        node_set(source, nmask);
 965
 966        /*
 967         * This does not "check" the range but isolates all pages that
 968         * need migration.  Between passing in the full user address
 969         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
 970         */
 971        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
 972        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 973                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 974
 975        if (!list_empty(&pagelist)) {
 976                err = migrate_pages(&pagelist, new_node_page, NULL, dest,
 977                                        MIGRATE_SYNC, MR_SYSCALL);
 978                if (err)
 979                        putback_movable_pages(&pagelist);
 980        }
 981
 982        return err;
 983}
 984
 985/*
 986 * Move pages between the two nodesets so as to preserve the physical
 987 * layout as much as possible.
 988 *
  989 * Returns the number of pages that could not be moved.
 990 */
 991int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 992                     const nodemask_t *to, int flags)
 993{
 994        int busy = 0;
 995        int err;
 996        nodemask_t tmp;
 997
 998        err = migrate_prep();
 999        if (err)
1000                return err;
1001
1002        down_read(&mm->mmap_sem);
1003
1004        /*
1005         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1006         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1007         * bit in 'tmp', and return that <source, dest> pair for migration.
1008         * The pair of nodemasks 'to' and 'from' define the map.
1009         *
1010         * If no pair of bits is found that way, fallback to picking some
1011         * pair of 'source' and 'dest' bits that are not the same.  If the
1012         * 'source' and 'dest' bits are the same, this represents a node
1013         * that will be migrating to itself, so no pages need move.
1014         *
1015         * If no bits are left in 'tmp', or if all remaining bits left
1016         * in 'tmp' correspond to the same bit in 'to', return false
1017         * (nothing left to migrate).
1018         *
1019         * This lets us pick a pair of nodes to migrate between, such that
1020         * if possible the dest node is not already occupied by some other
1021         * source node, minimizing the risk of overloading the memory on a
1022         * node that would happen if we migrated incoming memory to a node
 1023         * before migrating outgoing memory from that same node.
1024         *
1025         * A single scan of tmp is sufficient.  As we go, we remember the
1026         * most recent <s, d> pair that moved (s != d).  If we find a pair
1027         * that not only moved, but what's better, moved to an empty slot
1028         * (d is not set in tmp), then we break out then, with that pair.
 1029         * Otherwise, when we finish scanning tmp, we at least have the
1030         * most recent <s, d> pair that moved.  If we get all the way through
1031         * the scan of tmp without finding any node that moved, much less
1032         * moved to an empty node, then there is nothing left worth migrating.
1033         */
1034
1035        tmp = *from;
1036        while (!nodes_empty(tmp)) {
1037                int s,d;
1038                int source = NUMA_NO_NODE;
1039                int dest = 0;
1040
1041                for_each_node_mask(s, tmp) {
1042
1043                        /*
1044                         * do_migrate_pages() tries to maintain the relative
1045                         * node relationship of the pages established between
1046                         * threads and memory areas.
1047                         *
1048                         * However if the number of source nodes is not equal to
1049                         * the number of destination nodes we can not preserve
1050                         * this node relative relationship.  In that case, skip
1051                         * copying memory from a node that is in the destination
1052                         * mask.
1053                         *
1054                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1055         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1056                         */
1057
1058                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1059                                                (node_isset(s, *to)))
1060                                continue;
1061
1062                        d = node_remap(s, *from, *to);
1063                        if (s == d)
1064                                continue;
1065
1066                        source = s;     /* Node moved. Memorize */
1067                        dest = d;
1068
1069                        /* dest not in remaining from nodes? */
1070                        if (!node_isset(dest, tmp))
1071                                break;
1072                }
1073                if (source == NUMA_NO_NODE)
1074                        break;
1075
1076                node_clear(source, tmp);
1077                err = migrate_to_node(mm, source, dest, flags);
1078                if (err > 0)
1079                        busy += err;
1080                if (err < 0)
1081                        break;
1082        }
1083        up_read(&mm->mmap_sem);
1084        if (err < 0)
1085                return err;
1086        return busy;
1087
1088}
1089
1090/*
1091 * Allocate a new page for page migration based on vma policy.
 1092 * Start by assuming the page is mapped by the same vma that contains @start.
1093 * Search forward from there, if not.  N.B., this assumes that the
1094 * list of pages handed to migrate_pages()--which is how we get here--
1095 * is in virtual address order.
1096 */
1097static struct page *new_page(struct page *page, unsigned long start, int **x)
1098{
1099        struct vm_area_struct *vma;
1100        unsigned long uninitialized_var(address);
1101
1102        vma = find_vma(current->mm, start);
1103        while (vma) {
1104                address = page_address_in_vma(page, vma);
1105                if (address != -EFAULT)
1106                        break;
1107                vma = vma->vm_next;
1108        }
1109
1110        if (PageHuge(page)) {
1111                BUG_ON(!vma);
1112                return alloc_huge_page_noerr(vma, address, 1);
1113        }
1114        /*
1115         * if !vma, alloc_page_vma() will use task or system default policy
1116         */
1117        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1118}
1119#else
1120
1121static void migrate_page_add(struct page *page, struct list_head *pagelist,
1122                                unsigned long flags)
1123{
1124}
1125
1126int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1127                     const nodemask_t *to, int flags)
1128{
1129        return -ENOSYS;
1130}
1131
1132static struct page *new_page(struct page *page, unsigned long start, int **x)
1133{
1134        return NULL;
1135}
1136#endif
1137
1138static long do_mbind(unsigned long start, unsigned long len,
1139                     unsigned short mode, unsigned short mode_flags,
1140                     nodemask_t *nmask, unsigned long flags)
1141{
1142        struct mm_struct *mm = current->mm;
1143        struct mempolicy *new;
1144        unsigned long end;
1145        int err;
1146        LIST_HEAD(pagelist);
1147
1148        if (flags & ~(unsigned long)MPOL_MF_VALID)
1149                return -EINVAL;
1150        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1151                return -EPERM;
1152
1153        if (start & ~PAGE_MASK)
1154                return -EINVAL;
1155
1156        if (mode == MPOL_DEFAULT)
1157                flags &= ~MPOL_MF_STRICT;
1158
1159        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1160        end = start + len;
1161
1162        if (end < start)
1163                return -EINVAL;
1164        if (end == start)
1165                return 0;
1166
1167        new = mpol_new(mode, mode_flags, nmask);
1168        if (IS_ERR(new))
1169                return PTR_ERR(new);
1170
1171        if (flags & MPOL_MF_LAZY)
1172                new->flags |= MPOL_F_MOF;
1173
1174        /*
1175         * If we are using the default policy then operation
1176         * on discontinuous address spaces is okay after all
1177         */
1178        if (!new)
1179                flags |= MPOL_MF_DISCONTIG_OK;
1180
1181        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1182                 start, start + len, mode, mode_flags,
1183                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1184
1185        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1186
1187                err = migrate_prep();
1188                if (err)
1189                        goto mpol_out;
1190        }
1191        {
1192                NODEMASK_SCRATCH(scratch);
1193                if (scratch) {
1194                        down_write(&mm->mmap_sem);
1195                        task_lock(current);
1196                        err = mpol_set_nodemask(new, nmask, scratch);
1197                        task_unlock(current);
1198                        if (err)
1199                                up_write(&mm->mmap_sem);
1200                } else
1201                        err = -ENOMEM;
1202                NODEMASK_SCRATCH_FREE(scratch);
1203        }
1204        if (err)
1205                goto mpol_out;
1206
1207        err = queue_pages_range(mm, start, end, nmask,
1208                          flags | MPOL_MF_INVERT, &pagelist);
1209        if (!err)
1210                err = mbind_range(mm, start, end, new);
1211
1212        if (!err) {
1213                int nr_failed = 0;
1214
1215                if (!list_empty(&pagelist)) {
1216                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1217                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1218                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1219                        if (nr_failed)
1220                                putback_movable_pages(&pagelist);
1221                }
1222
1223                if (nr_failed && (flags & MPOL_MF_STRICT))
1224                        err = -EIO;
1225        } else
1226                putback_movable_pages(&pagelist);
1227
1228        up_write(&mm->mmap_sem);
1229 mpol_out:
1230        mpol_put(new);
1231        return err;
1232}
1233
1234/*
1235 * User space interface with variable sized bitmaps for nodelists.
1236 */
1237
1238/* Copy a node mask from user space. */
1239static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1240                     unsigned long maxnode)
1241{
1242        unsigned long k;
1243        unsigned long nlongs;
1244        unsigned long endmask;
1245
1246        --maxnode;
1247        nodes_clear(*nodes);
1248        if (maxnode == 0 || !nmask)
1249                return 0;
1250        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1251                return -EINVAL;
1252
1253        nlongs = BITS_TO_LONGS(maxnode);
1254        if ((maxnode % BITS_PER_LONG) == 0)
1255                endmask = ~0UL;
1256        else
1257                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1258
 1259        /* When the user specified more nodes than supported, just check
 1260           that the unsupported part is all zero. */
1261        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1262                if (nlongs > PAGE_SIZE/sizeof(long))
1263                        return -EINVAL;
1264                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1265                        unsigned long t;
1266                        if (get_user(t, nmask + k))
1267                                return -EFAULT;
1268                        if (k == nlongs - 1) {
1269                                if (t & endmask)
1270                                        return -EINVAL;
1271                        } else if (t)
1272                                return -EINVAL;
1273                }
1274                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1275                endmask = ~0UL;
1276        }
1277
1278        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1279                return -EFAULT;
1280        nodes_addr(*nodes)[nlongs-1] &= endmask;
1281        return 0;
1282}
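/*
 * Worked example (assuming BITS_PER_LONG == 64): a call with maxnode == 17
 * leaves 16 significant bits after the --maxnode above, so nlongs == 1 and
 * endmask == 0xffff; one long is copied from userspace and bits 16..63 are
 * masked off.  With maxnode == 65 all 64 bits of a single long are used and
 * endmask == ~0UL.
 */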
1283
1284/* Copy a kernel node mask to user space */
1285static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1286                              nodemask_t *nodes)
1287{
1288        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1289        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1290
1291        if (copy > nbytes) {
1292                if (copy > PAGE_SIZE)
1293                        return -EINVAL;
1294                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1295                        return -EFAULT;
1296                copy = nbytes;
1297        }
1298        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1299}
1300
1301SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1302                unsigned long, mode, const unsigned long __user *, nmask,
1303                unsigned long, maxnode, unsigned, flags)
1304{
1305        nodemask_t nodes;
1306        int err;
1307        unsigned short mode_flags;
1308
1309        mode_flags = mode & MPOL_MODE_FLAGS;
1310        mode &= ~MPOL_MODE_FLAGS;
1311        if (mode >= MPOL_MAX)
1312                return -EINVAL;
1313        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1314            (mode_flags & MPOL_F_RELATIVE_NODES))
1315                return -EINVAL;
1316        err = get_nodes(&nodes, nmask, maxnode);
1317        if (err)
1318                return err;
1319        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1320}
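/*
 * Illustrative userspace sketch (an editor's example): binding an anonymous
 * mapping to node 0 with mbind(2).  Assumes libnuma's <numaif.h> declarations
 * and that node 0 exists.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 1 << 20;
 *		unsigned long nodemask = 1UL << 0;	// node 0 only
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		// MPOL_MF_STRICT: report pages that already sit elsewhere.
 *		if (mbind(p, len, MPOL_BIND, &nodemask,
 *			  sizeof(nodemask) * 8, MPOL_MF_STRICT))
 *			perror("mbind");
 *		return 0;
 *	}
 */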
1321
1322/* Set the process memory policy */
1323SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1324                unsigned long, maxnode)
1325{
1326        int err;
1327        nodemask_t nodes;
1328        unsigned short flags;
1329
1330        flags = mode & MPOL_MODE_FLAGS;
1331        mode &= ~MPOL_MODE_FLAGS;
1332        if ((unsigned int)mode >= MPOL_MAX)
1333                return -EINVAL;
1334        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1335                return -EINVAL;
1336        err = get_nodes(&nodes, nmask, maxnode);
1337        if (err)
1338                return err;
1339        return do_set_mempolicy(mode, flags, &nodes);
1340}
1341
1342SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1343                const unsigned long __user *, old_nodes,
1344                const unsigned long __user *, new_nodes)
1345{
1346        const struct cred *cred = current_cred(), *tcred;
1347        struct mm_struct *mm = NULL;
1348        struct task_struct *task;
1349        nodemask_t task_nodes;
1350        int err;
1351        nodemask_t *old;
1352        nodemask_t *new;
1353        NODEMASK_SCRATCH(scratch);
1354
1355        if (!scratch)
1356                return -ENOMEM;
1357
1358        old = &scratch->mask1;
1359        new = &scratch->mask2;
1360
1361        err = get_nodes(old, old_nodes, maxnode);
1362        if (err)
1363                goto out;
1364
1365        err = get_nodes(new, new_nodes, maxnode);
1366        if (err)
1367                goto out;
1368
1369        /* Find the mm_struct */
1370        rcu_read_lock();
1371        task = pid ? find_task_by_vpid(pid) : current;
1372        if (!task) {
1373                rcu_read_unlock();
1374                err = -ESRCH;
1375                goto out;
1376        }
1377        get_task_struct(task);
1378
1379        err = -EINVAL;
1380
1381        /*
1382         * Check if this process has the right to modify the specified
1383         * process. The right exists if the process has administrative
1384         * capabilities, superuser privileges or the same
1385         * userid as the target process.
1386         */
1387        tcred = __task_cred(task);
1388        if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1389            !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1390            !capable(CAP_SYS_NICE)) {
1391                rcu_read_unlock();
1392                err = -EPERM;
1393                goto out_put;
1394        }
1395        rcu_read_unlock();
1396
1397        task_nodes = cpuset_mems_allowed(task);
1398        /* Is the user allowed to access the target nodes? */
1399        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1400                err = -EPERM;
1401                goto out_put;
1402        }
1403
1404        if (!nodes_subset(*new, node_states[N_MEMORY])) {
1405                err = -EINVAL;
1406                goto out_put;
1407        }
1408
1409        err = security_task_movememory(task);
1410        if (err)
1411                goto out_put;
1412
1413        mm = get_task_mm(task);
1414        put_task_struct(task);
1415
1416        if (!mm) {
1417                err = -EINVAL;
1418                goto out;
1419        }
1420
1421        err = do_migrate_pages(mm, old, new,
1422                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1423
1424        mmput(mm);
1425out:
1426        NODEMASK_SCRATCH_FREE(scratch);
1427
1428        return err;
1429
1430out_put:
1431        put_task_struct(task);
1432        goto out;
1433
1434}
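/*
 * Illustrative userspace sketch (an editor's example): moving the calling
 * process' pages from node 0 to node 1 with migrate_pages(2).  Assumes
 * libnuma's <numaif.h> wrapper and a machine with at least two nodes.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long from = 1UL << 0;	// source: node 0
 *		unsigned long to   = 1UL << 1;	// destination: node 1
 *		long left;
 *
 *		// pid 0 means the calling process; the return value is the
 *		// number of pages that could not be moved, or -1 on error.
 *		left = migrate_pages(0, sizeof(from) * 8, &from, &to);
 *		if (left < 0)
 *			perror("migrate_pages");
 *		else
 *			printf("%ld pages not moved\n", left);
 *		return 0;
 *	}
 */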
1435
1436
1437/* Retrieve NUMA policy */
1438SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1439                unsigned long __user *, nmask, unsigned long, maxnode,
1440                unsigned long, addr, unsigned long, flags)
1441{
1442        int err;
1443        int uninitialized_var(pval);
1444        nodemask_t nodes;
1445
1446        if (nmask != NULL && maxnode < MAX_NUMNODES)
1447                return -EINVAL;
1448
1449        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1450
1451        if (err)
1452                return err;
1453
1454        if (policy && put_user(pval, policy))
1455                return -EFAULT;
1456
1457        if (nmask)
1458                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1459
1460        return err;
1461}
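/*
 * Illustrative userspace sketch (an editor's example): asking which node backs
 * a given page with MPOL_F_NODE | MPOL_F_ADDR, the path handled by
 * do_get_mempolicy() above.  Assumes libnuma's <numaif.h> declarations.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		int node = -1;
 *		char *buf = malloc(4096);
 *
 *		buf[0] = 1;	// fault the page in first
 *		if (get_mempolicy(&node, NULL, 0, buf,
 *				  MPOL_F_NODE | MPOL_F_ADDR))
 *			perror("get_mempolicy");
 *		else
 *			printf("page is on node %d\n", node);
 *		free(buf);
 *		return 0;
 *	}
 */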
1462
1463#ifdef CONFIG_COMPAT
1464
1465COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1466                       compat_ulong_t __user *, nmask,
1467                       compat_ulong_t, maxnode,
1468                       compat_ulong_t, addr, compat_ulong_t, flags)
1469{
1470        long err;
1471        unsigned long __user *nm = NULL;
1472        unsigned long nr_bits, alloc_size;
1473        DECLARE_BITMAP(bm, MAX_NUMNODES);
1474
1475        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1476        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1477
1478        if (nmask)
1479                nm = compat_alloc_user_space(alloc_size);
1480
1481        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1482
1483        if (!err && nmask) {
1484                unsigned long copy_size;
1485                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1486                err = copy_from_user(bm, nm, copy_size);
1487                /* ensure entire bitmap is zeroed */
1488                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1489                err |= compat_put_bitmap(nmask, bm, nr_bits);
1490        }
1491
1492        return err;
1493}
1494
1495COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1496                       compat_ulong_t, maxnode)
1497{
1498        long err = 0;
1499        unsigned long __user *nm = NULL;
1500        unsigned long nr_bits, alloc_size;
1501        DECLARE_BITMAP(bm, MAX_NUMNODES);
1502
1503        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1504        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1505
1506        if (nmask) {
1507                err = compat_get_bitmap(bm, nmask, nr_bits);
1508                nm = compat_alloc_user_space(alloc_size);
1509                err |= copy_to_user(nm, bm, alloc_size);
1510        }
1511
1512        if (err)
1513                return -EFAULT;
1514
1515        return sys_set_mempolicy(mode, nm, nr_bits+1);
1516}
1517
1518COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1519                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1520                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1521{
1522        long err = 0;
1523        unsigned long __user *nm = NULL;
1524        unsigned long nr_bits, alloc_size;
1525        nodemask_t bm;
1526
1527        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1528        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1529
1530        if (nmask) {
1531                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1532                nm = compat_alloc_user_space(alloc_size);
1533                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1534        }
1535
1536        if (err)
1537                return -EFAULT;
1538
1539        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1540}
1541
1542#endif
1543
1544struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1545                                                unsigned long addr)
1546{
1547        struct mempolicy *pol = NULL;
1548
1549        if (vma) {
1550                if (vma->vm_ops && vma->vm_ops->get_policy) {
1551                        pol = vma->vm_ops->get_policy(vma, addr);
1552                } else if (vma->vm_policy) {
1553                        pol = vma->vm_policy;
1554
1555                        /*
1556                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1557                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1558                         * count on these policies which will be dropped by
1559                         * mpol_cond_put() later
1560                         */
1561                        if (mpol_needs_cond_ref(pol))
1562                                mpol_get(pol);
1563                }
1564        }
1565
1566        return pol;
1567}
1568
1569/*
1570 * get_vma_policy(@vma, @addr)
1571 * @vma: virtual memory area whose policy is sought
1572 * @addr: address in @vma for shared policy lookup
1573 *
1574 * Returns effective policy for a VMA at specified address.
1575 * Falls back to current->mempolicy or system default policy, as necessary.
1576 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1577 * count--added by the get_policy() vm_op, as appropriate--to protect against
1578 * freeing by another task.  It is the caller's responsibility to free the
1579 * extra reference for shared policies.
1580 */
1581static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1582                                                unsigned long addr)
1583{
1584        struct mempolicy *pol = __get_vma_policy(vma, addr);
1585
1586        if (!pol)
1587                pol = get_task_policy(current);
1588
1589        return pol;
1590}
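
/*
 * Typical caller pattern, as a sketch (assumes mmap_sem is held and
 * @vma/@addr are valid): the result must be paired with mpol_cond_put(),
 * which drops the extra reference only for shared [MPOL_F_SHARED]
 * policies and is a no-op otherwise.
 *
 *	struct mempolicy *pol = get_vma_policy(vma, addr);
 *
 *	... inspect pol->mode, pol->v.nodes, pol->flags ...
 *
 *	mpol_cond_put(pol);
 */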
1591
1592bool vma_policy_mof(struct vm_area_struct *vma)
1593{
1594        struct mempolicy *pol;
1595
1596        if (vma->vm_ops && vma->vm_ops->get_policy) {
1597                bool ret = false;
1598
1599                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1600                if (pol && (pol->flags & MPOL_F_MOF))
1601                        ret = true;
1602                mpol_cond_put(pol);
1603
1604                return ret;
1605        }
1606
1607        pol = vma->vm_policy;
1608        if (!pol)
1609                pol = get_task_policy(current);
1610
1611        return pol->flags & MPOL_F_MOF;
1612}
1613
1614static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1615{
1616        enum zone_type dynamic_policy_zone = policy_zone;
1617
1618        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1619
1620        /*
1621         * If policy->v.nodes has movable memory only,
1622         * we apply the policy only when gfp_zone(gfp) is ZONE_MOVABLE.
1623         *
1624         * policy->v.nodes is intersected with node_states[N_MEMORY],
1625         * so if the following test fails, it implies that
1626         * policy->v.nodes has movable memory only.
1627         */
1628        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1629                dynamic_policy_zone = ZONE_MOVABLE;
1630
1631        return zone >= dynamic_policy_zone;
1632}
1633
1634/*
1635 * Return a nodemask representing a mempolicy for filtering nodes for
1636 * page allocation
1637 */
1638static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1639{
1640        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1641        if (unlikely(policy->mode == MPOL_BIND) &&
1642                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1643                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1644                return &policy->v.nodes;
1645
1646        return NULL;
1647}
1648
1649/* Return a zonelist indicated by gfp for node representing a mempolicy */
1650static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1651        int nd)
1652{
1653        switch (policy->mode) {
1654        case MPOL_PREFERRED:
1655                if (!(policy->flags & MPOL_F_LOCAL))
1656                        nd = policy->v.preferred_node;
1657                break;
1658        case MPOL_BIND:
1659                /*
1660                 * Normally, MPOL_BIND allocations are node-local within the
1661                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1662                 * current node isn't part of the mask, we use the zonelist for
1663                 * the first node in the mask instead.
1664                 */
1665                if (unlikely(gfp & __GFP_THISNODE) &&
1666                                unlikely(!node_isset(nd, policy->v.nodes)))
1667                        nd = first_node(policy->v.nodes);
1668                break;
1669        default:
1670                BUG();
1671        }
1672        return node_zonelist(nd, gfp);
1673}
1674
1675/* Do dynamic interleaving for a process */
1676static unsigned interleave_nodes(struct mempolicy *policy)
1677{
1678        unsigned nid, next;
1679        struct task_struct *me = current;
1680
1681        nid = me->il_next;
1682        next = next_node(nid, policy->v.nodes);
1683        if (next >= MAX_NUMNODES)
1684                next = first_node(policy->v.nodes);
1685        if (next < MAX_NUMNODES)
1686                me->il_next = next;
1687        return nid;
1688}
1689
1690/*
1691 * Depending on the memory policy provide a node from which to allocate the
1692 * next slab entry.
1693 */
1694unsigned int mempolicy_slab_node(void)
1695{
1696        struct mempolicy *policy;
1697        int node = numa_mem_id();
1698
1699        if (in_interrupt())
1700                return node;
1701
1702        policy = current->mempolicy;
1703        if (!policy || policy->flags & MPOL_F_LOCAL)
1704                return node;
1705
1706        switch (policy->mode) {
1707        case MPOL_PREFERRED:
1708                /*
1709                 * handled MPOL_F_LOCAL above
1710                 */
1711                return policy->v.preferred_node;
1712
1713        case MPOL_INTERLEAVE:
1714                return interleave_nodes(policy);
1715
1716        case MPOL_BIND: {
1717                /*
1718                 * Follow bind policy behavior and start allocation at the
1719                 * first node.
1720                 */
1721                struct zonelist *zonelist;
1722                struct zone *zone;
1723                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1724                zonelist = &NODE_DATA(node)->node_zonelists[0];
1725                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1726                                                        &policy->v.nodes,
1727                                                        &zone);
1728                return zone ? zone->node : node;
1729        }
1730
1731        default:
1732                BUG();
1733        }
1734}
1735
1736/* Do static interleaving for a VMA with known offset. */
1737static unsigned offset_il_node(struct mempolicy *pol,
1738                struct vm_area_struct *vma, unsigned long off)
1739{
1740        unsigned nnodes = nodes_weight(pol->v.nodes);
1741        unsigned target;
1742        int c;
1743        int nid = NUMA_NO_NODE;
1744
1745        if (!nnodes)
1746                return numa_node_id();
1747        target = (unsigned int)off % nnodes;
1748        c = 0;
1749        do {
1750                nid = next_node(nid, pol->v.nodes);
1751                c++;
1752        } while (c <= target);
1753        return nid;
1754}
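
/*
 * Worked example with made-up numbers: pol->v.nodes = {0,2,5} gives
 * nnodes == 3; for off == 7, target = 7 % 3 = 1, and the do/while walk
 * visits node 0 (c == 1) and then node 2 (c == 2 > target), so node 2 is
 * returned.  A given offset therefore always maps to the same node.
 */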
1755
1756/* Determine a node number for interleave */
1757static inline unsigned interleave_nid(struct mempolicy *pol,
1758                 struct vm_area_struct *vma, unsigned long addr, int shift)
1759{
1760        if (vma) {
1761                unsigned long off;
1762
1763                /*
1764                 * for small pages, there is no difference between
1765                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1766                 * for huge pages, since vm_pgoff is in units of small
1767                 * pages, we need to shift off the always 0 bits to get
1768                 * a useful offset.
1769                 */
1770                BUG_ON(shift < PAGE_SHIFT);
1771                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1772                off += (addr - vma->vm_start) >> shift;
1773                return offset_il_node(pol, vma, off);
1774        } else
1775                return interleave_nodes(pol);
1776}
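
/*
 * Worked example (hypothetical values): for a 2MB huge page VMA, shift is
 * 21 while PAGE_SHIFT is 12, so with vma->vm_pgoff == 0x200 (one huge
 * page into the backing object) and addr - vma->vm_start == 4MB:
 *
 *	off  = 0x200 >> (21 - 12)	=  1
 *	off += 0x400000 >> 21		=> 1 + 2 = 3
 *
 * i.e. huge-page index 3 within the object, which is then handed to
 * offset_il_node() for static interleaving.
 */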
1777
1778/*
1779 * Return the bit number of a random bit set in the nodemask.
1780 * (returns NUMA_NO_NODE if nodemask is empty)
1781 */
1782int node_random(const nodemask_t *maskp)
1783{
1784        int w, bit = NUMA_NO_NODE;
1785
1786        w = nodes_weight(*maskp);
1787        if (w)
1788                bit = bitmap_ord_to_pos(maskp->bits,
1789                        get_random_int() % w, MAX_NUMNODES);
1790        return bit;
1791}
1792
1793#ifdef CONFIG_HUGETLBFS
1794/*
1795 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1796 * @vma: virtual memory area whose policy is sought
1797 * @addr: address in @vma for shared policy lookup and interleave policy
1798 * @gfp_flags: for requested zone
1799 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1800 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1801 *
1802 * Returns a zonelist suitable for a huge page allocation and a pointer
1803 * to the struct mempolicy for conditional unref after allocation.
1804 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1805 * @nodemask for filtering the zonelist.
1806 *
1807 * Must be protected by read_mems_allowed_begin()
1808 */
1809struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1810                                gfp_t gfp_flags, struct mempolicy **mpol,
1811                                nodemask_t **nodemask)
1812{
1813        struct zonelist *zl;
1814
1815        *mpol = get_vma_policy(vma, addr);
1816        *nodemask = NULL;       /* assume !MPOL_BIND */
1817
1818        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1819                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1820                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1821        } else {
1822                zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1823                if ((*mpol)->mode == MPOL_BIND)
1824                        *nodemask = &(*mpol)->v.nodes;
1825        }
1826        return zl;
1827}
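
/*
 * Expected calling pattern, sketched loosely after the hugetlb fault path
 * (details illustrative; gfp_mask stands for the huge page allocation
 * flags):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *	struct page *page;
 *	unsigned int cookie;
 *
 *	do {
 *		cookie = read_mems_allowed_begin();
 *		zl = huge_zonelist(vma, addr, gfp_mask, &mpol, &nodemask);
 *		page = ...dequeue a huge page from zl, filtered by nodemask...;
 *		mpol_cond_put(mpol);
 *	} while (!page && read_mems_allowed_retry(cookie));
 */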
1828
1829/*
1830 * init_nodemask_of_mempolicy
1831 *
1832 * If the current task's mempolicy is "default" [NULL], return 'false'
1833 * to indicate default policy.  Otherwise, extract the policy nodemask
1834 * for 'bind' or 'interleave' policy into the argument nodemask, or
1835 * initialize the argument nodemask to contain the single node for
1836 * 'preferred' or 'local' policy and return 'true' to indicate presence
1837 * of non-default mempolicy.
1838 *
1839 * We don't bother with reference counting the mempolicy [mpol_get/put]
1840 * because the current task is examining its own mempolicy and a task's
1841 * mempolicy is only ever changed by the task itself.
1842 *
1843 * N.B., it is the caller's responsibility to free a returned nodemask.
1844 */
1845bool init_nodemask_of_mempolicy(nodemask_t *mask)
1846{
1847        struct mempolicy *mempolicy;
1848        int nid;
1849
1850        if (!(mask && current->mempolicy))
1851                return false;
1852
1853        task_lock(current);
1854        mempolicy = current->mempolicy;
1855        switch (mempolicy->mode) {
1856        case MPOL_PREFERRED:
1857                if (mempolicy->flags & MPOL_F_LOCAL)
1858                        nid = numa_node_id();
1859                else
1860                        nid = mempolicy->v.preferred_node;
1861                init_nodemask_of_node(mask, nid);
1862                break;
1863
1864        case MPOL_BIND:
1865                /* Fall through */
1866        case MPOL_INTERLEAVE:
1867                *mask =  mempolicy->v.nodes;
1868                break;
1869
1870        default:
1871                BUG();
1872        }
1873        task_unlock(current);
1874
1875        return true;
1876}
1877#endif
1878
1879/*
1880 * mempolicy_nodemask_intersects
1881 *
1882 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1883 * policy.  Otherwise, check for intersection between mask and the policy
1884 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1885 * policy, always return true since it may allocate elsewhere on fallback.
1886 *
1887 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1888 */
1889bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1890                                        const nodemask_t *mask)
1891{
1892        struct mempolicy *mempolicy;
1893        bool ret = true;
1894
1895        if (!mask)
1896                return ret;
1897        task_lock(tsk);
1898        mempolicy = tsk->mempolicy;
1899        if (!mempolicy)
1900                goto out;
1901
1902        switch (mempolicy->mode) {
1903        case MPOL_PREFERRED:
1904                /*
1905                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1906                 * to allocate from; they may fall back to other nodes when OOM.
1907                 * Thus, it's possible for tsk to have allocated memory from
1908                 * nodes in mask.
1909                 */
1910                break;
1911        case MPOL_BIND:
1912        case MPOL_INTERLEAVE:
1913                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1914                break;
1915        default:
1916                BUG();
1917        }
1918out:
1919        task_unlock(tsk);
1920        return ret;
1921}
1922
1923/* Allocate a page in interleaved policy.
1924   Own path because it needs to do special accounting. */
1925static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1926                                        unsigned nid)
1927{
1928        struct zonelist *zl;
1929        struct page *page;
1930
1931        zl = node_zonelist(nid, gfp);
1932        page = __alloc_pages(gfp, order, zl);
1933        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1934                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1935        return page;
1936}
1937
1938/**
1939 *      alloc_pages_vma - Allocate a page for a VMA.
1940 *
1941 *      @gfp:
1942 *      %GFP_USER    user allocation.
1943 *      %GFP_KERNEL  kernel allocations,
1944 *      %GFP_HIGHMEM highmem/user allocations,
1945 *      %GFP_FS      allocation should not call back into a file system.
1946 *      %GFP_ATOMIC  don't sleep.
1947 *
1948 *      @order:Order of the GFP allocation.
1949 *      @vma:  Pointer to VMA or NULL if not available.
1950 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1951 *      @node: Which node to prefer for allocation (modulo policy).
1952 *      @hugepage: for hugepages try only the preferred node if possible
1953 *
1954 *      This function allocates a page from the kernel page pool and applies
1955 *      a NUMA policy associated with the VMA or the current process.
1956 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1957 *      mm_struct of the VMA to prevent it from going away. Should be used for
1958 *      all allocations for pages that will be mapped into user space. Returns
1959 *      NULL when no page can be allocated.
1960 */
1961struct page *
1962alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1963                unsigned long addr, int node, bool hugepage)
1964{
1965        struct mempolicy *pol;
1966        struct page *page;
1967        unsigned int cpuset_mems_cookie;
1968        struct zonelist *zl;
1969        nodemask_t *nmask;
1970
1971retry_cpuset:
1972        pol = get_vma_policy(vma, addr);
1973        cpuset_mems_cookie = read_mems_allowed_begin();
1974
1975        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
1976                                        pol->mode != MPOL_INTERLEAVE)) {
1977                /*
1978                 * For hugepage allocation and non-interleave policy which
1979                 * allows the current node, we only try to allocate from the
1980                 * current node and don't fall back to other nodes, as the
1981                 * cost of remote accesses would likely offset THP benefits.
1982                 *
1983                 * If the policy is interleave, or does not allow the current
1984                 * node in its nodemask, we allocate the standard way.
1985                 */
1986                nmask = policy_nodemask(gfp, pol);
1987                if (!nmask || node_isset(node, *nmask)) {
1988                        mpol_cond_put(pol);
1989                        page = alloc_pages_exact_node(node,
1990                                                gfp | __GFP_THISNODE, order);
1991                        goto out;
1992                }
1993        }
1994
1995        if (pol->mode == MPOL_INTERLEAVE) {
1996                unsigned nid;
1997
1998                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1999                mpol_cond_put(pol);
2000                page = alloc_page_interleave(gfp, order, nid);
2001                goto out;
2002        }
2003
2004        nmask = policy_nodemask(gfp, pol);
2005        zl = policy_zonelist(gfp, pol, node);
2006        mpol_cond_put(pol);
2007        page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2008out:
2009        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2010                goto retry_cpuset;
2011        return page;
2012}
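
/*
 * Minimal usage sketch (hypothetical caller): allocate one movable user
 * page for a fault at @addr, preferring the local node, with mmap_sem
 * held for read as required above.  Most callers go through the
 * alloc_page_vma() wrapper in gfp.h, which boils down to this call.
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
 *			       numa_node_id(), false);
 */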
2013
2014/**
2015 *      alloc_pages_current - Allocate pages.
2016 *
2017 *      @gfp:
2018 *              %GFP_USER   user allocation,
2019 *              %GFP_KERNEL kernel allocation,
2020 *              %GFP_HIGHMEM highmem allocation,
2021 *              %GFP_FS     don't call back into a file system.
2022 *              %GFP_ATOMIC don't sleep.
2023 *      @order: Power of two of allocation size in pages. 0 is a single page.
2024 *
2025 *      Allocate a page from the kernel page pool.  When not in
2026 *      interrupt context, apply the current process' NUMA policy.
2027 *      Returns NULL when no page can be allocated.
2028 *
2029 *      Don't call cpuset_update_task_memory_state() unless
2030 *      1) it's ok to take cpuset_sem (can WAIT), and
2031 *      2) allocating for current task (not interrupt).
2032 */
2033struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2034{
2035        struct mempolicy *pol = &default_policy;
2036        struct page *page;
2037        unsigned int cpuset_mems_cookie;
2038
2039        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2040                pol = get_task_policy(current);
2041
2042retry_cpuset:
2043        cpuset_mems_cookie = read_mems_allowed_begin();
2044
2045        /*
2046         * No reference counting needed for current->mempolicy
2047         * nor system default_policy
2048         */
2049        if (pol->mode == MPOL_INTERLEAVE)
2050                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2051        else
2052                page = __alloc_pages_nodemask(gfp, order,
2053                                policy_zonelist(gfp, pol, numa_node_id()),
2054                                policy_nodemask(gfp, pol));
2055
2056        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2057                goto retry_cpuset;
2058
2059        return page;
2060}
2061EXPORT_SYMBOL(alloc_pages_current);
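
/*
 * On NUMA kernels the generic alloc_pages(gfp, order) helper in gfp.h
 * resolves to alloc_pages_current(), so an ordinary allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * already honours the calling task's mempolicy, unless it runs in
 * interrupt context or passes __GFP_THISNODE (see the check above).
 */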
2062
2063int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2064{
2065        struct mempolicy *pol = mpol_dup(vma_policy(src));
2066
2067        if (IS_ERR(pol))
2068                return PTR_ERR(pol);
2069        dst->vm_policy = pol;
2070        return 0;
2071}
2072
2073/*
2074 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2075 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2076 * with the mems_allowed returned by cpuset_mems_allowed().  This
2077 * keeps mempolicies cpuset relative after its cpuset moves.  See
2078 * further kernel/cpuset.c update_nodemask().
2079 *
2080 * current's mempolicy may be rebound by another task (the task that changes
2081 * the cpuset's mems), so we needn't do rebind work for the current task.
2082 */
2083
2084/* Slow path of a mempolicy duplicate */
2085struct mempolicy *__mpol_dup(struct mempolicy *old)
2086{
2087        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2088
2089        if (!new)
2090                return ERR_PTR(-ENOMEM);
2091
2092        /* task's mempolicy is protected by alloc_lock */
2093        if (old == current->mempolicy) {
2094                task_lock(current);
2095                *new = *old;
2096                task_unlock(current);
2097        } else
2098                *new = *old;
2099
2100        if (current_cpuset_is_being_rebound()) {
2101                nodemask_t mems = cpuset_mems_allowed(current);
2102                if (new->flags & MPOL_F_REBINDING)
2103                        mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2104                else
2105                        mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2106        }
2107        atomic_set(&new->refcnt, 1);
2108        return new;
2109}
2110
2111/* Slow path of a mempolicy comparison */
2112bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2113{
2114        if (!a || !b)
2115                return false;
2116        if (a->mode != b->mode)
2117                return false;
2118        if (a->flags != b->flags)
2119                return false;
2120        if (mpol_store_user_nodemask(a))
2121                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2122                        return false;
2123
2124        switch (a->mode) {
2125        case MPOL_BIND:
2126                /* Fall through */
2127        case MPOL_INTERLEAVE:
2128                return !!nodes_equal(a->v.nodes, b->v.nodes);
2129        case MPOL_PREFERRED:
2130                return a->v.preferred_node == b->v.preferred_node;
2131        default:
2132                BUG();
2133                return false;
2134        }
2135}
2136
2137/*
2138 * Shared memory backing store policy support.
2139 *
2140 * Remember policies even when nobody has shared memory mapped.
2141 * The policies are kept in Red-Black tree linked from the inode.
2142 * They are protected by the sp->lock spinlock, which should be held
2143 * for any accesses to the tree.
2144 */
2145
2146/* lookup first element intersecting start-end */
2147/* Caller holds sp->lock */
2148static struct sp_node *
2149sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2150{
2151        struct rb_node *n = sp->root.rb_node;
2152
2153        while (n) {
2154                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2155
2156                if (start >= p->end)
2157                        n = n->rb_right;
2158                else if (end <= p->start)
2159                        n = n->rb_left;
2160                else
2161                        break;
2162        }
2163        if (!n)
2164                return NULL;
2165        for (;;) {
2166                struct sp_node *w = NULL;
2167                struct rb_node *prev = rb_prev(n);
2168                if (!prev)
2169                        break;
2170                w = rb_entry(prev, struct sp_node, nd);
2171                if (w->end <= start)
2172                        break;
2173                n = prev;
2174        }
2175        return rb_entry(n, struct sp_node, nd);
2176}
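
/*
 * Worked example (made-up ranges): with stored nodes [2,5), [5,9) and
 * [12,16), a lookup for start == 4, end == 13 first hits some
 * intersecting node on the way down, then the backward scan stops at
 * [2,5), the lowest node whose end is still above start, so the caller
 * can walk forward over every intersecting range.
 */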
2177
2178/* Insert a new shared policy into the list. */
2179/* Caller holds sp->lock */
2180static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2181{
2182        struct rb_node **p = &sp->root.rb_node;
2183        struct rb_node *parent = NULL;
2184        struct sp_node *nd;
2185
2186        while (*p) {
2187                parent = *p;
2188                nd = rb_entry(parent, struct sp_node, nd);
2189                if (new->start < nd->start)
2190                        p = &(*p)->rb_left;
2191                else if (new->end > nd->end)
2192                        p = &(*p)->rb_right;
2193                else
2194                        BUG();
2195        }
2196        rb_link_node(&new->nd, parent, p);
2197        rb_insert_color(&new->nd, &sp->root);
2198        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2199                 new->policy ? new->policy->mode : 0);
2200}
2201
2202/* Find shared policy intersecting idx */
2203struct mempolicy *
2204mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2205{
2206        struct mempolicy *pol = NULL;
2207        struct sp_node *sn;
2208
2209        if (!sp->root.rb_node)
2210                return NULL;
2211        spin_lock(&sp->lock);
2212        sn = sp_lookup(sp, idx, idx+1);
2213        if (sn) {
2214                mpol_get(sn->policy);
2215                pol = sn->policy;
2216        }
2217        spin_unlock(&sp->lock);
2218        return pol;
2219}
2220
2221static void sp_free(struct sp_node *n)
2222{
2223        mpol_put(n->policy);
2224        kmem_cache_free(sn_cache, n);
2225}
2226
2227/**
2228 * mpol_misplaced - check whether current page node is valid in policy
2229 *
2230 * @page: page to be checked
2231 * @vma: vm area where page mapped
2232 * @addr: virtual address where page mapped
2233 *
2234 * Look up the current policy node id for vma,addr and compare it to the
2235 * page's node id.
2236 *
2237 * Returns:
2238 *      -1      - not misplaced, page is in the right node
2239 *      node    - node id where the page should be
2240 *
2241 * Policy determination "mimics" alloc_page_vma().
2242 * Called from fault path where we know the vma and faulting address.
2243 */
2244int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2245{
2246        struct mempolicy *pol;
2247        struct zone *zone;
2248        int curnid = page_to_nid(page);
2249        unsigned long pgoff;
2250        int thiscpu = raw_smp_processor_id();
2251        int thisnid = cpu_to_node(thiscpu);
2252        int polnid = -1;
2253        int ret = -1;
2254
2255        BUG_ON(!vma);
2256
2257        pol = get_vma_policy(vma, addr);
2258        if (!(pol->flags & MPOL_F_MOF))
2259                goto out;
2260
2261        switch (pol->mode) {
2262        case MPOL_INTERLEAVE:
2263                BUG_ON(addr >= vma->vm_end);
2264                BUG_ON(addr < vma->vm_start);
2265
2266                pgoff = vma->vm_pgoff;
2267                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2268                polnid = offset_il_node(pol, vma, pgoff);
2269                break;
2270
2271        case MPOL_PREFERRED:
2272                if (pol->flags & MPOL_F_LOCAL)
2273                        polnid = numa_node_id();
2274                else
2275                        polnid = pol->v.preferred_node;
2276                break;
2277
2278        case MPOL_BIND:
2279                /*
2280                 * MPOL_BIND allows binding to multiple nodes.
2281                 * Use the current page's node if it is in the policy nodemask,
2282                 * else select the nearest allowed node, if any.
2283                 * If there are no allowed nodes, use the current node [!misplaced].
2284                 */
2285                if (node_isset(curnid, pol->v.nodes))
2286                        goto out;
2287                (void)first_zones_zonelist(
2288                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2289                                gfp_zone(GFP_HIGHUSER),
2290                                &pol->v.nodes, &zone);
2291                polnid = zone->node;
2292                break;
2293
2294        default:
2295                BUG();
2296        }
2297
2298        /* Migrate the page towards the node whose CPU is referencing it */
2299        if (pol->flags & MPOL_F_MORON) {
2300                polnid = thisnid;
2301
2302                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2303                        goto out;
2304        }
2305
2306        if (curnid != polnid)
2307                ret = polnid;
2308out:
2309        mpol_cond_put(pol);
2310
2311        return ret;
2312}
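
/*
 * Sketch of how a NUMA hinting fault handler consumes the result (the
 * real callers live in the page fault path; this is only an
 * illustration):
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid == -1)
 *		keep the page where it is;
 *	else
 *		try to migrate the page to target_nid;
 */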
2313
2314static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2315{
2316        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2317        rb_erase(&n->nd, &sp->root);
2318        sp_free(n);
2319}
2320
2321static void sp_node_init(struct sp_node *node, unsigned long start,
2322                        unsigned long end, struct mempolicy *pol)
2323{
2324        node->start = start;
2325        node->end = end;
2326        node->policy = pol;
2327}
2328
2329static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2330                                struct mempolicy *pol)
2331{
2332        struct sp_node *n;
2333        struct mempolicy *newpol;
2334
2335        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2336        if (!n)
2337                return NULL;
2338
2339        newpol = mpol_dup(pol);
2340        if (IS_ERR(newpol)) {
2341                kmem_cache_free(sn_cache, n);
2342                return NULL;
2343        }
2344        newpol->flags |= MPOL_F_SHARED;
2345        sp_node_init(n, start, end, newpol);
2346
2347        return n;
2348}
2349
2350/* Replace a policy range. */
2351static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2352                                 unsigned long end, struct sp_node *new)
2353{
2354        struct sp_node *n;
2355        struct sp_node *n_new = NULL;
2356        struct mempolicy *mpol_new = NULL;
2357        int ret = 0;
2358
2359restart:
2360        spin_lock(&sp->lock);
2361        n = sp_lookup(sp, start, end);
2362        /* Take care of old policies in the same range. */
2363        while (n && n->start < end) {
2364                struct rb_node *next = rb_next(&n->nd);
2365                if (n->start >= start) {
2366                        if (n->end <= end)
2367                                sp_delete(sp, n);
2368                        else
2369                                n->start = end;
2370                } else {
2371                        /* Old policy spanning whole new range. */
2372                        if (n->end > end) {
2373                                if (!n_new)
2374                                        goto alloc_new;
2375
2376                                *mpol_new = *n->policy;
2377                                atomic_set(&mpol_new->refcnt, 1);
2378                                sp_node_init(n_new, end, n->end, mpol_new);
2379                                n->end = start;
2380                                sp_insert(sp, n_new);
2381                                n_new = NULL;
2382                                mpol_new = NULL;
2383                                break;
2384                        } else
2385                                n->end = start;
2386                }
2387                if (!next)
2388                        break;
2389                n = rb_entry(next, struct sp_node, nd);
2390        }
2391        if (new)
2392                sp_insert(sp, new);
2393        spin_unlock(&sp->lock);
2394        ret = 0;
2395
2396err_out:
2397        if (mpol_new)
2398                mpol_put(mpol_new);
2399        if (n_new)
2400                kmem_cache_free(sn_cache, n_new);
2401
2402        return ret;
2403
2404alloc_new:
2405        spin_unlock(&sp->lock);
2406        ret = -ENOMEM;
2407        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2408        if (!n_new)
2409                goto err_out;
2410        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2411        if (!mpol_new)
2412                goto err_out;
2413        goto restart;
2414}
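
/*
 * Worked example (made-up ranges): replacing [3,7) in a tree holding a
 * single node [0,10) splits the old entry.  The old node is truncated to
 * [0,3), a copy of its policy is inserted as [7,10) via the alloc_new
 * path above, and finally the new node [3,7) is inserted in between.
 */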
2415
2416/**
2417 * mpol_shared_policy_init - initialize shared policy for inode
2418 * @sp: pointer to inode shared policy
2419 * @mpol:  struct mempolicy to install
2420 *
2421 * Install non-NULL @mpol in inode's shared policy rb-tree.
2422 * On entry, the current task has a reference on a non-NULL @mpol.
2423 * This must be released on exit.
2424 * This is called from get_inode() calls, so we can use GFP_KERNEL.
2425 */
2426void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2427{
2428        int ret;
2429
2430        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2431        spin_lock_init(&sp->lock);
2432
2433        if (mpol) {
2434                struct vm_area_struct pvma;
2435                struct mempolicy *new;
2436                NODEMASK_SCRATCH(scratch);
2437
2438                if (!scratch)
2439                        goto put_mpol;
2440                /* contextualize the tmpfs mount point mempolicy */
2441                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2442                if (IS_ERR(new))
2443                        goto free_scratch; /* no valid nodemask intersection */
2444
2445                task_lock(current);
2446                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2447                task_unlock(current);
2448                if (ret)
2449                        goto put_new;
2450
2451                /* Create pseudo-vma that contains just the policy */
2452                memset(&pvma, 0, sizeof(struct vm_area_struct));
2453                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2454                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2455
2456put_new:
2457                mpol_put(new);                  /* drop initial ref */
2458free_scratch:
2459                NODEMASK_SCRATCH_FREE(scratch);
2460put_mpol:
2461                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2462        }
2463}
2464
2465int mpol_set_shared_policy(struct shared_policy *info,
2466                        struct vm_area_struct *vma, struct mempolicy *npol)
2467{
2468        int err;
2469        struct sp_node *new = NULL;
2470        unsigned long sz = vma_pages(vma);
2471
2472        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2473                 vma->vm_pgoff,
2474                 sz, npol ? npol->mode : -1,
2475                 npol ? npol->flags : -1,
2476                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2477
2478        if (npol) {
2479                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2480                if (!new)
2481                        return -ENOMEM;
2482        }
2483        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2484        if (err && new)
2485                sp_free(new);
2486        return err;
2487}
2488
2489/* Free a backing policy store on inode delete. */
2490void mpol_free_shared_policy(struct shared_policy *p)
2491{
2492        struct sp_node *n;
2493        struct rb_node *next;
2494
2495        if (!p->root.rb_node)
2496                return;
2497        spin_lock(&p->lock);
2498        next = rb_first(&p->root);
2499        while (next) {
2500                n = rb_entry(next, struct sp_node, nd);
2501                next = rb_next(&n->nd);
2502                sp_delete(p, n);
2503        }
2504        spin_unlock(&p->lock);
2505}
2506
2507#ifdef CONFIG_NUMA_BALANCING
2508static int __initdata numabalancing_override;
2509
2510static void __init check_numabalancing_enable(void)
2511{
2512        bool numabalancing_default = false;
2513
2514        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2515                numabalancing_default = true;
2516
2517        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2518        if (numabalancing_override)
2519                set_numabalancing_state(numabalancing_override == 1);
2520
2521        if (num_online_nodes() > 1 && !numabalancing_override) {
2522                pr_info("%s automatic NUMA balancing. "
2523                        "Configure with numa_balancing= or the "
2524                        "kernel.numa_balancing sysctl\n",
2525                        numabalancing_default ? "Enabling" : "Disabling");
2526                set_numabalancing_state(numabalancing_default);
2527        }
2528}
2529
2530static int __init setup_numabalancing(char *str)
2531{
2532        int ret = 0;
2533        if (!str)
2534                goto out;
2535
2536        if (!strcmp(str, "enable")) {
2537                numabalancing_override = 1;
2538                ret = 1;
2539        } else if (!strcmp(str, "disable")) {
2540                numabalancing_override = -1;
2541                ret = 1;
2542        }
2543out:
2544        if (!ret)
2545                pr_warn("Unable to parse numa_balancing=\n");
2546
2547        return ret;
2548}
2549__setup("numa_balancing=", setup_numabalancing);
2550#else
2551static inline void __init check_numabalancing_enable(void)
2552{
2553}
2554#endif /* CONFIG_NUMA_BALANCING */
2555
2556/* assumes fs == KERNEL_DS */
2557void __init numa_policy_init(void)
2558{
2559        nodemask_t interleave_nodes;
2560        unsigned long largest = 0;
2561        int nid, prefer = 0;
2562
2563        policy_cache = kmem_cache_create("numa_policy",
2564                                         sizeof(struct mempolicy),
2565                                         0, SLAB_PANIC, NULL);
2566
2567        sn_cache = kmem_cache_create("shared_policy_node",
2568                                     sizeof(struct sp_node),
2569                                     0, SLAB_PANIC, NULL);
2570
2571        for_each_node(nid) {
2572                preferred_node_policy[nid] = (struct mempolicy) {
2573                        .refcnt = ATOMIC_INIT(1),
2574                        .mode = MPOL_PREFERRED,
2575                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2576                        .v = { .preferred_node = nid, },
2577                };
2578        }
2579
2580        /*
2581         * Set interleaving policy for system init. Interleaving is only
2582         * enabled across suitably sized nodes (default is >= 16MB), or
2583         * fall back to the largest node if they're all smaller.
2584         */
2585        nodes_clear(interleave_nodes);
2586        for_each_node_state(nid, N_MEMORY) {
2587                unsigned long total_pages = node_present_pages(nid);
2588
2589                /* Preserve the largest node */
2590                if (largest < total_pages) {
2591                        largest = total_pages;
2592                        prefer = nid;
2593                }
2594
2595                /* Interleave this node? */
2596                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2597                        node_set(nid, interleave_nodes);
2598        }
2599
2600        /* All too small, use the largest */
2601        if (unlikely(nodes_empty(interleave_nodes)))
2602                node_set(prefer, interleave_nodes);
2603
2604        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2605                pr_err("%s: interleaving failed\n", __func__);
2606
2607        check_numabalancing_enable();
2608}
2609
2610/* Reset policy of current process to default */
2611void numa_default_policy(void)
2612{
2613        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2614}
2615
2616/*
2617 * Parse and format mempolicy from/to strings
2618 */
2619
2620/*
2621 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2622 */
2623static const char * const policy_modes[] =
2624{
2625        [MPOL_DEFAULT]    = "default",
2626        [MPOL_PREFERRED]  = "prefer",
2627        [MPOL_BIND]       = "bind",
2628        [MPOL_INTERLEAVE] = "interleave",
2629        [MPOL_LOCAL]      = "local",
2630};
2631
2632
2633#ifdef CONFIG_TMPFS
2634/**
2635 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2636 * @str:  string containing mempolicy to parse
2637 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2638 *
2639 * Format of input:
2640 *      <mode>[=<flags>][:<nodelist>]
2641 *
2642 * On success, returns 0, else 1
2643 */
2644int mpol_parse_str(char *str, struct mempolicy **mpol)
2645{
2646        struct mempolicy *new = NULL;
2647        unsigned short mode;
2648        unsigned short mode_flags;
2649        nodemask_t nodes;
2650        char *nodelist = strchr(str, ':');
2651        char *flags = strchr(str, '=');
2652        int err = 1;
2653
2654        if (nodelist) {
2655                /* NUL-terminate mode or flags string */
2656                *nodelist++ = '\0';
2657                if (nodelist_parse(nodelist, nodes))
2658                        goto out;
2659                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2660                        goto out;
2661        } else
2662                nodes_clear(nodes);
2663
2664        if (flags)
2665                *flags++ = '\0';        /* terminate mode string */
2666
2667        for (mode = 0; mode < MPOL_MAX; mode++) {
2668                if (!strcmp(str, policy_modes[mode])) {
2669                        break;
2670                }
2671        }
2672        if (mode >= MPOL_MAX)
2673                goto out;
2674
2675        switch (mode) {
2676        case MPOL_PREFERRED:
2677                /*
2678                 * Insist on a nodelist of one node only
2679                 */
2680                if (nodelist) {
2681                        char *rest = nodelist;
2682                        while (isdigit(*rest))
2683                                rest++;
2684                        if (*rest)
2685                                goto out;
2686                }
2687                break;
2688        case MPOL_INTERLEAVE:
2689                /*
2690                 * Default to online nodes with memory if no nodelist
2691                 */
2692                if (!nodelist)
2693                        nodes = node_states[N_MEMORY];
2694                break;
2695        case MPOL_LOCAL:
2696                /*
2697                 * Don't allow a nodelist;  mpol_new() checks flags
2698                 */
2699                if (nodelist)
2700                        goto out;
2701                mode = MPOL_PREFERRED;
2702                break;
2703        case MPOL_DEFAULT:
2704                /*
2705                 * Insist on an empty nodelist
2706                 */
2707                if (!nodelist)
2708                        err = 0;
2709                goto out;
2710        case MPOL_BIND:
2711                /*
2712                 * Insist on a nodelist
2713                 */
2714                if (!nodelist)
2715                        goto out;
2716        }
2717
2718        mode_flags = 0;
2719        if (flags) {
2720                /*
2721                 * Currently, we only support two mutually exclusive
2722                 * mode flags.
2723                 */
2724                if (!strcmp(flags, "static"))
2725                        mode_flags |= MPOL_F_STATIC_NODES;
2726                else if (!strcmp(flags, "relative"))
2727                        mode_flags |= MPOL_F_RELATIVE_NODES;
2728                else
2729                        goto out;
2730        }
2731
2732        new = mpol_new(mode, mode_flags, &nodes);
2733        if (IS_ERR(new))
2734                goto out;
2735
2736        /*
2737         * Save nodes for mpol_to_str() to show the tmpfs mount options
2738         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2739         */
2740        if (mode != MPOL_PREFERRED)
2741                new->v.nodes = nodes;
2742        else if (nodelist)
2743                new->v.preferred_node = first_node(nodes);
2744        else
2745                new->flags |= MPOL_F_LOCAL;
2746
2747        /*
2748         * Save nodes for contextualization: this will be used to "clone"
2749         * the mempolicy in a specific context [cpuset] at a later time.
2750         */
2751        new->w.user_nodemask = nodes;
2752
2753        err = 0;
2754
2755out:
2756        /* Restore string for error message */
2757        if (nodelist)
2758                *--nodelist = ':';
2759        if (flags)
2760                *--flags = '=';
2761        if (!err)
2762                *mpol = new;
2763        return err;
2764}
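
/*
 * Parsing example: the tmpfs option string "interleave=static:0-3" is
 * split into mode "interleave", flags "static" and nodelist "0-3",
 * yielding MPOL_INTERLEAVE with MPOL_F_STATIC_NODES over nodes 0-3, while
 * a bare "local" becomes MPOL_PREFERRED with MPOL_F_LOCAL.  Typical use
 * (illustrative):
 *
 *	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 */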
2765#endif /* CONFIG_TMPFS */
2766
2767/**
2768 * mpol_to_str - format a mempolicy structure for printing
2769 * @buffer:  to contain formatted mempolicy string
2770 * @maxlen:  length of @buffer
2771 * @pol:  pointer to mempolicy to be formatted
2772 *
2773 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2774 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2775 * longest flag, "relative", and to display at least a few node ids.
2776 */
2777void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2778{
2779        char *p = buffer;
2780        nodemask_t nodes = NODE_MASK_NONE;
2781        unsigned short mode = MPOL_DEFAULT;
2782        unsigned short flags = 0;
2783
2784        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2785                mode = pol->mode;
2786                flags = pol->flags;
2787        }
2788
2789        switch (mode) {
2790        case MPOL_DEFAULT:
2791                break;
2792        case MPOL_PREFERRED:
2793                if (flags & MPOL_F_LOCAL)
2794                        mode = MPOL_LOCAL;
2795                else
2796                        node_set(pol->v.preferred_node, nodes);
2797                break;
2798        case MPOL_BIND:
2799        case MPOL_INTERLEAVE:
2800                nodes = pol->v.nodes;
2801                break;
2802        default:
2803                WARN_ON_ONCE(1);
2804                snprintf(p, maxlen, "unknown");
2805                return;
2806        }
2807
2808        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2809
2810        if (flags & MPOL_MODE_FLAGS) {
2811                p += snprintf(p, buffer + maxlen - p, "=");
2812
2813                /*
2814                 * Currently, the only defined flags are mutually exclusive
2815                 */
2816                if (flags & MPOL_F_STATIC_NODES)
2817                        p += snprintf(p, buffer + maxlen - p, "static");
2818                else if (flags & MPOL_F_RELATIVE_NODES)
2819                        p += snprintf(p, buffer + maxlen - p, "relative");
2820        }
2821
2822        if (!nodes_empty(nodes))
2823                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2824                               nodemask_pr_args(&nodes));
2825}
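
/*
 * Example outputs, for illustration: "default", "prefer:2", "local",
 * "bind:0,2" and "interleave=relative:0-3".  These are the strings that
 * appear after "mpol=" in /proc/mounts for tmpfs mounts, and (without the
 * prefix) per VMA in /proc/<pid>/numa_maps.
 */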
2826