linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For the process policy a process
  20 *                counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
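
/*
 * Illustrative sketch (not part of the original source): how user space
 * might select the policies described above through the set_mempolicy()
 * and mbind() system calls.  Assumes the <numaif.h> wrappers from
 * libnuma; node numbers and error handling are placeholders.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	static void mempolicy_example(void)
 *	{
 *		unsigned long nodes = (1UL << 0) | (1UL << 1);
 *		size_t len = 4UL << 20;
 *		void *buf;
 *
 *		// Process policy: interleave new allocations over nodes 0-1.
 *		set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *
 *		// VMA policy: bind one mapping to node 0 only, no fallback.
 *		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		nodes = 1UL << 0;
 *		mbind(buf, len, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);
 *
 *		// Back to the default (local) policy for this task.
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *	}
 */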
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/nodemask.h>
  77#include <linux/cpuset.h>
  78#include <linux/slab.h>
  79#include <linux/string.h>
  80#include <linux/export.h>
  81#include <linux/nsproxy.h>
  82#include <linux/interrupt.h>
  83#include <linux/init.h>
  84#include <linux/compat.h>
  85#include <linux/swap.h>
  86#include <linux/seq_file.h>
  87#include <linux/proc_fs.h>
  88#include <linux/migrate.h>
  89#include <linux/ksm.h>
  90#include <linux/rmap.h>
  91#include <linux/security.h>
  92#include <linux/syscalls.h>
  93#include <linux/ctype.h>
  94#include <linux/mm_inline.h>
  95#include <linux/mmu_notifier.h>
  96#include <linux/printk.h>
  97
  98#include <asm/tlbflush.h>
  99#include <asm/uaccess.h>
 100#include <linux/random.h>
 101
 102#include "internal.h"
 103
 104/* Internal flags */
 105#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks that vmas are contiguous */
 106#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 107
 108static struct kmem_cache *policy_cache;
 109static struct kmem_cache *sn_cache;
 110
 111/* Highest zone. A specific allocation for a zone below that is not
 112   policied. */
 113enum zone_type policy_zone = 0;
 114
 115/*
 116 * run-time system-wide default policy => local allocation
 117 */
 118static struct mempolicy default_policy = {
 119        .refcnt = ATOMIC_INIT(1), /* never free it */
 120        .mode = MPOL_PREFERRED,
 121        .flags = MPOL_F_LOCAL,
 122};
 123
 124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126struct mempolicy *get_task_policy(struct task_struct *p)
 127{
 128        struct mempolicy *pol = p->mempolicy;
 129        int node;
 130
 131        if (pol)
 132                return pol;
 133
 134        node = numa_node_id();
 135        if (node != NUMA_NO_NODE) {
 136                pol = &preferred_node_policy[node];
 137                /* preferred_node_policy is not initialised early in boot */
 138                if (pol->mode)
 139                        return pol;
 140        }
 141
 142        return &default_policy;
 143}
 144
 145static const struct mempolicy_operations {
 146        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
  147        /*
  148         * If the read-side task has no lock to protect task->mempolicy, the
  149         * write-side task rebinds task->mempolicy in two steps: first it
  150         * sets all the newly allowed nodes, then it clears all the
  151         * disallowed nodes. This way a lockless reader never sees an empty
  152         * nodemask and so never fails to find a node to allocate a page from.
  153         * If the read side does hold a lock protecting task->mempolicy, we
  154         * rebind directly in one step.
  155         *
  156         * step:
  157         *      MPOL_REBIND_ONCE  - do the rebind work at once
  158         *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
  159         *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
  160         */
 161        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 162                        enum mpol_rebind_step step);
 163} mpol_ops[MPOL_MAX];
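
/*
 * Worked example of the two-step rebind described above (illustrative,
 * with made-up node numbers): suppose an MPOL_BIND policy currently uses
 * nodes {0,1} and the cpuset's mems_allowed changes to {2,3}.
 *
 *	MPOL_REBIND_STEP1: pol->v.nodes becomes {0,1,2,3} (old | new), so a
 *	                   lockless reader always finds some allowed node.
 *	MPOL_REBIND_STEP2: pol->v.nodes becomes {2,3}, the disallowed nodes
 *	                   having been cleared.
 *
 * A caller that does hold the protecting lock uses MPOL_REBIND_ONCE and
 * goes from {0,1} to {2,3} in a single step.
 */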
 164
 165static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 166{
 167        return pol->flags & MPOL_MODE_FLAGS;
 168}
 169
 170static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 171                                   const nodemask_t *rel)
 172{
 173        nodemask_t tmp;
 174        nodes_fold(tmp, *orig, nodes_weight(*rel));
 175        nodes_onto(*ret, tmp, *rel);
 176}
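
/*
 * Worked example (illustrative): with MPOL_F_RELATIVE_NODES, a user mask
 * of {0,2} relative to an allowed set of {3,5,7} is first folded onto
 * nodes_weight(*rel) = 3 bits (still {0,2}) and then mapped onto the set
 * bits of *rel, so relative bit 0 becomes node 3 and relative bit 2
 * becomes node 7, giving {3,7}.
 */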
 177
 178static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 179{
 180        if (nodes_empty(*nodes))
 181                return -EINVAL;
 182        pol->v.nodes = *nodes;
 183        return 0;
 184}
 185
 186static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (!nodes)
 189                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 190        else if (nodes_empty(*nodes))
 191                return -EINVAL;                 /*  no allowed nodes */
 192        else
 193                pol->v.preferred_node = first_node(*nodes);
 194        return 0;
 195}
 196
 197static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 198{
 199        if (nodes_empty(*nodes))
 200                return -EINVAL;
 201        pol->v.nodes = *nodes;
 202        return 0;
 203}
 204
 205/*
 206 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 207 * any, for the new policy.  mpol_new() has already validated the nodes
 208 * parameter with respect to the policy mode and flags.  But, we need to
 209 * handle an empty nodemask with MPOL_PREFERRED here.
 210 *
 211 * Must be called holding task's alloc_lock to protect task's mems_allowed
 212 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 213 */
 214static int mpol_set_nodemask(struct mempolicy *pol,
 215                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 216{
 217        int ret;
 218
 219        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 220        if (pol == NULL)
 221                return 0;
 222        /* Check N_MEMORY */
 223        nodes_and(nsc->mask1,
 224                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 225
 226        VM_BUG_ON(!nodes);
 227        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 228                nodes = NULL;   /* explicit local allocation */
 229        else {
 230                if (pol->flags & MPOL_F_RELATIVE_NODES)
 231                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 232                else
 233                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 234
 235                if (mpol_store_user_nodemask(pol))
 236                        pol->w.user_nodemask = *nodes;
 237                else
 238                        pol->w.cpuset_mems_allowed =
 239                                                cpuset_current_mems_allowed;
 240        }
 241
 242        if (nodes)
 243                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 244        else
 245                ret = mpol_ops[pol->mode].create(pol, NULL);
 246        return ret;
 247}
 248
 249/*
 250 * This function just creates a new policy, does some checks and simple
 251 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 252 */
 253static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 254                                  nodemask_t *nodes)
 255{
 256        struct mempolicy *policy;
 257
 258        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 259                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 260
 261        if (mode == MPOL_DEFAULT) {
 262                if (nodes && !nodes_empty(*nodes))
 263                        return ERR_PTR(-EINVAL);
 264                return NULL;
 265        }
 266        VM_BUG_ON(!nodes);
 267
 268        /*
 269         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 270         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 271         * All other modes require a valid pointer to a non-empty nodemask.
 272         */
 273        if (mode == MPOL_PREFERRED) {
 274                if (nodes_empty(*nodes)) {
 275                        if (((flags & MPOL_F_STATIC_NODES) ||
 276                             (flags & MPOL_F_RELATIVE_NODES)))
 277                                return ERR_PTR(-EINVAL);
 278                }
 279        } else if (mode == MPOL_LOCAL) {
 280                if (!nodes_empty(*nodes))
 281                        return ERR_PTR(-EINVAL);
 282                mode = MPOL_PREFERRED;
 283        } else if (nodes_empty(*nodes))
 284                return ERR_PTR(-EINVAL);
 285        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 286        if (!policy)
 287                return ERR_PTR(-ENOMEM);
 288        atomic_set(&policy->refcnt, 1);
 289        policy->mode = mode;
 290        policy->flags = flags;
 291
 292        return policy;
 293}
 294
 295/* Slow path of a mpol destructor. */
 296void __mpol_put(struct mempolicy *p)
 297{
 298        if (!atomic_dec_and_test(&p->refcnt))
 299                return;
 300        kmem_cache_free(policy_cache, p);
 301}
 302
 303static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 304                                enum mpol_rebind_step step)
 305{
 306}
 307
 308/*
 309 * step:
  310 *      MPOL_REBIND_ONCE  - do the rebind work at once
  311 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
  312 *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
 313 */
 314static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 315                                 enum mpol_rebind_step step)
 316{
 317        nodemask_t tmp;
 318
 319        if (pol->flags & MPOL_F_STATIC_NODES)
 320                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 321        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 322                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 323        else {
 324                /*
 325                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 326                 * result
 327                 */
 328                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 329                        nodes_remap(tmp, pol->v.nodes,
 330                                        pol->w.cpuset_mems_allowed, *nodes);
 331                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 332                } else if (step == MPOL_REBIND_STEP2) {
 333                        tmp = pol->w.cpuset_mems_allowed;
 334                        pol->w.cpuset_mems_allowed = *nodes;
 335                } else
 336                        BUG();
 337        }
 338
 339        if (nodes_empty(tmp))
 340                tmp = *nodes;
 341
 342        if (step == MPOL_REBIND_STEP1)
 343                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 344        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 345                pol->v.nodes = tmp;
 346        else
 347                BUG();
 348
 349        if (!node_isset(current->il_next, tmp)) {
 350                current->il_next = next_node(current->il_next, tmp);
 351                if (current->il_next >= MAX_NUMNODES)
 352                        current->il_next = first_node(tmp);
 353                if (current->il_next >= MAX_NUMNODES)
 354                        current->il_next = numa_node_id();
 355        }
 356}
 357
 358static void mpol_rebind_preferred(struct mempolicy *pol,
 359                                  const nodemask_t *nodes,
 360                                  enum mpol_rebind_step step)
 361{
 362        nodemask_t tmp;
 363
 364        if (pol->flags & MPOL_F_STATIC_NODES) {
 365                int node = first_node(pol->w.user_nodemask);
 366
 367                if (node_isset(node, *nodes)) {
 368                        pol->v.preferred_node = node;
 369                        pol->flags &= ~MPOL_F_LOCAL;
 370                } else
 371                        pol->flags |= MPOL_F_LOCAL;
 372        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 373                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 374                pol->v.preferred_node = first_node(tmp);
 375        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 376                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 377                                                   pol->w.cpuset_mems_allowed,
 378                                                   *nodes);
 379                pol->w.cpuset_mems_allowed = *nodes;
 380        }
 381}
 382
 383/*
 384 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 385 *
  386 * If the read-side task has no lock to protect task->mempolicy, the
  387 * write-side task rebinds task->mempolicy in two steps: first it sets
  388 * all the newly allowed nodes, then it clears all the disallowed nodes.
  389 * This way a lockless reader never sees an empty nodemask and so never
  390 * fails to find a node to allocate a page from.
  391 * If the read side does hold a lock protecting task->mempolicy, we
  392 * rebind directly in one step.
  393 *
  394 * step:
  395 *      MPOL_REBIND_ONCE  - do the rebind work at once
  396 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
  397 *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
 398 */
 399static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 400                                enum mpol_rebind_step step)
 401{
 402        if (!pol)
 403                return;
 404        if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 405            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 406                return;
 407
 408        if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 409                return;
 410
 411        if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 412                BUG();
 413
 414        if (step == MPOL_REBIND_STEP1)
 415                pol->flags |= MPOL_F_REBINDING;
 416        else if (step == MPOL_REBIND_STEP2)
 417                pol->flags &= ~MPOL_F_REBINDING;
 418        else if (step >= MPOL_REBIND_NSTEP)
 419                BUG();
 420
 421        mpol_ops[pol->mode].rebind(pol, newmask, step);
 422}
 423
 424/*
 425 * Wrapper for mpol_rebind_policy() that just requires task
 426 * pointer, and updates task mempolicy.
 427 *
 428 * Called with task's alloc_lock held.
 429 */
 430
 431void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 432                        enum mpol_rebind_step step)
 433{
 434        mpol_rebind_policy(tsk->mempolicy, new, step);
 435}
 436
 437/*
 438 * Rebind each vma in mm to new nodemask.
 439 *
 440 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 441 */
 442
 443void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 444{
 445        struct vm_area_struct *vma;
 446
 447        down_write(&mm->mmap_sem);
 448        for (vma = mm->mmap; vma; vma = vma->vm_next)
 449                mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 450        up_write(&mm->mmap_sem);
 451}
 452
 453static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 454        [MPOL_DEFAULT] = {
 455                .rebind = mpol_rebind_default,
 456        },
 457        [MPOL_INTERLEAVE] = {
 458                .create = mpol_new_interleave,
 459                .rebind = mpol_rebind_nodemask,
 460        },
 461        [MPOL_PREFERRED] = {
 462                .create = mpol_new_preferred,
 463                .rebind = mpol_rebind_preferred,
 464        },
 465        [MPOL_BIND] = {
 466                .create = mpol_new_bind,
 467                .rebind = mpol_rebind_nodemask,
 468        },
 469};
 470
 471static void migrate_page_add(struct page *page, struct list_head *pagelist,
 472                                unsigned long flags);
 473
 474struct queue_pages {
 475        struct list_head *pagelist;
 476        unsigned long flags;
 477        nodemask_t *nmask;
 478        struct vm_area_struct *prev;
 479};
 480
 481/*
  482 * Scan through the pages, checking whether each one satisfies the given
  483 * conditions, and move those that do to the pagelist.
 484 */
 485static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 486                        unsigned long end, struct mm_walk *walk)
 487{
 488        struct vm_area_struct *vma = walk->vma;
 489        struct page *page;
 490        struct queue_pages *qp = walk->private;
 491        unsigned long flags = qp->flags;
 492        int nid, ret;
 493        pte_t *pte;
 494        spinlock_t *ptl;
 495
 496        if (pmd_trans_huge(*pmd)) {
 497                ptl = pmd_lock(walk->mm, pmd);
 498                if (pmd_trans_huge(*pmd)) {
 499                        page = pmd_page(*pmd);
 500                        if (is_huge_zero_page(page)) {
 501                                spin_unlock(ptl);
 502                                split_huge_pmd(vma, pmd, addr);
 503                        } else {
 504                                get_page(page);
 505                                spin_unlock(ptl);
 506                                lock_page(page);
 507                                ret = split_huge_page(page);
 508                                unlock_page(page);
 509                                put_page(page);
 510                                if (ret)
 511                                        return 0;
 512                        }
 513                } else {
 514                        spin_unlock(ptl);
 515                }
 516        }
 517
 518retry:
 519        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 520        for (; addr != end; pte++, addr += PAGE_SIZE) {
 521                if (!pte_present(*pte))
 522                        continue;
 523                page = vm_normal_page(vma, addr, *pte);
 524                if (!page)
 525                        continue;
 526                /*
 527                 * vm_normal_page() filters out zero pages, but there might
 528                 * still be PageReserved pages to skip, perhaps in a VDSO.
 529                 */
 530                if (PageReserved(page))
 531                        continue;
 532                nid = page_to_nid(page);
 533                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 534                        continue;
 535                if (PageTransCompound(page) && PageAnon(page)) {
 536                        get_page(page);
 537                        pte_unmap_unlock(pte, ptl);
 538                        lock_page(page);
 539                        ret = split_huge_page(page);
 540                        unlock_page(page);
 541                        put_page(page);
 542                        /* Failed to split -- skip. */
 543                        if (ret) {
 544                                pte = pte_offset_map_lock(walk->mm, pmd,
 545                                                addr, &ptl);
 546                                continue;
 547                        }
 548                        goto retry;
 549                }
 550
 551                migrate_page_add(page, qp->pagelist, flags);
 552        }
 553        pte_unmap_unlock(pte - 1, ptl);
 554        cond_resched();
 555        return 0;
 556}
 557
 558static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 559                               unsigned long addr, unsigned long end,
 560                               struct mm_walk *walk)
 561{
 562#ifdef CONFIG_HUGETLB_PAGE
 563        struct queue_pages *qp = walk->private;
 564        unsigned long flags = qp->flags;
 565        int nid;
 566        struct page *page;
 567        spinlock_t *ptl;
 568        pte_t entry;
 569
 570        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 571        entry = huge_ptep_get(pte);
 572        if (!pte_present(entry))
 573                goto unlock;
 574        page = pte_page(entry);
 575        nid = page_to_nid(page);
 576        if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 577                goto unlock;
 578        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 579        if (flags & (MPOL_MF_MOVE_ALL) ||
 580            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 581                isolate_huge_page(page, qp->pagelist);
 582unlock:
 583        spin_unlock(ptl);
 584#else
 585        BUG();
 586#endif
 587        return 0;
 588}
 589
 590#ifdef CONFIG_NUMA_BALANCING
 591/*
 592 * This is used to mark a range of virtual addresses to be inaccessible.
 593 * These are later cleared by a NUMA hinting fault. Depending on these
 594 * faults, pages may be migrated for better NUMA placement.
 595 *
 596 * This is assuming that NUMA faults are handled using PROT_NONE. If
 597 * an architecture makes a different choice, it will need further
 598 * changes to the core.
 599 */
 600unsigned long change_prot_numa(struct vm_area_struct *vma,
 601                        unsigned long addr, unsigned long end)
 602{
 603        int nr_updated;
 604
 605        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 606        if (nr_updated)
 607                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 608
 609        return nr_updated;
 610}
 611#else
 612static unsigned long change_prot_numa(struct vm_area_struct *vma,
 613                        unsigned long addr, unsigned long end)
 614{
 615        return 0;
 616}
 617#endif /* CONFIG_NUMA_BALANCING */
 618
 619static int queue_pages_test_walk(unsigned long start, unsigned long end,
 620                                struct mm_walk *walk)
 621{
 622        struct vm_area_struct *vma = walk->vma;
 623        struct queue_pages *qp = walk->private;
 624        unsigned long endvma = vma->vm_end;
 625        unsigned long flags = qp->flags;
 626
 627        if (!vma_migratable(vma))
 628                return 1;
 629
 630        if (endvma > end)
 631                endvma = end;
 632        if (vma->vm_start > start)
 633                start = vma->vm_start;
 634
 635        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 636                if (!vma->vm_next && vma->vm_end < end)
 637                        return -EFAULT;
 638                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 639                        return -EFAULT;
 640        }
 641
 642        qp->prev = vma;
 643
 644        if (flags & MPOL_MF_LAZY) {
 645                /* Similar to task_numa_work, skip inaccessible VMAs */
 646                if (!is_vm_hugetlb_page(vma) &&
 647                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 648                        !(vma->vm_flags & VM_MIXEDMAP))
 649                        change_prot_numa(vma, start, endvma);
 650                return 1;
 651        }
 652
 653        /* queue pages from current vma */
 654        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 655                return 0;
 656        return 1;
 657}
 658
 659/*
 660 * Walk through page tables and collect pages to be migrated.
 661 *
  662 * If pages found in a given range are on a set of nodes (determined by
  663 * @nodes and @flags), they are isolated and queued on the list passed
  664 * via @pagelist.
 665 */
 666static int
 667queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 668                nodemask_t *nodes, unsigned long flags,
 669                struct list_head *pagelist)
 670{
 671        struct queue_pages qp = {
 672                .pagelist = pagelist,
 673                .flags = flags,
 674                .nmask = nodes,
 675                .prev = NULL,
 676        };
 677        struct mm_walk queue_pages_walk = {
 678                .hugetlb_entry = queue_pages_hugetlb,
 679                .pmd_entry = queue_pages_pte_range,
 680                .test_walk = queue_pages_test_walk,
 681                .mm = mm,
 682                .private = &qp,
 683        };
 684
 685        return walk_page_range(start, end, &queue_pages_walk);
 686}
 687
 688/*
 689 * Apply policy to a single VMA
 690 * This must be called with the mmap_sem held for writing.
 691 */
 692static int vma_replace_policy(struct vm_area_struct *vma,
 693                                                struct mempolicy *pol)
 694{
 695        int err;
 696        struct mempolicy *old;
 697        struct mempolicy *new;
 698
 699        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 700                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 701                 vma->vm_ops, vma->vm_file,
 702                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 703
 704        new = mpol_dup(pol);
 705        if (IS_ERR(new))
 706                return PTR_ERR(new);
 707
 708        if (vma->vm_ops && vma->vm_ops->set_policy) {
 709                err = vma->vm_ops->set_policy(vma, new);
 710                if (err)
 711                        goto err_out;
 712        }
 713
 714        old = vma->vm_policy;
 715        vma->vm_policy = new; /* protected by mmap_sem */
 716        mpol_put(old);
 717
 718        return 0;
 719 err_out:
 720        mpol_put(new);
 721        return err;
 722}
 723
 724/* Step 2: apply policy to a range and do splits. */
 725static int mbind_range(struct mm_struct *mm, unsigned long start,
 726                       unsigned long end, struct mempolicy *new_pol)
 727{
 728        struct vm_area_struct *next;
 729        struct vm_area_struct *prev;
 730        struct vm_area_struct *vma;
 731        int err = 0;
 732        pgoff_t pgoff;
 733        unsigned long vmstart;
 734        unsigned long vmend;
 735
 736        vma = find_vma(mm, start);
 737        if (!vma || vma->vm_start > start)
 738                return -EFAULT;
 739
 740        prev = vma->vm_prev;
 741        if (start > vma->vm_start)
 742                prev = vma;
 743
 744        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 745                next = vma->vm_next;
 746                vmstart = max(start, vma->vm_start);
 747                vmend   = min(end, vma->vm_end);
 748
 749                if (mpol_equal(vma_policy(vma), new_pol))
 750                        continue;
 751
 752                pgoff = vma->vm_pgoff +
 753                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 754                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 755                                 vma->anon_vma, vma->vm_file, pgoff,
 756                                 new_pol, vma->vm_userfaultfd_ctx);
 757                if (prev) {
 758                        vma = prev;
 759                        next = vma->vm_next;
 760                        if (mpol_equal(vma_policy(vma), new_pol))
 761                                continue;
 762                        /* vma_merge() joined vma && vma->next, case 8 */
 763                        goto replace;
 764                }
 765                if (vma->vm_start != vmstart) {
 766                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 767                        if (err)
 768                                goto out;
 769                }
 770                if (vma->vm_end != vmend) {
 771                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 772                        if (err)
 773                                goto out;
 774                }
 775 replace:
 776                err = vma_replace_policy(vma, new_pol);
 777                if (err)
 778                        goto out;
 779        }
 780
 781 out:
 782        return err;
 783}
 784
 785/* Set the process memory policy */
 786static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 787                             nodemask_t *nodes)
 788{
 789        struct mempolicy *new, *old;
 790        NODEMASK_SCRATCH(scratch);
 791        int ret;
 792
 793        if (!scratch)
 794                return -ENOMEM;
 795
 796        new = mpol_new(mode, flags, nodes);
 797        if (IS_ERR(new)) {
 798                ret = PTR_ERR(new);
 799                goto out;
 800        }
 801
 802        task_lock(current);
 803        ret = mpol_set_nodemask(new, nodes, scratch);
 804        if (ret) {
 805                task_unlock(current);
 806                mpol_put(new);
 807                goto out;
 808        }
 809        old = current->mempolicy;
 810        current->mempolicy = new;
 811        if (new && new->mode == MPOL_INTERLEAVE &&
 812            nodes_weight(new->v.nodes))
 813                current->il_next = first_node(new->v.nodes);
 814        task_unlock(current);
 815        mpol_put(old);
 816        ret = 0;
 817out:
 818        NODEMASK_SCRATCH_FREE(scratch);
 819        return ret;
 820}
 821
 822/*
 823 * Return nodemask for policy for get_mempolicy() query
 824 *
 825 * Called with task's alloc_lock held
 826 */
 827static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 828{
 829        nodes_clear(*nodes);
 830        if (p == &default_policy)
 831                return;
 832
 833        switch (p->mode) {
 834        case MPOL_BIND:
 835                /* Fall through */
 836        case MPOL_INTERLEAVE:
 837                *nodes = p->v.nodes;
 838                break;
 839        case MPOL_PREFERRED:
 840                if (!(p->flags & MPOL_F_LOCAL))
 841                        node_set(p->v.preferred_node, *nodes);
 842                /* else return empty node mask for local allocation */
 843                break;
 844        default:
 845                BUG();
 846        }
 847}
 848
 849static int lookup_node(unsigned long addr)
 850{
 851        struct page *p;
 852        int err;
 853
 854        err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 855        if (err >= 0) {
 856                err = page_to_nid(p);
 857                put_page(p);
 858        }
 859        return err;
 860}
 861
 862/* Retrieve NUMA policy */
 863static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 864                             unsigned long addr, unsigned long flags)
 865{
 866        int err;
 867        struct mm_struct *mm = current->mm;
 868        struct vm_area_struct *vma = NULL;
 869        struct mempolicy *pol = current->mempolicy;
 870
 871        if (flags &
 872                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 873                return -EINVAL;
 874
 875        if (flags & MPOL_F_MEMS_ALLOWED) {
 876                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 877                        return -EINVAL;
 878                *policy = 0;    /* just so it's initialized */
 879                task_lock(current);
 880                *nmask  = cpuset_current_mems_allowed;
 881                task_unlock(current);
 882                return 0;
 883        }
 884
 885        if (flags & MPOL_F_ADDR) {
 886                /*
 887                 * Do NOT fall back to task policy if the
 888                 * vma/shared policy at addr is NULL.  We
 889                 * want to return MPOL_DEFAULT in this case.
 890                 */
 891                down_read(&mm->mmap_sem);
 892                vma = find_vma_intersection(mm, addr, addr+1);
 893                if (!vma) {
 894                        up_read(&mm->mmap_sem);
 895                        return -EFAULT;
 896                }
 897                if (vma->vm_ops && vma->vm_ops->get_policy)
 898                        pol = vma->vm_ops->get_policy(vma, addr);
 899                else
 900                        pol = vma->vm_policy;
 901        } else if (addr)
 902                return -EINVAL;
 903
 904        if (!pol)
 905                pol = &default_policy;  /* indicates default behavior */
 906
 907        if (flags & MPOL_F_NODE) {
 908                if (flags & MPOL_F_ADDR) {
 909                        err = lookup_node(addr);
 910                        if (err < 0)
 911                                goto out;
 912                        *policy = err;
 913                } else if (pol == current->mempolicy &&
 914                                pol->mode == MPOL_INTERLEAVE) {
 915                        *policy = current->il_next;
 916                } else {
 917                        err = -EINVAL;
 918                        goto out;
 919                }
 920        } else {
 921                *policy = pol == &default_policy ? MPOL_DEFAULT :
 922                                                pol->mode;
 923                /*
 924                 * Internal mempolicy flags must be masked off before exposing
 925                 * the policy to userspace.
 926                 */
 927                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 928        }
 929
 930        if (vma) {
 931                up_read(&current->mm->mmap_sem);
 932                vma = NULL;
 933        }
 934
 935        err = 0;
 936        if (nmask) {
 937                if (mpol_store_user_nodemask(pol)) {
 938                        *nmask = pol->w.user_nodemask;
 939                } else {
 940                        task_lock(current);
 941                        get_policy_nodemask(pol, nmask);
 942                        task_unlock(current);
 943                }
 944        }
 945
 946 out:
 947        mpol_cond_put(pol);
 948        if (vma)
 949                up_read(&current->mm->mmap_sem);
 950        return err;
 951}
 952
 953#ifdef CONFIG_MIGRATION
 954/*
 955 * page migration
 956 */
 957static void migrate_page_add(struct page *page, struct list_head *pagelist,
 958                                unsigned long flags)
 959{
 960        /*
 961         * Avoid migrating a page that is shared with others.
 962         */
 963        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 964                if (!isolate_lru_page(page)) {
 965                        list_add_tail(&page->lru, pagelist);
 966                        inc_zone_page_state(page, NR_ISOLATED_ANON +
 967                                            page_is_file_cache(page));
 968                }
 969        }
 970}
 971
 972static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 973{
 974        if (PageHuge(page))
 975                return alloc_huge_page_node(page_hstate(compound_head(page)),
 976                                        node);
 977        else
 978                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 979                                                    __GFP_THISNODE, 0);
 980}
 981
 982/*
 983 * Migrate pages from one node to a target node.
 984 * Returns error or the number of pages not migrated.
 985 */
 986static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 987                           int flags)
 988{
 989        nodemask_t nmask;
 990        LIST_HEAD(pagelist);
 991        int err = 0;
 992
 993        nodes_clear(nmask);
 994        node_set(source, nmask);
 995
 996        /*
 997         * This does not "check" the range but isolates all pages that
 998         * need migration.  Between passing in the full user address
  999         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1000         */
1001        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1002        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1003                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1004
1005        if (!list_empty(&pagelist)) {
1006                err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1007                                        MIGRATE_SYNC, MR_SYSCALL);
1008                if (err)
1009                        putback_movable_pages(&pagelist);
1010        }
1011
1012        return err;
1013}
1014
1015/*
1016 * Move pages between the two nodesets so as to preserve the physical
1017 * layout as much as possible.
1018 *
 1019 * Returns the number of pages that could not be moved.
1020 */
1021int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1022                     const nodemask_t *to, int flags)
1023{
1024        int busy = 0;
1025        int err;
1026        nodemask_t tmp;
1027
1028        err = migrate_prep();
1029        if (err)
1030                return err;
1031
1032        down_read(&mm->mmap_sem);
1033
1034        /*
1035         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1036         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1037         * bit in 'tmp', and return that <source, dest> pair for migration.
1038         * The pair of nodemasks 'to' and 'from' define the map.
1039         *
1040         * If no pair of bits is found that way, fallback to picking some
1041         * pair of 'source' and 'dest' bits that are not the same.  If the
1042         * 'source' and 'dest' bits are the same, this represents a node
1043         * that will be migrating to itself, so no pages need move.
1044         *
1045         * If no bits are left in 'tmp', or if all remaining bits left
1046         * in 'tmp' correspond to the same bit in 'to', return false
1047         * (nothing left to migrate).
1048         *
1049         * This lets us pick a pair of nodes to migrate between, such that
1050         * if possible the dest node is not already occupied by some other
1051         * source node, minimizing the risk of overloading the memory on a
1052         * node that would happen if we migrated incoming memory to a node
1053         * before migrating outgoing memory source that same node.
1054         *
1055         * A single scan of tmp is sufficient.  As we go, we remember the
1056         * most recent <s, d> pair that moved (s != d).  If we find a pair
1057         * that not only moved, but what's better, moved to an empty slot
1058         * (d is not set in tmp), then we break out then, with that pair.
1059         * Otherwise when we finish scanning from_tmp, we at least have the
1060         * most recent <s, d> pair that moved.  If we get all the way through
1061         * the scan of tmp without finding any node that moved, much less
1062         * moved to an empty node, then there is nothing left worth migrating.
1063         */
1064
1065        tmp = *from;
1066        while (!nodes_empty(tmp)) {
1067                int s,d;
1068                int source = NUMA_NO_NODE;
1069                int dest = 0;
1070
1071                for_each_node_mask(s, tmp) {
1072
1073                        /*
1074                         * do_migrate_pages() tries to maintain the relative
1075                         * node relationship of the pages established between
1076                         * threads and memory areas.
1077                         *
1078                         * However if the number of source nodes is not equal to
1079                         * the number of destination nodes we can not preserve
1080                         * this node relative relationship.  In that case, skip
1081                         * copying memory from a node that is in the destination
1082                         * mask.
1083                         *
1084                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1085         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1086                         */
1087
1088                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1089                                                (node_isset(s, *to)))
1090                                continue;
1091
1092                        d = node_remap(s, *from, *to);
1093                        if (s == d)
1094                                continue;
1095
1096                        source = s;     /* Node moved. Memorize */
1097                        dest = d;
1098
1099                        /* dest not in remaining from nodes? */
1100                        if (!node_isset(dest, tmp))
1101                                break;
1102                }
1103                if (source == NUMA_NO_NODE)
1104                        break;
1105
1106                node_clear(source, tmp);
1107                err = migrate_to_node(mm, source, dest, flags);
1108                if (err > 0)
1109                        busy += err;
1110                if (err < 0)
1111                        break;
1112        }
1113        up_read(&mm->mmap_sem);
1114        if (err < 0)
1115                return err;
1116        return busy;
1117
1118}
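
/*
 * Illustrative trace of the scan above (made-up masks): for from={0,1}
 * and to={1,2}, the first pass sees s=0 map to d=1, but node 1 is still
 * in tmp, so it keeps looking and picks s=1 -> d=2 (an empty slot); pages
 * move from node 1 to node 2 first.  The second pass then moves node 0
 * into the node 1 slot that was just vacated.
 */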
1119
1120/*
1121 * Allocate a new page for page migration based on vma policy.
1122 * Start by assuming the page is mapped by the same vma as contains @start.
1123 * Search forward from there, if not.  N.B., this assumes that the
1124 * list of pages handed to migrate_pages()--which is how we get here--
1125 * is in virtual address order.
1126 */
1127static struct page *new_page(struct page *page, unsigned long start, int **x)
1128{
1129        struct vm_area_struct *vma;
1130        unsigned long uninitialized_var(address);
1131
1132        vma = find_vma(current->mm, start);
1133        while (vma) {
1134                address = page_address_in_vma(page, vma);
1135                if (address != -EFAULT)
1136                        break;
1137                vma = vma->vm_next;
1138        }
1139
1140        if (PageHuge(page)) {
1141                BUG_ON(!vma);
1142                return alloc_huge_page_noerr(vma, address, 1);
1143        }
1144        /*
1145         * if !vma, alloc_page_vma() will use task or system default policy
1146         */
1147        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1148}
1149#else
1150
1151static void migrate_page_add(struct page *page, struct list_head *pagelist,
1152                                unsigned long flags)
1153{
1154}
1155
1156int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1157                     const nodemask_t *to, int flags)
1158{
1159        return -ENOSYS;
1160}
1161
1162static struct page *new_page(struct page *page, unsigned long start, int **x)
1163{
1164        return NULL;
1165}
1166#endif
1167
1168static long do_mbind(unsigned long start, unsigned long len,
1169                     unsigned short mode, unsigned short mode_flags,
1170                     nodemask_t *nmask, unsigned long flags)
1171{
1172        struct mm_struct *mm = current->mm;
1173        struct mempolicy *new;
1174        unsigned long end;
1175        int err;
1176        LIST_HEAD(pagelist);
1177
1178        if (flags & ~(unsigned long)MPOL_MF_VALID)
1179                return -EINVAL;
1180        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1181                return -EPERM;
1182
1183        if (start & ~PAGE_MASK)
1184                return -EINVAL;
1185
1186        if (mode == MPOL_DEFAULT)
1187                flags &= ~MPOL_MF_STRICT;
1188
1189        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1190        end = start + len;
1191
1192        if (end < start)
1193                return -EINVAL;
1194        if (end == start)
1195                return 0;
1196
1197        new = mpol_new(mode, mode_flags, nmask);
1198        if (IS_ERR(new))
1199                return PTR_ERR(new);
1200
1201        if (flags & MPOL_MF_LAZY)
1202                new->flags |= MPOL_F_MOF;
1203
1204        /*
1205         * If we are using the default policy then operation
1206         * on discontinuous address spaces is okay after all
1207         */
1208        if (!new)
1209                flags |= MPOL_MF_DISCONTIG_OK;
1210
1211        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1212                 start, start + len, mode, mode_flags,
1213                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1214
1215        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1216
1217                err = migrate_prep();
1218                if (err)
1219                        goto mpol_out;
1220        }
1221        {
1222                NODEMASK_SCRATCH(scratch);
1223                if (scratch) {
1224                        down_write(&mm->mmap_sem);
1225                        task_lock(current);
1226                        err = mpol_set_nodemask(new, nmask, scratch);
1227                        task_unlock(current);
1228                        if (err)
1229                                up_write(&mm->mmap_sem);
1230                } else
1231                        err = -ENOMEM;
1232                NODEMASK_SCRATCH_FREE(scratch);
1233        }
1234        if (err)
1235                goto mpol_out;
1236
1237        err = queue_pages_range(mm, start, end, nmask,
1238                          flags | MPOL_MF_INVERT, &pagelist);
1239        if (!err)
1240                err = mbind_range(mm, start, end, new);
1241
1242        if (!err) {
1243                int nr_failed = 0;
1244
1245                if (!list_empty(&pagelist)) {
1246                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1247                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1248                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1249                        if (nr_failed)
1250                                putback_movable_pages(&pagelist);
1251                }
1252
1253                if (nr_failed && (flags & MPOL_MF_STRICT))
1254                        err = -EIO;
1255        } else
1256                putback_movable_pages(&pagelist);
1257
1258        up_write(&mm->mmap_sem);
1259 mpol_out:
1260        mpol_put(new);
1261        return err;
1262}
1263
1264/*
1265 * User space interface with variable sized bitmaps for nodelists.
1266 */
1267
1268/* Copy a node mask from user space. */
1269static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1270                     unsigned long maxnode)
1271{
1272        unsigned long k;
1273        unsigned long nlongs;
1274        unsigned long endmask;
1275
1276        --maxnode;
1277        nodes_clear(*nodes);
1278        if (maxnode == 0 || !nmask)
1279                return 0;
1280        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1281                return -EINVAL;
1282
1283        nlongs = BITS_TO_LONGS(maxnode);
1284        if ((maxnode % BITS_PER_LONG) == 0)
1285                endmask = ~0UL;
1286        else
1287                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1288
 1289        /* When the user specifies more nodes than supported, just check
 1290           that the unsupported part is all zero. */
1291        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1292                if (nlongs > PAGE_SIZE/sizeof(long))
1293                        return -EINVAL;
1294                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1295                        unsigned long t;
1296                        if (get_user(t, nmask + k))
1297                                return -EFAULT;
1298                        if (k == nlongs - 1) {
1299                                if (t & endmask)
1300                                        return -EINVAL;
1301                        } else if (t)
1302                                return -EINVAL;
1303                }
1304                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1305                endmask = ~0UL;
1306        }
1307
1308        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1309                return -EFAULT;
1310        nodes_addr(*nodes)[nlongs-1] &= endmask;
1311        return 0;
1312}
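
/*
 * For reference (illustrative): user space passes the nodemask as an
 * array of unsigned longs plus a bit count, e.g.
 *
 *	unsigned long mask[2] = { (1UL << 0) | (1UL << 3), 0 };
 *
 *	set_mempolicy(MPOL_BIND, mask, sizeof(mask) * 8);
 *
 * Bits set beyond the kernel's MAX_NUMNODES make get_nodes() fail with
 * -EINVAL, which is why the unused tail must stay zero.
 */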
1313
1314/* Copy a kernel node mask to user space */
1315static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1316                              nodemask_t *nodes)
1317{
1318        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1319        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1320
1321        if (copy > nbytes) {
1322                if (copy > PAGE_SIZE)
1323                        return -EINVAL;
1324                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1325                        return -EFAULT;
1326                copy = nbytes;
1327        }
1328        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1329}
1330
1331SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1332                unsigned long, mode, const unsigned long __user *, nmask,
1333                unsigned long, maxnode, unsigned, flags)
1334{
1335        nodemask_t nodes;
1336        int err;
1337        unsigned short mode_flags;
1338
1339        mode_flags = mode & MPOL_MODE_FLAGS;
1340        mode &= ~MPOL_MODE_FLAGS;
1341        if (mode >= MPOL_MAX)
1342                return -EINVAL;
1343        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1344            (mode_flags & MPOL_F_RELATIVE_NODES))
1345                return -EINVAL;
1346        err = get_nodes(&nodes, nmask, maxnode);
1347        if (err)
1348                return err;
1349        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1350}
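
/*
 * Sketch of a call from user space (illustrative): the optional mode
 * flags are OR-ed into the mode argument, e.g. binding a mapping to the
 * cpuset-relative nodes {0,1} and migrating its existing pages:
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	mbind(addr, len, MPOL_BIND | MPOL_F_RELATIVE_NODES,
 *	      &nodes, sizeof(nodes) * 8, MPOL_MF_MOVE);
 *
 * Passing both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES is rejected
 * with -EINVAL, as checked above.
 */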
1351
1352/* Set the process memory policy */
1353SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1354                unsigned long, maxnode)
1355{
1356        int err;
1357        nodemask_t nodes;
1358        unsigned short flags;
1359
1360        flags = mode & MPOL_MODE_FLAGS;
1361        mode &= ~MPOL_MODE_FLAGS;
1362        if ((unsigned int)mode >= MPOL_MAX)
1363                return -EINVAL;
1364        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1365                return -EINVAL;
1366        err = get_nodes(&nodes, nmask, maxnode);
1367        if (err)
1368                return err;
1369        return do_set_mempolicy(mode, flags, &nodes);
1370}
1371
1372SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1373                const unsigned long __user *, old_nodes,
1374                const unsigned long __user *, new_nodes)
1375{
1376        const struct cred *cred = current_cred(), *tcred;
1377        struct mm_struct *mm = NULL;
1378        struct task_struct *task;
1379        nodemask_t task_nodes;
1380        int err;
1381        nodemask_t *old;
1382        nodemask_t *new;
1383        NODEMASK_SCRATCH(scratch);
1384
1385        if (!scratch)
1386                return -ENOMEM;
1387
1388        old = &scratch->mask1;
1389        new = &scratch->mask2;
1390
1391        err = get_nodes(old, old_nodes, maxnode);
1392        if (err)
1393                goto out;
1394
1395        err = get_nodes(new, new_nodes, maxnode);
1396        if (err)
1397                goto out;
1398
1399        /* Find the mm_struct */
1400        rcu_read_lock();
1401        task = pid ? find_task_by_vpid(pid) : current;
1402        if (!task) {
1403                rcu_read_unlock();
1404                err = -ESRCH;
1405                goto out;
1406        }
1407        get_task_struct(task);
1408
1409        err = -EINVAL;
1410
1411        /*
1412         * Check if this process has the right to modify the specified
1413         * process. The right exists if the process has administrative
1414         * capabilities, superuser privileges or the same
1415         * userid as the target process.
1416         */
1417        tcred = __task_cred(task);
1418        if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1419            !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1420            !capable(CAP_SYS_NICE)) {
1421                rcu_read_unlock();
1422                err = -EPERM;
1423                goto out_put;
1424        }
1425        rcu_read_unlock();
1426
1427        task_nodes = cpuset_mems_allowed(task);
1428        /* Is the user allowed to access the target nodes? */
1429        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1430                err = -EPERM;
1431                goto out_put;
1432        }
1433
1434        if (!nodes_subset(*new, node_states[N_MEMORY])) {
1435                err = -EINVAL;
1436                goto out_put;
1437        }
1438
1439        err = security_task_movememory(task);
1440        if (err)
1441                goto out_put;
1442
1443        mm = get_task_mm(task);
1444        put_task_struct(task);
1445
1446        if (!mm) {
1447                err = -EINVAL;
1448                goto out;
1449        }
1450
1451        err = do_migrate_pages(mm, old, new,
1452                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1453
1454        mmput(mm);
1455out:
1456        NODEMASK_SCRATCH_FREE(scratch);
1457
1458        return err;
1459
1460out_put:
1461        put_task_struct(task);
1462        goto out;
1463
1464}
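
/*
 * Sketch of a call from user space (illustrative): move another task's
 * pages from node 0 to node 1, given sufficient privilege:
 *
 *	unsigned long from_mask = 1UL << 0, to_mask = 1UL << 1;
 *
 *	migrate_pages(pid, sizeof(from_mask) * 8, &from_mask, &to_mask);
 *
 * As checked above, the caller needs CAP_SYS_NICE or credentials matching
 * the target task, and the destination nodes must lie within the target's
 * cpuset.
 */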
1465
1466
1467/* Retrieve NUMA policy */
1468SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1469                unsigned long __user *, nmask, unsigned long, maxnode,
1470                unsigned long, addr, unsigned long, flags)
1471{
1472        int err;
1473        int uninitialized_var(pval);
1474        nodemask_t nodes;
1475
1476        if (nmask != NULL && maxnode < MAX_NUMNODES)
1477                return -EINVAL;
1478
1479        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1480
1481        if (err)
1482                return err;
1483
1484        if (policy && put_user(pval, policy))
1485                return -EFAULT;
1486
1487        if (nmask)
1488                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1489
1490        return err;
1491}
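
/*
 * Sketch of calls from user space (illustrative; NODE_BITS stands for a
 * bit count at least as large as the kernel's MAX_NUMNODES, e.g. taken
 * from libnuma's numa_num_possible_nodes()):
 *
 *	int mode;
 *	unsigned long nodes[NODE_BITS / (8 * sizeof(unsigned long))];
 *
 *	// Query the calling task's policy and its nodemask.
 *	get_mempolicy(&mode, nodes, NODE_BITS, 0, 0);
 *
 *	// Ask which node currently backs the page at addr.
 *	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */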
1492
1493#ifdef CONFIG_COMPAT
1494
1495COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1496                       compat_ulong_t __user *, nmask,
1497                       compat_ulong_t, maxnode,
1498                       compat_ulong_t, addr, compat_ulong_t, flags)
1499{
1500        long err;
1501        unsigned long __user *nm = NULL;
1502        unsigned long nr_bits, alloc_size;
1503        DECLARE_BITMAP(bm, MAX_NUMNODES);
1504
1505        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1506        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1507
1508        if (nmask)
1509                nm = compat_alloc_user_space(alloc_size);
1510
1511        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1512
1513        if (!err && nmask) {
1514                unsigned long copy_size;
1515                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1516                err = copy_from_user(bm, nm, copy_size);
1517                /* ensure entire bitmap is zeroed */
1518                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1519                err |= compat_put_bitmap(nmask, bm, nr_bits);
1520        }
1521
1522        return err;
1523}
1524
1525COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1526                       compat_ulong_t, maxnode)
1527{
1528        long err = 0;
1529        unsigned long __user *nm = NULL;
1530        unsigned long nr_bits, alloc_size;
1531        DECLARE_BITMAP(bm, MAX_NUMNODES);
1532
1533        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1534        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1535
1536        if (nmask) {
1537                err = compat_get_bitmap(bm, nmask, nr_bits);
1538                nm = compat_alloc_user_space(alloc_size);
1539                err |= copy_to_user(nm, bm, alloc_size);
1540        }
1541
1542        if (err)
1543                return -EFAULT;
1544
1545        return sys_set_mempolicy(mode, nm, nr_bits+1);
1546}
1547
1548COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1549                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1550                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1551{
1552        long err = 0;
1553        unsigned long __user *nm = NULL;
1554        unsigned long nr_bits, alloc_size;
1555        nodemask_t bm;
1556
1557        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1558        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1559
1560        if (nmask) {
1561                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1562                nm = compat_alloc_user_space(alloc_size);
1563                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1564        }
1565
1566        if (err)
1567                return -EFAULT;
1568
1569        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1570}
1571
1572#endif
1573
1574struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1575                                                unsigned long addr)
1576{
1577        struct mempolicy *pol = NULL;
1578
1579        if (vma) {
1580                if (vma->vm_ops && vma->vm_ops->get_policy) {
1581                        pol = vma->vm_ops->get_policy(vma, addr);
1582                } else if (vma->vm_policy) {
1583                        pol = vma->vm_policy;
1584
1585                        /*
1586                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1587                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1588                         * count on these policies, which will be dropped by
1589                         * mpol_cond_put() later.
1590                         */
1591                        if (mpol_needs_cond_ref(pol))
1592                                mpol_get(pol);
1593                }
1594        }
1595
1596        return pol;
1597}
1598
1599/*
1600 * get_vma_policy(@vma, @addr)
1601 * @vma: virtual memory area whose policy is sought
1602 * @addr: address in @vma for shared policy lookup
1603 *
1604 * Returns effective policy for a VMA at specified address.
1605 * Falls back to current->mempolicy or system default policy, as necessary.
1606 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1607 * count--added by the get_policy() vm_op, as appropriate--to protect against
1608 * freeing by another task.  It is the caller's responsibility to free the
1609 * extra reference for shared policies.
1610 */
1611static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1612                                                unsigned long addr)
1613{
1614        struct mempolicy *pol = __get_vma_policy(vma, addr);
1615
1616        if (!pol)
1617                pol = get_task_policy(current);
1618
1619        return pol;
1620}
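
    /*
     * A typical caller pairs this lookup with a conditional drop of the
     * extra reference, along the lines of:
     *
     *   pol = get_vma_policy(vma, addr);
     *   ... use pol to choose a node or zonelist ...
     *   mpol_cond_put(pol);
     *
     * mpol_cond_put() is a no-op unless the policy is marked
     * MPOL_F_SHARED, matching the reference taken for shared policies.
     */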
1621
1622bool vma_policy_mof(struct vm_area_struct *vma)
1623{
1624        struct mempolicy *pol;
1625
1626        if (vma->vm_ops && vma->vm_ops->get_policy) {
1627                bool ret = false;
1628
1629                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1630                if (pol && (pol->flags & MPOL_F_MOF))
1631                        ret = true;
1632                mpol_cond_put(pol);
1633
1634                return ret;
1635        }
1636
1637        pol = vma->vm_policy;
1638        if (!pol)
1639                pol = get_task_policy(current);
1640
1641        return pol->flags & MPOL_F_MOF;
1642}
1643
1644static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1645{
1646        enum zone_type dynamic_policy_zone = policy_zone;
1647
1648        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1649
1650        /*
1651         * If policy->v.nodes has movable memory only,
1652         * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1653         *
1654         * policy->v.nodes has already been intersected with
1655         * node_states[N_MEMORY], so if the following test fails, it
1656         * implies that policy->v.nodes has movable memory only.
1657         */
1658        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1659                dynamic_policy_zone = ZONE_MOVABLE;
1660
1661        return zone >= dynamic_policy_zone;
1662}
1663
1664/*
1665 * Return a nodemask representing a mempolicy for filtering nodes for
1666 * page allocation
1667 */
1668static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1669{
1670        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1671        if (unlikely(policy->mode == MPOL_BIND) &&
1672                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1673                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1674                return &policy->v.nodes;
1675
1676        return NULL;
1677}
1678
1679/* Return a zonelist indicated by gfp for node representing a mempolicy */
1680static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1681        int nd)
1682{
1683        switch (policy->mode) {
1684        case MPOL_PREFERRED:
1685                if (!(policy->flags & MPOL_F_LOCAL))
1686                        nd = policy->v.preferred_node;
1687                break;
1688        case MPOL_BIND:
1689                /*
1690                 * Normally, MPOL_BIND allocations are node-local within the
1691                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1692                 * current node isn't part of the mask, we use the zonelist for
1693                 * the first node in the mask instead.
1694                 */
1695                if (unlikely(gfp & __GFP_THISNODE) &&
1696                                unlikely(!node_isset(nd, policy->v.nodes)))
1697                        nd = first_node(policy->v.nodes);
1698                break;
1699        default:
1700                BUG();
1701        }
1702        return node_zonelist(nd, gfp);
1703}
1704
1705/* Do dynamic interleaving for a process */
1706static unsigned interleave_nodes(struct mempolicy *policy)
1707{
1708        unsigned nid, next;
1709        struct task_struct *me = current;
1710
1711        nid = me->il_next;
1712        next = next_node(nid, policy->v.nodes);
1713        if (next >= MAX_NUMNODES)
1714                next = first_node(policy->v.nodes);
1715        if (next < MAX_NUMNODES)
1716                me->il_next = next;
1717        return nid;
1718}
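
    /*
     * Illustration: with policy->v.nodes = {0,2,5} and il_next currently
     * at 0, successive calls return 0, 2, 5, 0, ... -- a simple
     * round-robin over the allowed nodes, advanced once per call.
     */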
1719
1720/*
1721 * Depending on the memory policy provide a node from which to allocate the
1722 * next slab entry.
1723 */
1724unsigned int mempolicy_slab_node(void)
1725{
1726        struct mempolicy *policy;
1727        int node = numa_mem_id();
1728
1729        if (in_interrupt())
1730                return node;
1731
1732        policy = current->mempolicy;
1733        if (!policy || policy->flags & MPOL_F_LOCAL)
1734                return node;
1735
1736        switch (policy->mode) {
1737        case MPOL_PREFERRED:
1738                /*
1739                 * handled MPOL_F_LOCAL above
1740                 */
1741                return policy->v.preferred_node;
1742
1743        case MPOL_INTERLEAVE:
1744                return interleave_nodes(policy);
1745
1746        case MPOL_BIND: {
1747                /*
1748                 * Follow bind policy behavior and start allocation at the
1749                 * first node.
1750                 */
1751                struct zonelist *zonelist;
1752                struct zone *zone;
1753                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1754                zonelist = &NODE_DATA(node)->node_zonelists[0];
1755                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1756                                                        &policy->v.nodes,
1757                                                        &zone);
1758                return zone ? zone->node : node;
1759        }
1760
1761        default:
1762                BUG();
1763        }
1764}
1765
1766/* Do static interleaving for a VMA with known offset. */
1767static unsigned offset_il_node(struct mempolicy *pol,
1768                struct vm_area_struct *vma, unsigned long off)
1769{
1770        unsigned nnodes = nodes_weight(pol->v.nodes);
1771        unsigned target;
1772        int c;
1773        int nid = NUMA_NO_NODE;
1774
1775        if (!nnodes)
1776                return numa_node_id();
1777        target = (unsigned int)off % nnodes;
1778        c = 0;
1779        do {
1780                nid = next_node(nid, pol->v.nodes);
1781                c++;
1782        } while (c <= target);
1783        return nid;
1784}
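
    /*
     * Worked example: with pol->v.nodes = {1,3,6} (weight 3) and off = 8,
     * target = 8 % 3 = 2 and the loop stops on the third set node, so the
     * function returns node 6.  The mapping depends only on the offset,
     * which is what makes VMA-based interleaving "static".
     */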
1785
1786/* Determine a node number for interleave */
1787static inline unsigned interleave_nid(struct mempolicy *pol,
1788                 struct vm_area_struct *vma, unsigned long addr, int shift)
1789{
1790        if (vma) {
1791                unsigned long off;
1792
1793                /*
1794                 * for small pages, there is no difference between
1795                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1796                 * for huge pages, since vm_pgoff is in units of small
1797                 * pages, we need to shift off the always 0 bits to get
1798                 * a useful offset.
1799                 */
1800                BUG_ON(shift < PAGE_SHIFT);
1801                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1802                off += (addr - vma->vm_start) >> shift;
1803                return offset_il_node(pol, vma, off);
1804        } else
1805                return interleave_nodes(pol);
1806}
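
    /*
     * Example of the shift handling, assuming 4K base pages and a 2MB
     * huge page (shift == 21): vm_pgoff is in 4K units, so
     * vm_pgoff >> (21 - PAGE_SHIFT) drops the nine always-zero bits, and
     * (addr - vm_start) >> 21 adds the huge-page index within the VMA.
     */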
1807
1808/*
1809 * Return the bit number of a random bit set in the nodemask.
1810 * (returns NUMA_NO_NODE if nodemask is empty)
1811 */
1812int node_random(const nodemask_t *maskp)
1813{
1814        int w, bit = NUMA_NO_NODE;
1815
1816        w = nodes_weight(*maskp);
1817        if (w)
1818                bit = bitmap_ord_to_pos(maskp->bits,
1819                        get_random_int() % w, MAX_NUMNODES);
1820        return bit;
1821}
1822
1823#ifdef CONFIG_HUGETLBFS
1824/*
1825 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1826 * @vma: virtual memory area whose policy is sought
1827 * @addr: address in @vma for shared policy lookup and interleave policy
1828 * @gfp_flags: for requested zone
1829 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1830 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1831 *
1832 * Returns a zonelist suitable for a huge page allocation and a pointer
1833 * to the struct mempolicy for conditional unref after allocation.
1834 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1835 * @nodemask for filtering the zonelist.
1836 *
1837 * Must be protected by read_mems_allowed_begin()
1838 */
1839struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1840                                gfp_t gfp_flags, struct mempolicy **mpol,
1841                                nodemask_t **nodemask)
1842{
1843        struct zonelist *zl;
1844
1845        *mpol = get_vma_policy(vma, addr);
1846        *nodemask = NULL;       /* assume !MPOL_BIND */
1847
1848        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1849                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1850                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1851        } else {
1852                zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1853                if ((*mpol)->mode == MPOL_BIND)
1854                        *nodemask = &(*mpol)->v.nodes;
1855        }
1856        return zl;
1857}
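
    /*
     * Sketch of the expected caller pattern (e.g. the hugetlb fault
     * path); the dequeue step is illustrative only:
     *
     *   retry:
     *           cookie = read_mems_allowed_begin();
     *           zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
     *           page = ...dequeue a huge page from zl/nodemask...;
     *           mpol_cond_put(mpol);
     *           if (!page && read_mems_allowed_retry(cookie))
     *                   goto retry;
     */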
1858
1859/*
1860 * init_nodemask_of_mempolicy
1861 *
1862 * If the current task's mempolicy is "default" [NULL], return 'false'
1863 * to indicate default policy.  Otherwise, extract the policy nodemask
1864 * for 'bind' or 'interleave' policy into the argument nodemask, or
1865 * initialize the argument nodemask to contain the single node for
1866 * 'preferred' or 'local' policy and return 'true' to indicate presence
1867 * of non-default mempolicy.
1868 *
1869 * We don't bother with reference counting the mempolicy [mpol_get/put]
1870 * because the current task is examining its own mempolicy and a task's
1871 * mempolicy is only ever changed by the task itself.
1872 *
1873 * N.B., it is the caller's responsibility to free a returned nodemask.
1874 */
1875bool init_nodemask_of_mempolicy(nodemask_t *mask)
1876{
1877        struct mempolicy *mempolicy;
1878        int nid;
1879
1880        if (!(mask && current->mempolicy))
1881                return false;
1882
1883        task_lock(current);
1884        mempolicy = current->mempolicy;
1885        switch (mempolicy->mode) {
1886        case MPOL_PREFERRED:
1887                if (mempolicy->flags & MPOL_F_LOCAL)
1888                        nid = numa_node_id();
1889                else
1890                        nid = mempolicy->v.preferred_node;
1891                init_nodemask_of_node(mask, nid);
1892                break;
1893
1894        case MPOL_BIND:
1895                /* Fall through */
1896        case MPOL_INTERLEAVE:
1897                *mask = mempolicy->v.nodes;
1898                break;
1899
1900        default:
1901                BUG();
1902        }
1903        task_unlock(current);
1904
1905        return true;
1906}
1907#endif
1908
1909/*
1910 * mempolicy_nodemask_intersects
1911 *
1912 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1913 * policy.  Otherwise, check for intersection between mask and the policy
1914 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1915 * policy, always return true since it may allocate elsewhere on fallback.
1916 *
1917 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1918 */
1919bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1920                                        const nodemask_t *mask)
1921{
1922        struct mempolicy *mempolicy;
1923        bool ret = true;
1924
1925        if (!mask)
1926                return ret;
1927        task_lock(tsk);
1928        mempolicy = tsk->mempolicy;
1929        if (!mempolicy)
1930                goto out;
1931
1932        switch (mempolicy->mode) {
1933        case MPOL_PREFERRED:
1934                /*
1935                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1936                 * allocate from; the task may fall back to other nodes under OOM.
1937                 * Thus, it's possible for tsk to have allocated memory from
1938                 * nodes in mask.
1939                 */
1940                break;
1941        case MPOL_BIND:
1942        case MPOL_INTERLEAVE:
1943                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1944                break;
1945        default:
1946                BUG();
1947        }
1948out:
1949        task_unlock(tsk);
1950        return ret;
1951}
1952
1953/* Allocate a page in interleaved policy.
1954   Own path because it needs to do special accounting. */
1955static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1956                                        unsigned nid)
1957{
1958        struct zonelist *zl;
1959        struct page *page;
1960
1961        zl = node_zonelist(nid, gfp);
1962        page = __alloc_pages(gfp, order, zl);
1963        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1964                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1965        return page;
1966}
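
    /*
     * The NUMA_INTERLEAVE_HIT increment above is what appears as
     * "interleave_hit" in /sys/devices/system/node/nodeN/numastat; the
     * zone check skips the increment when the allocation fell back to
     * some other node.
     */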
1967
1968/**
1969 *      alloc_pages_vma - Allocate a page for a VMA.
1970 *
1971 *      @gfp:
1972 *      %GFP_USER    user allocation.
1973 *      %GFP_KERNEL  kernel allocations,
1974 *      %GFP_HIGHMEM highmem/user allocations,
1975 *      %GFP_FS      allocation should not call back into a file system.
1976 *      %GFP_ATOMIC  don't sleep.
1977 *
1978 *      @order: Order of the GFP allocation.
1979 *      @vma:  Pointer to VMA or NULL if not available.
1980 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1981 *      @node: Which node to prefer for allocation (modulo policy).
1982 *      @hugepage: for hugepages try only the preferred node if possible
1983 *
1984 *      This function allocates a page from the kernel page pool and applies
1985 *      a NUMA policy associated with the VMA or the current process.
1986 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1987 *      mm_struct of the VMA to prevent it from going away. Should be used for
1988 *      all allocations for pages that will be mapped into user space. Returns
1989 *      NULL when no page can be allocated.
1990 */
1991struct page *
1992alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1993                unsigned long addr, int node, bool hugepage)
1994{
1995        struct mempolicy *pol;
1996        struct page *page;
1997        unsigned int cpuset_mems_cookie;
1998        struct zonelist *zl;
1999        nodemask_t *nmask;
2000
2001retry_cpuset:
2002        pol = get_vma_policy(vma, addr);
2003        cpuset_mems_cookie = read_mems_allowed_begin();
2004
2005        if (pol->mode == MPOL_INTERLEAVE) {
2006                unsigned nid;
2007
2008                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2009                mpol_cond_put(pol);
2010                page = alloc_page_interleave(gfp, order, nid);
2011                goto out;
2012        }
2013
2014        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2015                int hpage_node = node;
2016
2017                /*
2018                 * For hugepage allocation and non-interleave policy which
2019                 * allows the current node (or other explicitly preferred
2020                 * node) we only try to allocate from the current/preferred
2021                 * node and don't fall back to other nodes, as the cost of
2022                 * remote accesses would likely offset THP benefits.
2023                 *
2024                 * If the policy is interleave, or does not allow the current
2025                 * node in its nodemask, we allocate the standard way.
2026                 */
2027                if (pol->mode == MPOL_PREFERRED &&
2028                                                !(pol->flags & MPOL_F_LOCAL))
2029                        hpage_node = pol->v.preferred_node;
2030
2031                nmask = policy_nodemask(gfp, pol);
2032                if (!nmask || node_isset(hpage_node, *nmask)) {
2033                        mpol_cond_put(pol);
2034                        page = __alloc_pages_node(hpage_node,
2035                                                gfp | __GFP_THISNODE, order);
2036                        goto out;
2037                }
2038        }
2039
2040        nmask = policy_nodemask(gfp, pol);
2041        zl = policy_zonelist(gfp, pol, node);
2042        mpol_cond_put(pol);
2043        page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2044out:
2045        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2046                goto retry_cpuset;
2047        return page;
2048}
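
    /*
     * Illustrative call from a fault path, with the caller already
     * holding down_read() on the mm's mmap_sem as required above;
     * GFP_HIGHUSER_MOVABLE is just one plausible gfp mask:
     *
     *   page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
     *                          numa_node_id(), false);
     */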
2049
2050/**
2051 *      alloc_pages_current - Allocate pages.
2052 *
2053 *      @gfp:
2054 *              %GFP_USER   user allocation,
2055 *              %GFP_KERNEL kernel allocation,
2056 *              %GFP_HIGHMEM highmem allocation,
2057 *              %GFP_FS     don't call back into a file system.
2058 *              %GFP_ATOMIC don't sleep.
2059 *      @order: Power of two of allocation size in pages. 0 is a single page.
2060 *
2061 *      Allocate a page from the kernel page pool.  When not in
2062 *      interrupt context, apply the current process' NUMA policy.
2063 *      Returns NULL when no page can be allocated.
2064 *
2065 *      Don't call cpuset_update_task_memory_state() unless
2066 *      1) it's ok to take cpuset_sem (can WAIT), and
2067 *      2) allocating for current task (not interrupt).
2068 */
2069struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2070{
2071        struct mempolicy *pol = &default_policy;
2072        struct page *page;
2073        unsigned int cpuset_mems_cookie;
2074
2075        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2076                pol = get_task_policy(current);
2077
2078retry_cpuset:
2079        cpuset_mems_cookie = read_mems_allowed_begin();
2080
2081        /*
2082         * No reference counting needed for current->mempolicy
2083         * nor system default_policy
2084         */
2085        if (pol->mode == MPOL_INTERLEAVE)
2086                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2087        else
2088                page = __alloc_pages_nodemask(gfp, order,
2089                                policy_zonelist(gfp, pol, numa_node_id()),
2090                                policy_nodemask(gfp, pol));
2091
2092        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2093                goto retry_cpuset;
2094
2095        return page;
2096}
2097EXPORT_SYMBOL(alloc_pages_current);
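
    /*
     * On CONFIG_NUMA kernels the generic alloc_pages()/alloc_page()
     * helpers resolve to alloc_pages_current(), so a plain
     * alloc_pages(GFP_KERNEL, 0) in process context already honours the
     * calling task's mempolicy.
     */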
2098
2099int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2100{
2101        struct mempolicy *pol = mpol_dup(vma_policy(src));
2102
2103        if (IS_ERR(pol))
2104                return PTR_ERR(pol);
2105        dst->vm_policy = pol;
2106        return 0;
2107}
2108
2109/*
2110 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2111 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2112 * with the mems_allowed returned by cpuset_mems_allowed().  This
2113 * keeps mempolicies cpuset relative after its cpuset moves.  See
2114 * further kernel/cpuset.c update_nodemask().
2115 *
2116 * current's mempolicy may be rebound by another task (the task that changes
2117 * the cpuset's mems), so we needn't do the rebind work for the current task.
2118 */
2119
2120/* Slow path of a mempolicy duplicate */
2121struct mempolicy *__mpol_dup(struct mempolicy *old)
2122{
2123        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2124
2125        if (!new)
2126                return ERR_PTR(-ENOMEM);
2127
2128        /* task's mempolicy is protected by alloc_lock */
2129        if (old == current->mempolicy) {
2130                task_lock(current);
2131                *new = *old;
2132                task_unlock(current);
2133        } else
2134                *new = *old;
2135
2136        if (current_cpuset_is_being_rebound()) {
2137                nodemask_t mems = cpuset_mems_allowed(current);
2138                if (new->flags & MPOL_F_REBINDING)
2139                        mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2140                else
2141                        mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2142        }
2143        atomic_set(&new->refcnt, 1);
2144        return new;
2145}
2146
2147/* Slow path of a mempolicy comparison */
2148bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2149{
2150        if (!a || !b)
2151                return false;
2152        if (a->mode != b->mode)
2153                return false;
2154        if (a->flags != b->flags)
2155                return false;
2156        if (mpol_store_user_nodemask(a))
2157                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2158                        return false;
2159
2160        switch (a->mode) {
2161        case MPOL_BIND:
2162                /* Fall through */
2163        case MPOL_INTERLEAVE:
2164                return !!nodes_equal(a->v.nodes, b->v.nodes);
2165        case MPOL_PREFERRED:
2166                return a->v.preferred_node == b->v.preferred_node;
2167        default:
2168                BUG();
2169                return false;
2170        }
2171}
2172
2173/*
2174 * Shared memory backing store policy support.
2175 *
2176 * Remember policies even when nobody has shared memory mapped.
2177 * The policies are kept in Red-Black tree linked from the inode.
2178 * They are protected by the sp->lock rwlock, which should be held
2179 * for any accesses to the tree.
2180 */
2181
2182/*
2183 * lookup first element intersecting start-end.  Caller holds sp->lock for
2184 * reading or for writing
2185 */
2186static struct sp_node *
2187sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2188{
2189        struct rb_node *n = sp->root.rb_node;
2190
2191        while (n) {
2192                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2193
2194                if (start >= p->end)
2195                        n = n->rb_right;
2196                else if (end <= p->start)
2197                        n = n->rb_left;
2198                else
2199                        break;
2200        }
2201        if (!n)
2202                return NULL;
2203        for (;;) {
2204                struct sp_node *w = NULL;
2205                struct rb_node *prev = rb_prev(n);
2206                if (!prev)
2207                        break;
2208                w = rb_entry(prev, struct sp_node, nd);
2209                if (w->end <= start)
2210                        break;
2211                n = prev;
2212        }
2213        return rb_entry(n, struct sp_node, nd);
2214}
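
    /*
     * Example: with stored ranges [0,4) and [4,8), sp_lookup(sp, 2, 6)
     * finds some node intersecting [2,6) during the rb-tree descent and
     * then walks rb_prev() back to the leftmost intersecting node, so it
     * returns the [0,4) entry.
     */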
2215
2216/*
2217 * Insert a new shared policy into the list.  Caller holds sp->lock for
2218 * writing.
2219 */
2220static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2221{
2222        struct rb_node **p = &sp->root.rb_node;
2223        struct rb_node *parent = NULL;
2224        struct sp_node *nd;
2225
2226        while (*p) {
2227                parent = *p;
2228                nd = rb_entry(parent, struct sp_node, nd);
2229                if (new->start < nd->start)
2230                        p = &(*p)->rb_left;
2231                else if (new->end > nd->end)
2232                        p = &(*p)->rb_right;
2233                else
2234                        BUG();
2235        }
2236        rb_link_node(&new->nd, parent, p);
2237        rb_insert_color(&new->nd, &sp->root);
2238        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2239                 new->policy ? new->policy->mode : 0);
2240}
2241
2242/* Find shared policy intersecting idx */
2243struct mempolicy *
2244mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2245{
2246        struct mempolicy *pol = NULL;
2247        struct sp_node *sn;
2248
2249        if (!sp->root.rb_node)
2250                return NULL;
2251        read_lock(&sp->lock);
2252        sn = sp_lookup(sp, idx, idx+1);
2253        if (sn) {
2254                mpol_get(sn->policy);
2255                pol = sn->policy;
2256        }
2257        read_unlock(&sp->lock);
2258        return pol;
2259}
2260
2261static void sp_free(struct sp_node *n)
2262{
2263        mpol_put(n->policy);
2264        kmem_cache_free(sn_cache, n);
2265}
2266
2267/**
2268 * mpol_misplaced - check whether current page node is valid in policy
2269 *
2270 * @page: page to be checked
2271 * @vma: vm area where page mapped
2272 * @addr: virtual address where page mapped
2273 *
2274 * Lookup current policy node id for vma,addr and "compare to" page's
2275 * node id.
2276 *
2277 * Returns:
2278 *      -1      - not misplaced, page is in the right node
2279 *      node    - node id where the page should be
2280 *
2281 * Policy determination "mimics" alloc_page_vma().
2282 * Called from fault path where we know the vma and faulting address.
2283 */
2284int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2285{
2286        struct mempolicy *pol;
2287        struct zone *zone;
2288        int curnid = page_to_nid(page);
2289        unsigned long pgoff;
2290        int thiscpu = raw_smp_processor_id();
2291        int thisnid = cpu_to_node(thiscpu);
2292        int polnid = -1;
2293        int ret = -1;
2294
2295        BUG_ON(!vma);
2296
2297        pol = get_vma_policy(vma, addr);
2298        if (!(pol->flags & MPOL_F_MOF))
2299                goto out;
2300
2301        switch (pol->mode) {
2302        case MPOL_INTERLEAVE:
2303                BUG_ON(addr >= vma->vm_end);
2304                BUG_ON(addr < vma->vm_start);
2305
2306                pgoff = vma->vm_pgoff;
2307                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2308                polnid = offset_il_node(pol, vma, pgoff);
2309                break;
2310
2311        case MPOL_PREFERRED:
2312                if (pol->flags & MPOL_F_LOCAL)
2313                        polnid = numa_node_id();
2314                else
2315                        polnid = pol->v.preferred_node;
2316                break;
2317
2318        case MPOL_BIND:
2319                /*
2320                 * allows binding to multiple nodes.
2321                 * use current page if in policy nodemask,
2322                 * else select nearest allowed node, if any.
2323                 * If no allowed nodes, use current [!misplaced].
2324                 */
2325                if (node_isset(curnid, pol->v.nodes))
2326                        goto out;
2327                (void)first_zones_zonelist(
2328                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2329                                gfp_zone(GFP_HIGHUSER),
2330                                &pol->v.nodes, &zone);
2331                polnid = zone->node;
2332                break;
2333
2334        default:
2335                BUG();
2336        }
2337
2338        /* Migrate the page towards the node whose CPU is referencing it */
2339        if (pol->flags & MPOL_F_MORON) {
2340                polnid = thisnid;
2341
2342                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2343                        goto out;
2344        }
2345
2346        if (curnid != polnid)
2347                ret = polnid;
2348out:
2349        mpol_cond_put(pol);
2350
2351        return ret;
2352}
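
    /*
     * The NUMA hinting fault path is the expected consumer: roughly,
     * do_numa_page() asks mpol_misplaced() for a target node and, if one
     * is returned, hands the page to migrate_misplaced_page().
     */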
2353
2354static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2355{
2356        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2357        rb_erase(&n->nd, &sp->root);
2358        sp_free(n);
2359}
2360
2361static void sp_node_init(struct sp_node *node, unsigned long start,
2362                        unsigned long end, struct mempolicy *pol)
2363{
2364        node->start = start;
2365        node->end = end;
2366        node->policy = pol;
2367}
2368
2369static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2370                                struct mempolicy *pol)
2371{
2372        struct sp_node *n;
2373        struct mempolicy *newpol;
2374
2375        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2376        if (!n)
2377                return NULL;
2378
2379        newpol = mpol_dup(pol);
2380        if (IS_ERR(newpol)) {
2381                kmem_cache_free(sn_cache, n);
2382                return NULL;
2383        }
2384        newpol->flags |= MPOL_F_SHARED;
2385        sp_node_init(n, start, end, newpol);
2386
2387        return n;
2388}
2389
2390/* Replace a policy range. */
2391static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2392                                 unsigned long end, struct sp_node *new)
2393{
2394        struct sp_node *n;
2395        struct sp_node *n_new = NULL;
2396        struct mempolicy *mpol_new = NULL;
2397        int ret = 0;
2398
2399restart:
2400        write_lock(&sp->lock);
2401        n = sp_lookup(sp, start, end);
2402        /* Take care of old policies in the same range. */
2403        while (n && n->start < end) {
2404                struct rb_node *next = rb_next(&n->nd);
2405                if (n->start >= start) {
2406                        if (n->end <= end)
2407                                sp_delete(sp, n);
2408                        else
2409                                n->start = end;
2410                } else {
2411                        /* Old policy spanning whole new range. */
2412                        if (n->end > end) {
2413                                if (!n_new)
2414                                        goto alloc_new;
2415
2416                                *mpol_new = *n->policy;
2417                                atomic_set(&mpol_new->refcnt, 1);
2418                                sp_node_init(n_new, end, n->end, mpol_new);
2419                                n->end = start;
2420                                sp_insert(sp, n_new);
2421                                n_new = NULL;
2422                                mpol_new = NULL;
2423                                break;
2424                        } else
2425                                n->end = start;
2426                }
2427                if (!next)
2428                        break;
2429                n = rb_entry(next, struct sp_node, nd);
2430        }
2431        if (new)
2432                sp_insert(sp, new);
2433        write_unlock(&sp->lock);
2434        ret = 0;
2435
2436err_out:
2437        if (mpol_new)
2438                mpol_put(mpol_new);
2439        if (n_new)
2440                kmem_cache_free(sn_cache, n_new);
2441
2442        return ret;
2443
2444alloc_new:
2445        write_unlock(&sp->lock);
2446        ret = -ENOMEM;
2447        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2448        if (!n_new)
2449                goto err_out;
2450        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2451        if (!mpol_new)
2452                goto err_out;
2453        goto restart;
2454}
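
    /*
     * Example of the splitting above: if an existing node covers pgoff
     * range [0,10) and the new node covers [3,6), the old node is trimmed
     * to [0,3), a copy of its policy is inserted for [6,10), and the new
     * node is inserted for [3,6).
     */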
2455
2456/**
2457 * mpol_shared_policy_init - initialize shared policy for inode
2458 * @sp: pointer to inode shared policy
2459 * @mpol:  struct mempolicy to install
2460 *
2461 * Install non-NULL @mpol in inode's shared policy rb-tree.
2462 * On entry, the current task has a reference on a non-NULL @mpol.
2463 * This must be released on exit.
2464 * This is called at get_inode() time, so we can use GFP_KERNEL.
2465 */
2466void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2467{
2468        int ret;
2469
2470        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2471        rwlock_init(&sp->lock);
2472
2473        if (mpol) {
2474                struct vm_area_struct pvma;
2475                struct mempolicy *new;
2476                NODEMASK_SCRATCH(scratch);
2477
2478                if (!scratch)
2479                        goto put_mpol;
2480                /* contextualize the tmpfs mount point mempolicy */
2481                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2482                if (IS_ERR(new))
2483                        goto free_scratch; /* no valid nodemask intersection */
2484
2485                task_lock(current);
2486                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2487                task_unlock(current);
2488                if (ret)
2489                        goto put_new;
2490
2491                /* Create pseudo-vma that contains just the policy */
2492                memset(&pvma, 0, sizeof(struct vm_area_struct));
2493                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2494                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2495
2496put_new:
2497                mpol_put(new);                  /* drop initial ref */
2498free_scratch:
2499                NODEMASK_SCRATCH_FREE(scratch);
2500put_mpol:
2501                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2502        }
2503}
2504
2505int mpol_set_shared_policy(struct shared_policy *info,
2506                        struct vm_area_struct *vma, struct mempolicy *npol)
2507{
2508        int err;
2509        struct sp_node *new = NULL;
2510        unsigned long sz = vma_pages(vma);
2511
2512        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2513                 vma->vm_pgoff,
2514                 sz, npol ? npol->mode : -1,
2515                 npol ? npol->flags : -1,
2516                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2517
2518        if (npol) {
2519                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2520                if (!new)
2521                        return -ENOMEM;
2522        }
2523        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2524        if (err && new)
2525                sp_free(new);
2526        return err;
2527}
2528
2529/* Free a backing policy store on inode delete. */
2530void mpol_free_shared_policy(struct shared_policy *p)
2531{
2532        struct sp_node *n;
2533        struct rb_node *next;
2534
2535        if (!p->root.rb_node)
2536                return;
2537        write_lock(&p->lock);
2538        next = rb_first(&p->root);
2539        while (next) {
2540                n = rb_entry(next, struct sp_node, nd);
2541                next = rb_next(&n->nd);
2542                sp_delete(p, n);
2543        }
2544        write_unlock(&p->lock);
2545}
2546
2547#ifdef CONFIG_NUMA_BALANCING
2548static int __initdata numabalancing_override;
2549
2550static void __init check_numabalancing_enable(void)
2551{
2552        bool numabalancing_default = false;
2553
2554        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2555                numabalancing_default = true;
2556
2557        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2558        if (numabalancing_override)
2559                set_numabalancing_state(numabalancing_override == 1);
2560
2561        if (num_online_nodes() > 1 && !numabalancing_override) {
2562                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2563                        numabalancing_default ? "Enabling" : "Disabling");
2564                set_numabalancing_state(numabalancing_default);
2565        }
2566}
2567
2568static int __init setup_numabalancing(char *str)
2569{
2570        int ret = 0;
2571        if (!str)
2572                goto out;
2573
2574        if (!strcmp(str, "enable")) {
2575                numabalancing_override = 1;
2576                ret = 1;
2577        } else if (!strcmp(str, "disable")) {
2578                numabalancing_override = -1;
2579                ret = 1;
2580        }
2581out:
2582        if (!ret)
2583                pr_warn("Unable to parse numa_balancing=\n");
2584
2585        return ret;
2586}
2587__setup("numa_balancing=", setup_numabalancing);
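
    /*
     * I.e. booting with "numa_balancing=enable" or "numa_balancing=disable"
     * on the kernel command line overrides the Kconfig default consulted
     * in check_numabalancing_enable().
     */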
2588#else
2589static inline void __init check_numabalancing_enable(void)
2590{
2591}
2592#endif /* CONFIG_NUMA_BALANCING */
2593
2594/* assumes fs == KERNEL_DS */
2595void __init numa_policy_init(void)
2596{
2597        nodemask_t interleave_nodes;
2598        unsigned long largest = 0;
2599        int nid, prefer = 0;
2600
2601        policy_cache = kmem_cache_create("numa_policy",
2602                                         sizeof(struct mempolicy),
2603                                         0, SLAB_PANIC, NULL);
2604
2605        sn_cache = kmem_cache_create("shared_policy_node",
2606                                     sizeof(struct sp_node),
2607                                     0, SLAB_PANIC, NULL);
2608
2609        for_each_node(nid) {
2610                preferred_node_policy[nid] = (struct mempolicy) {
2611                        .refcnt = ATOMIC_INIT(1),
2612                        .mode = MPOL_PREFERRED,
2613                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2614                        .v = { .preferred_node = nid, },
2615                };
2616        }
2617
2618        /*
2619         * Set interleaving policy for system init. Interleaving is only
2620         * enabled across suitably sized nodes (default is >= 16MB), or
2621         * fall back to the largest node if they're all smaller.
2622         */
2623        nodes_clear(interleave_nodes);
2624        for_each_node_state(nid, N_MEMORY) {
2625                unsigned long total_pages = node_present_pages(nid);
2626
2627                /* Preserve the largest node */
2628                if (largest < total_pages) {
2629                        largest = total_pages;
2630                        prefer = nid;
2631                }
2632
2633                /* Interleave this node? */
2634                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2635                        node_set(nid, interleave_nodes);
2636        }
2637
2638        /* All too small, use the largest */
2639        if (unlikely(nodes_empty(interleave_nodes)))
2640                node_set(prefer, interleave_nodes);
2641
2642        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2643                pr_err("%s: interleaving failed\n", __func__);
2644
2645        check_numabalancing_enable();
2646}
2647
2648/* Reset policy of current process to default */
2649void numa_default_policy(void)
2650{
2651        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2652}
2653
2654/*
2655 * Parse and format mempolicy from/to strings
2656 */
2657
2658/*
2659 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2660 */
2661static const char * const policy_modes[] =
2662{
2663        [MPOL_DEFAULT]    = "default",
2664        [MPOL_PREFERRED]  = "prefer",
2665        [MPOL_BIND]       = "bind",
2666        [MPOL_INTERLEAVE] = "interleave",
2667        [MPOL_LOCAL]      = "local",
2668};
2669
2670
2671#ifdef CONFIG_TMPFS
2672/**
2673 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2674 * @str:  string containing mempolicy to parse
2675 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2676 *
2677 * Format of input:
2678 *      <mode>[=<flags>][:<nodelist>]
2679 *
2680 * On success, returns 0, else 1
2681 */
2682int mpol_parse_str(char *str, struct mempolicy **mpol)
2683{
2684        struct mempolicy *new = NULL;
2685        unsigned short mode;
2686        unsigned short mode_flags;
2687        nodemask_t nodes;
2688        char *nodelist = strchr(str, ':');
2689        char *flags = strchr(str, '=');
2690        int err = 1;
2691
2692        if (nodelist) {
2693                /* NUL-terminate mode or flags string */
2694                *nodelist++ = '\0';
2695                if (nodelist_parse(nodelist, nodes))
2696                        goto out;
2697                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2698                        goto out;
2699        } else
2700                nodes_clear(nodes);
2701
2702        if (flags)
2703                *flags++ = '\0';        /* terminate mode string */
2704
2705        for (mode = 0; mode < MPOL_MAX; mode++) {
2706                if (!strcmp(str, policy_modes[mode])) {
2707                        break;
2708                }
2709        }
2710        if (mode >= MPOL_MAX)
2711                goto out;
2712
2713        switch (mode) {
2714        case MPOL_PREFERRED:
2715                /*
2716                 * Insist on a nodelist of one node only
2717                 */
2718                if (nodelist) {
2719                        char *rest = nodelist;
2720                        while (isdigit(*rest))
2721                                rest++;
2722                        if (*rest)
2723                                goto out;
2724                }
2725                break;
2726        case MPOL_INTERLEAVE:
2727                /*
2728                 * Default to online nodes with memory if no nodelist
2729                 */
2730                if (!nodelist)
2731                        nodes = node_states[N_MEMORY];
2732                break;
2733        case MPOL_LOCAL:
2734                /*
2735                 * Don't allow a nodelist;  mpol_new() checks flags
2736                 */
2737                if (nodelist)
2738                        goto out;
2739                mode = MPOL_PREFERRED;
2740                break;
2741        case MPOL_DEFAULT:
2742                /*
2743                 * Insist on an empty nodelist
2744                 */
2745                if (!nodelist)
2746                        err = 0;
2747                goto out;
2748        case MPOL_BIND:
2749                /*
2750                 * Insist on a nodelist
2751                 */
2752                if (!nodelist)
2753                        goto out;
2754        }
2755
2756        mode_flags = 0;
2757        if (flags) {
2758                /*
2759                 * Currently, we only support two mutually exclusive
2760                 * mode flags.
2761                 */
2762                if (!strcmp(flags, "static"))
2763                        mode_flags |= MPOL_F_STATIC_NODES;
2764                else if (!strcmp(flags, "relative"))
2765                        mode_flags |= MPOL_F_RELATIVE_NODES;
2766                else
2767                        goto out;
2768        }
2769
2770        new = mpol_new(mode, mode_flags, &nodes);
2771        if (IS_ERR(new))
2772                goto out;
2773
2774        /*
2775         * Save nodes for mpol_to_str() to show the tmpfs mount options
2776         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2777         */
2778        if (mode != MPOL_PREFERRED)
2779                new->v.nodes = nodes;
2780        else if (nodelist)
2781                new->v.preferred_node = first_node(nodes);
2782        else
2783                new->flags |= MPOL_F_LOCAL;
2784
2785        /*
2786         * Save nodes for contextualization: this will be used to "clone"
2787         * the mempolicy in a specific context [cpuset] at a later time.
2788         */
2789        new->w.user_nodemask = nodes;
2790
2791        err = 0;
2792
2793out:
2794        /* Restore string for error message */
2795        if (nodelist)
2796                *--nodelist = ':';
2797        if (flags)
2798                *--flags = '=';
2799        if (!err)
2800                *mpol = new;
2801        return err;
2802}
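
    /*
     * Example strings accepted by mpol_parse_str() above, as seen in
     * tmpfs "mpol=" mount options: "bind:0-3", "interleave=relative:0,2",
     * "prefer:1", "local" and "default".
     */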
2803#endif /* CONFIG_TMPFS */
2804
2805/**
2806 * mpol_to_str - format a mempolicy structure for printing
2807 * @buffer:  to contain formatted mempolicy string
2808 * @maxlen:  length of @buffer
2809 * @pol:  pointer to mempolicy to be formatted
2810 *
2811 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2812 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2813 * longest flag, "relative", and to display at least a few node ids.
2814 */
2815void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2816{
2817        char *p = buffer;
2818        nodemask_t nodes = NODE_MASK_NONE;
2819        unsigned short mode = MPOL_DEFAULT;
2820        unsigned short flags = 0;
2821
2822        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2823                mode = pol->mode;
2824                flags = pol->flags;
2825        }
2826
2827        switch (mode) {
2828        case MPOL_DEFAULT:
2829                break;
2830        case MPOL_PREFERRED:
2831                if (flags & MPOL_F_LOCAL)
2832                        mode = MPOL_LOCAL;
2833                else
2834                        node_set(pol->v.preferred_node, nodes);
2835                break;
2836        case MPOL_BIND:
2837        case MPOL_INTERLEAVE:
2838                nodes = pol->v.nodes;
2839                break;
2840        default:
2841                WARN_ON_ONCE(1);
2842                snprintf(p, maxlen, "unknown");
2843                return;
2844        }
2845
2846        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2847
2848        if (flags & MPOL_MODE_FLAGS) {
2849                p += snprintf(p, buffer + maxlen - p, "=");
2850
2851                /*
2852                 * Currently, the only defined flags are mutually exclusive
2853                 */
2854                if (flags & MPOL_F_STATIC_NODES)
2855                        p += snprintf(p, buffer + maxlen - p, "static");
2856                else if (flags & MPOL_F_RELATIVE_NODES)
2857                        p += snprintf(p, buffer + maxlen - p, "relative");
2858        }
2859
2860        if (!nodes_empty(nodes))
2861                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2862                               nodemask_pr_args(&nodes));
2863}
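
    /*
     * Sample outputs: "default", "prefer:2", "bind:0-3",
     * "interleave=static:0,2" and "local" -- the same syntax that
     * mpol_parse_str() accepts for tmpfs mounts.
     */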
2864