linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process
   20 *                counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
   28 * preferred      Try a specific node first before normal fallback.
   29 *                As a special case NUMA_NO_NODE here means do the allocation
   30 *                on the local CPU. This is normally identical to default,
   31 *                but useful to set in a VMA when you have a non-default
   32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
   53 * all users and remembered even when nobody has the memory mapped.
  54 */
  55
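/*
 * Editorial example (a sketch, not part of the original source): the
 * policies above are selected from userspace through set_mempolicy(2) and
 * mbind(2), e.g. via the <numaif.h> wrappers shipped with libnuma.  The
 * node numbers, the maxnode value of 64 and the buf/len mapping below are
 * illustrative assumptions only.
 *
 *      #include <numaif.h>
 *
 *      unsigned long both = (1UL << 0) | (1UL << 1);   // nodes 0 and 1
 *      unsigned long node0 = 1UL << 0;                 // node 0 only
 *
 *      // interleave this task's future allocations across nodes 0 and 1
 *      set_mempolicy(MPOL_INTERLEAVE, &both, 64);
 *
 *      // restrict one existing mapping to node 0, migrating resident pages
 *      mbind(buf, len, MPOL_BIND, &node0, 64, MPOL_MF_MOVE);
 */
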
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
   64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/swap.h>
  89#include <linux/seq_file.h>
  90#include <linux/proc_fs.h>
  91#include <linux/migrate.h>
  92#include <linux/ksm.h>
  93#include <linux/rmap.h>
  94#include <linux/security.h>
  95#include <linux/syscalls.h>
  96#include <linux/ctype.h>
  97#include <linux/mm_inline.h>
  98#include <linux/mmu_notifier.h>
  99#include <linux/printk.h>
 100
 101#include <asm/tlbflush.h>
 102#include <linux/uaccess.h>
 103
 104#include "internal.h"
 105
 106/* Internal flags */
 107#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 108#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 109
 110static struct kmem_cache *policy_cache;
 111static struct kmem_cache *sn_cache;
 112
  113/* Highest zone. A specific allocation for a zone below that is not
 114   policied. */
 115enum zone_type policy_zone = 0;
 116
 117/*
 118 * run-time system-wide default policy => local allocation
 119 */
 120static struct mempolicy default_policy = {
 121        .refcnt = ATOMIC_INIT(1), /* never free it */
 122        .mode = MPOL_PREFERRED,
 123        .flags = MPOL_F_LOCAL,
 124};
 125
 126static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 127
 128struct mempolicy *get_task_policy(struct task_struct *p)
 129{
 130        struct mempolicy *pol = p->mempolicy;
 131        int node;
 132
 133        if (pol)
 134                return pol;
 135
 136        node = numa_node_id();
 137        if (node != NUMA_NO_NODE) {
 138                pol = &preferred_node_policy[node];
 139                /* preferred_node_policy is not initialised early in boot */
 140                if (pol->mode)
 141                        return pol;
 142        }
 143
 144        return &default_policy;
 145}
 146
 147static const struct mempolicy_operations {
 148        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 149        /*
  150         * If the read-side task holds no lock protecting task->mempolicy,
  151         * the write-side task rebinds task->mempolicy in two steps: first
  152         * it sets all the newly allowed nodes, then it clears all the now
  153         * disallowed nodes. This avoids any window in which the policy is
  154         * left with no node to allocate from (illustrated below this table).
  155         * If the read side does hold a lock protecting task->mempolicy, the
  156         * rebind is done directly in one step.
  157         *
  158         * step:
  159         *      MPOL_REBIND_ONCE  - do the whole rebind at once
  160         *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
  161         *      MPOL_REBIND_STEP2 - clear all the now-disallowed nodes
 162         */
 163        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 164                        enum mpol_rebind_step step);
 165} mpol_ops[MPOL_MAX];
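
/*
 * Editorial illustration (not from the original source): suppose a task's
 * interleave policy covers nodes {0,1} and its cpuset is rebound to nodes
 * {2,3}.  MPOL_REBIND_STEP1 first widens the policy to {0,1,2,3}, so a
 * lockless reader always sees at least one usable node, and
 * MPOL_REBIND_STEP2 then narrows it to {2,3}.  MPOL_REBIND_ONCE performs
 * both transitions in a single call when the reader side is known to hold
 * a lock.
 */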
 166
 167static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 168{
 169        return pol->flags & MPOL_MODE_FLAGS;
 170}
 171
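/*
 * Worked example (editorial note): with MPOL_F_RELATIVE_NODES the user's
 * nodemask is interpreted relative to the currently allowed set.  For
 * orig = {0,2} and rel = {4,5,6}, nodes_fold() folds orig modulo
 * nodes_weight(rel) = 3, giving {0,2}, and nodes_onto() then maps bit n to
 * the n-th set bit of rel, producing ret = {4,6}.
 */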
 172static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 173                                   const nodemask_t *rel)
 174{
 175        nodemask_t tmp;
 176        nodes_fold(tmp, *orig, nodes_weight(*rel));
 177        nodes_onto(*ret, tmp, *rel);
 178}
 179
 180static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 181{
 182        if (nodes_empty(*nodes))
 183                return -EINVAL;
 184        pol->v.nodes = *nodes;
 185        return 0;
 186}
 187
 188static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 189{
 190        if (!nodes)
 191                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 192        else if (nodes_empty(*nodes))
 193                return -EINVAL;                 /*  no allowed nodes */
 194        else
 195                pol->v.preferred_node = first_node(*nodes);
 196        return 0;
 197}
 198
 199static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 200{
 201        if (nodes_empty(*nodes))
 202                return -EINVAL;
 203        pol->v.nodes = *nodes;
 204        return 0;
 205}
 206
 207/*
 208 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 209 * any, for the new policy.  mpol_new() has already validated the nodes
 210 * parameter with respect to the policy mode and flags.  But, we need to
 211 * handle an empty nodemask with MPOL_PREFERRED here.
 212 *
 213 * Must be called holding task's alloc_lock to protect task's mems_allowed
 214 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 215 */
 216static int mpol_set_nodemask(struct mempolicy *pol,
 217                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 218{
 219        int ret;
 220
 221        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 222        if (pol == NULL)
 223                return 0;
 224        /* Check N_MEMORY */
 225        nodes_and(nsc->mask1,
 226                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 227
 228        VM_BUG_ON(!nodes);
 229        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 230                nodes = NULL;   /* explicit local allocation */
 231        else {
 232                if (pol->flags & MPOL_F_RELATIVE_NODES)
 233                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 234                else
 235                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 236
 237                if (mpol_store_user_nodemask(pol))
 238                        pol->w.user_nodemask = *nodes;
 239                else
 240                        pol->w.cpuset_mems_allowed =
 241                                                cpuset_current_mems_allowed;
 242        }
 243
 244        if (nodes)
 245                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 246        else
 247                ret = mpol_ops[pol->mode].create(pol, NULL);
 248        return ret;
 249}
 250
 251/*
  252 * This function just creates a new policy, does some sanity checks and
  253 * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
 254 */
 255static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 256                                  nodemask_t *nodes)
 257{
 258        struct mempolicy *policy;
 259
 260        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 261                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 262
 263        if (mode == MPOL_DEFAULT) {
 264                if (nodes && !nodes_empty(*nodes))
 265                        return ERR_PTR(-EINVAL);
 266                return NULL;
 267        }
 268        VM_BUG_ON(!nodes);
 269
 270        /*
 271         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 272         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 273         * All other modes require a valid pointer to a non-empty nodemask.
 274         */
 275        if (mode == MPOL_PREFERRED) {
 276                if (nodes_empty(*nodes)) {
 277                        if (((flags & MPOL_F_STATIC_NODES) ||
 278                             (flags & MPOL_F_RELATIVE_NODES)))
 279                                return ERR_PTR(-EINVAL);
 280                }
 281        } else if (mode == MPOL_LOCAL) {
 282                if (!nodes_empty(*nodes) ||
 283                    (flags & MPOL_F_STATIC_NODES) ||
 284                    (flags & MPOL_F_RELATIVE_NODES))
 285                        return ERR_PTR(-EINVAL);
 286                mode = MPOL_PREFERRED;
 287        } else if (nodes_empty(*nodes))
 288                return ERR_PTR(-EINVAL);
 289        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 290        if (!policy)
 291                return ERR_PTR(-ENOMEM);
 292        atomic_set(&policy->refcnt, 1);
 293        policy->mode = mode;
 294        policy->flags = flags;
 295
 296        return policy;
 297}
 298
 299/* Slow path of a mpol destructor. */
 300void __mpol_put(struct mempolicy *p)
 301{
 302        if (!atomic_dec_and_test(&p->refcnt))
 303                return;
 304        kmem_cache_free(policy_cache, p);
 305}
 306
 307static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 308                                enum mpol_rebind_step step)
 309{
 310}
 311
 312/*
 313 * step:
  314 *      MPOL_REBIND_ONCE  - do the whole rebind at once
  315 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
  316 *      MPOL_REBIND_STEP2 - clear all the now-disallowed nodes
 317 */
 318static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 319                                 enum mpol_rebind_step step)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES)
 324                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 325        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 326                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 327        else {
 328                /*
 329                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 330                 * result
 331                 */
 332                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 333                        nodes_remap(tmp, pol->v.nodes,
 334                                        pol->w.cpuset_mems_allowed, *nodes);
 335                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 336                } else if (step == MPOL_REBIND_STEP2) {
 337                        tmp = pol->w.cpuset_mems_allowed;
 338                        pol->w.cpuset_mems_allowed = *nodes;
 339                } else
 340                        BUG();
 341        }
 342
 343        if (nodes_empty(tmp))
 344                tmp = *nodes;
 345
 346        if (step == MPOL_REBIND_STEP1)
 347                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 348        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 349                pol->v.nodes = tmp;
 350        else
 351                BUG();
 352
 353        if (!node_isset(current->il_next, tmp)) {
 354                current->il_next = next_node_in(current->il_next, tmp);
 355                if (current->il_next >= MAX_NUMNODES)
 356                        current->il_next = numa_node_id();
 357        }
 358}
 359
 360static void mpol_rebind_preferred(struct mempolicy *pol,
 361                                  const nodemask_t *nodes,
 362                                  enum mpol_rebind_step step)
 363{
 364        nodemask_t tmp;
 365
 366        if (pol->flags & MPOL_F_STATIC_NODES) {
 367                int node = first_node(pol->w.user_nodemask);
 368
 369                if (node_isset(node, *nodes)) {
 370                        pol->v.preferred_node = node;
 371                        pol->flags &= ~MPOL_F_LOCAL;
 372                } else
 373                        pol->flags |= MPOL_F_LOCAL;
 374        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 375                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 376                pol->v.preferred_node = first_node(tmp);
 377        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 378                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 379                                                   pol->w.cpuset_mems_allowed,
 380                                                   *nodes);
 381                pol->w.cpuset_mems_allowed = *nodes;
 382        }
 383}
 384
 385/*
 386 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 387 *
  388 * If the read-side task holds no lock protecting task->mempolicy, the
  389 * write-side task rebinds task->mempolicy in two steps: first it sets
  390 * all the newly allowed nodes, then it clears all the now-disallowed
  391 * nodes. This avoids any window in which the policy is left with no
  392 * node to allocate from.
  393 * If the read side does hold a lock protecting task->mempolicy, the
  394 * rebind is done directly in one step.
  395 *
  396 * step:
  397 *      MPOL_REBIND_ONCE  - do the whole rebind at once
  398 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
  399 *      MPOL_REBIND_STEP2 - clear all the now-disallowed nodes
 400 */
 401static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 402                                enum mpol_rebind_step step)
 403{
 404        if (!pol)
 405                return;
 406        if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 407            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 408                return;
 409
 410        if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 411                return;
 412
 413        if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 414                BUG();
 415
 416        if (step == MPOL_REBIND_STEP1)
 417                pol->flags |= MPOL_F_REBINDING;
 418        else if (step == MPOL_REBIND_STEP2)
 419                pol->flags &= ~MPOL_F_REBINDING;
 420        else if (step >= MPOL_REBIND_NSTEP)
 421                BUG();
 422
 423        mpol_ops[pol->mode].rebind(pol, newmask, step);
 424}
 425
 426/*
 427 * Wrapper for mpol_rebind_policy() that just requires task
 428 * pointer, and updates task mempolicy.
 429 *
 430 * Called with task's alloc_lock held.
 431 */
 432
 433void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 434                        enum mpol_rebind_step step)
 435{
 436        mpol_rebind_policy(tsk->mempolicy, new, step);
 437}
 438
 439/*
 440 * Rebind each vma in mm to new nodemask.
 441 *
 442 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 443 */
 444
 445void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 446{
 447        struct vm_area_struct *vma;
 448
 449        down_write(&mm->mmap_sem);
 450        for (vma = mm->mmap; vma; vma = vma->vm_next)
 451                mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 452        up_write(&mm->mmap_sem);
 453}
 454
 455static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 456        [MPOL_DEFAULT] = {
 457                .rebind = mpol_rebind_default,
 458        },
 459        [MPOL_INTERLEAVE] = {
 460                .create = mpol_new_interleave,
 461                .rebind = mpol_rebind_nodemask,
 462        },
 463        [MPOL_PREFERRED] = {
 464                .create = mpol_new_preferred,
 465                .rebind = mpol_rebind_preferred,
 466        },
 467        [MPOL_BIND] = {
 468                .create = mpol_new_bind,
 469                .rebind = mpol_rebind_nodemask,
 470        },
 471};
 472
 473static void migrate_page_add(struct page *page, struct list_head *pagelist,
 474                                unsigned long flags);
 475
 476struct queue_pages {
 477        struct list_head *pagelist;
 478        unsigned long flags;
 479        nodemask_t *nmask;
 480        struct vm_area_struct *prev;
 481};
 482
 483/*
  484 * Scan through the pages, checking whether each one satisfies the given
  485 * conditions, and move the ones that do to the pagelist.
 486 */
 487static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 488                        unsigned long end, struct mm_walk *walk)
 489{
 490        struct vm_area_struct *vma = walk->vma;
 491        struct page *page;
 492        struct queue_pages *qp = walk->private;
 493        unsigned long flags = qp->flags;
 494        int nid, ret;
 495        pte_t *pte;
 496        spinlock_t *ptl;
 497
 498        if (pmd_trans_huge(*pmd)) {
 499                ptl = pmd_lock(walk->mm, pmd);
 500                if (pmd_trans_huge(*pmd)) {
 501                        page = pmd_page(*pmd);
 502                        if (is_huge_zero_page(page)) {
 503                                spin_unlock(ptl);
 504                                __split_huge_pmd(vma, pmd, addr, false, NULL);
 505                        } else {
 506                                get_page(page);
 507                                spin_unlock(ptl);
 508                                lock_page(page);
 509                                ret = split_huge_page(page);
 510                                unlock_page(page);
 511                                put_page(page);
 512                                if (ret)
 513                                        return 0;
 514                        }
 515                } else {
 516                        spin_unlock(ptl);
 517                }
 518        }
 519
 520        if (pmd_trans_unstable(pmd))
 521                return 0;
 522retry:
 523        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 524        for (; addr != end; pte++, addr += PAGE_SIZE) {
 525                if (!pte_present(*pte))
 526                        continue;
 527                page = vm_normal_page(vma, addr, *pte);
 528                if (!page)
 529                        continue;
 530                /*
 531                 * vm_normal_page() filters out zero pages, but there might
 532                 * still be PageReserved pages to skip, perhaps in a VDSO.
 533                 */
 534                if (PageReserved(page))
 535                        continue;
 536                nid = page_to_nid(page);
 537                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 538                        continue;
 539                if (PageTransCompound(page)) {
 540                        get_page(page);
 541                        pte_unmap_unlock(pte, ptl);
 542                        lock_page(page);
 543                        ret = split_huge_page(page);
 544                        unlock_page(page);
 545                        put_page(page);
 546                        /* Failed to split -- skip. */
 547                        if (ret) {
 548                                pte = pte_offset_map_lock(walk->mm, pmd,
 549                                                addr, &ptl);
 550                                continue;
 551                        }
 552                        goto retry;
 553                }
 554
 555                migrate_page_add(page, qp->pagelist, flags);
 556        }
 557        pte_unmap_unlock(pte - 1, ptl);
 558        cond_resched();
 559        return 0;
 560}
 561
 562static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 563                               unsigned long addr, unsigned long end,
 564                               struct mm_walk *walk)
 565{
 566#ifdef CONFIG_HUGETLB_PAGE
 567        struct queue_pages *qp = walk->private;
 568        unsigned long flags = qp->flags;
 569        int nid;
 570        struct page *page;
 571        spinlock_t *ptl;
 572        pte_t entry;
 573
 574        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 575        entry = huge_ptep_get(pte);
 576        if (!pte_present(entry))
 577                goto unlock;
 578        page = pte_page(entry);
 579        nid = page_to_nid(page);
 580        if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 581                goto unlock;
 582        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 583        if (flags & (MPOL_MF_MOVE_ALL) ||
 584            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 585                isolate_huge_page(page, qp->pagelist);
 586unlock:
 587        spin_unlock(ptl);
 588#else
 589        BUG();
 590#endif
 591        return 0;
 592}
 593
 594#ifdef CONFIG_NUMA_BALANCING
 595/*
 596 * This is used to mark a range of virtual addresses to be inaccessible.
 597 * These are later cleared by a NUMA hinting fault. Depending on these
 598 * faults, pages may be migrated for better NUMA placement.
 599 *
  600 * This assumes that NUMA faults are handled using PROT_NONE. If
 601 * an architecture makes a different choice, it will need further
 602 * changes to the core.
 603 */
 604unsigned long change_prot_numa(struct vm_area_struct *vma,
 605                        unsigned long addr, unsigned long end)
 606{
 607        int nr_updated;
 608
 609        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 610        if (nr_updated)
 611                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 612
 613        return nr_updated;
 614}
 615#else
 616static unsigned long change_prot_numa(struct vm_area_struct *vma,
 617                        unsigned long addr, unsigned long end)
 618{
 619        return 0;
 620}
 621#endif /* CONFIG_NUMA_BALANCING */
 622
 623static int queue_pages_test_walk(unsigned long start, unsigned long end,
 624                                struct mm_walk *walk)
 625{
 626        struct vm_area_struct *vma = walk->vma;
 627        struct queue_pages *qp = walk->private;
 628        unsigned long endvma = vma->vm_end;
 629        unsigned long flags = qp->flags;
 630
 631        if (!vma_migratable(vma))
 632                return 1;
 633
 634        if (endvma > end)
 635                endvma = end;
 636        if (vma->vm_start > start)
 637                start = vma->vm_start;
 638
 639        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 640                if (!vma->vm_next && vma->vm_end < end)
 641                        return -EFAULT;
 642                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 643                        return -EFAULT;
 644        }
 645
 646        qp->prev = vma;
 647
 648        if (flags & MPOL_MF_LAZY) {
 649                /* Similar to task_numa_work, skip inaccessible VMAs */
 650                if (!is_vm_hugetlb_page(vma) &&
 651                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 652                        !(vma->vm_flags & VM_MIXEDMAP))
 653                        change_prot_numa(vma, start, endvma);
 654                return 1;
 655        }
 656
 657        /* queue pages from current vma */
 658        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 659                return 0;
 660        return 1;
 661}
 662
 663/*
 664 * Walk through page tables and collect pages to be migrated.
 665 *
  666 * If pages found in the given range are on the set of nodes determined by
  667 * @nodes and @flags, they are isolated and queued on the pagelist passed
  668 * in via @pagelist.
 669 */
 670static int
 671queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 672                nodemask_t *nodes, unsigned long flags,
 673                struct list_head *pagelist)
 674{
 675        struct queue_pages qp = {
 676                .pagelist = pagelist,
 677                .flags = flags,
 678                .nmask = nodes,
 679                .prev = NULL,
 680        };
 681        struct mm_walk queue_pages_walk = {
 682                .hugetlb_entry = queue_pages_hugetlb,
 683                .pmd_entry = queue_pages_pte_range,
 684                .test_walk = queue_pages_test_walk,
 685                .mm = mm,
 686                .private = &qp,
 687        };
 688
 689        return walk_page_range(start, end, &queue_pages_walk);
 690}
 691
 692/*
 693 * Apply policy to a single VMA
 694 * This must be called with the mmap_sem held for writing.
 695 */
 696static int vma_replace_policy(struct vm_area_struct *vma,
 697                                                struct mempolicy *pol)
 698{
 699        int err;
 700        struct mempolicy *old;
 701        struct mempolicy *new;
 702
 703        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 704                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 705                 vma->vm_ops, vma->vm_file,
 706                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 707
 708        new = mpol_dup(pol);
 709        if (IS_ERR(new))
 710                return PTR_ERR(new);
 711
 712        if (vma->vm_ops && vma->vm_ops->set_policy) {
 713                err = vma->vm_ops->set_policy(vma, new);
 714                if (err)
 715                        goto err_out;
 716        }
 717
 718        old = vma->vm_policy;
 719        vma->vm_policy = new; /* protected by mmap_sem */
 720        mpol_put(old);
 721
 722        return 0;
 723 err_out:
 724        mpol_put(new);
 725        return err;
 726}
 727
 728/* Step 2: apply policy to a range and do splits. */
 729static int mbind_range(struct mm_struct *mm, unsigned long start,
 730                       unsigned long end, struct mempolicy *new_pol)
 731{
 732        struct vm_area_struct *next;
 733        struct vm_area_struct *prev;
 734        struct vm_area_struct *vma;
 735        int err = 0;
 736        pgoff_t pgoff;
 737        unsigned long vmstart;
 738        unsigned long vmend;
 739
 740        vma = find_vma(mm, start);
 741        if (!vma || vma->vm_start > start)
 742                return -EFAULT;
 743
 744        prev = vma->vm_prev;
 745        if (start > vma->vm_start)
 746                prev = vma;
 747
 748        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 749                next = vma->vm_next;
 750                vmstart = max(start, vma->vm_start);
 751                vmend   = min(end, vma->vm_end);
 752
 753                if (mpol_equal(vma_policy(vma), new_pol))
 754                        continue;
 755
 756                pgoff = vma->vm_pgoff +
 757                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 758                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 759                                 vma->anon_vma, vma->vm_file, pgoff,
 760                                 new_pol, vma->vm_userfaultfd_ctx);
 761                if (prev) {
 762                        vma = prev;
 763                        next = vma->vm_next;
 764                        if (mpol_equal(vma_policy(vma), new_pol))
 765                                continue;
 766                        /* vma_merge() joined vma && vma->next, case 8 */
 767                        goto replace;
 768                }
 769                if (vma->vm_start != vmstart) {
 770                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 771                        if (err)
 772                                goto out;
 773                }
 774                if (vma->vm_end != vmend) {
 775                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 776                        if (err)
 777                                goto out;
 778                }
 779 replace:
 780                err = vma_replace_policy(vma, new_pol);
 781                if (err)
 782                        goto out;
 783        }
 784
 785 out:
 786        return err;
 787}
 788
 789/* Set the process memory policy */
 790static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 791                             nodemask_t *nodes)
 792{
 793        struct mempolicy *new, *old;
 794        NODEMASK_SCRATCH(scratch);
 795        int ret;
 796
 797        if (!scratch)
 798                return -ENOMEM;
 799
 800        new = mpol_new(mode, flags, nodes);
 801        if (IS_ERR(new)) {
 802                ret = PTR_ERR(new);
 803                goto out;
 804        }
 805
 806        task_lock(current);
 807        ret = mpol_set_nodemask(new, nodes, scratch);
 808        if (ret) {
 809                task_unlock(current);
 810                mpol_put(new);
 811                goto out;
 812        }
 813        old = current->mempolicy;
 814        current->mempolicy = new;
 815        if (new && new->mode == MPOL_INTERLEAVE &&
 816            nodes_weight(new->v.nodes))
 817                current->il_next = first_node(new->v.nodes);
 818        task_unlock(current);
 819        mpol_put(old);
 820        ret = 0;
 821out:
 822        NODEMASK_SCRATCH_FREE(scratch);
 823        return ret;
 824}
 825
 826/*
 827 * Return nodemask for policy for get_mempolicy() query
 828 *
 829 * Called with task's alloc_lock held
 830 */
 831static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 832{
 833        nodes_clear(*nodes);
 834        if (p == &default_policy)
 835                return;
 836
 837        switch (p->mode) {
 838        case MPOL_BIND:
 839                /* Fall through */
 840        case MPOL_INTERLEAVE:
 841                *nodes = p->v.nodes;
 842                break;
 843        case MPOL_PREFERRED:
 844                if (!(p->flags & MPOL_F_LOCAL))
 845                        node_set(p->v.preferred_node, *nodes);
 846                /* else return empty node mask for local allocation */
 847                break;
 848        default:
 849                BUG();
 850        }
 851}
 852
 853static int lookup_node(unsigned long addr)
 854{
 855        struct page *p;
 856        int err;
 857
 858        err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
 859        if (err >= 0) {
 860                err = page_to_nid(p);
 861                put_page(p);
 862        }
 863        return err;
 864}
 865
 866/* Retrieve NUMA policy */
 867static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 868                             unsigned long addr, unsigned long flags)
 869{
 870        int err;
 871        struct mm_struct *mm = current->mm;
 872        struct vm_area_struct *vma = NULL;
 873        struct mempolicy *pol = current->mempolicy;
 874
 875        if (flags &
 876                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 877                return -EINVAL;
 878
 879        if (flags & MPOL_F_MEMS_ALLOWED) {
 880                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 881                        return -EINVAL;
 882                *policy = 0;    /* just so it's initialized */
 883                task_lock(current);
 884                *nmask  = cpuset_current_mems_allowed;
 885                task_unlock(current);
 886                return 0;
 887        }
 888
 889        if (flags & MPOL_F_ADDR) {
 890                /*
 891                 * Do NOT fall back to task policy if the
 892                 * vma/shared policy at addr is NULL.  We
 893                 * want to return MPOL_DEFAULT in this case.
 894                 */
 895                down_read(&mm->mmap_sem);
 896                vma = find_vma_intersection(mm, addr, addr+1);
 897                if (!vma) {
 898                        up_read(&mm->mmap_sem);
 899                        return -EFAULT;
 900                }
 901                if (vma->vm_ops && vma->vm_ops->get_policy)
 902                        pol = vma->vm_ops->get_policy(vma, addr);
 903                else
 904                        pol = vma->vm_policy;
 905        } else if (addr)
 906                return -EINVAL;
 907
 908        if (!pol)
 909                pol = &default_policy;  /* indicates default behavior */
 910
 911        if (flags & MPOL_F_NODE) {
 912                if (flags & MPOL_F_ADDR) {
 913                        err = lookup_node(addr);
 914                        if (err < 0)
 915                                goto out;
 916                        *policy = err;
 917                } else if (pol == current->mempolicy &&
 918                                pol->mode == MPOL_INTERLEAVE) {
 919                        *policy = current->il_next;
 920                } else {
 921                        err = -EINVAL;
 922                        goto out;
 923                }
 924        } else {
 925                *policy = pol == &default_policy ? MPOL_DEFAULT :
 926                                                pol->mode;
 927                /*
 928                 * Internal mempolicy flags must be masked off before exposing
 929                 * the policy to userspace.
 930                 */
 931                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 932        }
 933
 934        if (vma) {
 935                up_read(&current->mm->mmap_sem);
 936                vma = NULL;
 937        }
 938
 939        err = 0;
 940        if (nmask) {
 941                if (mpol_store_user_nodemask(pol)) {
 942                        *nmask = pol->w.user_nodemask;
 943                } else {
 944                        task_lock(current);
 945                        get_policy_nodemask(pol, nmask);
 946                        task_unlock(current);
 947                }
 948        }
 949
 950 out:
 951        mpol_cond_put(pol);
 952        if (vma)
 953                up_read(&current->mm->mmap_sem);
 954        return err;
 955}
 956
 957#ifdef CONFIG_MIGRATION
 958/*
 959 * page migration
 960 */
 961static void migrate_page_add(struct page *page, struct list_head *pagelist,
 962                                unsigned long flags)
 963{
 964        /*
 965         * Avoid migrating a page that is shared with others.
 966         */
 967        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 968                if (!isolate_lru_page(page)) {
 969                        list_add_tail(&page->lru, pagelist);
 970                        inc_node_page_state(page, NR_ISOLATED_ANON +
 971                                            page_is_file_cache(page));
 972                }
 973        }
 974}
 975
 976static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 977{
 978        if (PageHuge(page))
 979                return alloc_huge_page_node(page_hstate(compound_head(page)),
 980                                        node);
 981        else
 982                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 983                                                    __GFP_THISNODE, 0);
 984}
 985
 986/*
 987 * Migrate pages from one node to a target node.
 988 * Returns error or the number of pages not migrated.
 989 */
 990static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 991                           int flags)
 992{
 993        nodemask_t nmask;
 994        LIST_HEAD(pagelist);
 995        int err = 0;
 996
 997        nodes_clear(nmask);
 998        node_set(source, nmask);
 999
1000        /*
1001         * This does not "check" the range but isolates all pages that
1002         * need migration.  Between passing in the full user address
 1003         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1004         */
1005        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1006        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1007                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1008
1009        if (!list_empty(&pagelist)) {
1010                err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1011                                        MIGRATE_SYNC, MR_SYSCALL);
1012                if (err)
1013                        putback_movable_pages(&pagelist);
1014        }
1015
1016        return err;
1017}
1018
1019/*
1020 * Move pages between the two nodesets so as to preserve the physical
1021 * layout as much as possible.
1022 *
 1023 * Returns the number of pages that could not be moved.
1024 */
1025int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1026                     const nodemask_t *to, int flags)
1027{
1028        int busy = 0;
1029        int err;
1030        nodemask_t tmp;
1031
1032        err = migrate_prep();
1033        if (err)
1034                return err;
1035
1036        down_read(&mm->mmap_sem);
1037
1038        /*
1039         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1040         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1041         * bit in 'tmp', and return that <source, dest> pair for migration.
1042         * The pair of nodemasks 'to' and 'from' define the map.
1043         *
1044         * If no pair of bits is found that way, fallback to picking some
1045         * pair of 'source' and 'dest' bits that are not the same.  If the
1046         * 'source' and 'dest' bits are the same, this represents a node
1047         * that will be migrating to itself, so no pages need move.
1048         *
1049         * If no bits are left in 'tmp', or if all remaining bits left
1050         * in 'tmp' correspond to the same bit in 'to', return false
1051         * (nothing left to migrate).
1052         *
1053         * This lets us pick a pair of nodes to migrate between, such that
1054         * if possible the dest node is not already occupied by some other
1055         * source node, minimizing the risk of overloading the memory on a
1056         * node that would happen if we migrated incoming memory to a node
 1057         * before migrating outgoing memory sourced from that same node.
1058         *
1059         * A single scan of tmp is sufficient.  As we go, we remember the
1060         * most recent <s, d> pair that moved (s != d).  If we find a pair
1061         * that not only moved, but what's better, moved to an empty slot
1062         * (d is not set in tmp), then we break out then, with that pair.
1063         * Otherwise when we finish scanning from_tmp, we at least have the
1064         * most recent <s, d> pair that moved.  If we get all the way through
1065         * the scan of tmp without finding any node that moved, much less
1066         * moved to an empty node, then there is nothing left worth migrating.
1067         */
1068
1069        tmp = *from;
1070        while (!nodes_empty(tmp)) {
1071                int s,d;
1072                int source = NUMA_NO_NODE;
1073                int dest = 0;
1074
1075                for_each_node_mask(s, tmp) {
1076
1077                        /*
1078                         * do_migrate_pages() tries to maintain the relative
1079                         * node relationship of the pages established between
1080                         * threads and memory areas.
1081                         *
1082                         * However if the number of source nodes is not equal to
1083                         * the number of destination nodes we can not preserve
1084                         * this node relative relationship.  In that case, skip
1085                         * copying memory from a node that is in the destination
1086                         * mask.
1087                         *
1088                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1089                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1090                         */
1091
1092                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1093                                                (node_isset(s, *to)))
1094                                continue;
1095
1096                        d = node_remap(s, *from, *to);
1097                        if (s == d)
1098                                continue;
1099
1100                        source = s;     /* Node moved. Memorize */
1101                        dest = d;
1102
1103                        /* dest not in remaining from nodes? */
1104                        if (!node_isset(dest, tmp))
1105                                break;
1106                }
1107                if (source == NUMA_NO_NODE)
1108                        break;
1109
1110                node_clear(source, tmp);
1111                err = migrate_to_node(mm, source, dest, flags);
1112                if (err > 0)
1113                        busy += err;
1114                if (err < 0)
1115                        break;
1116        }
1117        up_read(&mm->mmap_sem);
1118        if (err < 0)
1119                return err;
1120        return busy;
1121
1122}
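
/*
 * Editorial example (a sketch, not from the original source): this is the
 * kernel side of migrate_pages(2).  Userspace might move another task's
 * pages from node 0 to node 1 through the <numaif.h> wrapper; the pid and
 * node numbers are illustrative assumptions.
 *
 *      #include <numaif.h>
 *
 *      unsigned long from = 1UL << 0;  // source: node 0
 *      unsigned long to   = 1UL << 1;  // destination: node 1
 *
 *      // returns the number of pages that could not be moved, -1 on error
 *      long left = migrate_pages(pid, 64, &from, &to);
 */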
1123
1124/*
1125 * Allocate a new page for page migration based on vma policy.
1126 * Start by assuming the page is mapped by the same vma as contains @start.
1127 * Search forward from there, if not.  N.B., this assumes that the
1128 * list of pages handed to migrate_pages()--which is how we get here--
1129 * is in virtual address order.
1130 */
1131static struct page *new_page(struct page *page, unsigned long start, int **x)
1132{
1133        struct vm_area_struct *vma;
1134        unsigned long uninitialized_var(address);
1135
1136        vma = find_vma(current->mm, start);
1137        while (vma) {
1138                address = page_address_in_vma(page, vma);
1139                if (address != -EFAULT)
1140                        break;
1141                vma = vma->vm_next;
1142        }
1143
1144        if (PageHuge(page)) {
1145                BUG_ON(!vma);
1146                return alloc_huge_page_noerr(vma, address, 1);
1147        }
1148        /*
1149         * if !vma, alloc_page_vma() will use task or system default policy
1150         */
1151        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1152}
1153#else
1154
1155static void migrate_page_add(struct page *page, struct list_head *pagelist,
1156                                unsigned long flags)
1157{
1158}
1159
1160int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1161                     const nodemask_t *to, int flags)
1162{
1163        return -ENOSYS;
1164}
1165
1166static struct page *new_page(struct page *page, unsigned long start, int **x)
1167{
1168        return NULL;
1169}
1170#endif
1171
1172static long do_mbind(unsigned long start, unsigned long len,
1173                     unsigned short mode, unsigned short mode_flags,
1174                     nodemask_t *nmask, unsigned long flags)
1175{
1176        struct mm_struct *mm = current->mm;
1177        struct mempolicy *new;
1178        unsigned long end;
1179        int err;
1180        LIST_HEAD(pagelist);
1181
1182        if (flags & ~(unsigned long)MPOL_MF_VALID)
1183                return -EINVAL;
1184        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1185                return -EPERM;
1186
1187        if (start & ~PAGE_MASK)
1188                return -EINVAL;
1189
1190        if (mode == MPOL_DEFAULT)
1191                flags &= ~MPOL_MF_STRICT;
1192
1193        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1194        end = start + len;
1195
1196        if (end < start)
1197                return -EINVAL;
1198        if (end == start)
1199                return 0;
1200
1201        new = mpol_new(mode, mode_flags, nmask);
1202        if (IS_ERR(new))
1203                return PTR_ERR(new);
1204
1205        if (flags & MPOL_MF_LAZY)
1206                new->flags |= MPOL_F_MOF;
1207
1208        /*
1209         * If we are using the default policy then operation
1210         * on discontinuous address spaces is okay after all
1211         */
1212        if (!new)
1213                flags |= MPOL_MF_DISCONTIG_OK;
1214
1215        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1216                 start, start + len, mode, mode_flags,
1217                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1218
1219        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1220
1221                err = migrate_prep();
1222                if (err)
1223                        goto mpol_out;
1224        }
1225        {
1226                NODEMASK_SCRATCH(scratch);
1227                if (scratch) {
1228                        down_write(&mm->mmap_sem);
1229                        task_lock(current);
1230                        err = mpol_set_nodemask(new, nmask, scratch);
1231                        task_unlock(current);
1232                        if (err)
1233                                up_write(&mm->mmap_sem);
1234                } else
1235                        err = -ENOMEM;
1236                NODEMASK_SCRATCH_FREE(scratch);
1237        }
1238        if (err)
1239                goto mpol_out;
1240
1241        err = queue_pages_range(mm, start, end, nmask,
1242                          flags | MPOL_MF_INVERT, &pagelist);
1243        if (!err)
1244                err = mbind_range(mm, start, end, new);
1245
1246        if (!err) {
1247                int nr_failed = 0;
1248
1249                if (!list_empty(&pagelist)) {
1250                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1251                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1252                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1253                        if (nr_failed)
1254                                putback_movable_pages(&pagelist);
1255                }
1256
1257                if (nr_failed && (flags & MPOL_MF_STRICT))
1258                        err = -EIO;
1259        } else
1260                putback_movable_pages(&pagelist);
1261
1262        up_write(&mm->mmap_sem);
1263 mpol_out:
1264        mpol_put(new);
1265        return err;
1266}
1267
1268/*
1269 * User space interface with variable sized bitmaps for nodelists.
1270 */
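
/*
 * Editorial example (illustrative): callers pass the nodemask as an array
 * of unsigned longs together with a bit count, so selecting nodes 1 and 3
 * could look like the fragment below; 64 is simply an assumed upper bound
 * on the number of node bits.
 *
 *      unsigned long mask = (1UL << 1) | (1UL << 3);
 *      set_mempolicy(MPOL_BIND, &mask, 64);
 *
 * get_nodes() below copies and validates exactly this representation.
 */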
1271
1272/* Copy a node mask from user space. */
1273static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1274                     unsigned long maxnode)
1275{
1276        unsigned long k;
1277        unsigned long nlongs;
1278        unsigned long endmask;
1279
1280        --maxnode;
1281        nodes_clear(*nodes);
1282        if (maxnode == 0 || !nmask)
1283                return 0;
1284        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1285                return -EINVAL;
1286
1287        nlongs = BITS_TO_LONGS(maxnode);
1288        if ((maxnode % BITS_PER_LONG) == 0)
1289                endmask = ~0UL;
1290        else
1291                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1292
 1293        /* When the user specified more nodes than supported, just check
 1294           that the unsupported part is all zero. */
1295        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1296                if (nlongs > PAGE_SIZE/sizeof(long))
1297                        return -EINVAL;
1298                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1299                        unsigned long t;
1300                        if (get_user(t, nmask + k))
1301                                return -EFAULT;
1302                        if (k == nlongs - 1) {
1303                                if (t & endmask)
1304                                        return -EINVAL;
1305                        } else if (t)
1306                                return -EINVAL;
1307                }
1308                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1309                endmask = ~0UL;
1310        }
1311
1312        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1313                return -EFAULT;
1314        nodes_addr(*nodes)[nlongs-1] &= endmask;
1315        return 0;
1316}
1317
1318/* Copy a kernel node mask to user space */
1319static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1320                              nodemask_t *nodes)
1321{
1322        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1323        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1324
1325        if (copy > nbytes) {
1326                if (copy > PAGE_SIZE)
1327                        return -EINVAL;
1328                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1329                        return -EFAULT;
1330                copy = nbytes;
1331        }
1332        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1333}
1334
1335SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1336                unsigned long, mode, const unsigned long __user *, nmask,
1337                unsigned long, maxnode, unsigned, flags)
1338{
1339        nodemask_t nodes;
1340        int err;
1341        unsigned short mode_flags;
1342
1343        mode_flags = mode & MPOL_MODE_FLAGS;
1344        mode &= ~MPOL_MODE_FLAGS;
1345        if (mode >= MPOL_MAX)
1346                return -EINVAL;
1347        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1348            (mode_flags & MPOL_F_RELATIVE_NODES))
1349                return -EINVAL;
1350        err = get_nodes(&nodes, nmask, maxnode);
1351        if (err)
1352                return err;
1353        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1354}
1355
1356/* Set the process memory policy */
1357SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1358                unsigned long, maxnode)
1359{
1360        int err;
1361        nodemask_t nodes;
1362        unsigned short flags;
1363
1364        flags = mode & MPOL_MODE_FLAGS;
1365        mode &= ~MPOL_MODE_FLAGS;
1366        if ((unsigned int)mode >= MPOL_MAX)
1367                return -EINVAL;
1368        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1369                return -EINVAL;
1370        err = get_nodes(&nodes, nmask, maxnode);
1371        if (err)
1372                return err;
1373        return do_set_mempolicy(mode, flags, &nodes);
1374}
1375
1376SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1377                const unsigned long __user *, old_nodes,
1378                const unsigned long __user *, new_nodes)
1379{
1380        const struct cred *cred = current_cred(), *tcred;
1381        struct mm_struct *mm = NULL;
1382        struct task_struct *task;
1383        nodemask_t task_nodes;
1384        int err;
1385        nodemask_t *old;
1386        nodemask_t *new;
1387        NODEMASK_SCRATCH(scratch);
1388
1389        if (!scratch)
1390                return -ENOMEM;
1391
1392        old = &scratch->mask1;
1393        new = &scratch->mask2;
1394
1395        err = get_nodes(old, old_nodes, maxnode);
1396        if (err)
1397                goto out;
1398
1399        err = get_nodes(new, new_nodes, maxnode);
1400        if (err)
1401                goto out;
1402
1403        /* Find the mm_struct */
1404        rcu_read_lock();
1405        task = pid ? find_task_by_vpid(pid) : current;
1406        if (!task) {
1407                rcu_read_unlock();
1408                err = -ESRCH;
1409                goto out;
1410        }
1411        get_task_struct(task);
1412
1413        err = -EINVAL;
1414
1415        /*
1416         * Check if this process has the right to modify the specified
1417         * process. The right exists if the process has administrative
1418         * capabilities, superuser privileges or the same
1419         * userid as the target process.
1420         */
1421        tcred = __task_cred(task);
1422        if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1423            !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1424            !capable(CAP_SYS_NICE)) {
1425                rcu_read_unlock();
1426                err = -EPERM;
1427                goto out_put;
1428        }
1429        rcu_read_unlock();
1430
1431        task_nodes = cpuset_mems_allowed(task);
1432        /* Is the user allowed to access the target nodes? */
1433        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1434                err = -EPERM;
1435                goto out_put;
1436        }
1437
1438        if (!nodes_subset(*new, node_states[N_MEMORY])) {
1439                err = -EINVAL;
1440                goto out_put;
1441        }
1442
1443        err = security_task_movememory(task);
1444        if (err)
1445                goto out_put;
1446
1447        mm = get_task_mm(task);
1448        put_task_struct(task);
1449
1450        if (!mm) {
1451                err = -EINVAL;
1452                goto out;
1453        }
1454
1455        err = do_migrate_pages(mm, old, new,
1456                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1457
1458        mmput(mm);
1459out:
1460        NODEMASK_SCRATCH_FREE(scratch);
1461
1462        return err;
1463
1464out_put:
1465        put_task_struct(task);
1466        goto out;
1467
1468}
1469
1470
1471/* Retrieve NUMA policy */
1472SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1473                unsigned long __user *, nmask, unsigned long, maxnode,
1474                unsigned long, addr, unsigned long, flags)
1475{
1476        int err;
1477        int uninitialized_var(pval);
1478        nodemask_t nodes;
1479
1480        if (nmask != NULL && maxnode < MAX_NUMNODES)
1481                return -EINVAL;
1482
1483        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1484
1485        if (err)
1486                return err;
1487
1488        if (policy && put_user(pval, policy))
1489                return -EFAULT;
1490
1491        if (nmask)
1492                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1493
1494        return err;
1495}
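
/*
 * Editorial example (a sketch): a common get_mempolicy(2) query asks which
 * node currently backs a given address, using MPOL_F_NODE | MPOL_F_ADDR and
 * no nodemask.  Here addr is assumed to point into a mapped, populated page.
 *
 *      #include <numaif.h>
 *
 *      int node = -1;
 *      get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *      // on success, node holds the NUMA node of the page backing addr
 */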
1496
1497#ifdef CONFIG_COMPAT
1498
1499COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1500                       compat_ulong_t __user *, nmask,
1501                       compat_ulong_t, maxnode,
1502                       compat_ulong_t, addr, compat_ulong_t, flags)
1503{
1504        long err;
1505        unsigned long __user *nm = NULL;
1506        unsigned long nr_bits, alloc_size;
1507        DECLARE_BITMAP(bm, MAX_NUMNODES);
1508
1509        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1510        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1511
1512        if (nmask)
1513                nm = compat_alloc_user_space(alloc_size);
1514
1515        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1516
1517        if (!err && nmask) {
1518                unsigned long copy_size;
1519                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1520                err = copy_from_user(bm, nm, copy_size);
1521                /* ensure entire bitmap is zeroed */
1522                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1523                err |= compat_put_bitmap(nmask, bm, nr_bits);
1524        }
1525
1526        return err;
1527}
1528
1529COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1530                       compat_ulong_t, maxnode)
1531{
1532        unsigned long __user *nm = NULL;
1533        unsigned long nr_bits, alloc_size;
1534        DECLARE_BITMAP(bm, MAX_NUMNODES);
1535
1536        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1537        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1538
1539        if (nmask) {
1540                if (compat_get_bitmap(bm, nmask, nr_bits))
1541                        return -EFAULT;
1542                nm = compat_alloc_user_space(alloc_size);
1543                if (copy_to_user(nm, bm, alloc_size))
1544                        return -EFAULT;
1545        }
1546
1547        return sys_set_mempolicy(mode, nm, nr_bits+1);
1548}
1549
1550COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1551                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1552                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1553{
1554        unsigned long __user *nm = NULL;
1555        unsigned long nr_bits, alloc_size;
1556        nodemask_t bm;
1557
1558        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1559        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1560
1561        if (nmask) {
1562                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1563                        return -EFAULT;
1564                nm = compat_alloc_user_space(alloc_size);
1565                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1566                        return -EFAULT;
1567        }
1568
1569        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1570}
1571
1572#endif
1573
1574struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1575                                                unsigned long addr)
1576{
1577        struct mempolicy *pol = NULL;
1578
1579        if (vma) {
1580                if (vma->vm_ops && vma->vm_ops->get_policy) {
1581                        pol = vma->vm_ops->get_policy(vma, addr);
1582                } else if (vma->vm_policy) {
1583                        pol = vma->vm_policy;
1584
1585                        /*
1586                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1587                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1588                         * count on these policies which will be dropped by
1589                         * mpol_cond_put() later
1590                         */
1591                        if (mpol_needs_cond_ref(pol))
1592                                mpol_get(pol);
1593                }
1594        }
1595
1596        return pol;
1597}
1598
1599/*
1600 * get_vma_policy(@vma, @addr)
1601 * @vma: virtual memory area whose policy is sought
1602 * @addr: address in @vma for shared policy lookup
1603 *
1604 * Returns effective policy for a VMA at specified address.
1605 * Falls back to current->mempolicy or system default policy, as necessary.
1606 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1607 * count--added by the get_policy() vm_op, as appropriate--to protect against
1608 * freeing by another task.  It is the caller's responsibility to free the
1609 * extra reference for shared policies.
1610 */
1611static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1612                                                unsigned long addr)
1613{
1614        struct mempolicy *pol = __get_vma_policy(vma, addr);
1615
1616        if (!pol)
1617                pol = get_task_policy(current);
1618
1619        return pol;
1620}
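/*
 * Hedged usage sketch, modelled on the callers later in this file: look the
 * policy up, consult it, then drop the conditional reference that
 * MPOL_F_SHARED policies carry.  The interleave branch is illustrative only.
 *
 *	struct mempolicy *pol = get_vma_policy(vma, addr);
 *
 *	if (pol->mode == MPOL_INTERLEAVE)
 *		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 *
 *	mpol_cond_put(pol);
 *
 * mpol_cond_put() only drops the reference for MPOL_F_SHARED policies, so
 * this pattern is cheap for task policies and plain VMA policies.
 */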
1621
1622bool vma_policy_mof(struct vm_area_struct *vma)
1623{
1624        struct mempolicy *pol;
1625
1626        if (vma->vm_ops && vma->vm_ops->get_policy) {
1627                bool ret = false;
1628
1629                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1630                if (pol && (pol->flags & MPOL_F_MOF))
1631                        ret = true;
1632                mpol_cond_put(pol);
1633
1634                return ret;
1635        }
1636
1637        pol = vma->vm_policy;
1638        if (!pol)
1639                pol = get_task_policy(current);
1640
1641        return pol->flags & MPOL_F_MOF;
1642}
1643
1644static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1645{
1646        enum zone_type dynamic_policy_zone = policy_zone;
1647
1648        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1649
1650        /*
1651         * If policy->v.nodes has movable memory only,
1652         * we apply policy only when gfp_zone(gfp) is ZONE_MOVABLE.
1653         *
1654         * policy->v.nodes is intersected with node_states[N_MEMORY],
1655         * so if the following test fails, it implies that
1656         * policy->v.nodes contains movable memory only.
1657         */
1658        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1659                dynamic_policy_zone = ZONE_MOVABLE;
1660
1661        return zone >= dynamic_policy_zone;
1662}
1663
1664/*
1665 * Return a nodemask representing a mempolicy for filtering nodes for
1666 * page allocation
1667 */
1668static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1669{
1670        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1671        if (unlikely(policy->mode == MPOL_BIND) &&
1672                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1673                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1674                return &policy->v.nodes;
1675
1676        return NULL;
1677}
1678
1679/* Return a zonelist indicated by gfp for node representing a mempolicy */
1680static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1681        int nd)
1682{
1683        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1684                nd = policy->v.preferred_node;
1685        else {
1686                /*
1687                 * __GFP_THISNODE shouldn't even be used with the bind policy
1688                 * because we might easily break the expectation to stay on the
1689                 * requested node and not break the policy.
1690                 */
1691                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1692        }
1693
1694        return node_zonelist(nd, gfp);
1695}
1696
1697/* Do dynamic interleaving for a process */
1698static unsigned interleave_nodes(struct mempolicy *policy)
1699{
1700        unsigned nid, next;
1701        struct task_struct *me = current;
1702
1703        nid = me->il_next;
1704        next = next_node_in(nid, policy->v.nodes);
1705        if (next < MAX_NUMNODES)
1706                me->il_next = next;
1707        return nid;
1708}
1709
1710/*
1711 * Depending on the memory policy, provide a node from which to allocate the
1712 * next slab entry.
1713 */
1714unsigned int mempolicy_slab_node(void)
1715{
1716        struct mempolicy *policy;
1717        int node = numa_mem_id();
1718
1719        if (in_interrupt())
1720                return node;
1721
1722        policy = current->mempolicy;
1723        if (!policy || policy->flags & MPOL_F_LOCAL)
1724                return node;
1725
1726        switch (policy->mode) {
1727        case MPOL_PREFERRED:
1728                /*
1729                 * handled MPOL_F_LOCAL above
1730                 */
1731                return policy->v.preferred_node;
1732
1733        case MPOL_INTERLEAVE:
1734                return interleave_nodes(policy);
1735
1736        case MPOL_BIND: {
1737                struct zoneref *z;
1738
1739                /*
1740                 * Follow bind policy behavior and start allocation at the
1741                 * first node.
1742                 */
1743                struct zonelist *zonelist;
1744                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1745                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1746                z = first_zones_zonelist(zonelist, highest_zoneidx,
1747                                                        &policy->v.nodes);
1748                return z->zone ? z->zone->node : node;
1749        }
1750
1751        default:
1752                BUG();
1753        }
1754}
1755
1756/*
1757 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1758 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1759 * number of present nodes.
1760 */
1761static unsigned offset_il_node(struct mempolicy *pol,
1762                               struct vm_area_struct *vma, unsigned long n)
1763{
1764        unsigned nnodes = nodes_weight(pol->v.nodes);
1765        unsigned target;
1766        int i;
1767        int nid;
1768
1769        if (!nnodes)
1770                return numa_node_id();
1771        target = (unsigned int)n % nnodes;
1772        nid = first_node(pol->v.nodes);
1773        for (i = 0; i < target; i++)
1774                nid = next_node(nid, pol->v.nodes);
1775        return nid;
1776}
1777
1778/* Determine a node number for interleave */
1779static inline unsigned interleave_nid(struct mempolicy *pol,
1780                 struct vm_area_struct *vma, unsigned long addr, int shift)
1781{
1782        if (vma) {
1783                unsigned long off;
1784
1785                /*
1786                 * for small pages, there is no difference between
1787                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1788                 * for huge pages, since vm_pgoff is in units of small
1789                 * pages, we need to shift off the always 0 bits to get
1790                 * a useful offset.
1791                 */
1792                BUG_ON(shift < PAGE_SHIFT);
1793                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1794                off += (addr - vma->vm_start) >> shift;
1795                return offset_il_node(pol, vma, off);
1796        } else
1797                return interleave_nodes(pol);
1798}
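/*
 * Worked example (editor's sketch) of the interleave math above: with
 * pol->v.nodes = {0,2,3} (nnodes == 3) and a computed offset of n = 7,
 * target = 7 % 3 = 1, so offset_il_node() walks one step past the first
 * node and returns node 2.  For a 2MB huge page on a 4KB-page system
 * (shift == 21, PAGE_SHIFT == 12), interleave_nid() computes
 * off = (vm_pgoff >> 9) + ((addr - vma->vm_start) >> 21) before doing the
 * same modulo walk; the 2MB/4KB sizes are assumptions for illustration.
 */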
1799
1800#ifdef CONFIG_HUGETLBFS
1801/*
1802 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1803 * @vma: virtual memory area whose policy is sought
1804 * @addr: address in @vma for shared policy lookup and interleave policy
1805 * @gfp_flags: for requested zone
1806 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1807 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1808 *
1809 * Returns a zonelist suitable for a huge page allocation and a pointer
1810 * to the struct mempolicy for conditional unref after allocation.
1811 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1812 * @nodemask for filtering the zonelist.
1813 *
1814 * Must be protected by read_mems_allowed_begin()
1815 */
1816struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1817                                gfp_t gfp_flags, struct mempolicy **mpol,
1818                                nodemask_t **nodemask)
1819{
1820        struct zonelist *zl;
1821
1822        *mpol = get_vma_policy(vma, addr);
1823        *nodemask = NULL;       /* assume !MPOL_BIND */
1824
1825        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1826                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1827                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1828        } else {
1829                zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1830                if ((*mpol)->mode == MPOL_BIND)
1831                        *nodemask = &(*mpol)->v.nodes;
1832        }
1833        return zl;
1834}
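/*
 * Hedged sketch of the expected calling pattern, modelled on the hugetlb
 * allocation path; the loop body and error handling are simplified
 * assumptions, not a copy of the real caller:
 *
 *	retry:
 *		cookie = read_mems_allowed_begin();
 *		zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
 *		for_each_zone_zonelist_nodemask(zone, z, zl, gfp_zone(gfp),
 *						nodemask) {
 *			... try to dequeue a huge page from this zone ...
 *		}
 *		mpol_cond_put(mpol);
 *		if (!page && read_mems_allowed_retry(cookie))
 *			goto retry;
 */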
1835
1836/*
1837 * init_nodemask_of_mempolicy
1838 *
1839 * If the current task's mempolicy is "default" [NULL], return 'false'
1840 * to indicate default policy.  Otherwise, extract the policy nodemask
1841 * for 'bind' or 'interleave' policy into the argument nodemask, or
1842 * initialize the argument nodemask to contain the single node for
1843 * 'preferred' or 'local' policy and return 'true' to indicate presence
1844 * of non-default mempolicy.
1845 *
1846 * We don't bother with reference counting the mempolicy [mpol_get/put]
1847 * because the current task is examining its own mempolicy and a task's
1848 * mempolicy is only ever changed by the task itself.
1849 *
1850 * N.B., it is the caller's responsibility to free a returned nodemask.
1851 */
1852bool init_nodemask_of_mempolicy(nodemask_t *mask)
1853{
1854        struct mempolicy *mempolicy;
1855        int nid;
1856
1857        if (!(mask && current->mempolicy))
1858                return false;
1859
1860        task_lock(current);
1861        mempolicy = current->mempolicy;
1862        switch (mempolicy->mode) {
1863        case MPOL_PREFERRED:
1864                if (mempolicy->flags & MPOL_F_LOCAL)
1865                        nid = numa_node_id();
1866                else
1867                        nid = mempolicy->v.preferred_node;
1868                init_nodemask_of_node(mask, nid);
1869                break;
1870
1871        case MPOL_BIND:
1872                /* Fall through */
1873        case MPOL_INTERLEAVE:
1874                *mask = mempolicy->v.nodes;
1875                break;
1876
1877        default:
1878                BUG();
1879        }
1880        task_unlock(current);
1881
1882        return true;
1883}
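/*
 * Hedged usage sketch: a caller that wants "the nodes implied by the current
 * task's mempolicy, or all memory nodes if the policy is default" might do
 * something like the following.  NODEMASK_ALLOC/NODEMASK_FREE come from
 * <linux/nodemask.h>; the surrounding error handling is an assumption.
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *
 *	if (!nodes_allowed || !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 *	... use *nodes_allowed ...
 *	if (nodes_allowed != &node_states[N_MEMORY])
 *		NODEMASK_FREE(nodes_allowed);
 */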
1884#endif
1885
1886/*
1887 * mempolicy_nodemask_intersects
1888 *
1889 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1890 * policy.  Otherwise, check for intersection between mask and the policy
1891 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1892 * policy, always return true since it may allocate elsewhere on fallback.
1893 *
1894 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1895 */
1896bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1897                                        const nodemask_t *mask)
1898{
1899        struct mempolicy *mempolicy;
1900        bool ret = true;
1901
1902        if (!mask)
1903                return ret;
1904        task_lock(tsk);
1905        mempolicy = tsk->mempolicy;
1906        if (!mempolicy)
1907                goto out;
1908
1909        switch (mempolicy->mode) {
1910        case MPOL_PREFERRED:
1911                /*
1912                 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1913                 * allocate from; they may fall back to other nodes when OOM.
1914                 * Thus, it's possible for tsk to have allocated memory from
1915                 * nodes in mask.
1916                 */
1917                break;
1918        case MPOL_BIND:
1919        case MPOL_INTERLEAVE:
1920                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1921                break;
1922        default:
1923                BUG();
1924        }
1925out:
1926        task_unlock(tsk);
1927        return ret;
1928}
1929
1930/* Allocate a page in interleaved policy.
1931   Own path because it needs to do special accounting. */
1932static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1933                                        unsigned nid)
1934{
1935        struct zonelist *zl;
1936        struct page *page;
1937
1938        zl = node_zonelist(nid, gfp);
1939        page = __alloc_pages(gfp, order, zl);
1940        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1941                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1942        return page;
1943}
1944
1945/**
1946 *      alloc_pages_vma - Allocate a page for a VMA.
1947 *
1948 *      @gfp:
1949 *      %GFP_USER    user allocation.
1950 *      %GFP_KERNEL  kernel allocations,
1951 *      %GFP_HIGHMEM highmem/user allocations,
1952 *      %GFP_FS      allocation should not call back into a file system.
1953 *      %GFP_ATOMIC  don't sleep.
1954 *
1955 *      @order:Order of the GFP allocation.
1956 *      @vma:  Pointer to VMA or NULL if not available.
1957 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1958 *      @node: Which node to prefer for allocation (modulo policy).
1959 *      @hugepage: for hugepages try only the preferred node if possible
1960 *
1961 *      This function allocates a page from the kernel page pool and applies
1962 *      a NUMA policy associated with the VMA or the current process.
1963 *      When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1964 *      mm_struct of the VMA to prevent it from going away. Should be used for
1965 *      all allocations for pages that will be mapped into user space. Returns
1966 *      NULL when no page can be allocated.
1967 */
1968struct page *
1969alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1970                unsigned long addr, int node, bool hugepage)
1971{
1972        struct mempolicy *pol;
1973        struct page *page;
1974        unsigned int cpuset_mems_cookie;
1975        struct zonelist *zl;
1976        nodemask_t *nmask;
1977
1978retry_cpuset:
1979        pol = get_vma_policy(vma, addr);
1980        cpuset_mems_cookie = read_mems_allowed_begin();
1981
1982        if (pol->mode == MPOL_INTERLEAVE) {
1983                unsigned nid;
1984
1985                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1986                mpol_cond_put(pol);
1987                page = alloc_page_interleave(gfp, order, nid);
1988                goto out;
1989        }
1990
1991        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1992                int hpage_node = node;
1993
1994                /*
1995                 * For hugepage allocation and non-interleave policy which
1996                 * allows the current node (or other explicitly preferred
1997                 * node) we only try to allocate from the current/preferred
1998                 * node and don't fall back to other nodes, as the cost of
1999                 * remote accesses would likely offset THP benefits.
2000                 *
2001                 * If the policy is interleave, or does not allow the current
2002                 * node in its nodemask, we allocate the standard way.
2003                 */
2004                if (pol->mode == MPOL_PREFERRED &&
2005                                                !(pol->flags & MPOL_F_LOCAL))
2006                        hpage_node = pol->v.preferred_node;
2007
2008                nmask = policy_nodemask(gfp, pol);
2009                if (!nmask || node_isset(hpage_node, *nmask)) {
2010                        mpol_cond_put(pol);
2011                        page = __alloc_pages_node(hpage_node,
2012                                                gfp | __GFP_THISNODE, order);
2013                        goto out;
2014                }
2015        }
2016
2017        nmask = policy_nodemask(gfp, pol);
2018        zl = policy_zonelist(gfp, pol, node);
2019        page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2020        mpol_cond_put(pol);
2021out:
2022        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2023                goto retry_cpuset;
2024        return page;
2025}
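/*
 * Hedged usage sketch: an anonymous-fault style caller (compare the
 * alloc_page_vma() wrapper in <linux/gfp.h>) holds mmap_sem for read and
 * allocates a single movable page at the faulting address:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			       numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */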
2026
2027/**
2028 *      alloc_pages_current - Allocate pages.
2029 *
2030 *      @gfp:
2031 *              %GFP_USER   user allocation,
2032 *              %GFP_KERNEL kernel allocation,
2033 *              %GFP_HIGHMEM highmem allocation,
2034 *              %GFP_FS     don't call back into a file system.
2035 *              %GFP_ATOMIC don't sleep.
2036 *      @order: Power of two of allocation size in pages. 0 is a single page.
2037 *
2038 *      Allocate a page from the kernel page pool.  When not in
2039 *      interrupt context, apply the current process' NUMA policy.
2040 *      Returns NULL when no page can be allocated.
2041 *
2042 *      Don't call cpuset_update_task_memory_state() unless
2043 *      1) it's ok to take cpuset_sem (can WAIT), and
2044 *      2) allocating for current task (not interrupt).
2045 */
2046struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2047{
2048        struct mempolicy *pol = &default_policy;
2049        struct page *page;
2050        unsigned int cpuset_mems_cookie;
2051
2052        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2053                pol = get_task_policy(current);
2054
2055retry_cpuset:
2056        cpuset_mems_cookie = read_mems_allowed_begin();
2057
2058        /*
2059         * No reference counting needed for current->mempolicy
2060         * nor system default_policy
2061         */
2062        if (pol->mode == MPOL_INTERLEAVE)
2063                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2064        else
2065                page = __alloc_pages_nodemask(gfp, order,
2066                                policy_zonelist(gfp, pol, numa_node_id()),
2067                                policy_nodemask(gfp, pol));
2068
2069        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2070                goto retry_cpuset;
2071
2072        return page;
2073}
2074EXPORT_SYMBOL(alloc_pages_current);
2075
2076int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2077{
2078        struct mempolicy *pol = mpol_dup(vma_policy(src));
2079
2080        if (IS_ERR(pol))
2081                return PTR_ERR(pol);
2082        dst->vm_policy = pol;
2083        return 0;
2084}
2085
2086/*
2087 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2088 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2089 * with the mems_allowed returned by cpuset_mems_allowed().  This
2090 * keeps mempolicies cpuset relative after its cpuset moves.  See
2091 * further kernel/cpuset.c update_nodemask().
2092 *
2093 * current's mempolicy may be rebound by another task (the task that changes
2094 * the cpuset's mems), so we needn't do rebind work for the current task.
2095 */
2096
2097/* Slow path of a mempolicy duplicate */
2098struct mempolicy *__mpol_dup(struct mempolicy *old)
2099{
2100        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2101
2102        if (!new)
2103                return ERR_PTR(-ENOMEM);
2104
2105        /* task's mempolicy is protected by alloc_lock */
2106        if (old == current->mempolicy) {
2107                task_lock(current);
2108                *new = *old;
2109                task_unlock(current);
2110        } else
2111                *new = *old;
2112
2113        if (current_cpuset_is_being_rebound()) {
2114                nodemask_t mems = cpuset_mems_allowed(current);
2115                if (new->flags & MPOL_F_REBINDING)
2116                        mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2117                else
2118                        mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2119        }
2120        atomic_set(&new->refcnt, 1);
2121        return new;
2122}
2123
2124/* Slow path of a mempolicy comparison */
2125bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2126{
2127        if (!a || !b)
2128                return false;
2129        if (a->mode != b->mode)
2130                return false;
2131        if (a->flags != b->flags)
2132                return false;
2133        if (mpol_store_user_nodemask(a))
2134                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2135                        return false;
2136
2137        switch (a->mode) {
2138        case MPOL_BIND:
2139                /* Fall through */
2140        case MPOL_INTERLEAVE:
2141                return !!nodes_equal(a->v.nodes, b->v.nodes);
2142        case MPOL_PREFERRED:
2143                return a->v.preferred_node == b->v.preferred_node;
2144        default:
2145                BUG();
2146                return false;
2147        }
2148}
2149
2150/*
2151 * Shared memory backing store policy support.
2152 *
2153 * Remember policies even when nobody has shared memory mapped.
2154 * The policies are kept in a red-black tree linked from the inode.
2155 * They are protected by the sp->lock rwlock, which should be held
2156 * for any accesses to the tree.
2157 */
2158
2159/*
2160 * lookup first element intersecting start-end.  Caller holds sp->lock for
2161 * reading or for writing
2162 */
2163static struct sp_node *
2164sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2165{
2166        struct rb_node *n = sp->root.rb_node;
2167
2168        while (n) {
2169                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2170
2171                if (start >= p->end)
2172                        n = n->rb_right;
2173                else if (end <= p->start)
2174                        n = n->rb_left;
2175                else
2176                        break;
2177        }
2178        if (!n)
2179                return NULL;
2180        for (;;) {
2181                struct sp_node *w = NULL;
2182                struct rb_node *prev = rb_prev(n);
2183                if (!prev)
2184                        break;
2185                w = rb_entry(prev, struct sp_node, nd);
2186                if (w->end <= start)
2187                        break;
2188                n = prev;
2189        }
2190        return rb_entry(n, struct sp_node, nd);
2191}
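/*
 * Worked example (editor's sketch): if the tree holds the ranges [2,5) and
 * [8,12), then sp_lookup(sp, 4, 9) first descends to a node intersecting
 * [4,9) and the backward rb_prev() walk then returns the leftmost such
 * node, here [2,5).  sp_lookup(sp, 5, 8) intersects nothing and returns
 * NULL.
 */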
2192
2193/*
2194 * Insert a new shared policy into the list.  Caller holds sp->lock for
2195 * writing.
2196 */
2197static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2198{
2199        struct rb_node **p = &sp->root.rb_node;
2200        struct rb_node *parent = NULL;
2201        struct sp_node *nd;
2202
2203        while (*p) {
2204                parent = *p;
2205                nd = rb_entry(parent, struct sp_node, nd);
2206                if (new->start < nd->start)
2207                        p = &(*p)->rb_left;
2208                else if (new->end > nd->end)
2209                        p = &(*p)->rb_right;
2210                else
2211                        BUG();
2212        }
2213        rb_link_node(&new->nd, parent, p);
2214        rb_insert_color(&new->nd, &sp->root);
2215        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2216                 new->policy ? new->policy->mode : 0);
2217}
2218
2219/* Find shared policy intersecting idx */
2220struct mempolicy *
2221mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2222{
2223        struct mempolicy *pol = NULL;
2224        struct sp_node *sn;
2225
2226        if (!sp->root.rb_node)
2227                return NULL;
2228        read_lock(&sp->lock);
2229        sn = sp_lookup(sp, idx, idx+1);
2230        if (sn) {
2231                mpol_get(sn->policy);
2232                pol = sn->policy;
2233        }
2234        read_unlock(&sp->lock);
2235        return pol;
2236}
2237
2238static void sp_free(struct sp_node *n)
2239{
2240        mpol_put(n->policy);
2241        kmem_cache_free(sn_cache, n);
2242}
2243
2244/**
2245 * mpol_misplaced - check whether current page node is valid in policy
2246 *
2247 * @page: page to be checked
2248 * @vma: vm area where page mapped
2249 * @addr: virtual address where page mapped
2250 *
2251 * Lookup current policy node id for vma,addr and "compare to" page's
2252 * node id.
2253 *
2254 * Returns:
2255 *      -1      - not misplaced, page is in the right node
2256 *      node    - node id where the page should be
2257 *
2258 * Policy determination "mimics" alloc_page_vma().
2259 * Called from fault path where we know the vma and faulting address.
2260 */
2261int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2262{
2263        struct mempolicy *pol;
2264        struct zoneref *z;
2265        int curnid = page_to_nid(page);
2266        unsigned long pgoff;
2267        int thiscpu = raw_smp_processor_id();
2268        int thisnid = cpu_to_node(thiscpu);
2269        int polnid = -1;
2270        int ret = -1;
2271
2272        BUG_ON(!vma);
2273
2274        pol = get_vma_policy(vma, addr);
2275        if (!(pol->flags & MPOL_F_MOF))
2276                goto out;
2277
2278        switch (pol->mode) {
2279        case MPOL_INTERLEAVE:
2280                BUG_ON(addr >= vma->vm_end);
2281                BUG_ON(addr < vma->vm_start);
2282
2283                pgoff = vma->vm_pgoff;
2284                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2285                polnid = offset_il_node(pol, vma, pgoff);
2286                break;
2287
2288        case MPOL_PREFERRED:
2289                if (pol->flags & MPOL_F_LOCAL)
2290                        polnid = numa_node_id();
2291                else
2292                        polnid = pol->v.preferred_node;
2293                break;
2294
2295        case MPOL_BIND:
2296
2297                /*
2298                 * allows binding to multiple nodes.
2299                 * use current page if in policy nodemask,
2300                 * else select nearest allowed node, if any.
2301                 * If no allowed nodes, use current [!misplaced].
2302                 */
2303                if (node_isset(curnid, pol->v.nodes))
2304                        goto out;
2305                z = first_zones_zonelist(
2306                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2307                                gfp_zone(GFP_HIGHUSER),
2308                                &pol->v.nodes);
2309                polnid = z->zone->node;
2310                break;
2311
2312        default:
2313                BUG();
2314        }
2315
2316        /* Migrate the page towards the node whose CPU is referencing it */
2317        if (pol->flags & MPOL_F_MORON) {
2318                polnid = thisnid;
2319
2320                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2321                        goto out;
2322        }
2323
2324        if (curnid != polnid)
2325                ret = polnid;
2326out:
2327        mpol_cond_put(pol);
2328
2329        return ret;
2330}
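/*
 * Hedged sketch of how a NUMA hinting fault path consumes this result; the
 * migrate_misplaced_page() plumbing is modelled on mm/memory.c and should
 * be treated as an assumption here:
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1) {
 *		put_page(page);
 *	} else {
 *		migrated = migrate_misplaced_page(page, vma, target_nid);
 *	}
 */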
2331
2332/*
2333 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2334 * dropped after task->mempolicy is set to NULL so that any allocation done as
2335 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2336 * policy.
2337 */
2338void mpol_put_task_policy(struct task_struct *task)
2339{
2340        struct mempolicy *pol;
2341
2342        task_lock(task);
2343        pol = task->mempolicy;
2344        task->mempolicy = NULL;
2345        task_unlock(task);
2346        mpol_put(pol);
2347}
2348
2349static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2350{
2351        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2352        rb_erase(&n->nd, &sp->root);
2353        sp_free(n);
2354}
2355
2356static void sp_node_init(struct sp_node *node, unsigned long start,
2357                        unsigned long end, struct mempolicy *pol)
2358{
2359        node->start = start;
2360        node->end = end;
2361        node->policy = pol;
2362}
2363
2364static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2365                                struct mempolicy *pol)
2366{
2367        struct sp_node *n;
2368        struct mempolicy *newpol;
2369
2370        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2371        if (!n)
2372                return NULL;
2373
2374        newpol = mpol_dup(pol);
2375        if (IS_ERR(newpol)) {
2376                kmem_cache_free(sn_cache, n);
2377                return NULL;
2378        }
2379        newpol->flags |= MPOL_F_SHARED;
2380        sp_node_init(n, start, end, newpol);
2381
2382        return n;
2383}
2384
2385/* Replace a policy range. */
2386static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2387                                 unsigned long end, struct sp_node *new)
2388{
2389        struct sp_node *n;
2390        struct sp_node *n_new = NULL;
2391        struct mempolicy *mpol_new = NULL;
2392        int ret = 0;
2393
2394restart:
2395        write_lock(&sp->lock);
2396        n = sp_lookup(sp, start, end);
2397        /* Take care of old policies in the same range. */
2398        while (n && n->start < end) {
2399                struct rb_node *next = rb_next(&n->nd);
2400                if (n->start >= start) {
2401                        if (n->end <= end)
2402                                sp_delete(sp, n);
2403                        else
2404                                n->start = end;
2405                } else {
2406                        /* Old policy spanning whole new range. */
2407                        if (n->end > end) {
2408                                if (!n_new)
2409                                        goto alloc_new;
2410
2411                                *mpol_new = *n->policy;
2412                                atomic_set(&mpol_new->refcnt, 1);
2413                                sp_node_init(n_new, end, n->end, mpol_new);
2414                                n->end = start;
2415                                sp_insert(sp, n_new);
2416                                n_new = NULL;
2417                                mpol_new = NULL;
2418                                break;
2419                        } else
2420                                n->end = start;
2421                }
2422                if (!next)
2423                        break;
2424                n = rb_entry(next, struct sp_node, nd);
2425        }
2426        if (new)
2427                sp_insert(sp, new);
2428        write_unlock(&sp->lock);
2429        ret = 0;
2430
2431err_out:
2432        if (mpol_new)
2433                mpol_put(mpol_new);
2434        if (n_new)
2435                kmem_cache_free(sn_cache, n_new);
2436
2437        return ret;
2438
2439alloc_new:
2440        write_unlock(&sp->lock);
2441        ret = -ENOMEM;
2442        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2443        if (!n_new)
2444                goto err_out;
2445        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2446        if (!mpol_new)
2447                goto err_out;
2448        goto restart;
2449}
2450
2451/**
2452 * mpol_shared_policy_init - initialize shared policy for inode
2453 * @sp: pointer to inode shared policy
2454 * @mpol:  struct mempolicy to install
2455 *
2456 * Install non-NULL @mpol in inode's shared policy rb-tree.
2457 * On entry, the current task has a reference on a non-NULL @mpol.
2458 * This must be released on exit.
2459 * This is called at get_inode() time and we can use GFP_KERNEL.
2460 */
2461void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2462{
2463        int ret;
2464
2465        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2466        rwlock_init(&sp->lock);
2467
2468        if (mpol) {
2469                struct vm_area_struct pvma;
2470                struct mempolicy *new;
2471                NODEMASK_SCRATCH(scratch);
2472
2473                if (!scratch)
2474                        goto put_mpol;
2475                /* contextualize the tmpfs mount point mempolicy */
2476                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2477                if (IS_ERR(new))
2478                        goto free_scratch; /* no valid nodemask intersection */
2479
2480                task_lock(current);
2481                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2482                task_unlock(current);
2483                if (ret)
2484                        goto put_new;
2485
2486                /* Create pseudo-vma that contains just the policy */
2487                memset(&pvma, 0, sizeof(struct vm_area_struct));
2488                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2489                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2490
2491put_new:
2492                mpol_put(new);                  /* drop initial ref */
2493free_scratch:
2494                NODEMASK_SCRATCH_FREE(scratch);
2495put_mpol:
2496                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2497        }
2498}
2499
2500int mpol_set_shared_policy(struct shared_policy *info,
2501                        struct vm_area_struct *vma, struct mempolicy *npol)
2502{
2503        int err;
2504        struct sp_node *new = NULL;
2505        unsigned long sz = vma_pages(vma);
2506
2507        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2508                 vma->vm_pgoff,
2509                 sz, npol ? npol->mode : -1,
2510                 npol ? npol->flags : -1,
2511                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2512
2513        if (npol) {
2514                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2515                if (!new)
2516                        return -ENOMEM;
2517        }
2518        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2519        if (err && new)
2520                sp_free(new);
2521        return err;
2522}
2523
2524/* Free a backing policy store on inode delete. */
2525void mpol_free_shared_policy(struct shared_policy *p)
2526{
2527        struct sp_node *n;
2528        struct rb_node *next;
2529
2530        if (!p->root.rb_node)
2531                return;
2532        write_lock(&p->lock);
2533        next = rb_first(&p->root);
2534        while (next) {
2535                n = rb_entry(next, struct sp_node, nd);
2536                next = rb_next(&n->nd);
2537                sp_delete(p, n);
2538        }
2539        write_unlock(&p->lock);
2540}
2541
2542#ifdef CONFIG_NUMA_BALANCING
2543static int __initdata numabalancing_override;
2544
2545static void __init check_numabalancing_enable(void)
2546{
2547        bool numabalancing_default = false;
2548
2549        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2550                numabalancing_default = true;
2551
2552        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2553        if (numabalancing_override)
2554                set_numabalancing_state(numabalancing_override == 1);
2555
2556        if (num_online_nodes() > 1 && !numabalancing_override) {
2557                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2558                        numabalancing_default ? "Enabling" : "Disabling");
2559                set_numabalancing_state(numabalancing_default);
2560        }
2561}
2562
2563static int __init setup_numabalancing(char *str)
2564{
2565        int ret = 0;
2566        if (!str)
2567                goto out;
2568
2569        if (!strcmp(str, "enable")) {
2570                numabalancing_override = 1;
2571                ret = 1;
2572        } else if (!strcmp(str, "disable")) {
2573                numabalancing_override = -1;
2574                ret = 1;
2575        }
2576out:
2577        if (!ret)
2578                pr_warn("Unable to parse numa_balancing=\n");
2579
2580        return ret;
2581}
2582__setup("numa_balancing=", setup_numabalancing);
2583#else
2584static inline void __init check_numabalancing_enable(void)
2585{
2586}
2587#endif /* CONFIG_NUMA_BALANCING */
2588
2589/* assumes fs == KERNEL_DS */
2590void __init numa_policy_init(void)
2591{
2592        nodemask_t interleave_nodes;
2593        unsigned long largest = 0;
2594        int nid, prefer = 0;
2595
2596        policy_cache = kmem_cache_create("numa_policy",
2597                                         sizeof(struct mempolicy),
2598                                         0, SLAB_PANIC, NULL);
2599
2600        sn_cache = kmem_cache_create("shared_policy_node",
2601                                     sizeof(struct sp_node),
2602                                     0, SLAB_PANIC, NULL);
2603
2604        for_each_node(nid) {
2605                preferred_node_policy[nid] = (struct mempolicy) {
2606                        .refcnt = ATOMIC_INIT(1),
2607                        .mode = MPOL_PREFERRED,
2608                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2609                        .v = { .preferred_node = nid, },
2610                };
2611        }
2612
2613        /*
2614         * Set interleaving policy for system init. Interleaving is only
2615         * enabled across suitably sized nodes (default is >= 16MB);
2616         * otherwise fall back to the largest node if they're all smaller.
2617         */
2618        nodes_clear(interleave_nodes);
2619        for_each_node_state(nid, N_MEMORY) {
2620                unsigned long total_pages = node_present_pages(nid);
2621
2622                /* Preserve the largest node */
2623                if (largest < total_pages) {
2624                        largest = total_pages;
2625                        prefer = nid;
2626                }
2627
2628                /* Interleave this node? */
2629                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2630                        node_set(nid, interleave_nodes);
2631        }
2632
2633        /* All too small, use the largest */
2634        if (unlikely(nodes_empty(interleave_nodes)))
2635                node_set(prefer, interleave_nodes);
2636
2637        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2638                pr_err("%s: interleaving failed\n", __func__);
2639
2640        check_numabalancing_enable();
2641}
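/*
 * Worked example (editor's sketch) of the interleave threshold used in
 * numa_policy_init() above, assuming 4KB pages: a node needs
 * (16 << 20) / 4096 = 4096 present pages (16MB) to be included, so a 32MB
 * node (8192 pages) joins the interleave set while an 8MB node (2048 pages)
 * is skipped.  If every node is below the threshold, only the single
 * largest node ends up in the set.
 */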
2642
2643/* Reset policy of current process to default */
2644void numa_default_policy(void)
2645{
2646        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2647}
2648
2649/*
2650 * Parse and format mempolicy from/to strings
2651 */
2652
2653/*
2654 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2655 */
2656static const char * const policy_modes[] =
2657{
2658        [MPOL_DEFAULT]    = "default",
2659        [MPOL_PREFERRED]  = "prefer",
2660        [MPOL_BIND]       = "bind",
2661        [MPOL_INTERLEAVE] = "interleave",
2662        [MPOL_LOCAL]      = "local",
2663};
2664
2665
2666#ifdef CONFIG_TMPFS
2667/**
2668 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2669 * @str:  string containing mempolicy to parse
2670 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2671 *
2672 * Format of input:
2673 *      <mode>[=<flags>][:<nodelist>]
2674 *
2675 * On success, returns 0, else 1
2676 */
2677int mpol_parse_str(char *str, struct mempolicy **mpol)
2678{
2679        struct mempolicy *new = NULL;
2680        unsigned short mode;
2681        unsigned short mode_flags;
2682        nodemask_t nodes;
2683        char *nodelist = strchr(str, ':');
2684        char *flags = strchr(str, '=');
2685        int err = 1;
2686
2687        if (nodelist) {
2688                /* NUL-terminate mode or flags string */
2689                *nodelist++ = '\0';
2690                if (nodelist_parse(nodelist, nodes))
2691                        goto out;
2692                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2693                        goto out;
2694        } else
2695                nodes_clear(nodes);
2696
2697        if (flags)
2698                *flags++ = '\0';        /* terminate mode string */
2699
2700        for (mode = 0; mode < MPOL_MAX; mode++) {
2701                if (!strcmp(str, policy_modes[mode])) {
2702                        break;
2703                }
2704        }
2705        if (mode >= MPOL_MAX)
2706                goto out;
2707
2708        switch (mode) {
2709        case MPOL_PREFERRED:
2710                /*
2711                 * Insist on a nodelist of one node only
2712                 */
2713                if (nodelist) {
2714                        char *rest = nodelist;
2715                        while (isdigit(*rest))
2716                                rest++;
2717                        if (*rest)
2718                                goto out;
2719                }
2720                break;
2721        case MPOL_INTERLEAVE:
2722                /*
2723                 * Default to online nodes with memory if no nodelist
2724                 */
2725                if (!nodelist)
2726                        nodes = node_states[N_MEMORY];
2727                break;
2728        case MPOL_LOCAL:
2729                /*
2730                 * Don't allow a nodelist;  mpol_new() checks flags
2731                 */
2732                if (nodelist)
2733                        goto out;
2734                mode = MPOL_PREFERRED;
2735                break;
2736        case MPOL_DEFAULT:
2737                /*
2738                 * Insist on an empty nodelist
2739                 */
2740                if (!nodelist)
2741                        err = 0;
2742                goto out;
2743        case MPOL_BIND:
2744                /*
2745                 * Insist on a nodelist
2746                 */
2747                if (!nodelist)
2748                        goto out;
2749        }
2750
2751        mode_flags = 0;
2752        if (flags) {
2753                /*
2754                 * Currently, we only support two mutually exclusive
2755                 * mode flags.
2756                 */
2757                if (!strcmp(flags, "static"))
2758                        mode_flags |= MPOL_F_STATIC_NODES;
2759                else if (!strcmp(flags, "relative"))
2760                        mode_flags |= MPOL_F_RELATIVE_NODES;
2761                else
2762                        goto out;
2763        }
2764
2765        new = mpol_new(mode, mode_flags, &nodes);
2766        if (IS_ERR(new))
2767                goto out;
2768
2769        /*
2770         * Save nodes for mpol_to_str() to show the tmpfs mount options
2771         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2772         */
2773        if (mode != MPOL_PREFERRED)
2774                new->v.nodes = nodes;
2775        else if (nodelist)
2776                new->v.preferred_node = first_node(nodes);
2777        else
2778                new->flags |= MPOL_F_LOCAL;
2779
2780        /*
2781         * Save nodes for contextualization: this will be used to "clone"
2782         * the mempolicy in a specific context [cpuset] at a later time.
2783         */
2784        new->w.user_nodemask = nodes;
2785
2786        err = 0;
2787
2788out:
2789        /* Restore string for error message */
2790        if (nodelist)
2791                *--nodelist = ':';
2792        if (flags)
2793                *--flags = '=';
2794        if (!err)
2795                *mpol = new;
2796        return err;
2797}
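/*
 * Example strings accepted by mpol_parse_str(); they typically arrive via
 * the tmpfs "mpol=" mount option (the exact option plumbing lives outside
 * this file and is mentioned here only for orientation):
 *
 *	interleave:0-3		MPOL_INTERLEAVE over nodes 0-3
 *	prefer=static:2		MPOL_PREFERRED, MPOL_F_STATIC_NODES, node 2
 *	bind=relative:0,2	MPOL_BIND, MPOL_F_RELATIVE_NODES, nodes 0 and 2
 *	local			MPOL_PREFERRED with MPOL_F_LOCAL
 *	default			MPOL_DEFAULT (no nodelist allowed)
 *
 * e.g.:	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 */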
2798#endif /* CONFIG_TMPFS */
2799
2800/**
2801 * mpol_to_str - format a mempolicy structure for printing
2802 * @buffer:  to contain formatted mempolicy string
2803 * @maxlen:  length of @buffer
2804 * @pol:  pointer to mempolicy to be formatted
2805 *
2806 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2807 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2808 * longest flag, "relative", and to display at least a few node ids.
2809 */
2810void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2811{
2812        char *p = buffer;
2813        nodemask_t nodes = NODE_MASK_NONE;
2814        unsigned short mode = MPOL_DEFAULT;
2815        unsigned short flags = 0;
2816
2817        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2818                mode = pol->mode;
2819                flags = pol->flags;
2820        }
2821
2822        switch (mode) {
2823        case MPOL_DEFAULT:
2824                break;
2825        case MPOL_PREFERRED:
2826                if (flags & MPOL_F_LOCAL)
2827                        mode = MPOL_LOCAL;
2828                else
2829                        node_set(pol->v.preferred_node, nodes);
2830                break;
2831        case MPOL_BIND:
2832        case MPOL_INTERLEAVE:
2833                nodes = pol->v.nodes;
2834                break;
2835        default:
2836                WARN_ON_ONCE(1);
2837                snprintf(p, maxlen, "unknown");
2838                return;
2839        }
2840
2841        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2842
2843        if (flags & MPOL_MODE_FLAGS) {
2844                p += snprintf(p, buffer + maxlen - p, "=");
2845
2846                /*
2847                 * Currently, the only defined flags are mutually exclusive
2848                 */
2849                if (flags & MPOL_F_STATIC_NODES)
2850                        p += snprintf(p, buffer + maxlen - p, "static");
2851                else if (flags & MPOL_F_RELATIVE_NODES)
2852                        p += snprintf(p, buffer + maxlen - p, "relative");
2853        }
2854
2855        if (!nodes_empty(nodes))
2856                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2857                               nodemask_pr_args(&nodes));
2858}
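/*
 * Example outputs (editor's sketch) for a sufficiently large @buffer:
 *
 *	MPOL_DEFAULT				-> "default"
 *	MPOL_PREFERRED + MPOL_F_LOCAL		-> "local"
 *	MPOL_PREFERRED, preferred_node 1	-> "prefer:1"
 *	MPOL_INTERLEAVE, nodes 0-3		-> "interleave:0-3"
 *	MPOL_BIND + MPOL_F_STATIC_NODES, {0,2}	-> "bind=static:0,2"
 */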
2859