linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy a per-process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind truly restricted
  26 *                the allocation to the specified memory nodes instead.
  27 *
  28 * preferred      Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non-default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has the memory mapped.
  54 */
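/*
 * Illustrative userspace sketch of how these policies are selected,
 * assuming the <numaif.h> syscall wrappers provided by libnuma (link
 * with -lnuma); this is a usage example, not part of this file:
 *
 *	#include <numaif.h>
 *
 *	void example(void *buf, size_t len)
 *	{
 *		unsigned long nodes01 = 0x3;	// interleave over nodes 0 and 1
 *		unsigned long node0   = 0x1;	// bind to node 0 only
 *
 *		// process policy: interleave new allocations over nodes 0-1
 *		set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01));
 *
 *		// VMA policy: restrict an existing mapping to node 0
 *		mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 *	}
 */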
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always graceful about that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
 115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132        struct mempolicy *pol = p->mempolicy;
 133        int node;
 134
 135        if (pol)
 136                return pol;
 137
 138        node = numa_node_id();
 139        if (node != NUMA_NO_NODE) {
 140                pol = &preferred_node_policy[node];
 141                /* preferred_node_policy is not initialised early in boot */
 142                if (pol->mode)
 143                        return pol;
 144        }
 145
 146        return &default_policy;
 147}
 148
 149static const struct mempolicy_operations {
 150        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 151        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 152} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156        return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                   const nodemask_t *rel)
 161{
 162        nodemask_t tmp;
 163        nodes_fold(tmp, *orig, nodes_weight(*rel));
 164        nodes_onto(*ret, tmp, *rel);
 165}
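/*
 * Worked example (assuming the usual nodes_fold()/nodes_onto() semantics):
 * with a stored relative mask of {0,2} and a cpuset mask of {4,5,6},
 * nodes_fold() wraps the relative mask modulo 3, leaving {0,2}, and
 * nodes_onto() maps bit n onto the n-th set bit of the cpuset mask,
 * giving a result of {4,6}.
 */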
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169        if (nodes_empty(*nodes))
 170                return -EINVAL;
 171        pol->v.nodes = *nodes;
 172        return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!nodes)
 178                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179        else if (nodes_empty(*nodes))
 180                return -EINVAL;                 /*  no allowed nodes */
 181        else
 182                pol->v.preferred_node = first_node(*nodes);
 183        return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (nodes_empty(*nodes))
 189                return -EINVAL;
 190        pol->v.nodes = *nodes;
 191        return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
 201 * and mempolicy.  May also be called holding mmap_sem for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206        int ret;
 207
 208        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209        if (pol == NULL)
 210                return 0;
 211        /* Check N_MEMORY */
 212        nodes_and(nsc->mask1,
 213                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215        VM_BUG_ON(!nodes);
 216        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                nodes = NULL;   /* explicit local allocation */
 218        else {
 219                if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                else
 222                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                if (mpol_store_user_nodemask(pol))
 225                        pol->w.user_nodemask = *nodes;
 226                else
 227                        pol->w.cpuset_mems_allowed =
 228                                                cpuset_current_mems_allowed;
 229        }
 230
 231        if (nodes)
 232                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233        else
 234                ret = mpol_ops[pol->mode].create(pol, NULL);
 235        return ret;
 236}
 237
 238/*
 239 * This function just creates a new policy, does some checks and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                  nodemask_t *nodes)
 244{
 245        struct mempolicy *policy;
 246
 247        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250        if (mode == MPOL_DEFAULT) {
 251                if (nodes && !nodes_empty(*nodes))
 252                        return ERR_PTR(-EINVAL);
 253                return NULL;
 254        }
 255        VM_BUG_ON(!nodes);
 256
 257        /*
 258         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260         * All other modes require a valid pointer to a non-empty nodemask.
 261         */
 262        if (mode == MPOL_PREFERRED) {
 263                if (nodes_empty(*nodes)) {
 264                        if (((flags & MPOL_F_STATIC_NODES) ||
 265                             (flags & MPOL_F_RELATIVE_NODES)))
 266                                return ERR_PTR(-EINVAL);
 267                }
 268        } else if (mode == MPOL_LOCAL) {
 269                if (!nodes_empty(*nodes) ||
 270                    (flags & MPOL_F_STATIC_NODES) ||
 271                    (flags & MPOL_F_RELATIVE_NODES))
 272                        return ERR_PTR(-EINVAL);
 273                mode = MPOL_PREFERRED;
 274        } else if (nodes_empty(*nodes))
 275                return ERR_PTR(-EINVAL);
 276        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277        if (!policy)
 278                return ERR_PTR(-ENOMEM);
 279        atomic_set(&policy->refcnt, 1);
 280        policy->mode = mode;
 281        policy->flags = flags;
 282
 283        return policy;
 284}
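/*
 * Typical call sequence, as in do_set_mempolicy() later in this file:
 *
 *	new = mpol_new(mode, flags, nodes);
 *	task_lock(current);
 *	ret = mpol_set_nodemask(new, nodes, scratch);
 *	task_unlock(current);
 *
 * where scratch is a NODEMASK_SCRATCH allocation owned by the caller.
 */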
 285
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289        if (!atomic_dec_and_test(&p->refcnt))
 290                return;
 291        kmem_cache_free(policy_cache, p);
 292}
 293
 294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 295{
 296}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300        nodemask_t tmp;
 301
 302        if (pol->flags & MPOL_F_STATIC_NODES)
 303                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306        else {
 307                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 308                                                                *nodes);
 309                pol->w.cpuset_mems_allowed = tmp;
 310        }
 311
 312        if (nodes_empty(tmp))
 313                tmp = *nodes;
 314
 315        pol->v.nodes = tmp;
 316}
 317
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                const nodemask_t *nodes)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES) {
 324                int node = first_node(pol->w.user_nodemask);
 325
 326                if (node_isset(node, *nodes)) {
 327                        pol->v.preferred_node = node;
 328                        pol->flags &= ~MPOL_F_LOCAL;
 329                } else
 330                        pol->flags |= MPOL_F_LOCAL;
 331        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                pol->v.preferred_node = first_node(tmp);
 334        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                   pol->w.cpuset_mems_allowed,
 337                                                   *nodes);
 338                pol->w.cpuset_mems_allowed = *nodes;
 339        }
 340}
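/*
 * For example, a task with an MPOL_PREFERRED | MPOL_F_STATIC_NODES policy
 * on node 3 that is moved into a cpuset allowing only nodes {1,2} falls
 * back to local allocation (MPOL_F_LOCAL); if node 3 is allowed again, a
 * later rebind restores the preferred node.
 */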
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351        if (!pol)
 352                return;
 353        if (!mpol_store_user_nodemask(pol) &&
 354            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                return;
 356
 357        mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires task
 362 * pointer, and updates task mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369        mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380        struct vm_area_struct *vma;
 381
 382        down_write(&mm->mmap_sem);
 383        for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                mpol_rebind_policy(vma->vm_policy, new);
 385        up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389        [MPOL_DEFAULT] = {
 390                .rebind = mpol_rebind_default,
 391        },
 392        [MPOL_INTERLEAVE] = {
 393                .create = mpol_new_interleave,
 394                .rebind = mpol_rebind_nodemask,
 395        },
 396        [MPOL_PREFERRED] = {
 397                .create = mpol_new_preferred,
 398                .rebind = mpol_rebind_preferred,
 399        },
 400        [MPOL_BIND] = {
 401                .create = mpol_new_bind,
 402                .rebind = mpol_rebind_nodemask,
 403        },
 404};
 405
 406static void migrate_page_add(struct page *page, struct list_head *pagelist,
 407                                unsigned long flags);
 408
 409struct queue_pages {
 410        struct list_head *pagelist;
 411        unsigned long flags;
 412        nodemask_t *nmask;
 413        struct vm_area_struct *prev;
 414};
 415
 416/*
 417 * Check if the page's nid is in qp->nmask.
 418 *
 419 * If MPOL_MF_INVERT is set in qp->flags, check whether the nid is
 420 * in the complement of qp->nmask instead.
 421 */
 422static inline bool queue_pages_required(struct page *page,
 423                                        struct queue_pages *qp)
 424{
 425        int nid = page_to_nid(page);
 426        unsigned long flags = qp->flags;
 427
 428        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429}
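/*
 * For example, do_mbind() passes MPOL_MF_INVERT together with the new
 * policy's nodemask: pages already on an allowed node fail this test and
 * are left in place, while pages on disallowed nodes pass and are queued
 * for migration.
 */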
 430
 431static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 432                                unsigned long end, struct mm_walk *walk)
 433{
 434        int ret = 0;
 435        struct page *page;
 436        struct queue_pages *qp = walk->private;
 437        unsigned long flags;
 438
 439        if (unlikely(is_pmd_migration_entry(*pmd))) {
 440                ret = 1;
 441                goto unlock;
 442        }
 443        page = pmd_page(*pmd);
 444        if (is_huge_zero_page(page)) {
 445                spin_unlock(ptl);
 446                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 447                goto out;
 448        }
 449        if (!queue_pages_required(page, qp)) {
 450                ret = 1;
 451                goto unlock;
 452        }
 453
 454        ret = 1;
 455        flags = qp->flags;
 456        /* go to thp migration */
 457        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 458                migrate_page_add(page, qp->pagelist, flags);
 459unlock:
 460        spin_unlock(ptl);
 461out:
 462        return ret;
 463}
 464
 465/*
 466 * Scan through the PTE range, checking whether each mapped page meets
 467 * the required conditions, and add those that do to the pagelist.
 468 */
 469static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 470                        unsigned long end, struct mm_walk *walk)
 471{
 472        struct vm_area_struct *vma = walk->vma;
 473        struct page *page;
 474        struct queue_pages *qp = walk->private;
 475        unsigned long flags = qp->flags;
 476        int ret;
 477        pte_t *pte;
 478        spinlock_t *ptl;
 479
 480        ptl = pmd_trans_huge_lock(pmd, vma);
 481        if (ptl) {
 482                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 483                if (ret)
 484                        return 0;
 485        }
 486
 487        if (pmd_trans_unstable(pmd))
 488                return 0;
 489
 490        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 491        for (; addr != end; pte++, addr += PAGE_SIZE) {
 492                if (!pte_present(*pte))
 493                        continue;
 494                page = vm_normal_page(vma, addr, *pte);
 495                if (!page)
 496                        continue;
 497                /*
 498                 * vm_normal_page() filters out zero pages, but there might
 499                 * still be PageReserved pages to skip, perhaps in a VDSO.
 500                 */
 501                if (PageReserved(page))
 502                        continue;
 503                if (!queue_pages_required(page, qp))
 504                        continue;
 505                migrate_page_add(page, qp->pagelist, flags);
 506        }
 507        pte_unmap_unlock(pte - 1, ptl);
 508        cond_resched();
 509        return 0;
 510}
 511
 512static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 513                               unsigned long addr, unsigned long end,
 514                               struct mm_walk *walk)
 515{
 516#ifdef CONFIG_HUGETLB_PAGE
 517        struct queue_pages *qp = walk->private;
 518        unsigned long flags = qp->flags;
 519        struct page *page;
 520        spinlock_t *ptl;
 521        pte_t entry;
 522
 523        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 524        entry = huge_ptep_get(pte);
 525        if (!pte_present(entry))
 526                goto unlock;
 527        page = pte_page(entry);
 528        if (!queue_pages_required(page, qp))
 529                goto unlock;
 530        /* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
 531        if (flags & (MPOL_MF_MOVE_ALL) ||
 532            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 533                isolate_huge_page(page, qp->pagelist);
 534unlock:
 535        spin_unlock(ptl);
 536#else
 537        BUG();
 538#endif
 539        return 0;
 540}
 541
 542#ifdef CONFIG_NUMA_BALANCING
 543/*
 544 * This is used to mark a range of virtual addresses as inaccessible.
 545 * The protections are later cleared by a NUMA hinting fault. Depending on these
 546 * faults, pages may be migrated for better NUMA placement.
 547 *
 548 * This is assuming that NUMA faults are handled using PROT_NONE. If
 549 * an architecture makes a different choice, it will need further
 550 * changes to the core.
 551 */
 552unsigned long change_prot_numa(struct vm_area_struct *vma,
 553                        unsigned long addr, unsigned long end)
 554{
 555        int nr_updated;
 556
 557        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 558        if (nr_updated)
 559                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 560
 561        return nr_updated;
 562}
 563#else
 564static unsigned long change_prot_numa(struct vm_area_struct *vma,
 565                        unsigned long addr, unsigned long end)
 566{
 567        return 0;
 568}
 569#endif /* CONFIG_NUMA_BALANCING */
 570
 571static int queue_pages_test_walk(unsigned long start, unsigned long end,
 572                                struct mm_walk *walk)
 573{
 574        struct vm_area_struct *vma = walk->vma;
 575        struct queue_pages *qp = walk->private;
 576        unsigned long endvma = vma->vm_end;
 577        unsigned long flags = qp->flags;
 578
 579        if (!vma_migratable(vma))
 580                return 1;
 581
 582        if (endvma > end)
 583                endvma = end;
 584        if (vma->vm_start > start)
 585                start = vma->vm_start;
 586
 587        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 588                if (!vma->vm_next && vma->vm_end < end)
 589                        return -EFAULT;
 590                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 591                        return -EFAULT;
 592        }
 593
 594        qp->prev = vma;
 595
 596        if (flags & MPOL_MF_LAZY) {
 597                /* Similar to task_numa_work, skip inaccessible VMAs */
 598                if (!is_vm_hugetlb_page(vma) &&
 599                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 600                        !(vma->vm_flags & VM_MIXEDMAP))
 601                        change_prot_numa(vma, start, endvma);
 602                return 1;
 603        }
 604
 605        /* queue pages from current vma */
 606        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 607                return 0;
 608        return 1;
 609}
 610
 611/*
 612 * Walk through page tables and collect pages to be migrated.
 613 *
 614 * If pages found in a given range are on the set of nodes determined by
 615 * @nodes and @flags, they are isolated and queued on the list passed
 616 * via @pagelist.
 617 */
 618static int
 619queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 620                nodemask_t *nodes, unsigned long flags,
 621                struct list_head *pagelist)
 622{
 623        struct queue_pages qp = {
 624                .pagelist = pagelist,
 625                .flags = flags,
 626                .nmask = nodes,
 627                .prev = NULL,
 628        };
 629        struct mm_walk queue_pages_walk = {
 630                .hugetlb_entry = queue_pages_hugetlb,
 631                .pmd_entry = queue_pages_pte_range,
 632                .test_walk = queue_pages_test_walk,
 633                .mm = mm,
 634                .private = &qp,
 635        };
 636
 637        return walk_page_range(start, end, &queue_pages_walk);
 638}
 639
 640/*
 641 * Apply policy to a single VMA
 642 * This must be called with the mmap_sem held for writing.
 643 */
 644static int vma_replace_policy(struct vm_area_struct *vma,
 645                                                struct mempolicy *pol)
 646{
 647        int err;
 648        struct mempolicy *old;
 649        struct mempolicy *new;
 650
 651        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 652                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 653                 vma->vm_ops, vma->vm_file,
 654                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 655
 656        new = mpol_dup(pol);
 657        if (IS_ERR(new))
 658                return PTR_ERR(new);
 659
 660        if (vma->vm_ops && vma->vm_ops->set_policy) {
 661                err = vma->vm_ops->set_policy(vma, new);
 662                if (err)
 663                        goto err_out;
 664        }
 665
 666        old = vma->vm_policy;
 667        vma->vm_policy = new; /* protected by mmap_sem */
 668        mpol_put(old);
 669
 670        return 0;
 671 err_out:
 672        mpol_put(new);
 673        return err;
 674}
 675
 676/* Step 2: apply policy to a range and do splits. */
 677static int mbind_range(struct mm_struct *mm, unsigned long start,
 678                       unsigned long end, struct mempolicy *new_pol)
 679{
 680        struct vm_area_struct *next;
 681        struct vm_area_struct *prev;
 682        struct vm_area_struct *vma;
 683        int err = 0;
 684        pgoff_t pgoff;
 685        unsigned long vmstart;
 686        unsigned long vmend;
 687
 688        vma = find_vma(mm, start);
 689        if (!vma || vma->vm_start > start)
 690                return -EFAULT;
 691
 692        prev = vma->vm_prev;
 693        if (start > vma->vm_start)
 694                prev = vma;
 695
 696        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 697                next = vma->vm_next;
 698                vmstart = max(start, vma->vm_start);
 699                vmend   = min(end, vma->vm_end);
 700
 701                if (mpol_equal(vma_policy(vma), new_pol))
 702                        continue;
 703
 704                pgoff = vma->vm_pgoff +
 705                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 706                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 707                                 vma->anon_vma, vma->vm_file, pgoff,
 708                                 new_pol, vma->vm_userfaultfd_ctx);
 709                if (prev) {
 710                        vma = prev;
 711                        next = vma->vm_next;
 712                        if (mpol_equal(vma_policy(vma), new_pol))
 713                                continue;
 714                        /* vma_merge() joined vma && vma->next, case 8 */
 715                        goto replace;
 716                }
 717                if (vma->vm_start != vmstart) {
 718                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 719                        if (err)
 720                                goto out;
 721                }
 722                if (vma->vm_end != vmend) {
 723                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 724                        if (err)
 725                                goto out;
 726                }
 727 replace:
 728                err = vma_replace_policy(vma, new_pol);
 729                if (err)
 730                        goto out;
 731        }
 732
 733 out:
 734        return err;
 735}
 736
 737/* Set the process memory policy */
 738static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 739                             nodemask_t *nodes)
 740{
 741        struct mempolicy *new, *old;
 742        NODEMASK_SCRATCH(scratch);
 743        int ret;
 744
 745        if (!scratch)
 746                return -ENOMEM;
 747
 748        new = mpol_new(mode, flags, nodes);
 749        if (IS_ERR(new)) {
 750                ret = PTR_ERR(new);
 751                goto out;
 752        }
 753
 754        task_lock(current);
 755        ret = mpol_set_nodemask(new, nodes, scratch);
 756        if (ret) {
 757                task_unlock(current);
 758                mpol_put(new);
 759                goto out;
 760        }
 761        old = current->mempolicy;
 762        current->mempolicy = new;
 763        if (new && new->mode == MPOL_INTERLEAVE)
 764                current->il_prev = MAX_NUMNODES-1;
 765        task_unlock(current);
 766        mpol_put(old);
 767        ret = 0;
 768out:
 769        NODEMASK_SCRATCH_FREE(scratch);
 770        return ret;
 771}
 772
 773/*
 774 * Return nodemask for policy for get_mempolicy() query
 775 *
 776 * Called with task's alloc_lock held
 777 */
 778static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 779{
 780        nodes_clear(*nodes);
 781        if (p == &default_policy)
 782                return;
 783
 784        switch (p->mode) {
 785        case MPOL_BIND:
 786                /* Fall through */
 787        case MPOL_INTERLEAVE:
 788                *nodes = p->v.nodes;
 789                break;
 790        case MPOL_PREFERRED:
 791                if (!(p->flags & MPOL_F_LOCAL))
 792                        node_set(p->v.preferred_node, *nodes);
 793                /* else return empty node mask for local allocation */
 794                break;
 795        default:
 796                BUG();
 797        }
 798}
 799
 800static int lookup_node(unsigned long addr)
 801{
 802        struct page *p;
 803        int err;
 804
 805        err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
 806        if (err >= 0) {
 807                err = page_to_nid(p);
 808                put_page(p);
 809        }
 810        return err;
 811}
 812
 813/* Retrieve NUMA policy */
 814static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 815                             unsigned long addr, unsigned long flags)
 816{
 817        int err;
 818        struct mm_struct *mm = current->mm;
 819        struct vm_area_struct *vma = NULL;
 820        struct mempolicy *pol = current->mempolicy;
 821
 822        if (flags &
 823                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 824                return -EINVAL;
 825
 826        if (flags & MPOL_F_MEMS_ALLOWED) {
 827                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 828                        return -EINVAL;
 829                *policy = 0;    /* just so it's initialized */
 830                task_lock(current);
 831                *nmask  = cpuset_current_mems_allowed;
 832                task_unlock(current);
 833                return 0;
 834        }
 835
 836        if (flags & MPOL_F_ADDR) {
 837                /*
 838                 * Do NOT fall back to task policy if the
 839                 * vma/shared policy at addr is NULL.  We
 840                 * want to return MPOL_DEFAULT in this case.
 841                 */
 842                down_read(&mm->mmap_sem);
 843                vma = find_vma_intersection(mm, addr, addr+1);
 844                if (!vma) {
 845                        up_read(&mm->mmap_sem);
 846                        return -EFAULT;
 847                }
 848                if (vma->vm_ops && vma->vm_ops->get_policy)
 849                        pol = vma->vm_ops->get_policy(vma, addr);
 850                else
 851                        pol = vma->vm_policy;
 852        } else if (addr)
 853                return -EINVAL;
 854
 855        if (!pol)
 856                pol = &default_policy;  /* indicates default behavior */
 857
 858        if (flags & MPOL_F_NODE) {
 859                if (flags & MPOL_F_ADDR) {
 860                        err = lookup_node(addr);
 861                        if (err < 0)
 862                                goto out;
 863                        *policy = err;
 864                } else if (pol == current->mempolicy &&
 865                                pol->mode == MPOL_INTERLEAVE) {
 866                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 867                } else {
 868                        err = -EINVAL;
 869                        goto out;
 870                }
 871        } else {
 872                *policy = pol == &default_policy ? MPOL_DEFAULT :
 873                                                pol->mode;
 874                /*
 875                 * Internal mempolicy flags must be masked off before exposing
 876                 * the policy to userspace.
 877                 */
 878                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 879        }
 880
 881        err = 0;
 882        if (nmask) {
 883                if (mpol_store_user_nodemask(pol)) {
 884                        *nmask = pol->w.user_nodemask;
 885                } else {
 886                        task_lock(current);
 887                        get_policy_nodemask(pol, nmask);
 888                        task_unlock(current);
 889                }
 890        }
 891
 892 out:
 893        mpol_cond_put(pol);
 894        if (vma)
 895                up_read(&current->mm->mmap_sem);
 896        return err;
 897}
 898
 899#ifdef CONFIG_MIGRATION
 900/*
 901 * page migration; thp tail pages can be passed.
 902 */
 903static void migrate_page_add(struct page *page, struct list_head *pagelist,
 904                                unsigned long flags)
 905{
 906        struct page *head = compound_head(page);
 907        /*
 908         * Avoid migrating a page that is shared with others.
 909         */
 910        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 911                if (!isolate_lru_page(head)) {
 912                        list_add_tail(&head->lru, pagelist);
 913                        mod_node_page_state(page_pgdat(head),
 914                                NR_ISOLATED_ANON + page_is_file_cache(head),
 915                                hpage_nr_pages(head));
 916                }
 917        }
 918}
 919
 920/* page allocation callback for NUMA node migration */
 921struct page *alloc_new_node_page(struct page *page, unsigned long node)
 922{
 923        if (PageHuge(page))
 924                return alloc_huge_page_node(page_hstate(compound_head(page)),
 925                                        node);
 926        else if (PageTransHuge(page)) {
 927                struct page *thp;
 928
 929                thp = alloc_pages_node(node,
 930                        (GFP_TRANSHUGE | __GFP_THISNODE),
 931                        HPAGE_PMD_ORDER);
 932                if (!thp)
 933                        return NULL;
 934                prep_transhuge_page(thp);
 935                return thp;
 936        } else
 937                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 938                                                    __GFP_THISNODE, 0);
 939}
 940
 941/*
 942 * Migrate pages from one node to a target node.
 943 * Returns error or the number of pages not migrated.
 944 */
 945static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 946                           int flags)
 947{
 948        nodemask_t nmask;
 949        LIST_HEAD(pagelist);
 950        int err = 0;
 951
 952        nodes_clear(nmask);
 953        node_set(source, nmask);
 954
 955        /*
 956         * This does not "check" the range but isolates all pages that
 957         * need migration.  Between passing in the full user address
 958         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
 959         */
 960        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
 961        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 962                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 963
 964        if (!list_empty(&pagelist)) {
 965                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
 966                                        MIGRATE_SYNC, MR_SYSCALL);
 967                if (err)
 968                        putback_movable_pages(&pagelist);
 969        }
 970
 971        return err;
 972}
 973
 974/*
 975 * Move pages between the two nodesets so as to preserve the physical
 976 * layout as much as possible.
 977 *
 978 * Returns the number of pages that could not be moved.
 979 */
 980int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 981                     const nodemask_t *to, int flags)
 982{
 983        int busy = 0;
 984        int err;
 985        nodemask_t tmp;
 986
 987        err = migrate_prep();
 988        if (err)
 989                return err;
 990
 991        down_read(&mm->mmap_sem);
 992
 993        /*
 994         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 995         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 996         * bit in 'tmp', and return that <source, dest> pair for migration.
 997         * The pair of nodemasks 'to' and 'from' define the map.
 998         *
 999         * If no pair of bits is found that way, fallback to picking some
1000         * pair of 'source' and 'dest' bits that are not the same.  If the
1001         * 'source' and 'dest' bits are the same, this represents a node
1002         * that will be migrating to itself, so no pages need move.
1003         *
1004         * If no bits are left in 'tmp', or if all remaining bits left
1005         * in 'tmp' correspond to the same bit in 'to', return false
1006         * (nothing left to migrate).
1007         *
1008         * This lets us pick a pair of nodes to migrate between, such that
1009         * if possible the dest node is not already occupied by some other
1010         * source node, minimizing the risk of overloading the memory on a
1011         * node that would happen if we migrated incoming memory to a node
1012         * before migrating outgoing memory from that same node.
1013         *
1014         * A single scan of tmp is sufficient.  As we go, we remember the
1015         * most recent <s, d> pair that moved (s != d).  If we find a pair
1016         * that not only moved, but what's better, moved to an empty slot
1017         * (d is not set in tmp), then we break out then, with that pair.
1018         * Otherwise when we finish scanning tmp, we at least have the
1019         * most recent <s, d> pair that moved.  If we get all the way through
1020         * the scan of tmp without finding any node that moved, much less
1021         * moved to an empty node, then there is nothing left worth migrating.
1022         */
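        /*
         * Worked example: from = {0,1}, to = {1,2}.  The first scan picks
         * <1,2> because node 2 is not in the remaining source set, so
         * node 1 is drained into node 2 before anything is migrated into
         * it; the second pass then moves node 0 to node 1.
         */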
1023
1024        tmp = *from;
1025        while (!nodes_empty(tmp)) {
1026                int s, d;
1027                int source = NUMA_NO_NODE;
1028                int dest = 0;
1029
1030                for_each_node_mask(s, tmp) {
1031
1032                        /*
1033                         * do_migrate_pages() tries to maintain the relative
1034                         * node relationship of the pages established between
1035                         * threads and memory areas.
1036                         *
1037                         * However if the number of source nodes is not equal to
1038                         * the number of destination nodes we can not preserve
1039                         * this node relative relationship.  In that case, skip
1040                         * copying memory from a node that is in the destination
1041                         * mask.
1042                         *
1043                         * Example: [2,3,4] -> [3,4,5] moves everything.
1044                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1045                         */
1046
1047                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1048                                                (node_isset(s, *to)))
1049                                continue;
1050
1051                        d = node_remap(s, *from, *to);
1052                        if (s == d)
1053                                continue;
1054
1055                        source = s;     /* Node moved. Memorize */
1056                        dest = d;
1057
1058                        /* dest not in remaining from nodes? */
1059                        if (!node_isset(dest, tmp))
1060                                break;
1061                }
1062                if (source == NUMA_NO_NODE)
1063                        break;
1064
1065                node_clear(source, tmp);
1066                err = migrate_to_node(mm, source, dest, flags);
1067                if (err > 0)
1068                        busy += err;
1069                if (err < 0)
1070                        break;
1071        }
1072        up_read(&mm->mmap_sem);
1073        if (err < 0)
1074                return err;
1075        return busy;
1076
1077}
1078
1079/*
1080 * Allocate a new page for page migration based on vma policy.
1081 * Start by assuming the page is mapped by the same vma as contains @start.
1082 * Search forward from there, if not.  N.B., this assumes that the
1083 * list of pages handed to migrate_pages()--which is how we get here--
1084 * is in virtual address order.
1085 */
1086static struct page *new_page(struct page *page, unsigned long start)
1087{
1088        struct vm_area_struct *vma;
1089        unsigned long uninitialized_var(address);
1090
1091        vma = find_vma(current->mm, start);
1092        while (vma) {
1093                address = page_address_in_vma(page, vma);
1094                if (address != -EFAULT)
1095                        break;
1096                vma = vma->vm_next;
1097        }
1098
1099        if (PageHuge(page)) {
1100                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1101                                vma, address);
1102        } else if (PageTransHuge(page)) {
1103                struct page *thp;
1104
1105                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1106                                         HPAGE_PMD_ORDER);
1107                if (!thp)
1108                        return NULL;
1109                prep_transhuge_page(thp);
1110                return thp;
1111        }
1112        /*
1113         * if !vma, alloc_page_vma() will use task or system default policy
1114         */
1115        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1116                        vma, address);
1117}
1118#else
1119
1120static void migrate_page_add(struct page *page, struct list_head *pagelist,
1121                                unsigned long flags)
1122{
1123}
1124
1125int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1126                     const nodemask_t *to, int flags)
1127{
1128        return -ENOSYS;
1129}
1130
1131static struct page *new_page(struct page *page, unsigned long start)
1132{
1133        return NULL;
1134}
1135#endif
1136
1137static long do_mbind(unsigned long start, unsigned long len,
1138                     unsigned short mode, unsigned short mode_flags,
1139                     nodemask_t *nmask, unsigned long flags)
1140{
1141        struct mm_struct *mm = current->mm;
1142        struct mempolicy *new;
1143        unsigned long end;
1144        int err;
1145        LIST_HEAD(pagelist);
1146
1147        if (flags & ~(unsigned long)MPOL_MF_VALID)
1148                return -EINVAL;
1149        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1150                return -EPERM;
1151
1152        if (start & ~PAGE_MASK)
1153                return -EINVAL;
1154
1155        if (mode == MPOL_DEFAULT)
1156                flags &= ~MPOL_MF_STRICT;
1157
1158        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1159        end = start + len;
1160
1161        if (end < start)
1162                return -EINVAL;
1163        if (end == start)
1164                return 0;
1165
1166        new = mpol_new(mode, mode_flags, nmask);
1167        if (IS_ERR(new))
1168                return PTR_ERR(new);
1169
1170        if (flags & MPOL_MF_LAZY)
1171                new->flags |= MPOL_F_MOF;
1172
1173        /*
1174         * If we are using the default policy then operation
1175         * on discontinuous address spaces is okay after all
1176         */
1177        if (!new)
1178                flags |= MPOL_MF_DISCONTIG_OK;
1179
1180        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1181                 start, start + len, mode, mode_flags,
1182                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1183
1184        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1185
1186                err = migrate_prep();
1187                if (err)
1188                        goto mpol_out;
1189        }
1190        {
1191                NODEMASK_SCRATCH(scratch);
1192                if (scratch) {
1193                        down_write(&mm->mmap_sem);
1194                        task_lock(current);
1195                        err = mpol_set_nodemask(new, nmask, scratch);
1196                        task_unlock(current);
1197                        if (err)
1198                                up_write(&mm->mmap_sem);
1199                } else
1200                        err = -ENOMEM;
1201                NODEMASK_SCRATCH_FREE(scratch);
1202        }
1203        if (err)
1204                goto mpol_out;
1205
1206        err = queue_pages_range(mm, start, end, nmask,
1207                          flags | MPOL_MF_INVERT, &pagelist);
1208        if (!err)
1209                err = mbind_range(mm, start, end, new);
1210
1211        if (!err) {
1212                int nr_failed = 0;
1213
1214                if (!list_empty(&pagelist)) {
1215                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1216                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1217                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1218                        if (nr_failed)
1219                                putback_movable_pages(&pagelist);
1220                }
1221
1222                if (nr_failed && (flags & MPOL_MF_STRICT))
1223                        err = -EIO;
1224        } else
1225                putback_movable_pages(&pagelist);
1226
1227        up_write(&mm->mmap_sem);
1228 mpol_out:
1229        mpol_put(new);
1230        return err;
1231}
1232
1233/*
1234 * User space interface with variable-sized bitmaps for nodelists.
1235 */
1236
1237/* Copy a node mask from user space. */
1238static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1239                     unsigned long maxnode)
1240{
1241        unsigned long k;
1242        unsigned long t;
1243        unsigned long nlongs;
1244        unsigned long endmask;
1245
1246        --maxnode;
1247        nodes_clear(*nodes);
1248        if (maxnode == 0 || !nmask)
1249                return 0;
1250        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1251                return -EINVAL;
1252
1253        nlongs = BITS_TO_LONGS(maxnode);
1254        if ((maxnode % BITS_PER_LONG) == 0)
1255                endmask = ~0UL;
1256        else
1257                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1258
1259        /*
1260         * When the user specifies more nodes than supported, just check
1261         * that the unsupported part is all zero.
1262         *
1263         * If maxnode has more longs than MAX_NUMNODES, check
1264         * the bits in that area first, and then go on to check the
1265         * remaining bits, which are equal to or bigger than MAX_NUMNODES.
1266         * Otherwise, just check the bits in [MAX_NUMNODES, maxnode).
1267         */
1268        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1269                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1270                        if (get_user(t, nmask + k))
1271                                return -EFAULT;
1272                        if (k == nlongs - 1) {
1273                                if (t & endmask)
1274                                        return -EINVAL;
1275                        } else if (t)
1276                                return -EINVAL;
1277                }
1278                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1279                endmask = ~0UL;
1280        }
1281
1282        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1283                unsigned long valid_mask = endmask;
1284
1285                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1286                if (get_user(t, nmask + nlongs - 1))
1287                        return -EFAULT;
1288                if (t & valid_mask)
1289                        return -EINVAL;
1290        }
1291
1292        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1293                return -EFAULT;
1294        nodes_addr(*nodes)[nlongs-1] &= endmask;
1295        return 0;
1296}
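/*
 * Worked example: a caller passing maxnode == 17 leaves maxnode == 16
 * after the decrement above, so nlongs == 1 and
 * endmask == (1UL << 16) - 1; only the low 16 bits of the single long
 * copied from userspace are kept.
 */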
1297
1298/* Copy a kernel node mask to user space */
1299static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1300                              nodemask_t *nodes)
1301{
1302        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1303        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1304
1305        if (copy > nbytes) {
1306                if (copy > PAGE_SIZE)
1307                        return -EINVAL;
1308                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1309                        return -EFAULT;
1310                copy = nbytes;
1311        }
1312        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1313}
1314
1315static long kernel_mbind(unsigned long start, unsigned long len,
1316                         unsigned long mode, const unsigned long __user *nmask,
1317                         unsigned long maxnode, unsigned int flags)
1318{
1319        nodemask_t nodes;
1320        int err;
1321        unsigned short mode_flags;
1322
1323        mode_flags = mode & MPOL_MODE_FLAGS;
1324        mode &= ~MPOL_MODE_FLAGS;
1325        if (mode >= MPOL_MAX)
1326                return -EINVAL;
1327        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1328            (mode_flags & MPOL_F_RELATIVE_NODES))
1329                return -EINVAL;
1330        err = get_nodes(&nodes, nmask, maxnode);
1331        if (err)
1332                return err;
1333        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1334}
1335
1336SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1337                unsigned long, mode, const unsigned long __user *, nmask,
1338                unsigned long, maxnode, unsigned int, flags)
1339{
1340        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1341}
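/*
 * Illustrative userspace sketch (assuming <numaif.h> from libnuma, and an
 * existing mapping addr/length): the mode argument carries the optional
 * mode flags in its upper bits, e.g. binding a mapping to node 1 with a
 * static nodemask while moving the pages that violate the policy:
 *
 *	unsigned long mask = 1UL << 1;
 *
 *	mbind(addr, length, MPOL_BIND | MPOL_F_STATIC_NODES,
 *	      &mask, 8 * sizeof(mask), MPOL_MF_MOVE | MPOL_MF_STRICT);
 */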
1342
1343/* Set the process memory policy */
1344static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1345                                 unsigned long maxnode)
1346{
1347        int err;
1348        nodemask_t nodes;
1349        unsigned short flags;
1350
1351        flags = mode & MPOL_MODE_FLAGS;
1352        mode &= ~MPOL_MODE_FLAGS;
1353        if ((unsigned int)mode >= MPOL_MAX)
1354                return -EINVAL;
1355        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1356                return -EINVAL;
1357        err = get_nodes(&nodes, nmask, maxnode);
1358        if (err)
1359                return err;
1360        return do_set_mempolicy(mode, flags, &nodes);
1361}
1362
1363SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1364                unsigned long, maxnode)
1365{
1366        return kernel_set_mempolicy(mode, nmask, maxnode);
1367}
1368
1369static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1370                                const unsigned long __user *old_nodes,
1371                                const unsigned long __user *new_nodes)
1372{
1373        struct mm_struct *mm = NULL;
1374        struct task_struct *task;
1375        nodemask_t task_nodes;
1376        int err;
1377        nodemask_t *old;
1378        nodemask_t *new;
1379        NODEMASK_SCRATCH(scratch);
1380
1381        if (!scratch)
1382                return -ENOMEM;
1383
1384        old = &scratch->mask1;
1385        new = &scratch->mask2;
1386
1387        err = get_nodes(old, old_nodes, maxnode);
1388        if (err)
1389                goto out;
1390
1391        err = get_nodes(new, new_nodes, maxnode);
1392        if (err)
1393                goto out;
1394
1395        /* Find the mm_struct */
1396        rcu_read_lock();
1397        task = pid ? find_task_by_vpid(pid) : current;
1398        if (!task) {
1399                rcu_read_unlock();
1400                err = -ESRCH;
1401                goto out;
1402        }
1403        get_task_struct(task);
1404
1405        err = -EINVAL;
1406
1407        /*
1408         * Check if this process has the right to modify the specified process.
1409         * Use the regular "ptrace_may_access()" checks.
1410         */
1411        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1412                rcu_read_unlock();
1413                err = -EPERM;
1414                goto out_put;
1415        }
1416        rcu_read_unlock();
1417
1418        task_nodes = cpuset_mems_allowed(task);
1419        /* Is the user allowed to access the target nodes? */
1420        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1421                err = -EPERM;
1422                goto out_put;
1423        }
1424
1425        task_nodes = cpuset_mems_allowed(current);
1426        nodes_and(*new, *new, task_nodes);
1427        if (nodes_empty(*new))
1428                goto out_put;
1429
1430        nodes_and(*new, *new, node_states[N_MEMORY]);
1431        if (nodes_empty(*new))
1432                goto out_put;
1433
1434        err = security_task_movememory(task);
1435        if (err)
1436                goto out_put;
1437
1438        mm = get_task_mm(task);
1439        put_task_struct(task);
1440
1441        if (!mm) {
1442                err = -EINVAL;
1443                goto out;
1444        }
1445
1446        err = do_migrate_pages(mm, old, new,
1447                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1448
1449        mmput(mm);
1450out:
1451        NODEMASK_SCRATCH_FREE(scratch);
1452
1453        return err;
1454
1455out_put:
1456        put_task_struct(task);
1457        goto out;
1458
1459}
1460
1461SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1462                const unsigned long __user *, old_nodes,
1463                const unsigned long __user *, new_nodes)
1464{
1465        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1466}
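/*
 * Illustrative userspace sketch (libnuma's <numaif.h> wrapper, pid of the
 * target process assumed): move the pages of that process from node 0 to
 * node 2, subject to the cpuset and permission checks above.  A negative
 * return is an error; a positive return counts pages that could not be
 * moved.
 *
 *	unsigned long old = 1UL << 0, new = 1UL << 2;
 *	long ret = migrate_pages(pid, 8 * sizeof(old), &old, &new);
 */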
1467
1468
1469/* Retrieve NUMA policy */
1470static int kernel_get_mempolicy(int __user *policy,
1471                                unsigned long __user *nmask,
1472                                unsigned long maxnode,
1473                                unsigned long addr,
1474                                unsigned long flags)
1475{
1476        int err;
1477        int uninitialized_var(pval);
1478        nodemask_t nodes;
1479
1480        if (nmask != NULL && maxnode < MAX_NUMNODES)
1481                return -EINVAL;
1482
1483        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1484
1485        if (err)
1486                return err;
1487
1488        if (policy && put_user(pval, policy))
1489                return -EFAULT;
1490
1491        if (nmask)
1492                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1493
1494        return err;
1495}
1496
1497SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1498                unsigned long __user *, nmask, unsigned long, maxnode,
1499                unsigned long, addr, unsigned long, flags)
1500{
1501        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1502}
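/*
 * Illustrative userspace sketch (assuming <numaif.h> and a valid mapped
 * address addr): query the node backing a specific address via the
 * MPOL_F_NODE | MPOL_F_ADDR path of do_get_mempolicy():
 *
 *	#include <stdio.h>
 *	#include <numaif.h>
 *
 *	int node;
 *
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */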
1503
1504#ifdef CONFIG_COMPAT
1505
1506COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1507                       compat_ulong_t __user *, nmask,
1508                       compat_ulong_t, maxnode,
1509                       compat_ulong_t, addr, compat_ulong_t, flags)
1510{
1511        long err;
1512        unsigned long __user *nm = NULL;
1513        unsigned long nr_bits, alloc_size;
1514        DECLARE_BITMAP(bm, MAX_NUMNODES);
1515
1516        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1517        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1518
1519        if (nmask)
1520                nm = compat_alloc_user_space(alloc_size);
1521
1522        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1523
1524        if (!err && nmask) {
1525                unsigned long copy_size;
1526                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1527                err = copy_from_user(bm, nm, copy_size);
1528                /* ensure entire bitmap is zeroed */
1529                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1530                err |= compat_put_bitmap(nmask, bm, nr_bits);
1531        }
1532
1533        return err;
1534}
1535
1536COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1537                       compat_ulong_t, maxnode)
1538{
1539        unsigned long __user *nm = NULL;
1540        unsigned long nr_bits, alloc_size;
1541        DECLARE_BITMAP(bm, MAX_NUMNODES);
1542
1543        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1544        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1545
1546        if (nmask) {
1547                if (compat_get_bitmap(bm, nmask, nr_bits))
1548                        return -EFAULT;
1549                nm = compat_alloc_user_space(alloc_size);
1550                if (copy_to_user(nm, bm, alloc_size))
1551                        return -EFAULT;
1552        }
1553
1554        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1555}
1556
1557COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1558                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1559                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1560{
1561        unsigned long __user *nm = NULL;
1562        unsigned long nr_bits, alloc_size;
1563        nodemask_t bm;
1564
1565        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1566        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1567
1568        if (nmask) {
1569                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1570                        return -EFAULT;
1571                nm = compat_alloc_user_space(alloc_size);
1572                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1573                        return -EFAULT;
1574        }
1575
1576        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1577}
1578
1579COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1580                       compat_ulong_t, maxnode,
1581                       const compat_ulong_t __user *, old_nodes,
1582                       const compat_ulong_t __user *, new_nodes)
1583{
1584        unsigned long __user *old = NULL;
1585        unsigned long __user *new = NULL;
1586        nodemask_t tmp_mask;
1587        unsigned long nr_bits;
1588        unsigned long size;
1589
1590        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1591        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1592        if (old_nodes) {
1593                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1594                        return -EFAULT;
1595                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1596                if (new_nodes)
1597                        new = old + size / sizeof(unsigned long);
1598                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1599                        return -EFAULT;
1600        }
1601        if (new_nodes) {
1602                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1603                        return -EFAULT;
1604                if (new == NULL)
1605                        new = compat_alloc_user_space(size);
1606                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1607                        return -EFAULT;
1608        }
1609        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1610}
1611
1612#endif /* CONFIG_COMPAT */
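/*
 * Illustrative only: the COMPAT wrappers above repack the user's 32-bit
 * nodemask words into native unsigned longs (via compat_get_bitmap())
 * before calling the kernel_*() helpers.  A hedged userspace sketch of
 * that widening for a 64-bit little-endian host (names are assumptions):
 *
 *	#include <stdint.h>
 *
 *	static void widen_nodemask(unsigned long *dst, const uint32_t *src,
 *				   unsigned int nr_bits)
 *	{
 *		unsigned int nr_words = (nr_bits + 31) / 32;
 *		unsigned int i;
 *
 *		for (i = 0; i < nr_words; i++) {
 *			if (i % 2 == 0)
 *				dst[i / 2] = src[i];	// low 32 bits, upper half zeroed
 *			else
 *				dst[i / 2] |= (unsigned long)src[i] << 32;
 *		}
 *	}
 */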
1613
1614struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1615                                                unsigned long addr)
1616{
1617        struct mempolicy *pol = NULL;
1618
1619        if (vma) {
1620                if (vma->vm_ops && vma->vm_ops->get_policy) {
1621                        pol = vma->vm_ops->get_policy(vma, addr);
1622                } else if (vma->vm_policy) {
1623                        pol = vma->vm_policy;
1624
1625                        /*
1626                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1627                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1628                         * count on these policies which will be dropped by
1629                         * mpol_cond_put() later
1630                         */
1631                        if (mpol_needs_cond_ref(pol))
1632                                mpol_get(pol);
1633                }
1634        }
1635
1636        return pol;
1637}
1638
1639/*
1640 * get_vma_policy(@vma, @addr)
1641 * @vma: virtual memory area whose policy is sought
1642 * @addr: address in @vma for shared policy lookup
1643 *
1644 * Returns effective policy for a VMA at specified address.
1645 * Falls back to current->mempolicy or system default policy, as necessary.
1646 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1647 * count--added by the get_policy() vm_op, as appropriate--to protect against
1648 * freeing by another task.  It is the caller's responsibility to free the
1649 * extra reference for shared policies.
1650 */
1651static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1652                                                unsigned long addr)
1653{
1654        struct mempolicy *pol = __get_vma_policy(vma, addr);
1655
1656        if (!pol)
1657                pol = get_task_policy(current);
1658
1659        return pol;
1660}
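/*
 * Typical caller pattern, sketched for illustration (the real callers are
 * the allocation and NUMA fault paths later in this file): look up the
 * effective policy, use it, then conditionally drop the reference that
 * __get_vma_policy() took for MPOL_F_SHARED policies.
 *
 *	struct mempolicy *pol = get_vma_policy(vma, addr);
 *
 *	// ... consult pol->mode, pol->v.nodes, pol->flags ...
 *
 *	mpol_cond_put(pol);	// drops the ref only if MPOL_F_SHARED is set
 */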
1661
1662bool vma_policy_mof(struct vm_area_struct *vma)
1663{
1664        struct mempolicy *pol;
1665
1666        if (vma->vm_ops && vma->vm_ops->get_policy) {
1667                bool ret = false;
1668
1669                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1670                if (pol && (pol->flags & MPOL_F_MOF))
1671                        ret = true;
1672                mpol_cond_put(pol);
1673
1674                return ret;
1675        }
1676
1677        pol = vma->vm_policy;
1678        if (!pol)
1679                pol = get_task_policy(current);
1680
1681        return pol->flags & MPOL_F_MOF;
1682}
1683
1684static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1685{
1686        enum zone_type dynamic_policy_zone = policy_zone;
1687
1688        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1689
1690        /*
1691          * If policy->v.nodes has movable memory only,
1692          * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1693          *
1694          * policy->v.nodes was intersected with node_states[N_MEMORY],
1695          * so if the following test fails, it implies that
1696          * policy->v.nodes contains movable memory only.
1697         */
1698        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1699                dynamic_policy_zone = ZONE_MOVABLE;
1700
1701        return zone >= dynamic_policy_zone;
1702}
1703
1704/*
1705 * Return a nodemask representing a mempolicy for filtering nodes for
1706 * page allocation
1707 */
1708static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1709{
1710        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1711        if (unlikely(policy->mode == MPOL_BIND) &&
1712                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1713                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1714                return &policy->v.nodes;
1715
1716        return NULL;
1717}
1718
1719/* Return the node id preferred by the given mempolicy, or the given id */
1720static int policy_node(gfp_t gfp, struct mempolicy *policy,
1721                                                                int nd)
1722{
1723        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1724                nd = policy->v.preferred_node;
1725        else {
1726                /*
1727                 * __GFP_THISNODE shouldn't even be used with the bind policy
1728                 * because we might easily break the expectation to stay on the
1729                 * requested node and not break the policy.
1730                 */
1731                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1732        }
1733
1734        return nd;
1735}
1736
1737/* Do dynamic interleaving for a process */
1738static unsigned interleave_nodes(struct mempolicy *policy)
1739{
1740        unsigned next;
1741        struct task_struct *me = current;
1742
1743        next = next_node_in(me->il_prev, policy->v.nodes);
1744        if (next < MAX_NUMNODES)
1745                me->il_prev = next;
1746        return next;
1747}
1748
1749/*
1750 * Depending on the memory policy provide a node from which to allocate the
1751 * next slab entry.
1752 */
1753unsigned int mempolicy_slab_node(void)
1754{
1755        struct mempolicy *policy;
1756        int node = numa_mem_id();
1757
1758        if (in_interrupt())
1759                return node;
1760
1761        policy = current->mempolicy;
1762        if (!policy || policy->flags & MPOL_F_LOCAL)
1763                return node;
1764
1765        switch (policy->mode) {
1766        case MPOL_PREFERRED:
1767                /*
1768                 * handled MPOL_F_LOCAL above
1769                 */
1770                return policy->v.preferred_node;
1771
1772        case MPOL_INTERLEAVE:
1773                return interleave_nodes(policy);
1774
1775        case MPOL_BIND: {
1776                struct zoneref *z;
1777
1778                /*
1779                 * Follow bind policy behavior and start allocation at the
1780                 * first node.
1781                 */
1782                struct zonelist *zonelist;
1783                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1784                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1785                z = first_zones_zonelist(zonelist, highest_zoneidx,
1786                                                        &policy->v.nodes);
1787                return z->zone ? z->zone->node : node;
1788        }
1789
1790        default:
1791                BUG();
1792        }
1793}
1794
1795/*
1796 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1797 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1798 * number of present nodes.
1799 */
1800static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1801{
1802        unsigned nnodes = nodes_weight(pol->v.nodes);
1803        unsigned target;
1804        int i;
1805        int nid;
1806
1807        if (!nnodes)
1808                return numa_node_id();
1809        target = (unsigned int)n % nnodes;
1810        nid = first_node(pol->v.nodes);
1811        for (i = 0; i < target; i++)
1812                nid = next_node(nid, pol->v.nodes);
1813        return nid;
1814}
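/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} and n = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the walk starts at node 0 and
 * advances once, returning node 2.  The same arithmetic as a standalone
 * sketch over a plain bitmask (assumes a GCC/Clang popcount builtin):
 *
 *	static unsigned int nth_set_node(unsigned long mask, unsigned long n)
 *	{
 *		unsigned int weight = __builtin_popcountl(mask);
 *		unsigned int nid = 0;
 *
 *		if (!weight)
 *			return 0;	// caller would fall back to the local node
 *		n %= weight;
 *		for (;;) {
 *			while (!(mask & (1UL << nid)))
 *				nid++;	// skip nodes not in the mask
 *			if (n-- == 0)
 *				return nid;
 *			nid++;
 *		}
 *	}
 */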
1815
1816/* Determine a node number for interleave */
1817static inline unsigned interleave_nid(struct mempolicy *pol,
1818                 struct vm_area_struct *vma, unsigned long addr, int shift)
1819{
1820        if (vma) {
1821                unsigned long off;
1822
1823                /*
1824                 * for small pages, there is no difference between
1825                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1826                 * for huge pages, since vm_pgoff is in units of small
1827                 * pages, we need to shift off the always 0 bits to get
1828                 * a useful offset.
1829                 */
1830                BUG_ON(shift < PAGE_SHIFT);
1831                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1832                off += (addr - vma->vm_start) >> shift;
1833                return offset_il_node(pol, off);
1834        } else
1835                return interleave_nodes(pol);
1836}
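/*
 * Worked example (illustrative; assumes 4KB base pages and 2MB huge
 * pages, i.e. PAGE_SHIFT == 12 and shift == 21): a VMA with
 * vm_pgoff == 1024 (4MB into the backing object) faulting 6MB past
 * vm_start yields
 *
 *	off = (1024 >> (21 - 12)) + ((6UL << 20) >> 21) = 2 + 3 = 5
 *
 * i.e. huge page index 5 of the object, which offset_il_node() then
 * maps onto an interleave node.
 */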
1837
1838#ifdef CONFIG_HUGETLBFS
1839/*
1840 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1841 * @vma: virtual memory area whose policy is sought
1842 * @addr: address in @vma for shared policy lookup and interleave policy
1843 * @gfp_flags: for requested zone
1844 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1845 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1846 *
1847 * Returns a nid suitable for a huge page allocation and a pointer
1848 * to the struct mempolicy for conditional unref after allocation.
1849 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1850 * @nodemask for filtering the zonelist.
1851 *
1852 * Must be protected by read_mems_allowed_begin()
1853 */
1854int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1855                                struct mempolicy **mpol, nodemask_t **nodemask)
1856{
1857        int nid;
1858
1859        *mpol = get_vma_policy(vma, addr);
1860        *nodemask = NULL;       /* assume !MPOL_BIND */
1861
1862        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1863                nid = interleave_nid(*mpol, vma, addr,
1864                                        huge_page_shift(hstate_vma(vma)));
1865        } else {
1866                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1867                if ((*mpol)->mode == MPOL_BIND)
1868                        *nodemask = &(*mpol)->v.nodes;
1869        }
1870        return nid;
1871}
1872
1873/*
1874 * init_nodemask_of_mempolicy
1875 *
1876 * If the current task's mempolicy is "default" [NULL], return 'false'
1877 * to indicate default policy.  Otherwise, extract the policy nodemask
1878 * for 'bind' or 'interleave' policy into the argument nodemask, or
1879 * initialize the argument nodemask to contain the single node for
1880 * 'preferred' or 'local' policy and return 'true' to indicate presence
1881 * of non-default mempolicy.
1882 *
1883 * We don't bother with reference counting the mempolicy [mpol_get/put]
1884 * because the current task is examining its own mempolicy and a task's
1885 * mempolicy is only ever changed by the task itself.
1886 *
1887 * N.B., it is the caller's responsibility to free a returned nodemask.
1888 */
1889bool init_nodemask_of_mempolicy(nodemask_t *mask)
1890{
1891        struct mempolicy *mempolicy;
1892        int nid;
1893
1894        if (!(mask && current->mempolicy))
1895                return false;
1896
1897        task_lock(current);
1898        mempolicy = current->mempolicy;
1899        switch (mempolicy->mode) {
1900        case MPOL_PREFERRED:
1901                if (mempolicy->flags & MPOL_F_LOCAL)
1902                        nid = numa_node_id();
1903                else
1904                        nid = mempolicy->v.preferred_node;
1905                init_nodemask_of_node(mask, nid);
1906                break;
1907
1908        case MPOL_BIND:
1909                /* Fall through */
1910        case MPOL_INTERLEAVE:
1911                *mask = mempolicy->v.nodes;
1912                break;
1913
1914        default:
1915                BUG();
1916        }
1917        task_unlock(current);
1918
1919        return true;
1920}
1921#endif
1922
1923/*
1924 * mempolicy_nodemask_intersects
1925 *
1926 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1927 * policy.  Otherwise, check for intersection between mask and the policy
1928 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1929 * policy, always return true since it may allocate elsewhere on fallback.
1930 *
1931 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1932 */
1933bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1934                                        const nodemask_t *mask)
1935{
1936        struct mempolicy *mempolicy;
1937        bool ret = true;
1938
1939        if (!mask)
1940                return ret;
1941        task_lock(tsk);
1942        mempolicy = tsk->mempolicy;
1943        if (!mempolicy)
1944                goto out;
1945
1946        switch (mempolicy->mode) {
1947        case MPOL_PREFERRED:
1948                /*
1949                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1950                 * to allocate from; they may fall back to other nodes when OOM.
1951                 * Thus, it's possible for tsk to have allocated memory from
1952                 * nodes in mask.
1953                 */
1954                break;
1955        case MPOL_BIND:
1956        case MPOL_INTERLEAVE:
1957                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1958                break;
1959        default:
1960                BUG();
1961        }
1962out:
1963        task_unlock(tsk);
1964        return ret;
1965}
1966
1967/* Allocate a page in interleaved policy.
1968   Own path because it needs to do special accounting. */
1969static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1970                                        unsigned nid)
1971{
1972        struct page *page;
1973
1974        page = __alloc_pages(gfp, order, nid);
1975        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
1976        if (!static_branch_likely(&vm_numa_stat_key))
1977                return page;
1978        if (page && page_to_nid(page) == nid) {
1979                preempt_disable();
1980                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
1981                preempt_enable();
1982        }
1983        return page;
1984}
1985
1986/**
1987 *      alloc_pages_vma - Allocate a page for a VMA.
1988 *
1989 *      @gfp:
1990 *      %GFP_USER    user allocation.
1991 *      %GFP_KERNEL  kernel allocations,
1992 *      %GFP_HIGHMEM highmem/user allocations,
1993 *      %GFP_FS      allocation should not call back into a file system.
1994 *      %GFP_ATOMIC  don't sleep.
1995 *
1996 *      @order: Order of the GFP allocation.
1997 *      @vma:  Pointer to VMA or NULL if not available.
1998 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1999 *      @node: Which node to prefer for allocation (modulo policy).
2000 *      @hugepage: for hugepages try only the preferred node if possible
2001 *
2002 *      This function allocates a page from the kernel page pool and applies
2003 *      a NUMA policy associated with the VMA or the current process.
2004 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2005 *      mm_struct of the VMA to prevent it from going away. Should be used for
2006 *      all allocations for pages that will be mapped into user space. Returns
2007 *      NULL when no page can be allocated.
2008 */
2009struct page *
2010alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2011                unsigned long addr, int node, bool hugepage)
2012{
2013        struct mempolicy *pol;
2014        struct page *page;
2015        int preferred_nid;
2016        nodemask_t *nmask;
2017
2018        pol = get_vma_policy(vma, addr);
2019
2020        if (pol->mode == MPOL_INTERLEAVE) {
2021                unsigned nid;
2022
2023                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2024                mpol_cond_put(pol);
2025                page = alloc_page_interleave(gfp, order, nid);
2026                goto out;
2027        }
2028
2029        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2030                int hpage_node = node;
2031
2032                /*
2033                 * For hugepage allocation and non-interleave policy which
2034                 * allows the current node (or other explicitly preferred
2035                 * node) we only try to allocate from the current/preferred
2036                 * node and don't fall back to other nodes, as the cost of
2037                 * remote accesses would likely offset THP benefits.
2038                 *
2039                 * If the policy is interleave, or does not allow the current
2040                 * node in its nodemask, we allocate the standard way.
2041                 */
2042                if (pol->mode == MPOL_PREFERRED &&
2043                                                !(pol->flags & MPOL_F_LOCAL))
2044                        hpage_node = pol->v.preferred_node;
2045
2046                nmask = policy_nodemask(gfp, pol);
2047                if (!nmask || node_isset(hpage_node, *nmask)) {
2048                        mpol_cond_put(pol);
2049                        page = __alloc_pages_node(hpage_node,
2050                                                gfp | __GFP_THISNODE, order);
2051                        goto out;
2052                }
2053        }
2054
2055        nmask = policy_nodemask(gfp, pol);
2056        preferred_nid = policy_node(gfp, pol, node);
2057        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2058        mpol_cond_put(pol);
2059out:
2060        return page;
2061}
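/*
 * Illustrative caller sketch (hedged: the GFP mask and fault-path
 * context are assumptions, not taken from this file).  An anonymous
 * fault path holding mmap_sem for read would typically do:
 *
 *	struct page *page;
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			       numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */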
2062
2063/**
2064 *      alloc_pages_current - Allocate pages.
2065 *
2066 *      @gfp:
2067 *              %GFP_USER   user allocation,
2068 *              %GFP_KERNEL kernel allocation,
2069 *              %GFP_HIGHMEM highmem allocation,
2070 *              %GFP_FS     don't call back into a file system.
2071 *              %GFP_ATOMIC don't sleep.
2072 *      @order: Power of two of allocation size in pages. 0 is a single page.
2073 *
2074 *      Allocate a page from the kernel page pool.  When not in
2075 *      interrupt context, apply the current process' NUMA policy.
2076 *      Returns NULL when no page can be allocated.
2077 */
2078struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2079{
2080        struct mempolicy *pol = &default_policy;
2081        struct page *page;
2082
2083        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2084                pol = get_task_policy(current);
2085
2086        /*
2087         * No reference counting needed for current->mempolicy
2088         * nor system default_policy
2089         */
2090        if (pol->mode == MPOL_INTERLEAVE)
2091                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2092        else
2093                page = __alloc_pages_nodemask(gfp, order,
2094                                policy_node(gfp, pol, numa_node_id()),
2095                                policy_nodemask(gfp, pol));
2096
2097        return page;
2098}
2099EXPORT_SYMBOL(alloc_pages_current);
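/*
 * Illustrative only: on CONFIG_NUMA kernels the generic alloc_pages()
 * helper typically resolves to alloc_pages_current(), so an ordinary
 * allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	// 4 contiguous pages
 *
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	__free_pages(page, 2);
 *
 * picks up the calling process' mempolicy, except from interrupt context
 * or with __GFP_THISNODE, per the checks above.
 */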
2100
2101int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2102{
2103        struct mempolicy *pol = mpol_dup(vma_policy(src));
2104
2105        if (IS_ERR(pol))
2106                return PTR_ERR(pol);
2107        dst->vm_policy = pol;
2108        return 0;
2109}
2110
2111/*
2112 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2113 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2114 * with the mems_allowed returned by cpuset_mems_allowed().  This
2115 * keeps mempolicies cpuset relative after its cpuset moves.  See
2116 * further kernel/cpuset.c update_nodemask().
2117 *
2118 * current's mempolicy may be rebound by the other task (the task that changes
2119 * the cpuset's mems), so we needn't do rebind work for the current task.
2120 */
2121
2122/* Slow path of a mempolicy duplicate */
2123struct mempolicy *__mpol_dup(struct mempolicy *old)
2124{
2125        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2126
2127        if (!new)
2128                return ERR_PTR(-ENOMEM);
2129
2130        /* task's mempolicy is protected by alloc_lock */
2131        if (old == current->mempolicy) {
2132                task_lock(current);
2133                *new = *old;
2134                task_unlock(current);
2135        } else
2136                *new = *old;
2137
2138        if (current_cpuset_is_being_rebound()) {
2139                nodemask_t mems = cpuset_mems_allowed(current);
2140                mpol_rebind_policy(new, &mems);
2141        }
2142        atomic_set(&new->refcnt, 1);
2143        return new;
2144}
2145
2146/* Slow path of a mempolicy comparison */
2147bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2148{
2149        if (!a || !b)
2150                return false;
2151        if (a->mode != b->mode)
2152                return false;
2153        if (a->flags != b->flags)
2154                return false;
2155        if (mpol_store_user_nodemask(a))
2156                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2157                        return false;
2158
2159        switch (a->mode) {
2160        case MPOL_BIND:
2161                /* Fall through */
2162        case MPOL_INTERLEAVE:
2163                return !!nodes_equal(a->v.nodes, b->v.nodes);
2164        case MPOL_PREFERRED:
2165                /* a's ->flags is the same as b's */
2166                if (a->flags & MPOL_F_LOCAL)
2167                        return true;
2168                return a->v.preferred_node == b->v.preferred_node;
2169        default:
2170                BUG();
2171                return false;
2172        }
2173}
2174
2175/*
2176 * Shared memory backing store policy support.
2177 *
2178 * Remember policies even when nobody has shared memory mapped.
2179 * The policies are kept in Red-Black tree linked from the inode.
2180 * They are protected by the sp->lock rwlock, which should be held
2181 * for any accesses to the tree.
2182 */
2183
2184/*
2185 * lookup first element intersecting start-end.  Caller holds sp->lock for
2186 * reading or for writing
2187 */
2188static struct sp_node *
2189sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2190{
2191        struct rb_node *n = sp->root.rb_node;
2192
2193        while (n) {
2194                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2195
2196                if (start >= p->end)
2197                        n = n->rb_right;
2198                else if (end <= p->start)
2199                        n = n->rb_left;
2200                else
2201                        break;
2202        }
2203        if (!n)
2204                return NULL;
2205        for (;;) {
2206                struct sp_node *w = NULL;
2207                struct rb_node *prev = rb_prev(n);
2208                if (!prev)
2209                        break;
2210                w = rb_entry(prev, struct sp_node, nd);
2211                if (w->end <= start)
2212                        break;
2213                n = prev;
2214        }
2215        return rb_entry(n, struct sp_node, nd);
2216}
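/*
 * Illustrative only: the same overlap test applied to a plain array of
 * non-overlapping [start, end) ranges sorted by start, instead of the
 * rb-tree (struct and function names are assumptions for the sketch):
 *
 *	struct range { unsigned long start, end; };
 *
 *	// index of the first range overlapping [start, end), or -1 if none
 *	static int first_overlap(const struct range *v, int n,
 *				 unsigned long start, unsigned long end)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			if (v[i].end <= start)		// entirely before
 *				continue;
 *			if (v[i].start >= end)		// entirely after
 *				break;
 *			return i;			// overlaps
 *		}
 *		return -1;
 *	}
 */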
2217
2218/*
2219 * Insert a new shared policy into the list.  Caller holds sp->lock for
2220 * writing.
2221 */
2222static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2223{
2224        struct rb_node **p = &sp->root.rb_node;
2225        struct rb_node *parent = NULL;
2226        struct sp_node *nd;
2227
2228        while (*p) {
2229                parent = *p;
2230                nd = rb_entry(parent, struct sp_node, nd);
2231                if (new->start < nd->start)
2232                        p = &(*p)->rb_left;
2233                else if (new->end > nd->end)
2234                        p = &(*p)->rb_right;
2235                else
2236                        BUG();
2237        }
2238        rb_link_node(&new->nd, parent, p);
2239        rb_insert_color(&new->nd, &sp->root);
2240        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2241                 new->policy ? new->policy->mode : 0);
2242}
2243
2244/* Find shared policy intersecting idx */
2245struct mempolicy *
2246mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2247{
2248        struct mempolicy *pol = NULL;
2249        struct sp_node *sn;
2250
2251        if (!sp->root.rb_node)
2252                return NULL;
2253        read_lock(&sp->lock);
2254        sn = sp_lookup(sp, idx, idx+1);
2255        if (sn) {
2256                mpol_get(sn->policy);
2257                pol = sn->policy;
2258        }
2259        read_unlock(&sp->lock);
2260        return pol;
2261}
2262
2263static void sp_free(struct sp_node *n)
2264{
2265        mpol_put(n->policy);
2266        kmem_cache_free(sn_cache, n);
2267}
2268
2269/**
2270 * mpol_misplaced - check whether current page node is valid in policy
2271 *
2272 * @page: page to be checked
2273 * @vma: vm area where page mapped
2274 * @addr: virtual address where page mapped
2275 *
2276 * Lookup current policy node id for vma,addr and "compare to" page's
2277 * node id.
2278 *
2279 * Returns:
2280 *      -1      - not misplaced, page is in the right node
2281 *      node    - node id where the page should be
2282 *
2283 * Policy determination "mimics" alloc_page_vma().
2284 * Called from fault path where we know the vma and faulting address.
2285 */
2286int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2287{
2288        struct mempolicy *pol;
2289        struct zoneref *z;
2290        int curnid = page_to_nid(page);
2291        unsigned long pgoff;
2292        int thiscpu = raw_smp_processor_id();
2293        int thisnid = cpu_to_node(thiscpu);
2294        int polnid = -1;
2295        int ret = -1;
2296
2297        pol = get_vma_policy(vma, addr);
2298        if (!(pol->flags & MPOL_F_MOF))
2299                goto out;
2300
2301        switch (pol->mode) {
2302        case MPOL_INTERLEAVE:
2303                pgoff = vma->vm_pgoff;
2304                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2305                polnid = offset_il_node(pol, pgoff);
2306                break;
2307
2308        case MPOL_PREFERRED:
2309                if (pol->flags & MPOL_F_LOCAL)
2310                        polnid = numa_node_id();
2311                else
2312                        polnid = pol->v.preferred_node;
2313                break;
2314
2315        case MPOL_BIND:
2316
2317                /*
2318                 * BIND allows binding to multiple nodes.  Use the current
2319                 * page's node if it is in the policy nodemask, else select
2320                 * the nearest allowed node, if any.  If there are no allowed
2321                 * nodes, use the current node [!misplaced].
2322                 */
2323                if (node_isset(curnid, pol->v.nodes))
2324                        goto out;
2325                z = first_zones_zonelist(
2326                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2327                                gfp_zone(GFP_HIGHUSER),
2328                                &pol->v.nodes);
2329                polnid = z->zone->node;
2330                break;
2331
2332        default:
2333                BUG();
2334        }
2335
2336        /* Migrate the page towards the node whose CPU is referencing it */
2337        if (pol->flags & MPOL_F_MORON) {
2338                polnid = thisnid;
2339
2340                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2341                        goto out;
2342        }
2343
2344        if (curnid != polnid)
2345                ret = polnid;
2346out:
2347        mpol_cond_put(pol);
2348
2349        return ret;
2350}
2351
2352/*
2353 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2354 * dropped after task->mempolicy is set to NULL so that any allocation done as
2355 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2356 * policy.
2357 */
2358void mpol_put_task_policy(struct task_struct *task)
2359{
2360        struct mempolicy *pol;
2361
2362        task_lock(task);
2363        pol = task->mempolicy;
2364        task->mempolicy = NULL;
2365        task_unlock(task);
2366        mpol_put(pol);
2367}
2368
2369static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2370{
2371        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2372        rb_erase(&n->nd, &sp->root);
2373        sp_free(n);
2374}
2375
2376static void sp_node_init(struct sp_node *node, unsigned long start,
2377                        unsigned long end, struct mempolicy *pol)
2378{
2379        node->start = start;
2380        node->end = end;
2381        node->policy = pol;
2382}
2383
2384static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2385                                struct mempolicy *pol)
2386{
2387        struct sp_node *n;
2388        struct mempolicy *newpol;
2389
2390        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2391        if (!n)
2392                return NULL;
2393
2394        newpol = mpol_dup(pol);
2395        if (IS_ERR(newpol)) {
2396                kmem_cache_free(sn_cache, n);
2397                return NULL;
2398        }
2399        newpol->flags |= MPOL_F_SHARED;
2400        sp_node_init(n, start, end, newpol);
2401
2402        return n;
2403}
2404
2405/* Replace a policy range. */
2406static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2407                                 unsigned long end, struct sp_node *new)
2408{
2409        struct sp_node *n;
2410        struct sp_node *n_new = NULL;
2411        struct mempolicy *mpol_new = NULL;
2412        int ret = 0;
2413
2414restart:
2415        write_lock(&sp->lock);
2416        n = sp_lookup(sp, start, end);
2417        /* Take care of old policies in the same range. */
2418        while (n && n->start < end) {
2419                struct rb_node *next = rb_next(&n->nd);
2420                if (n->start >= start) {
2421                        if (n->end <= end)
2422                                sp_delete(sp, n);
2423                        else
2424                                n->start = end;
2425                } else {
2426                        /* Old policy spanning whole new range. */
2427                        if (n->end > end) {
2428                                if (!n_new)
2429                                        goto alloc_new;
2430
2431                                *mpol_new = *n->policy;
2432                                atomic_set(&mpol_new->refcnt, 1);
2433                                sp_node_init(n_new, end, n->end, mpol_new);
2434                                n->end = start;
2435                                sp_insert(sp, n_new);
2436                                n_new = NULL;
2437                                mpol_new = NULL;
2438                                break;
2439                        } else
2440                                n->end = start;
2441                }
2442                if (!next)
2443                        break;
2444                n = rb_entry(next, struct sp_node, nd);
2445        }
2446        if (new)
2447                sp_insert(sp, new);
2448        write_unlock(&sp->lock);
2449        ret = 0;
2450
2451err_out:
2452        if (mpol_new)
2453                mpol_put(mpol_new);
2454        if (n_new)
2455                kmem_cache_free(sn_cache, n_new);
2456
2457        return ret;
2458
2459alloc_new:
2460        write_unlock(&sp->lock);
2461        ret = -ENOMEM;
2462        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2463        if (!n_new)
2464                goto err_out;
2465        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2466        if (!mpol_new)
2467                goto err_out;
2468        goto restart;
2469}
2470
2471/**
2472 * mpol_shared_policy_init - initialize shared policy for inode
2473 * @sp: pointer to inode shared policy
2474 * @mpol:  struct mempolicy to install
2475 *
2476 * Install non-NULL @mpol in inode's shared policy rb-tree.
2477 * On entry, the current task has a reference on a non-NULL @mpol.
2478 * This must be released on exit.
2479 * This is called during get_inode() calls, so we can use GFP_KERNEL.
2480 */
2481void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2482{
2483        int ret;
2484
2485        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2486        rwlock_init(&sp->lock);
2487
2488        if (mpol) {
2489                struct vm_area_struct pvma;
2490                struct mempolicy *new;
2491                NODEMASK_SCRATCH(scratch);
2492
2493                if (!scratch)
2494                        goto put_mpol;
2495                /* contextualize the tmpfs mount point mempolicy */
2496                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2497                if (IS_ERR(new))
2498                        goto free_scratch; /* no valid nodemask intersection */
2499
2500                task_lock(current);
2501                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2502                task_unlock(current);
2503                if (ret)
2504                        goto put_new;
2505
2506                /* Create pseudo-vma that contains just the policy */
2507                memset(&pvma, 0, sizeof(struct vm_area_struct));
2508                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2509                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2510
2511put_new:
2512                mpol_put(new);                  /* drop initial ref */
2513free_scratch:
2514                NODEMASK_SCRATCH_FREE(scratch);
2515put_mpol:
2516                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2517        }
2518}
2519
2520int mpol_set_shared_policy(struct shared_policy *info,
2521                        struct vm_area_struct *vma, struct mempolicy *npol)
2522{
2523        int err;
2524        struct sp_node *new = NULL;
2525        unsigned long sz = vma_pages(vma);
2526
2527        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2528                 vma->vm_pgoff,
2529                 sz, npol ? npol->mode : -1,
2530                 npol ? npol->flags : -1,
2531                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2532
2533        if (npol) {
2534                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2535                if (!new)
2536                        return -ENOMEM;
2537        }
2538        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2539        if (err && new)
2540                sp_free(new);
2541        return err;
2542}
2543
2544/* Free a backing policy store on inode delete. */
2545void mpol_free_shared_policy(struct shared_policy *p)
2546{
2547        struct sp_node *n;
2548        struct rb_node *next;
2549
2550        if (!p->root.rb_node)
2551                return;
2552        write_lock(&p->lock);
2553        next = rb_first(&p->root);
2554        while (next) {
2555                n = rb_entry(next, struct sp_node, nd);
2556                next = rb_next(&n->nd);
2557                sp_delete(p, n);
2558        }
2559        write_unlock(&p->lock);
2560}
2561
2562#ifdef CONFIG_NUMA_BALANCING
2563static int __initdata numabalancing_override;
2564
2565static void __init check_numabalancing_enable(void)
2566{
2567        bool numabalancing_default = false;
2568
2569        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2570                numabalancing_default = true;
2571
2572        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2573        if (numabalancing_override)
2574                set_numabalancing_state(numabalancing_override == 1);
2575
2576        if (num_online_nodes() > 1 && !numabalancing_override) {
2577                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2578                        numabalancing_default ? "Enabling" : "Disabling");
2579                set_numabalancing_state(numabalancing_default);
2580        }
2581}
2582
2583static int __init setup_numabalancing(char *str)
2584{
2585        int ret = 0;
2586        if (!str)
2587                goto out;
2588
2589        if (!strcmp(str, "enable")) {
2590                numabalancing_override = 1;
2591                ret = 1;
2592        } else if (!strcmp(str, "disable")) {
2593                numabalancing_override = -1;
2594                ret = 1;
2595        }
2596out:
2597        if (!ret)
2598                pr_warn("Unable to parse numa_balancing=\n");
2599
2600        return ret;
2601}
2602__setup("numa_balancing=", setup_numabalancing);
2603#else
2604static inline void __init check_numabalancing_enable(void)
2605{
2606}
2607#endif /* CONFIG_NUMA_BALANCING */
2608
2609/* assumes fs == KERNEL_DS */
2610void __init numa_policy_init(void)
2611{
2612        nodemask_t interleave_nodes;
2613        unsigned long largest = 0;
2614        int nid, prefer = 0;
2615
2616        policy_cache = kmem_cache_create("numa_policy",
2617                                         sizeof(struct mempolicy),
2618                                         0, SLAB_PANIC, NULL);
2619
2620        sn_cache = kmem_cache_create("shared_policy_node",
2621                                     sizeof(struct sp_node),
2622                                     0, SLAB_PANIC, NULL);
2623
2624        for_each_node(nid) {
2625                preferred_node_policy[nid] = (struct mempolicy) {
2626                        .refcnt = ATOMIC_INIT(1),
2627                        .mode = MPOL_PREFERRED,
2628                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2629                        .v = { .preferred_node = nid, },
2630                };
2631        }
2632
2633        /*
2634         * Set interleaving policy for system init. Interleaving is only
2635         * enabled across suitably sized nodes (default is >= 16MB); if they
2636         * are all smaller, fall back to the largest node.
2637         */
2638        nodes_clear(interleave_nodes);
2639        for_each_node_state(nid, N_MEMORY) {
2640                unsigned long total_pages = node_present_pages(nid);
2641
2642                /* Preserve the largest node */
2643                if (largest < total_pages) {
2644                        largest = total_pages;
2645                        prefer = nid;
2646                }
2647
2648                /* Interleave this node? */
2649                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2650                        node_set(nid, interleave_nodes);
2651        }
2652
2653        /* All too small, use the largest */
2654        if (unlikely(nodes_empty(interleave_nodes)))
2655                node_set(prefer, interleave_nodes);
2656
2657        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2658                pr_err("%s: interleaving failed\n", __func__);
2659
2660        check_numabalancing_enable();
2661}
2662
2663/* Reset policy of current process to default */
2664void numa_default_policy(void)
2665{
2666        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2667}
2668
2669/*
2670 * Parse and format mempolicy from/to strings
2671 */
2672
2673/*
2674 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2675 */
2676static const char * const policy_modes[] =
2677{
2678        [MPOL_DEFAULT]    = "default",
2679        [MPOL_PREFERRED]  = "prefer",
2680        [MPOL_BIND]       = "bind",
2681        [MPOL_INTERLEAVE] = "interleave",
2682        [MPOL_LOCAL]      = "local",
2683};
2684
2685
2686#ifdef CONFIG_TMPFS
2687/**
2688 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2689 * @str:  string containing mempolicy to parse
2690 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2691 *
2692 * Format of input:
2693 *      <mode>[=<flags>][:<nodelist>]
2694 *
2695 * On success, returns 0, else 1
2696 */
2697int mpol_parse_str(char *str, struct mempolicy **mpol)
2698{
2699        struct mempolicy *new = NULL;
2700        unsigned short mode;
2701        unsigned short mode_flags;
2702        nodemask_t nodes;
2703        char *nodelist = strchr(str, ':');
2704        char *flags = strchr(str, '=');
2705        int err = 1;
2706
2707        if (nodelist) {
2708                /* NUL-terminate mode or flags string */
2709                *nodelist++ = '\0';
2710                if (nodelist_parse(nodelist, nodes))
2711                        goto out;
2712                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2713                        goto out;
2714        } else
2715                nodes_clear(nodes);
2716
2717        if (flags)
2718                *flags++ = '\0';        /* terminate mode string */
2719
2720        for (mode = 0; mode < MPOL_MAX; mode++) {
2721                if (!strcmp(str, policy_modes[mode])) {
2722                        break;
2723                }
2724        }
2725        if (mode >= MPOL_MAX)
2726                goto out;
2727
2728        switch (mode) {
2729        case MPOL_PREFERRED:
2730                /*
2731                 * Insist on a nodelist of one node only
2732                 */
2733                if (nodelist) {
2734                        char *rest = nodelist;
2735                        while (isdigit(*rest))
2736                                rest++;
2737                        if (*rest)
2738                                goto out;
2739                }
2740                break;
2741        case MPOL_INTERLEAVE:
2742                /*
2743                 * Default to online nodes with memory if no nodelist
2744                 */
2745                if (!nodelist)
2746                        nodes = node_states[N_MEMORY];
2747                break;
2748        case MPOL_LOCAL:
2749                /*
2750                 * Don't allow a nodelist;  mpol_new() checks flags
2751                 */
2752                if (nodelist)
2753                        goto out;
2754                mode = MPOL_PREFERRED;
2755                break;
2756        case MPOL_DEFAULT:
2757                /*
2758                 * Insist on an empty nodelist
2759                 */
2760                if (!nodelist)
2761                        err = 0;
2762                goto out;
2763        case MPOL_BIND:
2764                /*
2765                 * Insist on a nodelist
2766                 */
2767                if (!nodelist)
2768                        goto out;
2769        }
2770
2771        mode_flags = 0;
2772        if (flags) {
2773                /*
2774                 * Currently, we only support two mutually exclusive
2775                 * mode flags.
2776                 */
2777                if (!strcmp(flags, "static"))
2778                        mode_flags |= MPOL_F_STATIC_NODES;
2779                else if (!strcmp(flags, "relative"))
2780                        mode_flags |= MPOL_F_RELATIVE_NODES;
2781                else
2782                        goto out;
2783        }
2784
2785        new = mpol_new(mode, mode_flags, &nodes);
2786        if (IS_ERR(new))
2787                goto out;
2788
2789        /*
2790         * Save nodes for mpol_to_str() to show the tmpfs mount options
2791         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2792         */
2793        if (mode != MPOL_PREFERRED)
2794                new->v.nodes = nodes;
2795        else if (nodelist)
2796                new->v.preferred_node = first_node(nodes);
2797        else
2798                new->flags |= MPOL_F_LOCAL;
2799
2800        /*
2801         * Save nodes for contextualization: this will be used to "clone"
2802         * the mempolicy in a specific context [cpuset] at a later time.
2803         */
2804        new->w.user_nodemask = nodes;
2805
2806        err = 0;
2807
2808out:
2809        /* Restore string for error message */
2810        if (nodelist)
2811                *--nodelist = ':';
2812        if (flags)
2813                *--flags = '=';
2814        if (!err)
2815                *mpol = new;
2816        return err;
2817}
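/*
 * Illustrative examples of strings accepted above (as seen in the tmpfs
 * "mpol=" mount option) and the policies they produce, derived from the
 * cases in mpol_parse_str():
 *
 *	"interleave:0-3"  -> MPOL_INTERLEAVE over nodes 0-3
 *	"bind=static:0,2" -> MPOL_BIND | MPOL_F_STATIC_NODES over nodes {0,2}
 *	"prefer:1"        -> MPOL_PREFERRED with preferred_node == 1
 *	"local"           -> MPOL_PREFERRED | MPOL_F_LOCAL
 *	"default"         -> success with *mpol left NULL (system default)
 */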
2818#endif /* CONFIG_TMPFS */
2819
2820/**
2821 * mpol_to_str - format a mempolicy structure for printing
2822 * @buffer:  to contain formatted mempolicy string
2823 * @maxlen:  length of @buffer
2824 * @pol:  pointer to mempolicy to be formatted
2825 *
2826 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2827 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2828 * longest flag, "relative", and to display at least a few node ids.
2829 */
2830void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2831{
2832        char *p = buffer;
2833        nodemask_t nodes = NODE_MASK_NONE;
2834        unsigned short mode = MPOL_DEFAULT;
2835        unsigned short flags = 0;
2836
2837        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2838                mode = pol->mode;
2839                flags = pol->flags;
2840        }
2841
2842        switch (mode) {
2843        case MPOL_DEFAULT:
2844                break;
2845        case MPOL_PREFERRED:
2846                if (flags & MPOL_F_LOCAL)
2847                        mode = MPOL_LOCAL;
2848                else
2849                        node_set(pol->v.preferred_node, nodes);
2850                break;
2851        case MPOL_BIND:
2852        case MPOL_INTERLEAVE:
2853                nodes = pol->v.nodes;
2854                break;
2855        default:
2856                WARN_ON_ONCE(1);
2857                snprintf(p, maxlen, "unknown");
2858                return;
2859        }
2860
2861        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2862
2863        if (flags & MPOL_MODE_FLAGS) {
2864                p += snprintf(p, buffer + maxlen - p, "=");
2865
2866                /*
2867                 * Currently, the only defined flags are mutually exclusive
2868                 */
2869                if (flags & MPOL_F_STATIC_NODES)
2870                        p += snprintf(p, buffer + maxlen - p, "static");
2871                else if (flags & MPOL_F_RELATIVE_NODES)
2872                        p += snprintf(p, buffer + maxlen - p, "relative");
2873        }
2874
2875        if (!nodes_empty(nodes))
2876                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2877                               nodemask_pr_args(&nodes));
2878}
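/*
 * Illustrative usage sketch (buffer size is an assumption): formatting a
 * VMA's policy for display.  Example outputs, per the cases above:
 * "interleave:0-3", "bind=static:0,2", "prefer:1", "local", "default".
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), vma_policy(vma));
 */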
2879