linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints as to which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred      Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the node of the local CPU. This is normally identical
  31 *                to default, but useful to set in a VMA when you have a
  32 *                non-default process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
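
/*
 * As a rough illustration of the four policies above, a minimal userspace
 * sketch, assuming libnuma's <numaif.h> wrappers for the set_mempolicy()
 * and mbind() syscalls and a machine with at least two memory nodes:
 *
 *	#include <numaif.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		unsigned long both = 0x3;	// nodes 0 and 1
 *		unsigned long node0 = 0x1;	// node 0 only
 *		size_t len = 1 << 20;
 *		void *p = aligned_alloc(4096, len);
 *
 *		// interleave: future anonymous allocations of this process
 *		// are spread across nodes 0 and 1
 *		set_mempolicy(MPOL_INTERLEAVE, &both, 8 * sizeof(both));
 *
 *		// bind: this one mapping may only use node 0
 *		mbind(p, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 *
 *		// preferred: try node 0 first, normal fallback otherwise
 *		set_mempolicy(MPOL_PREFERRED, &node0, 8 * sizeof(node0));
 *
 *		// default: back to local allocation
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *		free(p);
 *		return 0;
 *	}
 */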
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always graceful about that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
 115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132        struct mempolicy *pol = p->mempolicy;
 133        int node;
 134
 135        if (pol)
 136                return pol;
 137
 138        node = numa_node_id();
 139        if (node != NUMA_NO_NODE) {
 140                pol = &preferred_node_policy[node];
 141                /* preferred_node_policy is not initialised early in boot */
 142                if (pol->mode)
 143                        return pol;
 144        }
 145
 146        return &default_policy;
 147}
 148
 149static const struct mempolicy_operations {
 150        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 151        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 152} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156        return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
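/*
 * Map a "relative" nodemask (MPOL_F_RELATIVE_NODES) onto the currently
 * allowed nodes: @orig is first folded to the weight of @rel, then bit n
 * of the result selects the n-th set bit of @rel.  For example,
 * orig = {0,2} with rel = {4,5,6} yields ret = {4,6}.
 */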
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                   const nodemask_t *rel)
 161{
 162        nodemask_t tmp;
 163        nodes_fold(tmp, *orig, nodes_weight(*rel));
 164        nodes_onto(*ret, tmp, *rel);
 165}
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169        if (nodes_empty(*nodes))
 170                return -EINVAL;
 171        pol->v.nodes = *nodes;
 172        return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!nodes)
 178                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179        else if (nodes_empty(*nodes))
 180                return -EINVAL;                 /*  no allowed nodes */
 181        else
 182                pol->v.preferred_node = first_node(*nodes);
 183        return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (nodes_empty(*nodes))
 189                return -EINVAL;
 190        pol->v.nodes = *nodes;
 191        return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
 201 * and mempolicy.  May also be called holding mmap_sem for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206        int ret;
 207
 208        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209        if (pol == NULL)
 210                return 0;
 211        /* Check N_MEMORY */
 212        nodes_and(nsc->mask1,
 213                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215        VM_BUG_ON(!nodes);
 216        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                nodes = NULL;   /* explicit local allocation */
 218        else {
 219                if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                else
 222                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                if (mpol_store_user_nodemask(pol))
 225                        pol->w.user_nodemask = *nodes;
 226                else
 227                        pol->w.cpuset_mems_allowed =
 228                                                cpuset_current_mems_allowed;
 229        }
 230
 231        if (nodes)
 232                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233        else
 234                ret = mpol_ops[pol->mode].create(pol, NULL);
 235        return ret;
 236}
 237
 238/*
 239 * This function just creates a new policy, does some checks and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                  nodemask_t *nodes)
 244{
 245        struct mempolicy *policy;
 246
 247        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250        if (mode == MPOL_DEFAULT) {
 251                if (nodes && !nodes_empty(*nodes))
 252                        return ERR_PTR(-EINVAL);
 253                return NULL;
 254        }
 255        VM_BUG_ON(!nodes);
 256
 257        /*
 258         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260         * All other modes require a valid pointer to a non-empty nodemask.
 261         */
 262        if (mode == MPOL_PREFERRED) {
 263                if (nodes_empty(*nodes)) {
 264                        if (((flags & MPOL_F_STATIC_NODES) ||
 265                             (flags & MPOL_F_RELATIVE_NODES)))
 266                                return ERR_PTR(-EINVAL);
 267                }
 268        } else if (mode == MPOL_LOCAL) {
 269                if (!nodes_empty(*nodes) ||
 270                    (flags & MPOL_F_STATIC_NODES) ||
 271                    (flags & MPOL_F_RELATIVE_NODES))
 272                        return ERR_PTR(-EINVAL);
 273                mode = MPOL_PREFERRED;
 274        } else if (nodes_empty(*nodes))
 275                return ERR_PTR(-EINVAL);
 276        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277        if (!policy)
 278                return ERR_PTR(-ENOMEM);
 279        atomic_set(&policy->refcnt, 1);
 280        policy->mode = mode;
 281        policy->flags = flags;
 282
 283        return policy;
 284}
 285
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289        if (!atomic_dec_and_test(&p->refcnt))
 290                return;
 291        kmem_cache_free(policy_cache, p);
 292}
 293
 294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 295{
 296}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300        nodemask_t tmp;
 301
 302        if (pol->flags & MPOL_F_STATIC_NODES)
 303                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306        else {
 307                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 308                                                                *nodes);
 309                pol->w.cpuset_mems_allowed = tmp;
 310        }
 311
 312        if (nodes_empty(tmp))
 313                tmp = *nodes;
 314
 315        pol->v.nodes = tmp;
 316}
 317
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                const nodemask_t *nodes)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES) {
 324                int node = first_node(pol->w.user_nodemask);
 325
 326                if (node_isset(node, *nodes)) {
 327                        pol->v.preferred_node = node;
 328                        pol->flags &= ~MPOL_F_LOCAL;
 329                } else
 330                        pol->flags |= MPOL_F_LOCAL;
 331        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                pol->v.preferred_node = first_node(tmp);
 334        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                   pol->w.cpuset_mems_allowed,
 337                                                   *nodes);
 338                pol->w.cpuset_mems_allowed = *nodes;
 339        }
 340}
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351        if (!pol)
 352                return;
 353        if (!mpol_store_user_nodemask(pol) &&
 354            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                return;
 356
 357        mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires a task
 362 * pointer, and rebinds the task's mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369        mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380        struct vm_area_struct *vma;
 381
 382        down_write(&mm->mmap_sem);
 383        for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                mpol_rebind_policy(vma->vm_policy, new);
 385        up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389        [MPOL_DEFAULT] = {
 390                .rebind = mpol_rebind_default,
 391        },
 392        [MPOL_INTERLEAVE] = {
 393                .create = mpol_new_interleave,
 394                .rebind = mpol_rebind_nodemask,
 395        },
 396        [MPOL_PREFERRED] = {
 397                .create = mpol_new_preferred,
 398                .rebind = mpol_rebind_preferred,
 399        },
 400        [MPOL_BIND] = {
 401                .create = mpol_new_bind,
 402                .rebind = mpol_rebind_nodemask,
 403        },
 404};
 405
 406static void migrate_page_add(struct page *page, struct list_head *pagelist,
 407                                unsigned long flags);
 408
 409struct queue_pages {
 410        struct list_head *pagelist;
 411        unsigned long flags;
 412        nodemask_t *nmask;
 413        struct vm_area_struct *prev;
 414};
 415
 416/*
 417 * Check if the page's nid is in qp->nmask.
 418 *
 419 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 420 * in the complement of qp->nmask.
 421 */
 422static inline bool queue_pages_required(struct page *page,
 423                                        struct queue_pages *qp)
 424{
 425        int nid = page_to_nid(page);
 426        unsigned long flags = qp->flags;
 427
 428        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429}
 430
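/*
 * Handle a pmd-mapped THP during the walk.  Returns 0 if the huge zero
 * page was split, in which case the caller goes on to scan the pte
 * range; returns non-zero if the pmd was dealt with here (a migration
 * entry, a page that does not need queueing, or one that was queued for
 * migration) and the pte scan should be skipped.  Called with the pmd
 * lock @ptl held; drops it before returning.
 */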
 431static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 432                                unsigned long end, struct mm_walk *walk)
 433{
 434        int ret = 0;
 435        struct page *page;
 436        struct queue_pages *qp = walk->private;
 437        unsigned long flags;
 438
 439        if (unlikely(is_pmd_migration_entry(*pmd))) {
 440                ret = 1;
 441                goto unlock;
 442        }
 443        page = pmd_page(*pmd);
 444        if (is_huge_zero_page(page)) {
 445                spin_unlock(ptl);
 446                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 447                goto out;
 448        }
 449        if (!queue_pages_required(page, qp)) {
 450                ret = 1;
 451                goto unlock;
 452        }
 453
 454        ret = 1;
 455        flags = qp->flags;
 456        /* go to thp migration */
 457        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 458                migrate_page_add(page, qp->pagelist, flags);
 459unlock:
 460        spin_unlock(ptl);
 461out:
 462        return ret;
 463}
 464
 465/*
 466 * Scan through pages, checking if they satisfy the required conditions,
 467 * and move them to the pagelist if they do.
 468 */
 469static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 470                        unsigned long end, struct mm_walk *walk)
 471{
 472        struct vm_area_struct *vma = walk->vma;
 473        struct page *page;
 474        struct queue_pages *qp = walk->private;
 475        unsigned long flags = qp->flags;
 476        int ret;
 477        pte_t *pte;
 478        spinlock_t *ptl;
 479
 480        ptl = pmd_trans_huge_lock(pmd, vma);
 481        if (ptl) {
 482                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 483                if (ret)
 484                        return 0;
 485        }
 486
 487        if (pmd_trans_unstable(pmd))
 488                return 0;
 489
 490        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 491        for (; addr != end; pte++, addr += PAGE_SIZE) {
 492                if (!pte_present(*pte))
 493                        continue;
 494                page = vm_normal_page(vma, addr, *pte);
 495                if (!page)
 496                        continue;
 497                /*
 498                 * vm_normal_page() filters out zero pages, but there might
 499                 * still be PageReserved pages to skip, perhaps in a VDSO.
 500                 */
 501                if (PageReserved(page))
 502                        continue;
 503                if (!queue_pages_required(page, qp))
 504                        continue;
 505                migrate_page_add(page, qp->pagelist, flags);
 506        }
 507        pte_unmap_unlock(pte - 1, ptl);
 508        cond_resched();
 509        return 0;
 510}
 511
 512static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 513                               unsigned long addr, unsigned long end,
 514                               struct mm_walk *walk)
 515{
 516#ifdef CONFIG_HUGETLB_PAGE
 517        struct queue_pages *qp = walk->private;
 518        unsigned long flags = qp->flags;
 519        struct page *page;
 520        spinlock_t *ptl;
 521        pte_t entry;
 522
 523        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 524        entry = huge_ptep_get(pte);
 525        if (!pte_present(entry))
 526                goto unlock;
 527        page = pte_page(entry);
 528        if (!queue_pages_required(page, qp))
 529                goto unlock;
 530        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 531        if (flags & (MPOL_MF_MOVE_ALL) ||
 532            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 533                isolate_huge_page(page, qp->pagelist);
 534unlock:
 535        spin_unlock(ptl);
 536#else
 537        BUG();
 538#endif
 539        return 0;
 540}
 541
 542#ifdef CONFIG_NUMA_BALANCING
 543/*
 544 * This is used to mark a range of virtual addresses as inaccessible.
 545 * These are later cleared by a NUMA hinting fault. Depending on these
 546 * faults, pages may be migrated for better NUMA placement.
 547 *
 548 * This is assuming that NUMA faults are handled using PROT_NONE. If
 549 * an architecture makes a different choice, it will need further
 550 * changes to the core.
 551 */
 552unsigned long change_prot_numa(struct vm_area_struct *vma,
 553                        unsigned long addr, unsigned long end)
 554{
 555        int nr_updated;
 556
 557        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 558        if (nr_updated)
 559                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 560
 561        return nr_updated;
 562}
 563#else
 564static unsigned long change_prot_numa(struct vm_area_struct *vma,
 565                        unsigned long addr, unsigned long end)
 566{
 567        return 0;
 568}
 569#endif /* CONFIG_NUMA_BALANCING */
 570
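/*
 * walk_page_range() ->test_walk() callback: decide whether a vma should
 * be scanned at all.  Returning 0 walks the vma's page tables, returning
 * 1 skips it (also used once MPOL_MF_LAZY has marked the range for NUMA
 * hinting faults), and a negative value aborts the whole walk.
 */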
 571static int queue_pages_test_walk(unsigned long start, unsigned long end,
 572                                struct mm_walk *walk)
 573{
 574        struct vm_area_struct *vma = walk->vma;
 575        struct queue_pages *qp = walk->private;
 576        unsigned long endvma = vma->vm_end;
 577        unsigned long flags = qp->flags;
 578
 579        if (!vma_migratable(vma))
 580                return 1;
 581
 582        if (endvma > end)
 583                endvma = end;
 584        if (vma->vm_start > start)
 585                start = vma->vm_start;
 586
 587        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 588                if (!vma->vm_next && vma->vm_end < end)
 589                        return -EFAULT;
 590                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 591                        return -EFAULT;
 592        }
 593
 594        qp->prev = vma;
 595
 596        if (flags & MPOL_MF_LAZY) {
 597                /* Similar to task_numa_work, skip inaccessible VMAs */
 598                if (!is_vm_hugetlb_page(vma) &&
 599                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 600                        !(vma->vm_flags & VM_MIXEDMAP))
 601                        change_prot_numa(vma, start, endvma);
 602                return 1;
 603        }
 604
 605        /* queue pages from current vma */
 606        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 607                return 0;
 608        return 1;
 609}
 610
 611/*
 612 * Walk through page tables and collect pages to be migrated.
 613 *
 614 * If pages found in a given range are on a set of nodes (determined by
 615 * @nodes and @flags), they are isolated and queued to the pagelist which is
 616 * passed via @private.
 617 */
 618static int
 619queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 620                nodemask_t *nodes, unsigned long flags,
 621                struct list_head *pagelist)
 622{
 623        struct queue_pages qp = {
 624                .pagelist = pagelist,
 625                .flags = flags,
 626                .nmask = nodes,
 627                .prev = NULL,
 628        };
 629        struct mm_walk queue_pages_walk = {
 630                .hugetlb_entry = queue_pages_hugetlb,
 631                .pmd_entry = queue_pages_pte_range,
 632                .test_walk = queue_pages_test_walk,
 633                .mm = mm,
 634                .private = &qp,
 635        };
 636
 637        return walk_page_range(start, end, &queue_pages_walk);
 638}
 639
 640/*
 641 * Apply policy to a single VMA
 642 * This must be called with the mmap_sem held for writing.
 643 */
 644static int vma_replace_policy(struct vm_area_struct *vma,
 645                                                struct mempolicy *pol)
 646{
 647        int err;
 648        struct mempolicy *old;
 649        struct mempolicy *new;
 650
 651        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 652                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 653                 vma->vm_ops, vma->vm_file,
 654                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 655
 656        new = mpol_dup(pol);
 657        if (IS_ERR(new))
 658                return PTR_ERR(new);
 659
 660        if (vma->vm_ops && vma->vm_ops->set_policy) {
 661                err = vma->vm_ops->set_policy(vma, new);
 662                if (err)
 663                        goto err_out;
 664        }
 665
 666        old = vma->vm_policy;
 667        vma->vm_policy = new; /* protected by mmap_sem */
 668        mpol_put(old);
 669
 670        return 0;
 671 err_out:
 672        mpol_put(new);
 673        return err;
 674}
 675
 676/* Step 2: apply policy to a range and do splits. */
 677static int mbind_range(struct mm_struct *mm, unsigned long start,
 678                       unsigned long end, struct mempolicy *new_pol)
 679{
 680        struct vm_area_struct *next;
 681        struct vm_area_struct *prev;
 682        struct vm_area_struct *vma;
 683        int err = 0;
 684        pgoff_t pgoff;
 685        unsigned long vmstart;
 686        unsigned long vmend;
 687
 688        vma = find_vma(mm, start);
 689        if (!vma || vma->vm_start > start)
 690                return -EFAULT;
 691
 692        prev = vma->vm_prev;
 693        if (start > vma->vm_start)
 694                prev = vma;
 695
 696        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 697                next = vma->vm_next;
 698                vmstart = max(start, vma->vm_start);
 699                vmend   = min(end, vma->vm_end);
 700
 701                if (mpol_equal(vma_policy(vma), new_pol))
 702                        continue;
 703
 704                pgoff = vma->vm_pgoff +
 705                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 706                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 707                                 vma->anon_vma, vma->vm_file, pgoff,
 708                                 new_pol, vma->vm_userfaultfd_ctx);
 709                if (prev) {
 710                        vma = prev;
 711                        next = vma->vm_next;
 712                        if (mpol_equal(vma_policy(vma), new_pol))
 713                                continue;
 714                        /* vma_merge() joined vma && vma->next, case 8 */
 715                        goto replace;
 716                }
 717                if (vma->vm_start != vmstart) {
 718                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 719                        if (err)
 720                                goto out;
 721                }
 722                if (vma->vm_end != vmend) {
 723                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 724                        if (err)
 725                                goto out;
 726                }
 727 replace:
 728                err = vma_replace_policy(vma, new_pol);
 729                if (err)
 730                        goto out;
 731        }
 732
 733 out:
 734        return err;
 735}
 736
 737/* Set the process memory policy */
 738static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 739                             nodemask_t *nodes)
 740{
 741        struct mempolicy *new, *old;
 742        NODEMASK_SCRATCH(scratch);
 743        int ret;
 744
 745        if (!scratch)
 746                return -ENOMEM;
 747
 748        new = mpol_new(mode, flags, nodes);
 749        if (IS_ERR(new)) {
 750                ret = PTR_ERR(new);
 751                goto out;
 752        }
 753
 754        task_lock(current);
 755        ret = mpol_set_nodemask(new, nodes, scratch);
 756        if (ret) {
 757                task_unlock(current);
 758                mpol_put(new);
 759                goto out;
 760        }
 761        old = current->mempolicy;
 762        current->mempolicy = new;
 763        if (new && new->mode == MPOL_INTERLEAVE)
 764                current->il_prev = MAX_NUMNODES-1;
 765        task_unlock(current);
 766        mpol_put(old);
 767        ret = 0;
 768out:
 769        NODEMASK_SCRATCH_FREE(scratch);
 770        return ret;
 771}
 772
 773/*
 774 * Return nodemask for policy for get_mempolicy() query
 775 *
 776 * Called with task's alloc_lock held
 777 */
 778static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 779{
 780        nodes_clear(*nodes);
 781        if (p == &default_policy)
 782                return;
 783
 784        switch (p->mode) {
 785        case MPOL_BIND:
 786                /* Fall through */
 787        case MPOL_INTERLEAVE:
 788                *nodes = p->v.nodes;
 789                break;
 790        case MPOL_PREFERRED:
 791                if (!(p->flags & MPOL_F_LOCAL))
 792                        node_set(p->v.preferred_node, *nodes);
 793                /* else return empty node mask for local allocation */
 794                break;
 795        default:
 796                BUG();
 797        }
 798}
 799
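/*
 * Return the node id of the page backing @addr, or a negative errno if
 * it could not be pinned.  get_user_pages_locked() may drop mmap_sem;
 * if it is still held afterwards, drop it here, so the caller always
 * returns without mmap_sem held.
 */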
 800static int lookup_node(struct mm_struct *mm, unsigned long addr)
 801{
 802        struct page *p;
 803        int err;
 804
 805        int locked = 1;
 806        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 807        if (err >= 0) {
 808                err = page_to_nid(p);
 809                put_page(p);
 810        }
 811        if (locked)
 812                up_read(&mm->mmap_sem);
 813        return err;
 814}
 815
 816/* Retrieve NUMA policy */
 817static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 818                             unsigned long addr, unsigned long flags)
 819{
 820        int err;
 821        struct mm_struct *mm = current->mm;
 822        struct vm_area_struct *vma = NULL;
 823        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 824
 825        if (flags &
 826                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 827                return -EINVAL;
 828
 829        if (flags & MPOL_F_MEMS_ALLOWED) {
 830                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 831                        return -EINVAL;
 832                *policy = 0;    /* just so it's initialized */
 833                task_lock(current);
 834                *nmask  = cpuset_current_mems_allowed;
 835                task_unlock(current);
 836                return 0;
 837        }
 838
 839        if (flags & MPOL_F_ADDR) {
 840                /*
 841                 * Do NOT fall back to task policy if the
 842                 * vma/shared policy at addr is NULL.  We
 843                 * want to return MPOL_DEFAULT in this case.
 844                 */
 845                down_read(&mm->mmap_sem);
 846                vma = find_vma_intersection(mm, addr, addr+1);
 847                if (!vma) {
 848                        up_read(&mm->mmap_sem);
 849                        return -EFAULT;
 850                }
 851                if (vma->vm_ops && vma->vm_ops->get_policy)
 852                        pol = vma->vm_ops->get_policy(vma, addr);
 853                else
 854                        pol = vma->vm_policy;
 855        } else if (addr)
 856                return -EINVAL;
 857
 858        if (!pol)
 859                pol = &default_policy;  /* indicates default behavior */
 860
 861        if (flags & MPOL_F_NODE) {
 862                if (flags & MPOL_F_ADDR) {
 863                        /*
 864                         * Take a refcount on the mpol, lookup_node()
 865                         * will drop the mmap_sem, so after calling
 866                         * lookup_node() only "pol" remains valid, "vma"
 867                         * is stale.
 868                         */
 869                        pol_refcount = pol;
 870                        vma = NULL;
 871                        mpol_get(pol);
 872                        err = lookup_node(mm, addr);
 873                        if (err < 0)
 874                                goto out;
 875                        *policy = err;
 876                } else if (pol == current->mempolicy &&
 877                                pol->mode == MPOL_INTERLEAVE) {
 878                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 879                } else {
 880                        err = -EINVAL;
 881                        goto out;
 882                }
 883        } else {
 884                *policy = pol == &default_policy ? MPOL_DEFAULT :
 885                                                pol->mode;
 886                /*
 887                 * Internal mempolicy flags must be masked off before exposing
 888                 * the policy to userspace.
 889                 */
 890                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 891        }
 892
 893        err = 0;
 894        if (nmask) {
 895                if (mpol_store_user_nodemask(pol)) {
 896                        *nmask = pol->w.user_nodemask;
 897                } else {
 898                        task_lock(current);
 899                        get_policy_nodemask(pol, nmask);
 900                        task_unlock(current);
 901                }
 902        }
 903
 904 out:
 905        mpol_cond_put(pol);
 906        if (vma)
 907                up_read(&mm->mmap_sem);
 908        if (pol_refcount)
 909                mpol_put(pol_refcount);
 910        return err;
 911}
 912
 913#ifdef CONFIG_MIGRATION
 914/*
 915 * page migration, thp tail pages can be passed.
 916 */
 917static void migrate_page_add(struct page *page, struct list_head *pagelist,
 918                                unsigned long flags)
 919{
 920        struct page *head = compound_head(page);
 921        /*
 922         * Avoid migrating a page that is shared with others.
 923         */
 924        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 925                if (!isolate_lru_page(head)) {
 926                        list_add_tail(&head->lru, pagelist);
 927                        mod_node_page_state(page_pgdat(head),
 928                                NR_ISOLATED_ANON + page_is_file_cache(head),
 929                                hpage_nr_pages(head));
 930                }
 931        }
 932}
 933
 934/* page allocation callback for NUMA node migration */
 935struct page *alloc_new_node_page(struct page *page, unsigned long node)
 936{
 937        if (PageHuge(page))
 938                return alloc_huge_page_node(page_hstate(compound_head(page)),
 939                                        node);
 940        else if (PageTransHuge(page)) {
 941                struct page *thp;
 942
 943                thp = alloc_pages_node(node,
 944                        (GFP_TRANSHUGE | __GFP_THISNODE),
 945                        HPAGE_PMD_ORDER);
 946                if (!thp)
 947                        return NULL;
 948                prep_transhuge_page(thp);
 949                return thp;
 950        } else
 951                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 952                                                    __GFP_THISNODE, 0);
 953}
 954
 955/*
 956 * Migrate pages from one node to a target node.
 957 * Returns error or the number of pages not migrated.
 958 */
 959static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 960                           int flags)
 961{
 962        nodemask_t nmask;
 963        LIST_HEAD(pagelist);
 964        int err = 0;
 965
 966        nodes_clear(nmask);
 967        node_set(source, nmask);
 968
 969        /*
 970         * This does not "check" the range but isolates all pages that
 971         * need migration.  Between passing in the full user address
 972         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
 973         */
 974        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
 975        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 976                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 977
 978        if (!list_empty(&pagelist)) {
 979                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
 980                                        MIGRATE_SYNC, MR_SYSCALL);
 981                if (err)
 982                        putback_movable_pages(&pagelist);
 983        }
 984
 985        return err;
 986}
 987
 988/*
 989 * Move pages between the two nodesets so as to preserve the physical
 990 * layout as much as possible.
 991 *
 992 * Returns the number of pages that could not be moved.
 993 */
 994int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 995                     const nodemask_t *to, int flags)
 996{
 997        int busy = 0;
 998        int err;
 999        nodemask_t tmp;
1000
1001        err = migrate_prep();
1002        if (err)
1003                return err;
1004
1005        down_read(&mm->mmap_sem);
1006
1007        /*
1008         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1009         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1010         * bit in 'tmp', and return that <source, dest> pair for migration.
1011         * The pair of nodemasks 'to' and 'from' define the map.
1012         *
1013         * If no pair of bits is found that way, fallback to picking some
1014         * pair of 'source' and 'dest' bits that are not the same.  If the
1015         * 'source' and 'dest' bits are the same, this represents a node
1016         * that will be migrating to itself, so no pages need to move.
1017         *
1018         * If no bits are left in 'tmp', or if all remaining bits left
1019         * in 'tmp' correspond to the same bit in 'to', return false
1020         * (nothing left to migrate).
1021         *
1022         * This lets us pick a pair of nodes to migrate between, such that
1023         * if possible the dest node is not already occupied by some other
1024         * source node, minimizing the risk of overloading the memory on a
1025         * node that would happen if we migrated incoming memory to a node
1026         * before migrating outgoing memory sourced from that same node.
1027         *
1028         * A single scan of tmp is sufficient.  As we go, we remember the
1029         * most recent <s, d> pair that moved (s != d).  If we find a pair
1030         * that not only moved, but what's better, moved to an empty slot
1031         * (d is not set in tmp), then we break out then, with that pair.
1032         * Otherwise when we finish scanning tmp, we at least have the
1033         * most recent <s, d> pair that moved.  If we get all the way through
1034         * the scan of tmp without finding any node that moved, much less
1035         * moved to an empty node, then there is nothing left worth migrating.
1036         */
1037
1038        tmp = *from;
1039        while (!nodes_empty(tmp)) {
1040                int s, d;
1041                int source = NUMA_NO_NODE;
1042                int dest = 0;
1043
1044                for_each_node_mask(s, tmp) {
1045
1046                        /*
1047                         * do_migrate_pages() tries to maintain the relative
1048                         * node relationship of the pages established between
1049                         * threads and memory areas.
1050                         *
1051                         * However if the number of source nodes is not equal to
1052                         * the number of destination nodes we can not preserve
1053                         * this node relative relationship.  In that case, skip
1054                         * copying memory from a node that is in the destination
1055                         * mask.
1056                         *
1057                         * Example: [2,3,4] -> [3,4,5] moves everything.
1058                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1059                         */
1060
1061                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1062                                                (node_isset(s, *to)))
1063                                continue;
1064
1065                        d = node_remap(s, *from, *to);
1066                        if (s == d)
1067                                continue;
1068
1069                        source = s;     /* Node moved. Memorize */
1070                        dest = d;
1071
1072                        /* dest not in remaining from nodes? */
1073                        if (!node_isset(dest, tmp))
1074                                break;
1075                }
1076                if (source == NUMA_NO_NODE)
1077                        break;
1078
1079                node_clear(source, tmp);
1080                err = migrate_to_node(mm, source, dest, flags);
1081                if (err > 0)
1082                        busy += err;
1083                if (err < 0)
1084                        break;
1085        }
1086        up_read(&mm->mmap_sem);
1087        if (err < 0)
1088                return err;
1089        return busy;
1090
1091}
1092
1093/*
1094 * Allocate a new page for page migration based on vma policy.
1095 * Start by assuming the page is mapped by the same vma that contains @start.
1096 * Search forward from there, if not.  N.B., this assumes that the
1097 * list of pages handed to migrate_pages()--which is how we get here--
1098 * is in virtual address order.
1099 */
1100static struct page *new_page(struct page *page, unsigned long start)
1101{
1102        struct vm_area_struct *vma;
1103        unsigned long uninitialized_var(address);
1104
1105        vma = find_vma(current->mm, start);
1106        while (vma) {
1107                address = page_address_in_vma(page, vma);
1108                if (address != -EFAULT)
1109                        break;
1110                vma = vma->vm_next;
1111        }
1112
1113        if (PageHuge(page)) {
1114                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1115                                vma, address);
1116        } else if (PageTransHuge(page)) {
1117                struct page *thp;
1118
1119                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1120                                         HPAGE_PMD_ORDER);
1121                if (!thp)
1122                        return NULL;
1123                prep_transhuge_page(thp);
1124                return thp;
1125        }
1126        /*
1127         * if !vma, alloc_page_vma() will use task or system default policy
1128         */
1129        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1130                        vma, address);
1131}
1132#else
1133
1134static void migrate_page_add(struct page *page, struct list_head *pagelist,
1135                                unsigned long flags)
1136{
1137}
1138
1139int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1140                     const nodemask_t *to, int flags)
1141{
1142        return -ENOSYS;
1143}
1144
1145static struct page *new_page(struct page *page, unsigned long start)
1146{
1147        return NULL;
1148}
1149#endif
1150
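/*
 * Implements the guts of mbind(2): validate the flags and range, build
 * the new mempolicy, walk [start, end) queueing any mapped pages whose
 * node violates it (when MPOL_MF_MOVE or MPOL_MF_MOVE_ALL is set),
 * apply the policy to the vmas in the range, splitting or merging them
 * as needed, and finally migrate the queued pages.
 */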
1151static long do_mbind(unsigned long start, unsigned long len,
1152                     unsigned short mode, unsigned short mode_flags,
1153                     nodemask_t *nmask, unsigned long flags)
1154{
1155        struct mm_struct *mm = current->mm;
1156        struct mempolicy *new;
1157        unsigned long end;
1158        int err;
1159        LIST_HEAD(pagelist);
1160
1161        if (flags & ~(unsigned long)MPOL_MF_VALID)
1162                return -EINVAL;
1163        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1164                return -EPERM;
1165
1166        if (start & ~PAGE_MASK)
1167                return -EINVAL;
1168
1169        if (mode == MPOL_DEFAULT)
1170                flags &= ~MPOL_MF_STRICT;
1171
1172        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1173        end = start + len;
1174
1175        if (end < start)
1176                return -EINVAL;
1177        if (end == start)
1178                return 0;
1179
1180        new = mpol_new(mode, mode_flags, nmask);
1181        if (IS_ERR(new))
1182                return PTR_ERR(new);
1183
1184        if (flags & MPOL_MF_LAZY)
1185                new->flags |= MPOL_F_MOF;
1186
1187        /*
1188         * If we are using the default policy then operation
1189         * on discontinuous address spaces is okay after all
1190         */
1191        if (!new)
1192                flags |= MPOL_MF_DISCONTIG_OK;
1193
1194        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1195                 start, start + len, mode, mode_flags,
1196                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1197
1198        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1199
1200                err = migrate_prep();
1201                if (err)
1202                        goto mpol_out;
1203        }
1204        {
1205                NODEMASK_SCRATCH(scratch);
1206                if (scratch) {
1207                        down_write(&mm->mmap_sem);
1208                        task_lock(current);
1209                        err = mpol_set_nodemask(new, nmask, scratch);
1210                        task_unlock(current);
1211                        if (err)
1212                                up_write(&mm->mmap_sem);
1213                } else
1214                        err = -ENOMEM;
1215                NODEMASK_SCRATCH_FREE(scratch);
1216        }
1217        if (err)
1218                goto mpol_out;
1219
1220        err = queue_pages_range(mm, start, end, nmask,
1221                          flags | MPOL_MF_INVERT, &pagelist);
1222        if (!err)
1223                err = mbind_range(mm, start, end, new);
1224
1225        if (!err) {
1226                int nr_failed = 0;
1227
1228                if (!list_empty(&pagelist)) {
1229                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1230                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1231                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1232                        if (nr_failed)
1233                                putback_movable_pages(&pagelist);
1234                }
1235
1236                if (nr_failed && (flags & MPOL_MF_STRICT))
1237                        err = -EIO;
1238        } else
1239                putback_movable_pages(&pagelist);
1240
1241        up_write(&mm->mmap_sem);
1242 mpol_out:
1243        mpol_put(new);
1244        return err;
1245}
1246
1247/*
1248 * User space interface with variable sized bitmaps for nodelists.
1249 */
1250
1251/* Copy a node mask from user space. */
1252static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1253                     unsigned long maxnode)
1254{
1255        unsigned long k;
1256        unsigned long t;
1257        unsigned long nlongs;
1258        unsigned long endmask;
1259
1260        --maxnode;
1261        nodes_clear(*nodes);
1262        if (maxnode == 0 || !nmask)
1263                return 0;
1264        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1265                return -EINVAL;
1266
1267        nlongs = BITS_TO_LONGS(maxnode);
1268        if ((maxnode % BITS_PER_LONG) == 0)
1269                endmask = ~0UL;
1270        else
1271                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1272
1273        /*
1274         * When the user specifies more nodes than supported, just check
1275         * that the unsupported part is all zero.
1276         *
1277         * If maxnode spans more longs than MAX_NUMNODES, check the bits
1278         * in that area first, and then go on to check the remaining bits
1279         * at or above MAX_NUMNODES.  Otherwise, just check the bits in
1280         * [MAX_NUMNODES, maxnode).
1281         */
1282        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1283                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1284                        if (get_user(t, nmask + k))
1285                                return -EFAULT;
1286                        if (k == nlongs - 1) {
1287                                if (t & endmask)
1288                                        return -EINVAL;
1289                        } else if (t)
1290                                return -EINVAL;
1291                }
1292                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1293                endmask = ~0UL;
1294        }
1295
1296        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1297                unsigned long valid_mask = endmask;
1298
1299                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1300                if (get_user(t, nmask + nlongs - 1))
1301                        return -EFAULT;
1302                if (t & valid_mask)
1303                        return -EINVAL;
1304        }
1305
1306        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1307                return -EFAULT;
1308        nodes_addr(*nodes)[nlongs-1] &= endmask;
1309        return 0;
1310}
1311
1312/* Copy a kernel node mask to user space */
1313static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1314                              nodemask_t *nodes)
1315{
1316        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1317        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1318
1319        if (copy > nbytes) {
1320                if (copy > PAGE_SIZE)
1321                        return -EINVAL;
1322                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1323                        return -EFAULT;
1324                copy = nbytes;
1325        }
1326        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1327}
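
/*
 * From userspace, the mask is just an array of unsigned longs with one
 * bit per node, and maxnode tells the kernel how many bits to look at.
 * A sketch of building such a mask for the raw syscalls (the
 * node_mask_set() helper and MASK_LONGS size are hypothetical, mbind()
 * is libnuma's <numaif.h> wrapper):
 *
 *	#include <limits.h>
 *	#include <numaif.h>
 *
 *	#define MASK_LONGS	8	// room for 8 * bits-per-long nodes
 *
 *	static void node_mask_set(unsigned long *mask, int node)
 *	{
 *		mask[node / (CHAR_BIT * sizeof(long))] |=
 *			1UL << (node % (CHAR_BIT * sizeof(long)));
 *	}
 *
 *	// usage:
 *	//	unsigned long mask[MASK_LONGS] = { 0 };
 *	//	node_mask_set(mask, 0);
 *	//	node_mask_set(mask, 2);
 *	//	mbind(addr, len, MPOL_BIND, mask,
 *	//	      MASK_LONGS * CHAR_BIT * sizeof(long), 0);
 */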
1328
1329static long kernel_mbind(unsigned long start, unsigned long len,
1330                         unsigned long mode, const unsigned long __user *nmask,
1331                         unsigned long maxnode, unsigned int flags)
1332{
1333        nodemask_t nodes;
1334        int err;
1335        unsigned short mode_flags;
1336
1337        mode_flags = mode & MPOL_MODE_FLAGS;
1338        mode &= ~MPOL_MODE_FLAGS;
1339        if (mode >= MPOL_MAX)
1340                return -EINVAL;
1341        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1342            (mode_flags & MPOL_F_RELATIVE_NODES))
1343                return -EINVAL;
1344        err = get_nodes(&nodes, nmask, maxnode);
1345        if (err)
1346                return err;
1347        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1348}
1349
1350SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1351                unsigned long, mode, const unsigned long __user *, nmask,
1352                unsigned long, maxnode, unsigned int, flags)
1353{
1354        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1355}
1356
1357/* Set the process memory policy */
1358static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1359                                 unsigned long maxnode)
1360{
1361        int err;
1362        nodemask_t nodes;
1363        unsigned short flags;
1364
1365        flags = mode & MPOL_MODE_FLAGS;
1366        mode &= ~MPOL_MODE_FLAGS;
1367        if ((unsigned int)mode >= MPOL_MAX)
1368                return -EINVAL;
1369        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1370                return -EINVAL;
1371        err = get_nodes(&nodes, nmask, maxnode);
1372        if (err)
1373                return err;
1374        return do_set_mempolicy(mode, flags, &nodes);
1375}
1376
1377SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1378                unsigned long, maxnode)
1379{
1380        return kernel_set_mempolicy(mode, nmask, maxnode);
1381}
1382
1383static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1384                                const unsigned long __user *old_nodes,
1385                                const unsigned long __user *new_nodes)
1386{
1387        struct mm_struct *mm = NULL;
1388        struct task_struct *task;
1389        nodemask_t task_nodes;
1390        int err;
1391        nodemask_t *old;
1392        nodemask_t *new;
1393        NODEMASK_SCRATCH(scratch);
1394
1395        if (!scratch)
1396                return -ENOMEM;
1397
1398        old = &scratch->mask1;
1399        new = &scratch->mask2;
1400
1401        err = get_nodes(old, old_nodes, maxnode);
1402        if (err)
1403                goto out;
1404
1405        err = get_nodes(new, new_nodes, maxnode);
1406        if (err)
1407                goto out;
1408
1409        /* Find the mm_struct */
1410        rcu_read_lock();
1411        task = pid ? find_task_by_vpid(pid) : current;
1412        if (!task) {
1413                rcu_read_unlock();
1414                err = -ESRCH;
1415                goto out;
1416        }
1417        get_task_struct(task);
1418
1419        err = -EINVAL;
1420
1421        /*
1422         * Check if this process has the right to modify the specified process.
1423         * Use the regular "ptrace_may_access()" checks.
1424         */
1425        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1426                rcu_read_unlock();
1427                err = -EPERM;
1428                goto out_put;
1429        }
1430        rcu_read_unlock();
1431
1432        task_nodes = cpuset_mems_allowed(task);
1433        /* Is the user allowed to access the target nodes? */
1434        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1435                err = -EPERM;
1436                goto out_put;
1437        }
1438
1439        task_nodes = cpuset_mems_allowed(current);
1440        nodes_and(*new, *new, task_nodes);
1441        if (nodes_empty(*new))
1442                goto out_put;
1443
1444        nodes_and(*new, *new, node_states[N_MEMORY]);
1445        if (nodes_empty(*new))
1446                goto out_put;
1447
1448        err = security_task_movememory(task);
1449        if (err)
1450                goto out_put;
1451
1452        mm = get_task_mm(task);
1453        put_task_struct(task);
1454
1455        if (!mm) {
1456                err = -EINVAL;
1457                goto out;
1458        }
1459
1460        err = do_migrate_pages(mm, old, new,
1461                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1462
1463        mmput(mm);
1464out:
1465        NODEMASK_SCRATCH_FREE(scratch);
1466
1467        return err;
1468
1469out_put:
1470        put_task_struct(task);
1471        goto out;
1472
1473}
1474
1475SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1476                const unsigned long __user *, old_nodes,
1477                const unsigned long __user *, new_nodes)
1478{
1479        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1480}
1481
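/*
 * A minimal sketch of the matching userspace call, assuming libnuma's
 * <numaif.h> wrapper and a target 'pid' of interest: move whatever that
 * process has on node 0 over to node 1.
 *
 *	#include <numaif.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *	long ret = migrate_pages(pid, 8 * sizeof(unsigned long),
 *				 &old_nodes, &new_nodes);
 *	// a negative return is an error; a positive return is the
 *	// number of pages that could not be moved
 */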
1482
1483/* Retrieve NUMA policy */
1484static int kernel_get_mempolicy(int __user *policy,
1485                                unsigned long __user *nmask,
1486                                unsigned long maxnode,
1487                                unsigned long addr,
1488                                unsigned long flags)
1489{
1490        int err;
1491        int uninitialized_var(pval);
1492        nodemask_t nodes;
1493
1494        if (nmask != NULL && maxnode < MAX_NUMNODES)
1495                return -EINVAL;
1496
1497        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1498
1499        if (err)
1500                return err;
1501
1502        if (policy && put_user(pval, policy))
1503                return -EFAULT;
1504
1505        if (nmask)
1506                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1507
1508        return err;
1509}
1510
1511SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1512                unsigned long __user *, nmask, unsigned long, maxnode,
1513                unsigned long, addr, unsigned long, flags)
1514{
1515        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1516}
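
/*
 * A small sketch of the MPOL_F_NODE | MPOL_F_ADDR query from userspace,
 * assuming libnuma's <numaif.h> wrapper and that 'addr' is a valid
 * mapping in the calling process: ask which node backs that page.
 *
 *	#include <numaif.h>
 *
 *	int node = -1;
 *	long rc = get_mempolicy(&node, NULL, 0, addr,
 *				MPOL_F_NODE | MPOL_F_ADDR);
 *	// on success (rc == 0), 'node' holds the node id of the page
 *	// backing 'addr'
 */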
1517
1518#ifdef CONFIG_COMPAT
1519
1520COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1521                       compat_ulong_t __user *, nmask,
1522                       compat_ulong_t, maxnode,
1523                       compat_ulong_t, addr, compat_ulong_t, flags)
1524{
1525        long err;
1526        unsigned long __user *nm = NULL;
1527        unsigned long nr_bits, alloc_size;
1528        DECLARE_BITMAP(bm, MAX_NUMNODES);
1529
1530        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1531        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1532
1533        if (nmask)
1534                nm = compat_alloc_user_space(alloc_size);
1535
1536        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1537
1538        if (!err && nmask) {
1539                unsigned long copy_size;
1540                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1541                err = copy_from_user(bm, nm, copy_size);
1542                /* ensure entire bitmap is zeroed */
1543                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1544                err |= compat_put_bitmap(nmask, bm, nr_bits);
1545        }
1546
1547        return err;
1548}
1549
1550COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1551                       compat_ulong_t, maxnode)
1552{
1553        unsigned long __user *nm = NULL;
1554        unsigned long nr_bits, alloc_size;
1555        DECLARE_BITMAP(bm, MAX_NUMNODES);
1556
1557        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1558        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1559
1560        if (nmask) {
1561                if (compat_get_bitmap(bm, nmask, nr_bits))
1562                        return -EFAULT;
1563                nm = compat_alloc_user_space(alloc_size);
1564                if (copy_to_user(nm, bm, alloc_size))
1565                        return -EFAULT;
1566        }
1567
1568        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1569}
1570
1571COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1572                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1573                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1574{
1575        unsigned long __user *nm = NULL;
1576        unsigned long nr_bits, alloc_size;
1577        nodemask_t bm;
1578
1579        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1580        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1581
1582        if (nmask) {
1583                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1584                        return -EFAULT;
1585                nm = compat_alloc_user_space(alloc_size);
1586                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1587                        return -EFAULT;
1588        }
1589
1590        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1591}
1592
1593COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1594                       compat_ulong_t, maxnode,
1595                       const compat_ulong_t __user *, old_nodes,
1596                       const compat_ulong_t __user *, new_nodes)
1597{
1598        unsigned long __user *old = NULL;
1599        unsigned long __user *new = NULL;
1600        nodemask_t tmp_mask;
1601        unsigned long nr_bits;
1602        unsigned long size;
1603
1604        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1605        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1606        if (old_nodes) {
1607                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1608                        return -EFAULT;
1609                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1610                if (new_nodes)
1611                        new = old + size / sizeof(unsigned long);
1612                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1613                        return -EFAULT;
1614        }
1615        if (new_nodes) {
1616                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1617                        return -EFAULT;
1618                if (new == NULL)
1619                        new = compat_alloc_user_space(size);
1620                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1621                        return -EFAULT;
1622        }
1623        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1624}
1625
1626#endif /* CONFIG_COMPAT */
1627
1628struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1629                                                unsigned long addr)
1630{
1631        struct mempolicy *pol = NULL;
1632
1633        if (vma) {
1634                if (vma->vm_ops && vma->vm_ops->get_policy) {
1635                        pol = vma->vm_ops->get_policy(vma, addr);
1636                } else if (vma->vm_policy) {
1637                        pol = vma->vm_policy;
1638
1639                        /*
1640                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1641                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1642                         * count on these policies, which will be dropped by
1643                         * mpol_cond_put() later.
1644                         */
1645                        if (mpol_needs_cond_ref(pol))
1646                                mpol_get(pol);
1647                }
1648        }
1649
1650        return pol;
1651}
1652
1653/*
1654 * get_vma_policy(@vma, @addr)
1655 * @vma: virtual memory area whose policy is sought
1656 * @addr: address in @vma for shared policy lookup
1657 *
1658 * Returns effective policy for a VMA at specified address.
1659 * Falls back to current->mempolicy or system default policy, as necessary.
1660 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1661 * count--added by the get_policy() vm_op, as appropriate--to protect against
1662 * freeing by another task.  It is the caller's responsibility to free the
1663 * extra reference for shared policies.
1664 */
1665static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1666                                                unsigned long addr)
1667{
1668        struct mempolicy *pol = __get_vma_policy(vma, addr);
1669
1670        if (!pol)
1671                pol = get_task_policy(current);
1672
1673        return pol;
1674}
1675
1676bool vma_policy_mof(struct vm_area_struct *vma)
1677{
1678        struct mempolicy *pol;
1679
1680        if (vma->vm_ops && vma->vm_ops->get_policy) {
1681                bool ret = false;
1682
1683                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1684                if (pol && (pol->flags & MPOL_F_MOF))
1685                        ret = true;
1686                mpol_cond_put(pol);
1687
1688                return ret;
1689        }
1690
1691        pol = vma->vm_policy;
1692        if (!pol)
1693                pol = get_task_policy(current);
1694
1695        return pol->flags & MPOL_F_MOF;
1696}
1697
1698static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1699{
1700        enum zone_type dynamic_policy_zone = policy_zone;
1701
1702        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1703
1704        /*
1705         * if policy->v.nodes has movable memory only,
1706         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1707         *
1708         * policy->v.nodes is intersected with node_states[N_MEMORY],
1709         * so if the following test fails, it implies that
1710         * policy->v.nodes has movable memory only.
1711         */
1712        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1713                dynamic_policy_zone = ZONE_MOVABLE;
1714
1715        return zone >= dynamic_policy_zone;
1716}
1717
1718/*
1719 * Return a nodemask representing a mempolicy for filtering nodes for
1720 * page allocation
1721 */
1722static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1723{
1724        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1725        if (unlikely(policy->mode == MPOL_BIND) &&
1726                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1727                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1728                return &policy->v.nodes;
1729
1730        return NULL;
1731}
1732
1733/* Return the node id preferred by the given mempolicy, or the given id */
1734static int policy_node(gfp_t gfp, struct mempolicy *policy,
1735                                                                int nd)
1736{
1737        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1738                nd = policy->v.preferred_node;
1739        else {
1740                /*
1741                 * __GFP_THISNODE shouldn't even be used with the bind policy
1742                 * because we might easily break the expectation to stay on the
1743                 * requested node and not break the policy.
1744                 */
1745                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1746        }
1747
1748        return nd;
1749}
1750
1751/* Do dynamic interleaving for a process */
1752static unsigned interleave_nodes(struct mempolicy *policy)
1753{
1754        unsigned next;
1755        struct task_struct *me = current;
1756
1757        next = next_node_in(me->il_prev, policy->v.nodes);
1758        if (next < MAX_NUMNODES)
1759                me->il_prev = next;
1760        return next;
1761}
1762
1763/*
1764 * Depending on the memory policy provide a node from which to allocate the
1765 * next slab entry.
1766 */
1767unsigned int mempolicy_slab_node(void)
1768{
1769        struct mempolicy *policy;
1770        int node = numa_mem_id();
1771
1772        if (in_interrupt())
1773                return node;
1774
1775        policy = current->mempolicy;
1776        if (!policy || policy->flags & MPOL_F_LOCAL)
1777                return node;
1778
1779        switch (policy->mode) {
1780        case MPOL_PREFERRED:
1781                /*
1782                 * handled MPOL_F_LOCAL above
1783                 */
1784                return policy->v.preferred_node;
1785
1786        case MPOL_INTERLEAVE:
1787                return interleave_nodes(policy);
1788
1789        case MPOL_BIND: {
1790                struct zoneref *z;
1791
1792                /*
1793                 * Follow bind policy behavior and start allocation at the
1794                 * first node.
1795                 */
1796                struct zonelist *zonelist;
1797                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1798                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1799                z = first_zones_zonelist(zonelist, highest_zoneidx,
1800                                                        &policy->v.nodes);
1801                return z->zone ? zone_to_nid(z->zone) : node;
1802        }
1803
1804        default:
1805                BUG();
1806        }
1807}
1808
1809/*
1810 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1811 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1812 * number of present nodes.
1813 */
1814static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1815{
1816        unsigned nnodes = nodes_weight(pol->v.nodes);
1817        unsigned target;
1818        int i;
1819        int nid;
1820
1821        if (!nnodes)
1822                return numa_node_id();
1823        target = (unsigned int)n % nnodes;
1824        nid = first_node(pol->v.nodes);
1825        for (i = 0; i < target; i++)
1826                nid = next_node(nid, pol->v.nodes);
1827        return nid;
1828}
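/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5}, nnodes is 3.
 * For n = 7, target = 7 % 3 = 1, so the walk starts at first_node() = 0 and
 * takes one next_node() step, returning node 2.  Offsets 0, 3, 6, ... thus
 * map to node 0; 1, 4, 7, ... to node 2; and 2, 5, 8, ... to node 5.
 */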
1829
1830/* Determine a node number for interleave */
1831static inline unsigned interleave_nid(struct mempolicy *pol,
1832                 struct vm_area_struct *vma, unsigned long addr, int shift)
1833{
1834        if (vma) {
1835                unsigned long off;
1836
1837                /*
1838                 * for small pages, there is no difference between
1839                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1840                 * for huge pages, since vm_pgoff is in units of small
1841                 * pages, we need to shift off the always 0 bits to get
1842                 * a useful offset.
1843                 */
1844                BUG_ON(shift < PAGE_SHIFT);
1845                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1846                off += (addr - vma->vm_start) >> shift;
1847                return offset_il_node(pol, off);
1848        } else
1849                return interleave_nodes(pol);
1850}
1851
1852#ifdef CONFIG_HUGETLBFS
1853/*
1854 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1855 * @vma: virtual memory area whose policy is sought
1856 * @addr: address in @vma for shared policy lookup and interleave policy
1857 * @gfp_flags: for requested zone
1858 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1859 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1860 *
1861 * Returns a nid suitable for a huge page allocation and a pointer
1862 * to the struct mempolicy for conditional unref after allocation.
1863 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1864 * @nodemask for filtering the zonelist.
1865 *
1866 * Must be protected by read_mems_allowed_begin()
1867 */
1868int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1869                                struct mempolicy **mpol, nodemask_t **nodemask)
1870{
1871        int nid;
1872
1873        *mpol = get_vma_policy(vma, addr);
1874        *nodemask = NULL;       /* assume !MPOL_BIND */
1875
1876        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1877                nid = interleave_nid(*mpol, vma, addr,
1878                                        huge_page_shift(hstate_vma(vma)));
1879        } else {
1880                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1881                if ((*mpol)->mode == MPOL_BIND)
1882                        *nodemask = &(*mpol)->v.nodes;
1883        }
1884        return nid;
1885}
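/*
 * Example (illustrative sketch): the hugetlb fault path uses huge_node()
 * roughly as below (cf. dequeue_huge_page_vma(), simplified here); note the
 * read_mems_allowed_begin() protection required above and the conditional
 * unref of the returned policy.
 *
 *      struct mempolicy *mpol;
 *      nodemask_t *nodemask;
 *      unsigned int cookie = read_mems_allowed_begin();
 *      int nid = huge_node(vma, address, htlb_alloc_mask(h), &mpol, &nodemask);
 *
 *      // ... dequeue or allocate a huge page from nid, filtered by nodemask ...
 *
 *      mpol_cond_put(mpol);
 *      // retry if read_mems_allowed_retry(cookie) reports a mems_allowed change
 */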
1886
1887/*
1888 * init_nodemask_of_mempolicy
1889 *
1890 * If the current task's mempolicy is "default" [NULL], return 'false'
1891 * to indicate default policy.  Otherwise, extract the policy nodemask
1892 * for 'bind' or 'interleave' policy into the argument nodemask, or
1893 * initialize the argument nodemask to contain the single node for
1894 * 'preferred' or 'local' policy and return 'true' to indicate presence
1895 * of non-default mempolicy.
1896 *
1897 * We don't bother with reference counting the mempolicy [mpol_get/put]
1898 * because the current task is examining its own mempolicy and a task's
1899 * mempolicy is only ever changed by the task itself.
1900 *
1901 * N.B., it is the caller's responsibility to free a returned nodemask.
1902 */
1903bool init_nodemask_of_mempolicy(nodemask_t *mask)
1904{
1905        struct mempolicy *mempolicy;
1906        int nid;
1907
1908        if (!(mask && current->mempolicy))
1909                return false;
1910
1911        task_lock(current);
1912        mempolicy = current->mempolicy;
1913        switch (mempolicy->mode) {
1914        case MPOL_PREFERRED:
1915                if (mempolicy->flags & MPOL_F_LOCAL)
1916                        nid = numa_node_id();
1917                else
1918                        nid = mempolicy->v.preferred_node;
1919                init_nodemask_of_node(mask, nid);
1920                break;
1921
1922        case MPOL_BIND:
1923                /* Fall through */
1924        case MPOL_INTERLEAVE:
1925                *mask = mempolicy->v.nodes;
1926                break;
1927
1928        default:
1929                BUG();
1930        }
1931        task_unlock(current);
1932
1933        return true;
1934}
1935#endif
1936
1937/*
1938 * mempolicy_nodemask_intersects
1939 *
1940 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1941 * policy.  Otherwise, check for intersection between mask and the policy
1942 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1943 * policy, always return true since it may allocate elsewhere on fallback.
1944 *
1945 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1946 */
1947bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1948                                        const nodemask_t *mask)
1949{
1950        struct mempolicy *mempolicy;
1951        bool ret = true;
1952
1953        if (!mask)
1954                return ret;
1955        task_lock(tsk);
1956        mempolicy = tsk->mempolicy;
1957        if (!mempolicy)
1958                goto out;
1959
1960        switch (mempolicy->mode) {
1961        case MPOL_PREFERRED:
1962                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1963                 * allocate from; the task may fall back to other nodes when OOM.
1964                 * allocate from, they may fallback to other nodes when oom.
1965                 * Thus, it's possible for tsk to have allocated memory from
1966                 * nodes in mask.
1967                 */
1968                break;
1969        case MPOL_BIND:
1970        case MPOL_INTERLEAVE:
1971                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1972                break;
1973        default:
1974                BUG();
1975        }
1976out:
1977        task_unlock(tsk);
1978        return ret;
1979}
1980
1981/* Allocate a page in interleaved policy.
1982   Own path because it needs to do special accounting. */
1983static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1984                                        unsigned nid)
1985{
1986        struct page *page;
1987
1988        page = __alloc_pages(gfp, order, nid);
1989        /* skip NUMA_INTERLEAVE_HIT counter update if NUMA stats are disabled */
1990        if (!static_branch_likely(&vm_numa_stat_key))
1991                return page;
1992        if (page && page_to_nid(page) == nid) {
1993                preempt_disable();
1994                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
1995                preempt_enable();
1996        }
1997        return page;
1998}
1999
2000/**
2001 *      alloc_pages_vma - Allocate a page for a VMA.
2002 *
2003 *      @gfp:
2004 *      %GFP_USER    user allocation.
2005 *      %GFP_KERNEL  kernel allocations,
2006 *      %GFP_HIGHMEM highmem/user allocations,
2007 *      %GFP_FS      allocation should not call back into a file system.
2008 *      %GFP_ATOMIC  don't sleep.
2009 *
2010 *      @order: Order of the GFP allocation.
2011 *      @vma:  Pointer to VMA or NULL if not available.
2012 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2013 *      @node: Which node to prefer for allocation (modulo policy).
2014 *      @hugepage: for hugepages try only the preferred node if possible
2015 *
2016 *      This function allocates a page from the kernel page pool and applies
2017 *      a NUMA policy associated with the VMA or the current process.
2018 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2019 *      mm_struct of the VMA to prevent it from going away. Should be used for
2020 *      all allocations for pages that will be mapped into user space. Returns
2021 *      NULL when no page can be allocated.
2022 */
2023struct page *
2024alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2025                unsigned long addr, int node, bool hugepage)
2026{
2027        struct mempolicy *pol;
2028        struct page *page;
2029        int preferred_nid;
2030        nodemask_t *nmask;
2031
2032        pol = get_vma_policy(vma, addr);
2033
2034        if (pol->mode == MPOL_INTERLEAVE) {
2035                unsigned nid;
2036
2037                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2038                mpol_cond_put(pol);
2039                page = alloc_page_interleave(gfp, order, nid);
2040                goto out;
2041        }
2042
2043        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2044                int hpage_node = node;
2045
2046                /*
2047                 * For hugepage allocation and non-interleave policy which
2048                 * allows the current node (or other explicitly preferred
2049                 * node) we only try to allocate from the current/preferred
2050                 * node and don't fall back to other nodes, as the cost of
2051                 * remote accesses would likely offset THP benefits.
2052                 *
2053                 * If the policy is interleave, or does not allow the current
2054                 * node in its nodemask, we allocate the standard way.
2055                 */
2056                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2057                        hpage_node = pol->v.preferred_node;
2058
2059                nmask = policy_nodemask(gfp, pol);
2060                if (!nmask || node_isset(hpage_node, *nmask)) {
2061                        mpol_cond_put(pol);
2062                        page = __alloc_pages_node(hpage_node,
2063                                                gfp | __GFP_THISNODE, order);
2064                        goto out;
2065                }
2066        }
2067
2068        nmask = policy_nodemask(gfp, pol);
2069        preferred_nid = policy_node(gfp, pol, node);
2070        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2071        mpol_cond_put(pol);
2072out:
2073        return page;
2074}
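/*
 * Example (illustrative sketch): anonymous-fault callers normally go through
 * the alloc_page_vma() wrapper in <linux/gfp.h>, which expands to the call
 * below; "vma" and "address" stand for the caller's fault-path variables.
 *
 *      struct page *page;
 *
 *      page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *                             numa_node_id(), false);
 *      if (!page)
 *              return VM_FAULT_OOM;
 */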
2075
2076/**
2077 *      alloc_pages_current - Allocate pages.
2078 *
2079 *      @gfp:
2080 *              %GFP_USER   user allocation,
2081 *              %GFP_KERNEL kernel allocation,
2082 *              %GFP_HIGHMEM highmem allocation,
2083 *              %GFP_FS     don't call back into a file system.
2084 *              %GFP_ATOMIC don't sleep.
2085 *      @order: Power of two of allocation size in pages. 0 is a single page.
2086 *
2087 *      Allocate a page from the kernel page pool.  When not in
2088 *      interrupt context, apply the current process' NUMA policy.
2089 *      Returns NULL when no page can be allocated.
2090 */
2091struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2092{
2093        struct mempolicy *pol = &default_policy;
2094        struct page *page;
2095
2096        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2097                pol = get_task_policy(current);
2098
2099        /*
2100         * No reference counting needed for current->mempolicy
2101         * nor system default_policy
2102         */
2103        if (pol->mode == MPOL_INTERLEAVE)
2104                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2105        else
2106                page = __alloc_pages_nodemask(gfp, order,
2107                                policy_node(gfp, pol, numa_node_id()),
2108                                policy_nodemask(gfp, pol));
2109
2110        return page;
2111}
2112EXPORT_SYMBOL(alloc_pages_current);
2113
2114int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2115{
2116        struct mempolicy *pol = mpol_dup(vma_policy(src));
2117
2118        if (IS_ERR(pol))
2119                return PTR_ERR(pol);
2120        dst->vm_policy = pol;
2121        return 0;
2122}
2123
2124/*
2125 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2126 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2127 * with the mems_allowed returned by cpuset_mems_allowed().  This
2128 * keeps mempolicies cpuset relative after its cpuset moves.  See
2129 * further kernel/cpuset.c update_nodemask().
2130 *
2131 * current's mempolicy may be rebound by another task (the task that changes
2132 * the cpuset's mems), so we needn't do the rebind work for the current task.
2133 */
2134
2135/* Slow path of a mempolicy duplicate */
2136struct mempolicy *__mpol_dup(struct mempolicy *old)
2137{
2138        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2139
2140        if (!new)
2141                return ERR_PTR(-ENOMEM);
2142
2143        /* task's mempolicy is protected by alloc_lock */
2144        if (old == current->mempolicy) {
2145                task_lock(current);
2146                *new = *old;
2147                task_unlock(current);
2148        } else
2149                *new = *old;
2150
2151        if (current_cpuset_is_being_rebound()) {
2152                nodemask_t mems = cpuset_mems_allowed(current);
2153                mpol_rebind_policy(new, &mems);
2154        }
2155        atomic_set(&new->refcnt, 1);
2156        return new;
2157}
2158
2159/* Slow path of a mempolicy comparison */
2160bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2161{
2162        if (!a || !b)
2163                return false;
2164        if (a->mode != b->mode)
2165                return false;
2166        if (a->flags != b->flags)
2167                return false;
2168        if (mpol_store_user_nodemask(a))
2169                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2170                        return false;
2171
2172        switch (a->mode) {
2173        case MPOL_BIND:
2174                /* Fall through */
2175        case MPOL_INTERLEAVE:
2176                return !!nodes_equal(a->v.nodes, b->v.nodes);
2177        case MPOL_PREFERRED:
2178                /* a's ->flags is the same as b's */
2179                if (a->flags & MPOL_F_LOCAL)
2180                        return true;
2181                return a->v.preferred_node == b->v.preferred_node;
2182        default:
2183                BUG();
2184                return false;
2185        }
2186}
2187
2188/*
2189 * Shared memory backing store policy support.
2190 *
2191 * Remember policies even when nobody has shared memory mapped.
2192 * The policies are kept in a red-black tree linked from the inode.
2193 * They are protected by the sp->lock rwlock, which should be held
2194 * for any accesses to the tree.
2195 */
2196
2197/*
2198 * Look up the first element intersecting start-end.  Caller holds sp->lock for
2199 * reading or for writing
2200 */
2201static struct sp_node *
2202sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2203{
2204        struct rb_node *n = sp->root.rb_node;
2205
2206        while (n) {
2207                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2208
2209                if (start >= p->end)
2210                        n = n->rb_right;
2211                else if (end <= p->start)
2212                        n = n->rb_left;
2213                else
2214                        break;
2215        }
2216        if (!n)
2217                return NULL;
2218        for (;;) {
2219                struct sp_node *w = NULL;
2220                struct rb_node *prev = rb_prev(n);
2221                if (!prev)
2222                        break;
2223                w = rb_entry(prev, struct sp_node, nd);
2224                if (w->end <= start)
2225                        break;
2226                n = prev;
2227        }
2228        return rb_entry(n, struct sp_node, nd);
2229}
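/*
 * Worked example (illustrative): with ranges [0,4) and [6,10) in the tree,
 * sp_lookup(sp, 3, 8) first lands on one of the two nodes, and the backward
 * walk then ensures the leftmost intersecting range, [0,4), is returned.
 * sp_lookup(sp, 4, 6) intersects neither range and returns NULL.
 */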
2230
2231/*
2232 * Insert a new shared policy into the list.  Caller holds sp->lock for
2233 * writing.
2234 */
2235static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2236{
2237        struct rb_node **p = &sp->root.rb_node;
2238        struct rb_node *parent = NULL;
2239        struct sp_node *nd;
2240
2241        while (*p) {
2242                parent = *p;
2243                nd = rb_entry(parent, struct sp_node, nd);
2244                if (new->start < nd->start)
2245                        p = &(*p)->rb_left;
2246                else if (new->end > nd->end)
2247                        p = &(*p)->rb_right;
2248                else
2249                        BUG();
2250        }
2251        rb_link_node(&new->nd, parent, p);
2252        rb_insert_color(&new->nd, &sp->root);
2253        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2254                 new->policy ? new->policy->mode : 0);
2255}
2256
2257/* Find shared policy intersecting idx */
2258struct mempolicy *
2259mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2260{
2261        struct mempolicy *pol = NULL;
2262        struct sp_node *sn;
2263
2264        if (!sp->root.rb_node)
2265                return NULL;
2266        read_lock(&sp->lock);
2267        sn = sp_lookup(sp, idx, idx+1);
2268        if (sn) {
2269                mpol_get(sn->policy);
2270                pol = sn->policy;
2271        }
2272        read_unlock(&sp->lock);
2273        return pol;
2274}
2275
2276static void sp_free(struct sp_node *n)
2277{
2278        mpol_put(n->policy);
2279        kmem_cache_free(sn_cache, n);
2280}
2281
2282/**
2283 * mpol_misplaced - check whether current page node is valid in policy
2284 *
2285 * @page: page to be checked
2286 * @vma: vm area where page mapped
2287 * @addr: virtual address where page mapped
2288 *
2289 * Lookup current policy node id for vma, addr and "compare to" page's
2290 * node id.
2291 *
2292 * Returns:
2293 *      -1      - not misplaced, page is in the right node
2294 *      node    - node id where the page should be
2295 *
2296 * Policy determination "mimics" alloc_page_vma().
2297 * Called from fault path where we know the vma and faulting address.
2298 */
2299int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2300{
2301        struct mempolicy *pol;
2302        struct zoneref *z;
2303        int curnid = page_to_nid(page);
2304        unsigned long pgoff;
2305        int thiscpu = raw_smp_processor_id();
2306        int thisnid = cpu_to_node(thiscpu);
2307        int polnid = -1;
2308        int ret = -1;
2309
2310        pol = get_vma_policy(vma, addr);
2311        if (!(pol->flags & MPOL_F_MOF))
2312                goto out;
2313
2314        switch (pol->mode) {
2315        case MPOL_INTERLEAVE:
2316                pgoff = vma->vm_pgoff;
2317                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2318                polnid = offset_il_node(pol, pgoff);
2319                break;
2320
2321        case MPOL_PREFERRED:
2322                if (pol->flags & MPOL_F_LOCAL)
2323                        polnid = numa_node_id();
2324                else
2325                        polnid = pol->v.preferred_node;
2326                break;
2327
2328        case MPOL_BIND:
2329
2330                /*
2331                 * allows binding to multiple nodes.
2332                 * use current page if in policy nodemask,
2333                 * else select nearest allowed node, if any.
2334                 * If no allowed nodes, use current [!misplaced].
2335                 */
2336                if (node_isset(curnid, pol->v.nodes))
2337                        goto out;
2338                z = first_zones_zonelist(
2339                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2340                                gfp_zone(GFP_HIGHUSER),
2341                                &pol->v.nodes);
2342                polnid = zone_to_nid(z->zone);
2343                break;
2344
2345        default:
2346                BUG();
2347        }
2348
2349        /* Migrate the page towards the node whose CPU is referencing it */
2350        if (pol->flags & MPOL_F_MORON) {
2351                polnid = thisnid;
2352
2353                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2354                        goto out;
2355        }
2356
2357        if (curnid != polnid)
2358                ret = polnid;
2359out:
2360        mpol_cond_put(pol);
2361
2362        return ret;
2363}
2364
2365/*
2366 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2367 * dropped after task->mempolicy is set to NULL so that any allocation done as
2368 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2369 * policy.
2370 */
2371void mpol_put_task_policy(struct task_struct *task)
2372{
2373        struct mempolicy *pol;
2374
2375        task_lock(task);
2376        pol = task->mempolicy;
2377        task->mempolicy = NULL;
2378        task_unlock(task);
2379        mpol_put(pol);
2380}
2381
2382static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2383{
2384        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2385        rb_erase(&n->nd, &sp->root);
2386        sp_free(n);
2387}
2388
2389static void sp_node_init(struct sp_node *node, unsigned long start,
2390                        unsigned long end, struct mempolicy *pol)
2391{
2392        node->start = start;
2393        node->end = end;
2394        node->policy = pol;
2395}
2396
2397static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2398                                struct mempolicy *pol)
2399{
2400        struct sp_node *n;
2401        struct mempolicy *newpol;
2402
2403        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2404        if (!n)
2405                return NULL;
2406
2407        newpol = mpol_dup(pol);
2408        if (IS_ERR(newpol)) {
2409                kmem_cache_free(sn_cache, n);
2410                return NULL;
2411        }
2412        newpol->flags |= MPOL_F_SHARED;
2413        sp_node_init(n, start, end, newpol);
2414
2415        return n;
2416}
2417
2418/* Replace a policy range. */
2419static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2420                                 unsigned long end, struct sp_node *new)
2421{
2422        struct sp_node *n;
2423        struct sp_node *n_new = NULL;
2424        struct mempolicy *mpol_new = NULL;
2425        int ret = 0;
2426
2427restart:
2428        write_lock(&sp->lock);
2429        n = sp_lookup(sp, start, end);
2430        /* Take care of old policies in the same range. */
2431        while (n && n->start < end) {
2432                struct rb_node *next = rb_next(&n->nd);
2433                if (n->start >= start) {
2434                        if (n->end <= end)
2435                                sp_delete(sp, n);
2436                        else
2437                                n->start = end;
2438                } else {
2439                        /* Old policy spanning whole new range. */
2440                        if (n->end > end) {
2441                                if (!n_new)
2442                                        goto alloc_new;
2443
2444                                *mpol_new = *n->policy;
2445                                atomic_set(&mpol_new->refcnt, 1);
2446                                sp_node_init(n_new, end, n->end, mpol_new);
2447                                n->end = start;
2448                                sp_insert(sp, n_new);
2449                                n_new = NULL;
2450                                mpol_new = NULL;
2451                                break;
2452                        } else
2453                                n->end = start;
2454                }
2455                if (!next)
2456                        break;
2457                n = rb_entry(next, struct sp_node, nd);
2458        }
2459        if (new)
2460                sp_insert(sp, new);
2461        write_unlock(&sp->lock);
2462        ret = 0;
2463
2464err_out:
2465        if (mpol_new)
2466                mpol_put(mpol_new);
2467        if (n_new)
2468                kmem_cache_free(sn_cache, n_new);
2469
2470        return ret;
2471
2472alloc_new:
2473        write_unlock(&sp->lock);
2474        ret = -ENOMEM;
2475        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2476        if (!n_new)
2477                goto err_out;
2478        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2479        if (!mpol_new)
2480                goto err_out;
2481        goto restart;
2482}
2483
2484/**
2485 * mpol_shared_policy_init - initialize shared policy for inode
2486 * @sp: pointer to inode shared policy
2487 * @mpol:  struct mempolicy to install
2488 *
2489 * Install non-NULL @mpol in inode's shared policy rb-tree.
2490 * On entry, the current task has a reference on a non-NULL @mpol.
2491 * This must be released on exit.
2492 * This is called at get_inode() time, so we can use GFP_KERNEL.
2493 */
2494void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2495{
2496        int ret;
2497
2498        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2499        rwlock_init(&sp->lock);
2500
2501        if (mpol) {
2502                struct vm_area_struct pvma;
2503                struct mempolicy *new;
2504                NODEMASK_SCRATCH(scratch);
2505
2506                if (!scratch)
2507                        goto put_mpol;
2508                /* contextualize the tmpfs mount point mempolicy */
2509                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2510                if (IS_ERR(new))
2511                        goto free_scratch; /* no valid nodemask intersection */
2512
2513                task_lock(current);
2514                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2515                task_unlock(current);
2516                if (ret)
2517                        goto put_new;
2518
2519                /* Create pseudo-vma that contains just the policy */
2520                vma_init(&pvma, NULL);
2521                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2522                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2523
2524put_new:
2525                mpol_put(new);                  /* drop initial ref */
2526free_scratch:
2527                NODEMASK_SCRATCH_FREE(scratch);
2528put_mpol:
2529                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2530        }
2531}
2532
2533int mpol_set_shared_policy(struct shared_policy *info,
2534                        struct vm_area_struct *vma, struct mempolicy *npol)
2535{
2536        int err;
2537        struct sp_node *new = NULL;
2538        unsigned long sz = vma_pages(vma);
2539
2540        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2541                 vma->vm_pgoff,
2542                 sz, npol ? npol->mode : -1,
2543                 npol ? npol->flags : -1,
2544                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2545
2546        if (npol) {
2547                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2548                if (!new)
2549                        return -ENOMEM;
2550        }
2551        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2552        if (err && new)
2553                sp_free(new);
2554        return err;
2555}
2556
2557/* Free a backing policy store on inode delete. */
2558void mpol_free_shared_policy(struct shared_policy *p)
2559{
2560        struct sp_node *n;
2561        struct rb_node *next;
2562
2563        if (!p->root.rb_node)
2564                return;
2565        write_lock(&p->lock);
2566        next = rb_first(&p->root);
2567        while (next) {
2568                n = rb_entry(next, struct sp_node, nd);
2569                next = rb_next(&n->nd);
2570                sp_delete(p, n);
2571        }
2572        write_unlock(&p->lock);
2573}
2574
2575#ifdef CONFIG_NUMA_BALANCING
2576static int __initdata numabalancing_override;
2577
2578static void __init check_numabalancing_enable(void)
2579{
2580        bool numabalancing_default = false;
2581
2582        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2583                numabalancing_default = true;
2584
2585        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2586        if (numabalancing_override)
2587                set_numabalancing_state(numabalancing_override == 1);
2588
2589        if (num_online_nodes() > 1 && !numabalancing_override) {
2590                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2591                        numabalancing_default ? "Enabling" : "Disabling");
2592                set_numabalancing_state(numabalancing_default);
2593        }
2594}
2595
2596static int __init setup_numabalancing(char *str)
2597{
2598        int ret = 0;
2599        if (!str)
2600                goto out;
2601
2602        if (!strcmp(str, "enable")) {
2603                numabalancing_override = 1;
2604                ret = 1;
2605        } else if (!strcmp(str, "disable")) {
2606                numabalancing_override = -1;
2607                ret = 1;
2608        }
2609out:
2610        if (!ret)
2611                pr_warn("Unable to parse numa_balancing=\n");
2612
2613        return ret;
2614}
2615__setup("numa_balancing=", setup_numabalancing);
2616#else
2617static inline void __init check_numabalancing_enable(void)
2618{
2619}
2620#endif /* CONFIG_NUMA_BALANCING */
2621
2622/* assumes fs == KERNEL_DS */
2623void __init numa_policy_init(void)
2624{
2625        nodemask_t interleave_nodes;
2626        unsigned long largest = 0;
2627        int nid, prefer = 0;
2628
2629        policy_cache = kmem_cache_create("numa_policy",
2630                                         sizeof(struct mempolicy),
2631                                         0, SLAB_PANIC, NULL);
2632
2633        sn_cache = kmem_cache_create("shared_policy_node",
2634                                     sizeof(struct sp_node),
2635                                     0, SLAB_PANIC, NULL);
2636
2637        for_each_node(nid) {
2638                preferred_node_policy[nid] = (struct mempolicy) {
2639                        .refcnt = ATOMIC_INIT(1),
2640                        .mode = MPOL_PREFERRED,
2641                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2642                        .v = { .preferred_node = nid, },
2643                };
2644        }
2645
2646        /*
2647         * Set interleaving policy for system init. Interleaving is only
2648         * enabled across suitably sized nodes (default is >= 16MB), or
2649         * fall back to the largest node if they're all smaller.
2650         */
2651        nodes_clear(interleave_nodes);
2652        for_each_node_state(nid, N_MEMORY) {
2653                unsigned long total_pages = node_present_pages(nid);
2654
2655                /* Preserve the largest node */
2656                if (largest < total_pages) {
2657                        largest = total_pages;
2658                        prefer = nid;
2659                }
2660
2661                /* Interleave this node? */
2662                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2663                        node_set(nid, interleave_nodes);
2664        }
2665
2666        /* All too small, use the largest */
2667        if (unlikely(nodes_empty(interleave_nodes)))
2668                node_set(prefer, interleave_nodes);
2669
2670        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2671                pr_err("%s: interleaving failed\n", __func__);
2672
2673        check_numabalancing_enable();
2674}
2675
2676/* Reset policy of current process to default */
2677void numa_default_policy(void)
2678{
2679        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2680}
2681
2682/*
2683 * Parse and format mempolicy from/to strings
2684 */
2685
2686/*
2687 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2688 */
2689static const char * const policy_modes[] =
2690{
2691        [MPOL_DEFAULT]    = "default",
2692        [MPOL_PREFERRED]  = "prefer",
2693        [MPOL_BIND]       = "bind",
2694        [MPOL_INTERLEAVE] = "interleave",
2695        [MPOL_LOCAL]      = "local",
2696};
2697
2698
2699#ifdef CONFIG_TMPFS
2700/**
2701 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2702 * @str:  string containing mempolicy to parse
2703 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2704 *
2705 * Format of input:
2706 *      <mode>[=<flags>][:<nodelist>]
2707 *
2708 * On success, returns 0, else 1
2709 */
2710int mpol_parse_str(char *str, struct mempolicy **mpol)
2711{
2712        struct mempolicy *new = NULL;
2713        unsigned short mode_flags;
2714        nodemask_t nodes;
2715        char *nodelist = strchr(str, ':');
2716        char *flags = strchr(str, '=');
2717        int err = 1, mode;
2718
2719        if (nodelist) {
2720                /* NUL-terminate mode or flags string */
2721                *nodelist++ = '\0';
2722                if (nodelist_parse(nodelist, nodes))
2723                        goto out;
2724                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2725                        goto out;
2726        } else
2727                nodes_clear(nodes);
2728
2729        if (flags)
2730                *flags++ = '\0';        /* terminate mode string */
2731
2732        mode = match_string(policy_modes, MPOL_MAX, str);
2733        if (mode < 0)
2734                goto out;
2735
2736        switch (mode) {
2737        case MPOL_PREFERRED:
2738                /*
2739                 * Insist on a nodelist of one node only
2740                 */
2741                if (nodelist) {
2742                        char *rest = nodelist;
2743                        while (isdigit(*rest))
2744                                rest++;
2745                        if (*rest)
2746                                goto out;
2747                }
2748                break;
2749        case MPOL_INTERLEAVE:
2750                /*
2751                 * Default to online nodes with memory if no nodelist
2752                 */
2753                if (!nodelist)
2754                        nodes = node_states[N_MEMORY];
2755                break;
2756        case MPOL_LOCAL:
2757                /*
2758                 * Don't allow a nodelist;  mpol_new() checks flags
2759                 */
2760                if (nodelist)
2761                        goto out;
2762                mode = MPOL_PREFERRED;
2763                break;
2764        case MPOL_DEFAULT:
2765                /*
2766                 * Insist on an empty nodelist
2767                 */
2768                if (!nodelist)
2769                        err = 0;
2770                goto out;
2771        case MPOL_BIND:
2772                /*
2773                 * Insist on a nodelist
2774                 */
2775                if (!nodelist)
2776                        goto out;
2777        }
2778
2779        mode_flags = 0;
2780        if (flags) {
2781                /*
2782                 * Currently, we only support two mutually exclusive
2783                 * mode flags.
2784                 */
2785                if (!strcmp(flags, "static"))
2786                        mode_flags |= MPOL_F_STATIC_NODES;
2787                else if (!strcmp(flags, "relative"))
2788                        mode_flags |= MPOL_F_RELATIVE_NODES;
2789                else
2790                        goto out;
2791        }
2792
2793        new = mpol_new(mode, mode_flags, &nodes);
2794        if (IS_ERR(new))
2795                goto out;
2796
2797        /*
2798         * Save nodes for mpol_to_str() to show the tmpfs mount options
2799         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2800         */
2801        if (mode != MPOL_PREFERRED)
2802                new->v.nodes = nodes;
2803        else if (nodelist)
2804                new->v.preferred_node = first_node(nodes);
2805        else
2806                new->flags |= MPOL_F_LOCAL;
2807
2808        /*
2809         * Save nodes for contextualization: this will be used to "clone"
2810         * the mempolicy in a specific context [cpuset] at a later time.
2811         */
2812        new->w.user_nodemask = nodes;
2813
2814        err = 0;
2815
2816out:
2817        /* Restore string for error message */
2818        if (nodelist)
2819                *--nodelist = ':';
2820        if (flags)
2821                *--flags = '=';
2822        if (!err)
2823                *mpol = new;
2824        return err;
2825}
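/*
 * Example (illustrative sketch): strings accepted above, as seen in tmpfs
 * "mpol=" mount options.  The buffer must be writable because parsing
 * temporarily NUL-terminates the mode and flags substrings; the node numbers
 * are hypothetical and must be in node_states[N_MEMORY].
 *
 *      char str[] = "interleave=static:0-3";  // also e.g. "bind:1,3",
 *                                             // "prefer:2", "local", "default"
 *      struct mempolicy *mpol;
 *
 *      if (!mpol_parse_str(str, &mpol)) {
 *              // success: use *mpol, then drop the reference when done
 *              mpol_put(mpol);
 *      }
 */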
2826#endif /* CONFIG_TMPFS */
2827
2828/**
2829 * mpol_to_str - format a mempolicy structure for printing
2830 * @buffer:  to contain formatted mempolicy string
2831 * @maxlen:  length of @buffer
2832 * @pol:  pointer to mempolicy to be formatted
2833 *
2834 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2835 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2836 * longest flag, "relative", and to display at least a few node ids.
2837 */
2838void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2839{
2840        char *p = buffer;
2841        nodemask_t nodes = NODE_MASK_NONE;
2842        unsigned short mode = MPOL_DEFAULT;
2843        unsigned short flags = 0;
2844
2845        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2846                mode = pol->mode;
2847                flags = pol->flags;
2848        }
2849
2850        switch (mode) {
2851        case MPOL_DEFAULT:
2852                break;
2853        case MPOL_PREFERRED:
2854                if (flags & MPOL_F_LOCAL)
2855                        mode = MPOL_LOCAL;
2856                else
2857                        node_set(pol->v.preferred_node, nodes);
2858                break;
2859        case MPOL_BIND:
2860        case MPOL_INTERLEAVE:
2861                nodes = pol->v.nodes;
2862                break;
2863        default:
2864                WARN_ON_ONCE(1);
2865                snprintf(p, maxlen, "unknown");
2866                return;
2867        }
2868
2869        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2870
2871        if (flags & MPOL_MODE_FLAGS) {
2872                p += snprintf(p, buffer + maxlen - p, "=");
2873
2874                /*
2875                 * Currently, the only defined flags are mutually exclusive
2876                 */
2877                if (flags & MPOL_F_STATIC_NODES)
2878                        p += snprintf(p, buffer + maxlen - p, "static");
2879                else if (flags & MPOL_F_RELATIVE_NODES)
2880                        p += snprintf(p, buffer + maxlen - p, "relative");
2881        }
2882
2883        if (!nodes_empty(nodes))
2884                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2885                               nodemask_pr_args(&nodes));
2886}
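/*
 * Example (illustrative sketch): formatting a policy for /proc output, as
 * the tmpfs mount-option code does (cf. shmem_show_mpol(), simplified).  For
 * an MPOL_INTERLEAVE policy with MPOL_F_RELATIVE_NODES over nodes 0-3
 * (hypothetical), the buffer ends up holding "interleave=relative:0-3".
 *
 *      char buf[64];
 *
 *      mpol_to_str(buf, sizeof(buf), mpol);
 *      seq_printf(seq, ",mpol=%s", buf);
 */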
2887