linux/mm/mempolicy.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
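/*
 * Illustrative userspace sketch of the policies described above (not part of
 * the kernel build; assumes the libnuma <numaif.h> wrappers for
 * set_mempolicy(2) and mbind(2)):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	// Process policy: interleave across nodes 0 and 1.
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// VMA policy: bind one mapping to node 0 only (no fallback).
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 1UL << 0;
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 */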
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
  115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132        struct mempolicy *pol = p->mempolicy;
 133        int node;
 134
 135        if (pol)
 136                return pol;
 137
 138        node = numa_node_id();
 139        if (node != NUMA_NO_NODE) {
 140                pol = &preferred_node_policy[node];
 141                /* preferred_node_policy is not initialised early in boot */
 142                if (pol->mode)
 143                        return pol;
 144        }
 145
 146        return &default_policy;
 147}
 148
 149static const struct mempolicy_operations {
 150        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 151        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 152} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156        return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                   const nodemask_t *rel)
 161{
 162        nodemask_t tmp;
 163        nodes_fold(tmp, *orig, nodes_weight(*rel));
 164        nodes_onto(*ret, tmp, *rel);
 165}
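/*
 * For example, assuming the documented nodes_fold()/nodes_onto() semantics:
 * with MPOL_F_RELATIVE_NODES, a user nodemask of {0,2} relative to an allowed
 * mask of {4,5,6} is first folded modulo the weight 3 (still {0,2}) and then
 * mapped onto the 0th and 2nd set bits of {4,5,6}, giving {4,6}.
 */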
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169        if (nodes_empty(*nodes))
 170                return -EINVAL;
 171        pol->v.nodes = *nodes;
 172        return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!nodes)
 178                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179        else if (nodes_empty(*nodes))
 180                return -EINVAL;                 /*  no allowed nodes */
 181        else
 182                pol->v.preferred_node = first_node(*nodes);
 183        return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (nodes_empty(*nodes))
 189                return -EINVAL;
 190        pol->v.nodes = *nodes;
 191        return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
 201 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206        int ret;
 207
 208        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209        if (pol == NULL)
 210                return 0;
 211        /* Check N_MEMORY */
 212        nodes_and(nsc->mask1,
 213                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215        VM_BUG_ON(!nodes);
 216        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                nodes = NULL;   /* explicit local allocation */
 218        else {
 219                if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                else
 222                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                if (mpol_store_user_nodemask(pol))
 225                        pol->w.user_nodemask = *nodes;
 226                else
 227                        pol->w.cpuset_mems_allowed =
 228                                                cpuset_current_mems_allowed;
 229        }
 230
 231        if (nodes)
 232                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233        else
 234                ret = mpol_ops[pol->mode].create(pol, NULL);
 235        return ret;
 236}
 237
 238/*
  239 * This function just creates a new policy, does some checks and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                  nodemask_t *nodes)
 244{
 245        struct mempolicy *policy;
 246
 247        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250        if (mode == MPOL_DEFAULT) {
 251                if (nodes && !nodes_empty(*nodes))
 252                        return ERR_PTR(-EINVAL);
 253                return NULL;
 254        }
 255        VM_BUG_ON(!nodes);
 256
 257        /*
 258         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260         * All other modes require a valid pointer to a non-empty nodemask.
 261         */
 262        if (mode == MPOL_PREFERRED) {
 263                if (nodes_empty(*nodes)) {
 264                        if (((flags & MPOL_F_STATIC_NODES) ||
 265                             (flags & MPOL_F_RELATIVE_NODES)))
 266                                return ERR_PTR(-EINVAL);
 267                }
 268        } else if (mode == MPOL_LOCAL) {
 269                if (!nodes_empty(*nodes) ||
 270                    (flags & MPOL_F_STATIC_NODES) ||
 271                    (flags & MPOL_F_RELATIVE_NODES))
 272                        return ERR_PTR(-EINVAL);
 273                mode = MPOL_PREFERRED;
 274        } else if (nodes_empty(*nodes))
 275                return ERR_PTR(-EINVAL);
 276        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277        if (!policy)
 278                return ERR_PTR(-ENOMEM);
 279        atomic_set(&policy->refcnt, 1);
 280        policy->mode = mode;
 281        policy->flags = flags;
 282
 283        return policy;
 284}
 285
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289        if (!atomic_dec_and_test(&p->refcnt))
 290                return;
 291        kmem_cache_free(policy_cache, p);
 292}
 293
 294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 295{
 296}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300        nodemask_t tmp;
 301
 302        if (pol->flags & MPOL_F_STATIC_NODES)
 303                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306        else {
  307                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 308                                                                *nodes);
 309                pol->w.cpuset_mems_allowed = *nodes;
 310        }
 311
 312        if (nodes_empty(tmp))
 313                tmp = *nodes;
 314
 315        pol->v.nodes = tmp;
 316}
 317
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                const nodemask_t *nodes)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES) {
 324                int node = first_node(pol->w.user_nodemask);
 325
 326                if (node_isset(node, *nodes)) {
 327                        pol->v.preferred_node = node;
 328                        pol->flags &= ~MPOL_F_LOCAL;
 329                } else
 330                        pol->flags |= MPOL_F_LOCAL;
 331        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                pol->v.preferred_node = first_node(tmp);
 334        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                   pol->w.cpuset_mems_allowed,
 337                                                   *nodes);
 338                pol->w.cpuset_mems_allowed = *nodes;
 339        }
 340}
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351        if (!pol)
 352                return;
 353        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 354            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                return;
 356
 357        mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires task
 362 * pointer, and updates task mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369        mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380        struct vm_area_struct *vma;
 381
 382        down_write(&mm->mmap_sem);
 383        for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                mpol_rebind_policy(vma->vm_policy, new);
 385        up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389        [MPOL_DEFAULT] = {
 390                .rebind = mpol_rebind_default,
 391        },
 392        [MPOL_INTERLEAVE] = {
 393                .create = mpol_new_interleave,
 394                .rebind = mpol_rebind_nodemask,
 395        },
 396        [MPOL_PREFERRED] = {
 397                .create = mpol_new_preferred,
 398                .rebind = mpol_rebind_preferred,
 399        },
 400        [MPOL_BIND] = {
 401                .create = mpol_new_bind,
 402                .rebind = mpol_rebind_nodemask,
 403        },
 404};
 405
 406static int migrate_page_add(struct page *page, struct list_head *pagelist,
 407                                unsigned long flags);
 408
 409struct queue_pages {
 410        struct list_head *pagelist;
 411        unsigned long flags;
 412        nodemask_t *nmask;
 413        struct vm_area_struct *prev;
 414};
 415
 416/*
 417 * Check if the page's nid is in qp->nmask.
 418 *
 419 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 420 * in the invert of qp->nmask.
 421 */
 422static inline bool queue_pages_required(struct page *page,
 423                                        struct queue_pages *qp)
 424{
 425        int nid = page_to_nid(page);
 426        unsigned long flags = qp->flags;
 427
 428        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429}
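/*
 * In other words: without MPOL_MF_INVERT this returns true when the page sits
 * on a node in qp->nmask; with MPOL_MF_INVERT it returns true when the page
 * sits on a node *not* in qp->nmask, which is how do_mbind() finds misplaced
 * pages to queue for migration.
 */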
 430
 431/*
 432 * queue_pages_pmd() has four possible return values:
 433 * 0 - pages are placed on the right node or queued successfully.
  434 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
  435 *     specified.
  436 * 2 - THP was split.
  437 * -EIO - the entry is a migration entry, or only MPOL_MF_STRICT was
  438 *        specified and an existing page was already on a node that does
  439 *        not follow the policy.
 440 */
 441static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 442                                unsigned long end, struct mm_walk *walk)
 443{
 444        int ret = 0;
 445        struct page *page;
 446        struct queue_pages *qp = walk->private;
 447        unsigned long flags;
 448
 449        if (unlikely(is_pmd_migration_entry(*pmd))) {
 450                ret = -EIO;
 451                goto unlock;
 452        }
 453        page = pmd_page(*pmd);
 454        if (is_huge_zero_page(page)) {
 455                spin_unlock(ptl);
 456                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 457                ret = 2;
 458                goto out;
 459        }
 460        if (!queue_pages_required(page, qp))
 461                goto unlock;
 462
 463        flags = qp->flags;
 464        /* go to thp migration */
 465        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 466                if (!vma_migratable(walk->vma) ||
 467                    migrate_page_add(page, qp->pagelist, flags)) {
 468                        ret = 1;
 469                        goto unlock;
 470                }
 471        } else
 472                ret = -EIO;
 473unlock:
 474        spin_unlock(ptl);
 475out:
 476        return ret;
 477}
 478
 479/*
 480 * Scan through pages checking if pages follow certain conditions,
 481 * and move them to the pagelist if they do.
 482 *
 483 * queue_pages_pte_range() has three possible return values:
 484 * 0 - pages are placed on the right node or queued successfully.
  485 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 486 *     specified.
 487 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 488 *        on a node that does not follow the policy.
 489 */
 490static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 491                        unsigned long end, struct mm_walk *walk)
 492{
 493        struct vm_area_struct *vma = walk->vma;
 494        struct page *page;
 495        struct queue_pages *qp = walk->private;
 496        unsigned long flags = qp->flags;
 497        int ret;
 498        bool has_unmovable = false;
 499        pte_t *pte;
 500        spinlock_t *ptl;
 501
 502        ptl = pmd_trans_huge_lock(pmd, vma);
 503        if (ptl) {
 504                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 505                if (ret != 2)
 506                        return ret;
 507        }
 508        /* THP was split, fall through to pte walk */
 509
 510        if (pmd_trans_unstable(pmd))
 511                return 0;
 512
 513        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 514        for (; addr != end; pte++, addr += PAGE_SIZE) {
 515                if (!pte_present(*pte))
 516                        continue;
 517                page = vm_normal_page(vma, addr, *pte);
 518                if (!page)
 519                        continue;
 520                /*
 521                 * vm_normal_page() filters out zero pages, but there might
 522                 * still be PageReserved pages to skip, perhaps in a VDSO.
 523                 */
 524                if (PageReserved(page))
 525                        continue;
 526                if (!queue_pages_required(page, qp))
 527                        continue;
 528                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 529                        /* MPOL_MF_STRICT must be specified if we get here */
 530                        if (!vma_migratable(vma)) {
 531                                has_unmovable = true;
 532                                break;
 533                        }
 534
 535                        /*
  536                         * Do not abort immediately since there may be
  537                         * temporarily off-LRU pages in the range.  Still
  538                         * need to migrate the other LRU pages.
 539                         */
 540                        if (migrate_page_add(page, qp->pagelist, flags))
 541                                has_unmovable = true;
 542                } else
 543                        break;
 544        }
 545        pte_unmap_unlock(pte - 1, ptl);
 546        cond_resched();
 547
 548        if (has_unmovable)
 549                return 1;
 550
 551        return addr != end ? -EIO : 0;
 552}
 553
 554static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 555                               unsigned long addr, unsigned long end,
 556                               struct mm_walk *walk)
 557{
 558#ifdef CONFIG_HUGETLB_PAGE
 559        struct queue_pages *qp = walk->private;
 560        unsigned long flags = qp->flags;
 561        struct page *page;
 562        spinlock_t *ptl;
 563        pte_t entry;
 564
 565        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 566        entry = huge_ptep_get(pte);
 567        if (!pte_present(entry))
 568                goto unlock;
 569        page = pte_page(entry);
 570        if (!queue_pages_required(page, qp))
 571                goto unlock;
 572        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 573        if (flags & (MPOL_MF_MOVE_ALL) ||
 574            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 575                isolate_huge_page(page, qp->pagelist);
 576unlock:
 577        spin_unlock(ptl);
 578#else
 579        BUG();
 580#endif
 581        return 0;
 582}
 583
 584#ifdef CONFIG_NUMA_BALANCING
 585/*
 586 * This is used to mark a range of virtual addresses to be inaccessible.
 587 * These are later cleared by a NUMA hinting fault. Depending on these
 588 * faults, pages may be migrated for better NUMA placement.
 589 *
 590 * This is assuming that NUMA faults are handled using PROT_NONE. If
 591 * an architecture makes a different choice, it will need further
 592 * changes to the core.
 593 */
 594unsigned long change_prot_numa(struct vm_area_struct *vma,
 595                        unsigned long addr, unsigned long end)
 596{
 597        int nr_updated;
 598
 599        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 600        if (nr_updated)
 601                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 602
 603        return nr_updated;
 604}
 605#else
 606static unsigned long change_prot_numa(struct vm_area_struct *vma,
 607                        unsigned long addr, unsigned long end)
 608{
 609        return 0;
 610}
 611#endif /* CONFIG_NUMA_BALANCING */
 612
 613static int queue_pages_test_walk(unsigned long start, unsigned long end,
 614                                struct mm_walk *walk)
 615{
 616        struct vm_area_struct *vma = walk->vma;
 617        struct queue_pages *qp = walk->private;
 618        unsigned long endvma = vma->vm_end;
 619        unsigned long flags = qp->flags;
 620
 621        /*
  622         * Need to check MPOL_MF_STRICT to return -EIO if possible,
  623         * regardless of vma_migratable
 624         */
 625        if (!vma_migratable(vma) &&
 626            !(flags & MPOL_MF_STRICT))
 627                return 1;
 628
 629        if (endvma > end)
 630                endvma = end;
 631        if (vma->vm_start > start)
 632                start = vma->vm_start;
 633
 634        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 635                if (!vma->vm_next && vma->vm_end < end)
 636                        return -EFAULT;
 637                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 638                        return -EFAULT;
 639        }
 640
 641        qp->prev = vma;
 642
 643        if (flags & MPOL_MF_LAZY) {
 644                /* Similar to task_numa_work, skip inaccessible VMAs */
 645                if (!is_vm_hugetlb_page(vma) &&
 646                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 647                        !(vma->vm_flags & VM_MIXEDMAP))
 648                        change_prot_numa(vma, start, endvma);
 649                return 1;
 650        }
 651
 652        /* queue pages from current vma */
 653        if (flags & MPOL_MF_VALID)
 654                return 0;
 655        return 1;
 656}
 657
 658/*
 659 * Walk through page tables and collect pages to be migrated.
 660 *
  661 * If pages found in a given range are on a set of nodes (determined by
  662 * @nodes and @flags), they are isolated and queued to the pagelist, which
  663 * is passed via @private.
  664 *
  665 * queue_pages_range() has three possible return values:
  666 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 667 *     specified.
 668 * 0 - queue pages successfully or no misplaced page.
 669 * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified.
 670 */
 671static int
 672queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 673                nodemask_t *nodes, unsigned long flags,
 674                struct list_head *pagelist)
 675{
 676        struct queue_pages qp = {
 677                .pagelist = pagelist,
 678                .flags = flags,
 679                .nmask = nodes,
 680                .prev = NULL,
 681        };
 682        struct mm_walk queue_pages_walk = {
 683                .hugetlb_entry = queue_pages_hugetlb,
 684                .pmd_entry = queue_pages_pte_range,
 685                .test_walk = queue_pages_test_walk,
 686                .mm = mm,
 687                .private = &qp,
 688        };
 689
 690        return walk_page_range(start, end, &queue_pages_walk);
 691}
 692
 693/*
 694 * Apply policy to a single VMA
 695 * This must be called with the mmap_sem held for writing.
 696 */
 697static int vma_replace_policy(struct vm_area_struct *vma,
 698                                                struct mempolicy *pol)
 699{
 700        int err;
 701        struct mempolicy *old;
 702        struct mempolicy *new;
 703
 704        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 705                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 706                 vma->vm_ops, vma->vm_file,
 707                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 708
 709        new = mpol_dup(pol);
 710        if (IS_ERR(new))
 711                return PTR_ERR(new);
 712
 713        if (vma->vm_ops && vma->vm_ops->set_policy) {
 714                err = vma->vm_ops->set_policy(vma, new);
 715                if (err)
 716                        goto err_out;
 717        }
 718
 719        old = vma->vm_policy;
 720        vma->vm_policy = new; /* protected by mmap_sem */
 721        mpol_put(old);
 722
 723        return 0;
 724 err_out:
 725        mpol_put(new);
 726        return err;
 727}
 728
 729/* Step 2: apply policy to a range and do splits. */
 730static int mbind_range(struct mm_struct *mm, unsigned long start,
 731                       unsigned long end, struct mempolicy *new_pol)
 732{
 733        struct vm_area_struct *next;
 734        struct vm_area_struct *prev;
 735        struct vm_area_struct *vma;
 736        int err = 0;
 737        pgoff_t pgoff;
 738        unsigned long vmstart;
 739        unsigned long vmend;
 740
 741        vma = find_vma(mm, start);
 742        if (!vma || vma->vm_start > start)
 743                return -EFAULT;
 744
 745        prev = vma->vm_prev;
 746        if (start > vma->vm_start)
 747                prev = vma;
 748
 749        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 750                next = vma->vm_next;
 751                vmstart = max(start, vma->vm_start);
 752                vmend   = min(end, vma->vm_end);
 753
 754                if (mpol_equal(vma_policy(vma), new_pol))
 755                        continue;
 756
 757                pgoff = vma->vm_pgoff +
 758                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 759                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 760                                 vma->anon_vma, vma->vm_file, pgoff,
 761                                 new_pol, vma->vm_userfaultfd_ctx);
 762                if (prev) {
 763                        vma = prev;
 764                        next = vma->vm_next;
 765                        if (mpol_equal(vma_policy(vma), new_pol))
 766                                continue;
 767                        /* vma_merge() joined vma && vma->next, case 8 */
 768                        goto replace;
 769                }
 770                if (vma->vm_start != vmstart) {
 771                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 772                        if (err)
 773                                goto out;
 774                }
 775                if (vma->vm_end != vmend) {
 776                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 777                        if (err)
 778                                goto out;
 779                }
 780 replace:
 781                err = vma_replace_policy(vma, new_pol);
 782                if (err)
 783                        goto out;
 784        }
 785
 786 out:
 787        return err;
 788}
 789
 790/* Set the process memory policy */
 791static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 792                             nodemask_t *nodes)
 793{
 794        struct mempolicy *new, *old;
 795        NODEMASK_SCRATCH(scratch);
 796        int ret;
 797
 798        if (!scratch)
 799                return -ENOMEM;
 800
 801        new = mpol_new(mode, flags, nodes);
 802        if (IS_ERR(new)) {
 803                ret = PTR_ERR(new);
 804                goto out;
 805        }
 806
 807        task_lock(current);
 808        ret = mpol_set_nodemask(new, nodes, scratch);
 809        if (ret) {
 810                task_unlock(current);
 811                mpol_put(new);
 812                goto out;
 813        }
 814        old = current->mempolicy;
 815        current->mempolicy = new;
 816        if (new && new->mode == MPOL_INTERLEAVE)
 817                current->il_prev = MAX_NUMNODES-1;
 818        task_unlock(current);
 819        mpol_put(old);
 820        ret = 0;
 821out:
 822        NODEMASK_SCRATCH_FREE(scratch);
 823        return ret;
 824}
 825
 826/*
 827 * Return nodemask for policy for get_mempolicy() query
 828 *
 829 * Called with task's alloc_lock held
 830 */
 831static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 832{
 833        nodes_clear(*nodes);
 834        if (p == &default_policy)
 835                return;
 836
 837        switch (p->mode) {
 838        case MPOL_BIND:
 839                /* Fall through */
 840        case MPOL_INTERLEAVE:
 841                *nodes = p->v.nodes;
 842                break;
 843        case MPOL_PREFERRED:
 844                if (!(p->flags & MPOL_F_LOCAL))
 845                        node_set(p->v.preferred_node, *nodes);
 846                /* else return empty node mask for local allocation */
 847                break;
 848        default:
 849                BUG();
 850        }
 851}
 852
 853static int lookup_node(struct mm_struct *mm, unsigned long addr)
 854{
 855        struct page *p;
 856        int err;
 857
 858        int locked = 1;
 859        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 860        if (err >= 0) {
 861                err = page_to_nid(p);
 862                put_page(p);
 863        }
 864        if (locked)
 865                up_read(&mm->mmap_sem);
 866        return err;
 867}
 868
 869/* Retrieve NUMA policy */
 870static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 871                             unsigned long addr, unsigned long flags)
 872{
 873        int err;
 874        struct mm_struct *mm = current->mm;
 875        struct vm_area_struct *vma = NULL;
 876        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 877
 878        if (flags &
 879                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 880                return -EINVAL;
 881
 882        if (flags & MPOL_F_MEMS_ALLOWED) {
 883                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 884                        return -EINVAL;
 885                *policy = 0;    /* just so it's initialized */
 886                task_lock(current);
 887                *nmask  = cpuset_current_mems_allowed;
 888                task_unlock(current);
 889                return 0;
 890        }
 891
 892        if (flags & MPOL_F_ADDR) {
 893                /*
 894                 * Do NOT fall back to task policy if the
 895                 * vma/shared policy at addr is NULL.  We
 896                 * want to return MPOL_DEFAULT in this case.
 897                 */
 898                down_read(&mm->mmap_sem);
 899                vma = find_vma_intersection(mm, addr, addr+1);
 900                if (!vma) {
 901                        up_read(&mm->mmap_sem);
 902                        return -EFAULT;
 903                }
 904                if (vma->vm_ops && vma->vm_ops->get_policy)
 905                        pol = vma->vm_ops->get_policy(vma, addr);
 906                else
 907                        pol = vma->vm_policy;
 908        } else if (addr)
 909                return -EINVAL;
 910
 911        if (!pol)
 912                pol = &default_policy;  /* indicates default behavior */
 913
 914        if (flags & MPOL_F_NODE) {
 915                if (flags & MPOL_F_ADDR) {
 916                        /*
 917                         * Take a refcount on the mpol, lookup_node()
  918                         * will drop the mmap_sem, so after calling
 919                         * lookup_node() only "pol" remains valid, "vma"
 920                         * is stale.
 921                         */
 922                        pol_refcount = pol;
 923                        vma = NULL;
 924                        mpol_get(pol);
 925                        err = lookup_node(mm, addr);
 926                        if (err < 0)
 927                                goto out;
 928                        *policy = err;
 929                } else if (pol == current->mempolicy &&
 930                                pol->mode == MPOL_INTERLEAVE) {
 931                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 932                } else {
 933                        err = -EINVAL;
 934                        goto out;
 935                }
 936        } else {
 937                *policy = pol == &default_policy ? MPOL_DEFAULT :
 938                                                pol->mode;
 939                /*
 940                 * Internal mempolicy flags must be masked off before exposing
 941                 * the policy to userspace.
 942                 */
 943                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 944        }
 945
 946        err = 0;
 947        if (nmask) {
 948                if (mpol_store_user_nodemask(pol)) {
 949                        *nmask = pol->w.user_nodemask;
 950                } else {
 951                        task_lock(current);
 952                        get_policy_nodemask(pol, nmask);
 953                        task_unlock(current);
 954                }
 955        }
 956
 957 out:
 958        mpol_cond_put(pol);
 959        if (vma)
 960                up_read(&mm->mmap_sem);
 961        if (pol_refcount)
 962                mpol_put(pol_refcount);
 963        return err;
 964}
 965
 966#ifdef CONFIG_MIGRATION
 967/*
 968 * page migration, thp tail pages can be passed.
 969 */
 970static int migrate_page_add(struct page *page, struct list_head *pagelist,
 971                                unsigned long flags)
 972{
 973        struct page *head = compound_head(page);
 974        /*
 975         * Avoid migrating a page that is shared with others.
 976         */
 977        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 978                if (!isolate_lru_page(head)) {
 979                        list_add_tail(&head->lru, pagelist);
 980                        mod_node_page_state(page_pgdat(head),
 981                                NR_ISOLATED_ANON + page_is_file_cache(head),
 982                                hpage_nr_pages(head));
 983                } else if (flags & MPOL_MF_STRICT) {
 984                        /*
  985                         * A non-movable page may reach here.  And, there may be
  986                         * temporarily off-LRU pages or non-LRU movable pages.
 987                         * Treat them as unmovable pages since they can't be
 988                         * isolated, so they can't be moved at the moment.  It
 989                         * should return -EIO for this case too.
 990                         */
 991                        return -EIO;
 992                }
 993        }
 994
 995        return 0;
 996}
 997
 998/* page allocation callback for NUMA node migration */
 999struct page *alloc_new_node_page(struct page *page, unsigned long node)
1000{
1001        if (PageHuge(page))
1002                return alloc_huge_page_node(page_hstate(compound_head(page)),
1003                                        node);
1004        else if (PageTransHuge(page)) {
1005                struct page *thp;
1006
1007                thp = alloc_pages_node(node,
1008                        (GFP_TRANSHUGE | __GFP_THISNODE),
1009                        HPAGE_PMD_ORDER);
1010                if (!thp)
1011                        return NULL;
1012                prep_transhuge_page(thp);
1013                return thp;
1014        } else
1015                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1016                                                    __GFP_THISNODE, 0);
1017}
1018
1019/*
1020 * Migrate pages from one node to a target node.
1021 * Returns error or the number of pages not migrated.
1022 */
1023static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1024                           int flags)
1025{
1026        nodemask_t nmask;
1027        LIST_HEAD(pagelist);
1028        int err = 0;
1029
1030        nodes_clear(nmask);
1031        node_set(source, nmask);
1032
1033        /*
1034         * This does not "check" the range but isolates all pages that
1035         * need migration.  Between passing in the full user address
 1036         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1037         */
1038        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1039        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1040                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1041
1042        if (!list_empty(&pagelist)) {
1043                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1044                                        MIGRATE_SYNC, MR_SYSCALL);
1045                if (err)
1046                        putback_movable_pages(&pagelist);
1047        }
1048
1049        return err;
1050}
1051
1052/*
1053 * Move pages between the two nodesets so as to preserve the physical
1054 * layout as much as possible.
1055 *
 1056 * Returns the number of pages that could not be moved.
1057 */
1058int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1059                     const nodemask_t *to, int flags)
1060{
1061        int busy = 0;
1062        int err;
1063        nodemask_t tmp;
1064
1065        err = migrate_prep();
1066        if (err)
1067                return err;
1068
1069        down_read(&mm->mmap_sem);
1070
1071        /*
1072         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1073         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1074         * bit in 'tmp', and return that <source, dest> pair for migration.
1075         * The pair of nodemasks 'to' and 'from' define the map.
1076         *
1077         * If no pair of bits is found that way, fallback to picking some
1078         * pair of 'source' and 'dest' bits that are not the same.  If the
1079         * 'source' and 'dest' bits are the same, this represents a node
1080         * that will be migrating to itself, so no pages need move.
1081         *
1082         * If no bits are left in 'tmp', or if all remaining bits left
1083         * in 'tmp' correspond to the same bit in 'to', return false
1084         * (nothing left to migrate).
1085         *
1086         * This lets us pick a pair of nodes to migrate between, such that
1087         * if possible the dest node is not already occupied by some other
1088         * source node, minimizing the risk of overloading the memory on a
1089         * node that would happen if we migrated incoming memory to a node
 1090         * before migrating outgoing memory from that same node.
1091         *
1092         * A single scan of tmp is sufficient.  As we go, we remember the
1093         * most recent <s, d> pair that moved (s != d).  If we find a pair
1094         * that not only moved, but what's better, moved to an empty slot
1095         * (d is not set in tmp), then we break out then, with that pair.
 1096         * Otherwise when we finish scanning tmp, we at least have the
1097         * most recent <s, d> pair that moved.  If we get all the way through
1098         * the scan of tmp without finding any node that moved, much less
1099         * moved to an empty node, then there is nothing left worth migrating.
1100         */
1101
1102        tmp = *from;
1103        while (!nodes_empty(tmp)) {
 1104                int s, d;
1105                int source = NUMA_NO_NODE;
1106                int dest = 0;
1107
1108                for_each_node_mask(s, tmp) {
1109
1110                        /*
1111                         * do_migrate_pages() tries to maintain the relative
1112                         * node relationship of the pages established between
1113                         * threads and memory areas.
1114                         *
1115                         * However if the number of source nodes is not equal to
1116                         * the number of destination nodes we can not preserve
1117                         * this node relative relationship.  In that case, skip
1118                         * copying memory from a node that is in the destination
1119                         * mask.
1120                         *
1121                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1122         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1123                         */
1124
1125                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1126                                                (node_isset(s, *to)))
1127                                continue;
1128
1129                        d = node_remap(s, *from, *to);
1130                        if (s == d)
1131                                continue;
1132
1133                        source = s;     /* Node moved. Memorize */
1134                        dest = d;
1135
1136                        /* dest not in remaining from nodes? */
1137                        if (!node_isset(dest, tmp))
1138                                break;
1139                }
1140                if (source == NUMA_NO_NODE)
1141                        break;
1142
1143                node_clear(source, tmp);
1144                err = migrate_to_node(mm, source, dest, flags);
1145                if (err > 0)
1146                        busy += err;
1147                if (err < 0)
1148                        break;
1149        }
1150        up_read(&mm->mmap_sem);
1151        if (err < 0)
1152                return err;
1153        return busy;
1154
1155}
1156
1157/*
1158 * Allocate a new page for page migration based on vma policy.
1159 * Start by assuming the page is mapped by the same vma as contains @start.
1160 * Search forward from there, if not.  N.B., this assumes that the
1161 * list of pages handed to migrate_pages()--which is how we get here--
1162 * is in virtual address order.
1163 */
1164static struct page *new_page(struct page *page, unsigned long start)
1165{
1166        struct vm_area_struct *vma;
1167        unsigned long uninitialized_var(address);
1168
1169        vma = find_vma(current->mm, start);
1170        while (vma) {
1171                address = page_address_in_vma(page, vma);
1172                if (address != -EFAULT)
1173                        break;
1174                vma = vma->vm_next;
1175        }
1176
1177        if (PageHuge(page)) {
1178                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1179                                vma, address);
1180        } else if (PageTransHuge(page)) {
1181                struct page *thp;
1182
1183                thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
1184                                address, numa_node_id());
1185                if (!thp)
1186                        return NULL;
1187                prep_transhuge_page(thp);
1188                return thp;
1189        }
1190        /*
1191         * if !vma, alloc_page_vma() will use task or system default policy
1192         */
1193        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1194                        vma, address);
1195}
1196#else
1197
1198static int migrate_page_add(struct page *page, struct list_head *pagelist,
1199                                unsigned long flags)
1200{
1201        return -EIO;
1202}
1203
1204int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1205                     const nodemask_t *to, int flags)
1206{
1207        return -ENOSYS;
1208}
1209
1210static struct page *new_page(struct page *page, unsigned long start)
1211{
1212        return NULL;
1213}
1214#endif
1215
1216static long do_mbind(unsigned long start, unsigned long len,
1217                     unsigned short mode, unsigned short mode_flags,
1218                     nodemask_t *nmask, unsigned long flags)
1219{
1220        struct mm_struct *mm = current->mm;
1221        struct mempolicy *new;
1222        unsigned long end;
1223        int err;
1224        int ret;
1225        LIST_HEAD(pagelist);
1226
1227        if (flags & ~(unsigned long)MPOL_MF_VALID)
1228                return -EINVAL;
1229        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1230                return -EPERM;
1231
1232        if (start & ~PAGE_MASK)
1233                return -EINVAL;
1234
1235        if (mode == MPOL_DEFAULT)
1236                flags &= ~MPOL_MF_STRICT;
1237
1238        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1239        end = start + len;
1240
1241        if (end < start)
1242                return -EINVAL;
1243        if (end == start)
1244                return 0;
1245
1246        new = mpol_new(mode, mode_flags, nmask);
1247        if (IS_ERR(new))
1248                return PTR_ERR(new);
1249
1250        if (flags & MPOL_MF_LAZY)
1251                new->flags |= MPOL_F_MOF;
1252
1253        /*
1254         * If we are using the default policy then operation
1255         * on discontinuous address spaces is okay after all
1256         */
1257        if (!new)
1258                flags |= MPOL_MF_DISCONTIG_OK;
1259
1260        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1261                 start, start + len, mode, mode_flags,
1262                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1263
1264        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1265
1266                err = migrate_prep();
1267                if (err)
1268                        goto mpol_out;
1269        }
1270        {
1271                NODEMASK_SCRATCH(scratch);
1272                if (scratch) {
1273                        down_write(&mm->mmap_sem);
1274                        task_lock(current);
1275                        err = mpol_set_nodemask(new, nmask, scratch);
1276                        task_unlock(current);
1277                        if (err)
1278                                up_write(&mm->mmap_sem);
1279                } else
1280                        err = -ENOMEM;
1281                NODEMASK_SCRATCH_FREE(scratch);
1282        }
1283        if (err)
1284                goto mpol_out;
1285
1286        ret = queue_pages_range(mm, start, end, nmask,
1287                          flags | MPOL_MF_INVERT, &pagelist);
1288
1289        if (ret < 0) {
1290                err = -EIO;
1291                goto up_out;
1292        }
1293
1294        err = mbind_range(mm, start, end, new);
1295
1296        if (!err) {
1297                int nr_failed = 0;
1298
1299                if (!list_empty(&pagelist)) {
1300                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1301                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1302                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1303                        if (nr_failed)
1304                                putback_movable_pages(&pagelist);
1305                }
1306
1307                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1308                        err = -EIO;
1309        } else
1310                putback_movable_pages(&pagelist);
1311
1312up_out:
1313        up_write(&mm->mmap_sem);
1314mpol_out:
1315        mpol_put(new);
1316        return err;
1317}
1318
1319/*
1320 * User space interface with variable sized bitmaps for nodelists.
1321 */
1322
1323/* Copy a node mask from user space. */
1324static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1325                     unsigned long maxnode)
1326{
1327        unsigned long k;
1328        unsigned long t;
1329        unsigned long nlongs;
1330        unsigned long endmask;
1331
1332        --maxnode;
1333        nodes_clear(*nodes);
1334        if (maxnode == 0 || !nmask)
1335                return 0;
1336        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1337                return -EINVAL;
1338
1339        nlongs = BITS_TO_LONGS(maxnode);
1340        if ((maxnode % BITS_PER_LONG) == 0)
1341                endmask = ~0UL;
1342        else
1343                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1344
1345        /*
 1346         * When the user specified more nodes than supported just check
 1347         * if the non-supported part is all zero.
 1348         *
 1349         * If maxnode has more longs than MAX_NUMNODES, check
 1350         * the bits in that area first. And then go through to
 1351         * check the remaining bits, which are equal to or above MAX_NUMNODES.
1352         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1353         */
1354        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1355                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1356                        if (get_user(t, nmask + k))
1357                                return -EFAULT;
1358                        if (k == nlongs - 1) {
1359                                if (t & endmask)
1360                                        return -EINVAL;
1361                        } else if (t)
1362                                return -EINVAL;
1363                }
1364                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1365                endmask = ~0UL;
1366        }
1367
1368        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1369                unsigned long valid_mask = endmask;
1370
1371                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1372                if (get_user(t, nmask + nlongs - 1))
1373                        return -EFAULT;
1374                if (t & valid_mask)
1375                        return -EINVAL;
1376        }
1377
1378        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1379                return -EFAULT;
1380        nodes_addr(*nodes)[nlongs-1] &= endmask;
1381        return 0;
1382}
1383
1384/* Copy a kernel node mask to user space */
1385static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1386                              nodemask_t *nodes)
1387{
1388        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1389        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1390
1391        if (copy > nbytes) {
1392                if (copy > PAGE_SIZE)
1393                        return -EINVAL;
1394                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1395                        return -EFAULT;
1396                copy = nbytes;
1397        }
1398        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1399}
1400
1401static long kernel_mbind(unsigned long start, unsigned long len,
1402                         unsigned long mode, const unsigned long __user *nmask,
1403                         unsigned long maxnode, unsigned int flags)
1404{
1405        nodemask_t nodes;
1406        int err;
1407        unsigned short mode_flags;
1408
1409        mode_flags = mode & MPOL_MODE_FLAGS;
1410        mode &= ~MPOL_MODE_FLAGS;
1411        if (mode >= MPOL_MAX)
1412                return -EINVAL;
1413        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1414            (mode_flags & MPOL_F_RELATIVE_NODES))
1415                return -EINVAL;
1416        err = get_nodes(&nodes, nmask, maxnode);
1417        if (err)
1418                return err;
1419        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1420}
1421
1422SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1423                unsigned long, mode, const unsigned long __user *, nmask,
1424                unsigned long, maxnode, unsigned int, flags)
1425{
1426        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1427}
1428
1429/* Set the process memory policy */
1430static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1431                                 unsigned long maxnode)
1432{
1433        int err;
1434        nodemask_t nodes;
1435        unsigned short flags;
1436
1437        flags = mode & MPOL_MODE_FLAGS;
1438        mode &= ~MPOL_MODE_FLAGS;
1439        if ((unsigned int)mode >= MPOL_MAX)
1440                return -EINVAL;
1441        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1442                return -EINVAL;
1443        err = get_nodes(&nodes, nmask, maxnode);
1444        if (err)
1445                return err;
1446        return do_set_mempolicy(mode, flags, &nodes);
1447}
1448
1449SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1450                unsigned long, maxnode)
1451{
1452        return kernel_set_mempolicy(mode, nmask, maxnode);
1453}
1454
1455static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1456                                const unsigned long __user *old_nodes,
1457                                const unsigned long __user *new_nodes)
1458{
1459        struct mm_struct *mm = NULL;
1460        struct task_struct *task;
1461        nodemask_t task_nodes;
1462        int err;
1463        nodemask_t *old;
1464        nodemask_t *new;
1465        NODEMASK_SCRATCH(scratch);
1466
1467        if (!scratch)
1468                return -ENOMEM;
1469
1470        old = &scratch->mask1;
1471        new = &scratch->mask2;
1472
1473        err = get_nodes(old, old_nodes, maxnode);
1474        if (err)
1475                goto out;
1476
1477        err = get_nodes(new, new_nodes, maxnode);
1478        if (err)
1479                goto out;
1480
1481        /* Find the mm_struct */
1482        rcu_read_lock();
1483        task = pid ? find_task_by_vpid(pid) : current;
1484        if (!task) {
1485                rcu_read_unlock();
1486                err = -ESRCH;
1487                goto out;
1488        }
1489        get_task_struct(task);
1490
1491        err = -EINVAL;
1492
1493        /*
1494         * Check if this process has the right to modify the specified process.
1495         * Use the regular "ptrace_may_access()" checks.
1496         */
1497        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1498                rcu_read_unlock();
1499                err = -EPERM;
1500                goto out_put;
1501        }
1502        rcu_read_unlock();
1503
1504        task_nodes = cpuset_mems_allowed(task);
1505        /* Is the user allowed to access the target nodes? */
1506        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1507                err = -EPERM;
1508                goto out_put;
1509        }
1510
1511        task_nodes = cpuset_mems_allowed(current);
1512        nodes_and(*new, *new, task_nodes);
1513        if (nodes_empty(*new))
1514                goto out_put;
1515
1516        nodes_and(*new, *new, node_states[N_MEMORY]);
1517        if (nodes_empty(*new))
1518                goto out_put;
1519
1520        err = security_task_movememory(task);
1521        if (err)
1522                goto out_put;
1523
1524        mm = get_task_mm(task);
1525        put_task_struct(task);
1526
1527        if (!mm) {
1528                err = -EINVAL;
1529                goto out;
1530        }
1531
1532        err = do_migrate_pages(mm, old, new,
1533                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1534
1535        mmput(mm);
1536out:
1537        NODEMASK_SCRATCH_FREE(scratch);
1538
1539        return err;
1540
1541out_put:
1542        put_task_struct(task);
1543        goto out;
1544
1545}
1546
1547SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1548                const unsigned long __user *, old_nodes,
1549                const unsigned long __user *, new_nodes)
1550{
1551        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1552}
1553
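/*
 * Illustrative userspace sketch (not part of this file): ask the
 * migrate_pages() entry point above to move everything a task currently has
 * on node 0 over to node 1.  The node numbers are assumptions; a pid of 0
 * means "the calling task", as in kernel_migrate_pages() above.
 */
#include <numaif.h>
#include <sys/types.h>

static long move_task_node0_to_node1(pid_t pid)
{
        unsigned long old_nodes = 1UL << 0;
        unsigned long new_nodes = 1UL << 1;

        return migrate_pages(pid, sizeof(unsigned long) * 8,
                             &old_nodes, &new_nodes);
}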
1554
1555/* Retrieve NUMA policy */
1556static int kernel_get_mempolicy(int __user *policy,
1557                                unsigned long __user *nmask,
1558                                unsigned long maxnode,
1559                                unsigned long addr,
1560                                unsigned long flags)
1561{
1562        int err;
1563        int uninitialized_var(pval);
1564        nodemask_t nodes;
1565
1566        if (nmask != NULL && maxnode < nr_node_ids)
1567                return -EINVAL;
1568
1569        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1570
1571        if (err)
1572                return err;
1573
1574        if (policy && put_user(pval, policy))
1575                return -EFAULT;
1576
1577        if (nmask)
1578                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1579
1580        return err;
1581}
1582
1583SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1584                unsigned long __user *, nmask, unsigned long, maxnode,
1585                unsigned long, addr, unsigned long, flags)
1586{
1587        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1588}
1589
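/*
 * Illustrative userspace sketch (not part of this file): use the
 * get_mempolicy() entry point above with MPOL_F_NODE | MPOL_F_ADDR to ask
 * which node currently backs a given address.  It assumes the page is
 * already populated; detailed error handling is left out.
 */
#include <numaif.h>

static int node_backing_address(void *addr)
{
        int nid = -1;

        if (get_mempolicy(&nid, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
                return -1;
        return nid;
}
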
1590#ifdef CONFIG_COMPAT
1591
1592COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1593                       compat_ulong_t __user *, nmask,
1594                       compat_ulong_t, maxnode,
1595                       compat_ulong_t, addr, compat_ulong_t, flags)
1596{
1597        long err;
1598        unsigned long __user *nm = NULL;
1599        unsigned long nr_bits, alloc_size;
1600        DECLARE_BITMAP(bm, MAX_NUMNODES);
1601
1602        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1603        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1604
1605        if (nmask)
1606                nm = compat_alloc_user_space(alloc_size);
1607
1608        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1609
1610        if (!err && nmask) {
1611                unsigned long copy_size;
1612                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1613                err = copy_from_user(bm, nm, copy_size);
1614                /* ensure entire bitmap is zeroed */
1615                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1616                err |= compat_put_bitmap(nmask, bm, nr_bits);
1617        }
1618
1619        return err;
1620}
1621
1622COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1623                       compat_ulong_t, maxnode)
1624{
1625        unsigned long __user *nm = NULL;
1626        unsigned long nr_bits, alloc_size;
1627        DECLARE_BITMAP(bm, MAX_NUMNODES);
1628
1629        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1630        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1631
1632        if (nmask) {
1633                if (compat_get_bitmap(bm, nmask, nr_bits))
1634                        return -EFAULT;
1635                nm = compat_alloc_user_space(alloc_size);
1636                if (copy_to_user(nm, bm, alloc_size))
1637                        return -EFAULT;
1638        }
1639
1640        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1641}
1642
1643COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1644                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1645                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1646{
1647        unsigned long __user *nm = NULL;
1648        unsigned long nr_bits, alloc_size;
1649        nodemask_t bm;
1650
1651        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1652        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1653
1654        if (nmask) {
1655                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1656                        return -EFAULT;
1657                nm = compat_alloc_user_space(alloc_size);
1658                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1659                        return -EFAULT;
1660        }
1661
1662        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1663}
1664
1665COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1666                       compat_ulong_t, maxnode,
1667                       const compat_ulong_t __user *, old_nodes,
1668                       const compat_ulong_t __user *, new_nodes)
1669{
1670        unsigned long __user *old = NULL;
1671        unsigned long __user *new = NULL;
1672        nodemask_t tmp_mask;
1673        unsigned long nr_bits;
1674        unsigned long size;
1675
1676        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1677        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1678        if (old_nodes) {
1679                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1680                        return -EFAULT;
1681                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1682                if (new_nodes)
1683                        new = old + size / sizeof(unsigned long);
1684                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1685                        return -EFAULT;
1686        }
1687        if (new_nodes) {
1688                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1689                        return -EFAULT;
1690                if (new == NULL)
1691                        new = compat_alloc_user_space(size);
1692                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1693                        return -EFAULT;
1694        }
1695        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1696}
1697
1698#endif /* CONFIG_COMPAT */
1699
1700struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1701                                                unsigned long addr)
1702{
1703        struct mempolicy *pol = NULL;
1704
1705        if (vma) {
1706                if (vma->vm_ops && vma->vm_ops->get_policy) {
1707                        pol = vma->vm_ops->get_policy(vma, addr);
1708                } else if (vma->vm_policy) {
1709                        pol = vma->vm_policy;
1710
1711                        /*
1712                         * shmem_alloc_page() passes an MPOL_F_SHARED policy with
1713                         * a pseudo vma whose vma->vm_ops == NULL. Take a reference
1714                         * count on these policies, which will be dropped by
1715                         * mpol_cond_put() later.
1716                         */
1717                        if (mpol_needs_cond_ref(pol))
1718                                mpol_get(pol);
1719                }
1720        }
1721
1722        return pol;
1723}
1724
1725/*
1726 * get_vma_policy(@vma, @addr)
1727 * @vma: virtual memory area whose policy is sought
1728 * @addr: address in @vma for shared policy lookup
1729 *
1730 * Returns effective policy for a VMA at specified address.
1731 * Falls back to current->mempolicy or system default policy, as necessary.
1732 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1733 * count--added by the get_policy() vm_op, as appropriate--to protect against
1734 * freeing by another task.  It is the caller's responsibility to free the
1735 * extra reference for shared policies.
1736 */
1737struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1738                                                unsigned long addr)
1739{
1740        struct mempolicy *pol = __get_vma_policy(vma, addr);
1741
1742        if (!pol)
1743                pol = get_task_policy(current);
1744
1745        return pol;
1746}
1747
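/*
 * Hedged in-kernel sketch (not a caller that exists in this file): the
 * reference rules above boil down to pairing every get_vma_policy() with
 * mpol_cond_put(), since only MPOL_F_SHARED policies carry the extra
 * reference.
 */
static int example_vma_policy_mode(struct vm_area_struct *vma,
                                   unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);
        int mode = pol->mode;

        mpol_cond_put(pol);     /* drops the ref only for shared policies */
        return mode;
}
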
1748bool vma_policy_mof(struct vm_area_struct *vma)
1749{
1750        struct mempolicy *pol;
1751
1752        if (vma->vm_ops && vma->vm_ops->get_policy) {
1753                bool ret = false;
1754
1755                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1756                if (pol && (pol->flags & MPOL_F_MOF))
1757                        ret = true;
1758                mpol_cond_put(pol);
1759
1760                return ret;
1761        }
1762
1763        pol = vma->vm_policy;
1764        if (!pol)
1765                pol = get_task_policy(current);
1766
1767        return pol->flags & MPOL_F_MOF;
1768}
1769
1770static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1771{
1772        enum zone_type dynamic_policy_zone = policy_zone;
1773
1774        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1775
1776        /*
1777         * if policy->v.nodes has movable memory only,
1778         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1779         *
1780                 * policy->v.nodes is intersected with node_states[N_MEMORY],
1781                 * so if the following test fails, it implies that
1782         * policy->v.nodes has movable memory only.
1783         */
1784        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1785                dynamic_policy_zone = ZONE_MOVABLE;
1786
1787        return zone >= dynamic_policy_zone;
1788}
1789
1790/*
1791 * Return a nodemask representing a mempolicy for filtering nodes for
1792 * page allocation
1793 */
1794static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1795{
1796        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1797        if (unlikely(policy->mode == MPOL_BIND) &&
1798                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1799                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1800                return &policy->v.nodes;
1801
1802        return NULL;
1803}
1804
1805/* Return the node id preferred by the given mempolicy, or the given id */
1806static int policy_node(gfp_t gfp, struct mempolicy *policy,
1807                                                                int nd)
1808{
1809        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1810                nd = policy->v.preferred_node;
1811        else {
1812                /*
1813                 * __GFP_THISNODE shouldn't even be used with the bind policy
1814                 * because we might easily break the expectation to stay on the
1815                 * requested node and not break the policy.
1816                 */
1817                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1818        }
1819
1820        return nd;
1821}
1822
1823/* Do dynamic interleaving for a process */
1824static unsigned interleave_nodes(struct mempolicy *policy)
1825{
1826        unsigned next;
1827        struct task_struct *me = current;
1828
1829        next = next_node_in(me->il_prev, policy->v.nodes);
1830        if (next < MAX_NUMNODES)
1831                me->il_prev = next;
1832        return next;
1833}
1834
1835/*
1836 * Depending on the memory policy, provide a node from which to allocate the
1837 * next slab entry.
1838 */
1839unsigned int mempolicy_slab_node(void)
1840{
1841        struct mempolicy *policy;
1842        int node = numa_mem_id();
1843
1844        if (in_interrupt())
1845                return node;
1846
1847        policy = current->mempolicy;
1848        if (!policy || policy->flags & MPOL_F_LOCAL)
1849                return node;
1850
1851        switch (policy->mode) {
1852        case MPOL_PREFERRED:
1853                /*
1854                 * handled MPOL_F_LOCAL above
1855                 */
1856                return policy->v.preferred_node;
1857
1858        case MPOL_INTERLEAVE:
1859                return interleave_nodes(policy);
1860
1861        case MPOL_BIND: {
1862                struct zoneref *z;
1863
1864                /*
1865                 * Follow bind policy behavior and start allocation at the
1866                 * first node.
1867                 */
1868                struct zonelist *zonelist;
1869                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1870                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1871                z = first_zones_zonelist(zonelist, highest_zoneidx,
1872                                                        &policy->v.nodes);
1873                return z->zone ? zone_to_nid(z->zone) : node;
1874        }
1875
1876        default:
1877                BUG();
1878        }
1879}
1880
1881/*
1882 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1883 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1884 * number of present nodes.
1885 */
1886static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1887{
1888        unsigned nnodes = nodes_weight(pol->v.nodes);
1889        unsigned target;
1890        int i;
1891        int nid;
1892
1893        if (!nnodes)
1894                return numa_node_id();
1895        target = (unsigned int)n % nnodes;
1896        nid = first_node(pol->v.nodes);
1897        for (i = 0; i < target; i++)
1898                nid = next_node(nid, pol->v.nodes);
1899        return nid;
1900}
1901
1902/* Determine a node number for interleave */
1903static inline unsigned interleave_nid(struct mempolicy *pol,
1904                 struct vm_area_struct *vma, unsigned long addr, int shift)
1905{
1906        if (vma) {
1907                unsigned long off;
1908
1909                /*
1910                 * for small pages, there is no difference between
1911                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1912                 * for huge pages, since vm_pgoff is in units of small
1913                 * pages, we need to shift off the always 0 bits to get
1914                 * a useful offset.
1915                 */
1916                BUG_ON(shift < PAGE_SHIFT);
1917                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1918                off += (addr - vma->vm_start) >> shift;
1919                return offset_il_node(pol, off);
1920        } else
1921                return interleave_nodes(pol);
1922}
1923
1924#ifdef CONFIG_HUGETLBFS
1925/*
1926 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1927 * @vma: virtual memory area whose policy is sought
1928 * @addr: address in @vma for shared policy lookup and interleave policy
1929 * @gfp_flags: for requested zone
1930 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1931 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1932 *
1933 * Returns a nid suitable for a huge page allocation and a pointer
1934 * to the struct mempolicy for conditional unref after allocation.
1935 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1936 * @nodemask for filtering the zonelist.
1937 *
1938 * Must be protected by read_mems_allowed_begin()
1939 */
1940int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1941                                struct mempolicy **mpol, nodemask_t **nodemask)
1942{
1943        int nid;
1944
1945        *mpol = get_vma_policy(vma, addr);
1946        *nodemask = NULL;       /* assume !MPOL_BIND */
1947
1948        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1949                nid = interleave_nid(*mpol, vma, addr,
1950                                        huge_page_shift(hstate_vma(vma)));
1951        } else {
1952                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1953                if ((*mpol)->mode == MPOL_BIND)
1954                        *nodemask = &(*mpol)->v.nodes;
1955        }
1956        return nid;
1957}
1958
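/*
 * Hedged sketch of the calling convention documented above (the real
 * consumer lives in mm/hugetlb.c): retry the lookup when the cpuset's
 * mems_allowed changes underneath us.  alloc_on() is a purely hypothetical
 * stand-in for whatever the caller does with the nid/nodemask pair.
 */
static struct page *example_huge_alloc(struct vm_area_struct *vma,
                                       unsigned long addr, gfp_t gfp)
{
        struct mempolicy *mpol;
        nodemask_t *nodemask;
        struct page *page;
        unsigned int cpuset_mems_cookie;
        int nid;

        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
                page = alloc_on(nid, nodemask, gfp);    /* hypothetical */
                mpol_cond_put(mpol);
        } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

        return page;
}
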
1959/*
1960 * init_nodemask_of_mempolicy
1961 *
1962 * If the current task's mempolicy is "default" [NULL], return 'false'
1963 * to indicate default policy.  Otherwise, extract the policy nodemask
1964 * for 'bind' or 'interleave' policy into the argument nodemask, or
1965 * initialize the argument nodemask to contain the single node for
1966 * 'preferred' or 'local' policy and return 'true' to indicate presence
1967 * of non-default mempolicy.
1968 *
1969 * We don't bother with reference counting the mempolicy [mpol_get/put]
1970 * because the current task is examining its own mempolicy and a task's
1971 * mempolicy is only ever changed by the task itself.
1972 *
1973 * N.B., it is the caller's responsibility to free a returned nodemask.
1974 */
1975bool init_nodemask_of_mempolicy(nodemask_t *mask)
1976{
1977        struct mempolicy *mempolicy;
1978        int nid;
1979
1980        if (!(mask && current->mempolicy))
1981                return false;
1982
1983        task_lock(current);
1984        mempolicy = current->mempolicy;
1985        switch (mempolicy->mode) {
1986        case MPOL_PREFERRED:
1987                if (mempolicy->flags & MPOL_F_LOCAL)
1988                        nid = numa_node_id();
1989                else
1990                        nid = mempolicy->v.preferred_node;
1991                init_nodemask_of_node(mask, nid);
1992                break;
1993
1994        case MPOL_BIND:
1995                /* Fall through */
1996        case MPOL_INTERLEAVE:
1997                *mask = mempolicy->v.nodes;
1998                break;
1999
2000        default:
2001                BUG();
2002        }
2003        task_unlock(current);
2004
2005        return true;
2006}
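
/*
 * Hedged sketch of a typical consumer of init_nodemask_of_mempolicy():
 * hugetlb-style code that wants to spread a reservation over the nodes the
 * current policy names, falling back to all memory nodes for the default
 * policy.  The nr_pages parameter and the division are assumptions made for
 * illustration only.
 */
static unsigned long example_pages_per_policy_node(unsigned long nr_pages)
{
        unsigned long per_node;
        NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);

        if (mask && init_nodemask_of_mempolicy(mask))
                per_node = nr_pages / nodes_weight(*mask);
        else
                per_node = nr_pages / num_node_state(N_MEMORY);
        NODEMASK_FREE(mask);
        return per_node;
}
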
2007#endif
2008
2009/*
2010 * mempolicy_nodemask_intersects
2011 *
2012 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2013 * policy.  Otherwise, check for intersection between mask and the policy
2014 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2015 * policy, always return true since it may allocate elsewhere on fallback.
2016 *
2017 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2018 */
2019bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2020                                        const nodemask_t *mask)
2021{
2022        struct mempolicy *mempolicy;
2023        bool ret = true;
2024
2025        if (!mask)
2026                return ret;
2027        task_lock(tsk);
2028        mempolicy = tsk->mempolicy;
2029        if (!mempolicy)
2030                goto out;
2031
2032        switch (mempolicy->mode) {
2033        case MPOL_PREFERRED:
2034                /*
2035                 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2036                 * allocate from, they may fallback to other nodes when oom.
2037                 * Thus, it's possible for tsk to have allocated memory from
2038                 * nodes in mask.
2039                 */
2040                break;
2041        case MPOL_BIND:
2042        case MPOL_INTERLEAVE:
2043                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2044                break;
2045        default:
2046                BUG();
2047        }
2048out:
2049        task_unlock(tsk);
2050        return ret;
2051}
2052
2053/* Allocate a page under the interleave policy.
2054   Separate path because it needs to do special accounting. */
2055static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2056                                        unsigned nid)
2057{
2058        struct page *page;
2059
2060        page = __alloc_pages(gfp, order, nid);
2061        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2062        if (!static_branch_likely(&vm_numa_stat_key))
2063                return page;
2064        if (page && page_to_nid(page) == nid) {
2065                preempt_disable();
2066                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2067                preempt_enable();
2068        }
2069        return page;
2070}
2071
2072/**
2073 *      alloc_pages_vma - Allocate a page for a VMA.
2074 *
2075 *      @gfp:
2076 *      %GFP_USER    user allocation,
2077 *      %GFP_KERNEL  kernel allocations,
2078 *      %GFP_HIGHMEM highmem/user allocations,
2079 *      %GFP_FS      allocation should not call back into a file system,
2080 *      %GFP_ATOMIC  don't sleep.
2081 *
2082 *      @order: Order of the GFP allocation.
2083 *      @vma:  Pointer to VMA or NULL if not available.
2084 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2085 *      @node: Which node to prefer for allocation (modulo policy).
2086 *
2087 *      This function allocates a page from the kernel page pool and applies
2088 *      a NUMA policy associated with the VMA or the current process.
2089 *      When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
2090 *      mm_struct of the VMA to prevent it from going away. Should be used for
2091 *      all allocations for pages that will be mapped into user space. Returns
2092 *      NULL when no page can be allocated.
2093 */
2094struct page *
2095alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2096                unsigned long addr, int node)
2097{
2098        struct mempolicy *pol;
2099        struct page *page;
2100        int preferred_nid;
2101        nodemask_t *nmask;
2102
2103        pol = get_vma_policy(vma, addr);
2104
2105        if (pol->mode == MPOL_INTERLEAVE) {
2106                unsigned nid;
2107
2108                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2109                mpol_cond_put(pol);
2110                page = alloc_page_interleave(gfp, order, nid);
2111                goto out;
2112        }
2113
2114        nmask = policy_nodemask(gfp, pol);
2115        preferred_nid = policy_node(gfp, pol, node);
2116        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2117        mpol_cond_put(pol);
2118out:
2119        return page;
2120}
2121EXPORT_SYMBOL(alloc_pages_vma);
2122
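/*
 * Hedged sketch of a fault-path style caller of alloc_pages_vma() (the real
 * callers are the anonymous-fault and THP paths): allocate an order-0
 * highuser page for a user address, honouring the VMA or task policy.
 */
static struct page *example_fault_alloc(struct vm_area_struct *vma,
                                        unsigned long addr)
{
        /* mmap_sem must be held for read, as documented above */
        return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
                               numa_node_id());
}
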
2123/**
2124 *      alloc_pages_current - Allocate pages.
2125 *
2126 *      @gfp:
2127 *              %GFP_USER   user allocation,
2128 *              %GFP_KERNEL kernel allocation,
2129 *              %GFP_HIGHMEM highmem allocation,
2130 *              %GFP_FS     don't call back into a file system.
2131 *              %GFP_ATOMIC don't sleep.
2132 *      @order: Power of two of allocation size in pages. 0 is a single page.
2133 *
2134 *      Allocate a page from the kernel page pool.  When not in
2135 *      interrupt context, apply the current process' NUMA policy.
2136 *      Returns NULL when no page can be allocated.
2137 */
2138struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2139{
2140        struct mempolicy *pol = &default_policy;
2141        struct page *page;
2142
2143        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2144                pol = get_task_policy(current);
2145
2146        /*
2147         * No reference counting needed for current->mempolicy
2148         * nor system default_policy
2149         */
2150        if (pol->mode == MPOL_INTERLEAVE)
2151                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2152        else
2153                page = __alloc_pages_nodemask(gfp, order,
2154                                policy_node(gfp, pol, numa_node_id()),
2155                                policy_nodemask(gfp, pol));
2156
2157        return page;
2158}
2159EXPORT_SYMBOL(alloc_pages_current);
2160
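/*
 * Hedged sketch: alloc_pages_current() is normally reached through the
 * alloc_pages() wrapper.  A kernel-context allocation like this one picks up
 * the calling task's policy unless __GFP_THISNODE is set or we are in
 * interrupt context, exactly as handled above.
 */
static void *example_policied_kernel_page(void)
{
        struct page *page = alloc_pages(GFP_KERNEL, 0);

        return page ? page_address(page) : NULL;
}
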
2161int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2162{
2163        struct mempolicy *pol = mpol_dup(vma_policy(src));
2164
2165        if (IS_ERR(pol))
2166                return PTR_ERR(pol);
2167        dst->vm_policy = pol;
2168        return 0;
2169}
2170
2171/*
2172 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2173 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2174 * with the mems_allowed returned by cpuset_mems_allowed().  This
2175 * keeps mempolicies cpuset relative after its cpuset moves.  See
2176 * further kernel/cpuset.c update_nodemask().
2177 *
2178 * current's mempolicy may be rebound by another task (the task that changes
2179 * the cpuset's mems), so we needn't do rebind work for the current task.
2180 */
2181
2182/* Slow path of a mempolicy duplicate */
2183struct mempolicy *__mpol_dup(struct mempolicy *old)
2184{
2185        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2186
2187        if (!new)
2188                return ERR_PTR(-ENOMEM);
2189
2190        /* task's mempolicy is protected by alloc_lock */
2191        if (old == current->mempolicy) {
2192                task_lock(current);
2193                *new = *old;
2194                task_unlock(current);
2195        } else
2196                *new = *old;
2197
2198        if (current_cpuset_is_being_rebound()) {
2199                nodemask_t mems = cpuset_mems_allowed(current);
2200                mpol_rebind_policy(new, &mems);
2201        }
2202        atomic_set(&new->refcnt, 1);
2203        return new;
2204}
2205
2206/* Slow path of a mempolicy comparison */
2207bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2208{
2209        if (!a || !b)
2210                return false;
2211        if (a->mode != b->mode)
2212                return false;
2213        if (a->flags != b->flags)
2214                return false;
2215        if (mpol_store_user_nodemask(a))
2216                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2217                        return false;
2218
2219        switch (a->mode) {
2220        case MPOL_BIND:
2221                /* Fall through */
2222        case MPOL_INTERLEAVE:
2223                return !!nodes_equal(a->v.nodes, b->v.nodes);
2224        case MPOL_PREFERRED:
2225                /* a's ->flags is the same as b's */
2226                if (a->flags & MPOL_F_LOCAL)
2227                        return true;
2228                return a->v.preferred_node == b->v.preferred_node;
2229        default:
2230                BUG();
2231                return false;
2232        }
2233}
2234
2235/*
2236 * Shared memory backing store policy support.
2237 *
2238 * Remember policies even when nobody has shared memory mapped.
2239 * The policies are kept in Red-Black tree linked from the inode.
2240 * They are protected by the sp->lock rwlock, which should be held
2241 * for any accesses to the tree.
2242 */
2243
2244/*
2245 * lookup first element intersecting start-end.  Caller holds sp->lock for
2246 * reading or for writing
2247 */
2248static struct sp_node *
2249sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2250{
2251        struct rb_node *n = sp->root.rb_node;
2252
2253        while (n) {
2254                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2255
2256                if (start >= p->end)
2257                        n = n->rb_right;
2258                else if (end <= p->start)
2259                        n = n->rb_left;
2260                else
2261                        break;
2262        }
2263        if (!n)
2264                return NULL;
2265        for (;;) {
2266                struct sp_node *w = NULL;
2267                struct rb_node *prev = rb_prev(n);
2268                if (!prev)
2269                        break;
2270                w = rb_entry(prev, struct sp_node, nd);
2271                if (w->end <= start)
2272                        break;
2273                n = prev;
2274        }
2275        return rb_entry(n, struct sp_node, nd);
2276}
2277
2278/*
2279 * Insert a new shared policy into the list.  Caller holds sp->lock for
2280 * writing.
2281 */
2282static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2283{
2284        struct rb_node **p = &sp->root.rb_node;
2285        struct rb_node *parent = NULL;
2286        struct sp_node *nd;
2287
2288        while (*p) {
2289                parent = *p;
2290                nd = rb_entry(parent, struct sp_node, nd);
2291                if (new->start < nd->start)
2292                        p = &(*p)->rb_left;
2293                else if (new->end > nd->end)
2294                        p = &(*p)->rb_right;
2295                else
2296                        BUG();
2297        }
2298        rb_link_node(&new->nd, parent, p);
2299        rb_insert_color(&new->nd, &sp->root);
2300        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2301                 new->policy ? new->policy->mode : 0);
2302}
2303
2304/* Find shared policy intersecting idx */
2305struct mempolicy *
2306mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2307{
2308        struct mempolicy *pol = NULL;
2309        struct sp_node *sn;
2310
2311        if (!sp->root.rb_node)
2312                return NULL;
2313        read_lock(&sp->lock);
2314        sn = sp_lookup(sp, idx, idx+1);
2315        if (sn) {
2316                mpol_get(sn->policy);
2317                pol = sn->policy;
2318        }
2319        read_unlock(&sp->lock);
2320        return pol;
2321}
2322
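/*
 * Hedged sketch of how a shmem-style ->get_policy() consumer might use the
 * lookup above: index the shared tree by page offset and, as get_vma_policy()
 * does, fall back to the task policy on a miss.  The 'info' parameter stands
 * for the inode's shared_policy and is an assumption about the caller.
 */
static struct mempolicy *example_shared_lookup(struct shared_policy *info,
                                               pgoff_t index)
{
        struct mempolicy *pol = mpol_shared_policy_lookup(info, index);

        if (!pol)
                pol = get_task_policy(current);
        return pol;     /* shared policies come back with a reference held */
}
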
2323static void sp_free(struct sp_node *n)
2324{
2325        mpol_put(n->policy);
2326        kmem_cache_free(sn_cache, n);
2327}
2328
2329/**
2330 * mpol_misplaced - check whether current page node is valid in policy
2331 *
2332 * @page: page to be checked
2333 * @vma: vm area where page mapped
2334 * @addr: virtual address where page mapped
2335 *
2336 * Look up the current policy node id for vma,addr and "compare to" the page's
2337 * node id.
2338 *
2339 * Returns:
2340 *      -1      - not misplaced, page is in the right node
2341 *      node    - node id where the page should be
2342 *
2343 * Policy determination "mimics" alloc_page_vma().
2344 * Called from fault path where we know the vma and faulting address.
2345 */
2346int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2347{
2348        struct mempolicy *pol;
2349        struct zoneref *z;
2350        int curnid = page_to_nid(page);
2351        unsigned long pgoff;
2352        int thiscpu = raw_smp_processor_id();
2353        int thisnid = cpu_to_node(thiscpu);
2354        int polnid = NUMA_NO_NODE;
2355        int ret = -1;
2356
2357        pol = get_vma_policy(vma, addr);
2358        if (!(pol->flags & MPOL_F_MOF))
2359                goto out;
2360
2361        switch (pol->mode) {
2362        case MPOL_INTERLEAVE:
2363                pgoff = vma->vm_pgoff;
2364                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2365                polnid = offset_il_node(pol, pgoff);
2366                break;
2367
2368        case MPOL_PREFERRED:
2369                if (pol->flags & MPOL_F_LOCAL)
2370                        polnid = numa_node_id();
2371                else
2372                        polnid = pol->v.preferred_node;
2373                break;
2374
2375        case MPOL_BIND:
2376
2377                /*
2378                 * allows binding to multiple nodes.
2379                 * use current page if in policy nodemask,
2380                 * else select nearest allowed node, if any.
2381                 * If no allowed nodes, use current [!misplaced].
2382                 */
2383                if (node_isset(curnid, pol->v.nodes))
2384                        goto out;
2385                z = first_zones_zonelist(
2386                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2387                                gfp_zone(GFP_HIGHUSER),
2388                                &pol->v.nodes);
2389                polnid = zone_to_nid(z->zone);
2390                break;
2391
2392        default:
2393                BUG();
2394        }
2395
2396        /* Migrate the page towards the node whose CPU is referencing it */
2397        if (pol->flags & MPOL_F_MORON) {
2398                polnid = thisnid;
2399
2400                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2401                        goto out;
2402        }
2403
2404        if (curnid != polnid)
2405                ret = polnid;
2406out:
2407        mpol_cond_put(pol);
2408
2409        return ret;
2410}
2411
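/*
 * Hedged sketch of the NUMA-hinting fault usage described above (the real
 * caller sits in the page-fault code): -1 means "leave the page where it
 * is", anything else is the node the page should migrate towards.  The
 * migrate_misplaced_page() call mirrors what that fault path does.
 */
static void example_numa_hint_fault(struct page *page,
                                    struct vm_area_struct *vma,
                                    unsigned long addr)
{
        int target_nid = mpol_misplaced(page, vma, addr);

        if (target_nid != -1)
                migrate_misplaced_page(page, vma, target_nid);
}
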
2412/*
2413 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2414 * dropped after task->mempolicy is set to NULL so that any allocation done as
2415 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2416 * policy.
2417 */
2418void mpol_put_task_policy(struct task_struct *task)
2419{
2420        struct mempolicy *pol;
2421
2422        task_lock(task);
2423        pol = task->mempolicy;
2424        task->mempolicy = NULL;
2425        task_unlock(task);
2426        mpol_put(pol);
2427}
2428
2429static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2430{
2431        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2432        rb_erase(&n->nd, &sp->root);
2433        sp_free(n);
2434}
2435
2436static void sp_node_init(struct sp_node *node, unsigned long start,
2437                        unsigned long end, struct mempolicy *pol)
2438{
2439        node->start = start;
2440        node->end = end;
2441        node->policy = pol;
2442}
2443
2444static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2445                                struct mempolicy *pol)
2446{
2447        struct sp_node *n;
2448        struct mempolicy *newpol;
2449
2450        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2451        if (!n)
2452                return NULL;
2453
2454        newpol = mpol_dup(pol);
2455        if (IS_ERR(newpol)) {
2456                kmem_cache_free(sn_cache, n);
2457                return NULL;
2458        }
2459        newpol->flags |= MPOL_F_SHARED;
2460        sp_node_init(n, start, end, newpol);
2461
2462        return n;
2463}
2464
2465/* Replace a policy range. */
2466static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2467                                 unsigned long end, struct sp_node *new)
2468{
2469        struct sp_node *n;
2470        struct sp_node *n_new = NULL;
2471        struct mempolicy *mpol_new = NULL;
2472        int ret = 0;
2473
2474restart:
2475        write_lock(&sp->lock);
2476        n = sp_lookup(sp, start, end);
2477        /* Take care of old policies in the same range. */
2478        while (n && n->start < end) {
2479                struct rb_node *next = rb_next(&n->nd);
2480                if (n->start >= start) {
2481                        if (n->end <= end)
2482                                sp_delete(sp, n);
2483                        else
2484                                n->start = end;
2485                } else {
2486                        /* Old policy spanning whole new range. */
2487                        if (n->end > end) {
2488                                if (!n_new)
2489                                        goto alloc_new;
2490
2491                                *mpol_new = *n->policy;
2492                                atomic_set(&mpol_new->refcnt, 1);
2493                                sp_node_init(n_new, end, n->end, mpol_new);
2494                                n->end = start;
2495                                sp_insert(sp, n_new);
2496                                n_new = NULL;
2497                                mpol_new = NULL;
2498                                break;
2499                        } else
2500                                n->end = start;
2501                }
2502                if (!next)
2503                        break;
2504                n = rb_entry(next, struct sp_node, nd);
2505        }
2506        if (new)
2507                sp_insert(sp, new);
2508        write_unlock(&sp->lock);
2509        ret = 0;
2510
2511err_out:
2512        if (mpol_new)
2513                mpol_put(mpol_new);
2514        if (n_new)
2515                kmem_cache_free(sn_cache, n_new);
2516
2517        return ret;
2518
2519alloc_new:
2520        write_unlock(&sp->lock);
2521        ret = -ENOMEM;
2522        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2523        if (!n_new)
2524                goto err_out;
2525        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2526        if (!mpol_new)
2527                goto err_out;
2528        goto restart;
2529}
2530
2531/**
2532 * mpol_shared_policy_init - initialize shared policy for inode
2533 * @sp: pointer to inode shared policy
2534 * @mpol:  struct mempolicy to install
2535 *
2536 * Install non-NULL @mpol in inode's shared policy rb-tree.
2537 * On entry, the current task has a reference on a non-NULL @mpol.
2538 * This must be released on exit.
2539 * This is called at get_inode() time, so we can use GFP_KERNEL.
2540 */
2541void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2542{
2543        int ret;
2544
2545        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2546        rwlock_init(&sp->lock);
2547
2548        if (mpol) {
2549                struct vm_area_struct pvma;
2550                struct mempolicy *new;
2551                NODEMASK_SCRATCH(scratch);
2552
2553                if (!scratch)
2554                        goto put_mpol;
2555                /* contextualize the tmpfs mount point mempolicy */
2556                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2557                if (IS_ERR(new))
2558                        goto free_scratch; /* no valid nodemask intersection */
2559
2560                task_lock(current);
2561                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2562                task_unlock(current);
2563                if (ret)
2564                        goto put_new;
2565
2566                /* Create pseudo-vma that contains just the policy */
2567                vma_init(&pvma, NULL);
2568                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2569                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2570
2571put_new:
2572                mpol_put(new);                  /* drop initial ref */
2573free_scratch:
2574                NODEMASK_SCRATCH_FREE(scratch);
2575put_mpol:
2576                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2577        }
2578}
2579
2580int mpol_set_shared_policy(struct shared_policy *info,
2581                        struct vm_area_struct *vma, struct mempolicy *npol)
2582{
2583        int err;
2584        struct sp_node *new = NULL;
2585        unsigned long sz = vma_pages(vma);
2586
2587        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2588                 vma->vm_pgoff,
2589                 sz, npol ? npol->mode : -1,
2590                 npol ? npol->flags : -1,
2591                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2592
2593        if (npol) {
2594                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2595                if (!new)
2596                        return -ENOMEM;
2597        }
2598        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2599        if (err && new)
2600                sp_free(new);
2601        return err;
2602}
2603
2604/* Free a backing policy store on inode delete. */
2605void mpol_free_shared_policy(struct shared_policy *p)
2606{
2607        struct sp_node *n;
2608        struct rb_node *next;
2609
2610        if (!p->root.rb_node)
2611                return;
2612        write_lock(&p->lock);
2613        next = rb_first(&p->root);
2614        while (next) {
2615                n = rb_entry(next, struct sp_node, nd);
2616                next = rb_next(&n->nd);
2617                sp_delete(p, n);
2618        }
2619        write_unlock(&p->lock);
2620}
2621
2622#ifdef CONFIG_NUMA_BALANCING
2623static int __initdata numabalancing_override;
2624
2625static void __init check_numabalancing_enable(void)
2626{
2627        bool numabalancing_default = false;
2628
2629        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2630                numabalancing_default = true;
2631
2632        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2633        if (numabalancing_override)
2634                set_numabalancing_state(numabalancing_override == 1);
2635
2636        if (num_online_nodes() > 1 && !numabalancing_override) {
2637                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2638                        numabalancing_default ? "Enabling" : "Disabling");
2639                set_numabalancing_state(numabalancing_default);
2640        }
2641}
2642
2643static int __init setup_numabalancing(char *str)
2644{
2645        int ret = 0;
2646        if (!str)
2647                goto out;
2648
2649        if (!strcmp(str, "enable")) {
2650                numabalancing_override = 1;
2651                ret = 1;
2652        } else if (!strcmp(str, "disable")) {
2653                numabalancing_override = -1;
2654                ret = 1;
2655        }
2656out:
2657        if (!ret)
2658                pr_warn("Unable to parse numa_balancing=\n");
2659
2660        return ret;
2661}
2662__setup("numa_balancing=", setup_numabalancing);
2663#else
2664static inline void __init check_numabalancing_enable(void)
2665{
2666}
2667#endif /* CONFIG_NUMA_BALANCING */
2668
2669/* assumes fs == KERNEL_DS */
2670void __init numa_policy_init(void)
2671{
2672        nodemask_t interleave_nodes;
2673        unsigned long largest = 0;
2674        int nid, prefer = 0;
2675
2676        policy_cache = kmem_cache_create("numa_policy",
2677                                         sizeof(struct mempolicy),
2678                                         0, SLAB_PANIC, NULL);
2679
2680        sn_cache = kmem_cache_create("shared_policy_node",
2681                                     sizeof(struct sp_node),
2682                                     0, SLAB_PANIC, NULL);
2683
2684        for_each_node(nid) {
2685                preferred_node_policy[nid] = (struct mempolicy) {
2686                        .refcnt = ATOMIC_INIT(1),
2687                        .mode = MPOL_PREFERRED,
2688                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2689                        .v = { .preferred_node = nid, },
2690                };
2691        }
2692
2693        /*
2694         * Set interleaving policy for system init. Interleaving is only
2695         * enabled across suitably sized nodes (default is >= 16MB), falling
2696         * back to the largest node if they're all smaller.
2697         */
2698        nodes_clear(interleave_nodes);
2699        for_each_node_state(nid, N_MEMORY) {
2700                unsigned long total_pages = node_present_pages(nid);
2701
2702                /* Preserve the largest node */
2703                if (largest < total_pages) {
2704                        largest = total_pages;
2705                        prefer = nid;
2706                }
2707
2708                /* Interleave this node? */
2709                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2710                        node_set(nid, interleave_nodes);
2711        }
2712
2713        /* All too small, use the largest */
2714        if (unlikely(nodes_empty(interleave_nodes)))
2715                node_set(prefer, interleave_nodes);
2716
2717        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2718                pr_err("%s: interleaving failed\n", __func__);
2719
2720        check_numabalancing_enable();
2721}
2722
2723/* Reset policy of current process to default */
2724void numa_default_policy(void)
2725{
2726        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2727}
2728
2729/*
2730 * Parse and format mempolicy from/to strings
2731 */
2732
2733/*
2734 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2735 */
2736static const char * const policy_modes[] =
2737{
2738        [MPOL_DEFAULT]    = "default",
2739        [MPOL_PREFERRED]  = "prefer",
2740        [MPOL_BIND]       = "bind",
2741        [MPOL_INTERLEAVE] = "interleave",
2742        [MPOL_LOCAL]      = "local",
2743};
2744
2745
2746#ifdef CONFIG_TMPFS
2747/**
2748 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2749 * @str:  string containing mempolicy to parse
2750 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2751 *
2752 * Format of input:
2753 *      <mode>[=<flags>][:<nodelist>]
2754 *
2755 * On success, returns 0, else 1
2756 */
2757int mpol_parse_str(char *str, struct mempolicy **mpol)
2758{
2759        struct mempolicy *new = NULL;
2760        unsigned short mode_flags;
2761        nodemask_t nodes;
2762        char *nodelist = strchr(str, ':');
2763        char *flags = strchr(str, '=');
2764        int err = 1, mode;
2765
2766        if (nodelist) {
2767                /* NUL-terminate mode or flags string */
2768                *nodelist++ = '\0';
2769                if (nodelist_parse(nodelist, nodes))
2770                        goto out;
2771                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2772                        goto out;
2773        } else
2774                nodes_clear(nodes);
2775
2776        if (flags)
2777                *flags++ = '\0';        /* terminate mode string */
2778
2779        mode = match_string(policy_modes, MPOL_MAX, str);
2780        if (mode < 0)
2781                goto out;
2782
2783        switch (mode) {
2784        case MPOL_PREFERRED:
2785                /*
2786                 * Insist on a nodelist of one node only
2787                 */
2788                if (nodelist) {
2789                        char *rest = nodelist;
2790                        while (isdigit(*rest))
2791                                rest++;
2792                        if (*rest)
2793                                goto out;
2794                }
2795                break;
2796        case MPOL_INTERLEAVE:
2797                /*
2798                 * Default to online nodes with memory if no nodelist
2799                 */
2800                if (!nodelist)
2801                        nodes = node_states[N_MEMORY];
2802                break;
2803        case MPOL_LOCAL:
2804                /*
2805                 * Don't allow a nodelist;  mpol_new() checks flags
2806                 */
2807                if (nodelist)
2808                        goto out;
2809                mode = MPOL_PREFERRED;
2810                break;
2811        case MPOL_DEFAULT:
2812                /*
2813                 * Insist on an empty nodelist
2814                 */
2815                if (!nodelist)
2816                        err = 0;
2817                goto out;
2818        case MPOL_BIND:
2819                /*
2820                 * Insist on a nodelist
2821                 */
2822                if (!nodelist)
2823                        goto out;
2824        }
2825
2826        mode_flags = 0;
2827        if (flags) {
2828                /*
2829                 * Currently, we only support two mutually exclusive
2830                 * mode flags.
2831                 */
2832                if (!strcmp(flags, "static"))
2833                        mode_flags |= MPOL_F_STATIC_NODES;
2834                else if (!strcmp(flags, "relative"))
2835                        mode_flags |= MPOL_F_RELATIVE_NODES;
2836                else
2837                        goto out;
2838        }
2839
2840        new = mpol_new(mode, mode_flags, &nodes);
2841        if (IS_ERR(new))
2842                goto out;
2843
2844        /*
2845         * Save nodes for mpol_to_str() to show the tmpfs mount options
2846         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2847         */
2848        if (mode != MPOL_PREFERRED)
2849                new->v.nodes = nodes;
2850        else if (nodelist)
2851                new->v.preferred_node = first_node(nodes);
2852        else
2853                new->flags |= MPOL_F_LOCAL;
2854
2855        /*
2856         * Save nodes for contextualization: this will be used to "clone"
2857         * the mempolicy in a specific context [cpuset] at a later time.
2858         */
2859        new->w.user_nodemask = nodes;
2860
2861        err = 0;
2862
2863out:
2864        /* Restore string for error message */
2865        if (nodelist)
2866                *--nodelist = ':';
2867        if (flags)
2868                *--flags = '=';
2869        if (!err)
2870                *mpol = new;
2871        return err;
2872}
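
/*
 * Hedged sketch of the tmpfs mount-option usage described above: the buffer
 * contents and the error handling are assumptions for illustration; only the
 * "<mode>[=<flags>][:<nodelist>]" format comes from this file.
 */
static struct mempolicy *example_parse_mount_mpol(void)
{
        char opt[] = "interleave:0-3";          /* e.g. mpol=interleave:0-3 */
        struct mempolicy *mpol = NULL;

        if (mpol_parse_str(opt, &mpol))
                return NULL;                    /* parse error */
        return mpol;                            /* caller must mpol_put() */
}
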
2873#endif /* CONFIG_TMPFS */
2874
2875/**
2876 * mpol_to_str - format a mempolicy structure for printing
2877 * @buffer:  to contain formatted mempolicy string
2878 * @maxlen:  length of @buffer
2879 * @pol:  pointer to mempolicy to be formatted
2880 *
2881 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2882 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2883 * longest flag, "relative", and to display at least a few node ids.
2884 */
2885void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2886{
2887        char *p = buffer;
2888        nodemask_t nodes = NODE_MASK_NONE;
2889        unsigned short mode = MPOL_DEFAULT;
2890        unsigned short flags = 0;
2891
2892        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2893                mode = pol->mode;
2894                flags = pol->flags;
2895        }
2896
2897        switch (mode) {
2898        case MPOL_DEFAULT:
2899                break;
2900        case MPOL_PREFERRED:
2901                if (flags & MPOL_F_LOCAL)
2902                        mode = MPOL_LOCAL;
2903                else
2904                        node_set(pol->v.preferred_node, nodes);
2905                break;
2906        case MPOL_BIND:
2907        case MPOL_INTERLEAVE:
2908                nodes = pol->v.nodes;
2909                break;
2910        default:
2911                WARN_ON_ONCE(1);
2912                snprintf(p, maxlen, "unknown");
2913                return;
2914        }
2915
2916        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2917
2918        if (flags & MPOL_MODE_FLAGS) {
2919                p += snprintf(p, buffer + maxlen - p, "=");
2920
2921                /*
2922                 * Currently, the only defined flags are mutually exclusive
2923                 */
2924                if (flags & MPOL_F_STATIC_NODES)
2925                        p += snprintf(p, buffer + maxlen - p, "static");
2926                else if (flags & MPOL_F_RELATIVE_NODES)
2927                        p += snprintf(p, buffer + maxlen - p, "relative");
2928        }
2929
2930        if (!nodes_empty(nodes))
2931                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2932                               nodemask_pr_args(&nodes));
2933}
2934
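/*
 * Hedged sketch of the formatting direction: this mirrors the way
 * /proc/<pid>/numa_maps style output is produced.  The 64-byte buffer is an
 * assumption; the comment above only recommends a @maxlen of at least 32.
 */
static void example_print_task_policy(void)
{
        char buf[64];

        task_lock(current);
        mpol_to_str(buf, sizeof(buf), current->mempolicy);
        task_unlock(current);
        pr_debug("current mempolicy: %s\n", buf);
}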