linux/mm/mempolicy.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
    8 * NUMA policy allows the user to give hints about which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
   25 * to the last. It would be better if bind truly restricted
   26 * the allocation to the given set of memory nodes.
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
   31 *                but useful to set in a VMA when you have a non-default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
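/*
 * For orientation, a user-space sketch (not part of this file) of how the
 * policies above are typically selected, assuming the libnuma <numaif.h>
 * wrappers for the set_mempolicy(2) and mbind(2) system calls; addr and
 * length below are placeholder variables.
 *
 * Process policy, interleaving this task's future allocations over
 * nodes 0 and 1:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 * VMA policy, binding an existing mapping to node 0 only:
 *
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, length, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */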
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
  109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
  115/* Highest zone. A specific allocation for a zone below that is not
  116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132        struct mempolicy *pol = p->mempolicy;
 133        int node;
 134
 135        if (pol)
 136                return pol;
 137
 138        node = numa_node_id();
 139        if (node != NUMA_NO_NODE) {
 140                pol = &preferred_node_policy[node];
 141                /* preferred_node_policy is not initialised early in boot */
 142                if (pol->mode)
 143                        return pol;
 144        }
 145
 146        return &default_policy;
 147}
 148
 149static const struct mempolicy_operations {
 150        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 151        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 152} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156        return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                   const nodemask_t *rel)
 161{
 162        nodemask_t tmp;
 163        nodes_fold(tmp, *orig, nodes_weight(*rel));
 164        nodes_onto(*ret, tmp, *rel);
 165}
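/*
 * For illustration of the relative-nodes mapping above: with
 * *orig = {0,2} and *rel = {4,5,6}, nodes_fold() wraps orig modulo
 * nodes_weight(*rel) = 3, leaving {0,2}, and nodes_onto() then maps
 * relative bit 0 onto the first node in rel and relative bit 2 onto the
 * third, so *ret = {4,6}.
 */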
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169        if (nodes_empty(*nodes))
 170                return -EINVAL;
 171        pol->v.nodes = *nodes;
 172        return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!nodes)
 178                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179        else if (nodes_empty(*nodes))
 180                return -EINVAL;                 /*  no allowed nodes */
 181        else
 182                pol->v.preferred_node = first_node(*nodes);
 183        return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (nodes_empty(*nodes))
 189                return -EINVAL;
 190        pol->v.nodes = *nodes;
 191        return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
 201 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206        int ret;
 207
 208        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209        if (pol == NULL)
 210                return 0;
 211        /* Check N_MEMORY */
 212        nodes_and(nsc->mask1,
 213                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215        VM_BUG_ON(!nodes);
 216        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                nodes = NULL;   /* explicit local allocation */
 218        else {
 219                if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                else
 222                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                if (mpol_store_user_nodemask(pol))
 225                        pol->w.user_nodemask = *nodes;
 226                else
 227                        pol->w.cpuset_mems_allowed =
 228                                                cpuset_current_mems_allowed;
 229        }
 230
 231        if (nodes)
 232                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233        else
 234                ret = mpol_ops[pol->mode].create(pol, NULL);
 235        return ret;
 236}
 237
 238/*
  239 * This function just creates a new policy, does some checks and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                  nodemask_t *nodes)
 244{
 245        struct mempolicy *policy;
 246
 247        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250        if (mode == MPOL_DEFAULT) {
 251                if (nodes && !nodes_empty(*nodes))
 252                        return ERR_PTR(-EINVAL);
 253                return NULL;
 254        }
 255        VM_BUG_ON(!nodes);
 256
 257        /*
 258         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260         * All other modes require a valid pointer to a non-empty nodemask.
 261         */
 262        if (mode == MPOL_PREFERRED) {
 263                if (nodes_empty(*nodes)) {
 264                        if (((flags & MPOL_F_STATIC_NODES) ||
 265                             (flags & MPOL_F_RELATIVE_NODES)))
 266                                return ERR_PTR(-EINVAL);
 267                }
 268        } else if (mode == MPOL_LOCAL) {
 269                if (!nodes_empty(*nodes) ||
 270                    (flags & MPOL_F_STATIC_NODES) ||
 271                    (flags & MPOL_F_RELATIVE_NODES))
 272                        return ERR_PTR(-EINVAL);
 273                mode = MPOL_PREFERRED;
 274        } else if (nodes_empty(*nodes))
 275                return ERR_PTR(-EINVAL);
 276        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277        if (!policy)
 278                return ERR_PTR(-ENOMEM);
 279        atomic_set(&policy->refcnt, 1);
 280        policy->mode = mode;
 281        policy->flags = flags;
 282
 283        return policy;
 284}
 285
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289        if (!atomic_dec_and_test(&p->refcnt))
 290                return;
 291        kmem_cache_free(policy_cache, p);
 292}
 293
 294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 295{
 296}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300        nodemask_t tmp;
 301
 302        if (pol->flags & MPOL_F_STATIC_NODES)
 303                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306        else {
  307                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 308                                                                *nodes);
 309                pol->w.cpuset_mems_allowed = *nodes;
 310        }
 311
 312        if (nodes_empty(tmp))
 313                tmp = *nodes;
 314
 315        pol->v.nodes = tmp;
 316}
 317
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                const nodemask_t *nodes)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES) {
 324                int node = first_node(pol->w.user_nodemask);
 325
 326                if (node_isset(node, *nodes)) {
 327                        pol->v.preferred_node = node;
 328                        pol->flags &= ~MPOL_F_LOCAL;
 329                } else
 330                        pol->flags |= MPOL_F_LOCAL;
 331        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                pol->v.preferred_node = first_node(tmp);
 334        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                   pol->w.cpuset_mems_allowed,
 337                                                   *nodes);
 338                pol->w.cpuset_mems_allowed = *nodes;
 339        }
 340}
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351        if (!pol)
 352                return;
 353        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 354            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                return;
 356
 357        mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires task
 362 * pointer, and updates task mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369        mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380        struct vm_area_struct *vma;
 381
 382        down_write(&mm->mmap_sem);
 383        for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                mpol_rebind_policy(vma->vm_policy, new);
 385        up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389        [MPOL_DEFAULT] = {
 390                .rebind = mpol_rebind_default,
 391        },
 392        [MPOL_INTERLEAVE] = {
 393                .create = mpol_new_interleave,
 394                .rebind = mpol_rebind_nodemask,
 395        },
 396        [MPOL_PREFERRED] = {
 397                .create = mpol_new_preferred,
 398                .rebind = mpol_rebind_preferred,
 399        },
 400        [MPOL_BIND] = {
 401                .create = mpol_new_bind,
 402                .rebind = mpol_rebind_nodemask,
 403        },
 404};
 405
 406static int migrate_page_add(struct page *page, struct list_head *pagelist,
 407                                unsigned long flags);
 408
 409struct queue_pages {
 410        struct list_head *pagelist;
 411        unsigned long flags;
 412        nodemask_t *nmask;
 413        struct vm_area_struct *prev;
 414};
 415
 416/*
 417 * Check if the page's nid is in qp->nmask.
 418 *
  419 * If MPOL_MF_INVERT is set in qp->flags, the check is inverted: return
  420 * true only when the nid is *not* in qp->nmask.
 421 */
 422static inline bool queue_pages_required(struct page *page,
 423                                        struct queue_pages *qp)
 424{
 425        int nid = page_to_nid(page);
 426        unsigned long flags = qp->flags;
 427
 428        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429}
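/*
 * For example, with qp->nmask = {0,1} and MPOL_MF_INVERT set, a page on
 * node 2 satisfies the check while a page on node 0 does not; without
 * MPOL_MF_INVERT the opposite holds.
 */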
 430
 431/*
 432 * queue_pages_pmd() has four possible return values:
 433 * 0 - pages are placed on the right node or queued successfully.
  434 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
  435 *     specified.
  436 * 2 - THP was split.
  437 * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was specified
  438 *        and an existing page was already on a node that does not follow
  439 *        the policy.
 440 */
 441static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 442                                unsigned long end, struct mm_walk *walk)
 443{
 444        int ret = 0;
 445        struct page *page;
 446        struct queue_pages *qp = walk->private;
 447        unsigned long flags;
 448
 449        if (unlikely(is_pmd_migration_entry(*pmd))) {
 450                ret = -EIO;
 451                goto unlock;
 452        }
 453        page = pmd_page(*pmd);
 454        if (is_huge_zero_page(page)) {
 455                spin_unlock(ptl);
 456                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 457                ret = 2;
 458                goto out;
 459        }
 460        if (!queue_pages_required(page, qp))
 461                goto unlock;
 462
 463        flags = qp->flags;
 464        /* go to thp migration */
 465        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 466                if (!vma_migratable(walk->vma) ||
 467                    migrate_page_add(page, qp->pagelist, flags)) {
 468                        ret = 1;
 469                        goto unlock;
 470                }
 471        } else
 472                ret = -EIO;
 473unlock:
 474        spin_unlock(ptl);
 475out:
 476        return ret;
 477}
 478
 479/*
  480 * Scan through the page range, checking whether pages satisfy the required
  481 * conditions, and move them to the pagelist if they do.
  482 *
  483 * queue_pages_pte_range() has three possible return values:
  484 * 0 - pages are placed on the right node or queued successfully.
  485 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 486 *     specified.
 487 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 488 *        on a node that does not follow the policy.
 489 */
 490static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 491                        unsigned long end, struct mm_walk *walk)
 492{
 493        struct vm_area_struct *vma = walk->vma;
 494        struct page *page;
 495        struct queue_pages *qp = walk->private;
 496        unsigned long flags = qp->flags;
 497        int ret;
 498        bool has_unmovable = false;
 499        pte_t *pte;
 500        spinlock_t *ptl;
 501
 502        ptl = pmd_trans_huge_lock(pmd, vma);
 503        if (ptl) {
 504                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 505                if (ret != 2)
 506                        return ret;
 507        }
 508        /* THP was split, fall through to pte walk */
 509
 510        if (pmd_trans_unstable(pmd))
 511                return 0;
 512
 513        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 514        for (; addr != end; pte++, addr += PAGE_SIZE) {
 515                if (!pte_present(*pte))
 516                        continue;
 517                page = vm_normal_page(vma, addr, *pte);
 518                if (!page)
 519                        continue;
 520                /*
 521                 * vm_normal_page() filters out zero pages, but there might
 522                 * still be PageReserved pages to skip, perhaps in a VDSO.
 523                 */
 524                if (PageReserved(page))
 525                        continue;
 526                if (!queue_pages_required(page, qp))
 527                        continue;
 528                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 529                        /* MPOL_MF_STRICT must be specified if we get here */
 530                        if (!vma_migratable(vma)) {
 531                                has_unmovable = true;
 532                                break;
 533                        }
 534
 535                        /*
  536                         * Do not abort immediately since there may be
  537                         * pages temporarily off the LRU in the range.  We
  538                         * still need to migrate the other LRU pages.
 539                         */
 540                        if (migrate_page_add(page, qp->pagelist, flags))
 541                                has_unmovable = true;
 542                } else
 543                        break;
 544        }
 545        pte_unmap_unlock(pte - 1, ptl);
 546        cond_resched();
 547
 548        if (has_unmovable)
 549                return 1;
 550
 551        return addr != end ? -EIO : 0;
 552}
 553
 554static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 555                               unsigned long addr, unsigned long end,
 556                               struct mm_walk *walk)
 557{
 558#ifdef CONFIG_HUGETLB_PAGE
 559        struct queue_pages *qp = walk->private;
 560        unsigned long flags = qp->flags;
 561        struct page *page;
 562        spinlock_t *ptl;
 563        pte_t entry;
 564
 565        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 566        entry = huge_ptep_get(pte);
 567        if (!pte_present(entry))
 568                goto unlock;
 569        page = pte_page(entry);
 570        if (!queue_pages_required(page, qp))
 571                goto unlock;
 572        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 573        if (flags & (MPOL_MF_MOVE_ALL) ||
 574            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 575                isolate_huge_page(page, qp->pagelist);
 576unlock:
 577        spin_unlock(ptl);
 578#else
 579        BUG();
 580#endif
 581        return 0;
 582}
 583
 584#ifdef CONFIG_NUMA_BALANCING
 585/*
  586 * This is used to mark a range of virtual addresses as inaccessible.
  587 * These are later cleared by a NUMA hinting fault. Depending on these
  588 * faults, pages may be migrated for better NUMA placement.
  589 *
  590 * This assumes that NUMA faults are handled using PROT_NONE. If
 591 * an architecture makes a different choice, it will need further
 592 * changes to the core.
 593 */
 594unsigned long change_prot_numa(struct vm_area_struct *vma,
 595                        unsigned long addr, unsigned long end)
 596{
 597        int nr_updated;
 598
 599        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 600        if (nr_updated)
 601                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 602
 603        return nr_updated;
 604}
 605#else
 606static unsigned long change_prot_numa(struct vm_area_struct *vma,
 607                        unsigned long addr, unsigned long end)
 608{
 609        return 0;
 610}
 611#endif /* CONFIG_NUMA_BALANCING */
 612
 613static int queue_pages_test_walk(unsigned long start, unsigned long end,
 614                                struct mm_walk *walk)
 615{
 616        struct vm_area_struct *vma = walk->vma;
 617        struct queue_pages *qp = walk->private;
 618        unsigned long endvma = vma->vm_end;
 619        unsigned long flags = qp->flags;
 620
 621        /*
  622         * Need to check MPOL_MF_STRICT to return -EIO if possible,
  623         * regardless of vma_migratable
 624         */
 625        if (!vma_migratable(vma) &&
 626            !(flags & MPOL_MF_STRICT))
 627                return 1;
 628
 629        if (endvma > end)
 630                endvma = end;
 631        if (vma->vm_start > start)
 632                start = vma->vm_start;
 633
 634        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 635                if (!vma->vm_next && vma->vm_end < end)
 636                        return -EFAULT;
 637                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 638                        return -EFAULT;
 639        }
 640
 641        qp->prev = vma;
 642
 643        if (flags & MPOL_MF_LAZY) {
 644                /* Similar to task_numa_work, skip inaccessible VMAs */
 645                if (!is_vm_hugetlb_page(vma) &&
 646                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 647                        !(vma->vm_flags & VM_MIXEDMAP))
 648                        change_prot_numa(vma, start, endvma);
 649                return 1;
 650        }
 651
 652        /* queue pages from current vma */
 653        if (flags & MPOL_MF_VALID)
 654                return 0;
 655        return 1;
 656}
 657
 658static const struct mm_walk_ops queue_pages_walk_ops = {
 659        .hugetlb_entry          = queue_pages_hugetlb,
 660        .pmd_entry              = queue_pages_pte_range,
 661        .test_walk              = queue_pages_test_walk,
 662};
 663
 664/*
 665 * Walk through page tables and collect pages to be migrated.
 666 *
  667 * If pages found in a given range are on a set of nodes (determined by
  668 * @nodes and @flags), they are isolated and queued on the pagelist, which
  669 * is passed via @private.
  670 *
  671 * queue_pages_range() has three possible return values:
  672 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
  673 *     specified.
  674 * 0 - queue pages successfully or no misplaced page.
  675 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
  676 *         memory range specified by nodemask and maxnode points outside
  677 *         your accessible address space (-EFAULT)
 678 */
 679static int
 680queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 681                nodemask_t *nodes, unsigned long flags,
 682                struct list_head *pagelist)
 683{
 684        struct queue_pages qp = {
 685                .pagelist = pagelist,
 686                .flags = flags,
 687                .nmask = nodes,
 688                .prev = NULL,
 689        };
 690
 691        return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 692}
 693
 694/*
 695 * Apply policy to a single VMA
 696 * This must be called with the mmap_sem held for writing.
 697 */
 698static int vma_replace_policy(struct vm_area_struct *vma,
 699                                                struct mempolicy *pol)
 700{
 701        int err;
 702        struct mempolicy *old;
 703        struct mempolicy *new;
 704
 705        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 706                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 707                 vma->vm_ops, vma->vm_file,
 708                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 709
 710        new = mpol_dup(pol);
 711        if (IS_ERR(new))
 712                return PTR_ERR(new);
 713
 714        if (vma->vm_ops && vma->vm_ops->set_policy) {
 715                err = vma->vm_ops->set_policy(vma, new);
 716                if (err)
 717                        goto err_out;
 718        }
 719
 720        old = vma->vm_policy;
 721        vma->vm_policy = new; /* protected by mmap_sem */
 722        mpol_put(old);
 723
 724        return 0;
 725 err_out:
 726        mpol_put(new);
 727        return err;
 728}
 729
 730/* Step 2: apply policy to a range and do splits. */
 731static int mbind_range(struct mm_struct *mm, unsigned long start,
 732                       unsigned long end, struct mempolicy *new_pol)
 733{
 734        struct vm_area_struct *next;
 735        struct vm_area_struct *prev;
 736        struct vm_area_struct *vma;
 737        int err = 0;
 738        pgoff_t pgoff;
 739        unsigned long vmstart;
 740        unsigned long vmend;
 741
 742        vma = find_vma(mm, start);
 743        if (!vma || vma->vm_start > start)
 744                return -EFAULT;
 745
 746        prev = vma->vm_prev;
 747        if (start > vma->vm_start)
 748                prev = vma;
 749
 750        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 751                next = vma->vm_next;
 752                vmstart = max(start, vma->vm_start);
 753                vmend   = min(end, vma->vm_end);
 754
 755                if (mpol_equal(vma_policy(vma), new_pol))
 756                        continue;
 757
 758                pgoff = vma->vm_pgoff +
 759                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 760                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 761                                 vma->anon_vma, vma->vm_file, pgoff,
 762                                 new_pol, vma->vm_userfaultfd_ctx);
 763                if (prev) {
 764                        vma = prev;
 765                        next = vma->vm_next;
 766                        if (mpol_equal(vma_policy(vma), new_pol))
 767                                continue;
 768                        /* vma_merge() joined vma && vma->next, case 8 */
 769                        goto replace;
 770                }
 771                if (vma->vm_start != vmstart) {
 772                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 773                        if (err)
 774                                goto out;
 775                }
 776                if (vma->vm_end != vmend) {
 777                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 778                        if (err)
 779                                goto out;
 780                }
 781 replace:
 782                err = vma_replace_policy(vma, new_pol);
 783                if (err)
 784                        goto out;
 785        }
 786
 787 out:
 788        return err;
 789}
 790
 791/* Set the process memory policy */
 792static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 793                             nodemask_t *nodes)
 794{
 795        struct mempolicy *new, *old;
 796        NODEMASK_SCRATCH(scratch);
 797        int ret;
 798
 799        if (!scratch)
 800                return -ENOMEM;
 801
 802        new = mpol_new(mode, flags, nodes);
 803        if (IS_ERR(new)) {
 804                ret = PTR_ERR(new);
 805                goto out;
 806        }
 807
 808        task_lock(current);
 809        ret = mpol_set_nodemask(new, nodes, scratch);
 810        if (ret) {
 811                task_unlock(current);
 812                mpol_put(new);
 813                goto out;
 814        }
 815        old = current->mempolicy;
 816        current->mempolicy = new;
 817        if (new && new->mode == MPOL_INTERLEAVE)
 818                current->il_prev = MAX_NUMNODES-1;
 819        task_unlock(current);
 820        mpol_put(old);
 821        ret = 0;
 822out:
 823        NODEMASK_SCRATCH_FREE(scratch);
 824        return ret;
 825}
 826
 827/*
 828 * Return nodemask for policy for get_mempolicy() query
 829 *
 830 * Called with task's alloc_lock held
 831 */
 832static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 833{
 834        nodes_clear(*nodes);
 835        if (p == &default_policy)
 836                return;
 837
 838        switch (p->mode) {
 839        case MPOL_BIND:
 840                /* Fall through */
 841        case MPOL_INTERLEAVE:
 842                *nodes = p->v.nodes;
 843                break;
 844        case MPOL_PREFERRED:
 845                if (!(p->flags & MPOL_F_LOCAL))
 846                        node_set(p->v.preferred_node, *nodes);
 847                /* else return empty node mask for local allocation */
 848                break;
 849        default:
 850                BUG();
 851        }
 852}
 853
 854static int lookup_node(struct mm_struct *mm, unsigned long addr)
 855{
 856        struct page *p;
 857        int err;
 858
 859        int locked = 1;
 860        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 861        if (err >= 0) {
 862                err = page_to_nid(p);
 863                put_page(p);
 864        }
 865        if (locked)
 866                up_read(&mm->mmap_sem);
 867        return err;
 868}
 869
 870/* Retrieve NUMA policy */
 871static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 872                             unsigned long addr, unsigned long flags)
 873{
 874        int err;
 875        struct mm_struct *mm = current->mm;
 876        struct vm_area_struct *vma = NULL;
 877        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 878
 879        if (flags &
 880                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 881                return -EINVAL;
 882
 883        if (flags & MPOL_F_MEMS_ALLOWED) {
 884                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 885                        return -EINVAL;
 886                *policy = 0;    /* just so it's initialized */
 887                task_lock(current);
 888                *nmask  = cpuset_current_mems_allowed;
 889                task_unlock(current);
 890                return 0;
 891        }
 892
 893        if (flags & MPOL_F_ADDR) {
 894                /*
 895                 * Do NOT fall back to task policy if the
 896                 * vma/shared policy at addr is NULL.  We
 897                 * want to return MPOL_DEFAULT in this case.
 898                 */
 899                down_read(&mm->mmap_sem);
 900                vma = find_vma_intersection(mm, addr, addr+1);
 901                if (!vma) {
 902                        up_read(&mm->mmap_sem);
 903                        return -EFAULT;
 904                }
 905                if (vma->vm_ops && vma->vm_ops->get_policy)
 906                        pol = vma->vm_ops->get_policy(vma, addr);
 907                else
 908                        pol = vma->vm_policy;
 909        } else if (addr)
 910                return -EINVAL;
 911
 912        if (!pol)
 913                pol = &default_policy;  /* indicates default behavior */
 914
 915        if (flags & MPOL_F_NODE) {
 916                if (flags & MPOL_F_ADDR) {
 917                        /*
 918                         * Take a refcount on the mpol, lookup_node()
  919                         * will drop the mmap_sem, so after calling
 920                         * lookup_node() only "pol" remains valid, "vma"
 921                         * is stale.
 922                         */
 923                        pol_refcount = pol;
 924                        vma = NULL;
 925                        mpol_get(pol);
 926                        err = lookup_node(mm, addr);
 927                        if (err < 0)
 928                                goto out;
 929                        *policy = err;
 930                } else if (pol == current->mempolicy &&
 931                                pol->mode == MPOL_INTERLEAVE) {
 932                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 933                } else {
 934                        err = -EINVAL;
 935                        goto out;
 936                }
 937        } else {
 938                *policy = pol == &default_policy ? MPOL_DEFAULT :
 939                                                pol->mode;
 940                /*
 941                 * Internal mempolicy flags must be masked off before exposing
 942                 * the policy to userspace.
 943                 */
 944                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 945        }
 946
 947        err = 0;
 948        if (nmask) {
 949                if (mpol_store_user_nodemask(pol)) {
 950                        *nmask = pol->w.user_nodemask;
 951                } else {
 952                        task_lock(current);
 953                        get_policy_nodemask(pol, nmask);
 954                        task_unlock(current);
 955                }
 956        }
 957
 958 out:
 959        mpol_cond_put(pol);
 960        if (vma)
 961                up_read(&mm->mmap_sem);
 962        if (pol_refcount)
 963                mpol_put(pol_refcount);
 964        return err;
 965}
 966
 967#ifdef CONFIG_MIGRATION
 968/*
 969 * page migration, thp tail pages can be passed.
 970 */
 971static int migrate_page_add(struct page *page, struct list_head *pagelist,
 972                                unsigned long flags)
 973{
 974        struct page *head = compound_head(page);
 975        /*
 976         * Avoid migrating a page that is shared with others.
 977         */
 978        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 979                if (!isolate_lru_page(head)) {
 980                        list_add_tail(&head->lru, pagelist);
 981                        mod_node_page_state(page_pgdat(head),
 982                                NR_ISOLATED_ANON + page_is_file_cache(head),
 983                                hpage_nr_pages(head));
 984                } else if (flags & MPOL_MF_STRICT) {
 985                        /*
 986                         * Non-movable page may reach here.  And, there may be
 987                         * temporary off LRU pages or non-LRU movable pages.
 988                         * Treat them as unmovable pages since they can't be
 989                         * isolated, so they can't be moved at the moment.  It
 990                         * should return -EIO for this case too.
 991                         */
 992                        return -EIO;
 993                }
 994        }
 995
 996        return 0;
 997}
 998
 999/* page allocation callback for NUMA node migration */
1000struct page *alloc_new_node_page(struct page *page, unsigned long node)
1001{
1002        if (PageHuge(page))
1003                return alloc_huge_page_node(page_hstate(compound_head(page)),
1004                                        node);
1005        else if (PageTransHuge(page)) {
1006                struct page *thp;
1007
1008                thp = alloc_pages_node(node,
1009                        (GFP_TRANSHUGE | __GFP_THISNODE),
1010                        HPAGE_PMD_ORDER);
1011                if (!thp)
1012                        return NULL;
1013                prep_transhuge_page(thp);
1014                return thp;
1015        } else
1016                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1017                                                    __GFP_THISNODE, 0);
1018}
1019
1020/*
1021 * Migrate pages from one node to a target node.
1022 * Returns error or the number of pages not migrated.
1023 */
1024static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1025                           int flags)
1026{
1027        nodemask_t nmask;
1028        LIST_HEAD(pagelist);
1029        int err = 0;
1030
1031        nodes_clear(nmask);
1032        node_set(source, nmask);
1033
1034        /*
1035         * This does not "check" the range but isolates all pages that
1036         * need migration.  Between passing in the full user address
 1037         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1038         */
1039        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1040        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1041                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1042
1043        if (!list_empty(&pagelist)) {
1044                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1045                                        MIGRATE_SYNC, MR_SYSCALL);
1046                if (err)
1047                        putback_movable_pages(&pagelist);
1048        }
1049
1050        return err;
1051}
1052
1053/*
1054 * Move pages between the two nodesets so as to preserve the physical
1055 * layout as much as possible.
1056 *
 1057 * Returns the number of pages that could not be moved.
1058 */
1059int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1060                     const nodemask_t *to, int flags)
1061{
1062        int busy = 0;
1063        int err;
1064        nodemask_t tmp;
1065
1066        err = migrate_prep();
1067        if (err)
1068                return err;
1069
1070        down_read(&mm->mmap_sem);
1071
1072        /*
1073         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1074         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1075         * bit in 'tmp', and return that <source, dest> pair for migration.
1076         * The pair of nodemasks 'to' and 'from' define the map.
1077         *
1078         * If no pair of bits is found that way, fallback to picking some
1079         * pair of 'source' and 'dest' bits that are not the same.  If the
1080         * 'source' and 'dest' bits are the same, this represents a node
1081         * that will be migrating to itself, so no pages need move.
1082         *
1083         * If no bits are left in 'tmp', or if all remaining bits left
1084         * in 'tmp' correspond to the same bit in 'to', return false
1085         * (nothing left to migrate).
1086         *
1087         * This lets us pick a pair of nodes to migrate between, such that
1088         * if possible the dest node is not already occupied by some other
1089         * source node, minimizing the risk of overloading the memory on a
1090         * node that would happen if we migrated incoming memory to a node
 1091 * before migrating outgoing memory sourced from that same node.
1092         *
1093         * A single scan of tmp is sufficient.  As we go, we remember the
1094         * most recent <s, d> pair that moved (s != d).  If we find a pair
1095         * that not only moved, but what's better, moved to an empty slot
1096         * (d is not set in tmp), then we break out then, with that pair.
 1097 * Otherwise when we finish scanning tmp, we at least have the
1098         * most recent <s, d> pair that moved.  If we get all the way through
1099         * the scan of tmp without finding any node that moved, much less
1100         * moved to an empty node, then there is nothing left worth migrating.
1101         */
1102
1103        tmp = *from;
1104        while (!nodes_empty(tmp)) {
 1105                int s, d;
1106                int source = NUMA_NO_NODE;
1107                int dest = 0;
1108
1109                for_each_node_mask(s, tmp) {
1110
1111                        /*
1112                         * do_migrate_pages() tries to maintain the relative
1113                         * node relationship of the pages established between
1114                         * threads and memory areas.
1115                         *
 1116                         * However, if the number of source nodes is not equal to
 1117                         * the number of destination nodes, we cannot preserve
 1118                         * this node-relative relationship.  In that case, skip
1119                         * copying memory from a node that is in the destination
1120                         * mask.
1121                         *
1122                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1123                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1124                         */
1125
1126                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1127                                                (node_isset(s, *to)))
1128                                continue;
1129
1130                        d = node_remap(s, *from, *to);
1131                        if (s == d)
1132                                continue;
1133
1134                        source = s;     /* Node moved. Memorize */
1135                        dest = d;
1136
1137                        /* dest not in remaining from nodes? */
1138                        if (!node_isset(dest, tmp))
1139                                break;
1140                }
1141                if (source == NUMA_NO_NODE)
1142                        break;
1143
1144                node_clear(source, tmp);
1145                err = migrate_to_node(mm, source, dest, flags);
1146                if (err > 0)
1147                        busy += err;
1148                if (err < 0)
1149                        break;
1150        }
1151        up_read(&mm->mmap_sem);
1152        if (err < 0)
1153                return err;
1154        return busy;
1155
1156}
1157
1158/*
1159 * Allocate a new page for page migration based on vma policy.
 1160 * Start by assuming the page is mapped by the same vma that contains @start.
1161 * Search forward from there, if not.  N.B., this assumes that the
1162 * list of pages handed to migrate_pages()--which is how we get here--
1163 * is in virtual address order.
1164 */
1165static struct page *new_page(struct page *page, unsigned long start)
1166{
1167        struct vm_area_struct *vma;
1168        unsigned long uninitialized_var(address);
1169
1170        vma = find_vma(current->mm, start);
1171        while (vma) {
1172                address = page_address_in_vma(page, vma);
1173                if (address != -EFAULT)
1174                        break;
1175                vma = vma->vm_next;
1176        }
1177
1178        if (PageHuge(page)) {
1179                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1180                                vma, address);
1181        } else if (PageTransHuge(page)) {
1182                struct page *thp;
1183
1184                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1185                                         HPAGE_PMD_ORDER);
1186                if (!thp)
1187                        return NULL;
1188                prep_transhuge_page(thp);
1189                return thp;
1190        }
1191        /*
1192         * if !vma, alloc_page_vma() will use task or system default policy
1193         */
1194        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1195                        vma, address);
1196}
1197#else
1198
1199static int migrate_page_add(struct page *page, struct list_head *pagelist,
1200                                unsigned long flags)
1201{
1202        return -EIO;
1203}
1204
1205int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1206                     const nodemask_t *to, int flags)
1207{
1208        return -ENOSYS;
1209}
1210
1211static struct page *new_page(struct page *page, unsigned long start)
1212{
1213        return NULL;
1214}
1215#endif
1216
1217static long do_mbind(unsigned long start, unsigned long len,
1218                     unsigned short mode, unsigned short mode_flags,
1219                     nodemask_t *nmask, unsigned long flags)
1220{
1221        struct mm_struct *mm = current->mm;
1222        struct mempolicy *new;
1223        unsigned long end;
1224        int err;
1225        int ret;
1226        LIST_HEAD(pagelist);
1227
1228        if (flags & ~(unsigned long)MPOL_MF_VALID)
1229                return -EINVAL;
1230        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1231                return -EPERM;
1232
1233        if (start & ~PAGE_MASK)
1234                return -EINVAL;
1235
1236        if (mode == MPOL_DEFAULT)
1237                flags &= ~MPOL_MF_STRICT;
1238
1239        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1240        end = start + len;
1241
1242        if (end < start)
1243                return -EINVAL;
1244        if (end == start)
1245                return 0;
1246
1247        new = mpol_new(mode, mode_flags, nmask);
1248        if (IS_ERR(new))
1249                return PTR_ERR(new);
1250
1251        if (flags & MPOL_MF_LAZY)
1252                new->flags |= MPOL_F_MOF;
1253
1254        /*
 1255         * If we are using the default policy then operations
 1256         * on discontinuous address spaces are okay after all
1257         */
1258        if (!new)
1259                flags |= MPOL_MF_DISCONTIG_OK;
1260
1261        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1262                 start, start + len, mode, mode_flags,
1263                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1264
1265        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1266
1267                err = migrate_prep();
1268                if (err)
1269                        goto mpol_out;
1270        }
1271        {
1272                NODEMASK_SCRATCH(scratch);
1273                if (scratch) {
1274                        down_write(&mm->mmap_sem);
1275                        task_lock(current);
1276                        err = mpol_set_nodemask(new, nmask, scratch);
1277                        task_unlock(current);
1278                        if (err)
1279                                up_write(&mm->mmap_sem);
1280                } else
1281                        err = -ENOMEM;
1282                NODEMASK_SCRATCH_FREE(scratch);
1283        }
1284        if (err)
1285                goto mpol_out;
1286
1287        ret = queue_pages_range(mm, start, end, nmask,
1288                          flags | MPOL_MF_INVERT, &pagelist);
1289
1290        if (ret < 0) {
1291                err = ret;
1292                goto up_out;
1293        }
1294
1295        err = mbind_range(mm, start, end, new);
1296
1297        if (!err) {
1298                int nr_failed = 0;
1299
1300                if (!list_empty(&pagelist)) {
1301                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1302                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1303                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1304                        if (nr_failed)
1305                                putback_movable_pages(&pagelist);
1306                }
1307
1308                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1309                        err = -EIO;
1310        } else {
1311up_out:
1312                if (!list_empty(&pagelist))
1313                        putback_movable_pages(&pagelist);
1314        }
1315
1316        up_write(&mm->mmap_sem);
1317mpol_out:
1318        mpol_put(new);
1319        return err;
1320}
1321
1322/*
1323 * User space interface with variable sized bitmaps for nodelists.
1324 */
1325
1326/* Copy a node mask from user space. */
1327static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1328                     unsigned long maxnode)
1329{
1330        unsigned long k;
1331        unsigned long t;
1332        unsigned long nlongs;
1333        unsigned long endmask;
1334
1335        --maxnode;
1336        nodes_clear(*nodes);
1337        if (maxnode == 0 || !nmask)
1338                return 0;
1339        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1340                return -EINVAL;
1341
1342        nlongs = BITS_TO_LONGS(maxnode);
1343        if ((maxnode % BITS_PER_LONG) == 0)
1344                endmask = ~0UL;
1345        else
1346                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1347
1348        /*
 1349         * When the user specified more nodes than supported, just check
 1350         * that the unsupported part is all zero.
 1351         *
 1352         * If maxnode has more longs than MAX_NUMNODES, check
 1353         * the bits in that area first, and then go on to check
 1354         * the remaining bits, which are at or above MAX_NUMNODES.
1355         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1356         */
1357        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1358                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1359                        if (get_user(t, nmask + k))
1360                                return -EFAULT;
1361                        if (k == nlongs - 1) {
1362                                if (t & endmask)
1363                                        return -EINVAL;
1364                        } else if (t)
1365                                return -EINVAL;
1366                }
1367                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1368                endmask = ~0UL;
1369        }
1370
1371        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1372                unsigned long valid_mask = endmask;
1373
1374                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1375                if (get_user(t, nmask + nlongs - 1))
1376                        return -EFAULT;
1377                if (t & valid_mask)
1378                        return -EINVAL;
1379        }
1380
1381        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1382                return -EFAULT;
1383        nodes_addr(*nodes)[nlongs-1] &= endmask;
1384        return 0;
1385}
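/*
 * A worked example of the bookkeeping above, assuming 64-bit longs and
 * MAX_NUMNODES = 1024: maxnode = 65 becomes 64 after the decrement, so
 * nlongs = 1 and endmask = ~0UL, and the user's first 64 bits are copied
 * verbatim.  With maxnode = 70 instead, nlongs = 2 and
 * endmask = (1UL << 5) - 1, so bits 64..68 of the second long are kept
 * and the higher bits of that long are masked off.
 */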
1386
1387/* Copy a kernel node mask to user space */
1388static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1389                              nodemask_t *nodes)
1390{
1391        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1392        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1393
1394        if (copy > nbytes) {
1395                if (copy > PAGE_SIZE)
1396                        return -EINVAL;
1397                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1398                        return -EFAULT;
1399                copy = nbytes;
1400        }
1401        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1402}
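/*
 * Example, assuming nr_node_ids = 4 and a caller passing maxnode = 1024:
 * copy = ALIGN(1023, 64) / 8 = 128 bytes while nbytes = 8, so bytes
 * 8..127 of the user buffer are cleared and only the first 8 bytes are
 * copied from the kernel nodemask.
 */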
1403
1404static long kernel_mbind(unsigned long start, unsigned long len,
1405                         unsigned long mode, const unsigned long __user *nmask,
1406                         unsigned long maxnode, unsigned int flags)
1407{
1408        nodemask_t nodes;
1409        int err;
1410        unsigned short mode_flags;
1411
1412        start = untagged_addr(start);
1413        mode_flags = mode & MPOL_MODE_FLAGS;
1414        mode &= ~MPOL_MODE_FLAGS;
1415        if (mode >= MPOL_MAX)
1416                return -EINVAL;
1417        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1418            (mode_flags & MPOL_F_RELATIVE_NODES))
1419                return -EINVAL;
1420        err = get_nodes(&nodes, nmask, maxnode);
1421        if (err)
1422                return err;
1423        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1424}
1425
1426SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1427                unsigned long, mode, const unsigned long __user *, nmask,
1428                unsigned long, maxnode, unsigned int, flags)
1429{
1430        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1431}
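/*
 * A user-space sketch of the syscall above, assuming the <numaif.h>
 * wrapper; addr and length are placeholder variables.  Strictly bind an
 * existing mapping to node 1 and migrate any of its pages that already
 * landed elsewhere:
 *
 *	unsigned long node1 = 1UL << 1;
 *
 *	if (mbind(addr, length, MPOL_BIND, &node1, 8 * sizeof(node1),
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
 *		perror("mbind");
 */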
1432
1433/* Set the process memory policy */
1434static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1435                                 unsigned long maxnode)
1436{
1437        int err;
1438        nodemask_t nodes;
1439        unsigned short flags;
1440
1441        flags = mode & MPOL_MODE_FLAGS;
1442        mode &= ~MPOL_MODE_FLAGS;
1443        if ((unsigned int)mode >= MPOL_MAX)
1444                return -EINVAL;
1445        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1446                return -EINVAL;
1447        err = get_nodes(&nodes, nmask, maxnode);
1448        if (err)
1449                return err;
1450        return do_set_mempolicy(mode, flags, &nodes);
1451}
1452
1453SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1454                unsigned long, maxnode)
1455{
1456        return kernel_set_mempolicy(mode, nmask, maxnode);
1457}
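/*
 * A user-space sketch of the syscall above, assuming the <numaif.h>
 * wrapper: prefer node 0 for this task's future allocations, then later
 * fall back to the default (local) policy:
 *
 *	unsigned long node0 = 1UL << 0;
 *
 *	set_mempolicy(MPOL_PREFERRED, &node0, 8 * sizeof(node0));
 *	...
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */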
1458
1459static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1460                                const unsigned long __user *old_nodes,
1461                                const unsigned long __user *new_nodes)
1462{
1463        struct mm_struct *mm = NULL;
1464        struct task_struct *task;
1465        nodemask_t task_nodes;
1466        int err;
1467        nodemask_t *old;
1468        nodemask_t *new;
1469        NODEMASK_SCRATCH(scratch);
1470
1471        if (!scratch)
1472                return -ENOMEM;
1473
1474        old = &scratch->mask1;
1475        new = &scratch->mask2;
1476
1477        err = get_nodes(old, old_nodes, maxnode);
1478        if (err)
1479                goto out;
1480
1481        err = get_nodes(new, new_nodes, maxnode);
1482        if (err)
1483                goto out;
1484
1485        /* Find the mm_struct */
1486        rcu_read_lock();
1487        task = pid ? find_task_by_vpid(pid) : current;
1488        if (!task) {
1489                rcu_read_unlock();
1490                err = -ESRCH;
1491                goto out;
1492        }
1493        get_task_struct(task);
1494
1495        err = -EINVAL;
1496
1497        /*
1498         * Check if this process has the right to modify the specified process.
1499         * Use the regular "ptrace_may_access()" checks.
1500         */
1501        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1502                rcu_read_unlock();
1503                err = -EPERM;
1504                goto out_put;
1505        }
1506        rcu_read_unlock();
1507
1508        task_nodes = cpuset_mems_allowed(task);
1509        /* Is the user allowed to access the target nodes? */
1510        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1511                err = -EPERM;
1512                goto out_put;
1513        }
1514
1515        task_nodes = cpuset_mems_allowed(current);
1516        nodes_and(*new, *new, task_nodes);
1517        if (nodes_empty(*new))
1518                goto out_put;
1519
1520        err = security_task_movememory(task);
1521        if (err)
1522                goto out_put;
1523
1524        mm = get_task_mm(task);
1525        put_task_struct(task);
1526
1527        if (!mm) {
1528                err = -EINVAL;
1529                goto out;
1530        }
1531
1532        err = do_migrate_pages(mm, old, new,
1533                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1534
1535        mmput(mm);
1536out:
1537        NODEMASK_SCRATCH_FREE(scratch);
1538
1539        return err;
1540
1541out_put:
1542        put_task_struct(task);
1543        goto out;
1544
1545}
1546
1547SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1548                const unsigned long __user *, old_nodes,
1549                const unsigned long __user *, new_nodes)
1550{
1551        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1552}
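/*
 * Illustrative userspace sketch (not part of this file): moving another
 * process' pages from node 0 to node 1 with migrate_pages(2), assuming
 * libnuma's <numaif.h> wrapper.  As checked above, the caller needs
 * PTRACE_MODE_READ access to the target; CAP_SYS_NICE additionally allows
 * moving shared pages and using nodes outside the target's cpuset.
 *
 *	#include <sys/types.h>
 *	#include <numaif.h>
 *
 *	static long move_node0_to_node1(pid_t pid)
 *	{
 *		unsigned long old_nodes = 1UL << 0;
 *		unsigned long new_nodes = 1UL << 1;
 *
 *		// Returns the number of pages that could not be moved,
 *		// or -1 with errno set on error.
 *		return migrate_pages(pid, sizeof(old_nodes) * 8,
 *				     &old_nodes, &new_nodes);
 *	}
 */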
1553
1554
1555/* Retrieve NUMA policy */
1556static int kernel_get_mempolicy(int __user *policy,
1557                                unsigned long __user *nmask,
1558                                unsigned long maxnode,
1559                                unsigned long addr,
1560                                unsigned long flags)
1561{
1562        int err;
1563        int uninitialized_var(pval);
1564        nodemask_t nodes;
1565
1566        addr = untagged_addr(addr);
1567
1568        if (nmask != NULL && maxnode < nr_node_ids)
1569                return -EINVAL;
1570
1571        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1572
1573        if (err)
1574                return err;
1575
1576        if (policy && put_user(pval, policy))
1577                return -EFAULT;
1578
1579        if (nmask)
1580                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1581
1582        return err;
1583}
1584
1585SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1586                unsigned long __user *, nmask, unsigned long, maxnode,
1587                unsigned long, addr, unsigned long, flags)
1588{
1589        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1590}
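/*
 * Illustrative userspace sketch (not part of this file): asking which node
 * currently backs a given address via get_mempolicy(2), assuming libnuma's
 * <numaif.h> wrapper.  With MPOL_F_NODE | MPOL_F_ADDR the "policy" output
 * argument is reused to return the node id of the page at the address.
 *
 *	#include <numaif.h>
 *
 *	static int node_of_address(void *addr)
 *	{
 *		int node;
 *
 *		if (get_mempolicy(&node, NULL, 0, addr,
 *				  MPOL_F_NODE | MPOL_F_ADDR))
 *			return -1;
 *		return node;
 *	}
 */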
1591
1592#ifdef CONFIG_COMPAT
1593
1594COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1595                       compat_ulong_t __user *, nmask,
1596                       compat_ulong_t, maxnode,
1597                       compat_ulong_t, addr, compat_ulong_t, flags)
1598{
1599        long err;
1600        unsigned long __user *nm = NULL;
1601        unsigned long nr_bits, alloc_size;
1602        DECLARE_BITMAP(bm, MAX_NUMNODES);
1603
1604        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1605        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1606
1607        if (nmask)
1608                nm = compat_alloc_user_space(alloc_size);
1609
1610        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1611
1612        if (!err && nmask) {
1613                unsigned long copy_size;
1614                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1615                err = copy_from_user(bm, nm, copy_size);
1616                /* ensure entire bitmap is zeroed */
1617                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1618                err |= compat_put_bitmap(nmask, bm, nr_bits);
1619        }
1620
1621        return err;
1622}
1623
1624COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1625                       compat_ulong_t, maxnode)
1626{
1627        unsigned long __user *nm = NULL;
1628        unsigned long nr_bits, alloc_size;
1629        DECLARE_BITMAP(bm, MAX_NUMNODES);
1630
1631        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1632        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1633
1634        if (nmask) {
1635                if (compat_get_bitmap(bm, nmask, nr_bits))
1636                        return -EFAULT;
1637                nm = compat_alloc_user_space(alloc_size);
1638                if (copy_to_user(nm, bm, alloc_size))
1639                        return -EFAULT;
1640        }
1641
1642        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1643}
1644
1645COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1646                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1647                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1648{
1649        unsigned long __user *nm = NULL;
1650        unsigned long nr_bits, alloc_size;
1651        nodemask_t bm;
1652
1653        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1654        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1655
1656        if (nmask) {
1657                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1658                        return -EFAULT;
1659                nm = compat_alloc_user_space(alloc_size);
1660                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1661                        return -EFAULT;
1662        }
1663
1664        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1665}
1666
1667COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1668                       compat_ulong_t, maxnode,
1669                       const compat_ulong_t __user *, old_nodes,
1670                       const compat_ulong_t __user *, new_nodes)
1671{
1672        unsigned long __user *old = NULL;
1673        unsigned long __user *new = NULL;
1674        nodemask_t tmp_mask;
1675        unsigned long nr_bits;
1676        unsigned long size;
1677
1678        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1679        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1680        if (old_nodes) {
1681                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1682                        return -EFAULT;
1683                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1684                if (new_nodes)
1685                        new = old + size / sizeof(unsigned long);
1686                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1687                        return -EFAULT;
1688        }
1689        if (new_nodes) {
1690                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1691                        return -EFAULT;
1692                if (new == NULL)
1693                        new = compat_alloc_user_space(size);
1694                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1695                        return -EFAULT;
1696        }
1697        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1698}
1699
1700#endif /* CONFIG_COMPAT */
1701
1702struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1703                                                unsigned long addr)
1704{
1705        struct mempolicy *pol = NULL;
1706
1707        if (vma) {
1708                if (vma->vm_ops && vma->vm_ops->get_policy) {
1709                        pol = vma->vm_ops->get_policy(vma, addr);
1710                } else if (vma->vm_policy) {
1711                        pol = vma->vm_policy;
1712
1713                        /*
1714                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1715                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1716                         * count on these policies which will be dropped by
1717                         * mpol_cond_put() later
1718                         */
1719                        if (mpol_needs_cond_ref(pol))
1720                                mpol_get(pol);
1721                }
1722        }
1723
1724        return pol;
1725}
1726
1727/*
1728 * get_vma_policy(@vma, @addr)
1729 * @vma: virtual memory area whose policy is sought
1730 * @addr: address in @vma for shared policy lookup
1731 *
1732 * Returns effective policy for a VMA at specified address.
1733 * Falls back to current->mempolicy or system default policy, as necessary.
1734 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1735 * count--added by the get_policy() vm_op, as appropriate--to protect against
1736 * freeing by another task.  It is the caller's responsibility to free the
1737 * extra reference for shared policies.
1738 */
1739static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1740                                                unsigned long addr)
1741{
1742        struct mempolicy *pol = __get_vma_policy(vma, addr);
1743
1744        if (!pol)
1745                pol = get_task_policy(current);
1746
1747        return pol;
1748}
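/*
 * Typical caller pattern (sketch only): every get_vma_policy() lookup is
 * paired with mpol_cond_put(), which drops the extra reference taken for
 * MPOL_F_SHARED policies and is a no-op for everything else, e.g.:
 *
 *	pol = get_vma_policy(vma, addr);
 *	// ... consult pol->mode / pol->v.* to pick a node ...
 *	mpol_cond_put(pol);
 */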
1749
1750bool vma_policy_mof(struct vm_area_struct *vma)
1751{
1752        struct mempolicy *pol;
1753
1754        if (vma->vm_ops && vma->vm_ops->get_policy) {
1755                bool ret = false;
1756
1757                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1758                if (pol && (pol->flags & MPOL_F_MOF))
1759                        ret = true;
1760                mpol_cond_put(pol);
1761
1762                return ret;
1763        }
1764
1765        pol = vma->vm_policy;
1766        if (!pol)
1767                pol = get_task_policy(current);
1768
1769        return pol->flags & MPOL_F_MOF;
1770}
1771
1772static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1773{
1774        enum zone_type dynamic_policy_zone = policy_zone;
1775
1776        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1777
1778        /*
1779         * if policy->v.nodes has movable memory only,
1780         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1781         *
1782         * policy->v.nodes is intersected with node_states[N_MEMORY],
1783         * so if the following test fails, it implies that
1784         * policy->v.nodes contains movable memory only.
1785         */
1786        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1787                dynamic_policy_zone = ZONE_MOVABLE;
1788
1789        return zone >= dynamic_policy_zone;
1790}
1791
1792/*
1793 * Return a nodemask representing a mempolicy for filtering nodes for
1794 * page allocation
1795 */
1796static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1797{
1798        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1799        if (unlikely(policy->mode == MPOL_BIND) &&
1800                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1801                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1802                return &policy->v.nodes;
1803
1804        return NULL;
1805}
1806
1807/* Return the node id preferred by the given mempolicy, or the given id */
1808static int policy_node(gfp_t gfp, struct mempolicy *policy,
1809                                                                int nd)
1810{
1811        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1812                nd = policy->v.preferred_node;
1813        else {
1814                /*
1815                 * __GFP_THISNODE shouldn't even be used with the bind policy
1816                 * because we might easily break the expectation to stay on the
1817                 * requested node and not break the policy.
1818                 */
1819                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1820        }
1821
1822        return nd;
1823}
1824
1825/* Do dynamic interleaving for a process */
1826static unsigned interleave_nodes(struct mempolicy *policy)
1827{
1828        unsigned next;
1829        struct task_struct *me = current;
1830
1831        next = next_node_in(me->il_prev, policy->v.nodes);
1832        if (next < MAX_NUMNODES)
1833                me->il_prev = next;
1834        return next;
1835}
1836
1837/*
1838 * Depending on the memory policy provide a node from which to allocate the
1839 * next slab entry.
1840 */
1841unsigned int mempolicy_slab_node(void)
1842{
1843        struct mempolicy *policy;
1844        int node = numa_mem_id();
1845
1846        if (in_interrupt())
1847                return node;
1848
1849        policy = current->mempolicy;
1850        if (!policy || policy->flags & MPOL_F_LOCAL)
1851                return node;
1852
1853        switch (policy->mode) {
1854        case MPOL_PREFERRED:
1855                /*
1856                 * handled MPOL_F_LOCAL above
1857                 */
1858                return policy->v.preferred_node;
1859
1860        case MPOL_INTERLEAVE:
1861                return interleave_nodes(policy);
1862
1863        case MPOL_BIND: {
1864                struct zoneref *z;
1865
1866                /*
1867                 * Follow bind policy behavior and start allocation at the
1868                 * first node.
1869                 */
1870                struct zonelist *zonelist;
1871                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1872                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1873                z = first_zones_zonelist(zonelist, highest_zoneidx,
1874                                                        &policy->v.nodes);
1875                return z->zone ? zone_to_nid(z->zone) : node;
1876        }
1877
1878        default:
1879                BUG();
1880        }
1881}
1882
1883/*
1884 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1885 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1886 * number of present nodes.
1887 */
1888static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1889{
1890        unsigned nnodes = nodes_weight(pol->v.nodes);
1891        unsigned target;
1892        int i;
1893        int nid;
1894
1895        if (!nnodes)
1896                return numa_node_id();
1897        target = (unsigned int)n % nnodes;
1898        nid = first_node(pol->v.nodes);
1899        for (i = 0; i < target; i++)
1900                nid = next_node(nid, pol->v.nodes);
1901        return nid;
1902}
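/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,6} and n = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the walk starts at node 1, takes
 * one next_node() step and returns node 3.  A standalone mirror of the
 * same arithmetic over a sorted array of allowed node ids:
 *
 *	// nodes[]: allowed node ids in ascending order, nnodes > 0
 *	static int interleave_pick(const int *nodes, unsigned int nnodes,
 *				   unsigned long n)
 *	{
 *		return nodes[n % nnodes];
 *	}
 */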
1903
1904/* Determine a node number for interleave */
1905static inline unsigned interleave_nid(struct mempolicy *pol,
1906                 struct vm_area_struct *vma, unsigned long addr, int shift)
1907{
1908        if (vma) {
1909                unsigned long off;
1910
1911                /*
1912                 * for small pages, there is no difference between
1913                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1914                 * for huge pages, since vm_pgoff is in units of small
1915                 * pages, we need to shift off the always 0 bits to get
1916                 * a useful offset.
1917                 */
1918                BUG_ON(shift < PAGE_SHIFT);
1919                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1920                off += (addr - vma->vm_start) >> shift;
1921                return offset_il_node(pol, off);
1922        } else
1923                return interleave_nodes(pol);
1924}
1925
1926#ifdef CONFIG_HUGETLBFS
1927/*
1928 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1929 * @vma: virtual memory area whose policy is sought
1930 * @addr: address in @vma for shared policy lookup and interleave policy
1931 * @gfp_flags: for requested zone
1932 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1933 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1934 *
1935 * Returns a nid suitable for a huge page allocation and a pointer
1936 * to the struct mempolicy for conditional unref after allocation.
1937 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1938 * @nodemask for filtering the zonelist.
1939 *
1940 * Must be protected by read_mems_allowed_begin()
1941 */
1942int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1943                                struct mempolicy **mpol, nodemask_t **nodemask)
1944{
1945        int nid;
1946
1947        *mpol = get_vma_policy(vma, addr);
1948        *nodemask = NULL;       /* assume !MPOL_BIND */
1949
1950        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1951                nid = interleave_nid(*mpol, vma, addr,
1952                                        huge_page_shift(hstate_vma(vma)));
1953        } else {
1954                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1955                if ((*mpol)->mode == MPOL_BIND)
1956                        *nodemask = &(*mpol)->v.nodes;
1957        }
1958        return nid;
1959}
1960
1961/*
1962 * init_nodemask_of_mempolicy
1963 *
1964 * If the current task's mempolicy is "default" [NULL], return 'false'
1965 * to indicate default policy.  Otherwise, extract the policy nodemask
1966 * for 'bind' or 'interleave' policy into the argument nodemask, or
1967 * initialize the argument nodemask to contain the single node for
1968 * 'preferred' or 'local' policy and return 'true' to indicate presence
1969 * of non-default mempolicy.
1970 *
1971 * We don't bother with reference counting the mempolicy [mpol_get/put]
1972 * because the current task is examining its own mempolicy and a task's
1973 * mempolicy is only ever changed by the task itself.
1974 *
1975 * N.B., it is the caller's responsibility to free a returned nodemask.
1976 */
1977bool init_nodemask_of_mempolicy(nodemask_t *mask)
1978{
1979        struct mempolicy *mempolicy;
1980        int nid;
1981
1982        if (!(mask && current->mempolicy))
1983                return false;
1984
1985        task_lock(current);
1986        mempolicy = current->mempolicy;
1987        switch (mempolicy->mode) {
1988        case MPOL_PREFERRED:
1989                if (mempolicy->flags & MPOL_F_LOCAL)
1990                        nid = numa_node_id();
1991                else
1992                        nid = mempolicy->v.preferred_node;
1993                init_nodemask_of_node(mask, nid);
1994                break;
1995
1996        case MPOL_BIND:
1997                /* Fall through */
1998        case MPOL_INTERLEAVE:
1999                *mask =  mempolicy->v.nodes;
2000                break;
2001
2002        default:
2003                BUG();
2004        }
2005        task_unlock(current);
2006
2007        return true;
2008}
2009#endif
2010
2011/*
2012 * mempolicy_nodemask_intersects
2013 *
2014 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2015 * policy.  Otherwise, check for intersection between mask and the policy
2016 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2017 * policy, always return true since it may allocate elsewhere on fallback.
2018 *
2019 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2020 */
2021bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2022                                        const nodemask_t *mask)
2023{
2024        struct mempolicy *mempolicy;
2025        bool ret = true;
2026
2027        if (!mask)
2028                return ret;
2029        task_lock(tsk);
2030        mempolicy = tsk->mempolicy;
2031        if (!mempolicy)
2032                goto out;
2033
2034        switch (mempolicy->mode) {
2035        case MPOL_PREFERRED:
2036                /*
2037                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
2038                 * allocate from; they may fall back to other nodes under OOM.
2039                 * Thus, it's possible for tsk to have allocated memory from
2040                 * nodes in mask.
2041                 */
2042                break;
2043        case MPOL_BIND:
2044        case MPOL_INTERLEAVE:
2045                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2046                break;
2047        default:
2048                BUG();
2049        }
2050out:
2051        task_unlock(tsk);
2052        return ret;
2053}
2054
2055/* Allocate a page in interleaved policy.
2056   Own path because it needs to do special accounting. */
2057static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2058                                        unsigned nid)
2059{
2060        struct page *page;
2061
2062        page = __alloc_pages(gfp, order, nid);
2063        /* skip the NUMA_INTERLEAVE_HIT counter update if NUMA stats are disabled */
2064        if (!static_branch_likely(&vm_numa_stat_key))
2065                return page;
2066        if (page && page_to_nid(page) == nid) {
2067                preempt_disable();
2068                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2069                preempt_enable();
2070        }
2071        return page;
2072}
2073
2074/**
2075 *      alloc_pages_vma - Allocate a page for a VMA.
2076 *
2077 *      @gfp:
2078 *      %GFP_USER    user allocation.
2079 *      %GFP_KERNEL  kernel allocations,
2080 *      %GFP_HIGHMEM highmem/user allocations,
2081 *      %GFP_FS      allocation should not call back into a file system.
2082 *      %GFP_ATOMIC  don't sleep.
2083 *
2084 *      @order: Order of the GFP allocation.
2085 *      @vma:  Pointer to VMA or NULL if not available.
2086 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2087 *      @node: Which node to prefer for allocation (modulo policy).
2088 *      @hugepage: for hugepages try only the preferred node if possible
2089 *
2090 *      This function allocates a page from the kernel page pool and applies
2091 *      a NUMA policy associated with the VMA or the current process.
2092 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2093 *      mm_struct of the VMA to prevent it from going away. Should be used for
2094 *      all allocations for pages that will be mapped into user space. Returns
2095 *      NULL when no page can be allocated.
2096 */
2097struct page *
2098alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2099                unsigned long addr, int node, bool hugepage)
2100{
2101        struct mempolicy *pol;
2102        struct page *page;
2103        int preferred_nid;
2104        nodemask_t *nmask;
2105
2106        pol = get_vma_policy(vma, addr);
2107
2108        if (pol->mode == MPOL_INTERLEAVE) {
2109                unsigned nid;
2110
2111                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2112                mpol_cond_put(pol);
2113                page = alloc_page_interleave(gfp, order, nid);
2114                goto out;
2115        }
2116
2117        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2118                int hpage_node = node;
2119
2120                /*
2121                 * For hugepage allocation and non-interleave policy which
2122                 * allows the current node (or other explicitly preferred
2123                 * node) we only try to allocate from the current/preferred
2124                 * node and don't fall back to other nodes, as the cost of
2125                 * remote accesses would likely offset THP benefits.
2126                 *
2127                 * If the policy is interleave, or does not allow the current
2128                 * node in its nodemask, we allocate the standard way.
2129                 */
2130                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2131                        hpage_node = pol->v.preferred_node;
2132
2133                nmask = policy_nodemask(gfp, pol);
2134                if (!nmask || node_isset(hpage_node, *nmask)) {
2135                        mpol_cond_put(pol);
2136                        page = __alloc_pages_node(hpage_node,
2137                                                gfp | __GFP_THISNODE, order);
2138
2139                        /*
2140                         * If hugepage allocations are configured to always use
2141                         * synchronous compaction, or the vma has been madvised
2142                         * to prefer hugepage backing, retry allowing remote
2143                         * memory as well.
2144                         */
2145                        if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2146                                page = __alloc_pages_node(hpage_node,
2147                                                gfp | __GFP_NORETRY, order);
2148
2149                        goto out;
2150                }
2151        }
2152
2153        nmask = policy_nodemask(gfp, pol);
2154        preferred_nid = policy_node(gfp, pol, node);
2155        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2156        mpol_cond_put(pol);
2157out:
2158        return page;
2159}
2160EXPORT_SYMBOL(alloc_pages_vma);
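/*
 * Typical use (sketch only, with "vmf" standing in for the fault's
 * struct vm_fault): an anonymous fault path allocates a movable highmem
 * page for the faulting address while holding mmap_sem for read, letting
 * the VMA/task policy above pick the node:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma,
 *			       vmf->address, numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */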
2161
2162/**
2163 *      alloc_pages_current - Allocate pages.
2164 *
2165 *      @gfp:
2166 *              %GFP_USER   user allocation,
2167 *              %GFP_KERNEL kernel allocation,
2168 *              %GFP_HIGHMEM highmem allocation,
2169 *              %GFP_FS     don't call back into a file system.
2170 *              %GFP_ATOMIC don't sleep.
2171 *      @order: Power of two of allocation size in pages. 0 is a single page.
2172 *
2173 *      Allocate a page from the kernel page pool.  When not in
2174 *      interrupt context, apply the current process' NUMA policy.
2175 *      Returns NULL when no page can be allocated.
2176 */
2177struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2178{
2179        struct mempolicy *pol = &default_policy;
2180        struct page *page;
2181
2182        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2183                pol = get_task_policy(current);
2184
2185        /*
2186         * No reference counting needed for current->mempolicy
2187         * nor system default_policy
2188         */
2189        if (pol->mode == MPOL_INTERLEAVE)
2190                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2191        else
2192                page = __alloc_pages_nodemask(gfp, order,
2193                                policy_node(gfp, pol, numa_node_id()),
2194                                policy_nodemask(gfp, pol));
2195
2196        return page;
2197}
2198EXPORT_SYMBOL(alloc_pages_current);
2199
2200int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2201{
2202        struct mempolicy *pol = mpol_dup(vma_policy(src));
2203
2204        if (IS_ERR(pol))
2205                return PTR_ERR(pol);
2206        dst->vm_policy = pol;
2207        return 0;
2208}
2209
2210/*
2211 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2212 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2213 * with the mems_allowed returned by cpuset_mems_allowed().  This
2214 * keeps mempolicies cpuset relative after its cpuset moves.  See
2215 * further kernel/cpuset.c update_nodemask().
2216 *
2217 * current's mempolicy may be rebound by another task (the task that changes
2218 * the cpuset's mems), so we needn't do rebind work for the current task.
2219 */
2220
2221/* Slow path of a mempolicy duplicate */
2222struct mempolicy *__mpol_dup(struct mempolicy *old)
2223{
2224        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2225
2226        if (!new)
2227                return ERR_PTR(-ENOMEM);
2228
2229        /* task's mempolicy is protected by alloc_lock */
2230        if (old == current->mempolicy) {
2231                task_lock(current);
2232                *new = *old;
2233                task_unlock(current);
2234        } else
2235                *new = *old;
2236
2237        if (current_cpuset_is_being_rebound()) {
2238                nodemask_t mems = cpuset_mems_allowed(current);
2239                mpol_rebind_policy(new, &mems);
2240        }
2241        atomic_set(&new->refcnt, 1);
2242        return new;
2243}
2244
2245/* Slow path of a mempolicy comparison */
2246bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2247{
2248        if (!a || !b)
2249                return false;
2250        if (a->mode != b->mode)
2251                return false;
2252        if (a->flags != b->flags)
2253                return false;
2254        if (mpol_store_user_nodemask(a))
2255                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2256                        return false;
2257
2258        switch (a->mode) {
2259        case MPOL_BIND:
2260                /* Fall through */
2261        case MPOL_INTERLEAVE:
2262                return !!nodes_equal(a->v.nodes, b->v.nodes);
2263        case MPOL_PREFERRED:
2264                /* a's ->flags is the same as b's */
2265                if (a->flags & MPOL_F_LOCAL)
2266                        return true;
2267                return a->v.preferred_node == b->v.preferred_node;
2268        default:
2269                BUG();
2270                return false;
2271        }
2272}
2273
2274/*
2275 * Shared memory backing store policy support.
2276 *
2277 * Remember policies even when nobody has shared memory mapped.
2278 * The policies are kept in Red-Black tree linked from the inode.
2279 * They are protected by the sp->lock rwlock, which should be held
2280 * for any accesses to the tree.
2281 */
2282
2283/*
2284 * lookup first element intersecting start-end.  Caller holds sp->lock for
2285 * reading or for writing
2286 */
2287static struct sp_node *
2288sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2289{
2290        struct rb_node *n = sp->root.rb_node;
2291
2292        while (n) {
2293                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2294
2295                if (start >= p->end)
2296                        n = n->rb_right;
2297                else if (end <= p->start)
2298                        n = n->rb_left;
2299                else
2300                        break;
2301        }
2302        if (!n)
2303                return NULL;
2304        for (;;) {
2305                struct sp_node *w = NULL;
2306                struct rb_node *prev = rb_prev(n);
2307                if (!prev)
2308                        break;
2309                w = rb_entry(prev, struct sp_node, nd);
2310                if (w->end <= start)
2311                        break;
2312                n = prev;
2313        }
2314        return rb_entry(n, struct sp_node, nd);
2315}
2316
2317/*
2318 * Insert a new shared policy into the list.  Caller holds sp->lock for
2319 * writing.
2320 */
2321static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2322{
2323        struct rb_node **p = &sp->root.rb_node;
2324        struct rb_node *parent = NULL;
2325        struct sp_node *nd;
2326
2327        while (*p) {
2328                parent = *p;
2329                nd = rb_entry(parent, struct sp_node, nd);
2330                if (new->start < nd->start)
2331                        p = &(*p)->rb_left;
2332                else if (new->end > nd->end)
2333                        p = &(*p)->rb_right;
2334                else
2335                        BUG();
2336        }
2337        rb_link_node(&new->nd, parent, p);
2338        rb_insert_color(&new->nd, &sp->root);
2339        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2340                 new->policy ? new->policy->mode : 0);
2341}
2342
2343/* Find shared policy intersecting idx */
2344struct mempolicy *
2345mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2346{
2347        struct mempolicy *pol = NULL;
2348        struct sp_node *sn;
2349
2350        if (!sp->root.rb_node)
2351                return NULL;
2352        read_lock(&sp->lock);
2353        sn = sp_lookup(sp, idx, idx+1);
2354        if (sn) {
2355                mpol_get(sn->policy);
2356                pol = sn->policy;
2357        }
2358        read_unlock(&sp->lock);
2359        return pol;
2360}
2361
2362static void sp_free(struct sp_node *n)
2363{
2364        mpol_put(n->policy);
2365        kmem_cache_free(sn_cache, n);
2366}
2367
2368/**
2369 * mpol_misplaced - check whether current page node is valid in policy
2370 *
2371 * @page: page to be checked
2372 * @vma: vm area where page mapped
2373 * @addr: virtual address where page mapped
2374 *
2375 * Lookup current policy node id for vma,addr and "compare to" page's
2376 * node id.
2377 *
2378 * Returns:
2379 *      -1      - not misplaced, page is in the right node
2380 *      node    - node id where the page should be
2381 *
2382 * Policy determination "mimics" alloc_page_vma().
2383 * Called from fault path where we know the vma and faulting address.
2384 */
2385int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2386{
2387        struct mempolicy *pol;
2388        struct zoneref *z;
2389        int curnid = page_to_nid(page);
2390        unsigned long pgoff;
2391        int thiscpu = raw_smp_processor_id();
2392        int thisnid = cpu_to_node(thiscpu);
2393        int polnid = NUMA_NO_NODE;
2394        int ret = -1;
2395
2396        pol = get_vma_policy(vma, addr);
2397        if (!(pol->flags & MPOL_F_MOF))
2398                goto out;
2399
2400        switch (pol->mode) {
2401        case MPOL_INTERLEAVE:
2402                pgoff = vma->vm_pgoff;
2403                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2404                polnid = offset_il_node(pol, pgoff);
2405                break;
2406
2407        case MPOL_PREFERRED:
2408                if (pol->flags & MPOL_F_LOCAL)
2409                        polnid = numa_node_id();
2410                else
2411                        polnid = pol->v.preferred_node;
2412                break;
2413
2414        case MPOL_BIND:
2415
2416                /*
2417                 * MPOL_BIND allows binding to multiple nodes.
2418                 * Use the current page if it is in the policy nodemask,
2419                 * else select the nearest allowed node, if any.
2420                 * If no allowed nodes, use current [!misplaced].
2421                 */
2422                if (node_isset(curnid, pol->v.nodes))
2423                        goto out;
2424                z = first_zones_zonelist(
2425                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2426                                gfp_zone(GFP_HIGHUSER),
2427                                &pol->v.nodes);
2428                polnid = zone_to_nid(z->zone);
2429                break;
2430
2431        default:
2432                BUG();
2433        }
2434
2435        /* Migrate the page towards the node whose CPU is referencing it */
2436        if (pol->flags & MPOL_F_MORON) {
2437                polnid = thisnid;
2438
2439                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2440                        goto out;
2441        }
2442
2443        if (curnid != polnid)
2444                ret = polnid;
2445out:
2446        mpol_cond_put(pol);
2447
2448        return ret;
2449}
2450
2451/*
2452 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2453 * dropped after task->mempolicy is set to NULL so that any allocation done as
2454 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2455 * policy.
2456 */
2457void mpol_put_task_policy(struct task_struct *task)
2458{
2459        struct mempolicy *pol;
2460
2461        task_lock(task);
2462        pol = task->mempolicy;
2463        task->mempolicy = NULL;
2464        task_unlock(task);
2465        mpol_put(pol);
2466}
2467
2468static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2469{
2470        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2471        rb_erase(&n->nd, &sp->root);
2472        sp_free(n);
2473}
2474
2475static void sp_node_init(struct sp_node *node, unsigned long start,
2476                        unsigned long end, struct mempolicy *pol)
2477{
2478        node->start = start;
2479        node->end = end;
2480        node->policy = pol;
2481}
2482
2483static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2484                                struct mempolicy *pol)
2485{
2486        struct sp_node *n;
2487        struct mempolicy *newpol;
2488
2489        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2490        if (!n)
2491                return NULL;
2492
2493        newpol = mpol_dup(pol);
2494        if (IS_ERR(newpol)) {
2495                kmem_cache_free(sn_cache, n);
2496                return NULL;
2497        }
2498        newpol->flags |= MPOL_F_SHARED;
2499        sp_node_init(n, start, end, newpol);
2500
2501        return n;
2502}
2503
2504/* Replace a policy range. */
2505static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2506                                 unsigned long end, struct sp_node *new)
2507{
2508        struct sp_node *n;
2509        struct sp_node *n_new = NULL;
2510        struct mempolicy *mpol_new = NULL;
2511        int ret = 0;
2512
2513restart:
2514        write_lock(&sp->lock);
2515        n = sp_lookup(sp, start, end);
2516        /* Take care of old policies in the same range. */
2517        while (n && n->start < end) {
2518                struct rb_node *next = rb_next(&n->nd);
2519                if (n->start >= start) {
2520                        if (n->end <= end)
2521                                sp_delete(sp, n);
2522                        else
2523                                n->start = end;
2524                } else {
2525                        /* Old policy spanning whole new range. */
2526                        if (n->end > end) {
2527                                if (!n_new)
2528                                        goto alloc_new;
2529
2530                                *mpol_new = *n->policy;
2531                                atomic_set(&mpol_new->refcnt, 1);
2532                                sp_node_init(n_new, end, n->end, mpol_new);
2533                                n->end = start;
2534                                sp_insert(sp, n_new);
2535                                n_new = NULL;
2536                                mpol_new = NULL;
2537                                break;
2538                        } else
2539                                n->end = start;
2540                }
2541                if (!next)
2542                        break;
2543                n = rb_entry(next, struct sp_node, nd);
2544        }
2545        if (new)
2546                sp_insert(sp, new);
2547        write_unlock(&sp->lock);
2548        ret = 0;
2549
2550err_out:
2551        if (mpol_new)
2552                mpol_put(mpol_new);
2553        if (n_new)
2554                kmem_cache_free(sn_cache, n_new);
2555
2556        return ret;
2557
2558alloc_new:
2559        write_unlock(&sp->lock);
2560        ret = -ENOMEM;
2561        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2562        if (!n_new)
2563                goto err_out;
2564        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2565        if (!mpol_new)
2566                goto err_out;
2567        goto restart;
2568}
2569
2570/**
2571 * mpol_shared_policy_init - initialize shared policy for inode
2572 * @sp: pointer to inode shared policy
2573 * @mpol:  struct mempolicy to install
2574 *
2575 * Install non-NULL @mpol in inode's shared policy rb-tree.
2576 * On entry, the current task has a reference on a non-NULL @mpol.
2577 * This must be released on exit.
2578 * This is called at get_inode() time, so we can use GFP_KERNEL.
2579 */
2580void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2581{
2582        int ret;
2583
2584        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2585        rwlock_init(&sp->lock);
2586
2587        if (mpol) {
2588                struct vm_area_struct pvma;
2589                struct mempolicy *new;
2590                NODEMASK_SCRATCH(scratch);
2591
2592                if (!scratch)
2593                        goto put_mpol;
2594                /* contextualize the tmpfs mount point mempolicy */
2595                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2596                if (IS_ERR(new))
2597                        goto free_scratch; /* no valid nodemask intersection */
2598
2599                task_lock(current);
2600                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2601                task_unlock(current);
2602                if (ret)
2603                        goto put_new;
2604
2605                /* Create pseudo-vma that contains just the policy */
2606                vma_init(&pvma, NULL);
2607                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2608                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2609
2610put_new:
2611                mpol_put(new);                  /* drop initial ref */
2612free_scratch:
2613                NODEMASK_SCRATCH_FREE(scratch);
2614put_mpol:
2615                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2616        }
2617}
2618
2619int mpol_set_shared_policy(struct shared_policy *info,
2620                        struct vm_area_struct *vma, struct mempolicy *npol)
2621{
2622        int err;
2623        struct sp_node *new = NULL;
2624        unsigned long sz = vma_pages(vma);
2625
2626        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2627                 vma->vm_pgoff,
2628                 sz, npol ? npol->mode : -1,
2629                 npol ? npol->flags : -1,
2630                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2631
2632        if (npol) {
2633                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2634                if (!new)
2635                        return -ENOMEM;
2636        }
2637        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2638        if (err && new)
2639                sp_free(new);
2640        return err;
2641}
2642
2643/* Free a backing policy store on inode delete. */
2644void mpol_free_shared_policy(struct shared_policy *p)
2645{
2646        struct sp_node *n;
2647        struct rb_node *next;
2648
2649        if (!p->root.rb_node)
2650                return;
2651        write_lock(&p->lock);
2652        next = rb_first(&p->root);
2653        while (next) {
2654                n = rb_entry(next, struct sp_node, nd);
2655                next = rb_next(&n->nd);
2656                sp_delete(p, n);
2657        }
2658        write_unlock(&p->lock);
2659}
2660
2661#ifdef CONFIG_NUMA_BALANCING
2662static int __initdata numabalancing_override;
2663
2664static void __init check_numabalancing_enable(void)
2665{
2666        bool numabalancing_default = false;
2667
2668        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2669                numabalancing_default = true;
2670
2671        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2672        if (numabalancing_override)
2673                set_numabalancing_state(numabalancing_override == 1);
2674
2675        if (num_online_nodes() > 1 && !numabalancing_override) {
2676                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2677                        numabalancing_default ? "Enabling" : "Disabling");
2678                set_numabalancing_state(numabalancing_default);
2679        }
2680}
2681
2682static int __init setup_numabalancing(char *str)
2683{
2684        int ret = 0;
2685        if (!str)
2686                goto out;
2687
2688        if (!strcmp(str, "enable")) {
2689                numabalancing_override = 1;
2690                ret = 1;
2691        } else if (!strcmp(str, "disable")) {
2692                numabalancing_override = -1;
2693                ret = 1;
2694        }
2695out:
2696        if (!ret)
2697                pr_warn("Unable to parse numa_balancing=\n");
2698
2699        return ret;
2700}
2701__setup("numa_balancing=", setup_numabalancing);
2702#else
2703static inline void __init check_numabalancing_enable(void)
2704{
2705}
2706#endif /* CONFIG_NUMA_BALANCING */
2707
2708/* assumes fs == KERNEL_DS */
2709void __init numa_policy_init(void)
2710{
2711        nodemask_t interleave_nodes;
2712        unsigned long largest = 0;
2713        int nid, prefer = 0;
2714
2715        policy_cache = kmem_cache_create("numa_policy",
2716                                         sizeof(struct mempolicy),
2717                                         0, SLAB_PANIC, NULL);
2718
2719        sn_cache = kmem_cache_create("shared_policy_node",
2720                                     sizeof(struct sp_node),
2721                                     0, SLAB_PANIC, NULL);
2722
2723        for_each_node(nid) {
2724                preferred_node_policy[nid] = (struct mempolicy) {
2725                        .refcnt = ATOMIC_INIT(1),
2726                        .mode = MPOL_PREFERRED,
2727                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2728                        .v = { .preferred_node = nid, },
2729                };
2730        }
2731
2732        /*
2733         * Set interleaving policy for system init. Interleaving is only
2734         * enabled across suitably sized nodes (default is >= 16MB), or
2735         * fall back to the largest node if they're all smaller.
2736         */
2737        nodes_clear(interleave_nodes);
2738        for_each_node_state(nid, N_MEMORY) {
2739                unsigned long total_pages = node_present_pages(nid);
2740
2741                /* Preserve the largest node */
2742                if (largest < total_pages) {
2743                        largest = total_pages;
2744                        prefer = nid;
2745                }
2746
2747                /* Interleave this node? */
2748                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2749                        node_set(nid, interleave_nodes);
2750        }
2751
2752        /* All too small, use the largest */
2753        if (unlikely(nodes_empty(interleave_nodes)))
2754                node_set(prefer, interleave_nodes);
2755
2756        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2757                pr_err("%s: interleaving failed\n", __func__);
2758
2759        check_numabalancing_enable();
2760}
2761
2762/* Reset policy of current process to default */
2763void numa_default_policy(void)
2764{
2765        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2766}
2767
2768/*
2769 * Parse and format mempolicy from/to strings
2770 */
2771
2772/*
2773 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2774 */
2775static const char * const policy_modes[] =
2776{
2777        [MPOL_DEFAULT]    = "default",
2778        [MPOL_PREFERRED]  = "prefer",
2779        [MPOL_BIND]       = "bind",
2780        [MPOL_INTERLEAVE] = "interleave",
2781        [MPOL_LOCAL]      = "local",
2782};
2783
2784
2785#ifdef CONFIG_TMPFS
2786/**
2787 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2788 * @str:  string containing mempolicy to parse
2789 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2790 *
2791 * Format of input:
2792 *      <mode>[=<flags>][:<nodelist>]
2793 *
2794 * On success, returns 0, else 1
2795 */
2796int mpol_parse_str(char *str, struct mempolicy **mpol)
2797{
2798        struct mempolicy *new = NULL;
2799        unsigned short mode_flags;
2800        nodemask_t nodes;
2801        char *nodelist = strchr(str, ':');
2802        char *flags = strchr(str, '=');
2803        int err = 1, mode;
2804
2805        if (nodelist) {
2806                /* NUL-terminate mode or flags string */
2807                *nodelist++ = '\0';
2808                if (nodelist_parse(nodelist, nodes))
2809                        goto out;
2810                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2811                        goto out;
2812        } else
2813                nodes_clear(nodes);
2814
2815        if (flags)
2816                *flags++ = '\0';        /* terminate mode string */
2817
2818        mode = match_string(policy_modes, MPOL_MAX, str);
2819        if (mode < 0)
2820                goto out;
2821
2822        switch (mode) {
2823        case MPOL_PREFERRED:
2824                /*
2825                 * Insist on a nodelist of one node only
2826                 */
2827                if (nodelist) {
2828                        char *rest = nodelist;
2829                        while (isdigit(*rest))
2830                                rest++;
2831                        if (*rest)
2832                                goto out;
2833                }
2834                break;
2835        case MPOL_INTERLEAVE:
2836                /*
2837                 * Default to online nodes with memory if no nodelist
2838                 */
2839                if (!nodelist)
2840                        nodes = node_states[N_MEMORY];
2841                break;
2842        case MPOL_LOCAL:
2843                /*
2844                 * Don't allow a nodelist;  mpol_new() checks flags
2845                 */
2846                if (nodelist)
2847                        goto out;
2848                mode = MPOL_PREFERRED;
2849                break;
2850        case MPOL_DEFAULT:
2851                /*
2852                 * Insist on an empty nodelist
2853                 */
2854                if (!nodelist)
2855                        err = 0;
2856                goto out;
2857        case MPOL_BIND:
2858                /*
2859                 * Insist on a nodelist
2860                 */
2861                if (!nodelist)
2862                        goto out;
2863        }
2864
2865        mode_flags = 0;
2866        if (flags) {
2867                /*
2868                 * Currently, we only support two mutually exclusive
2869                 * mode flags.
2870                 */
2871                if (!strcmp(flags, "static"))
2872                        mode_flags |= MPOL_F_STATIC_NODES;
2873                else if (!strcmp(flags, "relative"))
2874                        mode_flags |= MPOL_F_RELATIVE_NODES;
2875                else
2876                        goto out;
2877        }
2878
2879        new = mpol_new(mode, mode_flags, &nodes);
2880        if (IS_ERR(new))
2881                goto out;
2882
2883        /*
2884         * Save nodes for mpol_to_str() to show the tmpfs mount options
2885         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2886         */
2887        if (mode != MPOL_PREFERRED)
2888                new->v.nodes = nodes;
2889        else if (nodelist)
2890                new->v.preferred_node = first_node(nodes);
2891        else
2892                new->flags |= MPOL_F_LOCAL;
2893
2894        /*
2895         * Save nodes for contextualization: this will be used to "clone"
2896         * the mempolicy in a specific context [cpuset] at a later time.
2897         */
2898        new->w.user_nodemask = nodes;
2899
2900        err = 0;
2901
2902out:
2903        /* Restore string for error message */
2904        if (nodelist)
2905                *--nodelist = ':';
2906        if (flags)
2907                *--flags = '=';
2908        if (!err)
2909                *mpol = new;
2910        return err;
2911}
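/*
 * Illustrative sketch (not part of this file): the string parsed above is
 * the tmpfs "mpol=" mount option, so interleaving a tmpfs instance over
 * nodes 0-3 from userspace looks like
 * "mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt", or via mount(2):
 *
 *	#include <sys/mount.h>
 *
 *	static int mount_interleaved_tmpfs(const char *target)
 *	{
 *		return mount("tmpfs", target, "tmpfs", 0,
 *			     "mpol=interleave:0-3,size=1G");
 *	}
 */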
2912#endif /* CONFIG_TMPFS */
2913
2914/**
2915 * mpol_to_str - format a mempolicy structure for printing
2916 * @buffer:  to contain formatted mempolicy string
2917 * @maxlen:  length of @buffer
2918 * @pol:  pointer to mempolicy to be formatted
2919 *
2920 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2921 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2922 * longest flag, "relative", and to display at least a few node ids.
2923 */
2924void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2925{
2926        char *p = buffer;
2927        nodemask_t nodes = NODE_MASK_NONE;
2928        unsigned short mode = MPOL_DEFAULT;
2929        unsigned short flags = 0;
2930
2931        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2932                mode = pol->mode;
2933                flags = pol->flags;
2934        }
2935
2936        switch (mode) {
2937        case MPOL_DEFAULT:
2938                break;
2939        case MPOL_PREFERRED:
2940                if (flags & MPOL_F_LOCAL)
2941                        mode = MPOL_LOCAL;
2942                else
2943                        node_set(pol->v.preferred_node, nodes);
2944                break;
2945        case MPOL_BIND:
2946        case MPOL_INTERLEAVE:
2947                nodes = pol->v.nodes;
2948                break;
2949        default:
2950                WARN_ON_ONCE(1);
2951                snprintf(p, maxlen, "unknown");
2952                return;
2953        }
2954
2955        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2956
2957        if (flags & MPOL_MODE_FLAGS) {
2958                p += snprintf(p, buffer + maxlen - p, "=");
2959
2960                /*
2961                 * Currently, the only defined flags are mutually exclusive
2962                 */
2963                if (flags & MPOL_F_STATIC_NODES)
2964                        p += snprintf(p, buffer + maxlen - p, "static");
2965                else if (flags & MPOL_F_RELATIVE_NODES)
2966                        p += snprintf(p, buffer + maxlen - p, "relative");
2967        }
2968
2969        if (!nodes_empty(nodes))
2970                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2971                               nodemask_pr_args(&nodes));
2972}
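/*
 * Example output (illustrative): mpol_to_str() emits the same
 * <mode>[=<flags>][:<nodelist>] form that mpol_parse_str() accepts, e.g.:
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *	// buf now holds e.g. "default", "local", "prefer:1",
 *	// "interleave:0-3" or "bind=static:0,2" depending on *pol.
 */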
2973