linux/mm/mempolicy.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy an process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem kernel lowmem allocation don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
 115/* Highest zone. An specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132        struct mempolicy *pol = p->mempolicy;
 133        int node;
 134
 135        if (pol)
 136                return pol;
 137
 138        node = numa_node_id();
 139        if (node != NUMA_NO_NODE) {
 140                pol = &preferred_node_policy[node];
 141                /* preferred_node_policy is not initialised early in boot */
 142                if (pol->mode)
 143                        return pol;
 144        }
 145
 146        return &default_policy;
 147}
 148
 149static const struct mempolicy_operations {
 150        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 151        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 152} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156        return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                   const nodemask_t *rel)
 161{
 162        nodemask_t tmp;
 163        nodes_fold(tmp, *orig, nodes_weight(*rel));
 164        nodes_onto(*ret, tmp, *rel);
 165}
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169        if (nodes_empty(*nodes))
 170                return -EINVAL;
 171        pol->v.nodes = *nodes;
 172        return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!nodes)
 178                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179        else if (nodes_empty(*nodes))
 180                return -EINVAL;                 /*  no allowed nodes */
 181        else
 182                pol->v.preferred_node = first_node(*nodes);
 183        return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (nodes_empty(*nodes))
 189                return -EINVAL;
 190        pol->v.nodes = *nodes;
 191        return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
 201 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206        int ret;
 207
 208        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209        if (pol == NULL)
 210                return 0;
 211        /* Check N_MEMORY */
 212        nodes_and(nsc->mask1,
 213                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215        VM_BUG_ON(!nodes);
 216        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                nodes = NULL;   /* explicit local allocation */
 218        else {
 219                if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                else
 222                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                if (mpol_store_user_nodemask(pol))
 225                        pol->w.user_nodemask = *nodes;
 226                else
 227                        pol->w.cpuset_mems_allowed =
 228                                                cpuset_current_mems_allowed;
 229        }
 230
 231        if (nodes)
 232                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233        else
 234                ret = mpol_ops[pol->mode].create(pol, NULL);
 235        return ret;
 236}
 237
 238/*
 239 * This function just creates a new policy, does some check and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                  nodemask_t *nodes)
 244{
 245        struct mempolicy *policy;
 246
 247        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250        if (mode == MPOL_DEFAULT) {
 251                if (nodes && !nodes_empty(*nodes))
 252                        return ERR_PTR(-EINVAL);
 253                return NULL;
 254        }
 255        VM_BUG_ON(!nodes);
 256
 257        /*
 258         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260         * All other modes require a valid pointer to a non-empty nodemask.
 261         */
 262        if (mode == MPOL_PREFERRED) {
 263                if (nodes_empty(*nodes)) {
 264                        if (((flags & MPOL_F_STATIC_NODES) ||
 265                             (flags & MPOL_F_RELATIVE_NODES)))
 266                                return ERR_PTR(-EINVAL);
 267                }
 268        } else if (mode == MPOL_LOCAL) {
 269                if (!nodes_empty(*nodes) ||
 270                    (flags & MPOL_F_STATIC_NODES) ||
 271                    (flags & MPOL_F_RELATIVE_NODES))
 272                        return ERR_PTR(-EINVAL);
 273                mode = MPOL_PREFERRED;
 274        } else if (nodes_empty(*nodes))
 275                return ERR_PTR(-EINVAL);
 276        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277        if (!policy)
 278                return ERR_PTR(-ENOMEM);
 279        atomic_set(&policy->refcnt, 1);
 280        policy->mode = mode;
 281        policy->flags = flags;
 282
 283        return policy;
 284}
 285
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289        if (!atomic_dec_and_test(&p->refcnt))
 290                return;
 291        kmem_cache_free(policy_cache, p);
 292}
 293
 294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 295{
 296}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300        nodemask_t tmp;
 301
 302        if (pol->flags & MPOL_F_STATIC_NODES)
 303                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306        else {
 307                nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
 308                                                                *nodes);
 309                pol->w.cpuset_mems_allowed = *nodes;
 310        }
 311
 312        if (nodes_empty(tmp))
 313                tmp = *nodes;
 314
 315        pol->v.nodes = tmp;
 316}
 317
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                const nodemask_t *nodes)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES) {
 324                int node = first_node(pol->w.user_nodemask);
 325
 326                if (node_isset(node, *nodes)) {
 327                        pol->v.preferred_node = node;
 328                        pol->flags &= ~MPOL_F_LOCAL;
 329                } else
 330                        pol->flags |= MPOL_F_LOCAL;
 331        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                pol->v.preferred_node = first_node(tmp);
 334        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                   pol->w.cpuset_mems_allowed,
 337                                                   *nodes);
 338                pol->w.cpuset_mems_allowed = *nodes;
 339        }
 340}
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351        if (!pol)
 352                return;
 353        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 354            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                return;
 356
 357        mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires task
 362 * pointer, and updates task mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369        mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380        struct vm_area_struct *vma;
 381
 382        down_write(&mm->mmap_sem);
 383        for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                mpol_rebind_policy(vma->vm_policy, new);
 385        up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389        [MPOL_DEFAULT] = {
 390                .rebind = mpol_rebind_default,
 391        },
 392        [MPOL_INTERLEAVE] = {
 393                .create = mpol_new_interleave,
 394                .rebind = mpol_rebind_nodemask,
 395        },
 396        [MPOL_PREFERRED] = {
 397                .create = mpol_new_preferred,
 398                .rebind = mpol_rebind_preferred,
 399        },
 400        [MPOL_BIND] = {
 401                .create = mpol_new_bind,
 402                .rebind = mpol_rebind_nodemask,
 403        },
 404};
 405
 406static int migrate_page_add(struct page *page, struct list_head *pagelist,
 407                                unsigned long flags);
 408
 409struct queue_pages {
 410        struct list_head *pagelist;
 411        unsigned long flags;
 412        nodemask_t *nmask;
 413        unsigned long start;
 414        unsigned long end;
 415        struct vm_area_struct *first;
 416};
 417
 418/*
 419 * Check if the page's nid is in qp->nmask.
 420 *
 421 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 422 * in the invert of qp->nmask.
 423 */
 424static inline bool queue_pages_required(struct page *page,
 425                                        struct queue_pages *qp)
 426{
 427        int nid = page_to_nid(page);
 428        unsigned long flags = qp->flags;
 429
 430        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 431}
 432
 433/*
 434 * queue_pages_pmd() has four possible return values:
 435 * 0 - pages are placed on the right node or queued successfully.
 436 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 437 *     specified.
 438 * 2 - THP was split.
 439 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 440 *        existing page was already on a node that does not follow the
 441 *        policy.
 442 */
 443static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 444                                unsigned long end, struct mm_walk *walk)
 445{
 446        int ret = 0;
 447        struct page *page;
 448        struct queue_pages *qp = walk->private;
 449        unsigned long flags;
 450
 451        if (unlikely(is_pmd_migration_entry(*pmd))) {
 452                ret = -EIO;
 453                goto unlock;
 454        }
 455        page = pmd_page(*pmd);
 456        if (is_huge_zero_page(page)) {
 457                spin_unlock(ptl);
 458                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 459                ret = 2;
 460                goto out;
 461        }
 462        if (!queue_pages_required(page, qp))
 463                goto unlock;
 464
 465        flags = qp->flags;
 466        /* go to thp migration */
 467        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 468                if (!vma_migratable(walk->vma) ||
 469                    migrate_page_add(page, qp->pagelist, flags)) {
 470                        ret = 1;
 471                        goto unlock;
 472                }
 473        } else
 474                ret = -EIO;
 475unlock:
 476        spin_unlock(ptl);
 477out:
 478        return ret;
 479}
 480
 481/*
 482 * Scan through pages checking if pages follow certain conditions,
 483 * and move them to the pagelist if they do.
 484 *
 485 * queue_pages_pte_range() has three possible return values:
 486 * 0 - pages are placed on the right node or queued successfully.
 487 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 488 *     specified.
 489 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 490 *        on a node that does not follow the policy.
 491 */
 492static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 493                        unsigned long end, struct mm_walk *walk)
 494{
 495        struct vm_area_struct *vma = walk->vma;
 496        struct page *page;
 497        struct queue_pages *qp = walk->private;
 498        unsigned long flags = qp->flags;
 499        int ret;
 500        bool has_unmovable = false;
 501        pte_t *pte;
 502        spinlock_t *ptl;
 503
 504        ptl = pmd_trans_huge_lock(pmd, vma);
 505        if (ptl) {
 506                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 507                if (ret != 2)
 508                        return ret;
 509        }
 510        /* THP was split, fall through to pte walk */
 511
 512        if (pmd_trans_unstable(pmd))
 513                return 0;
 514
 515        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 516        for (; addr != end; pte++, addr += PAGE_SIZE) {
 517                if (!pte_present(*pte))
 518                        continue;
 519                page = vm_normal_page(vma, addr, *pte);
 520                if (!page)
 521                        continue;
 522                /*
 523                 * vm_normal_page() filters out zero pages, but there might
 524                 * still be PageReserved pages to skip, perhaps in a VDSO.
 525                 */
 526                if (PageReserved(page))
 527                        continue;
 528                if (!queue_pages_required(page, qp))
 529                        continue;
 530                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 531                        /* MPOL_MF_STRICT must be specified if we get here */
 532                        if (!vma_migratable(vma)) {
 533                                has_unmovable = true;
 534                                break;
 535                        }
 536
 537                        /*
 538                         * Do not abort immediately since there may be
 539                         * temporary off LRU pages in the range.  Still
 540                         * need migrate other LRU pages.
 541                         */
 542                        if (migrate_page_add(page, qp->pagelist, flags))
 543                                has_unmovable = true;
 544                } else
 545                        break;
 546        }
 547        pte_unmap_unlock(pte - 1, ptl);
 548        cond_resched();
 549
 550        if (has_unmovable)
 551                return 1;
 552
 553        return addr != end ? -EIO : 0;
 554}
 555
 556static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 557                               unsigned long addr, unsigned long end,
 558                               struct mm_walk *walk)
 559{
 560#ifdef CONFIG_HUGETLB_PAGE
 561        struct queue_pages *qp = walk->private;
 562        unsigned long flags = qp->flags;
 563        struct page *page;
 564        spinlock_t *ptl;
 565        pte_t entry;
 566
 567        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 568        entry = huge_ptep_get(pte);
 569        if (!pte_present(entry))
 570                goto unlock;
 571        page = pte_page(entry);
 572        if (!queue_pages_required(page, qp))
 573                goto unlock;
 574        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 575        if (flags & (MPOL_MF_MOVE_ALL) ||
 576            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 577                isolate_huge_page(page, qp->pagelist);
 578unlock:
 579        spin_unlock(ptl);
 580#else
 581        BUG();
 582#endif
 583        return 0;
 584}
 585
 586#ifdef CONFIG_NUMA_BALANCING
 587/*
 588 * This is used to mark a range of virtual addresses to be inaccessible.
 589 * These are later cleared by a NUMA hinting fault. Depending on these
 590 * faults, pages may be migrated for better NUMA placement.
 591 *
 592 * This is assuming that NUMA faults are handled using PROT_NONE. If
 593 * an architecture makes a different choice, it will need further
 594 * changes to the core.
 595 */
 596unsigned long change_prot_numa(struct vm_area_struct *vma,
 597                        unsigned long addr, unsigned long end)
 598{
 599        int nr_updated;
 600
 601        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 602        if (nr_updated)
 603                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 604
 605        return nr_updated;
 606}
 607#else
 608static unsigned long change_prot_numa(struct vm_area_struct *vma,
 609                        unsigned long addr, unsigned long end)
 610{
 611        return 0;
 612}
 613#endif /* CONFIG_NUMA_BALANCING */
 614
 615static int queue_pages_test_walk(unsigned long start, unsigned long end,
 616                                struct mm_walk *walk)
 617{
 618        struct vm_area_struct *vma = walk->vma;
 619        struct queue_pages *qp = walk->private;
 620        unsigned long endvma = vma->vm_end;
 621        unsigned long flags = qp->flags;
 622
 623        /* range check first */
 624        VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
 625
 626        if (!qp->first) {
 627                qp->first = vma;
 628                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 629                        (qp->start < vma->vm_start))
 630                        /* hole at head side of range */
 631                        return -EFAULT;
 632        }
 633        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 634                ((vma->vm_end < qp->end) &&
 635                (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
 636                /* hole at middle or tail of range */
 637                return -EFAULT;
 638
 639        /*
 640         * Need check MPOL_MF_STRICT to return -EIO if possible
 641         * regardless of vma_migratable
 642         */
 643        if (!vma_migratable(vma) &&
 644            !(flags & MPOL_MF_STRICT))
 645                return 1;
 646
 647        if (endvma > end)
 648                endvma = end;
 649
 650        if (flags & MPOL_MF_LAZY) {
 651                /* Similar to task_numa_work, skip inaccessible VMAs */
 652                if (!is_vm_hugetlb_page(vma) &&
 653                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 654                        !(vma->vm_flags & VM_MIXEDMAP))
 655                        change_prot_numa(vma, start, endvma);
 656                return 1;
 657        }
 658
 659        /* queue pages from current vma */
 660        if (flags & MPOL_MF_VALID)
 661                return 0;
 662        return 1;
 663}
 664
 665static const struct mm_walk_ops queue_pages_walk_ops = {
 666        .hugetlb_entry          = queue_pages_hugetlb,
 667        .pmd_entry              = queue_pages_pte_range,
 668        .test_walk              = queue_pages_test_walk,
 669};
 670
 671/*
 672 * Walk through page tables and collect pages to be migrated.
 673 *
 674 * If pages found in a given range are on a set of nodes (determined by
 675 * @nodes and @flags,) it's isolated and queued to the pagelist which is
 676 * passed via @private.
 677 *
 678 * queue_pages_range() has three possible return values:
 679 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 680 *     specified.
 681 * 0 - queue pages successfully or no misplaced page.
 682 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 683 *         memory range specified by nodemask and maxnode points outside
 684 *         your accessible address space (-EFAULT)
 685 */
 686static int
 687queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 688                nodemask_t *nodes, unsigned long flags,
 689                struct list_head *pagelist)
 690{
 691        int err;
 692        struct queue_pages qp = {
 693                .pagelist = pagelist,
 694                .flags = flags,
 695                .nmask = nodes,
 696                .start = start,
 697                .end = end,
 698                .first = NULL,
 699        };
 700
 701        err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 702
 703        if (!qp.first)
 704                /* whole range in hole */
 705                err = -EFAULT;
 706
 707        return err;
 708}
 709
 710/*
 711 * Apply policy to a single VMA
 712 * This must be called with the mmap_sem held for writing.
 713 */
 714static int vma_replace_policy(struct vm_area_struct *vma,
 715                                                struct mempolicy *pol)
 716{
 717        int err;
 718        struct mempolicy *old;
 719        struct mempolicy *new;
 720
 721        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 722                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 723                 vma->vm_ops, vma->vm_file,
 724                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 725
 726        new = mpol_dup(pol);
 727        if (IS_ERR(new))
 728                return PTR_ERR(new);
 729
 730        if (vma->vm_ops && vma->vm_ops->set_policy) {
 731                err = vma->vm_ops->set_policy(vma, new);
 732                if (err)
 733                        goto err_out;
 734        }
 735
 736        old = vma->vm_policy;
 737        vma->vm_policy = new; /* protected by mmap_sem */
 738        mpol_put(old);
 739
 740        return 0;
 741 err_out:
 742        mpol_put(new);
 743        return err;
 744}
 745
 746/* Step 2: apply policy to a range and do splits. */
 747static int mbind_range(struct mm_struct *mm, unsigned long start,
 748                       unsigned long end, struct mempolicy *new_pol)
 749{
 750        struct vm_area_struct *next;
 751        struct vm_area_struct *prev;
 752        struct vm_area_struct *vma;
 753        int err = 0;
 754        pgoff_t pgoff;
 755        unsigned long vmstart;
 756        unsigned long vmend;
 757
 758        vma = find_vma(mm, start);
 759        VM_BUG_ON(!vma);
 760
 761        prev = vma->vm_prev;
 762        if (start > vma->vm_start)
 763                prev = vma;
 764
 765        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 766                next = vma->vm_next;
 767                vmstart = max(start, vma->vm_start);
 768                vmend   = min(end, vma->vm_end);
 769
 770                if (mpol_equal(vma_policy(vma), new_pol))
 771                        continue;
 772
 773                pgoff = vma->vm_pgoff +
 774                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 775                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 776                                 vma->anon_vma, vma->vm_file, pgoff,
 777                                 new_pol, vma->vm_userfaultfd_ctx);
 778                if (prev) {
 779                        vma = prev;
 780                        next = vma->vm_next;
 781                        if (mpol_equal(vma_policy(vma), new_pol))
 782                                continue;
 783                        /* vma_merge() joined vma && vma->next, case 8 */
 784                        goto replace;
 785                }
 786                if (vma->vm_start != vmstart) {
 787                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 788                        if (err)
 789                                goto out;
 790                }
 791                if (vma->vm_end != vmend) {
 792                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 793                        if (err)
 794                                goto out;
 795                }
 796 replace:
 797                err = vma_replace_policy(vma, new_pol);
 798                if (err)
 799                        goto out;
 800        }
 801
 802 out:
 803        return err;
 804}
 805
 806/* Set the process memory policy */
 807static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 808                             nodemask_t *nodes)
 809{
 810        struct mempolicy *new, *old;
 811        NODEMASK_SCRATCH(scratch);
 812        int ret;
 813
 814        if (!scratch)
 815                return -ENOMEM;
 816
 817        new = mpol_new(mode, flags, nodes);
 818        if (IS_ERR(new)) {
 819                ret = PTR_ERR(new);
 820                goto out;
 821        }
 822
 823        task_lock(current);
 824        ret = mpol_set_nodemask(new, nodes, scratch);
 825        if (ret) {
 826                task_unlock(current);
 827                mpol_put(new);
 828                goto out;
 829        }
 830        old = current->mempolicy;
 831        current->mempolicy = new;
 832        if (new && new->mode == MPOL_INTERLEAVE)
 833                current->il_prev = MAX_NUMNODES-1;
 834        task_unlock(current);
 835        mpol_put(old);
 836        ret = 0;
 837out:
 838        NODEMASK_SCRATCH_FREE(scratch);
 839        return ret;
 840}
 841
 842/*
 843 * Return nodemask for policy for get_mempolicy() query
 844 *
 845 * Called with task's alloc_lock held
 846 */
 847static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 848{
 849        nodes_clear(*nodes);
 850        if (p == &default_policy)
 851                return;
 852
 853        switch (p->mode) {
 854        case MPOL_BIND:
 855                /* Fall through */
 856        case MPOL_INTERLEAVE:
 857                *nodes = p->v.nodes;
 858                break;
 859        case MPOL_PREFERRED:
 860                if (!(p->flags & MPOL_F_LOCAL))
 861                        node_set(p->v.preferred_node, *nodes);
 862                /* else return empty node mask for local allocation */
 863                break;
 864        default:
 865                BUG();
 866        }
 867}
 868
 869static int lookup_node(struct mm_struct *mm, unsigned long addr)
 870{
 871        struct page *p;
 872        int err;
 873
 874        int locked = 1;
 875        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 876        if (err >= 0) {
 877                err = page_to_nid(p);
 878                put_page(p);
 879        }
 880        if (locked)
 881                up_read(&mm->mmap_sem);
 882        return err;
 883}
 884
 885/* Retrieve NUMA policy */
 886static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 887                             unsigned long addr, unsigned long flags)
 888{
 889        int err;
 890        struct mm_struct *mm = current->mm;
 891        struct vm_area_struct *vma = NULL;
 892        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 893
 894        if (flags &
 895                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 896                return -EINVAL;
 897
 898        if (flags & MPOL_F_MEMS_ALLOWED) {
 899                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 900                        return -EINVAL;
 901                *policy = 0;    /* just so it's initialized */
 902                task_lock(current);
 903                *nmask  = cpuset_current_mems_allowed;
 904                task_unlock(current);
 905                return 0;
 906        }
 907
 908        if (flags & MPOL_F_ADDR) {
 909                /*
 910                 * Do NOT fall back to task policy if the
 911                 * vma/shared policy at addr is NULL.  We
 912                 * want to return MPOL_DEFAULT in this case.
 913                 */
 914                down_read(&mm->mmap_sem);
 915                vma = find_vma_intersection(mm, addr, addr+1);
 916                if (!vma) {
 917                        up_read(&mm->mmap_sem);
 918                        return -EFAULT;
 919                }
 920                if (vma->vm_ops && vma->vm_ops->get_policy)
 921                        pol = vma->vm_ops->get_policy(vma, addr);
 922                else
 923                        pol = vma->vm_policy;
 924        } else if (addr)
 925                return -EINVAL;
 926
 927        if (!pol)
 928                pol = &default_policy;  /* indicates default behavior */
 929
 930        if (flags & MPOL_F_NODE) {
 931                if (flags & MPOL_F_ADDR) {
 932                        /*
 933                         * Take a refcount on the mpol, lookup_node()
 934                         * wil drop the mmap_sem, so after calling
 935                         * lookup_node() only "pol" remains valid, "vma"
 936                         * is stale.
 937                         */
 938                        pol_refcount = pol;
 939                        vma = NULL;
 940                        mpol_get(pol);
 941                        err = lookup_node(mm, addr);
 942                        if (err < 0)
 943                                goto out;
 944                        *policy = err;
 945                } else if (pol == current->mempolicy &&
 946                                pol->mode == MPOL_INTERLEAVE) {
 947                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 948                } else {
 949                        err = -EINVAL;
 950                        goto out;
 951                }
 952        } else {
 953                *policy = pol == &default_policy ? MPOL_DEFAULT :
 954                                                pol->mode;
 955                /*
 956                 * Internal mempolicy flags must be masked off before exposing
 957                 * the policy to userspace.
 958                 */
 959                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 960        }
 961
 962        err = 0;
 963        if (nmask) {
 964                if (mpol_store_user_nodemask(pol)) {
 965                        *nmask = pol->w.user_nodemask;
 966                } else {
 967                        task_lock(current);
 968                        get_policy_nodemask(pol, nmask);
 969                        task_unlock(current);
 970                }
 971        }
 972
 973 out:
 974        mpol_cond_put(pol);
 975        if (vma)
 976                up_read(&mm->mmap_sem);
 977        if (pol_refcount)
 978                mpol_put(pol_refcount);
 979        return err;
 980}
 981
 982#ifdef CONFIG_MIGRATION
 983/*
 984 * page migration, thp tail pages can be passed.
 985 */
 986static int migrate_page_add(struct page *page, struct list_head *pagelist,
 987                                unsigned long flags)
 988{
 989        struct page *head = compound_head(page);
 990        /*
 991         * Avoid migrating a page that is shared with others.
 992         */
 993        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 994                if (!isolate_lru_page(head)) {
 995                        list_add_tail(&head->lru, pagelist);
 996                        mod_node_page_state(page_pgdat(head),
 997                                NR_ISOLATED_ANON + page_is_file_cache(head),
 998                                hpage_nr_pages(head));
 999                } else if (flags & MPOL_MF_STRICT) {
1000                        /*
1001                         * Non-movable page may reach here.  And, there may be
1002                         * temporary off LRU pages or non-LRU movable pages.
1003                         * Treat them as unmovable pages since they can't be
1004                         * isolated, so they can't be moved at the moment.  It
1005                         * should return -EIO for this case too.
1006                         */
1007                        return -EIO;
1008                }
1009        }
1010
1011        return 0;
1012}
1013
1014/* page allocation callback for NUMA node migration */
1015struct page *alloc_new_node_page(struct page *page, unsigned long node)
1016{
1017        if (PageHuge(page))
1018                return alloc_huge_page_node(page_hstate(compound_head(page)),
1019                                        node);
1020        else if (PageTransHuge(page)) {
1021                struct page *thp;
1022
1023                thp = alloc_pages_node(node,
1024                        (GFP_TRANSHUGE | __GFP_THISNODE),
1025                        HPAGE_PMD_ORDER);
1026                if (!thp)
1027                        return NULL;
1028                prep_transhuge_page(thp);
1029                return thp;
1030        } else
1031                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1032                                                    __GFP_THISNODE, 0);
1033}
1034
1035/*
1036 * Migrate pages from one node to a target node.
1037 * Returns error or the number of pages not migrated.
1038 */
1039static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1040                           int flags)
1041{
1042        nodemask_t nmask;
1043        LIST_HEAD(pagelist);
1044        int err = 0;
1045
1046        nodes_clear(nmask);
1047        node_set(source, nmask);
1048
1049        /*
1050         * This does not "check" the range but isolates all pages that
1051         * need migration.  Between passing in the full user address
1052         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1053         */
1054        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1055        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1056                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1057
1058        if (!list_empty(&pagelist)) {
1059                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1060                                        MIGRATE_SYNC, MR_SYSCALL);
1061                if (err)
1062                        putback_movable_pages(&pagelist);
1063        }
1064
1065        return err;
1066}
1067
1068/*
1069 * Move pages between the two nodesets so as to preserve the physical
1070 * layout as much as possible.
1071 *
1072 * Returns the number of page that could not be moved.
1073 */
1074int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1075                     const nodemask_t *to, int flags)
1076{
1077        int busy = 0;
1078        int err;
1079        nodemask_t tmp;
1080
1081        err = migrate_prep();
1082        if (err)
1083                return err;
1084
1085        down_read(&mm->mmap_sem);
1086
1087        /*
1088         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1089         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1090         * bit in 'tmp', and return that <source, dest> pair for migration.
1091         * The pair of nodemasks 'to' and 'from' define the map.
1092         *
1093         * If no pair of bits is found that way, fallback to picking some
1094         * pair of 'source' and 'dest' bits that are not the same.  If the
1095         * 'source' and 'dest' bits are the same, this represents a node
1096         * that will be migrating to itself, so no pages need move.
1097         *
1098         * If no bits are left in 'tmp', or if all remaining bits left
1099         * in 'tmp' correspond to the same bit in 'to', return false
1100         * (nothing left to migrate).
1101         *
1102         * This lets us pick a pair of nodes to migrate between, such that
1103         * if possible the dest node is not already occupied by some other
1104         * source node, minimizing the risk of overloading the memory on a
1105         * node that would happen if we migrated incoming memory to a node
1106         * before migrating outgoing memory source that same node.
1107         *
1108         * A single scan of tmp is sufficient.  As we go, we remember the
1109         * most recent <s, d> pair that moved (s != d).  If we find a pair
1110         * that not only moved, but what's better, moved to an empty slot
1111         * (d is not set in tmp), then we break out then, with that pair.
1112         * Otherwise when we finish scanning from_tmp, we at least have the
1113         * most recent <s, d> pair that moved.  If we get all the way through
1114         * the scan of tmp without finding any node that moved, much less
1115         * moved to an empty node, then there is nothing left worth migrating.
1116         */
1117
1118        tmp = *from;
1119        while (!nodes_empty(tmp)) {
1120                int s,d;
1121                int source = NUMA_NO_NODE;
1122                int dest = 0;
1123
1124                for_each_node_mask(s, tmp) {
1125
1126                        /*
1127                         * do_migrate_pages() tries to maintain the relative
1128                         * node relationship of the pages established between
1129                         * threads and memory areas.
1130                         *
1131                         * However if the number of source nodes is not equal to
1132                         * the number of destination nodes we can not preserve
1133                         * this node relative relationship.  In that case, skip
1134                         * copying memory from a node that is in the destination
1135                         * mask.
1136                         *
1137                         * Example: [2,3,4] -> [3,4,5] moves everything.
1138                         *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1139                         */
1140
1141                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1142                                                (node_isset(s, *to)))
1143                                continue;
1144
1145                        d = node_remap(s, *from, *to);
1146                        if (s == d)
1147                                continue;
1148
1149                        source = s;     /* Node moved. Memorize */
1150                        dest = d;
1151
1152                        /* dest not in remaining from nodes? */
1153                        if (!node_isset(dest, tmp))
1154                                break;
1155                }
1156                if (source == NUMA_NO_NODE)
1157                        break;
1158
1159                node_clear(source, tmp);
1160                err = migrate_to_node(mm, source, dest, flags);
1161                if (err > 0)
1162                        busy += err;
1163                if (err < 0)
1164                        break;
1165        }
1166        up_read(&mm->mmap_sem);
1167        if (err < 0)
1168                return err;
1169        return busy;
1170
1171}
1172
1173/*
1174 * Allocate a new page for page migration based on vma policy.
1175 * Start by assuming the page is mapped by the same vma as contains @start.
1176 * Search forward from there, if not.  N.B., this assumes that the
1177 * list of pages handed to migrate_pages()--which is how we get here--
1178 * is in virtual address order.
1179 */
1180static struct page *new_page(struct page *page, unsigned long start)
1181{
1182        struct vm_area_struct *vma;
1183        unsigned long uninitialized_var(address);
1184
1185        vma = find_vma(current->mm, start);
1186        while (vma) {
1187                address = page_address_in_vma(page, vma);
1188                if (address != -EFAULT)
1189                        break;
1190                vma = vma->vm_next;
1191        }
1192
1193        if (PageHuge(page)) {
1194                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1195                                vma, address);
1196        } else if (PageTransHuge(page)) {
1197                struct page *thp;
1198
1199                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1200                                         HPAGE_PMD_ORDER);
1201                if (!thp)
1202                        return NULL;
1203                prep_transhuge_page(thp);
1204                return thp;
1205        }
1206        /*
1207         * if !vma, alloc_page_vma() will use task or system default policy
1208         */
1209        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1210                        vma, address);
1211}
1212#else
1213
1214static int migrate_page_add(struct page *page, struct list_head *pagelist,
1215                                unsigned long flags)
1216{
1217        return -EIO;
1218}
1219
1220int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1221                     const nodemask_t *to, int flags)
1222{
1223        return -ENOSYS;
1224}
1225
1226static struct page *new_page(struct page *page, unsigned long start)
1227{
1228        return NULL;
1229}
1230#endif
1231
1232static long do_mbind(unsigned long start, unsigned long len,
1233                     unsigned short mode, unsigned short mode_flags,
1234                     nodemask_t *nmask, unsigned long flags)
1235{
1236        struct mm_struct *mm = current->mm;
1237        struct mempolicy *new;
1238        unsigned long end;
1239        int err;
1240        int ret;
1241        LIST_HEAD(pagelist);
1242
1243        if (flags & ~(unsigned long)MPOL_MF_VALID)
1244                return -EINVAL;
1245        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1246                return -EPERM;
1247
1248        if (start & ~PAGE_MASK)
1249                return -EINVAL;
1250
1251        if (mode == MPOL_DEFAULT)
1252                flags &= ~MPOL_MF_STRICT;
1253
1254        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1255        end = start + len;
1256
1257        if (end < start)
1258                return -EINVAL;
1259        if (end == start)
1260                return 0;
1261
1262        new = mpol_new(mode, mode_flags, nmask);
1263        if (IS_ERR(new))
1264                return PTR_ERR(new);
1265
1266        if (flags & MPOL_MF_LAZY)
1267                new->flags |= MPOL_F_MOF;
1268
1269        /*
1270         * If we are using the default policy then operation
1271         * on discontinuous address spaces is okay after all
1272         */
1273        if (!new)
1274                flags |= MPOL_MF_DISCONTIG_OK;
1275
1276        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1277                 start, start + len, mode, mode_flags,
1278                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1279
1280        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1281
1282                err = migrate_prep();
1283                if (err)
1284                        goto mpol_out;
1285        }
1286        {
1287                NODEMASK_SCRATCH(scratch);
1288                if (scratch) {
1289                        down_write(&mm->mmap_sem);
1290                        task_lock(current);
1291                        err = mpol_set_nodemask(new, nmask, scratch);
1292                        task_unlock(current);
1293                        if (err)
1294                                up_write(&mm->mmap_sem);
1295                } else
1296                        err = -ENOMEM;
1297                NODEMASK_SCRATCH_FREE(scratch);
1298        }
1299        if (err)
1300                goto mpol_out;
1301
1302        ret = queue_pages_range(mm, start, end, nmask,
1303                          flags | MPOL_MF_INVERT, &pagelist);
1304
1305        if (ret < 0) {
1306                err = ret;
1307                goto up_out;
1308        }
1309
1310        err = mbind_range(mm, start, end, new);
1311
1312        if (!err) {
1313                int nr_failed = 0;
1314
1315                if (!list_empty(&pagelist)) {
1316                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1317                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1318                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1319                        if (nr_failed)
1320                                putback_movable_pages(&pagelist);
1321                }
1322
1323                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1324                        err = -EIO;
1325        } else {
1326up_out:
1327                if (!list_empty(&pagelist))
1328                        putback_movable_pages(&pagelist);
1329        }
1330
1331        up_write(&mm->mmap_sem);
1332mpol_out:
1333        mpol_put(new);
1334        return err;
1335}
1336
1337/*
1338 * User space interface with variable sized bitmaps for nodelists.
1339 */
1340
1341/* Copy a node mask from user space. */
1342static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1343                     unsigned long maxnode)
1344{
1345        unsigned long k;
1346        unsigned long t;
1347        unsigned long nlongs;
1348        unsigned long endmask;
1349
1350        --maxnode;
1351        nodes_clear(*nodes);
1352        if (maxnode == 0 || !nmask)
1353                return 0;
1354        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1355                return -EINVAL;
1356
1357        nlongs = BITS_TO_LONGS(maxnode);
1358        if ((maxnode % BITS_PER_LONG) == 0)
1359                endmask = ~0UL;
1360        else
1361                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1362
1363        /*
1364         * When the user specified more nodes than supported just check
1365         * if the non supported part is all zero.
1366         *
1367         * If maxnode have more longs than MAX_NUMNODES, check
1368         * the bits in that area first. And then go through to
1369         * check the rest bits which equal or bigger than MAX_NUMNODES.
1370         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1371         */
1372        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1373                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1374                        if (get_user(t, nmask + k))
1375                                return -EFAULT;
1376                        if (k == nlongs - 1) {
1377                                if (t & endmask)
1378                                        return -EINVAL;
1379                        } else if (t)
1380                                return -EINVAL;
1381                }
1382                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1383                endmask = ~0UL;
1384        }
1385
1386        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1387                unsigned long valid_mask = endmask;
1388
1389                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1390                if (get_user(t, nmask + nlongs - 1))
1391                        return -EFAULT;
1392                if (t & valid_mask)
1393                        return -EINVAL;
1394        }
1395
1396        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1397                return -EFAULT;
1398        nodes_addr(*nodes)[nlongs-1] &= endmask;
1399        return 0;
1400}
1401
1402/* Copy a kernel node mask to user space */
1403static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1404                              nodemask_t *nodes)
1405{
1406        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1407        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1408
1409        if (copy > nbytes) {
1410                if (copy > PAGE_SIZE)
1411                        return -EINVAL;
1412                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1413                        return -EFAULT;
1414                copy = nbytes;
1415        }
1416        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1417}
1418
1419static long kernel_mbind(unsigned long start, unsigned long len,
1420                         unsigned long mode, const unsigned long __user *nmask,
1421                         unsigned long maxnode, unsigned int flags)
1422{
1423        nodemask_t nodes;
1424        int err;
1425        unsigned short mode_flags;
1426
1427        start = untagged_addr(start);
1428        mode_flags = mode & MPOL_MODE_FLAGS;
1429        mode &= ~MPOL_MODE_FLAGS;
1430        if (mode >= MPOL_MAX)
1431                return -EINVAL;
1432        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1433            (mode_flags & MPOL_F_RELATIVE_NODES))
1434                return -EINVAL;
1435        err = get_nodes(&nodes, nmask, maxnode);
1436        if (err)
1437                return err;
1438        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1439}
1440
1441SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1442                unsigned long, mode, const unsigned long __user *, nmask,
1443                unsigned long, maxnode, unsigned int, flags)
1444{
1445        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1446}
1447
1448/* Set the process memory policy */
1449static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1450                                 unsigned long maxnode)
1451{
1452        int err;
1453        nodemask_t nodes;
1454        unsigned short flags;
1455
1456        flags = mode & MPOL_MODE_FLAGS;
1457        mode &= ~MPOL_MODE_FLAGS;
1458        if ((unsigned int)mode >= MPOL_MAX)
1459                return -EINVAL;
1460        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1461                return -EINVAL;
1462        err = get_nodes(&nodes, nmask, maxnode);
1463        if (err)
1464                return err;
1465        return do_set_mempolicy(mode, flags, &nodes);
1466}
1467
1468SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1469                unsigned long, maxnode)
1470{
1471        return kernel_set_mempolicy(mode, nmask, maxnode);
1472}
1473
1474static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1475                                const unsigned long __user *old_nodes,
1476                                const unsigned long __user *new_nodes)
1477{
1478        struct mm_struct *mm = NULL;
1479        struct task_struct *task;
1480        nodemask_t task_nodes;
1481        int err;
1482        nodemask_t *old;
1483        nodemask_t *new;
1484        NODEMASK_SCRATCH(scratch);
1485
1486        if (!scratch)
1487                return -ENOMEM;
1488
1489        old = &scratch->mask1;
1490        new = &scratch->mask2;
1491
1492        err = get_nodes(old, old_nodes, maxnode);
1493        if (err)
1494                goto out;
1495
1496        err = get_nodes(new, new_nodes, maxnode);
1497        if (err)
1498                goto out;
1499
1500        /* Find the mm_struct */
1501        rcu_read_lock();
1502        task = pid ? find_task_by_vpid(pid) : current;
1503        if (!task) {
1504                rcu_read_unlock();
1505                err = -ESRCH;
1506                goto out;
1507        }
1508        get_task_struct(task);
1509
1510        err = -EINVAL;
1511
1512        /*
1513         * Check if this process has the right to modify the specified process.
1514         * Use the regular "ptrace_may_access()" checks.
1515         */
1516        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1517                rcu_read_unlock();
1518                err = -EPERM;
1519                goto out_put;
1520        }
1521        rcu_read_unlock();
1522
1523        task_nodes = cpuset_mems_allowed(task);
1524        /* Is the user allowed to access the target nodes? */
1525        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1526                err = -EPERM;
1527                goto out_put;
1528        }
1529
1530        task_nodes = cpuset_mems_allowed(current);
1531        nodes_and(*new, *new, task_nodes);
1532        if (nodes_empty(*new))
1533                goto out_put;
1534
1535        err = security_task_movememory(task);
1536        if (err)
1537                goto out_put;
1538
1539        mm = get_task_mm(task);
1540        put_task_struct(task);
1541
1542        if (!mm) {
1543                err = -EINVAL;
1544                goto out;
1545        }
1546
1547        err = do_migrate_pages(mm, old, new,
1548                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1549
1550        mmput(mm);
1551out:
1552        NODEMASK_SCRATCH_FREE(scratch);
1553
1554        return err;
1555
1556out_put:
1557        put_task_struct(task);
1558        goto out;
1559
1560}
1561
1562SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1563                const unsigned long __user *, old_nodes,
1564                const unsigned long __user *, new_nodes)
1565{
1566        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1567}
1568
1569
1570/* Retrieve NUMA policy */
1571static int kernel_get_mempolicy(int __user *policy,
1572                                unsigned long __user *nmask,
1573                                unsigned long maxnode,
1574                                unsigned long addr,
1575                                unsigned long flags)
1576{
1577        int err;
1578        int uninitialized_var(pval);
1579        nodemask_t nodes;
1580
1581        addr = untagged_addr(addr);
1582
1583        if (nmask != NULL && maxnode < nr_node_ids)
1584                return -EINVAL;
1585
1586        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1587
1588        if (err)
1589                return err;
1590
1591        if (policy && put_user(pval, policy))
1592                return -EFAULT;
1593
1594        if (nmask)
1595                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1596
1597        return err;
1598}
1599
1600SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1601                unsigned long __user *, nmask, unsigned long, maxnode,
1602                unsigned long, addr, unsigned long, flags)
1603{
1604        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1605}
1606
1607#ifdef CONFIG_COMPAT
1608
1609COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1610                       compat_ulong_t __user *, nmask,
1611                       compat_ulong_t, maxnode,
1612                       compat_ulong_t, addr, compat_ulong_t, flags)
1613{
1614        long err;
1615        unsigned long __user *nm = NULL;
1616        unsigned long nr_bits, alloc_size;
1617        DECLARE_BITMAP(bm, MAX_NUMNODES);
1618
1619        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1620        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1621
1622        if (nmask)
1623                nm = compat_alloc_user_space(alloc_size);
1624
1625        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1626
1627        if (!err && nmask) {
1628                unsigned long copy_size;
1629                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1630                err = copy_from_user(bm, nm, copy_size);
1631                /* ensure entire bitmap is zeroed */
1632                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1633                err |= compat_put_bitmap(nmask, bm, nr_bits);
1634        }
1635
1636        return err;
1637}
1638
1639COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1640                       compat_ulong_t, maxnode)
1641{
1642        unsigned long __user *nm = NULL;
1643        unsigned long nr_bits, alloc_size;
1644        DECLARE_BITMAP(bm, MAX_NUMNODES);
1645
1646        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1647        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1648
1649        if (nmask) {
1650                if (compat_get_bitmap(bm, nmask, nr_bits))
1651                        return -EFAULT;
1652                nm = compat_alloc_user_space(alloc_size);
1653                if (copy_to_user(nm, bm, alloc_size))
1654                        return -EFAULT;
1655        }
1656
1657        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1658}
1659
1660COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1661                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1662                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1663{
1664        unsigned long __user *nm = NULL;
1665        unsigned long nr_bits, alloc_size;
1666        nodemask_t bm;
1667
1668        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1669        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1670
1671        if (nmask) {
1672                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1673                        return -EFAULT;
1674                nm = compat_alloc_user_space(alloc_size);
1675                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1676                        return -EFAULT;
1677        }
1678
1679        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1680}
1681
1682COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1683                       compat_ulong_t, maxnode,
1684                       const compat_ulong_t __user *, old_nodes,
1685                       const compat_ulong_t __user *, new_nodes)
1686{
1687        unsigned long __user *old = NULL;
1688        unsigned long __user *new = NULL;
1689        nodemask_t tmp_mask;
1690        unsigned long nr_bits;
1691        unsigned long size;
1692
1693        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1694        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1695        if (old_nodes) {
1696                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1697                        return -EFAULT;
1698                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1699                if (new_nodes)
1700                        new = old + size / sizeof(unsigned long);
1701                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1702                        return -EFAULT;
1703        }
1704        if (new_nodes) {
1705                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1706                        return -EFAULT;
1707                if (new == NULL)
1708                        new = compat_alloc_user_space(size);
1709                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1710                        return -EFAULT;
1711        }
1712        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1713}
1714
1715#endif /* CONFIG_COMPAT */
1716
1717struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1718                                                unsigned long addr)
1719{
1720        struct mempolicy *pol = NULL;
1721
1722        if (vma) {
1723                if (vma->vm_ops && vma->vm_ops->get_policy) {
1724                        pol = vma->vm_ops->get_policy(vma, addr);
1725                } else if (vma->vm_policy) {
1726                        pol = vma->vm_policy;
1727
1728                        /*
1729                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1730                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1731                         * count on these policies which will be dropped by
1732                         * mpol_cond_put() later
1733                         */
1734                        if (mpol_needs_cond_ref(pol))
1735                                mpol_get(pol);
1736                }
1737        }
1738
1739        return pol;
1740}
1741
1742/*
1743 * get_vma_policy(@vma, @addr)
1744 * @vma: virtual memory area whose policy is sought
1745 * @addr: address in @vma for shared policy lookup
1746 *
1747 * Returns effective policy for a VMA at specified address.
1748 * Falls back to current->mempolicy or system default policy, as necessary.
1749 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1750 * count--added by the get_policy() vm_op, as appropriate--to protect against
1751 * freeing by another task.  It is the caller's responsibility to free the
1752 * extra reference for shared policies.
1753 */
1754static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1755                                                unsigned long addr)
1756{
1757        struct mempolicy *pol = __get_vma_policy(vma, addr);
1758
1759        if (!pol)
1760                pol = get_task_policy(current);
1761
1762        return pol;
1763}
1764
1765bool vma_policy_mof(struct vm_area_struct *vma)
1766{
1767        struct mempolicy *pol;
1768
1769        if (vma->vm_ops && vma->vm_ops->get_policy) {
1770                bool ret = false;
1771
1772                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1773                if (pol && (pol->flags & MPOL_F_MOF))
1774                        ret = true;
1775                mpol_cond_put(pol);
1776
1777                return ret;
1778        }
1779
1780        pol = vma->vm_policy;
1781        if (!pol)
1782                pol = get_task_policy(current);
1783
1784        return pol->flags & MPOL_F_MOF;
1785}
1786
1787static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1788{
1789        enum zone_type dynamic_policy_zone = policy_zone;
1790
1791        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1792
1793        /*
1794         * if policy->v.nodes has movable memory only,
1795         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1796         *
1797         * policy->v.nodes is intersect with node_states[N_MEMORY].
1798         * so if the following test faile, it implies
1799         * policy->v.nodes has movable memory only.
1800         */
1801        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1802                dynamic_policy_zone = ZONE_MOVABLE;
1803
1804        return zone >= dynamic_policy_zone;
1805}
1806
1807/*
1808 * Return a nodemask representing a mempolicy for filtering nodes for
1809 * page allocation
1810 */
1811static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1812{
1813        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1814        if (unlikely(policy->mode == MPOL_BIND) &&
1815                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1816                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1817                return &policy->v.nodes;
1818
1819        return NULL;
1820}
1821
1822/* Return the node id preferred by the given mempolicy, or the given id */
1823static int policy_node(gfp_t gfp, struct mempolicy *policy,
1824                                                                int nd)
1825{
1826        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1827                nd = policy->v.preferred_node;
1828        else {
1829                /*
1830                 * __GFP_THISNODE shouldn't even be used with the bind policy
1831                 * because we might easily break the expectation to stay on the
1832                 * requested node and not break the policy.
1833                 */
1834                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1835        }
1836
1837        return nd;
1838}
1839
1840/* Do dynamic interleaving for a process */
1841static unsigned interleave_nodes(struct mempolicy *policy)
1842{
1843        unsigned next;
1844        struct task_struct *me = current;
1845
1846        next = next_node_in(me->il_prev, policy->v.nodes);
1847        if (next < MAX_NUMNODES)
1848                me->il_prev = next;
1849        return next;
1850}
1851
1852/*
1853 * Depending on the memory policy provide a node from which to allocate the
1854 * next slab entry.
1855 */
1856unsigned int mempolicy_slab_node(void)
1857{
1858        struct mempolicy *policy;
1859        int node = numa_mem_id();
1860
1861        if (in_interrupt())
1862                return node;
1863
1864        policy = current->mempolicy;
1865        if (!policy || policy->flags & MPOL_F_LOCAL)
1866                return node;
1867
1868        switch (policy->mode) {
1869        case MPOL_PREFERRED:
1870                /*
1871                 * handled MPOL_F_LOCAL above
1872                 */
1873                return policy->v.preferred_node;
1874
1875        case MPOL_INTERLEAVE:
1876                return interleave_nodes(policy);
1877
1878        case MPOL_BIND: {
1879                struct zoneref *z;
1880
1881                /*
1882                 * Follow bind policy behavior and start allocation at the
1883                 * first node.
1884                 */
1885                struct zonelist *zonelist;
1886                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1887                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1888                z = first_zones_zonelist(zonelist, highest_zoneidx,
1889                                                        &policy->v.nodes);
1890                return z->zone ? zone_to_nid(z->zone) : node;
1891        }
1892
1893        default:
1894                BUG();
1895        }
1896}
1897
1898/*
1899 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1900 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1901 * number of present nodes.
1902 */
1903static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1904{
1905        unsigned nnodes = nodes_weight(pol->v.nodes);
1906        unsigned target;
1907        int i;
1908        int nid;
1909
1910        if (!nnodes)
1911                return numa_node_id();
1912        target = (unsigned int)n % nnodes;
1913        nid = first_node(pol->v.nodes);
1914        for (i = 0; i < target; i++)
1915                nid = next_node(nid, pol->v.nodes);
1916        return nid;
1917}
1918
1919/* Determine a node number for interleave */
1920static inline unsigned interleave_nid(struct mempolicy *pol,
1921                 struct vm_area_struct *vma, unsigned long addr, int shift)
1922{
1923        if (vma) {
1924                unsigned long off;
1925
1926                /*
1927                 * for small pages, there is no difference between
1928                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1929                 * for huge pages, since vm_pgoff is in units of small
1930                 * pages, we need to shift off the always 0 bits to get
1931                 * a useful offset.
1932                 */
1933                BUG_ON(shift < PAGE_SHIFT);
1934                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1935                off += (addr - vma->vm_start) >> shift;
1936                return offset_il_node(pol, off);
1937        } else
1938                return interleave_nodes(pol);
1939}
1940
1941#ifdef CONFIG_HUGETLBFS
1942/*
1943 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1944 * @vma: virtual memory area whose policy is sought
1945 * @addr: address in @vma for shared policy lookup and interleave policy
1946 * @gfp_flags: for requested zone
1947 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1948 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1949 *
1950 * Returns a nid suitable for a huge page allocation and a pointer
1951 * to the struct mempolicy for conditional unref after allocation.
1952 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1953 * @nodemask for filtering the zonelist.
1954 *
1955 * Must be protected by read_mems_allowed_begin()
1956 */
1957int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1958                                struct mempolicy **mpol, nodemask_t **nodemask)
1959{
1960        int nid;
1961
1962        *mpol = get_vma_policy(vma, addr);
1963        *nodemask = NULL;       /* assume !MPOL_BIND */
1964
1965        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1966                nid = interleave_nid(*mpol, vma, addr,
1967                                        huge_page_shift(hstate_vma(vma)));
1968        } else {
1969                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1970                if ((*mpol)->mode == MPOL_BIND)
1971                        *nodemask = &(*mpol)->v.nodes;
1972        }
1973        return nid;
1974}
1975
1976/*
1977 * init_nodemask_of_mempolicy
1978 *
1979 * If the current task's mempolicy is "default" [NULL], return 'false'
1980 * to indicate default policy.  Otherwise, extract the policy nodemask
1981 * for 'bind' or 'interleave' policy into the argument nodemask, or
1982 * initialize the argument nodemask to contain the single node for
1983 * 'preferred' or 'local' policy and return 'true' to indicate presence
1984 * of non-default mempolicy.
1985 *
1986 * We don't bother with reference counting the mempolicy [mpol_get/put]
1987 * because the current task is examining it's own mempolicy and a task's
1988 * mempolicy is only ever changed by the task itself.
1989 *
1990 * N.B., it is the caller's responsibility to free a returned nodemask.
1991 */
1992bool init_nodemask_of_mempolicy(nodemask_t *mask)
1993{
1994        struct mempolicy *mempolicy;
1995        int nid;
1996
1997        if (!(mask && current->mempolicy))
1998                return false;
1999
2000        task_lock(current);
2001        mempolicy = current->mempolicy;
2002        switch (mempolicy->mode) {
2003        case MPOL_PREFERRED:
2004                if (mempolicy->flags & MPOL_F_LOCAL)
2005                        nid = numa_node_id();
2006                else
2007                        nid = mempolicy->v.preferred_node;
2008                init_nodemask_of_node(mask, nid);
2009                break;
2010
2011        case MPOL_BIND:
2012                /* Fall through */
2013        case MPOL_INTERLEAVE:
2014                *mask =  mempolicy->v.nodes;
2015                break;
2016
2017        default:
2018                BUG();
2019        }
2020        task_unlock(current);
2021
2022        return true;
2023}
2024#endif
2025
2026/*
2027 * mempolicy_nodemask_intersects
2028 *
2029 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2030 * policy.  Otherwise, check for intersection between mask and the policy
2031 * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
2032 * policy, always return true since it may allocate elsewhere on fallback.
2033 *
2034 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2035 */
2036bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2037                                        const nodemask_t *mask)
2038{
2039        struct mempolicy *mempolicy;
2040        bool ret = true;
2041
2042        if (!mask)
2043                return ret;
2044        task_lock(tsk);
2045        mempolicy = tsk->mempolicy;
2046        if (!mempolicy)
2047                goto out;
2048
2049        switch (mempolicy->mode) {
2050        case MPOL_PREFERRED:
2051                /*
2052                 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2053                 * allocate from, they may fallback to other nodes when oom.
2054                 * Thus, it's possible for tsk to have allocated memory from
2055                 * nodes in mask.
2056                 */
2057                break;
2058        case MPOL_BIND:
2059        case MPOL_INTERLEAVE:
2060                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2061                break;
2062        default:
2063                BUG();
2064        }
2065out:
2066        task_unlock(tsk);
2067        return ret;
2068}
2069
2070/* Allocate a page in interleaved policy.
2071   Own path because it needs to do special accounting. */
2072static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2073                                        unsigned nid)
2074{
2075        struct page *page;
2076
2077        page = __alloc_pages(gfp, order, nid);
2078        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2079        if (!static_branch_likely(&vm_numa_stat_key))
2080                return page;
2081        if (page && page_to_nid(page) == nid) {
2082                preempt_disable();
2083                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2084                preempt_enable();
2085        }
2086        return page;
2087}
2088
2089/**
2090 *      alloc_pages_vma - Allocate a page for a VMA.
2091 *
2092 *      @gfp:
2093 *      %GFP_USER    user allocation.
2094 *      %GFP_KERNEL  kernel allocations,
2095 *      %GFP_HIGHMEM highmem/user allocations,
2096 *      %GFP_FS      allocation should not call back into a file system.
2097 *      %GFP_ATOMIC  don't sleep.
2098 *
2099 *      @order:Order of the GFP allocation.
2100 *      @vma:  Pointer to VMA or NULL if not available.
2101 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2102 *      @node: Which node to prefer for allocation (modulo policy).
2103 *      @hugepage: for hugepages try only the preferred node if possible
2104 *
2105 *      This function allocates a page from the kernel page pool and applies
2106 *      a NUMA policy associated with the VMA or the current process.
2107 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2108 *      mm_struct of the VMA to prevent it from going away. Should be used for
2109 *      all allocations for pages that will be mapped into user space. Returns
2110 *      NULL when no page can be allocated.
2111 */
2112struct page *
2113alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2114                unsigned long addr, int node, bool hugepage)
2115{
2116        struct mempolicy *pol;
2117        struct page *page;
2118        int preferred_nid;
2119        nodemask_t *nmask;
2120
2121        pol = get_vma_policy(vma, addr);
2122
2123        if (pol->mode == MPOL_INTERLEAVE) {
2124                unsigned nid;
2125
2126                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2127                mpol_cond_put(pol);
2128                page = alloc_page_interleave(gfp, order, nid);
2129                goto out;
2130        }
2131
2132        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2133                int hpage_node = node;
2134
2135                /*
2136                 * For hugepage allocation and non-interleave policy which
2137                 * allows the current node (or other explicitly preferred
2138                 * node) we only try to allocate from the current/preferred
2139                 * node and don't fall back to other nodes, as the cost of
2140                 * remote accesses would likely offset THP benefits.
2141                 *
2142                 * If the policy is interleave, or does not allow the current
2143                 * node in its nodemask, we allocate the standard way.
2144                 */
2145                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2146                        hpage_node = pol->v.preferred_node;
2147
2148                nmask = policy_nodemask(gfp, pol);
2149                if (!nmask || node_isset(hpage_node, *nmask)) {
2150                        mpol_cond_put(pol);
2151                        /*
2152                         * First, try to allocate THP only on local node, but
2153                         * don't reclaim unnecessarily, just compact.
2154                         */
2155                        page = __alloc_pages_node(hpage_node,
2156                                gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2157
2158                        /*
2159                         * If hugepage allocations are configured to always
2160                         * synchronous compact or the vma has been madvised
2161                         * to prefer hugepage backing, retry allowing remote
2162                         * memory with both reclaim and compact as well.
2163                         */
2164                        if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2165                                page = __alloc_pages_node(hpage_node,
2166                                                                gfp, order);
2167
2168                        goto out;
2169                }
2170        }
2171
2172        nmask = policy_nodemask(gfp, pol);
2173        preferred_nid = policy_node(gfp, pol, node);
2174        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2175        mpol_cond_put(pol);
2176out:
2177        return page;
2178}
2179EXPORT_SYMBOL(alloc_pages_vma);
2180
2181/**
2182 *      alloc_pages_current - Allocate pages.
2183 *
2184 *      @gfp:
2185 *              %GFP_USER   user allocation,
2186 *              %GFP_KERNEL kernel allocation,
2187 *              %GFP_HIGHMEM highmem allocation,
2188 *              %GFP_FS     don't call back into a file system.
2189 *              %GFP_ATOMIC don't sleep.
2190 *      @order: Power of two of allocation size in pages. 0 is a single page.
2191 *
2192 *      Allocate a page from the kernel page pool.  When not in
2193 *      interrupt context and apply the current process NUMA policy.
2194 *      Returns NULL when no page can be allocated.
2195 */
2196struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2197{
2198        struct mempolicy *pol = &default_policy;
2199        struct page *page;
2200
2201        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2202                pol = get_task_policy(current);
2203
2204        /*
2205         * No reference counting needed for current->mempolicy
2206         * nor system default_policy
2207         */
2208        if (pol->mode == MPOL_INTERLEAVE)
2209                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2210        else
2211                page = __alloc_pages_nodemask(gfp, order,
2212                                policy_node(gfp, pol, numa_node_id()),
2213                                policy_nodemask(gfp, pol));
2214
2215        return page;
2216}
2217EXPORT_SYMBOL(alloc_pages_current);
2218
2219int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2220{
2221        struct mempolicy *pol = mpol_dup(vma_policy(src));
2222
2223        if (IS_ERR(pol))
2224                return PTR_ERR(pol);
2225        dst->vm_policy = pol;
2226        return 0;
2227}
2228
2229/*
2230 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2231 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2232 * with the mems_allowed returned by cpuset_mems_allowed().  This
2233 * keeps mempolicies cpuset relative after its cpuset moves.  See
2234 * further kernel/cpuset.c update_nodemask().
2235 *
2236 * current's mempolicy may be rebinded by the other task(the task that changes
2237 * cpuset's mems), so we needn't do rebind work for current task.
2238 */
2239
2240/* Slow path of a mempolicy duplicate */
2241struct mempolicy *__mpol_dup(struct mempolicy *old)
2242{
2243        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2244
2245        if (!new)
2246                return ERR_PTR(-ENOMEM);
2247
2248        /* task's mempolicy is protected by alloc_lock */
2249        if (old == current->mempolicy) {
2250                task_lock(current);
2251                *new = *old;
2252                task_unlock(current);
2253        } else
2254                *new = *old;
2255
2256        if (current_cpuset_is_being_rebound()) {
2257                nodemask_t mems = cpuset_mems_allowed(current);
2258                mpol_rebind_policy(new, &mems);
2259        }
2260        atomic_set(&new->refcnt, 1);
2261        return new;
2262}
2263
2264/* Slow path of a mempolicy comparison */
2265bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2266{
2267        if (!a || !b)
2268                return false;
2269        if (a->mode != b->mode)
2270                return false;
2271        if (a->flags != b->flags)
2272                return false;
2273        if (mpol_store_user_nodemask(a))
2274                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2275                        return false;
2276
2277        switch (a->mode) {
2278        case MPOL_BIND:
2279                /* Fall through */
2280        case MPOL_INTERLEAVE:
2281                return !!nodes_equal(a->v.nodes, b->v.nodes);
2282        case MPOL_PREFERRED:
2283                /* a's ->flags is the same as b's */
2284                if (a->flags & MPOL_F_LOCAL)
2285                        return true;
2286                return a->v.preferred_node == b->v.preferred_node;
2287        default:
2288                BUG();
2289                return false;
2290        }
2291}
2292
2293/*
2294 * Shared memory backing store policy support.
2295 *
2296 * Remember policies even when nobody has shared memory mapped.
2297 * The policies are kept in Red-Black tree linked from the inode.
2298 * They are protected by the sp->lock rwlock, which should be held
2299 * for any accesses to the tree.
2300 */
2301
2302/*
2303 * lookup first element intersecting start-end.  Caller holds sp->lock for
2304 * reading or for writing
2305 */
2306static struct sp_node *
2307sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2308{
2309        struct rb_node *n = sp->root.rb_node;
2310
2311        while (n) {
2312                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2313
2314                if (start >= p->end)
2315                        n = n->rb_right;
2316                else if (end <= p->start)
2317                        n = n->rb_left;
2318                else
2319                        break;
2320        }
2321        if (!n)
2322                return NULL;
2323        for (;;) {
2324                struct sp_node *w = NULL;
2325                struct rb_node *prev = rb_prev(n);
2326                if (!prev)
2327                        break;
2328                w = rb_entry(prev, struct sp_node, nd);
2329                if (w->end <= start)
2330                        break;
2331                n = prev;
2332        }
2333        return rb_entry(n, struct sp_node, nd);
2334}
2335
2336/*
2337 * Insert a new shared policy into the list.  Caller holds sp->lock for
2338 * writing.
2339 */
2340static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2341{
2342        struct rb_node **p = &sp->root.rb_node;
2343        struct rb_node *parent = NULL;
2344        struct sp_node *nd;
2345
2346        while (*p) {
2347                parent = *p;
2348                nd = rb_entry(parent, struct sp_node, nd);
2349                if (new->start < nd->start)
2350                        p = &(*p)->rb_left;
2351                else if (new->end > nd->end)
2352                        p = &(*p)->rb_right;
2353                else
2354                        BUG();
2355        }
2356        rb_link_node(&new->nd, parent, p);
2357        rb_insert_color(&new->nd, &sp->root);
2358        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2359                 new->policy ? new->policy->mode : 0);
2360}
2361
2362/* Find shared policy intersecting idx */
2363struct mempolicy *
2364mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2365{
2366        struct mempolicy *pol = NULL;
2367        struct sp_node *sn;
2368
2369        if (!sp->root.rb_node)
2370                return NULL;
2371        read_lock(&sp->lock);
2372        sn = sp_lookup(sp, idx, idx+1);
2373        if (sn) {
2374                mpol_get(sn->policy);
2375                pol = sn->policy;
2376        }
2377        read_unlock(&sp->lock);
2378        return pol;
2379}
2380
2381static void sp_free(struct sp_node *n)
2382{
2383        mpol_put(n->policy);
2384        kmem_cache_free(sn_cache, n);
2385}
2386
2387/**
2388 * mpol_misplaced - check whether current page node is valid in policy
2389 *
2390 * @page: page to be checked
2391 * @vma: vm area where page mapped
2392 * @addr: virtual address where page mapped
2393 *
2394 * Lookup current policy node id for vma,addr and "compare to" page's
2395 * node id.
2396 *
2397 * Returns:
2398 *      -1      - not misplaced, page is in the right node
2399 *      node    - node id where the page should be
2400 *
2401 * Policy determination "mimics" alloc_page_vma().
2402 * Called from fault path where we know the vma and faulting address.
2403 */
2404int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2405{
2406        struct mempolicy *pol;
2407        struct zoneref *z;
2408        int curnid = page_to_nid(page);
2409        unsigned long pgoff;
2410        int thiscpu = raw_smp_processor_id();
2411        int thisnid = cpu_to_node(thiscpu);
2412        int polnid = NUMA_NO_NODE;
2413        int ret = -1;
2414
2415        pol = get_vma_policy(vma, addr);
2416        if (!(pol->flags & MPOL_F_MOF))
2417                goto out;
2418
2419        switch (pol->mode) {
2420        case MPOL_INTERLEAVE:
2421                pgoff = vma->vm_pgoff;
2422                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2423                polnid = offset_il_node(pol, pgoff);
2424                break;
2425
2426        case MPOL_PREFERRED:
2427                if (pol->flags & MPOL_F_LOCAL)
2428                        polnid = numa_node_id();
2429                else
2430                        polnid = pol->v.preferred_node;
2431                break;
2432
2433        case MPOL_BIND:
2434
2435                /*
2436                 * allows binding to multiple nodes.
2437                 * use current page if in policy nodemask,
2438                 * else select nearest allowed node, if any.
2439                 * If no allowed nodes, use current [!misplaced].
2440                 */
2441                if (node_isset(curnid, pol->v.nodes))
2442                        goto out;
2443                z = first_zones_zonelist(
2444                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2445                                gfp_zone(GFP_HIGHUSER),
2446                                &pol->v.nodes);
2447                polnid = zone_to_nid(z->zone);
2448                break;
2449
2450        default:
2451                BUG();
2452        }
2453
2454        /* Migrate the page towards the node whose CPU is referencing it */
2455        if (pol->flags & MPOL_F_MORON) {
2456                polnid = thisnid;
2457
2458                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2459                        goto out;
2460        }
2461
2462        if (curnid != polnid)
2463                ret = polnid;
2464out:
2465        mpol_cond_put(pol);
2466
2467        return ret;
2468}
2469
2470/*
2471 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2472 * dropped after task->mempolicy is set to NULL so that any allocation done as
2473 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2474 * policy.
2475 */
2476void mpol_put_task_policy(struct task_struct *task)
2477{
2478        struct mempolicy *pol;
2479
2480        task_lock(task);
2481        pol = task->mempolicy;
2482        task->mempolicy = NULL;
2483        task_unlock(task);
2484        mpol_put(pol);
2485}
2486
2487static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2488{
2489        pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2490        rb_erase(&n->nd, &sp->root);
2491        sp_free(n);
2492}
2493
2494static void sp_node_init(struct sp_node *node, unsigned long start,
2495                        unsigned long end, struct mempolicy *pol)
2496{
2497        node->start = start;
2498        node->end = end;
2499        node->policy = pol;
2500}
2501
2502static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2503                                struct mempolicy *pol)
2504{
2505        struct sp_node *n;
2506        struct mempolicy *newpol;
2507
2508        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2509        if (!n)
2510                return NULL;
2511
2512        newpol = mpol_dup(pol);
2513        if (IS_ERR(newpol)) {
2514                kmem_cache_free(sn_cache, n);
2515                return NULL;
2516        }
2517        newpol->flags |= MPOL_F_SHARED;
2518        sp_node_init(n, start, end, newpol);
2519
2520        return n;
2521}
2522
2523/* Replace a policy range. */
2524static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2525                                 unsigned long end, struct sp_node *new)
2526{
2527        struct sp_node *n;
2528        struct sp_node *n_new = NULL;
2529        struct mempolicy *mpol_new = NULL;
2530        int ret = 0;
2531
2532restart:
2533        write_lock(&sp->lock);
2534        n = sp_lookup(sp, start, end);
2535        /* Take care of old policies in the same range. */
2536        while (n && n->start < end) {
2537                struct rb_node *next = rb_next(&n->nd);
2538                if (n->start >= start) {
2539                        if (n->end <= end)
2540                                sp_delete(sp, n);
2541                        else
2542                                n->start = end;
2543                } else {
2544                        /* Old policy spanning whole new range. */
2545                        if (n->end > end) {
2546                                if (!n_new)
2547                                        goto alloc_new;
2548
2549                                *mpol_new = *n->policy;
2550                                atomic_set(&mpol_new->refcnt, 1);
2551                                sp_node_init(n_new, end, n->end, mpol_new);
2552                                n->end = start;
2553                                sp_insert(sp, n_new);
2554                                n_new = NULL;
2555                                mpol_new = NULL;
2556                                break;
2557                        } else
2558                                n->end = start;
2559                }
2560                if (!next)
2561                        break;
2562                n = rb_entry(next, struct sp_node, nd);
2563        }
2564        if (new)
2565                sp_insert(sp, new);
2566        write_unlock(&sp->lock);
2567        ret = 0;
2568
2569err_out:
2570        if (mpol_new)
2571                mpol_put(mpol_new);
2572        if (n_new)
2573                kmem_cache_free(sn_cache, n_new);
2574
2575        return ret;
2576
2577alloc_new:
2578        write_unlock(&sp->lock);
2579        ret = -ENOMEM;
2580        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2581        if (!n_new)
2582                goto err_out;
2583        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2584        if (!mpol_new)
2585                goto err_out;
2586        goto restart;
2587}
2588
2589/**
2590 * mpol_shared_policy_init - initialize shared policy for inode
2591 * @sp: pointer to inode shared policy
2592 * @mpol:  struct mempolicy to install
2593 *
2594 * Install non-NULL @mpol in inode's shared policy rb-tree.
2595 * On entry, the current task has a reference on a non-NULL @mpol.
2596 * This must be released on exit.
2597 * This is called at get_inode() calls and we can use GFP_KERNEL.
2598 */
2599void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2600{
2601        int ret;
2602
2603        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2604        rwlock_init(&sp->lock);
2605
2606        if (mpol) {
2607                struct vm_area_struct pvma;
2608                struct mempolicy *new;
2609                NODEMASK_SCRATCH(scratch);
2610
2611                if (!scratch)
2612                        goto put_mpol;
2613                /* contextualize the tmpfs mount point mempolicy */
2614                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2615                if (IS_ERR(new))
2616                        goto free_scratch; /* no valid nodemask intersection */
2617
2618                task_lock(current);
2619                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2620                task_unlock(current);
2621                if (ret)
2622                        goto put_new;
2623
2624                /* Create pseudo-vma that contains just the policy */
2625                vma_init(&pvma, NULL);
2626                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2627                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2628
2629put_new:
2630                mpol_put(new);                  /* drop initial ref */
2631free_scratch:
2632                NODEMASK_SCRATCH_FREE(scratch);
2633put_mpol:
2634                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2635        }
2636}
2637
2638int mpol_set_shared_policy(struct shared_policy *info,
2639                        struct vm_area_struct *vma, struct mempolicy *npol)
2640{
2641        int err;
2642        struct sp_node *new = NULL;
2643        unsigned long sz = vma_pages(vma);
2644
2645        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2646                 vma->vm_pgoff,
2647                 sz, npol ? npol->mode : -1,
2648                 npol ? npol->flags : -1,
2649                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2650
2651        if (npol) {
2652                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2653                if (!new)
2654                        return -ENOMEM;
2655        }
2656        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2657        if (err && new)
2658                sp_free(new);
2659        return err;
2660}
2661
2662/* Free a backing policy store on inode delete. */
2663void mpol_free_shared_policy(struct shared_policy *p)
2664{
2665        struct sp_node *n;
2666        struct rb_node *next;
2667
2668        if (!p->root.rb_node)
2669                return;
2670        write_lock(&p->lock);
2671        next = rb_first(&p->root);
2672        while (next) {
2673                n = rb_entry(next, struct sp_node, nd);
2674                next = rb_next(&n->nd);
2675                sp_delete(p, n);
2676        }
2677        write_unlock(&p->lock);
2678}
2679
2680#ifdef CONFIG_NUMA_BALANCING
2681static int __initdata numabalancing_override;
2682
2683static void __init check_numabalancing_enable(void)
2684{
2685        bool numabalancing_default = false;
2686
2687        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2688                numabalancing_default = true;
2689
2690        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2691        if (numabalancing_override)
2692                set_numabalancing_state(numabalancing_override == 1);
2693
2694        if (num_online_nodes() > 1 && !numabalancing_override) {
2695                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2696                        numabalancing_default ? "Enabling" : "Disabling");
2697                set_numabalancing_state(numabalancing_default);
2698        }
2699}
2700
2701static int __init setup_numabalancing(char *str)
2702{
2703        int ret = 0;
2704        if (!str)
2705                goto out;
2706
2707        if (!strcmp(str, "enable")) {
2708                numabalancing_override = 1;
2709                ret = 1;
2710        } else if (!strcmp(str, "disable")) {
2711                numabalancing_override = -1;
2712                ret = 1;
2713        }
2714out:
2715        if (!ret)
2716                pr_warn("Unable to parse numa_balancing=\n");
2717
2718        return ret;
2719}
2720__setup("numa_balancing=", setup_numabalancing);
2721#else
2722static inline void __init check_numabalancing_enable(void)
2723{
2724}
2725#endif /* CONFIG_NUMA_BALANCING */
2726
2727/* assumes fs == KERNEL_DS */
2728void __init numa_policy_init(void)
2729{
2730        nodemask_t interleave_nodes;
2731        unsigned long largest = 0;
2732        int nid, prefer = 0;
2733
2734        policy_cache = kmem_cache_create("numa_policy",
2735                                         sizeof(struct mempolicy),
2736                                         0, SLAB_PANIC, NULL);
2737
2738        sn_cache = kmem_cache_create("shared_policy_node",
2739                                     sizeof(struct sp_node),
2740                                     0, SLAB_PANIC, NULL);
2741
2742        for_each_node(nid) {
2743                preferred_node_policy[nid] = (struct mempolicy) {
2744                        .refcnt = ATOMIC_INIT(1),
2745                        .mode = MPOL_PREFERRED,
2746                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2747                        .v = { .preferred_node = nid, },
2748                };
2749        }
2750
2751        /*
2752         * Set interleaving policy for system init. Interleaving is only
2753         * enabled across suitably sized nodes (default is >= 16MB), or
2754         * fall back to the largest node if they're all smaller.
2755         */
2756        nodes_clear(interleave_nodes);
2757        for_each_node_state(nid, N_MEMORY) {
2758                unsigned long total_pages = node_present_pages(nid);
2759
2760                /* Preserve the largest node */
2761                if (largest < total_pages) {
2762                        largest = total_pages;
2763                        prefer = nid;
2764                }
2765
2766                /* Interleave this node? */
2767                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2768                        node_set(nid, interleave_nodes);
2769        }
2770
2771        /* All too small, use the largest */
2772        if (unlikely(nodes_empty(interleave_nodes)))
2773                node_set(prefer, interleave_nodes);
2774
2775        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2776                pr_err("%s: interleaving failed\n", __func__);
2777
2778        check_numabalancing_enable();
2779}
2780
2781/* Reset policy of current process to default */
2782void numa_default_policy(void)
2783{
2784        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2785}
2786
2787/*
2788 * Parse and format mempolicy from/to strings
2789 */
2790
2791/*
2792 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2793 */
2794static const char * const policy_modes[] =
2795{
2796        [MPOL_DEFAULT]    = "default",
2797        [MPOL_PREFERRED]  = "prefer",
2798        [MPOL_BIND]       = "bind",
2799        [MPOL_INTERLEAVE] = "interleave",
2800        [MPOL_LOCAL]      = "local",
2801};
2802
2803
2804#ifdef CONFIG_TMPFS
2805/**
2806 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2807 * @str:  string containing mempolicy to parse
2808 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2809 *
2810 * Format of input:
2811 *      <mode>[=<flags>][:<nodelist>]
2812 *
2813 * On success, returns 0, else 1
2814 */
2815int mpol_parse_str(char *str, struct mempolicy **mpol)
2816{
2817        struct mempolicy *new = NULL;
2818        unsigned short mode_flags;
2819        nodemask_t nodes;
2820        char *nodelist = strchr(str, ':');
2821        char *flags = strchr(str, '=');
2822        int err = 1, mode;
2823
2824        if (nodelist) {
2825                /* NUL-terminate mode or flags string */
2826                *nodelist++ = '\0';
2827                if (nodelist_parse(nodelist, nodes))
2828                        goto out;
2829                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2830                        goto out;
2831        } else
2832                nodes_clear(nodes);
2833
2834        if (flags)
2835                *flags++ = '\0';        /* terminate mode string */
2836
2837        mode = match_string(policy_modes, MPOL_MAX, str);
2838        if (mode < 0)
2839                goto out;
2840
2841        switch (mode) {
2842        case MPOL_PREFERRED:
2843                /*
2844                 * Insist on a nodelist of one node only
2845                 */
2846                if (nodelist) {
2847                        char *rest = nodelist;
2848                        while (isdigit(*rest))
2849                                rest++;
2850                        if (*rest)
2851                                goto out;
2852                }
2853                break;
2854        case MPOL_INTERLEAVE:
2855                /*
2856                 * Default to online nodes with memory if no nodelist
2857                 */
2858                if (!nodelist)
2859                        nodes = node_states[N_MEMORY];
2860                break;
2861        case MPOL_LOCAL:
2862                /*
2863                 * Don't allow a nodelist;  mpol_new() checks flags
2864                 */
2865                if (nodelist)
2866                        goto out;
2867                mode = MPOL_PREFERRED;
2868                break;
2869        case MPOL_DEFAULT:
2870                /*
2871                 * Insist on a empty nodelist
2872                 */
2873                if (!nodelist)
2874                        err = 0;
2875                goto out;
2876        case MPOL_BIND:
2877                /*
2878                 * Insist on a nodelist
2879                 */
2880                if (!nodelist)
2881                        goto out;
2882        }
2883
2884        mode_flags = 0;
2885        if (flags) {
2886                /*
2887                 * Currently, we only support two mutually exclusive
2888                 * mode flags.
2889                 */
2890                if (!strcmp(flags, "static"))
2891                        mode_flags |= MPOL_F_STATIC_NODES;
2892                else if (!strcmp(flags, "relative"))
2893                        mode_flags |= MPOL_F_RELATIVE_NODES;
2894                else
2895                        goto out;
2896        }
2897
2898        new = mpol_new(mode, mode_flags, &nodes);
2899        if (IS_ERR(new))
2900                goto out;
2901
2902        /*
2903         * Save nodes for mpol_to_str() to show the tmpfs mount options
2904         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2905         */
2906        if (mode != MPOL_PREFERRED)
2907                new->v.nodes = nodes;
2908        else if (nodelist)
2909                new->v.preferred_node = first_node(nodes);
2910        else
2911                new->flags |= MPOL_F_LOCAL;
2912
2913        /*
2914         * Save nodes for contextualization: this will be used to "clone"
2915         * the mempolicy in a specific context [cpuset] at a later time.
2916         */
2917        new->w.user_nodemask = nodes;
2918
2919        err = 0;
2920
2921out:
2922        /* Restore string for error message */
2923        if (nodelist)
2924                *--nodelist = ':';
2925        if (flags)
2926                *--flags = '=';
2927        if (!err)
2928                *mpol = new;
2929        return err;
2930}
2931#endif /* CONFIG_TMPFS */
2932
2933/**
2934 * mpol_to_str - format a mempolicy structure for printing
2935 * @buffer:  to contain formatted mempolicy string
2936 * @maxlen:  length of @buffer
2937 * @pol:  pointer to mempolicy to be formatted
2938 *
2939 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2940 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2941 * longest flag, "relative", and to display at least a few node ids.
2942 */
2943void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2944{
2945        char *p = buffer;
2946        nodemask_t nodes = NODE_MASK_NONE;
2947        unsigned short mode = MPOL_DEFAULT;
2948        unsigned short flags = 0;
2949
2950        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2951                mode = pol->mode;
2952                flags = pol->flags;
2953        }
2954
2955        switch (mode) {
2956        case MPOL_DEFAULT:
2957                break;
2958        case MPOL_PREFERRED:
2959                if (flags & MPOL_F_LOCAL)
2960                        mode = MPOL_LOCAL;
2961                else
2962                        node_set(pol->v.preferred_node, nodes);
2963                break;
2964        case MPOL_BIND:
2965        case MPOL_INTERLEAVE:
2966                nodes = pol->v.nodes;
2967                break;
2968        default:
2969                WARN_ON_ONCE(1);
2970                snprintf(p, maxlen, "unknown");
2971                return;
2972        }
2973
2974        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2975
2976        if (flags & MPOL_MODE_FLAGS) {
2977                p += snprintf(p, buffer + maxlen - p, "=");
2978
2979                /*
2980                 * Currently, the only defined flags are mutually exclusive
2981                 */
2982                if (flags & MPOL_F_STATIC_NODES)
2983                        p += snprintf(p, buffer + maxlen - p, "static");
2984                else if (flags & MPOL_F_RELATIVE_NODES)
2985                        p += snprintf(p, buffer + maxlen - p, "relative");
2986        }
2987
2988        if (!nodes_empty(nodes))
2989                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2990                               nodemask_pr_args(&nodes));
2991}
2992