linux/mm/mempolicy.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
   28 * preferred      Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
  55
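/*
 * Illustrative userspace sketch (not part of this file) of how the policies
 * described above are requested through the syscalls implemented below.
 * It assumes the set_mempolicy()/mbind() wrappers from libnuma's <numaif.h>;
 * the node numbers are arbitrary examples.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0   = 1UL << 0;
 *
 *	// Process policy: interleave new allocations across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, sizeof(nodes01) * 8);
 *
 *	// VMA policy: bind one anonymous mapping to node 0 only.
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);
 */
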
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/mm.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
  109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
  115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132        struct mempolicy *pol = p->mempolicy;
 133        int node;
 134
 135        if (pol)
 136                return pol;
 137
 138        node = numa_node_id();
 139        if (node != NUMA_NO_NODE) {
 140                pol = &preferred_node_policy[node];
 141                /* preferred_node_policy is not initialised early in boot */
 142                if (pol->mode)
 143                        return pol;
 144        }
 145
 146        return &default_policy;
 147}
 148
 149static const struct mempolicy_operations {
 150        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 151        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 152} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156        return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
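/*
 * Worked example for mpol_relative_nodemask() below (my reading of the
 * nodes_fold()/nodes_onto() helpers from nodemask.h, used for
 * MPOL_F_RELATIVE_NODES): with *rel = {4,5,6,7} (the allowed nodes,
 * weight 4) and *orig = {0,2} (the user's relative mask), nodes_fold()
 * wraps *orig modulo 4, leaving {0,2}, and nodes_onto() maps bit i onto
 * the i-th set bit of *rel, so *ret = {4,6}, i.e. "the 0th and 2nd
 * allowed nodes".
 */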
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                   const nodemask_t *rel)
 161{
 162        nodemask_t tmp;
 163        nodes_fold(tmp, *orig, nodes_weight(*rel));
 164        nodes_onto(*ret, tmp, *rel);
 165}
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169        if (nodes_empty(*nodes))
 170                return -EINVAL;
 171        pol->v.nodes = *nodes;
 172        return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!nodes)
 178                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179        else if (nodes_empty(*nodes))
 180                return -EINVAL;                 /*  no allowed nodes */
 181        else
 182                pol->v.preferred_node = first_node(*nodes);
 183        return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188        if (nodes_empty(*nodes))
 189                return -EINVAL;
 190        pol->v.nodes = *nodes;
 191        return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
  201 * and mempolicy.  May also be called holding the mmap_sem for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206        int ret;
 207
 208        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209        if (pol == NULL)
 210                return 0;
 211        /* Check N_MEMORY */
 212        nodes_and(nsc->mask1,
 213                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215        VM_BUG_ON(!nodes);
 216        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                nodes = NULL;   /* explicit local allocation */
 218        else {
 219                if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                else
 222                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                if (mpol_store_user_nodemask(pol))
 225                        pol->w.user_nodemask = *nodes;
 226                else
 227                        pol->w.cpuset_mems_allowed =
 228                                                cpuset_current_mems_allowed;
 229        }
 230
 231        if (nodes)
 232                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233        else
 234                ret = mpol_ops[pol->mode].create(pol, NULL);
 235        return ret;
 236}
 237
 238/*
  239 * This function just creates a new policy, does some checks and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                  nodemask_t *nodes)
 244{
 245        struct mempolicy *policy;
 246
 247        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250        if (mode == MPOL_DEFAULT) {
 251                if (nodes && !nodes_empty(*nodes))
 252                        return ERR_PTR(-EINVAL);
 253                return NULL;
 254        }
 255        VM_BUG_ON(!nodes);
 256
 257        /*
 258         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260         * All other modes require a valid pointer to a non-empty nodemask.
 261         */
 262        if (mode == MPOL_PREFERRED) {
 263                if (nodes_empty(*nodes)) {
 264                        if (((flags & MPOL_F_STATIC_NODES) ||
 265                             (flags & MPOL_F_RELATIVE_NODES)))
 266                                return ERR_PTR(-EINVAL);
 267                }
 268        } else if (mode == MPOL_LOCAL) {
 269                if (!nodes_empty(*nodes) ||
 270                    (flags & MPOL_F_STATIC_NODES) ||
 271                    (flags & MPOL_F_RELATIVE_NODES))
 272                        return ERR_PTR(-EINVAL);
 273                mode = MPOL_PREFERRED;
 274        } else if (nodes_empty(*nodes))
 275                return ERR_PTR(-EINVAL);
 276        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277        if (!policy)
 278                return ERR_PTR(-ENOMEM);
 279        atomic_set(&policy->refcnt, 1);
 280        policy->mode = mode;
 281        policy->flags = flags;
 282
 283        return policy;
 284}
 285
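/*
 * Sketch of the intended construction sequence for a policy, mirroring
 * do_set_mempolicy() and do_mbind() below (error handling elided):
 *
 *	NODEMASK_SCRATCH(scratch);
 *	struct mempolicy *new = mpol_new(mode, flags, nodes);
 *	task_lock(current);			/* protects mems_allowed */
 *	err = mpol_set_nodemask(new, nodes, scratch);
 *	task_unlock(current);
 *	...
 *	mpol_put(new);				/* drop the reference when done */
 */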
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289        if (!atomic_dec_and_test(&p->refcnt))
 290                return;
 291        kmem_cache_free(policy_cache, p);
 292}
 293
 294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 295{
 296}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300        nodemask_t tmp;
 301
 302        if (pol->flags & MPOL_F_STATIC_NODES)
 303                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306        else {
  307                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 308                                                                *nodes);
 309                pol->w.cpuset_mems_allowed = *nodes;
 310        }
 311
 312        if (nodes_empty(tmp))
 313                tmp = *nodes;
 314
 315        pol->v.nodes = tmp;
 316}
 317
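/*
 * Example of the rebind behaviours above, assuming an MPOL_INTERLEAVE
 * policy created over nodes {0,1} while the cpuset's mems change from
 * {0,1} to {2,3}:
 *   - MPOL_F_STATIC_NODES:   user_nodemask & new mems is empty, so tmp
 *                            falls back to the new mems {2,3};
 *   - MPOL_F_RELATIVE_NODES: the user mask is refolded onto the new mems,
 *                            giving {2,3};
 *   - neither flag:          nodes_remap() translates the current nodes
 *                            from the old mems to the new ones, also {2,3}.
 */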
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                const nodemask_t *nodes)
 320{
 321        nodemask_t tmp;
 322
 323        if (pol->flags & MPOL_F_STATIC_NODES) {
 324                int node = first_node(pol->w.user_nodemask);
 325
 326                if (node_isset(node, *nodes)) {
 327                        pol->v.preferred_node = node;
 328                        pol->flags &= ~MPOL_F_LOCAL;
 329                } else
 330                        pol->flags |= MPOL_F_LOCAL;
 331        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                pol->v.preferred_node = first_node(tmp);
 334        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                   pol->w.cpuset_mems_allowed,
 337                                                   *nodes);
 338                pol->w.cpuset_mems_allowed = *nodes;
 339        }
 340}
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351        if (!pol)
 352                return;
 353        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 354            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                return;
 356
 357        mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires task
 362 * pointer, and updates task mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369        mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380        struct vm_area_struct *vma;
 381
 382        down_write(&mm->mmap_sem);
 383        for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                mpol_rebind_policy(vma->vm_policy, new);
 385        up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389        [MPOL_DEFAULT] = {
 390                .rebind = mpol_rebind_default,
 391        },
 392        [MPOL_INTERLEAVE] = {
 393                .create = mpol_new_interleave,
 394                .rebind = mpol_rebind_nodemask,
 395        },
 396        [MPOL_PREFERRED] = {
 397                .create = mpol_new_preferred,
 398                .rebind = mpol_rebind_preferred,
 399        },
 400        [MPOL_BIND] = {
 401                .create = mpol_new_bind,
 402                .rebind = mpol_rebind_nodemask,
 403        },
 404};
 405
 406static void migrate_page_add(struct page *page, struct list_head *pagelist,
 407                                unsigned long flags);
 408
 409struct queue_pages {
 410        struct list_head *pagelist;
 411        unsigned long flags;
 412        nodemask_t *nmask;
 413        struct vm_area_struct *prev;
 414};
 415
 416/*
 417 * Check if the page's nid is in qp->nmask.
 418 *
 419 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 420 * in the invert of qp->nmask.
 421 */
 422static inline bool queue_pages_required(struct page *page,
 423                                        struct queue_pages *qp)
 424{
 425        int nid = page_to_nid(page);
 426        unsigned long flags = qp->flags;
 427
 428        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429}
 430
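/*
 * Note on MPOL_MF_INVERT (see the two callers further down): do_mbind()
 * passes MPOL_MF_INVERT, so the pages queued for migration are the ones
 * that are *not* already on an allowed node, while migrate_to_node()
 * passes a single-node mask without MPOL_MF_INVERT to collect exactly the
 * pages sitting on the source node.
 */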
 431/*
 432 * queue_pages_pmd() has three possible return values:
 433 * 1 - pages are placed on the right node or queued successfully.
 434 * 0 - THP was split.
  435 * -EIO - the pmd is a migration entry, or MPOL_MF_STRICT was specified and
  436 *        an existing page was already on a node that does not follow the policy.
 437 */
 438static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 439                                unsigned long end, struct mm_walk *walk)
 440{
 441        int ret = 0;
 442        struct page *page;
 443        struct queue_pages *qp = walk->private;
 444        unsigned long flags;
 445
 446        if (unlikely(is_pmd_migration_entry(*pmd))) {
 447                ret = -EIO;
 448                goto unlock;
 449        }
 450        page = pmd_page(*pmd);
 451        if (is_huge_zero_page(page)) {
 452                spin_unlock(ptl);
 453                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 454                goto out;
 455        }
 456        if (!queue_pages_required(page, qp)) {
 457                ret = 1;
 458                goto unlock;
 459        }
 460
 461        ret = 1;
 462        flags = qp->flags;
 463        /* go to thp migration */
 464        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 465                if (!vma_migratable(walk->vma)) {
 466                        ret = -EIO;
 467                        goto unlock;
 468                }
 469
 470                migrate_page_add(page, qp->pagelist, flags);
 471        } else
 472                ret = -EIO;
 473unlock:
 474        spin_unlock(ptl);
 475out:
 476        return ret;
 477}
 478
 479/*
 480 * Scan through pages checking if pages follow certain conditions,
 481 * and move them to the pagelist if they do.
 482 */
 483static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 484                        unsigned long end, struct mm_walk *walk)
 485{
 486        struct vm_area_struct *vma = walk->vma;
 487        struct page *page;
 488        struct queue_pages *qp = walk->private;
 489        unsigned long flags = qp->flags;
 490        int ret;
 491        pte_t *pte;
 492        spinlock_t *ptl;
 493
 494        ptl = pmd_trans_huge_lock(pmd, vma);
 495        if (ptl) {
 496                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 497                if (ret > 0)
 498                        return 0;
 499                else if (ret < 0)
 500                        return ret;
 501        }
 502
 503        if (pmd_trans_unstable(pmd))
 504                return 0;
 505
 506        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 507        for (; addr != end; pte++, addr += PAGE_SIZE) {
 508                if (!pte_present(*pte))
 509                        continue;
 510                page = vm_normal_page(vma, addr, *pte);
 511                if (!page)
 512                        continue;
 513                /*
 514                 * vm_normal_page() filters out zero pages, but there might
 515                 * still be PageReserved pages to skip, perhaps in a VDSO.
 516                 */
 517                if (PageReserved(page))
 518                        continue;
 519                if (!queue_pages_required(page, qp))
 520                        continue;
 521                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 522                        if (!vma_migratable(vma))
 523                                break;
 524                        migrate_page_add(page, qp->pagelist, flags);
 525                } else
 526                        break;
 527        }
 528        pte_unmap_unlock(pte - 1, ptl);
 529        cond_resched();
 530        return addr != end ? -EIO : 0;
 531}
 532
 533static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 534                               unsigned long addr, unsigned long end,
 535                               struct mm_walk *walk)
 536{
 537#ifdef CONFIG_HUGETLB_PAGE
 538        struct queue_pages *qp = walk->private;
 539        unsigned long flags = qp->flags;
 540        struct page *page;
 541        spinlock_t *ptl;
 542        pte_t entry;
 543
 544        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 545        entry = huge_ptep_get(pte);
 546        if (!pte_present(entry))
 547                goto unlock;
 548        page = pte_page(entry);
 549        if (!queue_pages_required(page, qp))
 550                goto unlock;
 551        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 552        if (flags & (MPOL_MF_MOVE_ALL) ||
 553            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 554                isolate_huge_page(page, qp->pagelist);
 555unlock:
 556        spin_unlock(ptl);
 557#else
 558        BUG();
 559#endif
 560        return 0;
 561}
 562
 563#ifdef CONFIG_NUMA_BALANCING
 564/*
 565 * This is used to mark a range of virtual addresses to be inaccessible.
 566 * These are later cleared by a NUMA hinting fault. Depending on these
 567 * faults, pages may be migrated for better NUMA placement.
 568 *
 569 * This is assuming that NUMA faults are handled using PROT_NONE. If
 570 * an architecture makes a different choice, it will need further
 571 * changes to the core.
 572 */
 573unsigned long change_prot_numa(struct vm_area_struct *vma,
 574                        unsigned long addr, unsigned long end)
 575{
 576        int nr_updated;
 577
 578        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 579        if (nr_updated)
 580                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 581
 582        return nr_updated;
 583}
 584#else
 585static unsigned long change_prot_numa(struct vm_area_struct *vma,
 586                        unsigned long addr, unsigned long end)
 587{
 588        return 0;
 589}
 590#endif /* CONFIG_NUMA_BALANCING */
 591
 592static int queue_pages_test_walk(unsigned long start, unsigned long end,
 593                                struct mm_walk *walk)
 594{
 595        struct vm_area_struct *vma = walk->vma;
 596        struct queue_pages *qp = walk->private;
 597        unsigned long endvma = vma->vm_end;
 598        unsigned long flags = qp->flags;
 599
 600        /*
  601         * We need to check MPOL_MF_STRICT so that -EIO can be returned when
  602         * required, regardless of vma_migratable().
 603         */
 604        if (!vma_migratable(vma) &&
 605            !(flags & MPOL_MF_STRICT))
 606                return 1;
 607
 608        if (endvma > end)
 609                endvma = end;
 610        if (vma->vm_start > start)
 611                start = vma->vm_start;
 612
 613        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 614                if (!vma->vm_next && vma->vm_end < end)
 615                        return -EFAULT;
 616                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 617                        return -EFAULT;
 618        }
 619
 620        qp->prev = vma;
 621
 622        if (flags & MPOL_MF_LAZY) {
 623                /* Similar to task_numa_work, skip inaccessible VMAs */
 624                if (!is_vm_hugetlb_page(vma) &&
 625                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 626                        !(vma->vm_flags & VM_MIXEDMAP))
 627                        change_prot_numa(vma, start, endvma);
 628                return 1;
 629        }
 630
 631        /* queue pages from current vma */
 632        if (flags & MPOL_MF_VALID)
 633                return 0;
 634        return 1;
 635}
 636
 637/*
 638 * Walk through page tables and collect pages to be migrated.
 639 *
 640 * If pages found in a given range are on a set of nodes (determined by
  641 * @nodes and @flags), they are isolated and queued to the pagelist
  642 * passed via @pagelist.
 643 */
 644static int
 645queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 646                nodemask_t *nodes, unsigned long flags,
 647                struct list_head *pagelist)
 648{
 649        struct queue_pages qp = {
 650                .pagelist = pagelist,
 651                .flags = flags,
 652                .nmask = nodes,
 653                .prev = NULL,
 654        };
 655        struct mm_walk queue_pages_walk = {
 656                .hugetlb_entry = queue_pages_hugetlb,
 657                .pmd_entry = queue_pages_pte_range,
 658                .test_walk = queue_pages_test_walk,
 659                .mm = mm,
 660                .private = &qp,
 661        };
 662
 663        return walk_page_range(start, end, &queue_pages_walk);
 664}
 665
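/*
 * The two callers in this file show the expected usage (copied from
 * do_mbind() and migrate_to_node() below):
 *
 *	// do_mbind(): queue pages in [start, end) that violate the new policy
 *	queue_pages_range(mm, start, end, nmask,
 *			  flags | MPOL_MF_INVERT, &pagelist);
 *
 *	// migrate_to_node(): queue every page on the source node in this mm
 *	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 *			  flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 */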
 666/*
 667 * Apply policy to a single VMA
 668 * This must be called with the mmap_sem held for writing.
 669 */
 670static int vma_replace_policy(struct vm_area_struct *vma,
 671                                                struct mempolicy *pol)
 672{
 673        int err;
 674        struct mempolicy *old;
 675        struct mempolicy *new;
 676
 677        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 678                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 679                 vma->vm_ops, vma->vm_file,
 680                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 681
 682        new = mpol_dup(pol);
 683        if (IS_ERR(new))
 684                return PTR_ERR(new);
 685
 686        if (vma->vm_ops && vma->vm_ops->set_policy) {
 687                err = vma->vm_ops->set_policy(vma, new);
 688                if (err)
 689                        goto err_out;
 690        }
 691
 692        old = vma->vm_policy;
 693        vma->vm_policy = new; /* protected by mmap_sem */
 694        mpol_put(old);
 695
 696        return 0;
 697 err_out:
 698        mpol_put(new);
 699        return err;
 700}
 701
 702/* Step 2: apply policy to a range and do splits. */
 703static int mbind_range(struct mm_struct *mm, unsigned long start,
 704                       unsigned long end, struct mempolicy *new_pol)
 705{
 706        struct vm_area_struct *next;
 707        struct vm_area_struct *prev;
 708        struct vm_area_struct *vma;
 709        int err = 0;
 710        pgoff_t pgoff;
 711        unsigned long vmstart;
 712        unsigned long vmend;
 713
 714        vma = find_vma(mm, start);
 715        if (!vma || vma->vm_start > start)
 716                return -EFAULT;
 717
 718        prev = vma->vm_prev;
 719        if (start > vma->vm_start)
 720                prev = vma;
 721
 722        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 723                next = vma->vm_next;
 724                vmstart = max(start, vma->vm_start);
 725                vmend   = min(end, vma->vm_end);
 726
 727                if (mpol_equal(vma_policy(vma), new_pol))
 728                        continue;
 729
 730                pgoff = vma->vm_pgoff +
 731                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 732                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 733                                 vma->anon_vma, vma->vm_file, pgoff,
 734                                 new_pol, vma->vm_userfaultfd_ctx);
 735                if (prev) {
 736                        vma = prev;
 737                        next = vma->vm_next;
 738                        if (mpol_equal(vma_policy(vma), new_pol))
 739                                continue;
 740                        /* vma_merge() joined vma && vma->next, case 8 */
 741                        goto replace;
 742                }
 743                if (vma->vm_start != vmstart) {
 744                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 745                        if (err)
 746                                goto out;
 747                }
 748                if (vma->vm_end != vmend) {
 749                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 750                        if (err)
 751                                goto out;
 752                }
 753 replace:
 754                err = vma_replace_policy(vma, new_pol);
 755                if (err)
 756                        goto out;
 757        }
 758
 759 out:
 760        return err;
 761}
 762
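/*
 * Example of the splitting done above: when the new policy covers only the
 * middle of an existing VMA and vma_merge() finds nothing to merge with,
 * split_vma() runs twice (at vmstart, then at vmend) and
 * vma_replace_policy() is applied to the middle piece only:
 *
 *	old:  |---------------- vma ----------------|
 *	new:  |- as before -|- new_pol -|- as before -|
 */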
 763/* Set the process memory policy */
 764static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 765                             nodemask_t *nodes)
 766{
 767        struct mempolicy *new, *old;
 768        NODEMASK_SCRATCH(scratch);
 769        int ret;
 770
 771        if (!scratch)
 772                return -ENOMEM;
 773
 774        new = mpol_new(mode, flags, nodes);
 775        if (IS_ERR(new)) {
 776                ret = PTR_ERR(new);
 777                goto out;
 778        }
 779
 780        task_lock(current);
 781        ret = mpol_set_nodemask(new, nodes, scratch);
 782        if (ret) {
 783                task_unlock(current);
 784                mpol_put(new);
 785                goto out;
 786        }
 787        old = current->mempolicy;
 788        current->mempolicy = new;
 789        if (new && new->mode == MPOL_INTERLEAVE)
 790                current->il_prev = MAX_NUMNODES-1;
 791        task_unlock(current);
 792        mpol_put(old);
 793        ret = 0;
 794out:
 795        NODEMASK_SCRATCH_FREE(scratch);
 796        return ret;
 797}
 798
 799/*
 800 * Return nodemask for policy for get_mempolicy() query
 801 *
 802 * Called with task's alloc_lock held
 803 */
 804static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 805{
 806        nodes_clear(*nodes);
 807        if (p == &default_policy)
 808                return;
 809
 810        switch (p->mode) {
 811        case MPOL_BIND:
 812                /* Fall through */
 813        case MPOL_INTERLEAVE:
 814                *nodes = p->v.nodes;
 815                break;
 816        case MPOL_PREFERRED:
 817                if (!(p->flags & MPOL_F_LOCAL))
 818                        node_set(p->v.preferred_node, *nodes);
 819                /* else return empty node mask for local allocation */
 820                break;
 821        default:
 822                BUG();
 823        }
 824}
 825
 826static int lookup_node(struct mm_struct *mm, unsigned long addr)
 827{
 828        struct page *p;
 829        int err;
 830
 831        int locked = 1;
 832        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 833        if (err >= 0) {
 834                err = page_to_nid(p);
 835                put_page(p);
 836        }
 837        if (locked)
 838                up_read(&mm->mmap_sem);
 839        return err;
 840}
 841
 842/* Retrieve NUMA policy */
 843static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 844                             unsigned long addr, unsigned long flags)
 845{
 846        int err;
 847        struct mm_struct *mm = current->mm;
 848        struct vm_area_struct *vma = NULL;
 849        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 850
 851        if (flags &
 852                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 853                return -EINVAL;
 854
 855        if (flags & MPOL_F_MEMS_ALLOWED) {
 856                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 857                        return -EINVAL;
 858                *policy = 0;    /* just so it's initialized */
 859                task_lock(current);
 860                *nmask  = cpuset_current_mems_allowed;
 861                task_unlock(current);
 862                return 0;
 863        }
 864
 865        if (flags & MPOL_F_ADDR) {
 866                /*
 867                 * Do NOT fall back to task policy if the
 868                 * vma/shared policy at addr is NULL.  We
 869                 * want to return MPOL_DEFAULT in this case.
 870                 */
 871                down_read(&mm->mmap_sem);
 872                vma = find_vma_intersection(mm, addr, addr+1);
 873                if (!vma) {
 874                        up_read(&mm->mmap_sem);
 875                        return -EFAULT;
 876                }
 877                if (vma->vm_ops && vma->vm_ops->get_policy)
 878                        pol = vma->vm_ops->get_policy(vma, addr);
 879                else
 880                        pol = vma->vm_policy;
 881        } else if (addr)
 882                return -EINVAL;
 883
 884        if (!pol)
 885                pol = &default_policy;  /* indicates default behavior */
 886
 887        if (flags & MPOL_F_NODE) {
 888                if (flags & MPOL_F_ADDR) {
 889                        /*
 890                         * Take a refcount on the mpol, lookup_node()
  891                         * will drop the mmap_sem, so after calling
 892                         * lookup_node() only "pol" remains valid, "vma"
 893                         * is stale.
 894                         */
 895                        pol_refcount = pol;
 896                        vma = NULL;
 897                        mpol_get(pol);
 898                        err = lookup_node(mm, addr);
 899                        if (err < 0)
 900                                goto out;
 901                        *policy = err;
 902                } else if (pol == current->mempolicy &&
 903                                pol->mode == MPOL_INTERLEAVE) {
 904                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 905                } else {
 906                        err = -EINVAL;
 907                        goto out;
 908                }
 909        } else {
 910                *policy = pol == &default_policy ? MPOL_DEFAULT :
 911                                                pol->mode;
 912                /*
 913                 * Internal mempolicy flags must be masked off before exposing
 914                 * the policy to userspace.
 915                 */
 916                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 917        }
 918
 919        err = 0;
 920        if (nmask) {
 921                if (mpol_store_user_nodemask(pol)) {
 922                        *nmask = pol->w.user_nodemask;
 923                } else {
 924                        task_lock(current);
 925                        get_policy_nodemask(pol, nmask);
 926                        task_unlock(current);
 927                }
 928        }
 929
 930 out:
 931        mpol_cond_put(pol);
 932        if (vma)
 933                up_read(&mm->mmap_sem);
 934        if (pol_refcount)
 935                mpol_put(pol_refcount);
 936        return err;
 937}
 938
 939#ifdef CONFIG_MIGRATION
 940/*
 941 * page migration, thp tail pages can be passed.
 942 */
 943static void migrate_page_add(struct page *page, struct list_head *pagelist,
 944                                unsigned long flags)
 945{
 946        struct page *head = compound_head(page);
 947        /*
 948         * Avoid migrating a page that is shared with others.
 949         */
 950        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 951                if (!isolate_lru_page(head)) {
 952                        list_add_tail(&head->lru, pagelist);
 953                        mod_node_page_state(page_pgdat(head),
 954                                NR_ISOLATED_ANON + page_is_file_cache(head),
 955                                hpage_nr_pages(head));
 956                }
 957        }
 958}
 959
 960/* page allocation callback for NUMA node migration */
 961struct page *alloc_new_node_page(struct page *page, unsigned long node)
 962{
 963        if (PageHuge(page))
 964                return alloc_huge_page_node(page_hstate(compound_head(page)),
 965                                        node);
 966        else if (PageTransHuge(page)) {
 967                struct page *thp;
 968
 969                thp = alloc_pages_node(node,
 970                        (GFP_TRANSHUGE | __GFP_THISNODE),
 971                        HPAGE_PMD_ORDER);
 972                if (!thp)
 973                        return NULL;
 974                prep_transhuge_page(thp);
 975                return thp;
 976        } else
 977                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 978                                                    __GFP_THISNODE, 0);
 979}
 980
 981/*
 982 * Migrate pages from one node to a target node.
 983 * Returns error or the number of pages not migrated.
 984 */
 985static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 986                           int flags)
 987{
 988        nodemask_t nmask;
 989        LIST_HEAD(pagelist);
 990        int err = 0;
 991
 992        nodes_clear(nmask);
 993        node_set(source, nmask);
 994
 995        /*
 996         * This does not "check" the range but isolates all pages that
 997         * need migration.  Between passing in the full user address
  998         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
 999         */
1000        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1001        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1002                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1003
1004        if (!list_empty(&pagelist)) {
1005                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1006                                        MIGRATE_SYNC, MR_SYSCALL);
1007                if (err)
1008                        putback_movable_pages(&pagelist);
1009        }
1010
1011        return err;
1012}
1013
1014/*
1015 * Move pages between the two nodesets so as to preserve the physical
1016 * layout as much as possible.
1017 *
 1018 * Returns the number of pages that could not be moved.
1019 */
1020int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1021                     const nodemask_t *to, int flags)
1022{
1023        int busy = 0;
1024        int err;
1025        nodemask_t tmp;
1026
1027        err = migrate_prep();
1028        if (err)
1029                return err;
1030
1031        down_read(&mm->mmap_sem);
1032
1033        /*
1034         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1035         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1036         * bit in 'tmp', and return that <source, dest> pair for migration.
1037         * The pair of nodemasks 'to' and 'from' define the map.
1038         *
1039         * If no pair of bits is found that way, fallback to picking some
1040         * pair of 'source' and 'dest' bits that are not the same.  If the
1041         * 'source' and 'dest' bits are the same, this represents a node
1042         * that will be migrating to itself, so no pages need move.
1043         *
1044         * If no bits are left in 'tmp', or if all remaining bits left
1045         * in 'tmp' correspond to the same bit in 'to', return false
1046         * (nothing left to migrate).
1047         *
1048         * This lets us pick a pair of nodes to migrate between, such that
1049         * if possible the dest node is not already occupied by some other
1050         * source node, minimizing the risk of overloading the memory on a
1051         * node that would happen if we migrated incoming memory to a node
 1052         * before migrating outgoing memory from that same node.
1053         *
1054         * A single scan of tmp is sufficient.  As we go, we remember the
1055         * most recent <s, d> pair that moved (s != d).  If we find a pair
1056         * that not only moved, but what's better, moved to an empty slot
1057         * (d is not set in tmp), then we break out then, with that pair.
 1058         * Otherwise when we finish scanning tmp, we at least have the
1059         * most recent <s, d> pair that moved.  If we get all the way through
1060         * the scan of tmp without finding any node that moved, much less
1061         * moved to an empty node, then there is nothing left worth migrating.
1062         */
1063
1064        tmp = *from;
1065        while (!nodes_empty(tmp)) {
 1066                int s, d;
1067                int source = NUMA_NO_NODE;
1068                int dest = 0;
1069
1070                for_each_node_mask(s, tmp) {
1071
1072                        /*
1073                         * do_migrate_pages() tries to maintain the relative
1074                         * node relationship of the pages established between
1075                         * threads and memory areas.
1076                         *
1077                         * However if the number of source nodes is not equal to
1078                         * the number of destination nodes we can not preserve
1079                         * this node relative relationship.  In that case, skip
1080                         * copying memory from a node that is in the destination
1081                         * mask.
1082                         *
1083                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1084         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1085                         */
1086
1087                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1088                                                (node_isset(s, *to)))
1089                                continue;
1090
1091                        d = node_remap(s, *from, *to);
1092                        if (s == d)
1093                                continue;
1094
1095                        source = s;     /* Node moved. Memorize */
1096                        dest = d;
1097
1098                        /* dest not in remaining from nodes? */
1099                        if (!node_isset(dest, tmp))
1100                                break;
1101                }
1102                if (source == NUMA_NO_NODE)
1103                        break;
1104
1105                node_clear(source, tmp);
1106                err = migrate_to_node(mm, source, dest, flags);
1107                if (err > 0)
1108                        busy += err;
1109                if (err < 0)
1110                        break;
1111        }
1112        up_read(&mm->mmap_sem);
1113        if (err < 0)
1114                return err;
1115        return busy;
1116
1117}
1118
1119/*
1120 * Allocate a new page for page migration based on vma policy.
1121 * Start by assuming the page is mapped by the same vma as contains @start.
1122 * Search forward from there, if not.  N.B., this assumes that the
1123 * list of pages handed to migrate_pages()--which is how we get here--
1124 * is in virtual address order.
1125 */
1126static struct page *new_page(struct page *page, unsigned long start)
1127{
1128        struct vm_area_struct *vma;
1129        unsigned long uninitialized_var(address);
1130
1131        vma = find_vma(current->mm, start);
1132        while (vma) {
1133                address = page_address_in_vma(page, vma);
1134                if (address != -EFAULT)
1135                        break;
1136                vma = vma->vm_next;
1137        }
1138
1139        if (PageHuge(page)) {
1140                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1141                                vma, address);
1142        } else if (PageTransHuge(page)) {
1143                struct page *thp;
1144
1145                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1146                                         HPAGE_PMD_ORDER);
1147                if (!thp)
1148                        return NULL;
1149                prep_transhuge_page(thp);
1150                return thp;
1151        }
1152        /*
1153         * if !vma, alloc_page_vma() will use task or system default policy
1154         */
1155        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1156                        vma, address);
1157}
1158#else
1159
1160static void migrate_page_add(struct page *page, struct list_head *pagelist,
1161                                unsigned long flags)
1162{
1163}
1164
1165int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1166                     const nodemask_t *to, int flags)
1167{
1168        return -ENOSYS;
1169}
1170
1171static struct page *new_page(struct page *page, unsigned long start)
1172{
1173        return NULL;
1174}
1175#endif
1176
1177static long do_mbind(unsigned long start, unsigned long len,
1178                     unsigned short mode, unsigned short mode_flags,
1179                     nodemask_t *nmask, unsigned long flags)
1180{
1181        struct mm_struct *mm = current->mm;
1182        struct mempolicy *new;
1183        unsigned long end;
1184        int err;
1185        LIST_HEAD(pagelist);
1186
1187        if (flags & ~(unsigned long)MPOL_MF_VALID)
1188                return -EINVAL;
1189        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1190                return -EPERM;
1191
1192        if (start & ~PAGE_MASK)
1193                return -EINVAL;
1194
1195        if (mode == MPOL_DEFAULT)
1196                flags &= ~MPOL_MF_STRICT;
1197
1198        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1199        end = start + len;
1200
1201        if (end < start)
1202                return -EINVAL;
1203        if (end == start)
1204                return 0;
1205
1206        new = mpol_new(mode, mode_flags, nmask);
1207        if (IS_ERR(new))
1208                return PTR_ERR(new);
1209
1210        if (flags & MPOL_MF_LAZY)
1211                new->flags |= MPOL_F_MOF;
1212
1213        /*
 1214         * If we are using the default policy then operations
 1215         * on discontinuous address spaces are okay after all
1216         */
1217        if (!new)
1218                flags |= MPOL_MF_DISCONTIG_OK;
1219
1220        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1221                 start, start + len, mode, mode_flags,
1222                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1223
1224        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1225
1226                err = migrate_prep();
1227                if (err)
1228                        goto mpol_out;
1229        }
1230        {
1231                NODEMASK_SCRATCH(scratch);
1232                if (scratch) {
1233                        down_write(&mm->mmap_sem);
1234                        task_lock(current);
1235                        err = mpol_set_nodemask(new, nmask, scratch);
1236                        task_unlock(current);
1237                        if (err)
1238                                up_write(&mm->mmap_sem);
1239                } else
1240                        err = -ENOMEM;
1241                NODEMASK_SCRATCH_FREE(scratch);
1242        }
1243        if (err)
1244                goto mpol_out;
1245
1246        err = queue_pages_range(mm, start, end, nmask,
1247                          flags | MPOL_MF_INVERT, &pagelist);
1248        if (!err)
1249                err = mbind_range(mm, start, end, new);
1250
1251        if (!err) {
1252                int nr_failed = 0;
1253
1254                if (!list_empty(&pagelist)) {
1255                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1256                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1257                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1258                        if (nr_failed)
1259                                putback_movable_pages(&pagelist);
1260                }
1261
1262                if (nr_failed && (flags & MPOL_MF_STRICT))
1263                        err = -EIO;
1264        } else
1265                putback_movable_pages(&pagelist);
1266
1267        up_write(&mm->mmap_sem);
1268 mpol_out:
1269        mpol_put(new);
1270        return err;
1271}
1272
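/*
 * Userspace view of the flag handling in do_mbind() above (illustrative
 * sketch, assuming the mbind() wrapper from <numaif.h>): with MPOL_MF_MOVE
 * the misplaced pages collected by queue_pages_range() are migrated, and
 * adding MPOL_MF_STRICT turns any migration failure into -EIO:
 *
 *	if (mbind(p, len, MPOL_BIND, &nodes, sizeof(nodes) * 8,
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT) < 0)
 *		perror("mbind");
 */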
1273/*
1274 * User space interface with variable sized bitmaps for nodelists.
1275 */
1276
1277/* Copy a node mask from user space. */
1278static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1279                     unsigned long maxnode)
1280{
1281        unsigned long k;
1282        unsigned long t;
1283        unsigned long nlongs;
1284        unsigned long endmask;
1285
1286        --maxnode;
1287        nodes_clear(*nodes);
1288        if (maxnode == 0 || !nmask)
1289                return 0;
1290        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1291                return -EINVAL;
1292
1293        nlongs = BITS_TO_LONGS(maxnode);
1294        if ((maxnode % BITS_PER_LONG) == 0)
1295                endmask = ~0UL;
1296        else
1297                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1298
1299        /*
 1300         * When the user specifies more nodes than supported, just check
 1301         * that the unsupported part is all zero.
 1302         *
 1303         * If maxnode has more longs than MAX_NUMNODES, check the bits in
 1304         * that area first, and then go on to check the remaining bits,
 1305         * which are equal to or bigger than MAX_NUMNODES.
1306         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1307         */
1308        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1309                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1310                        if (get_user(t, nmask + k))
1311                                return -EFAULT;
1312                        if (k == nlongs - 1) {
1313                                if (t & endmask)
1314                                        return -EINVAL;
1315                        } else if (t)
1316                                return -EINVAL;
1317                }
1318                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1319                endmask = ~0UL;
1320        }
1321
1322        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1323                unsigned long valid_mask = endmask;
1324
1325                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1326                if (get_user(t, nmask + nlongs - 1))
1327                        return -EFAULT;
1328                if (t & valid_mask)
1329                        return -EINVAL;
1330        }
1331
1332        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1333                return -EFAULT;
1334        nodes_addr(*nodes)[nlongs-1] &= endmask;
1335        return 0;
1336}
1337
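/*
 * Worked example for get_nodes() above, assuming a 64-bit kernel with
 * MAX_NUMNODES == 64: a caller passing maxnode = 65 describes bits 0..63;
 * after --maxnode we have 64 bits, so nlongs = 1 and endmask = ~0UL, and
 * the single long is copied and masked.  With maxnode = 17 instead,
 * maxnode becomes 16, endmask = (1UL << 16) - 1, and only bits 0..15 of
 * the user mask survive.
 */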
1338/* Copy a kernel node mask to user space */
1339static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1340                              nodemask_t *nodes)
1341{
1342        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1343        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1344
1345        if (copy > nbytes) {
1346                if (copy > PAGE_SIZE)
1347                        return -EINVAL;
1348                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1349                        return -EFAULT;
1350                copy = nbytes;
1351        }
1352        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1353}
1354
1355static long kernel_mbind(unsigned long start, unsigned long len,
1356                         unsigned long mode, const unsigned long __user *nmask,
1357                         unsigned long maxnode, unsigned int flags)
1358{
1359        nodemask_t nodes;
1360        int err;
1361        unsigned short mode_flags;
1362
1363        mode_flags = mode & MPOL_MODE_FLAGS;
1364        mode &= ~MPOL_MODE_FLAGS;
1365        if (mode >= MPOL_MAX)
1366                return -EINVAL;
1367        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1368            (mode_flags & MPOL_F_RELATIVE_NODES))
1369                return -EINVAL;
1370        err = get_nodes(&nodes, nmask, maxnode);
1371        if (err)
1372                return err;
1373        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1374}
1375
1376SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1377                unsigned long, mode, const unsigned long __user *, nmask,
1378                unsigned long, maxnode, unsigned int, flags)
1379{
1380        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1381}
1382
1383/* Set the process memory policy */
1384static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1385                                 unsigned long maxnode)
1386{
1387        int err;
1388        nodemask_t nodes;
1389        unsigned short flags;
1390
1391        flags = mode & MPOL_MODE_FLAGS;
1392        mode &= ~MPOL_MODE_FLAGS;
1393        if ((unsigned int)mode >= MPOL_MAX)
1394                return -EINVAL;
1395        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1396                return -EINVAL;
1397        err = get_nodes(&nodes, nmask, maxnode);
1398        if (err)
1399                return err;
1400        return do_set_mempolicy(mode, flags, &nodes);
1401}
1402
1403SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1404                unsigned long, maxnode)
1405{
1406        return kernel_set_mempolicy(mode, nmask, maxnode);
1407}
1408
1409static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1410                                const unsigned long __user *old_nodes,
1411                                const unsigned long __user *new_nodes)
1412{
1413        struct mm_struct *mm = NULL;
1414        struct task_struct *task;
1415        nodemask_t task_nodes;
1416        int err;
1417        nodemask_t *old;
1418        nodemask_t *new;
1419        NODEMASK_SCRATCH(scratch);
1420
1421        if (!scratch)
1422                return -ENOMEM;
1423
1424        old = &scratch->mask1;
1425        new = &scratch->mask2;
1426
1427        err = get_nodes(old, old_nodes, maxnode);
1428        if (err)
1429                goto out;
1430
1431        err = get_nodes(new, new_nodes, maxnode);
1432        if (err)
1433                goto out;
1434
1435        /* Find the mm_struct */
1436        rcu_read_lock();
1437        task = pid ? find_task_by_vpid(pid) : current;
1438        if (!task) {
1439                rcu_read_unlock();
1440                err = -ESRCH;
1441                goto out;
1442        }
1443        get_task_struct(task);
1444
1445        err = -EINVAL;
1446
1447        /*
1448         * Check if this process has the right to modify the specified process.
1449         * Use the regular "ptrace_may_access()" checks.
1450         */
1451        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1452                rcu_read_unlock();
1453                err = -EPERM;
1454                goto out_put;
1455        }
1456        rcu_read_unlock();
1457
1458        task_nodes = cpuset_mems_allowed(task);
1459        /* Is the user allowed to access the target nodes? */
1460        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1461                err = -EPERM;
1462                goto out_put;
1463        }
1464
1465        task_nodes = cpuset_mems_allowed(current);
1466        nodes_and(*new, *new, task_nodes);
1467        if (nodes_empty(*new))
1468                goto out_put;
1469
1470        nodes_and(*new, *new, node_states[N_MEMORY]);
1471        if (nodes_empty(*new))
1472                goto out_put;
1473
1474        err = security_task_movememory(task);
1475        if (err)
1476                goto out_put;
1477
1478        mm = get_task_mm(task);
1479        put_task_struct(task);
1480
1481        if (!mm) {
1482                err = -EINVAL;
1483                goto out;
1484        }
1485
1486        err = do_migrate_pages(mm, old, new,
1487                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1488
1489        mmput(mm);
1490out:
1491        NODEMASK_SCRATCH_FREE(scratch);
1492
1493        return err;
1494
1495out_put:
1496        put_task_struct(task);
1497        goto out;
1498
1499}
1500
1501SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1502                const unsigned long __user *, old_nodes,
1503                const unsigned long __user *, new_nodes)
1504{
1505        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1506}
1507
1508
1509/* Retrieve NUMA policy */
1510static int kernel_get_mempolicy(int __user *policy,
1511                                unsigned long __user *nmask,
1512                                unsigned long maxnode,
1513                                unsigned long addr,
1514                                unsigned long flags)
1515{
1516        int err;
1517        int uninitialized_var(pval);
1518        nodemask_t nodes;
1519
1520        if (nmask != NULL && maxnode < nr_node_ids)
1521                return -EINVAL;
1522
1523        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1524
1525        if (err)
1526                return err;
1527
1528        if (policy && put_user(pval, policy))
1529                return -EFAULT;
1530
1531        if (nmask)
1532                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1533
1534        return err;
1535}
1536
1537SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1538                unsigned long __user *, nmask, unsigned long, maxnode,
1539                unsigned long, addr, unsigned long, flags)
1540{
1541        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1542}
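/*
 * Illustrative user-space sketch (not part of this file), using the
 * get_mempolicy(2) wrapper from libnuma's <numaif.h> to query the calling
 * thread's policy.  The 1024-bit nodemask buffer is an assumption that
 * comfortably exceeds nr_node_ids on most configurations:
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	unsigned long nodemask[16] = { 0 };
 *	long rc = get_mempolicy(&mode, nodemask, 8 * sizeof(nodemask),
 *				NULL, 0);
 *
 * On success, mode is one of MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND or
 * MPOL_INTERLEAVE, and nodemask holds the policy's nodes for the latter two.
 */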
1543
1544#ifdef CONFIG_COMPAT
1545
1546COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1547                       compat_ulong_t __user *, nmask,
1548                       compat_ulong_t, maxnode,
1549                       compat_ulong_t, addr, compat_ulong_t, flags)
1550{
1551        long err;
1552        unsigned long __user *nm = NULL;
1553        unsigned long nr_bits, alloc_size;
1554        DECLARE_BITMAP(bm, MAX_NUMNODES);
1555
1556        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1557        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1558
1559        if (nmask)
1560                nm = compat_alloc_user_space(alloc_size);
1561
1562        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1563
1564        if (!err && nmask) {
1565                unsigned long copy_size;
1566                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1567                err = copy_from_user(bm, nm, copy_size);
1568                /* ensure entire bitmap is zeroed */
1569                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1570                err |= compat_put_bitmap(nmask, bm, nr_bits);
1571        }
1572
1573        return err;
1574}
1575
1576COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1577                       compat_ulong_t, maxnode)
1578{
1579        unsigned long __user *nm = NULL;
1580        unsigned long nr_bits, alloc_size;
1581        DECLARE_BITMAP(bm, MAX_NUMNODES);
1582
1583        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1584        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1585
1586        if (nmask) {
1587                if (compat_get_bitmap(bm, nmask, nr_bits))
1588                        return -EFAULT;
1589                nm = compat_alloc_user_space(alloc_size);
1590                if (copy_to_user(nm, bm, alloc_size))
1591                        return -EFAULT;
1592        }
1593
1594        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1595}
1596
1597COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1598                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1599                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1600{
1601        unsigned long __user *nm = NULL;
1602        unsigned long nr_bits, alloc_size;
1603        nodemask_t bm;
1604
1605        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1606        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1607
1608        if (nmask) {
1609                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1610                        return -EFAULT;
1611                nm = compat_alloc_user_space(alloc_size);
1612                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1613                        return -EFAULT;
1614        }
1615
1616        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1617}
1618
1619COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1620                       compat_ulong_t, maxnode,
1621                       const compat_ulong_t __user *, old_nodes,
1622                       const compat_ulong_t __user *, new_nodes)
1623{
1624        unsigned long __user *old = NULL;
1625        unsigned long __user *new = NULL;
1626        nodemask_t tmp_mask;
1627        unsigned long nr_bits;
1628        unsigned long size;
1629
1630        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1631        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1632        if (old_nodes) {
1633                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1634                        return -EFAULT;
1635                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1636                if (new_nodes)
1637                        new = old + size / sizeof(unsigned long);
1638                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1639                        return -EFAULT;
1640        }
1641        if (new_nodes) {
1642                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1643                        return -EFAULT;
1644                if (new == NULL)
1645                        new = compat_alloc_user_space(size);
1646                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1647                        return -EFAULT;
1648        }
1649        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1650}
1651
1652#endif /* CONFIG_COMPAT */
1653
1654struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1655                                                unsigned long addr)
1656{
1657        struct mempolicy *pol = NULL;
1658
1659        if (vma) {
1660                if (vma->vm_ops && vma->vm_ops->get_policy) {
1661                        pol = vma->vm_ops->get_policy(vma, addr);
1662                } else if (vma->vm_policy) {
1663                        pol = vma->vm_policy;
1664
1665                        /*
1666                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1667                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1668                         * count on these policies which will be dropped by
1669                         * mpol_cond_put() later
1670                         */
1671                        if (mpol_needs_cond_ref(pol))
1672                                mpol_get(pol);
1673                }
1674        }
1675
1676        return pol;
1677}
1678
1679/*
1680 * get_vma_policy(@vma, @addr)
1681 * @vma: virtual memory area whose policy is sought
1682 * @addr: address in @vma for shared policy lookup
1683 *
1684 * Returns effective policy for a VMA at specified address.
1685 * Falls back to current->mempolicy or system default policy, as necessary.
1686 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1687 * count--added by the get_policy() vm_op, as appropriate--to protect against
1688 * freeing by another task.  It is the caller's responsibility to free the
1689 * extra reference for shared policies.
1690 */
1691static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1692                                                unsigned long addr)
1693{
1694        struct mempolicy *pol = __get_vma_policy(vma, addr);
1695
1696        if (!pol)
1697                pol = get_task_policy(current);
1698
1699        return pol;
1700}
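/*
 * Caller pattern (sketch): callers in this file pair get_vma_policy() with
 * mpol_cond_put(), which drops the extra reference only when the policy is
 * marked MPOL_F_SHARED:
 *
 *	pol = get_vma_policy(vma, addr);
 *	... consult pol->mode, pol->v.nodes, pol->flags ...
 *	mpol_cond_put(pol);
 */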
1701
1702bool vma_policy_mof(struct vm_area_struct *vma)
1703{
1704        struct mempolicy *pol;
1705
1706        if (vma->vm_ops && vma->vm_ops->get_policy) {
1707                bool ret = false;
1708
1709                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1710                if (pol && (pol->flags & MPOL_F_MOF))
1711                        ret = true;
1712                mpol_cond_put(pol);
1713
1714                return ret;
1715        }
1716
1717        pol = vma->vm_policy;
1718        if (!pol)
1719                pol = get_task_policy(current);
1720
1721        return pol->flags & MPOL_F_MOF;
1722}
1723
1724static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1725{
1726        enum zone_type dynamic_policy_zone = policy_zone;
1727
1728        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1729
1730        /*
1731         * if policy->v.nodes has movable memory only,
1732         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1733         *
1734                 * policy->v.nodes is intersected with node_states[N_MEMORY],
1735                 * so if the following test fails, it implies
1736         * policy->v.nodes has movable memory only.
1737         */
1738        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1739                dynamic_policy_zone = ZONE_MOVABLE;
1740
1741        return zone >= dynamic_policy_zone;
1742}
1743
1744/*
1745 * Return a nodemask representing a mempolicy for filtering nodes for
1746 * page allocation
1747 */
1748static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1749{
1750        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1751        if (unlikely(policy->mode == MPOL_BIND) &&
1752                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1753                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1754                return &policy->v.nodes;
1755
1756        return NULL;
1757}
1758
1759/* Return the node id preferred by the given mempolicy, or the given id */
1760static int policy_node(gfp_t gfp, struct mempolicy *policy,
1761                                                                int nd)
1762{
1763        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1764                nd = policy->v.preferred_node;
1765        else {
1766                /*
1767                 * __GFP_THISNODE shouldn't even be used with the bind policy
1768                 * because we might easily break the expectation to stay on the
1769                 * requested node and not break the policy.
1770                 */
1771                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1772        }
1773
1774        return nd;
1775}
1776
1777/* Do dynamic interleaving for a process */
1778static unsigned interleave_nodes(struct mempolicy *policy)
1779{
1780        unsigned next;
1781        struct task_struct *me = current;
1782
1783        next = next_node_in(me->il_prev, policy->v.nodes);
1784        if (next < MAX_NUMNODES)
1785                me->il_prev = next;
1786        return next;
1787}
1788
1789/*
1790 * Depending on the memory policy provide a node from which to allocate the
1791 * next slab entry.
1792 */
1793unsigned int mempolicy_slab_node(void)
1794{
1795        struct mempolicy *policy;
1796        int node = numa_mem_id();
1797
1798        if (in_interrupt())
1799                return node;
1800
1801        policy = current->mempolicy;
1802        if (!policy || policy->flags & MPOL_F_LOCAL)
1803                return node;
1804
1805        switch (policy->mode) {
1806        case MPOL_PREFERRED:
1807                /*
1808                 * handled MPOL_F_LOCAL above
1809                 */
1810                return policy->v.preferred_node;
1811
1812        case MPOL_INTERLEAVE:
1813                return interleave_nodes(policy);
1814
1815        case MPOL_BIND: {
1816                struct zoneref *z;
1817
1818                /*
1819                 * Follow bind policy behavior and start allocation at the
1820                 * first node.
1821                 */
1822                struct zonelist *zonelist;
1823                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1824                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1825                z = first_zones_zonelist(zonelist, highest_zoneidx,
1826                                                        &policy->v.nodes);
1827                return z->zone ? zone_to_nid(z->zone) : node;
1828        }
1829
1830        default:
1831                BUG();
1832        }
1833}
1834
1835/*
1836 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1837 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1838 * number of present nodes.
1839 */
1840static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1841{
1842        unsigned nnodes = nodes_weight(pol->v.nodes);
1843        unsigned target;
1844        int i;
1845        int nid;
1846
1847        if (!nnodes)
1848                return numa_node_id();
1849        target = (unsigned int)n % nnodes;
1850        nid = first_node(pol->v.nodes);
1851        for (i = 0; i < target; i++)
1852                nid = next_node(nid, pol->v.nodes);
1853        return nid;
1854}
1855
1856/* Determine a node number for interleave */
1857static inline unsigned interleave_nid(struct mempolicy *pol,
1858                 struct vm_area_struct *vma, unsigned long addr, int shift)
1859{
1860        if (vma) {
1861                unsigned long off;
1862
1863                /*
1864                 * for small pages, there is no difference between
1865                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1866                 * for huge pages, since vm_pgoff is in units of small
1867                 * pages, we need to shift off the always 0 bits to get
1868                 * a useful offset.
1869                 */
1870                BUG_ON(shift < PAGE_SHIFT);
1871                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1872                off += (addr - vma->vm_start) >> shift;
1873                return offset_il_node(pol, off);
1874        } else
1875                return interleave_nodes(pol);
1876}
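/*
 * Worked example (illustrative numbers): for a hugetlb VMA using 2MB pages
 * on x86-64, shift is 21.  With vm_pgoff == 0 and a fault 6MB into the VMA,
 * off = (6 << 20) >> 21 = 3, so offset_il_node() picks the fourth node of
 * the interleave nodemask (modulo the number of nodes it contains).
 */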
1877
1878#ifdef CONFIG_HUGETLBFS
1879/*
1880 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1881 * @vma: virtual memory area whose policy is sought
1882 * @addr: address in @vma for shared policy lookup and interleave policy
1883 * @gfp_flags: for requested zone
1884 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1885 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1886 *
1887 * Returns a nid suitable for a huge page allocation and a pointer
1888 * to the struct mempolicy for conditional unref after allocation.
1889 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1890 * @nodemask for filtering the zonelist.
1891 *
1892 * Must be protected by read_mems_allowed_begin()
1893 */
1894int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1895                                struct mempolicy **mpol, nodemask_t **nodemask)
1896{
1897        int nid;
1898
1899        *mpol = get_vma_policy(vma, addr);
1900        *nodemask = NULL;       /* assume !MPOL_BIND */
1901
1902        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1903                nid = interleave_nid(*mpol, vma, addr,
1904                                        huge_page_shift(hstate_vma(vma)));
1905        } else {
1906                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1907                if ((*mpol)->mode == MPOL_BIND)
1908                        *nodemask = &(*mpol)->v.nodes;
1909        }
1910        return nid;
1911}
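/*
 * Sketch of the expected hugetlb caller, under read_mems_allowed_begin()
 * protection; mpol_cond_put() releases the conditional reference taken via
 * get_vma_policy() above:
 *
 *	nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
 *	... dequeue/allocate a huge page from nid, filtered by nodemask
 *	    when it is non-NULL ...
 *	mpol_cond_put(mpol);
 */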
1912
1913/*
1914 * init_nodemask_of_mempolicy
1915 *
1916 * If the current task's mempolicy is "default" [NULL], return 'false'
1917 * to indicate default policy.  Otherwise, extract the policy nodemask
1918 * for 'bind' or 'interleave' policy into the argument nodemask, or
1919 * initialize the argument nodemask to contain the single node for
1920 * 'preferred' or 'local' policy and return 'true' to indicate presence
1921 * of non-default mempolicy.
1922 *
1923 * We don't bother with reference counting the mempolicy [mpol_get/put]
1924 * because the current task is examining its own mempolicy and a task's
1925 * mempolicy is only ever changed by the task itself.
1926 *
1927 * N.B., it is the caller's responsibility to free a returned nodemask.
1928 */
1929bool init_nodemask_of_mempolicy(nodemask_t *mask)
1930{
1931        struct mempolicy *mempolicy;
1932        int nid;
1933
1934        if (!(mask && current->mempolicy))
1935                return false;
1936
1937        task_lock(current);
1938        mempolicy = current->mempolicy;
1939        switch (mempolicy->mode) {
1940        case MPOL_PREFERRED:
1941                if (mempolicy->flags & MPOL_F_LOCAL)
1942                        nid = numa_node_id();
1943                else
1944                        nid = mempolicy->v.preferred_node;
1945                init_nodemask_of_node(mask, nid);
1946                break;
1947
1948        case MPOL_BIND:
1949                /* Fall through */
1950        case MPOL_INTERLEAVE:
1951                *mask = mempolicy->v.nodes;
1952                break;
1953
1954        default:
1955                BUG();
1956        }
1957        task_unlock(current);
1958
1959        return true;
1960}
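/*
 * Sketch of a caller (the hugetlb sysctl/sysfs path is one such user),
 * assuming NODEMASK_ALLOC()/NODEMASK_FREE() from <linux/nodemask.h>:
 *
 *	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
 *	if (mask && init_nodemask_of_mempolicy(mask))
 *		... restrict the operation to the nodes in *mask ...
 *	NODEMASK_FREE(mask);
 */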
1961#endif
1962
1963/*
1964 * mempolicy_nodemask_intersects
1965 *
1966 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1967 * policy.  Otherwise, check for intersection between mask and the policy
1968 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1969 * policy, always return true since it may allocate elsewhere on fallback.
1970 *
1971 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1972 */
1973bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1974                                        const nodemask_t *mask)
1975{
1976        struct mempolicy *mempolicy;
1977        bool ret = true;
1978
1979        if (!mask)
1980                return ret;
1981        task_lock(tsk);
1982        mempolicy = tsk->mempolicy;
1983        if (!mempolicy)
1984                goto out;
1985
1986        switch (mempolicy->mode) {
1987        case MPOL_PREFERRED:
1988                /*
1989                 * MPOL_PREFERRED and MPOL_F_LOCAL only specify preferred nodes
1990                 * to allocate from; the task may fall back to other nodes on OOM.
1991                 * Thus, it's possible for tsk to have allocated memory from
1992                 * nodes in mask.
1993                 */
1994                break;
1995        case MPOL_BIND:
1996        case MPOL_INTERLEAVE:
1997                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1998                break;
1999        default:
2000                BUG();
2001        }
2002out:
2003        task_unlock(tsk);
2004        return ret;
2005}
2006
2007/* Allocate a page in interleaved policy.
2008   Own path because it needs to do special accounting. */
2009static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2010                                        unsigned nid)
2011{
2012        struct page *page;
2013
2014        page = __alloc_pages(gfp, order, nid);
2015        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2016        if (!static_branch_likely(&vm_numa_stat_key))
2017                return page;
2018        if (page && page_to_nid(page) == nid) {
2019                preempt_disable();
2020                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2021                preempt_enable();
2022        }
2023        return page;
2024}
2025
2026/**
2027 *      alloc_pages_vma - Allocate a page for a VMA.
2028 *
2029 *      @gfp:
2030 *      %GFP_USER    user allocation.
2031 *      %GFP_KERNEL  kernel allocations,
2032 *      %GFP_HIGHMEM highmem/user allocations,
2033 *      %GFP_FS      allocation should not call back into a file system.
2034 *      %GFP_ATOMIC  don't sleep.
2035 *
2036 *      @order: Order of the GFP allocation.
2037 *      @vma:  Pointer to VMA or NULL if not available.
2038 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2039 *      @node: Which node to prefer for allocation (modulo policy).
2040 *      @hugepage: for hugepages try only the preferred node if possible
2041 *
2042 *      This function allocates a page from the kernel page pool and applies
2043 *      a NUMA policy associated with the VMA or the current process.
2044 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2045 *      mm_struct of the VMA to prevent it from going away. Should be used for
2046 *      all allocations for pages that will be mapped into user space. Returns
2047 *      NULL when no page can be allocated.
2048 */
2049struct page *
2050alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2051                unsigned long addr, int node, bool hugepage)
2052{
2053        struct mempolicy *pol;
2054        struct page *page;
2055        int preferred_nid;
2056        nodemask_t *nmask;
2057
2058        pol = get_vma_policy(vma, addr);
2059
2060        if (pol->mode == MPOL_INTERLEAVE) {
2061                unsigned nid;
2062
2063                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2064                mpol_cond_put(pol);
2065                page = alloc_page_interleave(gfp, order, nid);
2066                goto out;
2067        }
2068
2069        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2070                int hpage_node = node;
2071
2072                /*
2073                 * For hugepage allocation and non-interleave policy which
2074                 * allows the current node (or other explicitly preferred
2075                 * node) we only try to allocate from the current/preferred
2076                 * node and don't fall back to other nodes, as the cost of
2077                 * remote accesses would likely offset THP benefits.
2078                 *
2079                 * If the policy is interleave, or does not allow the current
2080                 * node in its nodemask, we allocate the standard way.
2081                 */
2082                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2083                        hpage_node = pol->v.preferred_node;
2084
2085                nmask = policy_nodemask(gfp, pol);
2086                if (!nmask || node_isset(hpage_node, *nmask)) {
2087                        mpol_cond_put(pol);
2088                        page = __alloc_pages_node(hpage_node,
2089                                                gfp | __GFP_THISNODE, order);
2090                        goto out;
2091                }
2092        }
2093
2094        nmask = policy_nodemask(gfp, pol);
2095        preferred_nid = policy_node(gfp, pol, node);
2096        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2097        mpol_cond_put(pol);
2098out:
2099        return page;
2100}
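/*
 * Sketch of a typical call (the alloc_page_vma() helper in <linux/gfp.h>
 * wraps this form for anonymous faults), with mmap_sem held for read:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			       numa_node_id(), false);
 */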
2101
2102/**
2103 *      alloc_pages_current - Allocate pages.
2104 *
2105 *      @gfp:
2106 *              %GFP_USER   user allocation,
2107 *              %GFP_KERNEL kernel allocation,
2108 *              %GFP_HIGHMEM highmem allocation,
2109 *              %GFP_FS     don't call back into a file system.
2110 *              %GFP_ATOMIC don't sleep.
2111 *      @order: Order of the allocation (log2 of the page count). 0 is a single page.
2112 *
2113 *      Allocate a page from the kernel page pool.  When not in
2114 *      interrupt context, apply the current process' NUMA policy.
2115 *      Returns NULL when no page can be allocated.
2116 */
2117struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2118{
2119        struct mempolicy *pol = &default_policy;
2120        struct page *page;
2121
2122        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2123                pol = get_task_policy(current);
2124
2125        /*
2126         * No reference counting needed for current->mempolicy
2127         * nor system default_policy
2128         */
2129        if (pol->mode == MPOL_INTERLEAVE)
2130                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2131        else
2132                page = __alloc_pages_nodemask(gfp, order,
2133                                policy_node(gfp, pol, numa_node_id()),
2134                                policy_nodemask(gfp, pol));
2135
2136        return page;
2137}
2138EXPORT_SYMBOL(alloc_pages_current);
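/*
 * Note: with CONFIG_NUMA the generic alloc_pages(gfp_mask, order) helper in
 * <linux/gfp.h> resolves to alloc_pages_current(), so an ordinary allocation
 * such as the one below picks up the calling task's policy:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 */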
2139
2140int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2141{
2142        struct mempolicy *pol = mpol_dup(vma_policy(src));
2143
2144        if (IS_ERR(pol))
2145                return PTR_ERR(pol);
2146        dst->vm_policy = pol;
2147        return 0;
2148}
2149
2150/*
2151 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2152 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2153 * with the mems_allowed returned by cpuset_mems_allowed().  This
2154 * keeps mempolicies cpuset relative after its cpuset moves.  See
2155 * further kernel/cpuset.c update_nodemask().
2156 *
2157 * current's mempolicy may be rebound by another task (the task that changes
2158 * the cpuset's mems), so we needn't do rebind work for the current task.
2159 */
2160
2161/* Slow path of a mempolicy duplicate */
2162struct mempolicy *__mpol_dup(struct mempolicy *old)
2163{
2164        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2165
2166        if (!new)
2167                return ERR_PTR(-ENOMEM);
2168
2169        /* task's mempolicy is protected by alloc_lock */
2170        if (old == current->mempolicy) {
2171                task_lock(current);
2172                *new = *old;
2173                task_unlock(current);
2174        } else
2175                *new = *old;
2176
2177        if (current_cpuset_is_being_rebound()) {
2178                nodemask_t mems = cpuset_mems_allowed(current);
2179                mpol_rebind_policy(new, &mems);
2180        }
2181        atomic_set(&new->refcnt, 1);
2182        return new;
2183}
2184
2185/* Slow path of a mempolicy comparison */
2186bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2187{
2188        if (!a || !b)
2189                return false;
2190        if (a->mode != b->mode)
2191                return false;
2192        if (a->flags != b->flags)
2193                return false;
2194        if (mpol_store_user_nodemask(a))
2195                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2196                        return false;
2197
2198        switch (a->mode) {
2199        case MPOL_BIND:
2200                /* Fall through */
2201        case MPOL_INTERLEAVE:
2202                return !!nodes_equal(a->v.nodes, b->v.nodes);
2203        case MPOL_PREFERRED:
2204                /* a's ->flags is the same as b's */
2205                if (a->flags & MPOL_F_LOCAL)
2206                        return true;
2207                return a->v.preferred_node == b->v.preferred_node;
2208        default:
2209                BUG();
2210                return false;
2211        }
2212}
2213
2214/*
2215 * Shared memory backing store policy support.
2216 *
2217 * Remember policies even when nobody has shared memory mapped.
2218 * The policies are kept in Red-Black tree linked from the inode.
2219 * They are protected by the sp->lock rwlock, which should be held
2220 * for any accesses to the tree.
2221 */
2222
2223/*
2224 * lookup first element intersecting start-end.  Caller holds sp->lock for
2225 * reading or for writing
2226 */
2227static struct sp_node *
2228sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2229{
2230        struct rb_node *n = sp->root.rb_node;
2231
2232        while (n) {
2233                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2234
2235                if (start >= p->end)
2236                        n = n->rb_right;
2237                else if (end <= p->start)
2238                        n = n->rb_left;
2239                else
2240                        break;
2241        }
2242        if (!n)
2243                return NULL;
2244        for (;;) {
2245                struct sp_node *w = NULL;
2246                struct rb_node *prev = rb_prev(n);
2247                if (!prev)
2248                        break;
2249                w = rb_entry(prev, struct sp_node, nd);
2250                if (w->end <= start)
2251                        break;
2252                n = prev;
2253        }
2254        return rb_entry(n, struct sp_node, nd);
2255}
2256
2257/*
2258 * Insert a new shared policy into the list.  Caller holds sp->lock for
2259 * writing.
2260 */
2261static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2262{
2263        struct rb_node **p = &sp->root.rb_node;
2264        struct rb_node *parent = NULL;
2265        struct sp_node *nd;
2266
2267        while (*p) {
2268                parent = *p;
2269                nd = rb_entry(parent, struct sp_node, nd);
2270                if (new->start < nd->start)
2271                        p = &(*p)->rb_left;
2272                else if (new->end > nd->end)
2273                        p = &(*p)->rb_right;
2274                else
2275                        BUG();
2276        }
2277        rb_link_node(&new->nd, parent, p);
2278        rb_insert_color(&new->nd, &sp->root);
2279        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2280                 new->policy ? new->policy->mode : 0);
2281}
2282
2283/* Find shared policy intersecting idx */
2284struct mempolicy *
2285mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2286{
2287        struct mempolicy *pol = NULL;
2288        struct sp_node *sn;
2289
2290        if (!sp->root.rb_node)
2291                return NULL;
2292        read_lock(&sp->lock);
2293        sn = sp_lookup(sp, idx, idx+1);
2294        if (sn) {
2295                mpol_get(sn->policy);
2296                pol = sn->policy;
2297        }
2298        read_unlock(&sp->lock);
2299        return pol;
2300}
2301
2302static void sp_free(struct sp_node *n)
2303{
2304        mpol_put(n->policy);
2305        kmem_cache_free(sn_cache, n);
2306}
2307
2308/**
2309 * mpol_misplaced - check whether current page node is valid in policy
2310 *
2311 * @page: page to be checked
2312 * @vma: vm area where page mapped
2313 * @addr: virtual address where page mapped
2314 *
2315 * Look up the current policy node id for vma,addr and compare it to the
2316 * page's node id.
2317 *
2318 * Returns:
2319 *      -1      - not misplaced, page is in the right node
2320 *      node    - node id where the page should be
2321 *
2322 * Policy determination "mimics" alloc_page_vma().
2323 * Called from fault path where we know the vma and faulting address.
2324 */
2325int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2326{
2327        struct mempolicy *pol;
2328        struct zoneref *z;
2329        int curnid = page_to_nid(page);
2330        unsigned long pgoff;
2331        int thiscpu = raw_smp_processor_id();
2332        int thisnid = cpu_to_node(thiscpu);
2333        int polnid = NUMA_NO_NODE;
2334        int ret = -1;
2335
2336        pol = get_vma_policy(vma, addr);
2337        if (!(pol->flags & MPOL_F_MOF))
2338                goto out;
2339
2340        switch (pol->mode) {
2341        case MPOL_INTERLEAVE:
2342                pgoff = vma->vm_pgoff;
2343                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2344                polnid = offset_il_node(pol, pgoff);
2345                break;
2346
2347        case MPOL_PREFERRED:
2348                if (pol->flags & MPOL_F_LOCAL)
2349                        polnid = numa_node_id();
2350                else
2351                        polnid = pol->v.preferred_node;
2352                break;
2353
2354        case MPOL_BIND:
2355
2356                /*
2357                 * allows binding to multiple nodes.
2358                 * use current page if in policy nodemask,
2359                 * else select nearest allowed node, if any.
2360                 * If no allowed nodes, use current [!misplaced].
2361                 */
2362                if (node_isset(curnid, pol->v.nodes))
2363                        goto out;
2364                z = first_zones_zonelist(
2365                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2366                                gfp_zone(GFP_HIGHUSER),
2367                                &pol->v.nodes);
2368                polnid = zone_to_nid(z->zone);
2369                break;
2370
2371        default:
2372                BUG();
2373        }
2374
2375        /* Migrate the page towards the node whose CPU is referencing it */
2376        if (pol->flags & MPOL_F_MORON) {
2377                polnid = thisnid;
2378
2379                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2380                        goto out;
2381        }
2382
2383        if (curnid != polnid)
2384                ret = polnid;
2385out:
2386        mpol_cond_put(pol);
2387
2388        return ret;
2389}
2390
2391/*
2392 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2393 * dropped after task->mempolicy is set to NULL so that any allocation done as
2394 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2395 * policy.
2396 */
2397void mpol_put_task_policy(struct task_struct *task)
2398{
2399        struct mempolicy *pol;
2400
2401        task_lock(task);
2402        pol = task->mempolicy;
2403        task->mempolicy = NULL;
2404        task_unlock(task);
2405        mpol_put(pol);
2406}
2407
2408static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2409{
2410        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2411        rb_erase(&n->nd, &sp->root);
2412        sp_free(n);
2413}
2414
2415static void sp_node_init(struct sp_node *node, unsigned long start,
2416                        unsigned long end, struct mempolicy *pol)
2417{
2418        node->start = start;
2419        node->end = end;
2420        node->policy = pol;
2421}
2422
2423static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2424                                struct mempolicy *pol)
2425{
2426        struct sp_node *n;
2427        struct mempolicy *newpol;
2428
2429        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2430        if (!n)
2431                return NULL;
2432
2433        newpol = mpol_dup(pol);
2434        if (IS_ERR(newpol)) {
2435                kmem_cache_free(sn_cache, n);
2436                return NULL;
2437        }
2438        newpol->flags |= MPOL_F_SHARED;
2439        sp_node_init(n, start, end, newpol);
2440
2441        return n;
2442}
2443
2444/* Replace a policy range. */
2445static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2446                                 unsigned long end, struct sp_node *new)
2447{
2448        struct sp_node *n;
2449        struct sp_node *n_new = NULL;
2450        struct mempolicy *mpol_new = NULL;
2451        int ret = 0;
2452
2453restart:
2454        write_lock(&sp->lock);
2455        n = sp_lookup(sp, start, end);
2456        /* Take care of old policies in the same range. */
2457        while (n && n->start < end) {
2458                struct rb_node *next = rb_next(&n->nd);
2459                if (n->start >= start) {
2460                        if (n->end <= end)
2461                                sp_delete(sp, n);
2462                        else
2463                                n->start = end;
2464                } else {
2465                        /* Old policy spanning whole new range. */
2466                        if (n->end > end) {
2467                                if (!n_new)
2468                                        goto alloc_new;
2469
2470                                *mpol_new = *n->policy;
2471                                atomic_set(&mpol_new->refcnt, 1);
2472                                sp_node_init(n_new, end, n->end, mpol_new);
2473                                n->end = start;
2474                                sp_insert(sp, n_new);
2475                                n_new = NULL;
2476                                mpol_new = NULL;
2477                                break;
2478                        } else
2479                                n->end = start;
2480                }
2481                if (!next)
2482                        break;
2483                n = rb_entry(next, struct sp_node, nd);
2484        }
2485        if (new)
2486                sp_insert(sp, new);
2487        write_unlock(&sp->lock);
2488        ret = 0;
2489
2490err_out:
2491        if (mpol_new)
2492                mpol_put(mpol_new);
2493        if (n_new)
2494                kmem_cache_free(sn_cache, n_new);
2495
2496        return ret;
2497
2498alloc_new:
2499        write_unlock(&sp->lock);
2500        ret = -ENOMEM;
2501        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2502        if (!n_new)
2503                goto err_out;
2504        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2505        if (!mpol_new)
2506                goto err_out;
2507        goto restart;
2508}
2509
2510/**
2511 * mpol_shared_policy_init - initialize shared policy for inode
2512 * @sp: pointer to inode shared policy
2513 * @mpol:  struct mempolicy to install
2514 *
2515 * Install non-NULL @mpol in inode's shared policy rb-tree.
2516 * On entry, the current task has a reference on a non-NULL @mpol.
2517 * This must be released on exit.
2518 * This is called at get_inode() time, so we can use GFP_KERNEL.
2519 */
2520void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2521{
2522        int ret;
2523
2524        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2525        rwlock_init(&sp->lock);
2526
2527        if (mpol) {
2528                struct vm_area_struct pvma;
2529                struct mempolicy *new;
2530                NODEMASK_SCRATCH(scratch);
2531
2532                if (!scratch)
2533                        goto put_mpol;
2534                /* contextualize the tmpfs mount point mempolicy */
2535                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2536                if (IS_ERR(new))
2537                        goto free_scratch; /* no valid nodemask intersection */
2538
2539                task_lock(current);
2540                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2541                task_unlock(current);
2542                if (ret)
2543                        goto put_new;
2544
2545                /* Create pseudo-vma that contains just the policy */
2546                vma_init(&pvma, NULL);
2547                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2548                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2549
2550put_new:
2551                mpol_put(new);                  /* drop initial ref */
2552free_scratch:
2553                NODEMASK_SCRATCH_FREE(scratch);
2554put_mpol:
2555                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2556        }
2557}
2558
2559int mpol_set_shared_policy(struct shared_policy *info,
2560                        struct vm_area_struct *vma, struct mempolicy *npol)
2561{
2562        int err;
2563        struct sp_node *new = NULL;
2564        unsigned long sz = vma_pages(vma);
2565
2566        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2567                 vma->vm_pgoff,
2568                 sz, npol ? npol->mode : -1,
2569                 npol ? npol->flags : -1,
2570                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2571
2572        if (npol) {
2573                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2574                if (!new)
2575                        return -ENOMEM;
2576        }
2577        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2578        if (err && new)
2579                sp_free(new);
2580        return err;
2581}
2582
2583/* Free a backing policy store on inode delete. */
2584void mpol_free_shared_policy(struct shared_policy *p)
2585{
2586        struct sp_node *n;
2587        struct rb_node *next;
2588
2589        if (!p->root.rb_node)
2590                return;
2591        write_lock(&p->lock);
2592        next = rb_first(&p->root);
2593        while (next) {
2594                n = rb_entry(next, struct sp_node, nd);
2595                next = rb_next(&n->nd);
2596                sp_delete(p, n);
2597        }
2598        write_unlock(&p->lock);
2599}
2600
2601#ifdef CONFIG_NUMA_BALANCING
2602static int __initdata numabalancing_override;
2603
2604static void __init check_numabalancing_enable(void)
2605{
2606        bool numabalancing_default = false;
2607
2608        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2609                numabalancing_default = true;
2610
2611        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2612        if (numabalancing_override)
2613                set_numabalancing_state(numabalancing_override == 1);
2614
2615        if (num_online_nodes() > 1 && !numabalancing_override) {
2616                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2617                        numabalancing_default ? "Enabling" : "Disabling");
2618                set_numabalancing_state(numabalancing_default);
2619        }
2620}
2621
2622static int __init setup_numabalancing(char *str)
2623{
2624        int ret = 0;
2625        if (!str)
2626                goto out;
2627
2628        if (!strcmp(str, "enable")) {
2629                numabalancing_override = 1;
2630                ret = 1;
2631        } else if (!strcmp(str, "disable")) {
2632                numabalancing_override = -1;
2633                ret = 1;
2634        }
2635out:
2636        if (!ret)
2637                pr_warn("Unable to parse numa_balancing=\n");
2638
2639        return ret;
2640}
2641__setup("numa_balancing=", setup_numabalancing);
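/*
 * Example (kernel command line): "numa_balancing=disable" forces automatic
 * NUMA balancing off even when CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y,
 * while "numa_balancing=enable" forces it on regardless of the default.
 */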
2642#else
2643static inline void __init check_numabalancing_enable(void)
2644{
2645}
2646#endif /* CONFIG_NUMA_BALANCING */
2647
2648/* assumes fs == KERNEL_DS */
2649void __init numa_policy_init(void)
2650{
2651        nodemask_t interleave_nodes;
2652        unsigned long largest = 0;
2653        int nid, prefer = 0;
2654
2655        policy_cache = kmem_cache_create("numa_policy",
2656                                         sizeof(struct mempolicy),
2657                                         0, SLAB_PANIC, NULL);
2658
2659        sn_cache = kmem_cache_create("shared_policy_node",
2660                                     sizeof(struct sp_node),
2661                                     0, SLAB_PANIC, NULL);
2662
2663        for_each_node(nid) {
2664                preferred_node_policy[nid] = (struct mempolicy) {
2665                        .refcnt = ATOMIC_INIT(1),
2666                        .mode = MPOL_PREFERRED,
2667                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2668                        .v = { .preferred_node = nid, },
2669                };
2670        }
2671
2672        /*
2673         * Set interleaving policy for system init. Interleaving is only
2674         * enabled across suitably sized nodes (default is >= 16MB), falling
2675         * back to the largest node if they're all smaller.
2676         */
2677        nodes_clear(interleave_nodes);
2678        for_each_node_state(nid, N_MEMORY) {
2679                unsigned long total_pages = node_present_pages(nid);
2680
2681                /* Preserve the largest node */
2682                if (largest < total_pages) {
2683                        largest = total_pages;
2684                        prefer = nid;
2685                }
2686
2687                /* Interleave this node? */
2688                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2689                        node_set(nid, interleave_nodes);
2690        }
2691
2692        /* All too small, use the largest */
2693        if (unlikely(nodes_empty(interleave_nodes)))
2694                node_set(prefer, interleave_nodes);
2695
2696        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2697                pr_err("%s: interleaving failed\n", __func__);
2698
2699        check_numabalancing_enable();
2700}
2701
2702/* Reset policy of current process to default */
2703void numa_default_policy(void)
2704{
2705        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2706}
2707
2708/*
2709 * Parse and format mempolicy from/to strings
2710 */
2711
2712/*
2713 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2714 */
2715static const char * const policy_modes[] =
2716{
2717        [MPOL_DEFAULT]    = "default",
2718        [MPOL_PREFERRED]  = "prefer",
2719        [MPOL_BIND]       = "bind",
2720        [MPOL_INTERLEAVE] = "interleave",
2721        [MPOL_LOCAL]      = "local",
2722};
2723
2724
2725#ifdef CONFIG_TMPFS
2726/**
2727 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2728 * @str:  string containing mempolicy to parse
2729 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2730 *
2731 * Format of input:
2732 *      <mode>[=<flags>][:<nodelist>]
2733 *
2734 * On success, returns 0, else 1
2735 */
2736int mpol_parse_str(char *str, struct mempolicy **mpol)
2737{
2738        struct mempolicy *new = NULL;
2739        unsigned short mode_flags;
2740        nodemask_t nodes;
2741        char *nodelist = strchr(str, ':');
2742        char *flags = strchr(str, '=');
2743        int err = 1, mode;
2744
2745        if (nodelist) {
2746                /* NUL-terminate mode or flags string */
2747                *nodelist++ = '\0';
2748                if (nodelist_parse(nodelist, nodes))
2749                        goto out;
2750                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2751                        goto out;
2752        } else
2753                nodes_clear(nodes);
2754
2755        if (flags)
2756                *flags++ = '\0';        /* terminate mode string */
2757
2758        mode = match_string(policy_modes, MPOL_MAX, str);
2759        if (mode < 0)
2760                goto out;
2761
2762        switch (mode) {
2763        case MPOL_PREFERRED:
2764                /*
2765                 * Insist on a nodelist of one node only
2766                 */
2767                if (nodelist) {
2768                        char *rest = nodelist;
2769                        while (isdigit(*rest))
2770                                rest++;
2771                        if (*rest)
2772                                goto out;
2773                }
2774                break;
2775        case MPOL_INTERLEAVE:
2776                /*
2777                 * Default to online nodes with memory if no nodelist
2778                 */
2779                if (!nodelist)
2780                        nodes = node_states[N_MEMORY];
2781                break;
2782        case MPOL_LOCAL:
2783                /*
2784                 * Don't allow a nodelist;  mpol_new() checks flags
2785                 */
2786                if (nodelist)
2787                        goto out;
2788                mode = MPOL_PREFERRED;
2789                break;
2790        case MPOL_DEFAULT:
2791                /*
2792                 * Insist on an empty nodelist
2793                 */
2794                if (!nodelist)
2795                        err = 0;
2796                goto out;
2797        case MPOL_BIND:
2798                /*
2799                 * Insist on a nodelist
2800                 */
2801                if (!nodelist)
2802                        goto out;
2803        }
2804
2805        mode_flags = 0;
2806        if (flags) {
2807                /*
2808                 * Currently, we only support two mutually exclusive
2809                 * mode flags.
2810                 */
2811                if (!strcmp(flags, "static"))
2812                        mode_flags |= MPOL_F_STATIC_NODES;
2813                else if (!strcmp(flags, "relative"))
2814                        mode_flags |= MPOL_F_RELATIVE_NODES;
2815                else
2816                        goto out;
2817        }
2818
2819        new = mpol_new(mode, mode_flags, &nodes);
2820        if (IS_ERR(new))
2821                goto out;
2822
2823        /*
2824         * Save nodes for mpol_to_str() to show the tmpfs mount options
2825         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2826         */
2827        if (mode != MPOL_PREFERRED)
2828                new->v.nodes = nodes;
2829        else if (nodelist)
2830                new->v.preferred_node = first_node(nodes);
2831        else
2832                new->flags |= MPOL_F_LOCAL;
2833
2834        /*
2835         * Save nodes for contextualization: this will be used to "clone"
2836         * the mempolicy in a specific context [cpuset] at a later time.
2837         */
2838        new->w.user_nodemask = nodes;
2839
2840        err = 0;
2841
2842out:
2843        /* Restore string for error message */
2844        if (nodelist)
2845                *--nodelist = ':';
2846        if (flags)
2847                *--flags = '=';
2848        if (!err)
2849                *mpol = new;
2850        return err;
2851}
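/*
 * Example inputs (illustrative; the nodelists assume nodes 0-3 have memory),
 * matching the tmpfs "mpol=" mount option syntax:
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE across nodes 0-3
 *	"prefer=static:1"	MPOL_PREFERRED, node 1, MPOL_F_STATIC_NODES
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *
 * A hypothetical in-kernel caller must pass a writable copy of the string,
 * since the parser temporarily NUL-terminates the mode and flags substrings:
 *
 *	char *s = kstrdup("interleave:0-3", GFP_KERNEL);
 *	struct mempolicy *mpol;
 *
 *	if (s && !mpol_parse_str(s, &mpol))
 *		mpol_put(mpol);
 *	kfree(s);
 */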
2852#endif /* CONFIG_TMPFS */
2853
2854/**
2855 * mpol_to_str - format a mempolicy structure for printing
2856 * @buffer:  to contain formatted mempolicy string
2857 * @maxlen:  length of @buffer
2858 * @pol:  pointer to mempolicy to be formatted
2859 *
2860 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2861 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2862 * longest flag, "relative", and to display at least a few node ids.
2863 */
2864void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2865{
2866        char *p = buffer;
2867        nodemask_t nodes = NODE_MASK_NONE;
2868        unsigned short mode = MPOL_DEFAULT;
2869        unsigned short flags = 0;
2870
2871        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2872                mode = pol->mode;
2873                flags = pol->flags;
2874        }
2875
2876        switch (mode) {
2877        case MPOL_DEFAULT:
2878                break;
2879        case MPOL_PREFERRED:
2880                if (flags & MPOL_F_LOCAL)
2881                        mode = MPOL_LOCAL;
2882                else
2883                        node_set(pol->v.preferred_node, nodes);
2884                break;
2885        case MPOL_BIND:
2886        case MPOL_INTERLEAVE:
2887                nodes = pol->v.nodes;
2888                break;
2889        default:
2890                WARN_ON_ONCE(1);
2891                snprintf(p, maxlen, "unknown");
2892                return;
2893        }
2894
2895        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2896
2897        if (flags & MPOL_MODE_FLAGS) {
2898                p += snprintf(p, buffer + maxlen - p, "=");
2899
2900                /*
2901                 * Currently, the only defined flags are mutually exclusive
2902                 */
2903                if (flags & MPOL_F_STATIC_NODES)
2904                        p += snprintf(p, buffer + maxlen - p, "static");
2905                else if (flags & MPOL_F_RELATIVE_NODES)
2906                        p += snprintf(p, buffer + maxlen - p, "relative");
2907        }
2908
2909        if (!nodes_empty(nodes))
2910                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2911                               nodemask_pr_args(&nodes));
2912}
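/*
 * Example output (illustrative): for an MPOL_INTERLEAVE policy carrying
 * MPOL_F_RELATIVE_NODES over nodes 0-3, a caller such as show_numa_map()
 * gets the string "interleave=relative:0-3":
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 */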
2913