linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
    8 * NUMA policy allows the user to give hints about the node(s) on which
    9 * memory should be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
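/*
 * A minimal userspace sketch of how the policies above are selected,
 * assuming the set_mempolicy(2) and mbind(2) wrappers from libnuma's
 * <numaif.h> (link with -lnuma); the node numbers and mask values are
 * arbitrary examples, not anything this file mandates.
 */
#if 0   /* illustrative userspace sketch, not compiled */
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        /* Process policy: interleave new allocations across nodes 0 and 1 */
        unsigned long mask = (1UL << 0) | (1UL << 1);

        if (set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8))
                perror("set_mempolicy");

        /* VMA policy: bind one anonymous mapping to node 0 only */
        size_t len = 1UL << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned long node0 = 1UL << 0;

        if (p != MAP_FAILED &&
            mbind(p, len, MPOL_BIND, &node0, sizeof(node0) * 8,
                  MPOL_MF_STRICT))
                perror("mbind");

        /* Back to the default (local) process policy */
        set_mempolicy(MPOL_DEFAULT, NULL, 0);
        return 0;
}
#endif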
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
  109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
  115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130/**
 131 * numa_map_to_online_node - Find closest online node
  132 * @node: Node id to start the search
  133 *
  134 * Lookup the next closest node by distance if @node is not online.
 135 */
 136int numa_map_to_online_node(int node)
 137{
 138        int min_node;
 139
 140        if (node == NUMA_NO_NODE)
 141                node = 0;
 142
 143        min_node = node;
 144        if (!node_online(node)) {
 145                int min_dist = INT_MAX, dist, n;
 146
 147                for_each_online_node(n) {
 148                        dist = node_distance(node, n);
 149                        if (dist < min_dist) {
 150                                min_dist = dist;
 151                                min_node = n;
 152                        }
 153                }
 154        }
 155
 156        return min_node;
 157}
 158EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 159
 160struct mempolicy *get_task_policy(struct task_struct *p)
 161{
 162        struct mempolicy *pol = p->mempolicy;
 163        int node;
 164
 165        if (pol)
 166                return pol;
 167
 168        node = numa_node_id();
 169        if (node != NUMA_NO_NODE) {
 170                pol = &preferred_node_policy[node];
 171                /* preferred_node_policy is not initialised early in boot */
 172                if (pol->mode)
 173                        return pol;
 174        }
 175
 176        return &default_policy;
 177}
 178
 179static const struct mempolicy_operations {
 180        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 181        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 182} mpol_ops[MPOL_MAX];
 183
 184static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 185{
 186        return pol->flags & MPOL_MODE_FLAGS;
 187}
 188
 189static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 190                                   const nodemask_t *rel)
 191{
 192        nodemask_t tmp;
 193        nodes_fold(tmp, *orig, nodes_weight(*rel));
 194        nodes_onto(*ret, tmp, *rel);
 195}
 196
 197static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 198{
 199        if (nodes_empty(*nodes))
 200                return -EINVAL;
 201        pol->v.nodes = *nodes;
 202        return 0;
 203}
 204
 205static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 206{
 207        if (!nodes)
 208                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 209        else if (nodes_empty(*nodes))
 210                return -EINVAL;                 /*  no allowed nodes */
 211        else
 212                pol->v.preferred_node = first_node(*nodes);
 213        return 0;
 214}
 215
 216static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 217{
 218        if (nodes_empty(*nodes))
 219                return -EINVAL;
 220        pol->v.nodes = *nodes;
 221        return 0;
 222}
 223
 224/*
 225 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 226 * any, for the new policy.  mpol_new() has already validated the nodes
 227 * parameter with respect to the policy mode and flags.  But, we need to
 228 * handle an empty nodemask with MPOL_PREFERRED here.
 229 *
 230 * Must be called holding task's alloc_lock to protect task's mems_allowed
  231 * and mempolicy.  May also be called holding mmap_sem for write.
 232 */
 233static int mpol_set_nodemask(struct mempolicy *pol,
 234                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 235{
 236        int ret;
 237
 238        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 239        if (pol == NULL)
 240                return 0;
 241        /* Check N_MEMORY */
 242        nodes_and(nsc->mask1,
 243                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 244
 245        VM_BUG_ON(!nodes);
 246        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 247                nodes = NULL;   /* explicit local allocation */
 248        else {
 249                if (pol->flags & MPOL_F_RELATIVE_NODES)
 250                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 251                else
 252                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 253
 254                if (mpol_store_user_nodemask(pol))
 255                        pol->w.user_nodemask = *nodes;
 256                else
 257                        pol->w.cpuset_mems_allowed =
 258                                                cpuset_current_mems_allowed;
 259        }
 260
 261        if (nodes)
 262                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 263        else
 264                ret = mpol_ops[pol->mode].create(pol, NULL);
 265        return ret;
 266}
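/*
 * The intended call sequence (a minimal sketch of what do_set_mempolicy()
 * and do_mbind() below actually do; scratch allocation and error handling
 * elided):
 *
 *      new = mpol_new(mode, flags, nodes);
 *      task_lock(current);
 *      err = mpol_set_nodemask(new, nodes, scratch);
 *      task_unlock(current);
 *      if (err)
 *              mpol_put(new);
 */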
 267
 268/*
  269 * This function just creates a new policy, does some checks and simple
  270 * initialization. You must invoke mpol_set_nodemask() to set the nodes.
 271 */
 272static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 273                                  nodemask_t *nodes)
 274{
 275        struct mempolicy *policy;
 276
 277        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 278                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 279
 280        if (mode == MPOL_DEFAULT) {
 281                if (nodes && !nodes_empty(*nodes))
 282                        return ERR_PTR(-EINVAL);
 283                return NULL;
 284        }
 285        VM_BUG_ON(!nodes);
 286
 287        /*
 288         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 289         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 290         * All other modes require a valid pointer to a non-empty nodemask.
 291         */
 292        if (mode == MPOL_PREFERRED) {
 293                if (nodes_empty(*nodes)) {
 294                        if (((flags & MPOL_F_STATIC_NODES) ||
 295                             (flags & MPOL_F_RELATIVE_NODES)))
 296                                return ERR_PTR(-EINVAL);
 297                }
 298        } else if (mode == MPOL_LOCAL) {
 299                if (!nodes_empty(*nodes) ||
 300                    (flags & MPOL_F_STATIC_NODES) ||
 301                    (flags & MPOL_F_RELATIVE_NODES))
 302                        return ERR_PTR(-EINVAL);
 303                mode = MPOL_PREFERRED;
 304        } else if (nodes_empty(*nodes))
 305                return ERR_PTR(-EINVAL);
 306        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 307        if (!policy)
 308                return ERR_PTR(-ENOMEM);
 309        atomic_set(&policy->refcnt, 1);
 310        policy->mode = mode;
 311        policy->flags = flags;
 312
 313        return policy;
 314}
 315
 316/* Slow path of a mpol destructor. */
 317void __mpol_put(struct mempolicy *p)
 318{
 319        if (!atomic_dec_and_test(&p->refcnt))
 320                return;
 321        kmem_cache_free(policy_cache, p);
 322}
 323
 324static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 325{
 326}
 327
 328static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 329{
 330        nodemask_t tmp;
 331
 332        if (pol->flags & MPOL_F_STATIC_NODES)
 333                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 334        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 335                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 336        else {
  337                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 338                                                                *nodes);
 339                pol->w.cpuset_mems_allowed = *nodes;
 340        }
 341
 342        if (nodes_empty(tmp))
 343                tmp = *nodes;
 344
 345        pol->v.nodes = tmp;
 346}
 347
 348static void mpol_rebind_preferred(struct mempolicy *pol,
 349                                                const nodemask_t *nodes)
 350{
 351        nodemask_t tmp;
 352
 353        if (pol->flags & MPOL_F_STATIC_NODES) {
 354                int node = first_node(pol->w.user_nodemask);
 355
 356                if (node_isset(node, *nodes)) {
 357                        pol->v.preferred_node = node;
 358                        pol->flags &= ~MPOL_F_LOCAL;
 359                } else
 360                        pol->flags |= MPOL_F_LOCAL;
 361        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 362                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 363                pol->v.preferred_node = first_node(tmp);
 364        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 365                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 366                                                   pol->w.cpuset_mems_allowed,
 367                                                   *nodes);
 368                pol->w.cpuset_mems_allowed = *nodes;
 369        }
 370}
 371
 372/*
 373 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 374 *
 375 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 376 * policies are protected by task->mems_allowed_seq to prevent a premature
 377 * OOM/allocation failure due to parallel nodemask modification.
 378 */
 379static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 380{
 381        if (!pol)
 382                return;
 383        if (!mpol_store_user_nodemask(pol) &&
 384            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 385                return;
 386
 387        mpol_ops[pol->mode].rebind(pol, newmask);
 388}
 389
 390/*
 391 * Wrapper for mpol_rebind_policy() that just requires task
 392 * pointer, and updates task mempolicy.
 393 *
 394 * Called with task's alloc_lock held.
 395 */
 396
 397void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 398{
 399        mpol_rebind_policy(tsk->mempolicy, new);
 400}
 401
 402/*
 403 * Rebind each vma in mm to new nodemask.
 404 *
 405 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 406 */
 407
 408void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 409{
 410        struct vm_area_struct *vma;
 411
 412        down_write(&mm->mmap_sem);
 413        for (vma = mm->mmap; vma; vma = vma->vm_next)
 414                mpol_rebind_policy(vma->vm_policy, new);
 415        up_write(&mm->mmap_sem);
 416}
 417
 418static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 419        [MPOL_DEFAULT] = {
 420                .rebind = mpol_rebind_default,
 421        },
 422        [MPOL_INTERLEAVE] = {
 423                .create = mpol_new_interleave,
 424                .rebind = mpol_rebind_nodemask,
 425        },
 426        [MPOL_PREFERRED] = {
 427                .create = mpol_new_preferred,
 428                .rebind = mpol_rebind_preferred,
 429        },
 430        [MPOL_BIND] = {
 431                .create = mpol_new_bind,
 432                .rebind = mpol_rebind_nodemask,
 433        },
 434};
 435
 436static int migrate_page_add(struct page *page, struct list_head *pagelist,
 437                                unsigned long flags);
 438
 439struct queue_pages {
 440        struct list_head *pagelist;
 441        unsigned long flags;
 442        nodemask_t *nmask;
 443        struct vm_area_struct *prev;
 444};
 445
 446/*
 447 * Check if the page's nid is in qp->nmask.
 448 *
 449 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 450 * in the invert of qp->nmask.
 451 */
 452static inline bool queue_pages_required(struct page *page,
 453                                        struct queue_pages *qp)
 454{
 455        int nid = page_to_nid(page);
 456        unsigned long flags = qp->flags;
 457
 458        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 459}
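/*
 * In other words: without MPOL_MF_INVERT a page qualifies when its node is
 * in qp->nmask; with MPOL_MF_INVERT (set by do_mbind() below) it qualifies
 * when its node is *not* in qp->nmask, i.e. when the page is misplaced with
 * respect to the requested nodes.
 */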
 460
 461/*
 462 * queue_pages_pmd() has four possible return values:
 463 * 0 - pages are placed on the right node or queued successfully.
  464 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
  465 *     specified.
  466 * 2 - THP was split.
  467 * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified and an
 468 *        existing page was already on a node that does not follow the
 469 *        policy.
 470 */
 471static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 472                                unsigned long end, struct mm_walk *walk)
 473{
 474        int ret = 0;
 475        struct page *page;
 476        struct queue_pages *qp = walk->private;
 477        unsigned long flags;
 478
 479        if (unlikely(is_pmd_migration_entry(*pmd))) {
 480                ret = -EIO;
 481                goto unlock;
 482        }
 483        page = pmd_page(*pmd);
 484        if (is_huge_zero_page(page)) {
 485                spin_unlock(ptl);
 486                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 487                ret = 2;
 488                goto out;
 489        }
 490        if (!queue_pages_required(page, qp))
 491                goto unlock;
 492
 493        flags = qp->flags;
 494        /* go to thp migration */
 495        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 496                if (!vma_migratable(walk->vma) ||
 497                    migrate_page_add(page, qp->pagelist, flags)) {
 498                        ret = 1;
 499                        goto unlock;
 500                }
 501        } else
 502                ret = -EIO;
 503unlock:
 504        spin_unlock(ptl);
 505out:
 506        return ret;
 507}
 508
 509/*
 510 * Scan through pages checking if pages follow certain conditions,
 511 * and move them to the pagelist if they do.
 512 *
 513 * queue_pages_pte_range() has three possible return values:
 514 * 0 - pages are placed on the right node or queued successfully.
  515 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 516 *     specified.
 517 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 518 *        on a node that does not follow the policy.
 519 */
 520static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 521                        unsigned long end, struct mm_walk *walk)
 522{
 523        struct vm_area_struct *vma = walk->vma;
 524        struct page *page;
 525        struct queue_pages *qp = walk->private;
 526        unsigned long flags = qp->flags;
 527        int ret;
 528        bool has_unmovable = false;
 529        pte_t *pte, *mapped_pte;
 530        spinlock_t *ptl;
 531
 532        ptl = pmd_trans_huge_lock(pmd, vma);
 533        if (ptl) {
 534                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 535                if (ret != 2)
 536                        return ret;
 537        }
 538        /* THP was split, fall through to pte walk */
 539
 540        if (pmd_trans_unstable(pmd))
 541                return 0;
 542
 543        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 544        for (; addr != end; pte++, addr += PAGE_SIZE) {
 545                if (!pte_present(*pte))
 546                        continue;
 547                page = vm_normal_page(vma, addr, *pte);
 548                if (!page)
 549                        continue;
 550                /*
 551                 * vm_normal_page() filters out zero pages, but there might
 552                 * still be PageReserved pages to skip, perhaps in a VDSO.
 553                 */
 554                if (PageReserved(page))
 555                        continue;
 556                if (!queue_pages_required(page, qp))
 557                        continue;
 558                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 559                        /* MPOL_MF_STRICT must be specified if we get here */
 560                        if (!vma_migratable(vma)) {
 561                                has_unmovable = true;
 562                                break;
 563                        }
 564
 565                        /*
  566                         * Do not abort immediately since there may be
  567                         * temporarily off-LRU pages in the range.  Still
  568                         * need to migrate other LRU pages.
 569                         */
 570                        if (migrate_page_add(page, qp->pagelist, flags))
 571                                has_unmovable = true;
 572                } else
 573                        break;
 574        }
 575        pte_unmap_unlock(mapped_pte, ptl);
 576        cond_resched();
 577
 578        if (has_unmovable)
 579                return 1;
 580
 581        return addr != end ? -EIO : 0;
 582}
 583
 584static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 585                               unsigned long addr, unsigned long end,
 586                               struct mm_walk *walk)
 587{
 588#ifdef CONFIG_HUGETLB_PAGE
 589        struct queue_pages *qp = walk->private;
 590        unsigned long flags = qp->flags;
 591        struct page *page;
 592        spinlock_t *ptl;
 593        pte_t entry;
 594
 595        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 596        entry = huge_ptep_get(pte);
 597        if (!pte_present(entry))
 598                goto unlock;
 599        page = pte_page(entry);
 600        if (!queue_pages_required(page, qp))
 601                goto unlock;
 602        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 603        if (flags & (MPOL_MF_MOVE_ALL) ||
 604            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 605                isolate_huge_page(page, qp->pagelist);
 606unlock:
 607        spin_unlock(ptl);
 608#else
 609        BUG();
 610#endif
 611        return 0;
 612}
 613
 614#ifdef CONFIG_NUMA_BALANCING
 615/*
 616 * This is used to mark a range of virtual addresses to be inaccessible.
 617 * These are later cleared by a NUMA hinting fault. Depending on these
 618 * faults, pages may be migrated for better NUMA placement.
 619 *
 620 * This is assuming that NUMA faults are handled using PROT_NONE. If
 621 * an architecture makes a different choice, it will need further
 622 * changes to the core.
 623 */
 624unsigned long change_prot_numa(struct vm_area_struct *vma,
 625                        unsigned long addr, unsigned long end)
 626{
 627        int nr_updated;
 628
 629        nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 630        if (nr_updated)
 631                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 632
 633        return nr_updated;
 634}
 635#else
 636static unsigned long change_prot_numa(struct vm_area_struct *vma,
 637                        unsigned long addr, unsigned long end)
 638{
 639        return 0;
 640}
 641#endif /* CONFIG_NUMA_BALANCING */
 642
 643static int queue_pages_test_walk(unsigned long start, unsigned long end,
 644                                struct mm_walk *walk)
 645{
 646        struct vm_area_struct *vma = walk->vma;
 647        struct queue_pages *qp = walk->private;
 648        unsigned long endvma = vma->vm_end;
 649        unsigned long flags = qp->flags;
 650
 651        /* range check first */
 652        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 653                if (!vma->vm_next && vma->vm_end < end)
 654                        return -EFAULT;
 655                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 656                        return -EFAULT;
 657        }
 658
 659        qp->prev = vma;
 660
 661        /*
  662         * Need to check MPOL_MF_STRICT to return -EIO if possible,
  663         * regardless of vma_migratable().
 664         */
 665        if (!vma_migratable(vma) &&
 666            !(flags & MPOL_MF_STRICT))
 667                return 1;
 668
 669        if (endvma > end)
 670                endvma = end;
 671        if (vma->vm_start > start)
 672                start = vma->vm_start;
 673
 674        if (flags & MPOL_MF_LAZY) {
 675                /* Similar to task_numa_work, skip inaccessible VMAs */
 676                if (!is_vm_hugetlb_page(vma) &&
 677                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 678                        !(vma->vm_flags & VM_MIXEDMAP))
 679                        change_prot_numa(vma, start, endvma);
 680                return 1;
 681        }
 682
 683        /* queue pages from current vma */
 684        if (flags & MPOL_MF_VALID)
 685                return 0;
 686        return 1;
 687}
 688
 689static const struct mm_walk_ops queue_pages_walk_ops = {
 690        .hugetlb_entry          = queue_pages_hugetlb,
 691        .pmd_entry              = queue_pages_pte_range,
 692        .test_walk              = queue_pages_test_walk,
 693};
 694
 695/*
 696 * Walk through page tables and collect pages to be migrated.
 697 *
  698 * If pages found in a given range are on a set of nodes (determined by
  699 * @nodes and @flags), they are isolated and queued onto the list
  700 * passed via @pagelist.
  701 *
  702 * queue_pages_range() has three possible return values:
  703 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 704 *     specified.
 705 * 0 - queue pages successfully or no misplaced page.
 706 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 707 *         memory range specified by nodemask and maxnode points outside
 708 *         your accessible address space (-EFAULT)
 709 */
 710static int
 711queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 712                nodemask_t *nodes, unsigned long flags,
 713                struct list_head *pagelist)
 714{
 715        struct queue_pages qp = {
 716                .pagelist = pagelist,
 717                .flags = flags,
 718                .nmask = nodes,
 719                .prev = NULL,
 720        };
 721
 722        return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 723}
 724
 725/*
 726 * Apply policy to a single VMA
 727 * This must be called with the mmap_sem held for writing.
 728 */
 729static int vma_replace_policy(struct vm_area_struct *vma,
 730                                                struct mempolicy *pol)
 731{
 732        int err;
 733        struct mempolicy *old;
 734        struct mempolicy *new;
 735
 736        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 737                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 738                 vma->vm_ops, vma->vm_file,
 739                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 740
 741        new = mpol_dup(pol);
 742        if (IS_ERR(new))
 743                return PTR_ERR(new);
 744
 745        if (vma->vm_ops && vma->vm_ops->set_policy) {
 746                err = vma->vm_ops->set_policy(vma, new);
 747                if (err)
 748                        goto err_out;
 749        }
 750
 751        old = vma->vm_policy;
 752        vma->vm_policy = new; /* protected by mmap_sem */
 753        mpol_put(old);
 754
 755        return 0;
 756 err_out:
 757        mpol_put(new);
 758        return err;
 759}
 760
 761/* Step 2: apply policy to a range and do splits. */
 762static int mbind_range(struct mm_struct *mm, unsigned long start,
 763                       unsigned long end, struct mempolicy *new_pol)
 764{
 765        struct vm_area_struct *next;
 766        struct vm_area_struct *prev;
 767        struct vm_area_struct *vma;
 768        int err = 0;
 769        pgoff_t pgoff;
 770        unsigned long vmstart;
 771        unsigned long vmend;
 772
 773        vma = find_vma(mm, start);
 774        if (!vma || vma->vm_start > start)
 775                return -EFAULT;
 776
 777        prev = vma->vm_prev;
 778        if (start > vma->vm_start)
 779                prev = vma;
 780
 781        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 782                next = vma->vm_next;
 783                vmstart = max(start, vma->vm_start);
 784                vmend   = min(end, vma->vm_end);
 785
 786                if (mpol_equal(vma_policy(vma), new_pol))
 787                        continue;
 788
 789                pgoff = vma->vm_pgoff +
 790                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 791                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 792                                 vma->anon_vma, vma->vm_file, pgoff,
 793                                 new_pol, vma->vm_userfaultfd_ctx);
 794                if (prev) {
 795                        vma = prev;
 796                        next = vma->vm_next;
 797                        if (mpol_equal(vma_policy(vma), new_pol))
 798                                continue;
 799                        /* vma_merge() joined vma && vma->next, case 8 */
 800                        goto replace;
 801                }
 802                if (vma->vm_start != vmstart) {
 803                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 804                        if (err)
 805                                goto out;
 806                }
 807                if (vma->vm_end != vmend) {
 808                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 809                        if (err)
 810                                goto out;
 811                }
 812 replace:
 813                err = vma_replace_policy(vma, new_pol);
 814                if (err)
 815                        goto out;
 816        }
 817
 818 out:
 819        return err;
 820}
 821
 822/* Set the process memory policy */
 823static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 824                             nodemask_t *nodes)
 825{
 826        struct mempolicy *new, *old;
 827        NODEMASK_SCRATCH(scratch);
 828        int ret;
 829
 830        if (!scratch)
 831                return -ENOMEM;
 832
 833        new = mpol_new(mode, flags, nodes);
 834        if (IS_ERR(new)) {
 835                ret = PTR_ERR(new);
 836                goto out;
 837        }
 838
 839        task_lock(current);
 840        ret = mpol_set_nodemask(new, nodes, scratch);
 841        if (ret) {
 842                task_unlock(current);
 843                mpol_put(new);
 844                goto out;
 845        }
 846        old = current->mempolicy;
 847        current->mempolicy = new;
 848        if (new && new->mode == MPOL_INTERLEAVE)
 849                current->il_prev = MAX_NUMNODES-1;
 850        task_unlock(current);
 851        mpol_put(old);
 852        ret = 0;
 853out:
 854        NODEMASK_SCRATCH_FREE(scratch);
 855        return ret;
 856}
 857
 858/*
 859 * Return nodemask for policy for get_mempolicy() query
 860 *
 861 * Called with task's alloc_lock held
 862 */
 863static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 864{
 865        nodes_clear(*nodes);
 866        if (p == &default_policy)
 867                return;
 868
 869        switch (p->mode) {
 870        case MPOL_BIND:
 871                /* Fall through */
 872        case MPOL_INTERLEAVE:
 873                *nodes = p->v.nodes;
 874                break;
 875        case MPOL_PREFERRED:
 876                if (!(p->flags & MPOL_F_LOCAL))
 877                        node_set(p->v.preferred_node, *nodes);
 878                /* else return empty node mask for local allocation */
 879                break;
 880        default:
 881                BUG();
 882        }
 883}
 884
 885static int lookup_node(struct mm_struct *mm, unsigned long addr)
 886{
 887        struct page *p = NULL;
 888        int err;
 889
 890        int locked = 1;
 891        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 892        if (err == 0) {
 893                /* E.g. GUP interrupted by fatal signal */
 894                err = -EFAULT;
 895        } else if (err > 0) {
 896                err = page_to_nid(p);
 897                put_page(p);
 898        }
 899        if (locked)
 900                up_read(&mm->mmap_sem);
 901        return err;
 902}
 903
 904/* Retrieve NUMA policy */
 905static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 906                             unsigned long addr, unsigned long flags)
 907{
 908        int err;
 909        struct mm_struct *mm = current->mm;
 910        struct vm_area_struct *vma = NULL;
 911        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 912
 913        if (flags &
 914                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 915                return -EINVAL;
 916
 917        if (flags & MPOL_F_MEMS_ALLOWED) {
 918                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 919                        return -EINVAL;
 920                *policy = 0;    /* just so it's initialized */
 921                task_lock(current);
 922                *nmask  = cpuset_current_mems_allowed;
 923                task_unlock(current);
 924                return 0;
 925        }
 926
 927        if (flags & MPOL_F_ADDR) {
 928                /*
 929                 * Do NOT fall back to task policy if the
 930                 * vma/shared policy at addr is NULL.  We
 931                 * want to return MPOL_DEFAULT in this case.
 932                 */
 933                down_read(&mm->mmap_sem);
 934                vma = find_vma_intersection(mm, addr, addr+1);
 935                if (!vma) {
 936                        up_read(&mm->mmap_sem);
 937                        return -EFAULT;
 938                }
 939                if (vma->vm_ops && vma->vm_ops->get_policy)
 940                        pol = vma->vm_ops->get_policy(vma, addr);
 941                else
 942                        pol = vma->vm_policy;
 943        } else if (addr)
 944                return -EINVAL;
 945
 946        if (!pol)
 947                pol = &default_policy;  /* indicates default behavior */
 948
 949        if (flags & MPOL_F_NODE) {
 950                if (flags & MPOL_F_ADDR) {
 951                        /*
 952                         * Take a refcount on the mpol, lookup_node()
  953                         * will drop the mmap_sem, so after calling
 954                         * lookup_node() only "pol" remains valid, "vma"
 955                         * is stale.
 956                         */
 957                        pol_refcount = pol;
 958                        vma = NULL;
 959                        mpol_get(pol);
 960                        err = lookup_node(mm, addr);
 961                        if (err < 0)
 962                                goto out;
 963                        *policy = err;
 964                } else if (pol == current->mempolicy &&
 965                                pol->mode == MPOL_INTERLEAVE) {
 966                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 967                } else {
 968                        err = -EINVAL;
 969                        goto out;
 970                }
 971        } else {
 972                *policy = pol == &default_policy ? MPOL_DEFAULT :
 973                                                pol->mode;
 974                /*
 975                 * Internal mempolicy flags must be masked off before exposing
 976                 * the policy to userspace.
 977                 */
 978                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 979        }
 980
 981        err = 0;
 982        if (nmask) {
 983                if (mpol_store_user_nodemask(pol)) {
 984                        *nmask = pol->w.user_nodemask;
 985                } else {
 986                        task_lock(current);
 987                        get_policy_nodemask(pol, nmask);
 988                        task_unlock(current);
 989                }
 990        }
 991
 992 out:
 993        mpol_cond_put(pol);
 994        if (vma)
 995                up_read(&mm->mmap_sem);
 996        if (pol_refcount)
 997                mpol_put(pol_refcount);
 998        return err;
 999}
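/*
 * A minimal userspace sketch of the MPOL_F_NODE | MPOL_F_ADDR query handled
 * above, assuming the get_mempolicy(2) wrapper from libnuma's <numaif.h>;
 * it asks which node currently backs the page at @addr.
 */
#if 0   /* illustrative userspace sketch, not compiled */
#include <numaif.h>

/* Return the node backing the page at @addr, or -1 on failure */
static int node_of_addr(void *addr)
{
        int node = -1;

        /* nmask == NULL and maxnode == 0 are allowed for this query */
        if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
                return -1;

        return node;
}
#endif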
1000
1001#ifdef CONFIG_MIGRATION
1002/*
1003 * page migration, thp tail pages can be passed.
1004 */
1005static int migrate_page_add(struct page *page, struct list_head *pagelist,
1006                                unsigned long flags)
1007{
1008        struct page *head = compound_head(page);
1009        /*
1010         * Avoid migrating a page that is shared with others.
1011         */
1012        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1013                if (!isolate_lru_page(head)) {
1014                        list_add_tail(&head->lru, pagelist);
1015                        mod_node_page_state(page_pgdat(head),
1016                                NR_ISOLATED_ANON + page_is_file_lru(head),
1017                                thp_nr_pages(head));
1018                } else if (flags & MPOL_MF_STRICT) {
1019                        /*
 1020                         * A non-movable page may reach here.  And there may be
 1021                         * temporarily off-LRU pages or non-LRU movable pages.
1022                         * Treat them as unmovable pages since they can't be
1023                         * isolated, so they can't be moved at the moment.  It
1024                         * should return -EIO for this case too.
1025                         */
1026                        return -EIO;
1027                }
1028        }
1029
1030        return 0;
1031}
1032
1033/* page allocation callback for NUMA node migration */
1034struct page *alloc_new_node_page(struct page *page, unsigned long node)
1035{
1036        if (PageHuge(page))
1037                return alloc_huge_page_node(page_hstate(compound_head(page)),
1038                                        node);
1039        else if (PageTransHuge(page)) {
1040                struct page *thp;
1041
1042                thp = alloc_pages_node(node,
1043                        (GFP_TRANSHUGE | __GFP_THISNODE),
1044                        HPAGE_PMD_ORDER);
1045                if (!thp)
1046                        return NULL;
1047                prep_transhuge_page(thp);
1048                return thp;
1049        } else
1050                return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1051                                                    __GFP_THISNODE, 0);
1052}
1053
1054/*
1055 * Migrate pages from one node to a target node.
1056 * Returns error or the number of pages not migrated.
1057 */
1058static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1059                           int flags)
1060{
1061        nodemask_t nmask;
1062        LIST_HEAD(pagelist);
1063        int err = 0;
1064
1065        nodes_clear(nmask);
1066        node_set(source, nmask);
1067
1068        /*
1069         * This does not "check" the range but isolates all pages that
1070         * need migration.  Between passing in the full user address
1071         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1072         */
1073        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1074        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1075                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1076
1077        if (!list_empty(&pagelist)) {
1078                err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1079                                        MIGRATE_SYNC, MR_SYSCALL);
1080                if (err)
1081                        putback_movable_pages(&pagelist);
1082        }
1083
1084        return err;
1085}
1086
1087/*
1088 * Move pages between the two nodesets so as to preserve the physical
1089 * layout as much as possible.
1090 *
 1091 * Returns the number of pages that could not be moved.
1092 */
1093int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1094                     const nodemask_t *to, int flags)
1095{
1096        int busy = 0;
1097        int err;
1098        nodemask_t tmp;
1099
1100        err = migrate_prep();
1101        if (err)
1102                return err;
1103
1104        down_read(&mm->mmap_sem);
1105
1106        /*
1107         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1108         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1109         * bit in 'tmp', and return that <source, dest> pair for migration.
1110         * The pair of nodemasks 'to' and 'from' define the map.
1111         *
1112         * If no pair of bits is found that way, fallback to picking some
1113         * pair of 'source' and 'dest' bits that are not the same.  If the
1114         * 'source' and 'dest' bits are the same, this represents a node
1115         * that will be migrating to itself, so no pages need move.
1116         *
1117         * If no bits are left in 'tmp', or if all remaining bits left
1118         * in 'tmp' correspond to the same bit in 'to', return false
1119         * (nothing left to migrate).
1120         *
1121         * This lets us pick a pair of nodes to migrate between, such that
1122         * if possible the dest node is not already occupied by some other
1123         * source node, minimizing the risk of overloading the memory on a
1124         * node that would happen if we migrated incoming memory to a node
1125         * before migrating outgoing memory source that same node.
1126         *
1127         * A single scan of tmp is sufficient.  As we go, we remember the
1128         * most recent <s, d> pair that moved (s != d).  If we find a pair
1129         * that not only moved, but what's better, moved to an empty slot
1130         * (d is not set in tmp), then we break out then, with that pair.
1131         * Otherwise when we finish scanning from_tmp, we at least have the
1132         * most recent <s, d> pair that moved.  If we get all the way through
1133         * the scan of tmp without finding any node that moved, much less
1134         * moved to an empty node, then there is nothing left worth migrating.
1135         */
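        /*
         * Worked example (from = {0,1}, to = {1,2}, chosen only for
         * illustration): the first scan picks <s,d> = <0,1> but keeps going
         * because node 1 is still a pending source; it then finds <1,2>,
         * whose destination is not in tmp, and breaks out.  So node 1 is
         * drained into node 2 first, and only then is node 0 migrated into
         * the now-emptier node 1.
         */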
1136
1137        tmp = *from;
1138        while (!nodes_empty(tmp)) {
 1139                int s, d;
1140                int source = NUMA_NO_NODE;
1141                int dest = 0;
1142
1143                for_each_node_mask(s, tmp) {
1144
1145                        /*
1146                         * do_migrate_pages() tries to maintain the relative
1147                         * node relationship of the pages established between
1148                         * threads and memory areas.
1149                         *
 1150                         * However, if the number of source nodes is not equal to
 1151                         * the number of destination nodes we cannot preserve
 1152                         * this node-relative relationship.  In that case, skip
1153                         * copying memory from a node that is in the destination
1154                         * mask.
1155                         *
1156                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1157                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1158                         */
1159
1160                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1161                                                (node_isset(s, *to)))
1162                                continue;
1163
1164                        d = node_remap(s, *from, *to);
1165                        if (s == d)
1166                                continue;
1167
1168                        source = s;     /* Node moved. Memorize */
1169                        dest = d;
1170
1171                        /* dest not in remaining from nodes? */
1172                        if (!node_isset(dest, tmp))
1173                                break;
1174                }
1175                if (source == NUMA_NO_NODE)
1176                        break;
1177
1178                node_clear(source, tmp);
1179                err = migrate_to_node(mm, source, dest, flags);
1180                if (err > 0)
1181                        busy += err;
1182                if (err < 0)
1183                        break;
1184        }
1185        up_read(&mm->mmap_sem);
1186        if (err < 0)
1187                return err;
1188        return busy;
1189
1190}
1191
1192/*
1193 * Allocate a new page for page migration based on vma policy.
 1194 * Start by assuming the page is mapped by the same vma that contains @start.
1195 * Search forward from there, if not.  N.B., this assumes that the
1196 * list of pages handed to migrate_pages()--which is how we get here--
1197 * is in virtual address order.
1198 */
1199static struct page *new_page(struct page *page, unsigned long start)
1200{
1201        struct vm_area_struct *vma;
1202        unsigned long uninitialized_var(address);
1203
1204        vma = find_vma(current->mm, start);
1205        while (vma) {
1206                address = page_address_in_vma(page, vma);
1207                if (address != -EFAULT)
1208                        break;
1209                vma = vma->vm_next;
1210        }
1211
1212        if (PageHuge(page)) {
1213                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1214                                vma, address);
1215        } else if (PageTransHuge(page)) {
1216                struct page *thp;
1217
1218                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1219                                         HPAGE_PMD_ORDER);
1220                if (!thp)
1221                        return NULL;
1222                prep_transhuge_page(thp);
1223                return thp;
1224        }
1225        /*
1226         * if !vma, alloc_page_vma() will use task or system default policy
1227         */
1228        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1229                        vma, address);
1230}
1231#else
1232
1233static int migrate_page_add(struct page *page, struct list_head *pagelist,
1234                                unsigned long flags)
1235{
1236        return -EIO;
1237}
1238
1239int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1240                     const nodemask_t *to, int flags)
1241{
1242        return -ENOSYS;
1243}
1244
1245static struct page *new_page(struct page *page, unsigned long start)
1246{
1247        return NULL;
1248}
1249#endif
1250
1251static long do_mbind(unsigned long start, unsigned long len,
1252                     unsigned short mode, unsigned short mode_flags,
1253                     nodemask_t *nmask, unsigned long flags)
1254{
1255        struct mm_struct *mm = current->mm;
1256        struct mempolicy *new;
1257        unsigned long end;
1258        int err;
1259        int ret;
1260        LIST_HEAD(pagelist);
1261
1262        if (flags & ~(unsigned long)MPOL_MF_VALID)
1263                return -EINVAL;
1264        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1265                return -EPERM;
1266
1267        if (start & ~PAGE_MASK)
1268                return -EINVAL;
1269
1270        if (mode == MPOL_DEFAULT)
1271                flags &= ~MPOL_MF_STRICT;
1272
1273        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1274        end = start + len;
1275
1276        if (end < start)
1277                return -EINVAL;
1278        if (end == start)
1279                return 0;
1280
1281        new = mpol_new(mode, mode_flags, nmask);
1282        if (IS_ERR(new))
1283                return PTR_ERR(new);
1284
1285        if (flags & MPOL_MF_LAZY)
1286                new->flags |= MPOL_F_MOF;
1287
1288        /*
1289         * If we are using the default policy then operation
1290         * on discontinuous address spaces is okay after all
1291         */
1292        if (!new)
1293                flags |= MPOL_MF_DISCONTIG_OK;
1294
1295        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1296                 start, start + len, mode, mode_flags,
1297                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1298
1299        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1300
1301                err = migrate_prep();
1302                if (err)
1303                        goto mpol_out;
1304        }
1305        {
1306                NODEMASK_SCRATCH(scratch);
1307                if (scratch) {
1308                        down_write(&mm->mmap_sem);
1309                        task_lock(current);
1310                        err = mpol_set_nodemask(new, nmask, scratch);
1311                        task_unlock(current);
1312                        if (err)
1313                                up_write(&mm->mmap_sem);
1314                } else
1315                        err = -ENOMEM;
1316                NODEMASK_SCRATCH_FREE(scratch);
1317        }
1318        if (err)
1319                goto mpol_out;
1320
1321        ret = queue_pages_range(mm, start, end, nmask,
1322                          flags | MPOL_MF_INVERT, &pagelist);
1323
1324        if (ret < 0) {
1325                err = ret;
1326                goto up_out;
1327        }
1328
1329        err = mbind_range(mm, start, end, new);
1330
1331        if (!err) {
1332                int nr_failed = 0;
1333
1334                if (!list_empty(&pagelist)) {
1335                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1336                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1337                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1338                        if (nr_failed)
1339                                putback_movable_pages(&pagelist);
1340                }
1341
1342                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1343                        err = -EIO;
1344        } else {
1345up_out:
1346                if (!list_empty(&pagelist))
1347                        putback_movable_pages(&pagelist);
1348        }
1349
1350        up_write(&mm->mmap_sem);
1351mpol_out:
1352        mpol_put(new);
1353        return err;
1354}
1355
1356/*
1357 * User space interface with variable sized bitmaps for nodelists.
1358 */
1359
1360/* Copy a node mask from user space. */
1361static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1362                     unsigned long maxnode)
1363{
1364        unsigned long k;
1365        unsigned long t;
1366        unsigned long nlongs;
1367        unsigned long endmask;
1368
1369        --maxnode;
1370        nodes_clear(*nodes);
1371        if (maxnode == 0 || !nmask)
1372                return 0;
1373        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1374                return -EINVAL;
1375
1376        nlongs = BITS_TO_LONGS(maxnode);
1377        if ((maxnode % BITS_PER_LONG) == 0)
1378                endmask = ~0UL;
1379        else
1380                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1381
1382        /*
 1383         * When the user specified more nodes than supported just check
 1384         * if the non-supported part is all zero.
 1385         *
 1386         * If maxnode has more longs than MAX_NUMNODES, check
 1387         * the bits in that area first, and then go on to check
 1388         * the remaining bits, which are equal to or bigger than MAX_NUMNODES.
1389         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1390         */
1391        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1392                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1393                        if (get_user(t, nmask + k))
1394                                return -EFAULT;
1395                        if (k == nlongs - 1) {
1396                                if (t & endmask)
1397                                        return -EINVAL;
1398                        } else if (t)
1399                                return -EINVAL;
1400                }
1401                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1402                endmask = ~0UL;
1403        }
1404
1405        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1406                unsigned long valid_mask = endmask;
1407
1408                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1409                if (get_user(t, nmask + nlongs - 1))
1410                        return -EFAULT;
1411                if (t & valid_mask)
1412                        return -EINVAL;
1413        }
1414
1415        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1416                return -EFAULT;
1417        nodes_addr(*nodes)[nlongs-1] &= endmask;
1418        return 0;
1419}
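/*
 * A sketch of the bitmap format get_nodes() expects from userspace,
 * assuming the set_mempolicy(2) wrapper from libnuma's <numaif.h>; the
 * node numbers are examples.  maxnode gives the size of the mask in bits,
 * and bits at or above the kernel's MAX_NUMNODES must be clear.
 */
#if 0   /* illustrative userspace sketch, not compiled */
#include <numaif.h>

/* Bind further allocations of this task to nodes 0 and 2 (example nodes) */
static long bind_to_nodes_0_and_2(void)
{
        unsigned long mask = (1UL << 0) | (1UL << 2);

        /* maxnode is the size of the mask in bits */
        return set_mempolicy(MPOL_BIND, &mask, sizeof(mask) * 8);
}
#endif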
1420
1421/* Copy a kernel node mask to user space */
1422static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1423                              nodemask_t *nodes)
1424{
1425        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1426        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1427
1428        if (copy > nbytes) {
1429                if (copy > PAGE_SIZE)
1430                        return -EINVAL;
1431                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1432                        return -EFAULT;
1433                copy = nbytes;
1434        }
1435        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1436}
1437
1438static long kernel_mbind(unsigned long start, unsigned long len,
1439                         unsigned long mode, const unsigned long __user *nmask,
1440                         unsigned long maxnode, unsigned int flags)
1441{
1442        nodemask_t nodes;
1443        int err;
1444        unsigned short mode_flags;
1445
1446        start = untagged_addr(start);
1447        mode_flags = mode & MPOL_MODE_FLAGS;
1448        mode &= ~MPOL_MODE_FLAGS;
1449        if (mode >= MPOL_MAX)
1450                return -EINVAL;
1451        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1452            (mode_flags & MPOL_F_RELATIVE_NODES))
1453                return -EINVAL;
1454        err = get_nodes(&nodes, nmask, maxnode);
1455        if (err)
1456                return err;
1457        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1458}
1459
1460SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1461                unsigned long, mode, const unsigned long __user *, nmask,
1462                unsigned long, maxnode, unsigned int, flags)
1463{
1464        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1465}
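/*
 * A sketch of an mbind(2) call that also migrates already-faulted pages,
 * exercising the MPOL_MF_MOVE | MPOL_MF_STRICT path of do_mbind() above;
 * the libnuma wrapper and the node number are assumptions of the example.
 */
#if 0   /* illustrative userspace sketch, not compiled */
#include <numaif.h>

/* Bind [addr, addr+len) to node 1 and migrate pages already present */
static long move_range_to_node1(void *addr, unsigned long len)
{
        unsigned long node1 = 1UL << 1;

        /* With MPOL_MF_STRICT, fails with EIO if some pages were not moved */
        return mbind(addr, len, MPOL_BIND, &node1, sizeof(node1) * 8,
                     MPOL_MF_MOVE | MPOL_MF_STRICT);
}
#endif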
1466
1467/* Set the process memory policy */
1468static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1469                                 unsigned long maxnode)
1470{
1471        int err;
1472        nodemask_t nodes;
1473        unsigned short flags;
1474
1475        flags = mode & MPOL_MODE_FLAGS;
1476        mode &= ~MPOL_MODE_FLAGS;
1477        if ((unsigned int)mode >= MPOL_MAX)
1478                return -EINVAL;
1479        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1480                return -EINVAL;
1481        err = get_nodes(&nodes, nmask, maxnode);
1482        if (err)
1483                return err;
1484        return do_set_mempolicy(mode, flags, &nodes);
1485}
1486
1487SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1488                unsigned long, maxnode)
1489{
1490        return kernel_set_mempolicy(mode, nmask, maxnode);
1491}
1492
1493static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1494                                const unsigned long __user *old_nodes,
1495                                const unsigned long __user *new_nodes)
1496{
1497        struct mm_struct *mm = NULL;
1498        struct task_struct *task;
1499        nodemask_t task_nodes;
1500        int err;
1501        nodemask_t *old;
1502        nodemask_t *new;
1503        NODEMASK_SCRATCH(scratch);
1504
1505        if (!scratch)
1506                return -ENOMEM;
1507
1508        old = &scratch->mask1;
1509        new = &scratch->mask2;
1510
1511        err = get_nodes(old, old_nodes, maxnode);
1512        if (err)
1513                goto out;
1514
1515        err = get_nodes(new, new_nodes, maxnode);
1516        if (err)
1517                goto out;
1518
1519        /* Find the mm_struct */
1520        rcu_read_lock();
1521        task = pid ? find_task_by_vpid(pid) : current;
1522        if (!task) {
1523                rcu_read_unlock();
1524                err = -ESRCH;
1525                goto out;
1526        }
1527        get_task_struct(task);
1528
1529        err = -EINVAL;
1530
1531        /*
1532         * Check if this process has the right to modify the specified process.
1533         * Use the regular "ptrace_may_access()" checks.
1534         */
1535        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1536                rcu_read_unlock();
1537                err = -EPERM;
1538                goto out_put;
1539        }
1540        rcu_read_unlock();
1541
1542        task_nodes = cpuset_mems_allowed(task);
1543        /* Is the user allowed to access the target nodes? */
1544        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1545                err = -EPERM;
1546                goto out_put;
1547        }
1548
1549        task_nodes = cpuset_mems_allowed(current);
1550        nodes_and(*new, *new, task_nodes);
1551        if (nodes_empty(*new))
1552                goto out_put;
1553
1554        nodes_and(*new, *new, node_states[N_MEMORY]);
1555        if (nodes_empty(*new))
1556                goto out_put;
1557
1558        err = security_task_movememory(task);
1559        if (err)
1560                goto out_put;
1561
1562        mm = get_task_mm(task);
1563        put_task_struct(task);
1564
1565        if (!mm) {
1566                err = -EINVAL;
1567                goto out;
1568        }
1569
1570        err = do_migrate_pages(mm, old, new,
1571                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1572
1573        mmput(mm);
1574out:
1575        NODEMASK_SCRATCH_FREE(scratch);
1576
1577        return err;
1578
1579out_put:
1580        put_task_struct(task);
1581        goto out;
1582
1583}
1584
1585SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1586                const unsigned long __user *, old_nodes,
1587                const unsigned long __user *, new_nodes)
1588{
1589        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1590}
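
/*
 * Illustrative only: a user-space sketch that asks the kernel to move another
 * task's pages from node 0 to node 1 via the migrate_pages(2) wrapper from
 * <numaif.h>.  "target_pid" is a made-up variable; the caller needs
 * ptrace-level access to the target, and CAP_SYS_NICE to also move pages
 * shared with other processes.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *
 *	if (migrate_pages(target_pid, 8 * sizeof(unsigned long),
 *			  &old_nodes, &new_nodes) < 0)
 *		perror("migrate_pages");
 */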
1591
1592
1593/* Retrieve NUMA policy */
1594static int kernel_get_mempolicy(int __user *policy,
1595                                unsigned long __user *nmask,
1596                                unsigned long maxnode,
1597                                unsigned long addr,
1598                                unsigned long flags)
1599{
1600        int err;
1601        int uninitialized_var(pval);
1602        nodemask_t nodes;
1603
1604        addr = untagged_addr(addr);
1605
1606        if (nmask != NULL && maxnode < nr_node_ids)
1607                return -EINVAL;
1608
1609        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1610
1611        if (err)
1612                return err;
1613
1614        if (policy && put_user(pval, policy))
1615                return -EFAULT;
1616
1617        if (nmask)
1618                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1619
1620        return err;
1621}
1622
1623SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1624                unsigned long __user *, nmask, unsigned long, maxnode,
1625                unsigned long, addr, unsigned long, flags)
1626{
1627        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1628}
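
/*
 * Illustrative only: querying which policy governs a given address with the
 * get_mempolicy(2) wrapper from <numaif.h>.  MPOL_F_ADDR selects the VMA
 * policy at the address instead of the task policy; "addr" is assumed to
 * point into a valid mapping, and the nodemask is sized for up to 1024
 * nodes, which satisfies the maxnode >= nr_node_ids check above on common
 * configurations.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int mode;
 *	unsigned long nodemask[1024 / (8 * sizeof(unsigned long))] = { 0 };
 *
 *	if (get_mempolicy(&mode, nodemask, 1024, addr, MPOL_F_ADDR))
 *		perror("get_mempolicy");
 *	else
 *		printf("mode %d, first nodemask word %#lx\n",
 *		       mode, nodemask[0]);
 */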
1629
1630#ifdef CONFIG_COMPAT
1631
1632COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1633                       compat_ulong_t __user *, nmask,
1634                       compat_ulong_t, maxnode,
1635                       compat_ulong_t, addr, compat_ulong_t, flags)
1636{
1637        long err;
1638        unsigned long __user *nm = NULL;
1639        unsigned long nr_bits, alloc_size;
1640        DECLARE_BITMAP(bm, MAX_NUMNODES);
1641
1642        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1643        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1644
1645        if (nmask)
1646                nm = compat_alloc_user_space(alloc_size);
1647
1648        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1649
1650        if (!err && nmask) {
1651                unsigned long copy_size;
1652                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1653                err = copy_from_user(bm, nm, copy_size);
1654                /* ensure entire bitmap is zeroed */
1655                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1656                err |= compat_put_bitmap(nmask, bm, nr_bits);
1657        }
1658
1659        return err;
1660}
1661
1662COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1663                       compat_ulong_t, maxnode)
1664{
1665        unsigned long __user *nm = NULL;
1666        unsigned long nr_bits, alloc_size;
1667        DECLARE_BITMAP(bm, MAX_NUMNODES);
1668
1669        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1670        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1671
1672        if (nmask) {
1673                if (compat_get_bitmap(bm, nmask, nr_bits))
1674                        return -EFAULT;
1675                nm = compat_alloc_user_space(alloc_size);
1676                if (copy_to_user(nm, bm, alloc_size))
1677                        return -EFAULT;
1678        }
1679
1680        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1681}
1682
1683COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1684                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1685                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1686{
1687        unsigned long __user *nm = NULL;
1688        unsigned long nr_bits, alloc_size;
1689        nodemask_t bm;
1690
1691        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1692        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1693
1694        if (nmask) {
1695                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1696                        return -EFAULT;
1697                nm = compat_alloc_user_space(alloc_size);
1698                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1699                        return -EFAULT;
1700        }
1701
1702        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1703}
1704
1705COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1706                       compat_ulong_t, maxnode,
1707                       const compat_ulong_t __user *, old_nodes,
1708                       const compat_ulong_t __user *, new_nodes)
1709{
1710        unsigned long __user *old = NULL;
1711        unsigned long __user *new = NULL;
1712        nodemask_t tmp_mask;
1713        unsigned long nr_bits;
1714        unsigned long size;
1715
1716        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1717        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1718        if (old_nodes) {
1719                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1720                        return -EFAULT;
1721                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1722                if (new_nodes)
1723                        new = old + size / sizeof(unsigned long);
1724                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1725                        return -EFAULT;
1726        }
1727        if (new_nodes) {
1728                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1729                        return -EFAULT;
1730                if (new == NULL)
1731                        new = compat_alloc_user_space(size);
1732                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1733                        return -EFAULT;
1734        }
1735        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1736}
1737
1738#endif /* CONFIG_COMPAT */
1739
1740struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1741                                                unsigned long addr)
1742{
1743        struct mempolicy *pol = NULL;
1744
1745        if (vma) {
1746                if (vma->vm_ops && vma->vm_ops->get_policy) {
1747                        pol = vma->vm_ops->get_policy(vma, addr);
1748                } else if (vma->vm_policy) {
1749                        pol = vma->vm_policy;
1750
1751                        /*
1752                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1753                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1754                         * count on these policies which will be dropped by
1755                         * mpol_cond_put() later
1756                         */
1757                        if (mpol_needs_cond_ref(pol))
1758                                mpol_get(pol);
1759                }
1760        }
1761
1762        return pol;
1763}
1764
1765/*
1766 * get_vma_policy(@vma, @addr)
1767 * @vma: virtual memory area whose policy is sought
1768 * @addr: address in @vma for shared policy lookup
1769 *
1770 * Returns the effective policy for a VMA at the specified address.
1771 * Falls back to current->mempolicy or system default policy, as necessary.
1772 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1773 * count--added by the get_policy() vm_op, as appropriate--to protect against
1774 * freeing by another task.  It is the caller's responsibility to free the
1775 * extra reference for shared policies.
1776 */
1777static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1778                                                unsigned long addr)
1779{
1780        struct mempolicy *pol = __get_vma_policy(vma, addr);
1781
1782        if (!pol)
1783                pol = get_task_policy(current);
1784
1785        return pol;
1786}
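
/*
 * Typical usage pattern, sketched from the callers later in this file
 * (alloc_pages_vma() and mpol_misplaced()): look the policy up, consult it,
 * then drop the conditional reference that shared policies carry.  "vma" and
 * "addr" stand for whatever the caller is operating on.
 *
 *	struct mempolicy *pol = get_vma_policy(vma, addr);
 *
 *	... inspect pol->mode, pol->flags, pol->v ...
 *
 *	mpol_cond_put(pol);
 */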
1787
1788bool vma_policy_mof(struct vm_area_struct *vma)
1789{
1790        struct mempolicy *pol;
1791
1792        if (vma->vm_ops && vma->vm_ops->get_policy) {
1793                bool ret = false;
1794
1795                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1796                if (pol && (pol->flags & MPOL_F_MOF))
1797                        ret = true;
1798                mpol_cond_put(pol);
1799
1800                return ret;
1801        }
1802
1803        pol = vma->vm_policy;
1804        if (!pol)
1805                pol = get_task_policy(current);
1806
1807        return pol->flags & MPOL_F_MOF;
1808}
1809
1810static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1811{
1812        enum zone_type dynamic_policy_zone = policy_zone;
1813
1814        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1815
1816        /*
1817         * If policy->v.nodes contains movable memory only,
1818         * we apply the policy only when gfp_zone(gfp) is ZONE_MOVABLE.
1819         *
1820         * policy->v.nodes has already been intersected with
1821         * node_states[N_MEMORY], so if the following test fails it
1822         * implies that policy->v.nodes contains movable memory only.
1823         */
1824        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1825                dynamic_policy_zone = ZONE_MOVABLE;
1826
1827        return zone >= dynamic_policy_zone;
1828}
1829
1830/*
1831 * Return a nodemask representing a mempolicy for filtering nodes for
1832 * page allocation
1833 */
1834static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1835{
1836        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1837        if (unlikely(policy->mode == MPOL_BIND) &&
1838                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1839                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1840                return &policy->v.nodes;
1841
1842        return NULL;
1843}
1844
1845/* Return the node id preferred by the given mempolicy, or the given id */
1846static int policy_node(gfp_t gfp, struct mempolicy *policy,
1847                                                                int nd)
1848{
1849        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1850                nd = policy->v.preferred_node;
1851        else {
1852                /*
1853                 * __GFP_THISNODE shouldn't even be used with the bind policy
1854                 * because it would restrict the allocation to a single node,
1855                 * which may not even be in the bind nodemask, breaking the policy.
1856                 */
1857                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1858        }
1859
1860        return nd;
1861}
1862
1863/* Do dynamic interleaving for a process */
1864static unsigned interleave_nodes(struct mempolicy *policy)
1865{
1866        unsigned next;
1867        struct task_struct *me = current;
1868
1869        next = next_node_in(me->il_prev, policy->v.nodes);
1870        if (next < MAX_NUMNODES)
1871                me->il_prev = next;
1872        return next;
1873}
1874
1875/*
1876 * Depending on the memory policy provide a node from which to allocate the
1877 * next slab entry.
1878 */
1879unsigned int mempolicy_slab_node(void)
1880{
1881        struct mempolicy *policy;
1882        int node = numa_mem_id();
1883
1884        if (in_interrupt())
1885                return node;
1886
1887        policy = current->mempolicy;
1888        if (!policy || policy->flags & MPOL_F_LOCAL)
1889                return node;
1890
1891        switch (policy->mode) {
1892        case MPOL_PREFERRED:
1893                /*
1894                 * handled MPOL_F_LOCAL above
1895                 */
1896                return policy->v.preferred_node;
1897
1898        case MPOL_INTERLEAVE:
1899                return interleave_nodes(policy);
1900
1901        case MPOL_BIND: {
1902                struct zoneref *z;
1903
1904                /*
1905                 * Follow bind policy behavior and start allocation at the
1906                 * first node.
1907                 */
1908                struct zonelist *zonelist;
1909                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1910                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1911                z = first_zones_zonelist(zonelist, highest_zoneidx,
1912                                                        &policy->v.nodes);
1913                return z->zone ? zone_to_nid(z->zone) : node;
1914        }
1915
1916        default:
1917                BUG();
1918        }
1919}
1920
1921/*
1922 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1923 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1924 * number of present nodes.
1925 */
1926static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1927{
1928        unsigned nnodes = nodes_weight(pol->v.nodes);
1929        unsigned target;
1930        int i;
1931        int nid;
1932
1933        if (!nnodes)
1934                return numa_node_id();
1935        target = (unsigned int)n % nnodes;
1936        nid = first_node(pol->v.nodes);
1937        for (i = 0; i < target; i++)
1938                nid = next_node(nid, pol->v.nodes);
1939        return nid;
1940}
1941
1942/* Determine a node number for interleave */
1943static inline unsigned interleave_nid(struct mempolicy *pol,
1944                 struct vm_area_struct *vma, unsigned long addr, int shift)
1945{
1946        if (vma) {
1947                unsigned long off;
1948
1949                /*
1950                 * for small pages, there is no difference between
1951                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1952                 * for huge pages, since vm_pgoff is in units of small
1953                 * pages, we need to shift off the always 0 bits to get
1954                 * a useful offset.
1955                 */
1956                BUG_ON(shift < PAGE_SHIFT);
1957                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1958                off += (addr - vma->vm_start) >> shift;
1959                return offset_il_node(pol, off);
1960        } else
1961                return interleave_nodes(pol);
1962}
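
/*
 * Worked example with made-up numbers: for pol->v.nodes = { 0, 2, 3 }
 * (nnodes == 3) and a small-page VMA with vm_pgoff == 0, the page at offset
 * 7 into the mapping yields n == 7, target == 7 % 3 == 1, so
 * offset_il_node() returns the second node of the mask, node 2.  For a 2MB
 * huge page VMA, interleave_nid() first shifts vm_pgoff and the address down
 * by (shift - PAGE_SHIFT), so consecutive huge pages, rather than
 * consecutive small pages, rotate through the nodemask.
 */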
1963
1964#ifdef CONFIG_HUGETLBFS
1965/*
1966 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1967 * @vma: virtual memory area whose policy is sought
1968 * @addr: address in @vma for shared policy lookup and interleave policy
1969 * @gfp_flags: for requested zone
1970 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1971 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1972 *
1973 * Returns a nid suitable for a huge page allocation and a pointer
1974 * to the struct mempolicy for conditional unref after allocation.
1975 * If the effective policy is 'bind', returns a pointer to the mempolicy's
1976 * @nodemask for filtering the zonelist.
1977 *
1978 * Must be protected by read_mems_allowed_begin()
1979 */
1980int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1981                                struct mempolicy **mpol, nodemask_t **nodemask)
1982{
1983        int nid;
1984
1985        *mpol = get_vma_policy(vma, addr);
1986        *nodemask = NULL;       /* assume !MPOL_BIND */
1987
1988        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1989                nid = interleave_nid(*mpol, vma, addr,
1990                                        huge_page_shift(hstate_vma(vma)));
1991        } else {
1992                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1993                if ((*mpol)->mode == MPOL_BIND)
1994                        *nodemask = &(*mpol)->v.nodes;
1995        }
1996        return nid;
1997}
1998
1999/*
2000 * init_nodemask_of_mempolicy
2001 *
2002 * If the current task's mempolicy is "default" [NULL], return 'false'
2003 * to indicate default policy.  Otherwise, extract the policy nodemask
2004 * for 'bind' or 'interleave' policy into the argument nodemask, or
2005 * initialize the argument nodemask to contain the single node for
2006 * 'preferred' or 'local' policy and return 'true' to indicate presence
2007 * of non-default mempolicy.
2008 *
2009 * We don't bother with reference counting the mempolicy [mpol_get/put]
2010 * because the current task is examining its own mempolicy and a task's
2011 * mempolicy is only ever changed by the task itself.
2012 *
2013 * N.B., it is the caller's responsibility to free a returned nodemask.
2014 */
2015bool init_nodemask_of_mempolicy(nodemask_t *mask)
2016{
2017        struct mempolicy *mempolicy;
2018        int nid;
2019
2020        if (!(mask && current->mempolicy))
2021                return false;
2022
2023        task_lock(current);
2024        mempolicy = current->mempolicy;
2025        switch (mempolicy->mode) {
2026        case MPOL_PREFERRED:
2027                if (mempolicy->flags & MPOL_F_LOCAL)
2028                        nid = numa_node_id();
2029                else
2030                        nid = mempolicy->v.preferred_node;
2031                init_nodemask_of_node(mask, nid);
2032                break;
2033
2034        case MPOL_BIND:
2035                /* Fall through */
2036        case MPOL_INTERLEAVE:
2037                *mask = mempolicy->v.nodes;
2038                break;
2039
2040        default:
2041                BUG();
2042        }
2043        task_unlock(current);
2044
2045        return true;
2046}
2047#endif
2048
2049/*
2050 * mempolicy_nodemask_intersects
2051 *
2052 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2053 * policy.  Otherwise, check for intersection between mask and the policy
2054 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2055 * policy, always return true since it may allocate elsewhere on fallback.
2056 *
2057 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2058 */
2059bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2060                                        const nodemask_t *mask)
2061{
2062        struct mempolicy *mempolicy;
2063        bool ret = true;
2064
2065        if (!mask)
2066                return ret;
2067        task_lock(tsk);
2068        mempolicy = tsk->mempolicy;
2069        if (!mempolicy)
2070                goto out;
2071
2072        switch (mempolicy->mode) {
2073        case MPOL_PREFERRED:
2074                /*
2075                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
2076                 * allocate from; the task may fall back to other nodes under OOM.
2077                 * Thus, it's possible for tsk to have allocated memory from
2078                 * nodes in mask.
2079                 */
2080                break;
2081        case MPOL_BIND:
2082        case MPOL_INTERLEAVE:
2083                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2084                break;
2085        default:
2086                BUG();
2087        }
2088out:
2089        task_unlock(tsk);
2090        return ret;
2091}
2092
2093/* Allocate a page under the interleave policy.  This has its own path
2094   because it needs to do special accounting. */
2095static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2096                                        unsigned nid)
2097{
2098        struct page *page;
2099
2100        page = __alloc_pages(gfp, order, nid);
2101        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2102        if (!static_branch_likely(&vm_numa_stat_key))
2103                return page;
2104        if (page && page_to_nid(page) == nid) {
2105                preempt_disable();
2106                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2107                preempt_enable();
2108        }
2109        return page;
2110}
2111
2112/**
2113 *      alloc_pages_vma - Allocate a page for a VMA.
2114 *
2115 *      @gfp:
2116 *      %GFP_USER    user allocation.
2117 *      %GFP_KERNEL  kernel allocations,
2118 *      %GFP_HIGHMEM highmem/user allocations,
2119 *      %GFP_FS      allocation should not call back into a file system.
2120 *      %GFP_ATOMIC  don't sleep.
2121 *
2122 *      @order: Order of the GFP allocation.
2123 *      @vma:  Pointer to VMA or NULL if not available.
2124 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2125 *      @node: Which node to prefer for allocation (modulo policy).
2126 *      @hugepage: for hugepages try only the preferred node if possible
2127 *
2128 *      This function allocates a page from the kernel page pool and applies
2129 *      a NUMA policy associated with the VMA or the current process.
2130 *      When VMA is not NULL, the caller must hold down_read on the mmap_sem of
2131 *      the VMA's mm_struct to prevent it from going away. Should be used for
2132 *      all allocations for pages that will be mapped into user space. Returns
2133 *      NULL when no page can be allocated.
2134 */
2135struct page *
2136alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2137                unsigned long addr, int node, bool hugepage)
2138{
2139        struct mempolicy *pol;
2140        struct page *page;
2141        int preferred_nid;
2142        nodemask_t *nmask;
2143
2144        pol = get_vma_policy(vma, addr);
2145
2146        if (pol->mode == MPOL_INTERLEAVE) {
2147                unsigned nid;
2148
2149                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2150                mpol_cond_put(pol);
2151                page = alloc_page_interleave(gfp, order, nid);
2152                goto out;
2153        }
2154
2155        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2156                int hpage_node = node;
2157
2158                /*
2159                 * For hugepage allocations with a non-interleave policy that
2160                 * allows the current node (or another explicitly preferred
2161                 * node), we only try to allocate from the current/preferred
2162                 * node and don't fall back to other nodes, as the cost of
2163                 * remote accesses would likely offset THP benefits.
2164                 *
2165                 * If the policy is interleave, or does not allow the current
2166                 * node in its nodemask, we allocate the standard way.
2167                 */
2168                if (pol->mode == MPOL_PREFERRED &&
2169                                                !(pol->flags & MPOL_F_LOCAL))
2170                        hpage_node = pol->v.preferred_node;
2171
2172                nmask = policy_nodemask(gfp, pol);
2173                if (!nmask || node_isset(hpage_node, *nmask)) {
2174                        mpol_cond_put(pol);
2175                        /*
2176                         * We cannot invoke reclaim if __GFP_THISNODE
2177                         * is set. Invoking reclaim with
2178                         * __GFP_THISNODE set would cause THP
2179                         * allocations to trigger heavy swapping even
2180                         * though there may be tons of free memory
2181                         * (potentially including plenty of THP
2182                         * already available in the buddy allocator)
2183                         * on all the other NUMA nodes.
2184                         *
2185                         * At most we could invoke compaction when
2186                         * __GFP_THISNODE is set (but we would need to
2187                         * refrain from invoking reclaim even if
2188                         * compaction returned COMPACT_SKIPPED because
2189                         * there wasn't enough memory for compaction
2190                         * to succeed). For now just avoid
2191                         * __GFP_THISNODE instead of limiting the
2192                         * allocation path to a single, strict
2193                         * compaction invocation.
2194                         *
2195                         * Presumably, if the caller enabled direct
2196                         * reclaim, the application prefers THP
2197                         * regardless of which node it comes from, so
2198                         * this is more desirable behavior than only
2199                         * providing THP originating from the local
2200                         * node in that case.
2201                         */
2202                        if (!(gfp & __GFP_DIRECT_RECLAIM))
2203                                gfp |= __GFP_THISNODE;
2204                        page = __alloc_pages_node(hpage_node, gfp, order);
2205                        goto out;
2206                }
2207        }
2208
2209        nmask = policy_nodemask(gfp, pol);
2210        preferred_nid = policy_node(gfp, pol, node);
2211        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2212        mpol_cond_put(pol);
2213out:
2214        return page;
2215}
2216EXPORT_SYMBOL(alloc_pages_vma);
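
/*
 * Sketch of a typical call site; the real callers live in the fault handlers
 * (mm/memory.c) and in shmem/hugetlb code, and the names below are
 * illustrative.  The caller holds mmap_sem for read and passes the faulting
 * VMA and address:
 *
 *	struct page *page;
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			       numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */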
2217
2218/**
2219 *      alloc_pages_current - Allocate pages.
2220 *
2221 *      @gfp:
2222 *              %GFP_USER   user allocation,
2223 *              %GFP_KERNEL kernel allocation,
2224 *              %GFP_HIGHMEM highmem allocation,
2225 *              %GFP_FS     don't call back into a file system.
2226 *              %GFP_ATOMIC don't sleep.
2227 *      @order: Power of two of allocation size in pages. 0 is a single page.
2228 *
2229 *      Allocate a page from the kernel page pool.  When not in
2230 *      interrupt context, apply the current process's NUMA policy.
2231 *      Returns NULL when no page can be allocated.
2232 */
2233struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2234{
2235        struct mempolicy *pol = &default_policy;
2236        struct page *page;
2237
2238        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2239                pol = get_task_policy(current);
2240
2241        /*
2242         * No reference counting needed for current->mempolicy
2243         * nor system default_policy
2244         */
2245        if (pol->mode == MPOL_INTERLEAVE)
2246                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2247        else
2248                page = __alloc_pages_nodemask(gfp, order,
2249                                policy_node(gfp, pol, numa_node_id()),
2250                                policy_nodemask(gfp, pol));
2251
2252        return page;
2253}
2254EXPORT_SYMBOL(alloc_pages_current);
2255
2256int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2257{
2258        struct mempolicy *pol = mpol_dup(vma_policy(src));
2259
2260        if (IS_ERR(pol))
2261                return PTR_ERR(pol);
2262        dst->vm_policy = pol;
2263        return 0;
2264}
2265
2266/*
2267 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2268 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2269 * with the mems_allowed returned by cpuset_mems_allowed().  This
2270 * keeps mempolicies cpuset relative after its cpuset moves.  See
2271 * further kernel/cpuset.c update_nodemask().
2272 *
2273 * current's mempolicy may be rebound by another task (the one that changes the
2274 * cpuset's mems), so we needn't do any rebind work for the current task.
2275 */
2276
2277/* Slow path of a mempolicy duplicate */
2278struct mempolicy *__mpol_dup(struct mempolicy *old)
2279{
2280        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2281
2282        if (!new)
2283                return ERR_PTR(-ENOMEM);
2284
2285        /* task's mempolicy is protected by alloc_lock */
2286        if (old == current->mempolicy) {
2287                task_lock(current);
2288                *new = *old;
2289                task_unlock(current);
2290        } else
2291                *new = *old;
2292
2293        if (current_cpuset_is_being_rebound()) {
2294                nodemask_t mems = cpuset_mems_allowed(current);
2295                mpol_rebind_policy(new, &mems);
2296        }
2297        atomic_set(&new->refcnt, 1);
2298        return new;
2299}
2300
2301/* Slow path of a mempolicy comparison */
2302bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2303{
2304        if (!a || !b)
2305                return false;
2306        if (a->mode != b->mode)
2307                return false;
2308        if (a->flags != b->flags)
2309                return false;
2310        if (mpol_store_user_nodemask(a))
2311                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2312                        return false;
2313
2314        switch (a->mode) {
2315        case MPOL_BIND:
2316                /* Fall through */
2317        case MPOL_INTERLEAVE:
2318                return !!nodes_equal(a->v.nodes, b->v.nodes);
2319        case MPOL_PREFERRED:
2320                /* a's ->flags is the same as b's */
2321                if (a->flags & MPOL_F_LOCAL)
2322                        return true;
2323                return a->v.preferred_node == b->v.preferred_node;
2324        default:
2325                BUG();
2326                return false;
2327        }
2328}
2329
2330/*
2331 * Shared memory backing store policy support.
2332 *
2333 * Remember policies even when nobody has shared memory mapped.
2334 * The policies are kept in Red-Black tree linked from the inode.
2335 * They are protected by the sp->lock rwlock, which should be held
2336 * for any accesses to the tree.
2337 */
2338
2339/*
2340 * Look up the first element intersecting start-end.  Caller holds sp->lock for
2341 * reading or for writing
2342 */
2343static struct sp_node *
2344sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2345{
2346        struct rb_node *n = sp->root.rb_node;
2347
2348        while (n) {
2349                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2350
2351                if (start >= p->end)
2352                        n = n->rb_right;
2353                else if (end <= p->start)
2354                        n = n->rb_left;
2355                else
2356                        break;
2357        }
2358        if (!n)
2359                return NULL;
2360        for (;;) {
2361                struct sp_node *w = NULL;
2362                struct rb_node *prev = rb_prev(n);
2363                if (!prev)
2364                        break;
2365                w = rb_entry(prev, struct sp_node, nd);
2366                if (w->end <= start)
2367                        break;
2368                n = prev;
2369        }
2370        return rb_entry(n, struct sp_node, nd);
2371}
2372
2373/*
2374 * Insert a new shared policy into the tree.  Caller holds sp->lock for
2375 * writing.
2376 */
2377static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2378{
2379        struct rb_node **p = &sp->root.rb_node;
2380        struct rb_node *parent = NULL;
2381        struct sp_node *nd;
2382
2383        while (*p) {
2384                parent = *p;
2385                nd = rb_entry(parent, struct sp_node, nd);
2386                if (new->start < nd->start)
2387                        p = &(*p)->rb_left;
2388                else if (new->end > nd->end)
2389                        p = &(*p)->rb_right;
2390                else
2391                        BUG();
2392        }
2393        rb_link_node(&new->nd, parent, p);
2394        rb_insert_color(&new->nd, &sp->root);
2395        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2396                 new->policy ? new->policy->mode : 0);
2397}
2398
2399/* Find shared policy intersecting idx */
2400struct mempolicy *
2401mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2402{
2403        struct mempolicy *pol = NULL;
2404        struct sp_node *sn;
2405
2406        if (!sp->root.rb_node)
2407                return NULL;
2408        read_lock(&sp->lock);
2409        sn = sp_lookup(sp, idx, idx+1);
2410        if (sn) {
2411                mpol_get(sn->policy);
2412                pol = sn->policy;
2413        }
2414        read_unlock(&sp->lock);
2415        return pol;
2416}
2417
2418static void sp_free(struct sp_node *n)
2419{
2420        mpol_put(n->policy);
2421        kmem_cache_free(sn_cache, n);
2422}
2423
2424/**
2425 * mpol_misplaced - check whether current page node is valid in policy
2426 *
2427 * @page: page to be checked
2428 * @vma: vm area where page mapped
2429 * @addr: virtual address where page mapped
2430 *
2431 * Look up the current policy node id for vma,addr and compare it to the
2432 * page's node id.
2433 *
2434 * Returns:
2435 *      -1      - not misplaced, page is in the right node
2436 *      node    - node id where the page should be
2437 *
2438 * Policy determination "mimics" alloc_page_vma().
2439 * Called from fault path where we know the vma and faulting address.
2440 */
2441int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2442{
2443        struct mempolicy *pol;
2444        struct zoneref *z;
2445        int curnid = page_to_nid(page);
2446        unsigned long pgoff;
2447        int thiscpu = raw_smp_processor_id();
2448        int thisnid = cpu_to_node(thiscpu);
2449        int polnid = NUMA_NO_NODE;
2450        int ret = -1;
2451
2452        pol = get_vma_policy(vma, addr);
2453        if (!(pol->flags & MPOL_F_MOF))
2454                goto out;
2455
2456        switch (pol->mode) {
2457        case MPOL_INTERLEAVE:
2458                pgoff = vma->vm_pgoff;
2459                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2460                polnid = offset_il_node(pol, pgoff);
2461                break;
2462
2463        case MPOL_PREFERRED:
2464                if (pol->flags & MPOL_F_LOCAL)
2465                        polnid = numa_node_id();
2466                else
2467                        polnid = pol->v.preferred_node;
2468                break;
2469
2470        case MPOL_BIND:
2471
2472                /*
2473                 * MPOL_BIND allows binding to multiple nodes.
2474                 * Use the current page's node if it is in the policy nodemask,
2475                 * else select the nearest allowed node, if any.
2476                 * If there are no allowed nodes, use the current node [!misplaced].
2477                 */
2478                if (node_isset(curnid, pol->v.nodes))
2479                        goto out;
2480                z = first_zones_zonelist(
2481                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2482                                gfp_zone(GFP_HIGHUSER),
2483                                &pol->v.nodes);
2484                polnid = zone_to_nid(z->zone);
2485                break;
2486
2487        default:
2488                BUG();
2489        }
2490
2491        /* Migrate the page towards the node whose CPU is referencing it */
2492        if (pol->flags & MPOL_F_MORON) {
2493                polnid = thisnid;
2494
2495                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2496                        goto out;
2497        }
2498
2499        if (curnid != polnid)
2500                ret = polnid;
2501out:
2502        mpol_cond_put(pol);
2503
2504        return ret;
2505}
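
/*
 * Sketch of how the NUMA hinting fault path consumes the result (the real
 * callers sit behind do_numa_page() in mm/memory.c); the migration step is
 * elided:
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid != -1)
 *		... try to migrate the page to target_nid ...
 */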
2506
2507/*
2508 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2509 * dropped after task->mempolicy is set to NULL so that any allocation done as
2510 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2511 * policy.
2512 */
2513void mpol_put_task_policy(struct task_struct *task)
2514{
2515        struct mempolicy *pol;
2516
2517        task_lock(task);
2518        pol = task->mempolicy;
2519        task->mempolicy = NULL;
2520        task_unlock(task);
2521        mpol_put(pol);
2522}
2523
2524static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2525{
2526        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2527        rb_erase(&n->nd, &sp->root);
2528        sp_free(n);
2529}
2530
2531static void sp_node_init(struct sp_node *node, unsigned long start,
2532                        unsigned long end, struct mempolicy *pol)
2533{
2534        node->start = start;
2535        node->end = end;
2536        node->policy = pol;
2537}
2538
2539static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2540                                struct mempolicy *pol)
2541{
2542        struct sp_node *n;
2543        struct mempolicy *newpol;
2544
2545        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2546        if (!n)
2547                return NULL;
2548
2549        newpol = mpol_dup(pol);
2550        if (IS_ERR(newpol)) {
2551                kmem_cache_free(sn_cache, n);
2552                return NULL;
2553        }
2554        newpol->flags |= MPOL_F_SHARED;
2555        sp_node_init(n, start, end, newpol);
2556
2557        return n;
2558}
2559
2560/* Replace a policy range. */
2561static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2562                                 unsigned long end, struct sp_node *new)
2563{
2564        struct sp_node *n;
2565        struct sp_node *n_new = NULL;
2566        struct mempolicy *mpol_new = NULL;
2567        int ret = 0;
2568
2569restart:
2570        write_lock(&sp->lock);
2571        n = sp_lookup(sp, start, end);
2572        /* Take care of old policies in the same range. */
2573        while (n && n->start < end) {
2574                struct rb_node *next = rb_next(&n->nd);
2575                if (n->start >= start) {
2576                        if (n->end <= end)
2577                                sp_delete(sp, n);
2578                        else
2579                                n->start = end;
2580                } else {
2581                        /* Old policy spanning whole new range. */
2582                        if (n->end > end) {
2583                                if (!n_new)
2584                                        goto alloc_new;
2585
2586                                *mpol_new = *n->policy;
2587                                atomic_set(&mpol_new->refcnt, 1);
2588                                sp_node_init(n_new, end, n->end, mpol_new);
2589                                n->end = start;
2590                                sp_insert(sp, n_new);
2591                                n_new = NULL;
2592                                mpol_new = NULL;
2593                                break;
2594                        } else
2595                                n->end = start;
2596                }
2597                if (!next)
2598                        break;
2599                n = rb_entry(next, struct sp_node, nd);
2600        }
2601        if (new)
2602                sp_insert(sp, new);
2603        write_unlock(&sp->lock);
2604        ret = 0;
2605
2606err_out:
2607        if (mpol_new)
2608                mpol_put(mpol_new);
2609        if (n_new)
2610                kmem_cache_free(sn_cache, n_new);
2611
2612        return ret;
2613
2614alloc_new:
2615        write_unlock(&sp->lock);
2616        ret = -ENOMEM;
2617        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2618        if (!n_new)
2619                goto err_out;
2620        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2621        if (!mpol_new)
2622                goto err_out;
2623        goto restart;
2624}
2625
2626/**
2627 * mpol_shared_policy_init - initialize shared policy for inode
2628 * @sp: pointer to inode shared policy
2629 * @mpol:  struct mempolicy to install
2630 *
2631 * Install non-NULL @mpol in inode's shared policy rb-tree.
2632 * On entry, the current task has a reference on a non-NULL @mpol.
2633 * This must be released on exit.
2634 * This is called from the get_inode() path, so we can use GFP_KERNEL.
2635 */
2636void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2637{
2638        int ret;
2639
2640        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2641        rwlock_init(&sp->lock);
2642
2643        if (mpol) {
2644                struct vm_area_struct pvma;
2645                struct mempolicy *new;
2646                NODEMASK_SCRATCH(scratch);
2647
2648                if (!scratch)
2649                        goto put_mpol;
2650                /* contextualize the tmpfs mount point mempolicy */
2651                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2652                if (IS_ERR(new))
2653                        goto free_scratch; /* no valid nodemask intersection */
2654
2655                task_lock(current);
2656                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2657                task_unlock(current);
2658                if (ret)
2659                        goto put_new;
2660
2661                /* Create pseudo-vma that contains just the policy */
2662                memset(&pvma, 0, sizeof(struct vm_area_struct));
2663                vma_init(&pvma, NULL);
2664                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2665                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2666
2667put_new:
2668                mpol_put(new);                  /* drop initial ref */
2669free_scratch:
2670                NODEMASK_SCRATCH_FREE(scratch);
2671put_mpol:
2672                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2673        }
2674}
2675
2676int mpol_set_shared_policy(struct shared_policy *info,
2677                        struct vm_area_struct *vma, struct mempolicy *npol)
2678{
2679        int err;
2680        struct sp_node *new = NULL;
2681        unsigned long sz = vma_pages(vma);
2682
2683        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2684                 vma->vm_pgoff,
2685                 sz, npol ? npol->mode : -1,
2686                 npol ? npol->flags : -1,
2687                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2688
2689        if (npol) {
2690                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2691                if (!new)
2692                        return -ENOMEM;
2693        }
2694        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2695        if (err && new)
2696                sp_free(new);
2697        return err;
2698}
2699
2700/* Free a backing policy store on inode delete. */
2701void mpol_free_shared_policy(struct shared_policy *p)
2702{
2703        struct sp_node *n;
2704        struct rb_node *next;
2705
2706        if (!p->root.rb_node)
2707                return;
2708        write_lock(&p->lock);
2709        next = rb_first(&p->root);
2710        while (next) {
2711                n = rb_entry(next, struct sp_node, nd);
2712                next = rb_next(&n->nd);
2713                sp_delete(p, n);
2714        }
2715        write_unlock(&p->lock);
2716}
2717
2718#ifdef CONFIG_NUMA_BALANCING
2719static int __initdata numabalancing_override;
2720
2721static void __init check_numabalancing_enable(void)
2722{
2723        bool numabalancing_default = false;
2724
2725        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2726                numabalancing_default = true;
2727
2728        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2729        if (numabalancing_override)
2730                set_numabalancing_state(numabalancing_override == 1);
2731
2732        if (num_online_nodes() > 1 && !numabalancing_override) {
2733                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2734                        numabalancing_default ? "Enabling" : "Disabling");
2735                set_numabalancing_state(numabalancing_default);
2736        }
2737}
2738
2739static int __init setup_numabalancing(char *str)
2740{
2741        int ret = 0;
2742        if (!str)
2743                goto out;
2744
2745        if (!strcmp(str, "enable")) {
2746                numabalancing_override = 1;
2747                ret = 1;
2748        } else if (!strcmp(str, "disable")) {
2749                numabalancing_override = -1;
2750                ret = 1;
2751        }
2752out:
2753        if (!ret)
2754                pr_warn("Unable to parse numa_balancing=\n");
2755
2756        return ret;
2757}
2758__setup("numa_balancing=", setup_numabalancing);
2759#else
2760static inline void __init check_numabalancing_enable(void)
2761{
2762}
2763#endif /* CONFIG_NUMA_BALANCING */
2764
2765/* assumes fs == KERNEL_DS */
2766void __init numa_policy_init(void)
2767{
2768        nodemask_t interleave_nodes;
2769        unsigned long largest = 0;
2770        int nid, prefer = 0;
2771
2772        policy_cache = kmem_cache_create("numa_policy",
2773                                         sizeof(struct mempolicy),
2774                                         0, SLAB_PANIC, NULL);
2775
2776        sn_cache = kmem_cache_create("shared_policy_node",
2777                                     sizeof(struct sp_node),
2778                                     0, SLAB_PANIC, NULL);
2779
2780        for_each_node(nid) {
2781                preferred_node_policy[nid] = (struct mempolicy) {
2782                        .refcnt = ATOMIC_INIT(1),
2783                        .mode = MPOL_PREFERRED,
2784                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2785                        .v = { .preferred_node = nid, },
2786                };
2787        }
2788
2789        /*
2790         * Set interleaving policy for system init. Interleaving is only
2791         * enabled across suitably sized nodes (default is >= 16MB); otherwise
2792         * we fall back to the largest node if they're all smaller.
2793         */
2794        nodes_clear(interleave_nodes);
2795        for_each_node_state(nid, N_MEMORY) {
2796                unsigned long total_pages = node_present_pages(nid);
2797
2798                /* Preserve the largest node */
2799                if (largest < total_pages) {
2800                        largest = total_pages;
2801                        prefer = nid;
2802                }
2803
2804                /* Interleave this node? */
2805                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2806                        node_set(nid, interleave_nodes);
2807        }
2808
2809        /* All too small, use the largest */
2810        if (unlikely(nodes_empty(interleave_nodes)))
2811                node_set(prefer, interleave_nodes);
2812
2813        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2814                pr_err("%s: interleaving failed\n", __func__);
2815
2816        check_numabalancing_enable();
2817}
2818
2819/* Reset policy of current process to default */
2820void numa_default_policy(void)
2821{
2822        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2823}
2824
2825/*
2826 * Parse and format mempolicy from/to strings
2827 */
2828
2829/*
2830 * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
2831 */
2832static const char * const policy_modes[] =
2833{
2834        [MPOL_DEFAULT]    = "default",
2835        [MPOL_PREFERRED]  = "prefer",
2836        [MPOL_BIND]       = "bind",
2837        [MPOL_INTERLEAVE] = "interleave",
2838        [MPOL_LOCAL]      = "local",
2839};
2840
2841
2842#ifdef CONFIG_TMPFS
2843/**
2844 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2845 * @str:  string containing mempolicy to parse
2846 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2847 *
2848 * Format of input:
2849 *      <mode>[=<flags>][:<nodelist>]
2850 *
2851 * On success, returns 0, else 1
2852 */
2853int mpol_parse_str(char *str, struct mempolicy **mpol)
2854{
2855        struct mempolicy *new = NULL;
2856        unsigned short mode;
2857        unsigned short mode_flags;
2858        nodemask_t nodes;
2859        char *nodelist = strchr(str, ':');
2860        char *flags = strchr(str, '=');
2861        int err = 1;
2862
2863        if (flags)
2864                *flags++ = '\0';        /* terminate mode string */
2865
2866        if (nodelist) {
2867                /* NUL-terminate mode or flags string */
2868                *nodelist++ = '\0';
2869                if (nodelist_parse(nodelist, nodes))
2870                        goto out;
2871                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2872                        goto out;
2873        } else
2874                nodes_clear(nodes);
2875
2876        for (mode = 0; mode < MPOL_MAX; mode++) {
2877                if (!strcmp(str, policy_modes[mode])) {
2878                        break;
2879                }
2880        }
2881        if (mode >= MPOL_MAX)
2882                goto out;
2883
2884        switch (mode) {
2885        case MPOL_PREFERRED:
2886                /*
2887                 * Insist on a nodelist of one node only: later we use
2888                 * first_node(nodes) to grab a single node, so the nodelist
2889                 * (and thus nodes) cannot be empty here.
2890                 */
2891                if (nodelist) {
2892                        char *rest = nodelist;
2893                        while (isdigit(*rest))
2894                                rest++;
2895                        if (*rest)
2896                                goto out;
2897                        if (nodes_empty(nodes))
2898                                goto out;
2899                }
2900                break;
2901        case MPOL_INTERLEAVE:
2902                /*
2903                 * Default to online nodes with memory if no nodelist
2904                 */
2905                if (!nodelist)
2906                        nodes = node_states[N_MEMORY];
2907                break;
2908        case MPOL_LOCAL:
2909                /*
2910                 * Don't allow a nodelist;  mpol_new() checks flags
2911                 */
2912                if (nodelist)
2913                        goto out;
2914                mode = MPOL_PREFERRED;
2915                break;
2916        case MPOL_DEFAULT:
2917                /*
2918                 * Insist on an empty nodelist
2919                 */
2920                if (!nodelist)
2921                        err = 0;
2922                goto out;
2923        case MPOL_BIND:
2924                /*
2925                 * Insist on a nodelist
2926                 */
2927                if (!nodelist)
2928                        goto out;
2929        }
2930
2931        mode_flags = 0;
2932        if (flags) {
2933                /*
2934                 * Currently, we only support two mutually exclusive
2935                 * mode flags.
2936                 */
2937                if (!strcmp(flags, "static"))
2938                        mode_flags |= MPOL_F_STATIC_NODES;
2939                else if (!strcmp(flags, "relative"))
2940                        mode_flags |= MPOL_F_RELATIVE_NODES;
2941                else
2942                        goto out;
2943        }
2944
2945        new = mpol_new(mode, mode_flags, &nodes);
2946        if (IS_ERR(new))
2947                goto out;
2948
2949        /*
2950         * Save nodes for mpol_to_str() to show the tmpfs mount options
2951         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2952         */
2953        if (mode != MPOL_PREFERRED)
2954                new->v.nodes = nodes;
2955        else if (nodelist)
2956                new->v.preferred_node = first_node(nodes);
2957        else
2958                new->flags |= MPOL_F_LOCAL;
2959
2960        /*
2961         * Save nodes for contextualization: this will be used to "clone"
2962         * the mempolicy in a specific context [cpuset] at a later time.
2963         */
2964        new->w.user_nodemask = nodes;
2965
2966        err = 0;
2967
2968out:
2969        /* Restore string for error message */
2970        if (nodelist)
2971                *--nodelist = ':';
2972        if (flags)
2973                *--flags = '=';
2974        if (!err)
2975                *mpol = new;
2976        return err;
2977}
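
/*
 * Illustrative only: strings this parser accepts, as seen in tmpfs "mpol="
 * mount options, include "bind:0-3", "interleave=relative:0,2",
 * "prefer=static:1", "local" and "default".  A hedged sketch of a caller
 * (tmpfs does the equivalent when parsing mount options in mm/shmem.c); the
 * buffer must be writable because the parser temporarily NUL-terminates the
 * mode and flags substrings:
 *
 *	char arg[] = "interleave:0-3";
 *	struct mempolicy *mpol;
 *
 *	if (!mpol_parse_str(arg, &mpol)) {
 *		... use mpol, e.g. install it as the mount's default ...
 *		mpol_put(mpol);
 *	}
 */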
2978#endif /* CONFIG_TMPFS */
2979
2980/**
2981 * mpol_to_str - format a mempolicy structure for printing
2982 * @buffer:  to contain formatted mempolicy string
2983 * @maxlen:  length of @buffer
2984 * @pol:  pointer to mempolicy to be formatted
2985 *
2986 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2987 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2988 * longest flag, "relative", and to display at least a few node ids.
2989 */
2990void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2991{
2992        char *p = buffer;
2993        nodemask_t nodes = NODE_MASK_NONE;
2994        unsigned short mode = MPOL_DEFAULT;
2995        unsigned short flags = 0;
2996
2997        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2998                mode = pol->mode;
2999                flags = pol->flags;
3000        }
3001
3002        switch (mode) {
3003        case MPOL_DEFAULT:
3004                break;
3005        case MPOL_PREFERRED:
3006                if (flags & MPOL_F_LOCAL)
3007                        mode = MPOL_LOCAL;
3008                else
3009                        node_set(pol->v.preferred_node, nodes);
3010                break;
3011        case MPOL_BIND:
3012        case MPOL_INTERLEAVE:
3013                nodes = pol->v.nodes;
3014                break;
3015        default:
3016                WARN_ON_ONCE(1);
3017                snprintf(p, maxlen, "unknown");
3018                return;
3019        }
3020
3021        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3022
3023        if (flags & MPOL_MODE_FLAGS) {
3024                p += snprintf(p, buffer + maxlen - p, "=");
3025
3026                /*
3027                 * Currently, the only defined flags are mutually exclusive
3028                 */
3029                if (flags & MPOL_F_STATIC_NODES)
3030                        p += snprintf(p, buffer + maxlen - p, "static");
3031                else if (flags & MPOL_F_RELATIVE_NODES)
3032                        p += snprintf(p, buffer + maxlen - p, "relative");
3033        }
3034
3035        if (!nodes_empty(nodes))
3036                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3037                               nodemask_pr_args(&nodes));
3038}
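
/*
 * Illustrative only: the output mirrors what mpol_parse_str() accepts, e.g.
 * "interleave:0-3", "prefer=static:1", "bind=relative:0,2", "local" or
 * "default".  A minimal sketch of a caller (procfs does the equivalent when
 * showing /proc/<pid>/numa_maps and tmpfs mount options):
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *	seq_printf(m, "mpol=%s", buf);
 */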
3039