linux/mm/mempolicy.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process
   20 *                interleave counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
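
/*
 * Illustrative sketch (not part of the kernel build): from userspace these
 * policies are normally selected through the <numaif.h> wrappers around the
 * set_mempolicy(2) and mbind(2) syscalls, e.g. interleaving over nodes 0-1:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *	mbind(addr, len, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_MOVE);
 *
 * addr, len and mask above are caller-supplied values, not kernel symbols;
 * the maxnode argument is interpreted by get_nodes() below.
 */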
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
  115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130/**
 131 * numa_map_to_online_node - Find closest online node
 132 * @node: Node id to start the search
 133 *
  134 * Lookup the next closest node by distance if @node is not online.
 135 */
 136int numa_map_to_online_node(int node)
 137{
 138        int min_dist = INT_MAX, dist, n, min_node;
 139
 140        if (node == NUMA_NO_NODE || node_online(node))
 141                return node;
 142
 143        min_node = node;
 144        for_each_online_node(n) {
 145                dist = node_distance(node, n);
 146                if (dist < min_dist) {
 147                        min_dist = dist;
 148                        min_node = n;
 149                }
 150        }
 151
 152        return min_node;
 153}
 154EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 155
 156struct mempolicy *get_task_policy(struct task_struct *p)
 157{
 158        struct mempolicy *pol = p->mempolicy;
 159        int node;
 160
 161        if (pol)
 162                return pol;
 163
 164        node = numa_node_id();
 165        if (node != NUMA_NO_NODE) {
 166                pol = &preferred_node_policy[node];
 167                /* preferred_node_policy is not initialised early in boot */
 168                if (pol->mode)
 169                        return pol;
 170        }
 171
 172        return &default_policy;
 173}
 174
 175static const struct mempolicy_operations {
 176        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 177        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 178} mpol_ops[MPOL_MAX];
 179
 180static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 181{
 182        return pol->flags & MPOL_MODE_FLAGS;
 183}
 184
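/*
 * Map a "relative" nodemask onto the actually allowed nodes: fold @orig
 * modulo the number of bits set in @rel, then translate each remaining bit
 * onto the corresponding set bit of @rel.  Illustrative example:
 * orig = {0,2} with rel = {1,3,5} yields {1,5}.
 */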
 185static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 186                                   const nodemask_t *rel)
 187{
 188        nodemask_t tmp;
 189        nodes_fold(tmp, *orig, nodes_weight(*rel));
 190        nodes_onto(*ret, tmp, *rel);
 191}
 192
 193static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 194{
 195        if (nodes_empty(*nodes))
 196                return -EINVAL;
 197        pol->v.nodes = *nodes;
 198        return 0;
 199}
 200
 201static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 202{
 203        if (!nodes)
 204                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 205        else if (nodes_empty(*nodes))
 206                return -EINVAL;                 /*  no allowed nodes */
 207        else
 208                pol->v.preferred_node = first_node(*nodes);
 209        return 0;
 210}
 211
 212static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 213{
 214        if (nodes_empty(*nodes))
 215                return -EINVAL;
 216        pol->v.nodes = *nodes;
 217        return 0;
 218}
 219
 220/*
 221 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 222 * any, for the new policy.  mpol_new() has already validated the nodes
 223 * parameter with respect to the policy mode and flags.  But, we need to
 224 * handle an empty nodemask with MPOL_PREFERRED here.
 225 *
 226 * Must be called holding task's alloc_lock to protect task's mems_allowed
 227 * and mempolicy.  May also be called holding the mmap_lock for write.
 228 */
 229static int mpol_set_nodemask(struct mempolicy *pol,
 230                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 231{
 232        int ret;
 233
 234        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 235        if (pol == NULL)
 236                return 0;
 237        /* Check N_MEMORY */
 238        nodes_and(nsc->mask1,
 239                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 240
 241        VM_BUG_ON(!nodes);
 242        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 243                nodes = NULL;   /* explicit local allocation */
 244        else {
 245                if (pol->flags & MPOL_F_RELATIVE_NODES)
 246                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 247                else
 248                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 249
 250                if (mpol_store_user_nodemask(pol))
 251                        pol->w.user_nodemask = *nodes;
 252                else
 253                        pol->w.cpuset_mems_allowed =
 254                                                cpuset_current_mems_allowed;
 255        }
 256
 257        if (nodes)
 258                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 259        else
 260                ret = mpol_ops[pol->mode].create(pol, NULL);
 261        return ret;
 262}
 263
 264/*
  265 * This function just creates a new policy, does some checks and simple
 266 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 267 */
 268static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 269                                  nodemask_t *nodes)
 270{
 271        struct mempolicy *policy;
 272
 273        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 274                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 275
 276        if (mode == MPOL_DEFAULT) {
 277                if (nodes && !nodes_empty(*nodes))
 278                        return ERR_PTR(-EINVAL);
 279                return NULL;
 280        }
 281        VM_BUG_ON(!nodes);
 282
 283        /*
 284         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 285         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 286         * All other modes require a valid pointer to a non-empty nodemask.
 287         */
 288        if (mode == MPOL_PREFERRED) {
 289                if (nodes_empty(*nodes)) {
 290                        if (((flags & MPOL_F_STATIC_NODES) ||
 291                             (flags & MPOL_F_RELATIVE_NODES)))
 292                                return ERR_PTR(-EINVAL);
 293                }
 294        } else if (mode == MPOL_LOCAL) {
 295                if (!nodes_empty(*nodes) ||
 296                    (flags & MPOL_F_STATIC_NODES) ||
 297                    (flags & MPOL_F_RELATIVE_NODES))
 298                        return ERR_PTR(-EINVAL);
 299                mode = MPOL_PREFERRED;
 300        } else if (nodes_empty(*nodes))
 301                return ERR_PTR(-EINVAL);
 302        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 303        if (!policy)
 304                return ERR_PTR(-ENOMEM);
 305        atomic_set(&policy->refcnt, 1);
 306        policy->mode = mode;
 307        policy->flags = flags;
 308
 309        return policy;
 310}
 311
 312/* Slow path of a mpol destructor. */
 313void __mpol_put(struct mempolicy *p)
 314{
 315        if (!atomic_dec_and_test(&p->refcnt))
 316                return;
 317        kmem_cache_free(policy_cache, p);
 318}
 319
 320static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 321{
 322}
 323
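/*
 * Recompute pol->v.nodes after the set of allowed nodes has changed.  With
 * MPOL_F_STATIC_NODES the original user mask is intersected with the new
 * allowed nodes; with MPOL_F_RELATIVE_NODES it is re-mapped onto them;
 * otherwise the current nodes are remapped from the old cpuset mask to the
 * new one (illustrative example: {0,1} under an old cpuset of {0,1} becomes
 * {2,3} under a new cpuset of {2,3}).
 */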
 324static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 325{
 326        nodemask_t tmp;
 327
 328        if (pol->flags & MPOL_F_STATIC_NODES)
 329                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 330        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 331                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 332        else {
  333                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 334                                                                *nodes);
 335                pol->w.cpuset_mems_allowed = *nodes;
 336        }
 337
 338        if (nodes_empty(tmp))
 339                tmp = *nodes;
 340
 341        pol->v.nodes = tmp;
 342}
 343
 344static void mpol_rebind_preferred(struct mempolicy *pol,
 345                                                const nodemask_t *nodes)
 346{
 347        nodemask_t tmp;
 348
 349        if (pol->flags & MPOL_F_STATIC_NODES) {
 350                int node = first_node(pol->w.user_nodemask);
 351
 352                if (node_isset(node, *nodes)) {
 353                        pol->v.preferred_node = node;
 354                        pol->flags &= ~MPOL_F_LOCAL;
 355                } else
 356                        pol->flags |= MPOL_F_LOCAL;
 357        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 358                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 359                pol->v.preferred_node = first_node(tmp);
 360        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 361                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 362                                                   pol->w.cpuset_mems_allowed,
 363                                                   *nodes);
 364                pol->w.cpuset_mems_allowed = *nodes;
 365        }
 366}
 367
 368/*
 369 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 370 *
 371 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 372 * policies are protected by task->mems_allowed_seq to prevent a premature
 373 * OOM/allocation failure due to parallel nodemask modification.
 374 */
 375static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 376{
 377        if (!pol)
 378                return;
 379        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 380            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 381                return;
 382
 383        mpol_ops[pol->mode].rebind(pol, newmask);
 384}
 385
 386/*
 387 * Wrapper for mpol_rebind_policy() that just requires task
 388 * pointer, and updates task mempolicy.
 389 *
 390 * Called with task's alloc_lock held.
 391 */
 392
 393void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 394{
 395        mpol_rebind_policy(tsk->mempolicy, new);
 396}
 397
 398/*
 399 * Rebind each vma in mm to new nodemask.
 400 *
 401 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 402 */
 403
 404void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 405{
 406        struct vm_area_struct *vma;
 407
 408        mmap_write_lock(mm);
 409        for (vma = mm->mmap; vma; vma = vma->vm_next)
 410                mpol_rebind_policy(vma->vm_policy, new);
 411        mmap_write_unlock(mm);
 412}
 413
 414static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 415        [MPOL_DEFAULT] = {
 416                .rebind = mpol_rebind_default,
 417        },
 418        [MPOL_INTERLEAVE] = {
 419                .create = mpol_new_interleave,
 420                .rebind = mpol_rebind_nodemask,
 421        },
 422        [MPOL_PREFERRED] = {
 423                .create = mpol_new_preferred,
 424                .rebind = mpol_rebind_preferred,
 425        },
 426        [MPOL_BIND] = {
 427                .create = mpol_new_bind,
 428                .rebind = mpol_rebind_nodemask,
 429        },
 430};
 431
 432static int migrate_page_add(struct page *page, struct list_head *pagelist,
 433                                unsigned long flags);
 434
 435struct queue_pages {
 436        struct list_head *pagelist;
 437        unsigned long flags;
 438        nodemask_t *nmask;
 439        unsigned long start;
 440        unsigned long end;
 441        struct vm_area_struct *first;
 442};
 443
 444/*
 445 * Check if the page's nid is in qp->nmask.
 446 *
  447 * If MPOL_MF_INVERT is set in qp->flags, check instead that the nid is
  448 * not in qp->nmask.
 449 */
 450static inline bool queue_pages_required(struct page *page,
 451                                        struct queue_pages *qp)
 452{
 453        int nid = page_to_nid(page);
 454        unsigned long flags = qp->flags;
 455
 456        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 457}
 458
 459/*
 460 * queue_pages_pmd() has four possible return values:
 461 * 0 - pages are placed on the right node or queued successfully.
  462 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 463 *     specified.
 464 * 2 - THP was split.
  465 * -EIO - a migration entry was found, or only MPOL_MF_STRICT was specified
  466 *        and an existing page was already on a node that does not follow
  467 *        the policy.
 468 */
 469static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 470                                unsigned long end, struct mm_walk *walk)
 471        __releases(ptl)
 472{
 473        int ret = 0;
 474        struct page *page;
 475        struct queue_pages *qp = walk->private;
 476        unsigned long flags;
 477
 478        if (unlikely(is_pmd_migration_entry(*pmd))) {
 479                ret = -EIO;
 480                goto unlock;
 481        }
 482        page = pmd_page(*pmd);
 483        if (is_huge_zero_page(page)) {
 484                spin_unlock(ptl);
 485                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 486                ret = 2;
 487                goto out;
 488        }
 489        if (!queue_pages_required(page, qp))
 490                goto unlock;
 491
 492        flags = qp->flags;
 493        /* go to thp migration */
 494        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 495                if (!vma_migratable(walk->vma) ||
 496                    migrate_page_add(page, qp->pagelist, flags)) {
 497                        ret = 1;
 498                        goto unlock;
 499                }
 500        } else
 501                ret = -EIO;
 502unlock:
 503        spin_unlock(ptl);
 504out:
 505        return ret;
 506}
 507
 508/*
  509 * Scan through the pages, checking whether they meet the given conditions,
  510 * and move them to the pagelist if they do.
 511 *
 512 * queue_pages_pte_range() has three possible return values:
 513 * 0 - pages are placed on the right node or queued successfully.
  514 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 515 *     specified.
 516 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 517 *        on a node that does not follow the policy.
 518 */
 519static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 520                        unsigned long end, struct mm_walk *walk)
 521{
 522        struct vm_area_struct *vma = walk->vma;
 523        struct page *page;
 524        struct queue_pages *qp = walk->private;
 525        unsigned long flags = qp->flags;
 526        int ret;
 527        bool has_unmovable = false;
 528        pte_t *pte;
 529        spinlock_t *ptl;
 530
 531        ptl = pmd_trans_huge_lock(pmd, vma);
 532        if (ptl) {
 533                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 534                if (ret != 2)
 535                        return ret;
 536        }
 537        /* THP was split, fall through to pte walk */
 538
 539        if (pmd_trans_unstable(pmd))
 540                return 0;
 541
 542        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 543        for (; addr != end; pte++, addr += PAGE_SIZE) {
 544                if (!pte_present(*pte))
 545                        continue;
 546                page = vm_normal_page(vma, addr, *pte);
 547                if (!page)
 548                        continue;
 549                /*
 550                 * vm_normal_page() filters out zero pages, but there might
 551                 * still be PageReserved pages to skip, perhaps in a VDSO.
 552                 */
 553                if (PageReserved(page))
 554                        continue;
 555                if (!queue_pages_required(page, qp))
 556                        continue;
 557                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 558                        /* MPOL_MF_STRICT must be specified if we get here */
 559                        if (!vma_migratable(vma)) {
 560                                has_unmovable = true;
 561                                break;
 562                        }
 563
 564                        /*
 565                         * Do not abort immediately since there may be
  566                         * temporarily off-LRU pages in the range.  We still
  567                         * need to migrate the other LRU pages.
 568                         */
 569                        if (migrate_page_add(page, qp->pagelist, flags))
 570                                has_unmovable = true;
 571                } else
 572                        break;
 573        }
 574        pte_unmap_unlock(pte - 1, ptl);
 575        cond_resched();
 576
 577        if (has_unmovable)
 578                return 1;
 579
 580        return addr != end ? -EIO : 0;
 581}
 582
 583static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 584                               unsigned long addr, unsigned long end,
 585                               struct mm_walk *walk)
 586{
 587        int ret = 0;
 588#ifdef CONFIG_HUGETLB_PAGE
 589        struct queue_pages *qp = walk->private;
 590        unsigned long flags = (qp->flags & MPOL_MF_VALID);
 591        struct page *page;
 592        spinlock_t *ptl;
 593        pte_t entry;
 594
 595        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 596        entry = huge_ptep_get(pte);
 597        if (!pte_present(entry))
 598                goto unlock;
 599        page = pte_page(entry);
 600        if (!queue_pages_required(page, qp))
 601                goto unlock;
 602
 603        if (flags == MPOL_MF_STRICT) {
 604                /*
  605                 * STRICT alone means only detecting misplaced pages and no
  606                 * need to further check other vmas.
 607                 */
 608                ret = -EIO;
 609                goto unlock;
 610        }
 611
 612        if (!vma_migratable(walk->vma)) {
 613                /*
  614                 * Must be STRICT with MOVE*, otherwise .test_walk() would have
  615                 * stopped walking the current vma.
  616                 * Detect the misplaced page, but allow migrating pages which
  617                 * have already been queued.
 618                 */
 619                ret = 1;
 620                goto unlock;
 621        }
 622
 623        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 624        if (flags & (MPOL_MF_MOVE_ALL) ||
 625            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
 626                if (!isolate_huge_page(page, qp->pagelist) &&
 627                        (flags & MPOL_MF_STRICT))
 628                        /*
  629                         * Failed to isolate the page, but allow migrating
  630                         * pages which have already been queued.
 631                         */
 632                        ret = 1;
 633        }
 634unlock:
 635        spin_unlock(ptl);
 636#else
 637        BUG();
 638#endif
 639        return ret;
 640}
 641
 642#ifdef CONFIG_NUMA_BALANCING
 643/*
 644 * This is used to mark a range of virtual addresses to be inaccessible.
 645 * These are later cleared by a NUMA hinting fault. Depending on these
 646 * faults, pages may be migrated for better NUMA placement.
 647 *
 648 * This is assuming that NUMA faults are handled using PROT_NONE. If
 649 * an architecture makes a different choice, it will need further
 650 * changes to the core.
 651 */
 652unsigned long change_prot_numa(struct vm_area_struct *vma,
 653                        unsigned long addr, unsigned long end)
 654{
 655        int nr_updated;
 656
 657        nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 658        if (nr_updated)
 659                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 660
 661        return nr_updated;
 662}
 663#else
 664static unsigned long change_prot_numa(struct vm_area_struct *vma,
 665                        unsigned long addr, unsigned long end)
 666{
 667        return 0;
 668}
 669#endif /* CONFIG_NUMA_BALANCING */
 670
 671static int queue_pages_test_walk(unsigned long start, unsigned long end,
 672                                struct mm_walk *walk)
 673{
 674        struct vm_area_struct *vma = walk->vma;
 675        struct queue_pages *qp = walk->private;
 676        unsigned long endvma = vma->vm_end;
 677        unsigned long flags = qp->flags;
 678
 679        /* range check first */
 680        VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
 681
 682        if (!qp->first) {
 683                qp->first = vma;
 684                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 685                        (qp->start < vma->vm_start))
 686                        /* hole at head side of range */
 687                        return -EFAULT;
 688        }
 689        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 690                ((vma->vm_end < qp->end) &&
 691                (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
 692                /* hole at middle or tail of range */
 693                return -EFAULT;
 694
 695        /*
  696         * Check MPOL_MF_STRICT: non-migratable vmas must still be walked
  697         * when STRICT is set so misplaced pages can be reported with -EIO.
 698         */
 699        if (!vma_migratable(vma) &&
 700            !(flags & MPOL_MF_STRICT))
 701                return 1;
 702
 703        if (endvma > end)
 704                endvma = end;
 705
 706        if (flags & MPOL_MF_LAZY) {
 707                /* Similar to task_numa_work, skip inaccessible VMAs */
 708                if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 709                        !(vma->vm_flags & VM_MIXEDMAP))
 710                        change_prot_numa(vma, start, endvma);
 711                return 1;
 712        }
 713
 714        /* queue pages from current vma */
 715        if (flags & MPOL_MF_VALID)
 716                return 0;
 717        return 1;
 718}
 719
 720static const struct mm_walk_ops queue_pages_walk_ops = {
 721        .hugetlb_entry          = queue_pages_hugetlb,
 722        .pmd_entry              = queue_pages_pte_range,
 723        .test_walk              = queue_pages_test_walk,
 724};
 725
 726/*
 727 * Walk through page tables and collect pages to be migrated.
 728 *
 729 * If pages found in a given range are on a set of nodes (determined by
  730 * @nodes and @flags), they are isolated and queued to the pagelist, which
  731 * is passed via @private.
 732 *
 733 * queue_pages_range() has three possible return values:
  734 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 735 *     specified.
 736 * 0 - queue pages successfully or no misplaced page.
  737 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
  738 *         memory range specified by nodemask and maxnode points outside
  739 *         the accessible address space (-EFAULT)
 740 */
 741static int
 742queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 743                nodemask_t *nodes, unsigned long flags,
 744                struct list_head *pagelist)
 745{
 746        int err;
 747        struct queue_pages qp = {
 748                .pagelist = pagelist,
 749                .flags = flags,
 750                .nmask = nodes,
 751                .start = start,
 752                .end = end,
 753                .first = NULL,
 754        };
 755
 756        err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 757
 758        if (!qp.first)
 759                /* whole range in hole */
 760                err = -EFAULT;
 761
 762        return err;
 763}
 764
 765/*
 766 * Apply policy to a single VMA
 767 * This must be called with the mmap_lock held for writing.
 768 */
 769static int vma_replace_policy(struct vm_area_struct *vma,
 770                                                struct mempolicy *pol)
 771{
 772        int err;
 773        struct mempolicy *old;
 774        struct mempolicy *new;
 775
 776        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 777                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 778                 vma->vm_ops, vma->vm_file,
 779                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 780
 781        new = mpol_dup(pol);
 782        if (IS_ERR(new))
 783                return PTR_ERR(new);
 784
 785        if (vma->vm_ops && vma->vm_ops->set_policy) {
 786                err = vma->vm_ops->set_policy(vma, new);
 787                if (err)
 788                        goto err_out;
 789        }
 790
 791        old = vma->vm_policy;
 792        vma->vm_policy = new; /* protected by mmap_lock */
 793        mpol_put(old);
 794
 795        return 0;
 796 err_out:
 797        mpol_put(new);
 798        return err;
 799}
 800
 801/* Step 2: apply policy to a range and do splits. */
 802static int mbind_range(struct mm_struct *mm, unsigned long start,
 803                       unsigned long end, struct mempolicy *new_pol)
 804{
 805        struct vm_area_struct *next;
 806        struct vm_area_struct *prev;
 807        struct vm_area_struct *vma;
 808        int err = 0;
 809        pgoff_t pgoff;
 810        unsigned long vmstart;
 811        unsigned long vmend;
 812
 813        vma = find_vma(mm, start);
 814        VM_BUG_ON(!vma);
 815
 816        prev = vma->vm_prev;
 817        if (start > vma->vm_start)
 818                prev = vma;
 819
 820        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 821                next = vma->vm_next;
 822                vmstart = max(start, vma->vm_start);
 823                vmend   = min(end, vma->vm_end);
 824
 825                if (mpol_equal(vma_policy(vma), new_pol))
 826                        continue;
 827
 828                pgoff = vma->vm_pgoff +
 829                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 830                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 831                                 vma->anon_vma, vma->vm_file, pgoff,
 832                                 new_pol, vma->vm_userfaultfd_ctx);
 833                if (prev) {
 834                        vma = prev;
 835                        next = vma->vm_next;
 836                        if (mpol_equal(vma_policy(vma), new_pol))
 837                                continue;
 838                        /* vma_merge() joined vma && vma->next, case 8 */
 839                        goto replace;
 840                }
 841                if (vma->vm_start != vmstart) {
 842                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 843                        if (err)
 844                                goto out;
 845                }
 846                if (vma->vm_end != vmend) {
 847                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 848                        if (err)
 849                                goto out;
 850                }
 851 replace:
 852                err = vma_replace_policy(vma, new_pol);
 853                if (err)
 854                        goto out;
 855        }
 856
 857 out:
 858        return err;
 859}
 860
 861/* Set the process memory policy */
 862static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 863                             nodemask_t *nodes)
 864{
 865        struct mempolicy *new, *old;
 866        NODEMASK_SCRATCH(scratch);
 867        int ret;
 868
 869        if (!scratch)
 870                return -ENOMEM;
 871
 872        new = mpol_new(mode, flags, nodes);
 873        if (IS_ERR(new)) {
 874                ret = PTR_ERR(new);
 875                goto out;
 876        }
 877
 878        task_lock(current);
 879        ret = mpol_set_nodemask(new, nodes, scratch);
 880        if (ret) {
 881                task_unlock(current);
 882                mpol_put(new);
 883                goto out;
 884        }
 885        old = current->mempolicy;
 886        current->mempolicy = new;
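        /*
         * Reset the interleave cursor: next_node_in(MAX_NUMNODES - 1, mask)
         * wraps around, so the next interleaved allocation starts from the
         * first node of the new mask.
         */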
 887        if (new && new->mode == MPOL_INTERLEAVE)
 888                current->il_prev = MAX_NUMNODES-1;
 889        task_unlock(current);
 890        mpol_put(old);
 891        ret = 0;
 892out:
 893        NODEMASK_SCRATCH_FREE(scratch);
 894        return ret;
 895}
 896
 897/*
 898 * Return nodemask for policy for get_mempolicy() query
 899 *
 900 * Called with task's alloc_lock held
 901 */
 902static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 903{
 904        nodes_clear(*nodes);
 905        if (p == &default_policy)
 906                return;
 907
 908        switch (p->mode) {
 909        case MPOL_BIND:
 910        case MPOL_INTERLEAVE:
 911                *nodes = p->v.nodes;
 912                break;
 913        case MPOL_PREFERRED:
 914                if (!(p->flags & MPOL_F_LOCAL))
 915                        node_set(p->v.preferred_node, *nodes);
 916                /* else return empty node mask for local allocation */
 917                break;
 918        default:
 919                BUG();
 920        }
 921}
 922
 923static int lookup_node(struct mm_struct *mm, unsigned long addr)
 924{
 925        struct page *p = NULL;
 926        int err;
 927
 928        int locked = 1;
 929        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 930        if (err > 0) {
 931                err = page_to_nid(p);
 932                put_page(p);
 933        }
 934        if (locked)
 935                mmap_read_unlock(mm);
 936        return err;
 937}
 938
 939/* Retrieve NUMA policy */
 940static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 941                             unsigned long addr, unsigned long flags)
 942{
 943        int err;
 944        struct mm_struct *mm = current->mm;
 945        struct vm_area_struct *vma = NULL;
 946        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 947
 948        if (flags &
 949                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 950                return -EINVAL;
 951
 952        if (flags & MPOL_F_MEMS_ALLOWED) {
 953                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 954                        return -EINVAL;
 955                *policy = 0;    /* just so it's initialized */
 956                task_lock(current);
 957                *nmask  = cpuset_current_mems_allowed;
 958                task_unlock(current);
 959                return 0;
 960        }
 961
 962        if (flags & MPOL_F_ADDR) {
 963                /*
 964                 * Do NOT fall back to task policy if the
 965                 * vma/shared policy at addr is NULL.  We
 966                 * want to return MPOL_DEFAULT in this case.
 967                 */
 968                mmap_read_lock(mm);
 969                vma = find_vma_intersection(mm, addr, addr+1);
 970                if (!vma) {
 971                        mmap_read_unlock(mm);
 972                        return -EFAULT;
 973                }
 974                if (vma->vm_ops && vma->vm_ops->get_policy)
 975                        pol = vma->vm_ops->get_policy(vma, addr);
 976                else
 977                        pol = vma->vm_policy;
 978        } else if (addr)
 979                return -EINVAL;
 980
 981        if (!pol)
 982                pol = &default_policy;  /* indicates default behavior */
 983
 984        if (flags & MPOL_F_NODE) {
 985                if (flags & MPOL_F_ADDR) {
 986                        /*
 987                         * Take a refcount on the mpol, lookup_node()
  988                         * will drop the mmap_lock, so after calling
 989                         * lookup_node() only "pol" remains valid, "vma"
 990                         * is stale.
 991                         */
 992                        pol_refcount = pol;
 993                        vma = NULL;
 994                        mpol_get(pol);
 995                        err = lookup_node(mm, addr);
 996                        if (err < 0)
 997                                goto out;
 998                        *policy = err;
 999                } else if (pol == current->mempolicy &&
1000                                pol->mode == MPOL_INTERLEAVE) {
1001                        *policy = next_node_in(current->il_prev, pol->v.nodes);
1002                } else {
1003                        err = -EINVAL;
1004                        goto out;
1005                }
1006        } else {
1007                *policy = pol == &default_policy ? MPOL_DEFAULT :
1008                                                pol->mode;
1009                /*
1010                 * Internal mempolicy flags must be masked off before exposing
1011                 * the policy to userspace.
1012                 */
1013                *policy |= (pol->flags & MPOL_MODE_FLAGS);
1014        }
1015
1016        err = 0;
1017        if (nmask) {
1018                if (mpol_store_user_nodemask(pol)) {
1019                        *nmask = pol->w.user_nodemask;
1020                } else {
1021                        task_lock(current);
1022                        get_policy_nodemask(pol, nmask);
1023                        task_unlock(current);
1024                }
1025        }
1026
1027 out:
1028        mpol_cond_put(pol);
1029        if (vma)
1030                mmap_read_unlock(mm);
1031        if (pol_refcount)
1032                mpol_put(pol_refcount);
1033        return err;
1034}
1035
1036#ifdef CONFIG_MIGRATION
1037/*
 1038 * page migration; THP tail pages can be passed.
1039 */
1040static int migrate_page_add(struct page *page, struct list_head *pagelist,
1041                                unsigned long flags)
1042{
1043        struct page *head = compound_head(page);
1044        /*
1045         * Avoid migrating a page that is shared with others.
1046         */
1047        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1048                if (!isolate_lru_page(head)) {
1049                        list_add_tail(&head->lru, pagelist);
1050                        mod_node_page_state(page_pgdat(head),
1051                                NR_ISOLATED_ANON + page_is_file_lru(head),
1052                                thp_nr_pages(head));
1053                } else if (flags & MPOL_MF_STRICT) {
1054                        /*
 1055                         * A non-movable page may reach here.  And there may be
 1056                         * temporarily off-LRU pages or non-LRU movable pages.
1057                         * Treat them as unmovable pages since they can't be
1058                         * isolated, so they can't be moved at the moment.  It
1059                         * should return -EIO for this case too.
1060                         */
1061                        return -EIO;
1062                }
1063        }
1064
1065        return 0;
1066}
1067
1068/*
1069 * Migrate pages from one node to a target node.
1070 * Returns error or the number of pages not migrated.
1071 */
1072static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1073                           int flags)
1074{
1075        nodemask_t nmask;
1076        LIST_HEAD(pagelist);
1077        int err = 0;
1078        struct migration_target_control mtc = {
1079                .nid = dest,
1080                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1081        };
1082
1083        nodes_clear(nmask);
1084        node_set(source, nmask);
1085
1086        /*
1087         * This does not "check" the range but isolates all pages that
1088         * need migration.  Between passing in the full user address
1089         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1090         */
1091        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1092        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1093                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1094
1095        if (!list_empty(&pagelist)) {
1096                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1097                                (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1098                if (err)
1099                        putback_movable_pages(&pagelist);
1100        }
1101
1102        return err;
1103}
1104
1105/*
1106 * Move pages between the two nodesets so as to preserve the physical
1107 * layout as much as possible.
1108 *
 1109 * Returns the number of pages that could not be moved.
1110 */
1111int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1112                     const nodemask_t *to, int flags)
1113{
1114        int busy = 0;
1115        int err;
1116        nodemask_t tmp;
1117
1118        err = migrate_prep();
1119        if (err)
1120                return err;
1121
1122        mmap_read_lock(mm);
1123
1124        /*
1125         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1126         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1127         * bit in 'tmp', and return that <source, dest> pair for migration.
1128         * The pair of nodemasks 'to' and 'from' define the map.
1129         *
1130         * If no pair of bits is found that way, fallback to picking some
1131         * pair of 'source' and 'dest' bits that are not the same.  If the
1132         * 'source' and 'dest' bits are the same, this represents a node
1133         * that will be migrating to itself, so no pages need move.
1134         *
1135         * If no bits are left in 'tmp', or if all remaining bits left
1136         * in 'tmp' correspond to the same bit in 'to', return false
1137         * (nothing left to migrate).
1138         *
1139         * This lets us pick a pair of nodes to migrate between, such that
1140         * if possible the dest node is not already occupied by some other
1141         * source node, minimizing the risk of overloading the memory on a
1142         * node that would happen if we migrated incoming memory to a node
 1143         * before migrating outgoing memory sourced from that same node.
1144         *
1145         * A single scan of tmp is sufficient.  As we go, we remember the
1146         * most recent <s, d> pair that moved (s != d).  If we find a pair
1147         * that not only moved, but what's better, moved to an empty slot
1148         * (d is not set in tmp), then we break out then, with that pair.
 1149         * Otherwise when we finish scanning tmp, we at least have the
1150         * most recent <s, d> pair that moved.  If we get all the way through
1151         * the scan of tmp without finding any node that moved, much less
1152         * moved to an empty node, then there is nothing left worth migrating.
1153         */
1154
1155        tmp = *from;
1156        while (!nodes_empty(tmp)) {
 1157                int s, d;
1158                int source = NUMA_NO_NODE;
1159                int dest = 0;
1160
1161                for_each_node_mask(s, tmp) {
1162
1163                        /*
1164                         * do_migrate_pages() tries to maintain the relative
1165                         * node relationship of the pages established between
1166                         * threads and memory areas.
1167                         *
1168                         * However if the number of source nodes is not equal to
1169                         * the number of destination nodes we can not preserve
1170                         * this node relative relationship.  In that case, skip
1171                         * copying memory from a node that is in the destination
1172                         * mask.
1173                         *
1174                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1175                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1176                         */
1177
1178                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1179                                                (node_isset(s, *to)))
1180                                continue;
1181
1182                        d = node_remap(s, *from, *to);
1183                        if (s == d)
1184                                continue;
1185
1186                        source = s;     /* Node moved. Memorize */
1187                        dest = d;
1188
1189                        /* dest not in remaining from nodes? */
1190                        if (!node_isset(dest, tmp))
1191                                break;
1192                }
1193                if (source == NUMA_NO_NODE)
1194                        break;
1195
1196                node_clear(source, tmp);
1197                err = migrate_to_node(mm, source, dest, flags);
1198                if (err > 0)
1199                        busy += err;
1200                if (err < 0)
1201                        break;
1202        }
1203        mmap_read_unlock(mm);
1204        if (err < 0)
1205                return err;
1206        return busy;
1207
1208}
1209
1210/*
1211 * Allocate a new page for page migration based on vma policy.
1212 * Start by assuming the page is mapped by the same vma as contains @start.
1213 * Search forward from there, if not.  N.B., this assumes that the
1214 * list of pages handed to migrate_pages()--which is how we get here--
1215 * is in virtual address order.
1216 */
1217static struct page *new_page(struct page *page, unsigned long start)
1218{
1219        struct vm_area_struct *vma;
1220        unsigned long address;
1221
1222        vma = find_vma(current->mm, start);
1223        while (vma) {
1224                address = page_address_in_vma(page, vma);
1225                if (address != -EFAULT)
1226                        break;
1227                vma = vma->vm_next;
1228        }
1229
1230        if (PageHuge(page)) {
1231                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1232                                vma, address);
1233        } else if (PageTransHuge(page)) {
1234                struct page *thp;
1235
1236                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1237                                         HPAGE_PMD_ORDER);
1238                if (!thp)
1239                        return NULL;
1240                prep_transhuge_page(thp);
1241                return thp;
1242        }
1243        /*
1244         * if !vma, alloc_page_vma() will use task or system default policy
1245         */
1246        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1247                        vma, address);
1248}
1249#else
1250
1251static int migrate_page_add(struct page *page, struct list_head *pagelist,
1252                                unsigned long flags)
1253{
1254        return -EIO;
1255}
1256
1257int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1258                     const nodemask_t *to, int flags)
1259{
1260        return -ENOSYS;
1261}
1262
1263static struct page *new_page(struct page *page, unsigned long start)
1264{
1265        return NULL;
1266}
1267#endif
1268
1269static long do_mbind(unsigned long start, unsigned long len,
1270                     unsigned short mode, unsigned short mode_flags,
1271                     nodemask_t *nmask, unsigned long flags)
1272{
1273        struct mm_struct *mm = current->mm;
1274        struct mempolicy *new;
1275        unsigned long end;
1276        int err;
1277        int ret;
1278        LIST_HEAD(pagelist);
1279
1280        if (flags & ~(unsigned long)MPOL_MF_VALID)
1281                return -EINVAL;
1282        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1283                return -EPERM;
1284
1285        if (start & ~PAGE_MASK)
1286                return -EINVAL;
1287
1288        if (mode == MPOL_DEFAULT)
1289                flags &= ~MPOL_MF_STRICT;
1290
1291        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1292        end = start + len;
1293
1294        if (end < start)
1295                return -EINVAL;
1296        if (end == start)
1297                return 0;
1298
1299        new = mpol_new(mode, mode_flags, nmask);
1300        if (IS_ERR(new))
1301                return PTR_ERR(new);
1302
1303        if (flags & MPOL_MF_LAZY)
1304                new->flags |= MPOL_F_MOF;
1305
1306        /*
1307         * If we are using the default policy then operation
1308         * on discontinuous address spaces is okay after all
1309         */
1310        if (!new)
1311                flags |= MPOL_MF_DISCONTIG_OK;
1312
1313        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1314                 start, start + len, mode, mode_flags,
1315                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1316
1317        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1318
1319                err = migrate_prep();
1320                if (err)
1321                        goto mpol_out;
1322        }
1323        {
1324                NODEMASK_SCRATCH(scratch);
1325                if (scratch) {
1326                        mmap_write_lock(mm);
1327                        task_lock(current);
1328                        err = mpol_set_nodemask(new, nmask, scratch);
1329                        task_unlock(current);
1330                        if (err)
1331                                mmap_write_unlock(mm);
1332                } else
1333                        err = -ENOMEM;
1334                NODEMASK_SCRATCH_FREE(scratch);
1335        }
1336        if (err)
1337                goto mpol_out;
1338
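        /*
         * Note: MPOL_MF_INVERT makes queue_pages_range() collect exactly the
         * pages that are NOT on the requested nodes, i.e. those that would
         * have to be migrated to satisfy the new policy.
         */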
1339        ret = queue_pages_range(mm, start, end, nmask,
1340                          flags | MPOL_MF_INVERT, &pagelist);
1341
1342        if (ret < 0) {
1343                err = ret;
1344                goto up_out;
1345        }
1346
1347        err = mbind_range(mm, start, end, new);
1348
1349        if (!err) {
1350                int nr_failed = 0;
1351
1352                if (!list_empty(&pagelist)) {
1353                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1354                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1355                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1356                        if (nr_failed)
1357                                putback_movable_pages(&pagelist);
1358                }
1359
1360                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1361                        err = -EIO;
1362        } else {
1363up_out:
1364                if (!list_empty(&pagelist))
1365                        putback_movable_pages(&pagelist);
1366        }
1367
1368        mmap_write_unlock(mm);
1369mpol_out:
1370        mpol_put(new);
1371        return err;
1372}
1373
1374/*
1375 * User space interface with variable sized bitmaps for nodelists.
1376 */
1377
1378/* Copy a node mask from user space. */
1379static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1380                     unsigned long maxnode)
1381{
1382        unsigned long k;
1383        unsigned long t;
1384        unsigned long nlongs;
1385        unsigned long endmask;
1386
1387        --maxnode;
1388        nodes_clear(*nodes);
1389        if (maxnode == 0 || !nmask)
1390                return 0;
1391        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1392                return -EINVAL;
1393
1394        nlongs = BITS_TO_LONGS(maxnode);
1395        if ((maxnode % BITS_PER_LONG) == 0)
1396                endmask = ~0UL;
1397        else
1398                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
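
        /*
         * Worked example (illustrative, assuming 64-bit longs): a userspace
         * maxnode of 65 becomes 64 after the decrement above, so nlongs = 1
         * and endmask = ~0UL, i.e. every bit of the single copied long is
         * significant.
         */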
1399
1400        /*
 1401         * When the user specified more nodes than supported, just check
 1402         * if the non-supported part is all zero.
 1403         *
 1404         * If maxnode has more longs than MAX_NUMNODES, check the bits in
 1405         * that area first, and then go through to check the remaining bits,
 1406         * which are equal to or bigger than MAX_NUMNODES.
 1407         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1408         */
1409        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1410                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1411                        if (get_user(t, nmask + k))
1412                                return -EFAULT;
1413                        if (k == nlongs - 1) {
1414                                if (t & endmask)
1415                                        return -EINVAL;
1416                        } else if (t)
1417                                return -EINVAL;
1418                }
1419                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1420                endmask = ~0UL;
1421        }
1422
1423        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1424                unsigned long valid_mask = endmask;
1425
1426                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1427                if (get_user(t, nmask + nlongs - 1))
1428                        return -EFAULT;
1429                if (t & valid_mask)
1430                        return -EINVAL;
1431        }
1432
1433        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1434                return -EFAULT;
1435        nodes_addr(*nodes)[nlongs-1] &= endmask;
1436        return 0;
1437}
1438
1439/* Copy a kernel node mask to user space */
1440static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1441                              nodemask_t *nodes)
1442{
1443        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1444        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1445
1446        if (copy > nbytes) {
1447                if (copy > PAGE_SIZE)
1448                        return -EINVAL;
1449                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1450                        return -EFAULT;
1451                copy = nbytes;
1452        }
1453        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1454}
1455
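/*
 * mbind(2) entry point.  The mode argument packs an MPOL_* mode in its low
 * bits together with an optional MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES flag (the two are mutually exclusive), e.g. an
 * illustrative caller might pass MPOL_BIND | MPOL_F_STATIC_NODES.
 */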
1456static long kernel_mbind(unsigned long start, unsigned long len,
1457                         unsigned long mode, const unsigned long __user *nmask,
1458                         unsigned long maxnode, unsigned int flags)
1459{
1460        nodemask_t nodes;
1461        int err;
1462        unsigned short mode_flags;
1463
1464        start = untagged_addr(start);
1465        mode_flags = mode & MPOL_MODE_FLAGS;
1466        mode &= ~MPOL_MODE_FLAGS;
1467        if (mode >= MPOL_MAX)
1468                return -EINVAL;
1469        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1470            (mode_flags & MPOL_F_RELATIVE_NODES))
1471                return -EINVAL;
1472        err = get_nodes(&nodes, nmask, maxnode);
1473        if (err)
1474                return err;
1475        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1476}
1477
1478SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1479                unsigned long, mode, const unsigned long __user *, nmask,
1480                unsigned long, maxnode, unsigned int, flags)
1481{
1482        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1483}
1484
1485/* Set the process memory policy */
1486static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1487                                 unsigned long maxnode)
1488{
1489        int err;
1490        nodemask_t nodes;
1491        unsigned short flags;
1492
1493        flags = mode & MPOL_MODE_FLAGS;
1494        mode &= ~MPOL_MODE_FLAGS;
1495        if ((unsigned int)mode >= MPOL_MAX)
1496                return -EINVAL;
1497        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1498                return -EINVAL;
1499        err = get_nodes(&nodes, nmask, maxnode);
1500        if (err)
1501                return err;
1502        return do_set_mempolicy(mode, flags, &nodes);
1503}
1504
1505SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1506                unsigned long, maxnode)
1507{
1508        return kernel_set_mempolicy(mode, nmask, maxnode);
1509}
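/*
 * Illustrative userspace sketch (assumes libnuma's <numaif.h> wrapper and
 * that nodes 0 and 1 have memory): interleave all future allocations of
 * the calling task across nodes 0 and 1.
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
 *		perror("set_mempolicy");
 */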
1510
1511static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1512                                const unsigned long __user *old_nodes,
1513                                const unsigned long __user *new_nodes)
1514{
1515        struct mm_struct *mm = NULL;
1516        struct task_struct *task;
1517        nodemask_t task_nodes;
1518        int err;
1519        nodemask_t *old;
1520        nodemask_t *new;
1521        NODEMASK_SCRATCH(scratch);
1522
1523        if (!scratch)
1524                return -ENOMEM;
1525
1526        old = &scratch->mask1;
1527        new = &scratch->mask2;
1528
1529        err = get_nodes(old, old_nodes, maxnode);
1530        if (err)
1531                goto out;
1532
1533        err = get_nodes(new, new_nodes, maxnode);
1534        if (err)
1535                goto out;
1536
1537        /* Find the mm_struct */
1538        rcu_read_lock();
1539        task = pid ? find_task_by_vpid(pid) : current;
1540        if (!task) {
1541                rcu_read_unlock();
1542                err = -ESRCH;
1543                goto out;
1544        }
1545        get_task_struct(task);
1546
1547        err = -EINVAL;
1548
1549        /*
1550         * Check if this process has the right to modify the specified process.
1551         * Use the regular "ptrace_may_access()" checks.
1552         */
1553        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1554                rcu_read_unlock();
1555                err = -EPERM;
1556                goto out_put;
1557        }
1558        rcu_read_unlock();
1559
1560        task_nodes = cpuset_mems_allowed(task);
1561        /* Is the user allowed to access the target nodes? */
1562        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1563                err = -EPERM;
1564                goto out_put;
1565        }
1566
1567        task_nodes = cpuset_mems_allowed(current);
1568        nodes_and(*new, *new, task_nodes);
1569        if (nodes_empty(*new))
1570                goto out_put;
1571
1572        err = security_task_movememory(task);
1573        if (err)
1574                goto out_put;
1575
1576        mm = get_task_mm(task);
1577        put_task_struct(task);
1578
1579        if (!mm) {
1580                err = -EINVAL;
1581                goto out;
1582        }
1583
1584        err = do_migrate_pages(mm, old, new,
1585                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1586
1587        mmput(mm);
1588out:
1589        NODEMASK_SCRATCH_FREE(scratch);
1590
1591        return err;
1592
1593out_put:
1594        put_task_struct(task);
1595        goto out;
1596
1597}
1598
1599SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1600                const unsigned long __user *, old_nodes,
1601                const unsigned long __user *, new_nodes)
1602{
1603        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1604}
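/*
 * Illustrative userspace sketch (assumes libnuma's <numaif.h> wrapper):
 * move task @pid's pages that currently sit on node 0 over to node 1.
 * Acting on another task requires ptrace-level credentials, and moving
 * shared pages additionally requires CAP_SYS_NICE, as checked above.
 *
 *	#include <numaif.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *
 *	if (migrate_pages(pid, 8 * sizeof(unsigned long),
 *			  &old_nodes, &new_nodes) < 0)
 *		perror("migrate_pages");
 */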
1605
1606
1607/* Retrieve NUMA policy */
1608static int kernel_get_mempolicy(int __user *policy,
1609                                unsigned long __user *nmask,
1610                                unsigned long maxnode,
1611                                unsigned long addr,
1612                                unsigned long flags)
1613{
1614        int err;
1615        int pval;
1616        nodemask_t nodes;
1617
1618        if (nmask != NULL && maxnode < nr_node_ids)
1619                return -EINVAL;
1620
1621        addr = untagged_addr(addr);
1622
1623        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1624
1625        if (err)
1626                return err;
1627
1628        if (policy && put_user(pval, policy))
1629                return -EFAULT;
1630
1631        if (nmask)
1632                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1633
1634        return err;
1635}
1636
1637SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1638                unsigned long __user *, nmask, unsigned long, maxnode,
1639                unsigned long, addr, unsigned long, flags)
1640{
1641        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1642}
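/*
 * Illustrative userspace sketch (assumes libnuma's <numaif.h> wrapper and
 * that @p points into a mapped region): ask which node currently backs
 * the page at @p via MPOL_F_NODE | MPOL_F_ADDR.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *
 *	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR))
 *		perror("get_mempolicy");
 *	else
 *		printf("page at %p is on node %d\n", p, node);
 */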
1643
1644#ifdef CONFIG_COMPAT
1645
1646COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1647                       compat_ulong_t __user *, nmask,
1648                       compat_ulong_t, maxnode,
1649                       compat_ulong_t, addr, compat_ulong_t, flags)
1650{
1651        long err;
1652        unsigned long __user *nm = NULL;
1653        unsigned long nr_bits, alloc_size;
1654        DECLARE_BITMAP(bm, MAX_NUMNODES);
1655
1656        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1657        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1658
1659        if (nmask)
1660                nm = compat_alloc_user_space(alloc_size);
1661
1662        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1663
1664        if (!err && nmask) {
1665                unsigned long copy_size;
1666                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1667                err = copy_from_user(bm, nm, copy_size);
1668                /* ensure entire bitmap is zeroed */
1669                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1670                err |= compat_put_bitmap(nmask, bm, nr_bits);
1671        }
1672
1673        return err;
1674}
1675
1676COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1677                       compat_ulong_t, maxnode)
1678{
1679        unsigned long __user *nm = NULL;
1680        unsigned long nr_bits, alloc_size;
1681        DECLARE_BITMAP(bm, MAX_NUMNODES);
1682
1683        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1684        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1685
1686        if (nmask) {
1687                if (compat_get_bitmap(bm, nmask, nr_bits))
1688                        return -EFAULT;
1689                nm = compat_alloc_user_space(alloc_size);
1690                if (copy_to_user(nm, bm, alloc_size))
1691                        return -EFAULT;
1692        }
1693
1694        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1695}
1696
1697COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1698                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1699                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1700{
1701        unsigned long __user *nm = NULL;
1702        unsigned long nr_bits, alloc_size;
1703        nodemask_t bm;
1704
1705        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1706        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1707
1708        if (nmask) {
1709                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1710                        return -EFAULT;
1711                nm = compat_alloc_user_space(alloc_size);
1712                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1713                        return -EFAULT;
1714        }
1715
1716        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1717}
1718
1719COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1720                       compat_ulong_t, maxnode,
1721                       const compat_ulong_t __user *, old_nodes,
1722                       const compat_ulong_t __user *, new_nodes)
1723{
1724        unsigned long __user *old = NULL;
1725        unsigned long __user *new = NULL;
1726        nodemask_t tmp_mask;
1727        unsigned long nr_bits;
1728        unsigned long size;
1729
1730        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1731        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1732        if (old_nodes) {
1733                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1734                        return -EFAULT;
1735                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1736                if (new_nodes)
1737                        new = old + size / sizeof(unsigned long);
1738                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1739                        return -EFAULT;
1740        }
1741        if (new_nodes) {
1742                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1743                        return -EFAULT;
1744                if (new == NULL)
1745                        new = compat_alloc_user_space(size);
1746                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1747                        return -EFAULT;
1748        }
1749        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1750}
1751
1752#endif /* CONFIG_COMPAT */
1753
1754bool vma_migratable(struct vm_area_struct *vma)
1755{
1756        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1757                return false;
1758
1759        /*
1760         * DAX device mappings require predictable access latency, so avoid
1761         * incurring periodic faults.
1762         */
1763        if (vma_is_dax(vma))
1764                return false;
1765
1766        if (is_vm_hugetlb_page(vma) &&
1767                !hugepage_migration_supported(hstate_vma(vma)))
1768                return false;
1769
1770        /*
1771         * Migration allocates pages in the highest zone. If we cannot
1772         * do so then migration (at least from node to node) is not
1773         * possible.
1774         */
1775        if (vma->vm_file &&
1776                gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1777                        < policy_zone)
1778                return false;
1779        return true;
1780}
1781
1782struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1783                                                unsigned long addr)
1784{
1785        struct mempolicy *pol = NULL;
1786
1787        if (vma) {
1788                if (vma->vm_ops && vma->vm_ops->get_policy) {
1789                        pol = vma->vm_ops->get_policy(vma, addr);
1790                } else if (vma->vm_policy) {
1791                        pol = vma->vm_policy;
1792
1793                        /*
1794                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1795                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1796                         * count on these policies which will be dropped by
1797                         * mpol_cond_put() later
1798                         */
1799                        if (mpol_needs_cond_ref(pol))
1800                                mpol_get(pol);
1801                }
1802        }
1803
1804        return pol;
1805}
1806
1807/*
1808 * get_vma_policy(@vma, @addr)
1809 * @vma: virtual memory area whose policy is sought
1810 * @addr: address in @vma for shared policy lookup
1811 *
1812 * Returns effective policy for a VMA at specified address.
1813 * Falls back to current->mempolicy or system default policy, as necessary.
1814 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1815 * count--added by the get_policy() vm_op, as appropriate--to protect against
1816 * freeing by another task.  It is the caller's responsibility to free the
1817 * extra reference for shared policies.
1818 */
1819static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1820                                                unsigned long addr)
1821{
1822        struct mempolicy *pol = __get_vma_policy(vma, addr);
1823
1824        if (!pol)
1825                pol = get_task_policy(current);
1826
1827        return pol;
1828}
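/*
 * Typical calling pattern, a minimal sketch of what the allocation paths
 * further down in this file do: look the policy up, use it, and let
 * mpol_cond_put() drop the reference only if one was taken for a shared
 * policy.
 *
 *	struct mempolicy *pol = get_vma_policy(vma, addr);
 *
 *	... allocate or make placement decisions according to pol ...
 *
 *	mpol_cond_put(pol);
 */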
1829
1830bool vma_policy_mof(struct vm_area_struct *vma)
1831{
1832        struct mempolicy *pol;
1833
1834        if (vma->vm_ops && vma->vm_ops->get_policy) {
1835                bool ret = false;
1836
1837                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1838                if (pol && (pol->flags & MPOL_F_MOF))
1839                        ret = true;
1840                mpol_cond_put(pol);
1841
1842                return ret;
1843        }
1844
1845        pol = vma->vm_policy;
1846        if (!pol)
1847                pol = get_task_policy(current);
1848
1849        return pol->flags & MPOL_F_MOF;
1850}
1851
1852static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1853{
1854        enum zone_type dynamic_policy_zone = policy_zone;
1855
1856        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1857
1858        /*
1859         * if policy->v.nodes has movable memory only,
1860         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1861         *
1862         * policy->v.nodes is intersected with node_states[N_MEMORY],
1863         * so if the following test fails, it implies
1864         * policy->v.nodes has movable memory only.
1865         */
1866        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1867                dynamic_policy_zone = ZONE_MOVABLE;
1868
1869        return zone >= dynamic_policy_zone;
1870}
1871
1872/*
1873 * Return a nodemask representing a mempolicy for filtering nodes for
1874 * page allocation
1875 */
1876nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1877{
1878        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1879        if (unlikely(policy->mode == MPOL_BIND) &&
1880                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1881                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1882                return &policy->v.nodes;
1883
1884        return NULL;
1885}
1886
1887/* Return the node id preferred by the given mempolicy, or the given id */
1888static int policy_node(gfp_t gfp, struct mempolicy *policy,
1889                                                                int nd)
1890{
1891        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1892                nd = policy->v.preferred_node;
1893        else {
1894                /*
1895                 * __GFP_THISNODE shouldn't even be used with the bind policy
1896                 * because we might easily break the expectation to stay on the
1897                 * requested node and not break the policy.
1898                 */
1899                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1900        }
1901
1902        return nd;
1903}
1904
1905/* Do dynamic interleaving for a process */
1906static unsigned interleave_nodes(struct mempolicy *policy)
1907{
1908        unsigned next;
1909        struct task_struct *me = current;
1910
1911        next = next_node_in(me->il_prev, policy->v.nodes);
1912        if (next < MAX_NUMNODES)
1913                me->il_prev = next;
1914        return next;
1915}
1916
1917/*
1918 * Depending on the memory policy provide a node from which to allocate the
1919 * next slab entry.
1920 */
1921unsigned int mempolicy_slab_node(void)
1922{
1923        struct mempolicy *policy;
1924        int node = numa_mem_id();
1925
1926        if (in_interrupt())
1927                return node;
1928
1929        policy = current->mempolicy;
1930        if (!policy || policy->flags & MPOL_F_LOCAL)
1931                return node;
1932
1933        switch (policy->mode) {
1934        case MPOL_PREFERRED:
1935                /*
1936                 * handled MPOL_F_LOCAL above
1937                 */
1938                return policy->v.preferred_node;
1939
1940        case MPOL_INTERLEAVE:
1941                return interleave_nodes(policy);
1942
1943        case MPOL_BIND: {
1944                struct zoneref *z;
1945
1946                /*
1947                 * Follow bind policy behavior and start allocation at the
1948                 * first node.
1949                 */
1950                struct zonelist *zonelist;
1951                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1952                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1953                z = first_zones_zonelist(zonelist, highest_zoneidx,
1954                                                        &policy->v.nodes);
1955                return z->zone ? zone_to_nid(z->zone) : node;
1956        }
1957
1958        default:
1959                BUG();
1960        }
1961}
1962
1963/*
1964 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1965 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1966 * number of present nodes.
1967 */
1968static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1969{
1970        unsigned nnodes = nodes_weight(pol->v.nodes);
1971        unsigned target;
1972        int i;
1973        int nid;
1974
1975        if (!nnodes)
1976                return numa_node_id();
1977        target = (unsigned int)n % nnodes;
1978        nid = first_node(pol->v.nodes);
1979        for (i = 0; i < target; i++)
1980                nid = next_node(nid, pol->v.nodes);
1981        return nid;
1982}
1983
1984/* Determine a node number for interleave */
1985static inline unsigned interleave_nid(struct mempolicy *pol,
1986                 struct vm_area_struct *vma, unsigned long addr, int shift)
1987{
1988        if (vma) {
1989                unsigned long off;
1990
1991                /*
1992                 * for small pages, there is no difference between
1993                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1994                 * for huge pages, since vm_pgoff is in units of small
1995                 * pages, we need to shift off the always 0 bits to get
1996                 * a useful offset.
1997                 */
1998                BUG_ON(shift < PAGE_SHIFT);
1999                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
2000                off += (addr - vma->vm_start) >> shift;
2001                return offset_il_node(pol, off);
2002        } else
2003                return interleave_nodes(pol);
2004}
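/*
 * Worked example with illustrative numbers: for 2MB huge pages
 * (shift = 21, PAGE_SHIFT = 12), a VMA with vm_pgoff = 0x400 (i.e. 4MB
 * into the backing object) and a fault at vma->vm_start + 6MB:
 *
 *	off  = 0x400 >> (21 - 12)	= 2	huge-page-sized units
 *	off += (6MB) >> 21		= 3	-> off = 5
 *
 * offset_il_node() then returns the (5 % nnodes)'th node of pol->v.nodes,
 * so with a four-node interleave mask the page lands on the node at
 * index 1 of the mask.
 */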
2005
2006#ifdef CONFIG_HUGETLBFS
2007/*
2008 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
2009 * @vma: virtual memory area whose policy is sought
2010 * @addr: address in @vma for shared policy lookup and interleave policy
2011 * @gfp_flags: for requested zone
2012 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2013 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2014 *
2015 * Returns a nid suitable for a huge page allocation and a pointer
2016 * to the struct mempolicy for conditional unref after allocation.
2017 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
2018 * @nodemask for filtering the zonelist.
2019 *
2020 * Must be protected by read_mems_allowed_begin()
2021 */
2022int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2023                                struct mempolicy **mpol, nodemask_t **nodemask)
2024{
2025        int nid;
2026
2027        *mpol = get_vma_policy(vma, addr);
2028        *nodemask = NULL;       /* assume !MPOL_BIND */
2029
2030        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2031                nid = interleave_nid(*mpol, vma, addr,
2032                                        huge_page_shift(hstate_vma(vma)));
2033        } else {
2034                nid = policy_node(gfp_flags, *mpol, numa_node_id());
2035                if ((*mpol)->mode == MPOL_BIND)
2036                        *nodemask = &(*mpol)->v.nodes;
2037        }
2038        return nid;
2039}
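/*
 * Usage sketch, mirroring the hugetlb dequeue path (details elided;
 * h here stands for hstate_vma(vma)):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, htlb_alloc_mask(h), &mpol, &nodemask);
 *	... pick a huge page from nid, filtered by nodemask if non-NULL ...
 *	mpol_cond_put(mpol);
 */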
2040
2041/*
2042 * init_nodemask_of_mempolicy
2043 *
2044 * If the current task's mempolicy is "default" [NULL], return 'false'
2045 * to indicate default policy.  Otherwise, extract the policy nodemask
2046 * for 'bind' or 'interleave' policy into the argument nodemask, or
2047 * initialize the argument nodemask to contain the single node for
2048 * 'preferred' or 'local' policy and return 'true' to indicate presence
2049 * of non-default mempolicy.
2050 *
2051 * We don't bother with reference counting the mempolicy [mpol_get/put]
2052 * because the current task is examining its own mempolicy and a task's
2053 * mempolicy is only ever changed by the task itself.
2054 *
2055 * N.B., it is the caller's responsibility to free a returned nodemask.
2056 */
2057bool init_nodemask_of_mempolicy(nodemask_t *mask)
2058{
2059        struct mempolicy *mempolicy;
2060        int nid;
2061
2062        if (!(mask && current->mempolicy))
2063                return false;
2064
2065        task_lock(current);
2066        mempolicy = current->mempolicy;
2067        switch (mempolicy->mode) {
2068        case MPOL_PREFERRED:
2069                if (mempolicy->flags & MPOL_F_LOCAL)
2070                        nid = numa_node_id();
2071                else
2072                        nid = mempolicy->v.preferred_node;
2073                init_nodemask_of_node(mask, nid);
2074                break;
2075
2076        case MPOL_BIND:
2077        case MPOL_INTERLEAVE:
2078                *mask =  mempolicy->v.nodes;
2079                break;
2080
2081        default:
2082                BUG();
2083        }
2084        task_unlock(current);
2085
2086        return true;
2087}
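/*
 * Usage sketch, roughly how the hugetlb sysctl path picks the set of
 * nodes to size its pool over (details elided):
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *
 *	if (nodes_allowed && !init_nodemask_of_mempolicy(nodes_allowed)) {
 *		NODEMASK_FREE(nodes_allowed);
 *		nodes_allowed = &node_states[N_MEMORY];
 *	}
 *	... adjust the pool across *nodes_allowed ...
 */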
2088#endif
2089
2090/*
2091 * mempolicy_nodemask_intersects
2092 *
2093 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2094 * policy.  Otherwise, check for intersection between mask and the policy
2095 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2096 * policy, always return true since it may allocate elsewhere on fallback.
2097 *
2098 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2099 */
2100bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2101                                        const nodemask_t *mask)
2102{
2103        struct mempolicy *mempolicy;
2104        bool ret = true;
2105
2106        if (!mask)
2107                return ret;
2108        task_lock(tsk);
2109        mempolicy = tsk->mempolicy;
2110        if (!mempolicy)
2111                goto out;
2112
2113        switch (mempolicy->mode) {
2114        case MPOL_PREFERRED:
2115                /*
2116                 * MPOL_PREFERRED and MPOL_F_LOCAL only specify preferred nodes
2117                 * to allocate from; they may fall back to other nodes when OOM.
2118                 * Thus, it's possible for tsk to have allocated memory from
2119                 * nodes in mask.
2120                 */
2121                break;
2122        case MPOL_BIND:
2123        case MPOL_INTERLEAVE:
2124                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2125                break;
2126        default:
2127                BUG();
2128        }
2129out:
2130        task_unlock(tsk);
2131        return ret;
2132}
2133
2134/* Allocate a page in interleaved policy.
2135   Own path because it needs to do special accounting. */
2136static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2137                                        unsigned nid)
2138{
2139        struct page *page;
2140
2141        page = __alloc_pages(gfp, order, nid);
2142        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2143        if (!static_branch_likely(&vm_numa_stat_key))
2144                return page;
2145        if (page && page_to_nid(page) == nid) {
2146                preempt_disable();
2147                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2148                preempt_enable();
2149        }
2150        return page;
2151}
2152
2153/**
2154 *      alloc_pages_vma - Allocate a page for a VMA.
2155 *
2156 *      @gfp:
2157 *      %GFP_USER    user allocation.
2158 *      %GFP_KERNEL  kernel allocations,
2159 *      %GFP_HIGHMEM highmem/user allocations,
2160 *      %GFP_FS      allocation should not call back into a file system.
2161 *      %GFP_ATOMIC  don't sleep.
2162 *
2163 *      @order:Order of the GFP allocation.
2164 *      @vma:  Pointer to VMA or NULL if not available.
2165 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2166 *      @node: Which node to prefer for allocation (modulo policy).
2167 *      @hugepage: for hugepages try only the preferred node if possible
2168 *
2169 *      This function allocates a page from the kernel page pool and applies
2170 *      a NUMA policy associated with the VMA or the current process.
2171 *      When VMA is not NULL caller must read-lock the mmap_lock of the
2172 *      mm_struct of the VMA to prevent it from going away. Should be used for
2173 *      all allocations for pages that will be mapped into user space. Returns
2174 *      NULL when no page can be allocated.
2175 */
2176struct page *
2177alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2178                unsigned long addr, int node, bool hugepage)
2179{
2180        struct mempolicy *pol;
2181        struct page *page;
2182        int preferred_nid;
2183        nodemask_t *nmask;
2184
2185        pol = get_vma_policy(vma, addr);
2186
2187        if (pol->mode == MPOL_INTERLEAVE) {
2188                unsigned nid;
2189
2190                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2191                mpol_cond_put(pol);
2192                page = alloc_page_interleave(gfp, order, nid);
2193                goto out;
2194        }
2195
2196        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2197                int hpage_node = node;
2198
2199                /*
2200                 * For hugepage allocation and non-interleave policy which
2201                 * allows the current node (or other explicitly preferred
2202                 * node) we only try to allocate from the current/preferred
2203                 * node and don't fall back to other nodes, as the cost of
2204                 * remote accesses would likely offset THP benefits.
2205                 *
2206                 * If the policy is interleave, or does not allow the current
2207                 * node in its nodemask, we allocate the standard way.
2208                 */
2209                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2210                        hpage_node = pol->v.preferred_node;
2211
2212                nmask = policy_nodemask(gfp, pol);
2213                if (!nmask || node_isset(hpage_node, *nmask)) {
2214                        mpol_cond_put(pol);
2215                        /*
2216                         * First, try to allocate THP only on local node, but
2217                         * don't reclaim unnecessarily, just compact.
2218                         */
2219                        page = __alloc_pages_node(hpage_node,
2220                                gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2221
2222                        /*
2223                         * If hugepage allocations are configured to always use
2224                         * synchronous compaction or the vma has been madvised
2225                         * to prefer hugepage backing, retry allowing remote
2226                         * memory with both reclaim and compaction as well.
2227                         */
2228                        if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2229                                page = __alloc_pages_node(hpage_node,
2230                                                                gfp, order);
2231
2232                        goto out;
2233                }
2234        }
2235
2236        nmask = policy_nodemask(gfp, pol);
2237        preferred_nid = policy_node(gfp, pol, node);
2238        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2239        mpol_cond_put(pol);
2240out:
2241        return page;
2242}
2243EXPORT_SYMBOL(alloc_pages_vma);
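/*
 * Usage sketch, a minimal illustration of how an anonymous-fault-style
 * caller (already holding mmap_lock for read) might use this: allocate
 * one movable page for the faulting address, honouring the VMA's policy.
 *
 *	struct page *page;
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
 *			       numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */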
2244
2245/**
2246 *      alloc_pages_current - Allocate pages.
2247 *
2248 *      @gfp:
2249 *              %GFP_USER   user allocation,
2250 *              %GFP_KERNEL kernel allocation,
2251 *              %GFP_HIGHMEM highmem allocation,
2252 *              %GFP_FS     don't call back into a file system.
2253 *              %GFP_ATOMIC don't sleep.
2254 *      @order: Power of two of allocation size in pages. 0 is a single page.
2255 *
2256 *      Allocate a page from the kernel page pool.  When not in
2257 *      interrupt context, apply the current process' NUMA policy.
2258 *      Returns NULL when no page can be allocated.
2259 */
2260struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2261{
2262        struct mempolicy *pol = &default_policy;
2263        struct page *page;
2264
2265        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2266                pol = get_task_policy(current);
2267
2268        /*
2269         * No reference counting needed for current->mempolicy
2270         * or the system default_policy
2271         */
2272        if (pol->mode == MPOL_INTERLEAVE)
2273                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2274        else
2275                page = __alloc_pages_nodemask(gfp, order,
2276                                policy_node(gfp, pol, numa_node_id()),
2277                                policy_nodemask(gfp, pol));
2278
2279        return page;
2280}
2281EXPORT_SYMBOL(alloc_pages_current);
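/*
 * On NUMA-enabled kernels this backs the generic alloc_pages() helper,
 * so a plain kernel allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * ends up here and is placed according to the current task's policy
 * (interleaved, bound, preferred or default).
 */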
2282
2283int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2284{
2285        struct mempolicy *pol = mpol_dup(vma_policy(src));
2286
2287        if (IS_ERR(pol))
2288                return PTR_ERR(pol);
2289        dst->vm_policy = pol;
2290        return 0;
2291}
2292
2293/*
2294 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2295 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2296 * with the mems_allowed returned by cpuset_mems_allowed().  This
2297 * keeps mempolicies cpuset relative after their cpuset moves.  See
2298 * kernel/cpuset.c update_nodemask() for further details.
2299 *
2300 * current's mempolicy may be rebound by another task (the task that changes
2301 * the cpuset's mems), so we need not do the rebind work for the current task.
2302 */
2303
2304/* Slow path of a mempolicy duplicate */
2305struct mempolicy *__mpol_dup(struct mempolicy *old)
2306{
2307        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2308
2309        if (!new)
2310                return ERR_PTR(-ENOMEM);
2311
2312        /* task's mempolicy is protected by alloc_lock */
2313        if (old == current->mempolicy) {
2314                task_lock(current);
2315                *new = *old;
2316                task_unlock(current);
2317        } else
2318                *new = *old;
2319
2320        if (current_cpuset_is_being_rebound()) {
2321                nodemask_t mems = cpuset_mems_allowed(current);
2322                mpol_rebind_policy(new, &mems);
2323        }
2324        atomic_set(&new->refcnt, 1);
2325        return new;
2326}
2327
2328/* Slow path of a mempolicy comparison */
2329bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2330{
2331        if (!a || !b)
2332                return false;
2333        if (a->mode != b->mode)
2334                return false;
2335        if (a->flags != b->flags)
2336                return false;
2337        if (mpol_store_user_nodemask(a))
2338                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2339                        return false;
2340
2341        switch (a->mode) {
2342        case MPOL_BIND:
2343        case MPOL_INTERLEAVE:
2344                return !!nodes_equal(a->v.nodes, b->v.nodes);
2345        case MPOL_PREFERRED:
2346                /* a's ->flags is the same as b's */
2347                if (a->flags & MPOL_F_LOCAL)
2348                        return true;
2349                return a->v.preferred_node == b->v.preferred_node;
2350        default:
2351                BUG();
2352                return false;
2353        }
2354}
2355
2356/*
2357 * Shared memory backing store policy support.
2358 *
2359 * Remember policies even when nobody has shared memory mapped.
2360 * The policies are kept in Red-Black tree linked from the inode.
2361 * They are protected by the sp->lock rwlock, which should be held
2362 * for any accesses to the tree.
2363 */
2364
2365/*
2366 * Look up the first element intersecting start-end.  Caller holds sp->lock
2367 * for reading or for writing.
2368 */
2369static struct sp_node *
2370sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2371{
2372        struct rb_node *n = sp->root.rb_node;
2373
2374        while (n) {
2375                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2376
2377                if (start >= p->end)
2378                        n = n->rb_right;
2379                else if (end <= p->start)
2380                        n = n->rb_left;
2381                else
2382                        break;
2383        }
2384        if (!n)
2385                return NULL;
2386        for (;;) {
2387                struct sp_node *w = NULL;
2388                struct rb_node *prev = rb_prev(n);
2389                if (!prev)
2390                        break;
2391                w = rb_entry(prev, struct sp_node, nd);
2392                if (w->end <= start)
2393                        break;
2394                n = prev;
2395        }
2396        return rb_entry(n, struct sp_node, nd);
2397}
2398
2399/*
2400 * Insert a new shared policy into the list.  Caller holds sp->lock for
2401 * writing.
2402 */
2403static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2404{
2405        struct rb_node **p = &sp->root.rb_node;
2406        struct rb_node *parent = NULL;
2407        struct sp_node *nd;
2408
2409        while (*p) {
2410                parent = *p;
2411                nd = rb_entry(parent, struct sp_node, nd);
2412                if (new->start < nd->start)
2413                        p = &(*p)->rb_left;
2414                else if (new->end > nd->end)
2415                        p = &(*p)->rb_right;
2416                else
2417                        BUG();
2418        }
2419        rb_link_node(&new->nd, parent, p);
2420        rb_insert_color(&new->nd, &sp->root);
2421        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2422                 new->policy ? new->policy->mode : 0);
2423}
2424
2425/* Find shared policy intersecting idx */
2426struct mempolicy *
2427mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2428{
2429        struct mempolicy *pol = NULL;
2430        struct sp_node *sn;
2431
2432        if (!sp->root.rb_node)
2433                return NULL;
2434        read_lock(&sp->lock);
2435        sn = sp_lookup(sp, idx, idx+1);
2436        if (sn) {
2437                mpol_get(sn->policy);
2438                pol = sn->policy;
2439        }
2440        read_unlock(&sp->lock);
2441        return pol;
2442}
2443
2444static void sp_free(struct sp_node *n)
2445{
2446        mpol_put(n->policy);
2447        kmem_cache_free(sn_cache, n);
2448}
2449
2450/**
2451 * mpol_misplaced - check whether current page node is valid in policy
2452 *
2453 * @page: page to be checked
2454 * @vma: vm area where page mapped
2455 * @addr: virtual address where page mapped
2456 *
2457 * Look up the current policy node id for vma,addr and compare it to the
2458 * page's node id.
2459 *
2460 * Returns:
2461 *      -1      - not misplaced, page is in the right node
2462 *      node    - node id where the page should be
2463 *
2464 * Policy determination "mimics" alloc_page_vma().
2465 * Called from fault path where we know the vma and faulting address.
2466 */
2467int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2468{
2469        struct mempolicy *pol;
2470        struct zoneref *z;
2471        int curnid = page_to_nid(page);
2472        unsigned long pgoff;
2473        int thiscpu = raw_smp_processor_id();
2474        int thisnid = cpu_to_node(thiscpu);
2475        int polnid = NUMA_NO_NODE;
2476        int ret = -1;
2477
2478        pol = get_vma_policy(vma, addr);
2479        if (!(pol->flags & MPOL_F_MOF))
2480                goto out;
2481
2482        switch (pol->mode) {
2483        case MPOL_INTERLEAVE:
2484                pgoff = vma->vm_pgoff;
2485                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2486                polnid = offset_il_node(pol, pgoff);
2487                break;
2488
2489        case MPOL_PREFERRED:
2490                if (pol->flags & MPOL_F_LOCAL)
2491                        polnid = numa_node_id();
2492                else
2493                        polnid = pol->v.preferred_node;
2494                break;
2495
2496        case MPOL_BIND:
2497
2498                /*
2499                 * MPOL_BIND allows binding to multiple nodes.
2500                 * Use the current page's node if it is in the policy nodemask,
2501                 * else select the nearest allowed node, if any.
2502                 * If there are no allowed nodes, use the current node [!misplaced].
2503                 */
2504                if (node_isset(curnid, pol->v.nodes))
2505                        goto out;
2506                z = first_zones_zonelist(
2507                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2508                                gfp_zone(GFP_HIGHUSER),
2509                                &pol->v.nodes);
2510                polnid = zone_to_nid(z->zone);
2511                break;
2512
2513        default:
2514                BUG();
2515        }
2516
2517        /* Migrate the page towards the node whose CPU is referencing it */
2518        if (pol->flags & MPOL_F_MORON) {
2519                polnid = thisnid;
2520
2521                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2522                        goto out;
2523        }
2524
2525        if (curnid != polnid)
2526                ret = polnid;
2527out:
2528        mpol_cond_put(pol);
2529
2530        return ret;
2531}
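/*
 * Usage sketch, mirroring the NUMA hinting fault path (details elided):
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid != -1)
 *		... try to migrate the page to target_nid ...
 *	else
 *		... leave the page where it is ...
 */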
2532
2533/*
2534 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2535 * dropped after task->mempolicy is set to NULL so that any allocation done as
2536 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2537 * policy.
2538 */
2539void mpol_put_task_policy(struct task_struct *task)
2540{
2541        struct mempolicy *pol;
2542
2543        task_lock(task);
2544        pol = task->mempolicy;
2545        task->mempolicy = NULL;
2546        task_unlock(task);
2547        mpol_put(pol);
2548}
2549
2550static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2551{
2552        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2553        rb_erase(&n->nd, &sp->root);
2554        sp_free(n);
2555}
2556
2557static void sp_node_init(struct sp_node *node, unsigned long start,
2558                        unsigned long end, struct mempolicy *pol)
2559{
2560        node->start = start;
2561        node->end = end;
2562        node->policy = pol;
2563}
2564
2565static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2566                                struct mempolicy *pol)
2567{
2568        struct sp_node *n;
2569        struct mempolicy *newpol;
2570
2571        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2572        if (!n)
2573                return NULL;
2574
2575        newpol = mpol_dup(pol);
2576        if (IS_ERR(newpol)) {
2577                kmem_cache_free(sn_cache, n);
2578                return NULL;
2579        }
2580        newpol->flags |= MPOL_F_SHARED;
2581        sp_node_init(n, start, end, newpol);
2582
2583        return n;
2584}
2585
2586/* Replace a policy range. */
2587static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2588                                 unsigned long end, struct sp_node *new)
2589{
2590        struct sp_node *n;
2591        struct sp_node *n_new = NULL;
2592        struct mempolicy *mpol_new = NULL;
2593        int ret = 0;
2594
2595restart:
2596        write_lock(&sp->lock);
2597        n = sp_lookup(sp, start, end);
2598        /* Take care of old policies in the same range. */
2599        while (n && n->start < end) {
2600                struct rb_node *next = rb_next(&n->nd);
2601                if (n->start >= start) {
2602                        if (n->end <= end)
2603                                sp_delete(sp, n);
2604                        else
2605                                n->start = end;
2606                } else {
2607                        /* Old policy spanning whole new range. */
2608                        if (n->end > end) {
2609                                if (!n_new)
2610                                        goto alloc_new;
2611
2612                                *mpol_new = *n->policy;
2613                                atomic_set(&mpol_new->refcnt, 1);
2614                                sp_node_init(n_new, end, n->end, mpol_new);
2615                                n->end = start;
2616                                sp_insert(sp, n_new);
2617                                n_new = NULL;
2618                                mpol_new = NULL;
2619                                break;
2620                        } else
2621                                n->end = start;
2622                }
2623                if (!next)
2624                        break;
2625                n = rb_entry(next, struct sp_node, nd);
2626        }
2627        if (new)
2628                sp_insert(sp, new);
2629        write_unlock(&sp->lock);
2630        ret = 0;
2631
2632err_out:
2633        if (mpol_new)
2634                mpol_put(mpol_new);
2635        if (n_new)
2636                kmem_cache_free(sn_cache, n_new);
2637
2638        return ret;
2639
2640alloc_new:
2641        write_unlock(&sp->lock);
2642        ret = -ENOMEM;
2643        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2644        if (!n_new)
2645                goto err_out;
2646        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2647        if (!mpol_new)
2648                goto err_out;
2649        goto restart;
2650}
2651
2652/**
2653 * mpol_shared_policy_init - initialize shared policy for inode
2654 * @sp: pointer to inode shared policy
2655 * @mpol:  struct mempolicy to install
2656 *
2657 * Install non-NULL @mpol in inode's shared policy rb-tree.
2658 * On entry, the current task has a reference on a non-NULL @mpol.
2659 * This must be released on exit.
2660 * This is called at get_inode() time, so we can use GFP_KERNEL.
2661 */
2662void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2663{
2664        int ret;
2665
2666        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2667        rwlock_init(&sp->lock);
2668
2669        if (mpol) {
2670                struct vm_area_struct pvma;
2671                struct mempolicy *new;
2672                NODEMASK_SCRATCH(scratch);
2673
2674                if (!scratch)
2675                        goto put_mpol;
2676                /* contextualize the tmpfs mount point mempolicy */
2677                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2678                if (IS_ERR(new))
2679                        goto free_scratch; /* no valid nodemask intersection */
2680
2681                task_lock(current);
2682                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2683                task_unlock(current);
2684                if (ret)
2685                        goto put_new;
2686
2687                /* Create pseudo-vma that contains just the policy */
2688                vma_init(&pvma, NULL);
2689                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2690                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2691
2692put_new:
2693                mpol_put(new);                  /* drop initial ref */
2694free_scratch:
2695                NODEMASK_SCRATCH_FREE(scratch);
2696put_mpol:
2697                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2698        }
2699}
2700
2701int mpol_set_shared_policy(struct shared_policy *info,
2702                        struct vm_area_struct *vma, struct mempolicy *npol)
2703{
2704        int err;
2705        struct sp_node *new = NULL;
2706        unsigned long sz = vma_pages(vma);
2707
2708        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2709                 vma->vm_pgoff,
2710                 sz, npol ? npol->mode : -1,
2711                 npol ? npol->flags : -1,
2712                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2713
2714        if (npol) {
2715                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2716                if (!new)
2717                        return -ENOMEM;
2718        }
2719        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2720        if (err && new)
2721                sp_free(new);
2722        return err;
2723}
2724
2725/* Free a backing policy store on inode delete. */
2726void mpol_free_shared_policy(struct shared_policy *p)
2727{
2728        struct sp_node *n;
2729        struct rb_node *next;
2730
2731        if (!p->root.rb_node)
2732                return;
2733        write_lock(&p->lock);
2734        next = rb_first(&p->root);
2735        while (next) {
2736                n = rb_entry(next, struct sp_node, nd);
2737                next = rb_next(&n->nd);
2738                sp_delete(p, n);
2739        }
2740        write_unlock(&p->lock);
2741}
2742
2743#ifdef CONFIG_NUMA_BALANCING
2744static int __initdata numabalancing_override;
2745
2746static void __init check_numabalancing_enable(void)
2747{
2748        bool numabalancing_default = false;
2749
2750        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2751                numabalancing_default = true;
2752
2753        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2754        if (numabalancing_override)
2755                set_numabalancing_state(numabalancing_override == 1);
2756
2757        if (num_online_nodes() > 1 && !numabalancing_override) {
2758                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2759                        numabalancing_default ? "Enabling" : "Disabling");
2760                set_numabalancing_state(numabalancing_default);
2761        }
2762}
2763
2764static int __init setup_numabalancing(char *str)
2765{
2766        int ret = 0;
2767        if (!str)
2768                goto out;
2769
2770        if (!strcmp(str, "enable")) {
2771                numabalancing_override = 1;
2772                ret = 1;
2773        } else if (!strcmp(str, "disable")) {
2774                numabalancing_override = -1;
2775                ret = 1;
2776        }
2777out:
2778        if (!ret)
2779                pr_warn("Unable to parse numa_balancing=\n");
2780
2781        return ret;
2782}
2783__setup("numa_balancing=", setup_numabalancing);
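/*
 * Example: booting with "numa_balancing=disable" on the kernel command
 * line turns automatic NUMA balancing off regardless of
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED, while "numa_balancing=enable"
 * turns it on; at runtime the same state can be flipped through the
 * kernel.numa_balancing sysctl.
 */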
2784#else
2785static inline void __init check_numabalancing_enable(void)
2786{
2787}
2788#endif /* CONFIG_NUMA_BALANCING */
2789
2790/* assumes fs == KERNEL_DS */
2791void __init numa_policy_init(void)
2792{
2793        nodemask_t interleave_nodes;
2794        unsigned long largest = 0;
2795        int nid, prefer = 0;
2796
2797        policy_cache = kmem_cache_create("numa_policy",
2798                                         sizeof(struct mempolicy),
2799                                         0, SLAB_PANIC, NULL);
2800
2801        sn_cache = kmem_cache_create("shared_policy_node",
2802                                     sizeof(struct sp_node),
2803                                     0, SLAB_PANIC, NULL);
2804
2805        for_each_node(nid) {
2806                preferred_node_policy[nid] = (struct mempolicy) {
2807                        .refcnt = ATOMIC_INIT(1),
2808                        .mode = MPOL_PREFERRED,
2809                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2810                        .v = { .preferred_node = nid, },
2811                };
2812        }
2813
2814        /*
2815         * Set interleaving policy for system init. Interleaving is only
2816         * enabled across suitably sized nodes (default is >= 16MB), or
2817         * fall back to the largest node if they're all smaller.
2818         */
2819        nodes_clear(interleave_nodes);
2820        for_each_node_state(nid, N_MEMORY) {
2821                unsigned long total_pages = node_present_pages(nid);
2822
2823                /* Preserve the largest node */
2824                if (largest < total_pages) {
2825                        largest = total_pages;
2826                        prefer = nid;
2827                }
2828
2829                /* Interleave this node? */
2830                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2831                        node_set(nid, interleave_nodes);
2832        }
2833
2834        /* All too small, use the largest */
2835        if (unlikely(nodes_empty(interleave_nodes)))
2836                node_set(prefer, interleave_nodes);
2837
2838        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2839                pr_err("%s: interleaving failed\n", __func__);
2840
2841        check_numabalancing_enable();
2842}
2843
2844/* Reset policy of current process to default */
2845void numa_default_policy(void)
2846{
2847        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2848}
2849
2850/*
2851 * Parse and format mempolicy from/to strings
2852 */
2853
2854/*
2855 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2856 */
2857static const char * const policy_modes[] =
2858{
2859        [MPOL_DEFAULT]    = "default",
2860        [MPOL_PREFERRED]  = "prefer",
2861        [MPOL_BIND]       = "bind",
2862        [MPOL_INTERLEAVE] = "interleave",
2863        [MPOL_LOCAL]      = "local",
2864};
2865
2866
2867#ifdef CONFIG_TMPFS
2868/**
2869 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2870 * @str:  string containing mempolicy to parse
2871 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2872 *
2873 * Format of input:
2874 *      <mode>[=<flags>][:<nodelist>]
2875 *
2876 * On success, returns 0, else 1
2877 */
2878int mpol_parse_str(char *str, struct mempolicy **mpol)
2879{
2880        struct mempolicy *new = NULL;
2881        unsigned short mode_flags;
2882        nodemask_t nodes;
2883        char *nodelist = strchr(str, ':');
2884        char *flags = strchr(str, '=');
2885        int err = 1, mode;
2886
2887        if (flags)
2888                *flags++ = '\0';        /* terminate mode string */
2889
2890        if (nodelist) {
2891                /* NUL-terminate mode or flags string */
2892                *nodelist++ = '\0';
2893                if (nodelist_parse(nodelist, nodes))
2894                        goto out;
2895                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2896                        goto out;
2897        } else
2898                nodes_clear(nodes);
2899
2900        mode = match_string(policy_modes, MPOL_MAX, str);
2901        if (mode < 0)
2902                goto out;
2903
2904        switch (mode) {
2905        case MPOL_PREFERRED:
2906                /*
2907                 * Insist on a nodelist of one node only, although later
2908                 * we use first_node(nodes) to grab a single node, so here
2909                 * nodelist (or nodes) cannot be empty.
2910                 */
2911                if (nodelist) {
2912                        char *rest = nodelist;
2913                        while (isdigit(*rest))
2914                                rest++;
2915                        if (*rest)
2916                                goto out;
2917                        if (nodes_empty(nodes))
2918                                goto out;
2919                }
2920                break;
2921        case MPOL_INTERLEAVE:
2922                /*
2923                 * Default to online nodes with memory if no nodelist
2924                 */
2925                if (!nodelist)
2926                        nodes = node_states[N_MEMORY];
2927                break;
2928        case MPOL_LOCAL:
2929                /*
2930                 * Don't allow a nodelist;  mpol_new() checks flags
2931                 */
2932                if (nodelist)
2933                        goto out;
2934                mode = MPOL_PREFERRED;
2935                break;
2936        case MPOL_DEFAULT:
2937                /*
2938                 * Insist on an empty nodelist
2939                 */
2940                if (!nodelist)
2941                        err = 0;
2942                goto out;
2943        case MPOL_BIND:
2944                /*
2945                 * Insist on a nodelist
2946                 */
2947                if (!nodelist)
2948                        goto out;
2949        }
2950
2951        mode_flags = 0;
2952        if (flags) {
2953                /*
2954                 * Currently, we only support two mutually exclusive
2955                 * mode flags.
2956                 */
2957                if (!strcmp(flags, "static"))
2958                        mode_flags |= MPOL_F_STATIC_NODES;
2959                else if (!strcmp(flags, "relative"))
2960                        mode_flags |= MPOL_F_RELATIVE_NODES;
2961                else
2962                        goto out;
2963        }
2964
2965        new = mpol_new(mode, mode_flags, &nodes);
2966        if (IS_ERR(new))
2967                goto out;
2968
2969        /*
2970         * Save nodes for mpol_to_str() to show the tmpfs mount options
2971         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2972         */
2973        if (mode != MPOL_PREFERRED)
2974                new->v.nodes = nodes;
2975        else if (nodelist)
2976                new->v.preferred_node = first_node(nodes);
2977        else
2978                new->flags |= MPOL_F_LOCAL;
2979
2980        /*
2981         * Save nodes for contextualization: this will be used to "clone"
2982         * the mempolicy in a specific context [cpuset] at a later time.
2983         */
2984        new->w.user_nodemask = nodes;
2985
2986        err = 0;
2987
2988out:
2989        /* Restore string for error message */
2990        if (nodelist)
2991                *--nodelist = ':';
2992        if (flags)
2993                *--flags = '=';
2994        if (!err)
2995                *mpol = new;
2996        return err;
2997}
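/*
 * Example: a tmpfs mount such as
 *
 *	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 *
 * hands the string "interleave:0-3" to this function, yielding a policy
 * with mode MPOL_INTERLEAVE and v.nodes = 0-3 (provided those nodes have
 * memory).  "prefer=static:1" would yield MPOL_PREFERRED with
 * MPOL_F_STATIC_NODES and preferred_node = 1.
 */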
2998#endif /* CONFIG_TMPFS */
2999
3000/**
3001 * mpol_to_str - format a mempolicy structure for printing
3002 * @buffer:  to contain formatted mempolicy string
3003 * @maxlen:  length of @buffer
3004 * @pol:  pointer to mempolicy to be formatted
3005 *
3006 * Convert @pol into a string.  If @buffer is too short, truncate the string.
3007 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3008 * longest flag, "relative", and to display at least a few node ids.
3009 */
3010void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3011{
3012        char *p = buffer;
3013        nodemask_t nodes = NODE_MASK_NONE;
3014        unsigned short mode = MPOL_DEFAULT;
3015        unsigned short flags = 0;
3016
3017        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3018                mode = pol->mode;
3019                flags = pol->flags;
3020        }
3021
3022        switch (mode) {
3023        case MPOL_DEFAULT:
3024                break;
3025        case MPOL_PREFERRED:
3026                if (flags & MPOL_F_LOCAL)
3027                        mode = MPOL_LOCAL;
3028                else
3029                        node_set(pol->v.preferred_node, nodes);
3030                break;
3031        case MPOL_BIND:
3032        case MPOL_INTERLEAVE:
3033                nodes = pol->v.nodes;
3034                break;
3035        default:
3036                WARN_ON_ONCE(1);
3037                snprintf(p, maxlen, "unknown");
3038                return;
3039        }
3040
3041        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3042
3043        if (flags & MPOL_MODE_FLAGS) {
3044                p += snprintf(p, buffer + maxlen - p, "=");
3045
3046                /*
3047                 * Currently, the only defined flags are mutually exclusive
3048                 */
3049                if (flags & MPOL_F_STATIC_NODES)
3050                        p += snprintf(p, buffer + maxlen - p, "static");
3051                else if (flags & MPOL_F_RELATIVE_NODES)
3052                        p += snprintf(p, buffer + maxlen - p, "relative");
3053        }
3054
3055        if (!nodes_empty(nodes))
3056                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3057                               nodemask_pr_args(&nodes));
3058}
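/*
 * Usage sketch, a minimal illustration: the formatted string is what shows
 * up in places such as /proc/<pid>/numa_maps and tmpfs mount options.
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *	... buf now holds something like "interleave=static:0-3" ...
 */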
3059