linux/mm/mempolicy.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy an process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem kernel lowmem allocation don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
 115/* Highest zone. An specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130/**
 131 * numa_map_to_online_node - Find closest online node
 132 * @node: Node id to start the search
 133 *
 134 * Lookup the next closest node by distance if @nid is not online.
 135 */
 136int numa_map_to_online_node(int node)
 137{
 138        int min_dist = INT_MAX, dist, n, min_node;
 139
 140        if (node == NUMA_NO_NODE || node_online(node))
 141                return node;
 142
 143        min_node = node;
 144        for_each_online_node(n) {
 145                dist = node_distance(node, n);
 146                if (dist < min_dist) {
 147                        min_dist = dist;
 148                        min_node = n;
 149                }
 150        }
 151
 152        return min_node;
 153}
 154EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 155
 156struct mempolicy *get_task_policy(struct task_struct *p)
 157{
 158        struct mempolicy *pol = p->mempolicy;
 159        int node;
 160
 161        if (pol)
 162                return pol;
 163
 164        node = numa_node_id();
 165        if (node != NUMA_NO_NODE) {
 166                pol = &preferred_node_policy[node];
 167                /* preferred_node_policy is not initialised early in boot */
 168                if (pol->mode)
 169                        return pol;
 170        }
 171
 172        return &default_policy;
 173}
 174
 175static const struct mempolicy_operations {
 176        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 177        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 178} mpol_ops[MPOL_MAX];
 179
 180static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 181{
 182        return pol->flags & MPOL_MODE_FLAGS;
 183}
 184
 185static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 186                                   const nodemask_t *rel)
 187{
 188        nodemask_t tmp;
 189        nodes_fold(tmp, *orig, nodes_weight(*rel));
 190        nodes_onto(*ret, tmp, *rel);
 191}
 192
 193static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 194{
 195        if (nodes_empty(*nodes))
 196                return -EINVAL;
 197        pol->v.nodes = *nodes;
 198        return 0;
 199}
 200
 201static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 202{
 203        if (!nodes)
 204                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 205        else if (nodes_empty(*nodes))
 206                return -EINVAL;                 /*  no allowed nodes */
 207        else
 208                pol->v.preferred_node = first_node(*nodes);
 209        return 0;
 210}
 211
 212static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 213{
 214        if (nodes_empty(*nodes))
 215                return -EINVAL;
 216        pol->v.nodes = *nodes;
 217        return 0;
 218}
 219
 220/*
 221 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 222 * any, for the new policy.  mpol_new() has already validated the nodes
 223 * parameter with respect to the policy mode and flags.  But, we need to
 224 * handle an empty nodemask with MPOL_PREFERRED here.
 225 *
 226 * Must be called holding task's alloc_lock to protect task's mems_allowed
 227 * and mempolicy.  May also be called holding the mmap_lock for write.
 228 */
 229static int mpol_set_nodemask(struct mempolicy *pol,
 230                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 231{
 232        int ret;
 233
 234        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 235        if (pol == NULL)
 236                return 0;
 237        /* Check N_MEMORY */
 238        nodes_and(nsc->mask1,
 239                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 240
 241        VM_BUG_ON(!nodes);
 242        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 243                nodes = NULL;   /* explicit local allocation */
 244        else {
 245                if (pol->flags & MPOL_F_RELATIVE_NODES)
 246                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 247                else
 248                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 249
 250                if (mpol_store_user_nodemask(pol))
 251                        pol->w.user_nodemask = *nodes;
 252                else
 253                        pol->w.cpuset_mems_allowed =
 254                                                cpuset_current_mems_allowed;
 255        }
 256
 257        if (nodes)
 258                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 259        else
 260                ret = mpol_ops[pol->mode].create(pol, NULL);
 261        return ret;
 262}
 263
 264/*
 265 * This function just creates a new policy, does some check and simple
 266 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 267 */
 268static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 269                                  nodemask_t *nodes)
 270{
 271        struct mempolicy *policy;
 272
 273        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 274                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 275
 276        if (mode == MPOL_DEFAULT) {
 277                if (nodes && !nodes_empty(*nodes))
 278                        return ERR_PTR(-EINVAL);
 279                return NULL;
 280        }
 281        VM_BUG_ON(!nodes);
 282
 283        /*
 284         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 285         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 286         * All other modes require a valid pointer to a non-empty nodemask.
 287         */
 288        if (mode == MPOL_PREFERRED) {
 289                if (nodes_empty(*nodes)) {
 290                        if (((flags & MPOL_F_STATIC_NODES) ||
 291                             (flags & MPOL_F_RELATIVE_NODES)))
 292                                return ERR_PTR(-EINVAL);
 293                }
 294        } else if (mode == MPOL_LOCAL) {
 295                if (!nodes_empty(*nodes) ||
 296                    (flags & MPOL_F_STATIC_NODES) ||
 297                    (flags & MPOL_F_RELATIVE_NODES))
 298                        return ERR_PTR(-EINVAL);
 299                mode = MPOL_PREFERRED;
 300        } else if (nodes_empty(*nodes))
 301                return ERR_PTR(-EINVAL);
 302        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 303        if (!policy)
 304                return ERR_PTR(-ENOMEM);
 305        atomic_set(&policy->refcnt, 1);
 306        policy->mode = mode;
 307        policy->flags = flags;
 308
 309        return policy;
 310}
 311
 312/* Slow path of a mpol destructor. */
 313void __mpol_put(struct mempolicy *p)
 314{
 315        if (!atomic_dec_and_test(&p->refcnt))
 316                return;
 317        kmem_cache_free(policy_cache, p);
 318}
 319
 320static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 321{
 322}
 323
 324static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 325{
 326        nodemask_t tmp;
 327
 328        if (pol->flags & MPOL_F_STATIC_NODES)
 329                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 330        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 331                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 332        else {
 333                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 334                                                                *nodes);
 335                pol->w.cpuset_mems_allowed = *nodes;
 336        }
 337
 338        if (nodes_empty(tmp))
 339                tmp = *nodes;
 340
 341        pol->v.nodes = tmp;
 342}
 343
 344static void mpol_rebind_preferred(struct mempolicy *pol,
 345                                                const nodemask_t *nodes)
 346{
 347        nodemask_t tmp;
 348
 349        if (pol->flags & MPOL_F_STATIC_NODES) {
 350                int node = first_node(pol->w.user_nodemask);
 351
 352                if (node_isset(node, *nodes)) {
 353                        pol->v.preferred_node = node;
 354                        pol->flags &= ~MPOL_F_LOCAL;
 355                } else
 356                        pol->flags |= MPOL_F_LOCAL;
 357        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 358                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 359                pol->v.preferred_node = first_node(tmp);
 360        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 361                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 362                                                   pol->w.cpuset_mems_allowed,
 363                                                   *nodes);
 364                pol->w.cpuset_mems_allowed = *nodes;
 365        }
 366}
 367
 368/*
 369 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 370 *
 371 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 372 * policies are protected by task->mems_allowed_seq to prevent a premature
 373 * OOM/allocation failure due to parallel nodemask modification.
 374 */
 375static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 376{
 377        if (!pol)
 378                return;
 379        if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 380            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 381                return;
 382
 383        mpol_ops[pol->mode].rebind(pol, newmask);
 384}
 385
 386/*
 387 * Wrapper for mpol_rebind_policy() that just requires task
 388 * pointer, and updates task mempolicy.
 389 *
 390 * Called with task's alloc_lock held.
 391 */
 392
 393void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 394{
 395        mpol_rebind_policy(tsk->mempolicy, new);
 396}
 397
 398/*
 399 * Rebind each vma in mm to new nodemask.
 400 *
 401 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 402 */
 403
 404void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 405{
 406        struct vm_area_struct *vma;
 407
 408        mmap_write_lock(mm);
 409        for (vma = mm->mmap; vma; vma = vma->vm_next)
 410                mpol_rebind_policy(vma->vm_policy, new);
 411        mmap_write_unlock(mm);
 412}
 413
 414static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 415        [MPOL_DEFAULT] = {
 416                .rebind = mpol_rebind_default,
 417        },
 418        [MPOL_INTERLEAVE] = {
 419                .create = mpol_new_interleave,
 420                .rebind = mpol_rebind_nodemask,
 421        },
 422        [MPOL_PREFERRED] = {
 423                .create = mpol_new_preferred,
 424                .rebind = mpol_rebind_preferred,
 425        },
 426        [MPOL_BIND] = {
 427                .create = mpol_new_bind,
 428                .rebind = mpol_rebind_nodemask,
 429        },
 430};
 431
 432static int migrate_page_add(struct page *page, struct list_head *pagelist,
 433                                unsigned long flags);
 434
 435struct queue_pages {
 436        struct list_head *pagelist;
 437        unsigned long flags;
 438        nodemask_t *nmask;
 439        unsigned long start;
 440        unsigned long end;
 441        struct vm_area_struct *first;
 442};
 443
 444/*
 445 * Check if the page's nid is in qp->nmask.
 446 *
 447 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 448 * in the invert of qp->nmask.
 449 */
 450static inline bool queue_pages_required(struct page *page,
 451                                        struct queue_pages *qp)
 452{
 453        int nid = page_to_nid(page);
 454        unsigned long flags = qp->flags;
 455
 456        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 457}
 458
 459/*
 460 * queue_pages_pmd() has four possible return values:
 461 * 0 - pages are placed on the right node or queued successfully.
 462 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 463 *     specified.
 464 * 2 - THP was split.
 465 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 466 *        existing page was already on a node that does not follow the
 467 *        policy.
 468 */
 469static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 470                                unsigned long end, struct mm_walk *walk)
 471        __releases(ptl)
 472{
 473        int ret = 0;
 474        struct page *page;
 475        struct queue_pages *qp = walk->private;
 476        unsigned long flags;
 477
 478        if (unlikely(is_pmd_migration_entry(*pmd))) {
 479                ret = -EIO;
 480                goto unlock;
 481        }
 482        page = pmd_page(*pmd);
 483        if (is_huge_zero_page(page)) {
 484                spin_unlock(ptl);
 485                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 486                ret = 2;
 487                goto out;
 488        }
 489        if (!queue_pages_required(page, qp))
 490                goto unlock;
 491
 492        flags = qp->flags;
 493        /* go to thp migration */
 494        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 495                if (!vma_migratable(walk->vma) ||
 496                    migrate_page_add(page, qp->pagelist, flags)) {
 497                        ret = 1;
 498                        goto unlock;
 499                }
 500        } else
 501                ret = -EIO;
 502unlock:
 503        spin_unlock(ptl);
 504out:
 505        return ret;
 506}
 507
 508/*
 509 * Scan through pages checking if pages follow certain conditions,
 510 * and move them to the pagelist if they do.
 511 *
 512 * queue_pages_pte_range() has three possible return values:
 513 * 0 - pages are placed on the right node or queued successfully.
 514 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 515 *     specified.
 516 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 517 *        on a node that does not follow the policy.
 518 */
 519static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 520                        unsigned long end, struct mm_walk *walk)
 521{
 522        struct vm_area_struct *vma = walk->vma;
 523        struct page *page;
 524        struct queue_pages *qp = walk->private;
 525        unsigned long flags = qp->flags;
 526        int ret;
 527        bool has_unmovable = false;
 528        pte_t *pte, *mapped_pte;
 529        spinlock_t *ptl;
 530
 531        ptl = pmd_trans_huge_lock(pmd, vma);
 532        if (ptl) {
 533                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 534                if (ret != 2)
 535                        return ret;
 536        }
 537        /* THP was split, fall through to pte walk */
 538
 539        if (pmd_trans_unstable(pmd))
 540                return 0;
 541
 542        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 543        for (; addr != end; pte++, addr += PAGE_SIZE) {
 544                if (!pte_present(*pte))
 545                        continue;
 546                page = vm_normal_page(vma, addr, *pte);
 547                if (!page)
 548                        continue;
 549                /*
 550                 * vm_normal_page() filters out zero pages, but there might
 551                 * still be PageReserved pages to skip, perhaps in a VDSO.
 552                 */
 553                if (PageReserved(page))
 554                        continue;
 555                if (!queue_pages_required(page, qp))
 556                        continue;
 557                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 558                        /* MPOL_MF_STRICT must be specified if we get here */
 559                        if (!vma_migratable(vma)) {
 560                                has_unmovable = true;
 561                                break;
 562                        }
 563
 564                        /*
 565                         * Do not abort immediately since there may be
 566                         * temporary off LRU pages in the range.  Still
 567                         * need migrate other LRU pages.
 568                         */
 569                        if (migrate_page_add(page, qp->pagelist, flags))
 570                                has_unmovable = true;
 571                } else
 572                        break;
 573        }
 574        pte_unmap_unlock(mapped_pte, ptl);
 575        cond_resched();
 576
 577        if (has_unmovable)
 578                return 1;
 579
 580        return addr != end ? -EIO : 0;
 581}
 582
 583static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 584                               unsigned long addr, unsigned long end,
 585                               struct mm_walk *walk)
 586{
 587        int ret = 0;
 588#ifdef CONFIG_HUGETLB_PAGE
 589        struct queue_pages *qp = walk->private;
 590        unsigned long flags = (qp->flags & MPOL_MF_VALID);
 591        struct page *page;
 592        spinlock_t *ptl;
 593        pte_t entry;
 594
 595        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 596        entry = huge_ptep_get(pte);
 597        if (!pte_present(entry))
 598                goto unlock;
 599        page = pte_page(entry);
 600        if (!queue_pages_required(page, qp))
 601                goto unlock;
 602
 603        if (flags == MPOL_MF_STRICT) {
 604                /*
 605                 * STRICT alone means only detecting misplaced page and no
 606                 * need to further check other vma.
 607                 */
 608                ret = -EIO;
 609                goto unlock;
 610        }
 611
 612        if (!vma_migratable(walk->vma)) {
 613                /*
 614                 * Must be STRICT with MOVE*, otherwise .test_walk() have
 615                 * stopped walking current vma.
 616                 * Detecting misplaced page but allow migrating pages which
 617                 * have been queued.
 618                 */
 619                ret = 1;
 620                goto unlock;
 621        }
 622
 623        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 624        if (flags & (MPOL_MF_MOVE_ALL) ||
 625            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
 626                if (!isolate_huge_page(page, qp->pagelist) &&
 627                        (flags & MPOL_MF_STRICT))
 628                        /*
 629                         * Failed to isolate page but allow migrating pages
 630                         * which have been queued.
 631                         */
 632                        ret = 1;
 633        }
 634unlock:
 635        spin_unlock(ptl);
 636#else
 637        BUG();
 638#endif
 639        return ret;
 640}
 641
 642#ifdef CONFIG_NUMA_BALANCING
 643/*
 644 * This is used to mark a range of virtual addresses to be inaccessible.
 645 * These are later cleared by a NUMA hinting fault. Depending on these
 646 * faults, pages may be migrated for better NUMA placement.
 647 *
 648 * This is assuming that NUMA faults are handled using PROT_NONE. If
 649 * an architecture makes a different choice, it will need further
 650 * changes to the core.
 651 */
 652unsigned long change_prot_numa(struct vm_area_struct *vma,
 653                        unsigned long addr, unsigned long end)
 654{
 655        int nr_updated;
 656
 657        nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 658        if (nr_updated)
 659                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 660
 661        return nr_updated;
 662}
 663#else
 664static unsigned long change_prot_numa(struct vm_area_struct *vma,
 665                        unsigned long addr, unsigned long end)
 666{
 667        return 0;
 668}
 669#endif /* CONFIG_NUMA_BALANCING */
 670
 671static int queue_pages_test_walk(unsigned long start, unsigned long end,
 672                                struct mm_walk *walk)
 673{
 674        struct vm_area_struct *vma = walk->vma;
 675        struct queue_pages *qp = walk->private;
 676        unsigned long endvma = vma->vm_end;
 677        unsigned long flags = qp->flags;
 678
 679        /* range check first */
 680        VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
 681
 682        if (!qp->first) {
 683                qp->first = vma;
 684                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 685                        (qp->start < vma->vm_start))
 686                        /* hole at head side of range */
 687                        return -EFAULT;
 688        }
 689        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 690                ((vma->vm_end < qp->end) &&
 691                (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
 692                /* hole at middle or tail of range */
 693                return -EFAULT;
 694
 695        /*
 696         * Need check MPOL_MF_STRICT to return -EIO if possible
 697         * regardless of vma_migratable
 698         */
 699        if (!vma_migratable(vma) &&
 700            !(flags & MPOL_MF_STRICT))
 701                return 1;
 702
 703        if (endvma > end)
 704                endvma = end;
 705
 706        if (flags & MPOL_MF_LAZY) {
 707                /* Similar to task_numa_work, skip inaccessible VMAs */
 708                if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 709                        !(vma->vm_flags & VM_MIXEDMAP))
 710                        change_prot_numa(vma, start, endvma);
 711                return 1;
 712        }
 713
 714        /* queue pages from current vma */
 715        if (flags & MPOL_MF_VALID)
 716                return 0;
 717        return 1;
 718}
 719
 720static const struct mm_walk_ops queue_pages_walk_ops = {
 721        .hugetlb_entry          = queue_pages_hugetlb,
 722        .pmd_entry              = queue_pages_pte_range,
 723        .test_walk              = queue_pages_test_walk,
 724};
 725
 726/*
 727 * Walk through page tables and collect pages to be migrated.
 728 *
 729 * If pages found in a given range are on a set of nodes (determined by
 730 * @nodes and @flags,) it's isolated and queued to the pagelist which is
 731 * passed via @private.
 732 *
 733 * queue_pages_range() has three possible return values:
 734 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 735 *     specified.
 736 * 0 - queue pages successfully or no misplaced page.
 737 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 738 *         memory range specified by nodemask and maxnode points outside
 739 *         your accessible address space (-EFAULT)
 740 */
 741static int
 742queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 743                nodemask_t *nodes, unsigned long flags,
 744                struct list_head *pagelist)
 745{
 746        int err;
 747        struct queue_pages qp = {
 748                .pagelist = pagelist,
 749                .flags = flags,
 750                .nmask = nodes,
 751                .start = start,
 752                .end = end,
 753                .first = NULL,
 754        };
 755
 756        err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 757
 758        if (!qp.first)
 759                /* whole range in hole */
 760                err = -EFAULT;
 761
 762        return err;
 763}
 764
 765/*
 766 * Apply policy to a single VMA
 767 * This must be called with the mmap_lock held for writing.
 768 */
 769static int vma_replace_policy(struct vm_area_struct *vma,
 770                                                struct mempolicy *pol)
 771{
 772        int err;
 773        struct mempolicy *old;
 774        struct mempolicy *new;
 775
 776        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 777                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 778                 vma->vm_ops, vma->vm_file,
 779                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 780
 781        new = mpol_dup(pol);
 782        if (IS_ERR(new))
 783                return PTR_ERR(new);
 784
 785        if (vma->vm_ops && vma->vm_ops->set_policy) {
 786                err = vma->vm_ops->set_policy(vma, new);
 787                if (err)
 788                        goto err_out;
 789        }
 790
 791        old = vma->vm_policy;
 792        vma->vm_policy = new; /* protected by mmap_lock */
 793        mpol_put(old);
 794
 795        return 0;
 796 err_out:
 797        mpol_put(new);
 798        return err;
 799}
 800
 801/* Step 2: apply policy to a range and do splits. */
 802static int mbind_range(struct mm_struct *mm, unsigned long start,
 803                       unsigned long end, struct mempolicy *new_pol)
 804{
 805        struct vm_area_struct *next;
 806        struct vm_area_struct *prev;
 807        struct vm_area_struct *vma;
 808        int err = 0;
 809        pgoff_t pgoff;
 810        unsigned long vmstart;
 811        unsigned long vmend;
 812
 813        vma = find_vma(mm, start);
 814        VM_BUG_ON(!vma);
 815
 816        prev = vma->vm_prev;
 817        if (start > vma->vm_start)
 818                prev = vma;
 819
 820        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 821                next = vma->vm_next;
 822                vmstart = max(start, vma->vm_start);
 823                vmend   = min(end, vma->vm_end);
 824
 825                if (mpol_equal(vma_policy(vma), new_pol))
 826                        continue;
 827
 828                pgoff = vma->vm_pgoff +
 829                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 830                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 831                                 vma->anon_vma, vma->vm_file, pgoff,
 832                                 new_pol, vma->vm_userfaultfd_ctx);
 833                if (prev) {
 834                        vma = prev;
 835                        next = vma->vm_next;
 836                        if (mpol_equal(vma_policy(vma), new_pol))
 837                                continue;
 838                        /* vma_merge() joined vma && vma->next, case 8 */
 839                        goto replace;
 840                }
 841                if (vma->vm_start != vmstart) {
 842                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 843                        if (err)
 844                                goto out;
 845                }
 846                if (vma->vm_end != vmend) {
 847                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 848                        if (err)
 849                                goto out;
 850                }
 851 replace:
 852                err = vma_replace_policy(vma, new_pol);
 853                if (err)
 854                        goto out;
 855        }
 856
 857 out:
 858        return err;
 859}
 860
 861/* Set the process memory policy */
 862static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 863                             nodemask_t *nodes)
 864{
 865        struct mempolicy *new, *old;
 866        NODEMASK_SCRATCH(scratch);
 867        int ret;
 868
 869        if (!scratch)
 870                return -ENOMEM;
 871
 872        new = mpol_new(mode, flags, nodes);
 873        if (IS_ERR(new)) {
 874                ret = PTR_ERR(new);
 875                goto out;
 876        }
 877
 878        if (flags & MPOL_F_NUMA_BALANCING) {
 879                if (new && new->mode == MPOL_BIND) {
 880                        new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
 881                } else {
 882                        ret = -EINVAL;
 883                        mpol_put(new);
 884                        goto out;
 885                }
 886        }
 887
 888        ret = mpol_set_nodemask(new, nodes, scratch);
 889        if (ret) {
 890                mpol_put(new);
 891                goto out;
 892        }
 893        task_lock(current);
 894        old = current->mempolicy;
 895        current->mempolicy = new;
 896        if (new && new->mode == MPOL_INTERLEAVE)
 897                current->il_prev = MAX_NUMNODES-1;
 898        task_unlock(current);
 899        mpol_put(old);
 900        ret = 0;
 901out:
 902        NODEMASK_SCRATCH_FREE(scratch);
 903        return ret;
 904}
 905
 906/*
 907 * Return nodemask for policy for get_mempolicy() query
 908 *
 909 * Called with task's alloc_lock held
 910 */
 911static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 912{
 913        nodes_clear(*nodes);
 914        if (p == &default_policy)
 915                return;
 916
 917        switch (p->mode) {
 918        case MPOL_BIND:
 919        case MPOL_INTERLEAVE:
 920                *nodes = p->v.nodes;
 921                break;
 922        case MPOL_PREFERRED:
 923                if (!(p->flags & MPOL_F_LOCAL))
 924                        node_set(p->v.preferred_node, *nodes);
 925                /* else return empty node mask for local allocation */
 926                break;
 927        default:
 928                BUG();
 929        }
 930}
 931
 932static int lookup_node(struct mm_struct *mm, unsigned long addr)
 933{
 934        struct page *p = NULL;
 935        int err;
 936
 937        int locked = 1;
 938        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 939        if (err > 0) {
 940                err = page_to_nid(p);
 941                put_page(p);
 942        }
 943        if (locked)
 944                mmap_read_unlock(mm);
 945        return err;
 946}
 947
 948/* Retrieve NUMA policy */
 949static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 950                             unsigned long addr, unsigned long flags)
 951{
 952        int err;
 953        struct mm_struct *mm = current->mm;
 954        struct vm_area_struct *vma = NULL;
 955        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 956
 957        if (flags &
 958                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 959                return -EINVAL;
 960
 961        if (flags & MPOL_F_MEMS_ALLOWED) {
 962                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 963                        return -EINVAL;
 964                *policy = 0;    /* just so it's initialized */
 965                task_lock(current);
 966                *nmask  = cpuset_current_mems_allowed;
 967                task_unlock(current);
 968                return 0;
 969        }
 970
 971        if (flags & MPOL_F_ADDR) {
 972                /*
 973                 * Do NOT fall back to task policy if the
 974                 * vma/shared policy at addr is NULL.  We
 975                 * want to return MPOL_DEFAULT in this case.
 976                 */
 977                mmap_read_lock(mm);
 978                vma = find_vma_intersection(mm, addr, addr+1);
 979                if (!vma) {
 980                        mmap_read_unlock(mm);
 981                        return -EFAULT;
 982                }
 983                if (vma->vm_ops && vma->vm_ops->get_policy)
 984                        pol = vma->vm_ops->get_policy(vma, addr);
 985                else
 986                        pol = vma->vm_policy;
 987        } else if (addr)
 988                return -EINVAL;
 989
 990        if (!pol)
 991                pol = &default_policy;  /* indicates default behavior */
 992
 993        if (flags & MPOL_F_NODE) {
 994                if (flags & MPOL_F_ADDR) {
 995                        /*
 996                         * Take a refcount on the mpol, lookup_node()
 997                         * will drop the mmap_lock, so after calling
 998                         * lookup_node() only "pol" remains valid, "vma"
 999                         * is stale.
1000                         */
1001                        pol_refcount = pol;
1002                        vma = NULL;
1003                        mpol_get(pol);
1004                        err = lookup_node(mm, addr);
1005                        if (err < 0)
1006                                goto out;
1007                        *policy = err;
1008                } else if (pol == current->mempolicy &&
1009                                pol->mode == MPOL_INTERLEAVE) {
1010                        *policy = next_node_in(current->il_prev, pol->v.nodes);
1011                } else {
1012                        err = -EINVAL;
1013                        goto out;
1014                }
1015        } else {
1016                *policy = pol == &default_policy ? MPOL_DEFAULT :
1017                                                pol->mode;
1018                /*
1019                 * Internal mempolicy flags must be masked off before exposing
1020                 * the policy to userspace.
1021                 */
1022                *policy |= (pol->flags & MPOL_MODE_FLAGS);
1023        }
1024
1025        err = 0;
1026        if (nmask) {
1027                if (mpol_store_user_nodemask(pol)) {
1028                        *nmask = pol->w.user_nodemask;
1029                } else {
1030                        task_lock(current);
1031                        get_policy_nodemask(pol, nmask);
1032                        task_unlock(current);
1033                }
1034        }
1035
1036 out:
1037        mpol_cond_put(pol);
1038        if (vma)
1039                mmap_read_unlock(mm);
1040        if (pol_refcount)
1041                mpol_put(pol_refcount);
1042        return err;
1043}
1044
1045#ifdef CONFIG_MIGRATION
1046/*
1047 * page migration, thp tail pages can be passed.
1048 */
1049static int migrate_page_add(struct page *page, struct list_head *pagelist,
1050                                unsigned long flags)
1051{
1052        struct page *head = compound_head(page);
1053        /*
1054         * Avoid migrating a page that is shared with others.
1055         */
1056        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1057                if (!isolate_lru_page(head)) {
1058                        list_add_tail(&head->lru, pagelist);
1059                        mod_node_page_state(page_pgdat(head),
1060                                NR_ISOLATED_ANON + page_is_file_lru(head),
1061                                thp_nr_pages(head));
1062                } else if (flags & MPOL_MF_STRICT) {
1063                        /*
1064                         * Non-movable page may reach here.  And, there may be
1065                         * temporary off LRU pages or non-LRU movable pages.
1066                         * Treat them as unmovable pages since they can't be
1067                         * isolated, so they can't be moved at the moment.  It
1068                         * should return -EIO for this case too.
1069                         */
1070                        return -EIO;
1071                }
1072        }
1073
1074        return 0;
1075}
1076
1077/*
1078 * Migrate pages from one node to a target node.
1079 * Returns error or the number of pages not migrated.
1080 */
1081static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1082                           int flags)
1083{
1084        nodemask_t nmask;
1085        LIST_HEAD(pagelist);
1086        int err = 0;
1087        struct migration_target_control mtc = {
1088                .nid = dest,
1089                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1090        };
1091
1092        nodes_clear(nmask);
1093        node_set(source, nmask);
1094
1095        /*
1096         * This does not "check" the range but isolates all pages that
1097         * need migration.  Between passing in the full user address
1098         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1099         */
1100        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1101        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1102                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1103
1104        if (!list_empty(&pagelist)) {
1105                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1106                                (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1107                if (err)
1108                        putback_movable_pages(&pagelist);
1109        }
1110
1111        return err;
1112}
1113
1114/*
1115 * Move pages between the two nodesets so as to preserve the physical
1116 * layout as much as possible.
1117 *
1118 * Returns the number of page that could not be moved.
1119 */
1120int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1121                     const nodemask_t *to, int flags)
1122{
1123        int busy = 0;
1124        int err = 0;
1125        nodemask_t tmp;
1126
1127        lru_cache_disable();
1128
1129        mmap_read_lock(mm);
1130
1131        /*
1132         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1133         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1134         * bit in 'tmp', and return that <source, dest> pair for migration.
1135         * The pair of nodemasks 'to' and 'from' define the map.
1136         *
1137         * If no pair of bits is found that way, fallback to picking some
1138         * pair of 'source' and 'dest' bits that are not the same.  If the
1139         * 'source' and 'dest' bits are the same, this represents a node
1140         * that will be migrating to itself, so no pages need move.
1141         *
1142         * If no bits are left in 'tmp', or if all remaining bits left
1143         * in 'tmp' correspond to the same bit in 'to', return false
1144         * (nothing left to migrate).
1145         *
1146         * This lets us pick a pair of nodes to migrate between, such that
1147         * if possible the dest node is not already occupied by some other
1148         * source node, minimizing the risk of overloading the memory on a
1149         * node that would happen if we migrated incoming memory to a node
1150         * before migrating outgoing memory source that same node.
1151         *
1152         * A single scan of tmp is sufficient.  As we go, we remember the
1153         * most recent <s, d> pair that moved (s != d).  If we find a pair
1154         * that not only moved, but what's better, moved to an empty slot
1155         * (d is not set in tmp), then we break out then, with that pair.
1156         * Otherwise when we finish scanning from_tmp, we at least have the
1157         * most recent <s, d> pair that moved.  If we get all the way through
1158         * the scan of tmp without finding any node that moved, much less
1159         * moved to an empty node, then there is nothing left worth migrating.
1160         */
1161
1162        tmp = *from;
1163        while (!nodes_empty(tmp)) {
1164                int s, d;
1165                int source = NUMA_NO_NODE;
1166                int dest = 0;
1167
1168                for_each_node_mask(s, tmp) {
1169
1170                        /*
1171                         * do_migrate_pages() tries to maintain the relative
1172                         * node relationship of the pages established between
1173                         * threads and memory areas.
1174                         *
1175                         * However if the number of source nodes is not equal to
1176                         * the number of destination nodes we can not preserve
1177                         * this node relative relationship.  In that case, skip
1178                         * copying memory from a node that is in the destination
1179                         * mask.
1180                         *
1181                         * Example: [2,3,4] -> [3,4,5] moves everything.
1182                         *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1183                         */
1184
1185                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1186                                                (node_isset(s, *to)))
1187                                continue;
1188
1189                        d = node_remap(s, *from, *to);
1190                        if (s == d)
1191                                continue;
1192
1193                        source = s;     /* Node moved. Memorize */
1194                        dest = d;
1195
1196                        /* dest not in remaining from nodes? */
1197                        if (!node_isset(dest, tmp))
1198                                break;
1199                }
1200                if (source == NUMA_NO_NODE)
1201                        break;
1202
1203                node_clear(source, tmp);
1204                err = migrate_to_node(mm, source, dest, flags);
1205                if (err > 0)
1206                        busy += err;
1207                if (err < 0)
1208                        break;
1209        }
1210        mmap_read_unlock(mm);
1211
1212        lru_cache_enable();
1213        if (err < 0)
1214                return err;
1215        return busy;
1216
1217}
1218
1219/*
1220 * Allocate a new page for page migration based on vma policy.
1221 * Start by assuming the page is mapped by the same vma as contains @start.
1222 * Search forward from there, if not.  N.B., this assumes that the
1223 * list of pages handed to migrate_pages()--which is how we get here--
1224 * is in virtual address order.
1225 */
1226static struct page *new_page(struct page *page, unsigned long start)
1227{
1228        struct vm_area_struct *vma;
1229        unsigned long address;
1230
1231        vma = find_vma(current->mm, start);
1232        while (vma) {
1233                address = page_address_in_vma(page, vma);
1234                if (address != -EFAULT)
1235                        break;
1236                vma = vma->vm_next;
1237        }
1238
1239        if (PageHuge(page)) {
1240                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1241                                vma, address);
1242        } else if (PageTransHuge(page)) {
1243                struct page *thp;
1244
1245                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1246                                         HPAGE_PMD_ORDER);
1247                if (!thp)
1248                        return NULL;
1249                prep_transhuge_page(thp);
1250                return thp;
1251        }
1252        /*
1253         * if !vma, alloc_page_vma() will use task or system default policy
1254         */
1255        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1256                        vma, address);
1257}
1258#else
1259
1260static int migrate_page_add(struct page *page, struct list_head *pagelist,
1261                                unsigned long flags)
1262{
1263        return -EIO;
1264}
1265
1266int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1267                     const nodemask_t *to, int flags)
1268{
1269        return -ENOSYS;
1270}
1271
1272static struct page *new_page(struct page *page, unsigned long start)
1273{
1274        return NULL;
1275}
1276#endif
1277
1278static long do_mbind(unsigned long start, unsigned long len,
1279                     unsigned short mode, unsigned short mode_flags,
1280                     nodemask_t *nmask, unsigned long flags)
1281{
1282        struct mm_struct *mm = current->mm;
1283        struct mempolicy *new;
1284        unsigned long end;
1285        int err;
1286        int ret;
1287        LIST_HEAD(pagelist);
1288
1289        if (flags & ~(unsigned long)MPOL_MF_VALID)
1290                return -EINVAL;
1291        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1292                return -EPERM;
1293
1294        if (start & ~PAGE_MASK)
1295                return -EINVAL;
1296
1297        if (mode == MPOL_DEFAULT)
1298                flags &= ~MPOL_MF_STRICT;
1299
1300        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1301        end = start + len;
1302
1303        if (end < start)
1304                return -EINVAL;
1305        if (end == start)
1306                return 0;
1307
1308        new = mpol_new(mode, mode_flags, nmask);
1309        if (IS_ERR(new))
1310                return PTR_ERR(new);
1311
1312        if (flags & MPOL_MF_LAZY)
1313                new->flags |= MPOL_F_MOF;
1314
1315        /*
1316         * If we are using the default policy then operation
1317         * on discontinuous address spaces is okay after all
1318         */
1319        if (!new)
1320                flags |= MPOL_MF_DISCONTIG_OK;
1321
1322        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1323                 start, start + len, mode, mode_flags,
1324                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1325
1326        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1327
1328                lru_cache_disable();
1329        }
1330        {
1331                NODEMASK_SCRATCH(scratch);
1332                if (scratch) {
1333                        mmap_write_lock(mm);
1334                        err = mpol_set_nodemask(new, nmask, scratch);
1335                        if (err)
1336                                mmap_write_unlock(mm);
1337                } else
1338                        err = -ENOMEM;
1339                NODEMASK_SCRATCH_FREE(scratch);
1340        }
1341        if (err)
1342                goto mpol_out;
1343
1344        ret = queue_pages_range(mm, start, end, nmask,
1345                          flags | MPOL_MF_INVERT, &pagelist);
1346
1347        if (ret < 0) {
1348                err = ret;
1349                goto up_out;
1350        }
1351
1352        err = mbind_range(mm, start, end, new);
1353
1354        if (!err) {
1355                int nr_failed = 0;
1356
1357                if (!list_empty(&pagelist)) {
1358                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1359                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1360                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1361                        if (nr_failed)
1362                                putback_movable_pages(&pagelist);
1363                }
1364
1365                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1366                        err = -EIO;
1367        } else {
1368up_out:
1369                if (!list_empty(&pagelist))
1370                        putback_movable_pages(&pagelist);
1371        }
1372
1373        mmap_write_unlock(mm);
1374mpol_out:
1375        mpol_put(new);
1376        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1377                lru_cache_enable();
1378        return err;
1379}
1380
1381/*
1382 * User space interface with variable sized bitmaps for nodelists.
1383 */
1384
1385/* Copy a node mask from user space. */
1386static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1387                     unsigned long maxnode)
1388{
1389        unsigned long k;
1390        unsigned long t;
1391        unsigned long nlongs;
1392        unsigned long endmask;
1393
1394        --maxnode;
1395        nodes_clear(*nodes);
1396        if (maxnode == 0 || !nmask)
1397                return 0;
1398        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1399                return -EINVAL;
1400
1401        nlongs = BITS_TO_LONGS(maxnode);
1402        if ((maxnode % BITS_PER_LONG) == 0)
1403                endmask = ~0UL;
1404        else
1405                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1406
1407        /*
1408         * When the user specified more nodes than supported just check
1409         * if the non supported part is all zero.
1410         *
1411         * If maxnode have more longs than MAX_NUMNODES, check
1412         * the bits in that area first. And then go through to
1413         * check the rest bits which equal or bigger than MAX_NUMNODES.
1414         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1415         */
1416        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1417                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1418                        if (get_user(t, nmask + k))
1419                                return -EFAULT;
1420                        if (k == nlongs - 1) {
1421                                if (t & endmask)
1422                                        return -EINVAL;
1423                        } else if (t)
1424                                return -EINVAL;
1425                }
1426                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1427                endmask = ~0UL;
1428        }
1429
1430        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1431                unsigned long valid_mask = endmask;
1432
1433                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1434                if (get_user(t, nmask + nlongs - 1))
1435                        return -EFAULT;
1436                if (t & valid_mask)
1437                        return -EINVAL;
1438        }
1439
1440        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1441                return -EFAULT;
1442        nodes_addr(*nodes)[nlongs-1] &= endmask;
1443        return 0;
1444}
1445
1446/* Copy a kernel node mask to user space */
1447static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1448                              nodemask_t *nodes)
1449{
1450        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1451        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1452
1453        if (copy > nbytes) {
1454                if (copy > PAGE_SIZE)
1455                        return -EINVAL;
1456                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1457                        return -EFAULT;
1458                copy = nbytes;
1459        }
1460        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1461}
1462
1463static long kernel_mbind(unsigned long start, unsigned long len,
1464                         unsigned long mode, const unsigned long __user *nmask,
1465                         unsigned long maxnode, unsigned int flags)
1466{
1467        nodemask_t nodes;
1468        int err;
1469        unsigned short mode_flags;
1470
1471        start = untagged_addr(start);
1472        mode_flags = mode & MPOL_MODE_FLAGS;
1473        mode &= ~MPOL_MODE_FLAGS;
1474        if (mode >= MPOL_MAX)
1475                return -EINVAL;
1476        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1477            (mode_flags & MPOL_F_RELATIVE_NODES))
1478                return -EINVAL;
1479        err = get_nodes(&nodes, nmask, maxnode);
1480        if (err)
1481                return err;
1482        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1483}
1484
1485SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1486                unsigned long, mode, const unsigned long __user *, nmask,
1487                unsigned long, maxnode, unsigned int, flags)
1488{
1489        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1490}
1491
1492/* Set the process memory policy */
1493static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1494                                 unsigned long maxnode)
1495{
1496        int err;
1497        nodemask_t nodes;
1498        unsigned short flags;
1499
1500        flags = mode & MPOL_MODE_FLAGS;
1501        mode &= ~MPOL_MODE_FLAGS;
1502        if ((unsigned int)mode >= MPOL_MAX)
1503                return -EINVAL;
1504        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1505                return -EINVAL;
1506        err = get_nodes(&nodes, nmask, maxnode);
1507        if (err)
1508                return err;
1509        return do_set_mempolicy(mode, flags, &nodes);
1510}
1511
1512SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1513                unsigned long, maxnode)
1514{
1515        return kernel_set_mempolicy(mode, nmask, maxnode);
1516}
1517
1518static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1519                                const unsigned long __user *old_nodes,
1520                                const unsigned long __user *new_nodes)
1521{
1522        struct mm_struct *mm = NULL;
1523        struct task_struct *task;
1524        nodemask_t task_nodes;
1525        int err;
1526        nodemask_t *old;
1527        nodemask_t *new;
1528        NODEMASK_SCRATCH(scratch);
1529
1530        if (!scratch)
1531                return -ENOMEM;
1532
1533        old = &scratch->mask1;
1534        new = &scratch->mask2;
1535
1536        err = get_nodes(old, old_nodes, maxnode);
1537        if (err)
1538                goto out;
1539
1540        err = get_nodes(new, new_nodes, maxnode);
1541        if (err)
1542                goto out;
1543
1544        /* Find the mm_struct */
1545        rcu_read_lock();
1546        task = pid ? find_task_by_vpid(pid) : current;
1547        if (!task) {
1548                rcu_read_unlock();
1549                err = -ESRCH;
1550                goto out;
1551        }
1552        get_task_struct(task);
1553
1554        err = -EINVAL;
1555
1556        /*
1557         * Check if this process has the right to modify the specified process.
1558         * Use the regular "ptrace_may_access()" checks.
1559         */
1560        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1561                rcu_read_unlock();
1562                err = -EPERM;
1563                goto out_put;
1564        }
1565        rcu_read_unlock();
1566
1567        task_nodes = cpuset_mems_allowed(task);
1568        /* Is the user allowed to access the target nodes? */
1569        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1570                err = -EPERM;
1571                goto out_put;
1572        }
1573
1574        task_nodes = cpuset_mems_allowed(current);
1575        nodes_and(*new, *new, task_nodes);
1576        if (nodes_empty(*new))
1577                goto out_put;
1578
1579        err = security_task_movememory(task);
1580        if (err)
1581                goto out_put;
1582
1583        mm = get_task_mm(task);
1584        put_task_struct(task);
1585
1586        if (!mm) {
1587                err = -EINVAL;
1588                goto out;
1589        }
1590
1591        err = do_migrate_pages(mm, old, new,
1592                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1593
1594        mmput(mm);
1595out:
1596        NODEMASK_SCRATCH_FREE(scratch);
1597
1598        return err;
1599
1600out_put:
1601        put_task_struct(task);
1602        goto out;
1603
1604}
1605
1606SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1607                const unsigned long __user *, old_nodes,
1608                const unsigned long __user *, new_nodes)
1609{
1610        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1611}
1612
1613
1614/* Retrieve NUMA policy */
1615static int kernel_get_mempolicy(int __user *policy,
1616                                unsigned long __user *nmask,
1617                                unsigned long maxnode,
1618                                unsigned long addr,
1619                                unsigned long flags)
1620{
1621        int err;
1622        int pval;
1623        nodemask_t nodes;
1624
1625        if (nmask != NULL && maxnode < nr_node_ids)
1626                return -EINVAL;
1627
1628        addr = untagged_addr(addr);
1629
1630        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1631
1632        if (err)
1633                return err;
1634
1635        if (policy && put_user(pval, policy))
1636                return -EFAULT;
1637
1638        if (nmask)
1639                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1640
1641        return err;
1642}
1643
1644SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1645                unsigned long __user *, nmask, unsigned long, maxnode,
1646                unsigned long, addr, unsigned long, flags)
1647{
1648        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1649}
1650
1651#ifdef CONFIG_COMPAT
1652
1653COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1654                       compat_ulong_t __user *, nmask,
1655                       compat_ulong_t, maxnode,
1656                       compat_ulong_t, addr, compat_ulong_t, flags)
1657{
1658        long err;
1659        unsigned long __user *nm = NULL;
1660        unsigned long nr_bits, alloc_size;
1661        DECLARE_BITMAP(bm, MAX_NUMNODES);
1662
1663        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1664        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1665
1666        if (nmask)
1667                nm = compat_alloc_user_space(alloc_size);
1668
1669        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1670
1671        if (!err && nmask) {
1672                unsigned long copy_size;
1673                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1674                err = copy_from_user(bm, nm, copy_size);
1675                /* ensure entire bitmap is zeroed */
1676                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1677                err |= compat_put_bitmap(nmask, bm, nr_bits);
1678        }
1679
1680        return err;
1681}
1682
1683COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1684                       compat_ulong_t, maxnode)
1685{
1686        unsigned long __user *nm = NULL;
1687        unsigned long nr_bits, alloc_size;
1688        DECLARE_BITMAP(bm, MAX_NUMNODES);
1689
1690        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1691        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1692
1693        if (nmask) {
1694                if (compat_get_bitmap(bm, nmask, nr_bits))
1695                        return -EFAULT;
1696                nm = compat_alloc_user_space(alloc_size);
1697                if (copy_to_user(nm, bm, alloc_size))
1698                        return -EFAULT;
1699        }
1700
1701        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1702}
1703
1704COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1705                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1706                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1707{
1708        unsigned long __user *nm = NULL;
1709        unsigned long nr_bits, alloc_size;
1710        nodemask_t bm;
1711
1712        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1713        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1714
1715        if (nmask) {
1716                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1717                        return -EFAULT;
1718                nm = compat_alloc_user_space(alloc_size);
1719                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1720                        return -EFAULT;
1721        }
1722
1723        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1724}
1725
1726COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1727                       compat_ulong_t, maxnode,
1728                       const compat_ulong_t __user *, old_nodes,
1729                       const compat_ulong_t __user *, new_nodes)
1730{
1731        unsigned long __user *old = NULL;
1732        unsigned long __user *new = NULL;
1733        nodemask_t tmp_mask;
1734        unsigned long nr_bits;
1735        unsigned long size;
1736
1737        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1738        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1739        if (old_nodes) {
1740                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1741                        return -EFAULT;
1742                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1743                if (new_nodes)
1744                        new = old + size / sizeof(unsigned long);
1745                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1746                        return -EFAULT;
1747        }
1748        if (new_nodes) {
1749                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1750                        return -EFAULT;
1751                if (new == NULL)
1752                        new = compat_alloc_user_space(size);
1753                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1754                        return -EFAULT;
1755        }
1756        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1757}
1758
1759#endif /* CONFIG_COMPAT */
1760
1761bool vma_migratable(struct vm_area_struct *vma)
1762{
1763        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1764                return false;
1765
1766        /*
1767         * DAX device mappings require predictable access latency, so avoid
1768         * incurring periodic faults.
1769         */
1770        if (vma_is_dax(vma))
1771                return false;
1772
1773        if (is_vm_hugetlb_page(vma) &&
1774                !hugepage_migration_supported(hstate_vma(vma)))
1775                return false;
1776
1777        /*
1778         * Migration allocates pages in the highest zone. If we cannot
1779         * do so then migration (at least from node to node) is not
1780         * possible.
1781         */
1782        if (vma->vm_file &&
1783                gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1784                        < policy_zone)
1785                return false;
1786        return true;
1787}
1788
1789struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1790                                                unsigned long addr)
1791{
1792        struct mempolicy *pol = NULL;
1793
1794        if (vma) {
1795                if (vma->vm_ops && vma->vm_ops->get_policy) {
1796                        pol = vma->vm_ops->get_policy(vma, addr);
1797                } else if (vma->vm_policy) {
1798                        pol = vma->vm_policy;
1799
1800                        /*
1801                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1802                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1803                         * count on these policies which will be dropped by
1804                         * mpol_cond_put() later
1805                         */
1806                        if (mpol_needs_cond_ref(pol))
1807                                mpol_get(pol);
1808                }
1809        }
1810
1811        return pol;
1812}
1813
1814/*
1815 * get_vma_policy(@vma, @addr)
1816 * @vma: virtual memory area whose policy is sought
1817 * @addr: address in @vma for shared policy lookup
1818 *
1819 * Returns effective policy for a VMA at specified address.
1820 * Falls back to current->mempolicy or system default policy, as necessary.
1821 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1822 * count--added by the get_policy() vm_op, as appropriate--to protect against
1823 * freeing by another task.  It is the caller's responsibility to free the
1824 * extra reference for shared policies.
1825 */
1826static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1827                                                unsigned long addr)
1828{
1829        struct mempolicy *pol = __get_vma_policy(vma, addr);
1830
1831        if (!pol)
1832                pol = get_task_policy(current);
1833
1834        return pol;
1835}
1836
1837bool vma_policy_mof(struct vm_area_struct *vma)
1838{
1839        struct mempolicy *pol;
1840
1841        if (vma->vm_ops && vma->vm_ops->get_policy) {
1842                bool ret = false;
1843
1844                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1845                if (pol && (pol->flags & MPOL_F_MOF))
1846                        ret = true;
1847                mpol_cond_put(pol);
1848
1849                return ret;
1850        }
1851
1852        pol = vma->vm_policy;
1853        if (!pol)
1854                pol = get_task_policy(current);
1855
1856        return pol->flags & MPOL_F_MOF;
1857}
1858
1859static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1860{
1861        enum zone_type dynamic_policy_zone = policy_zone;
1862
1863        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1864
1865        /*
1866         * if policy->v.nodes has movable memory only,
1867         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1868         *
1869         * policy->v.nodes is intersect with node_states[N_MEMORY].
1870         * so if the following test fails, it implies
1871         * policy->v.nodes has movable memory only.
1872         */
1873        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1874                dynamic_policy_zone = ZONE_MOVABLE;
1875
1876        return zone >= dynamic_policy_zone;
1877}
1878
1879/*
1880 * Return a nodemask representing a mempolicy for filtering nodes for
1881 * page allocation
1882 */
1883nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1884{
1885        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1886        if (unlikely(policy->mode == MPOL_BIND) &&
1887                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1888                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1889                return &policy->v.nodes;
1890
1891        return NULL;
1892}
1893
1894/* Return the node id preferred by the given mempolicy, or the given id */
1895static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1896{
1897        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1898                nd = policy->v.preferred_node;
1899        else {
1900                /*
1901                 * __GFP_THISNODE shouldn't even be used with the bind policy
1902                 * because we might easily break the expectation to stay on the
1903                 * requested node and not break the policy.
1904                 */
1905                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1906        }
1907
1908        return nd;
1909}
1910
1911/* Do dynamic interleaving for a process */
1912static unsigned interleave_nodes(struct mempolicy *policy)
1913{
1914        unsigned next;
1915        struct task_struct *me = current;
1916
1917        next = next_node_in(me->il_prev, policy->v.nodes);
1918        if (next < MAX_NUMNODES)
1919                me->il_prev = next;
1920        return next;
1921}
1922
1923/*
1924 * Depending on the memory policy provide a node from which to allocate the
1925 * next slab entry.
1926 */
1927unsigned int mempolicy_slab_node(void)
1928{
1929        struct mempolicy *policy;
1930        int node = numa_mem_id();
1931
1932        if (in_interrupt())
1933                return node;
1934
1935        policy = current->mempolicy;
1936        if (!policy || policy->flags & MPOL_F_LOCAL)
1937                return node;
1938
1939        switch (policy->mode) {
1940        case MPOL_PREFERRED:
1941                /*
1942                 * handled MPOL_F_LOCAL above
1943                 */
1944                return policy->v.preferred_node;
1945
1946        case MPOL_INTERLEAVE:
1947                return interleave_nodes(policy);
1948
1949        case MPOL_BIND: {
1950                struct zoneref *z;
1951
1952                /*
1953                 * Follow bind policy behavior and start allocation at the
1954                 * first node.
1955                 */
1956                struct zonelist *zonelist;
1957                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1958                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1959                z = first_zones_zonelist(zonelist, highest_zoneidx,
1960                                                        &policy->v.nodes);
1961                return z->zone ? zone_to_nid(z->zone) : node;
1962        }
1963
1964        default:
1965                BUG();
1966        }
1967}
1968
1969/*
1970 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1971 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1972 * number of present nodes.
1973 */
1974static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1975{
1976        unsigned nnodes = nodes_weight(pol->v.nodes);
1977        unsigned target;
1978        int i;
1979        int nid;
1980
1981        if (!nnodes)
1982                return numa_node_id();
1983        target = (unsigned int)n % nnodes;
1984        nid = first_node(pol->v.nodes);
1985        for (i = 0; i < target; i++)
1986                nid = next_node(nid, pol->v.nodes);
1987        return nid;
1988}
1989
1990/* Determine a node number for interleave */
1991static inline unsigned interleave_nid(struct mempolicy *pol,
1992                 struct vm_area_struct *vma, unsigned long addr, int shift)
1993{
1994        if (vma) {
1995                unsigned long off;
1996
1997                /*
1998                 * for small pages, there is no difference between
1999                 * shift and PAGE_SHIFT, so the bit-shift is safe.
2000                 * for huge pages, since vm_pgoff is in units of small
2001                 * pages, we need to shift off the always 0 bits to get
2002                 * a useful offset.
2003                 */
2004                BUG_ON(shift < PAGE_SHIFT);
2005                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
2006                off += (addr - vma->vm_start) >> shift;
2007                return offset_il_node(pol, off);
2008        } else
2009                return interleave_nodes(pol);
2010}
2011
2012#ifdef CONFIG_HUGETLBFS
2013/*
2014 * huge_node(@vma, @addr, @gfp_flags, @mpol)
2015 * @vma: virtual memory area whose policy is sought
2016 * @addr: address in @vma for shared policy lookup and interleave policy
2017 * @gfp_flags: for requested zone
2018 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2019 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2020 *
2021 * Returns a nid suitable for a huge page allocation and a pointer
2022 * to the struct mempolicy for conditional unref after allocation.
2023 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
2024 * @nodemask for filtering the zonelist.
2025 *
2026 * Must be protected by read_mems_allowed_begin()
2027 */
2028int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2029                                struct mempolicy **mpol, nodemask_t **nodemask)
2030{
2031        int nid;
2032
2033        *mpol = get_vma_policy(vma, addr);
2034        *nodemask = NULL;       /* assume !MPOL_BIND */
2035
2036        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2037                nid = interleave_nid(*mpol, vma, addr,
2038                                        huge_page_shift(hstate_vma(vma)));
2039        } else {
2040                nid = policy_node(gfp_flags, *mpol, numa_node_id());
2041                if ((*mpol)->mode == MPOL_BIND)
2042                        *nodemask = &(*mpol)->v.nodes;
2043        }
2044        return nid;
2045}
2046
2047/*
2048 * init_nodemask_of_mempolicy
2049 *
2050 * If the current task's mempolicy is "default" [NULL], return 'false'
2051 * to indicate default policy.  Otherwise, extract the policy nodemask
2052 * for 'bind' or 'interleave' policy into the argument nodemask, or
2053 * initialize the argument nodemask to contain the single node for
2054 * 'preferred' or 'local' policy and return 'true' to indicate presence
2055 * of non-default mempolicy.
2056 *
2057 * We don't bother with reference counting the mempolicy [mpol_get/put]
2058 * because the current task is examining it's own mempolicy and a task's
2059 * mempolicy is only ever changed by the task itself.
2060 *
2061 * N.B., it is the caller's responsibility to free a returned nodemask.
2062 */
2063bool init_nodemask_of_mempolicy(nodemask_t *mask)
2064{
2065        struct mempolicy *mempolicy;
2066        int nid;
2067
2068        if (!(mask && current->mempolicy))
2069                return false;
2070
2071        task_lock(current);
2072        mempolicy = current->mempolicy;
2073        switch (mempolicy->mode) {
2074        case MPOL_PREFERRED:
2075                if (mempolicy->flags & MPOL_F_LOCAL)
2076                        nid = numa_node_id();
2077                else
2078                        nid = mempolicy->v.preferred_node;
2079                init_nodemask_of_node(mask, nid);
2080                break;
2081
2082        case MPOL_BIND:
2083        case MPOL_INTERLEAVE:
2084                *mask =  mempolicy->v.nodes;
2085                break;
2086
2087        default:
2088                BUG();
2089        }
2090        task_unlock(current);
2091
2092        return true;
2093}
2094#endif
2095
2096/*
2097 * mempolicy_nodemask_intersects
2098 *
2099 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2100 * policy.  Otherwise, check for intersection between mask and the policy
2101 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2102 * policy, always return true since it may allocate elsewhere on fallback.
2103 *
2104 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2105 */
2106bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2107                                        const nodemask_t *mask)
2108{
2109        struct mempolicy *mempolicy;
2110        bool ret = true;
2111
2112        if (!mask)
2113                return ret;
2114        task_lock(tsk);
2115        mempolicy = tsk->mempolicy;
2116        if (!mempolicy)
2117                goto out;
2118
2119        switch (mempolicy->mode) {
2120        case MPOL_PREFERRED:
2121                /*
2122                 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2123                 * allocate from, they may fallback to other nodes when oom.
2124                 * Thus, it's possible for tsk to have allocated memory from
2125                 * nodes in mask.
2126                 */
2127                break;
2128        case MPOL_BIND:
2129        case MPOL_INTERLEAVE:
2130                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2131                break;
2132        default:
2133                BUG();
2134        }
2135out:
2136        task_unlock(tsk);
2137        return ret;
2138}
2139
2140/* Allocate a page in interleaved policy.
2141   Own path because it needs to do special accounting. */
2142static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2143                                        unsigned nid)
2144{
2145        struct page *page;
2146
2147        page = __alloc_pages(gfp, order, nid, NULL);
2148        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2149        if (!static_branch_likely(&vm_numa_stat_key))
2150                return page;
2151        if (page && page_to_nid(page) == nid) {
2152                preempt_disable();
2153                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2154                preempt_enable();
2155        }
2156        return page;
2157}
2158
2159/**
2160 * alloc_pages_vma - Allocate a page for a VMA.
2161 * @gfp: GFP flags.
2162 * @order: Order of the GFP allocation.
2163 * @vma: Pointer to VMA or NULL if not available.
2164 * @addr: Virtual address of the allocation.  Must be inside @vma.
2165 * @node: Which node to prefer for allocation (modulo policy).
2166 * @hugepage: For hugepages try only the preferred node if possible.
2167 *
2168 * Allocate a page for a specific address in @vma, using the appropriate
2169 * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2170 * of the mm_struct of the VMA to prevent it from going away.  Should be
2171 * used for all allocations for pages that will be mapped into user space.
2172 *
2173 * Return: The page on success or NULL if allocation fails.
2174 */
2175struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2176                unsigned long addr, int node, bool hugepage)
2177{
2178        struct mempolicy *pol;
2179        struct page *page;
2180        int preferred_nid;
2181        nodemask_t *nmask;
2182
2183        pol = get_vma_policy(vma, addr);
2184
2185        if (pol->mode == MPOL_INTERLEAVE) {
2186                unsigned nid;
2187
2188                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2189                mpol_cond_put(pol);
2190                page = alloc_page_interleave(gfp, order, nid);
2191                goto out;
2192        }
2193
2194        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2195                int hpage_node = node;
2196
2197                /*
2198                 * For hugepage allocation and non-interleave policy which
2199                 * allows the current node (or other explicitly preferred
2200                 * node) we only try to allocate from the current/preferred
2201                 * node and don't fall back to other nodes, as the cost of
2202                 * remote accesses would likely offset THP benefits.
2203                 *
2204                 * If the policy is interleave, or does not allow the current
2205                 * node in its nodemask, we allocate the standard way.
2206                 */
2207                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2208                        hpage_node = pol->v.preferred_node;
2209
2210                nmask = policy_nodemask(gfp, pol);
2211                if (!nmask || node_isset(hpage_node, *nmask)) {
2212                        mpol_cond_put(pol);
2213                        /*
2214                         * First, try to allocate THP only on local node, but
2215                         * don't reclaim unnecessarily, just compact.
2216                         */
2217                        page = __alloc_pages_node(hpage_node,
2218                                gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2219
2220                        /*
2221                         * If hugepage allocations are configured to always
2222                         * synchronous compact or the vma has been madvised
2223                         * to prefer hugepage backing, retry allowing remote
2224                         * memory with both reclaim and compact as well.
2225                         */
2226                        if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2227                                page = __alloc_pages_node(hpage_node,
2228                                                                gfp, order);
2229
2230                        goto out;
2231                }
2232        }
2233
2234        nmask = policy_nodemask(gfp, pol);
2235        preferred_nid = policy_node(gfp, pol, node);
2236        page = __alloc_pages(gfp, order, preferred_nid, nmask);
2237        mpol_cond_put(pol);
2238out:
2239        return page;
2240}
2241EXPORT_SYMBOL(alloc_pages_vma);
2242
2243/**
2244 * alloc_pages - Allocate pages.
2245 * @gfp: GFP flags.
2246 * @order: Power of two of number of pages to allocate.
2247 *
2248 * Allocate 1 << @order contiguous pages.  The physical address of the
2249 * first page is naturally aligned (eg an order-3 allocation will be aligned
2250 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2251 * process is honoured when in process context.
2252 *
2253 * Context: Can be called from any context, providing the appropriate GFP
2254 * flags are used.
2255 * Return: The page on success or NULL if allocation fails.
2256 */
2257struct page *alloc_pages(gfp_t gfp, unsigned order)
2258{
2259        struct mempolicy *pol = &default_policy;
2260        struct page *page;
2261
2262        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2263                pol = get_task_policy(current);
2264
2265        /*
2266         * No reference counting needed for current->mempolicy
2267         * nor system default_policy
2268         */
2269        if (pol->mode == MPOL_INTERLEAVE)
2270                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2271        else
2272                page = __alloc_pages(gfp, order,
2273                                policy_node(gfp, pol, numa_node_id()),
2274                                policy_nodemask(gfp, pol));
2275
2276        return page;
2277}
2278EXPORT_SYMBOL(alloc_pages);
2279
2280int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2281{
2282        struct mempolicy *pol = mpol_dup(vma_policy(src));
2283
2284        if (IS_ERR(pol))
2285                return PTR_ERR(pol);
2286        dst->vm_policy = pol;
2287        return 0;
2288}
2289
2290/*
2291 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2292 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2293 * with the mems_allowed returned by cpuset_mems_allowed().  This
2294 * keeps mempolicies cpuset relative after its cpuset moves.  See
2295 * further kernel/cpuset.c update_nodemask().
2296 *
2297 * current's mempolicy may be rebinded by the other task(the task that changes
2298 * cpuset's mems), so we needn't do rebind work for current task.
2299 */
2300
2301/* Slow path of a mempolicy duplicate */
2302struct mempolicy *__mpol_dup(struct mempolicy *old)
2303{
2304        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2305
2306        if (!new)
2307                return ERR_PTR(-ENOMEM);
2308
2309        /* task's mempolicy is protected by alloc_lock */
2310        if (old == current->mempolicy) {
2311                task_lock(current);
2312                *new = *old;
2313                task_unlock(current);
2314        } else
2315                *new = *old;
2316
2317        if (current_cpuset_is_being_rebound()) {
2318                nodemask_t mems = cpuset_mems_allowed(current);
2319                mpol_rebind_policy(new, &mems);
2320        }
2321        atomic_set(&new->refcnt, 1);
2322        return new;
2323}
2324
2325/* Slow path of a mempolicy comparison */
2326bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2327{
2328        if (!a || !b)
2329                return false;
2330        if (a->mode != b->mode)
2331                return false;
2332        if (a->flags != b->flags)
2333                return false;
2334        if (mpol_store_user_nodemask(a))
2335                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2336                        return false;
2337
2338        switch (a->mode) {
2339        case MPOL_BIND:
2340        case MPOL_INTERLEAVE:
2341                return !!nodes_equal(a->v.nodes, b->v.nodes);
2342        case MPOL_PREFERRED:
2343                /* a's ->flags is the same as b's */
2344                if (a->flags & MPOL_F_LOCAL)
2345                        return true;
2346                return a->v.preferred_node == b->v.preferred_node;
2347        default:
2348                BUG();
2349                return false;
2350        }
2351}
2352
2353/*
2354 * Shared memory backing store policy support.
2355 *
2356 * Remember policies even when nobody has shared memory mapped.
2357 * The policies are kept in Red-Black tree linked from the inode.
2358 * They are protected by the sp->lock rwlock, which should be held
2359 * for any accesses to the tree.
2360 */
2361
2362/*
2363 * lookup first element intersecting start-end.  Caller holds sp->lock for
2364 * reading or for writing
2365 */
2366static struct sp_node *
2367sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2368{
2369        struct rb_node *n = sp->root.rb_node;
2370
2371        while (n) {
2372                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2373
2374                if (start >= p->end)
2375                        n = n->rb_right;
2376                else if (end <= p->start)
2377                        n = n->rb_left;
2378                else
2379                        break;
2380        }
2381        if (!n)
2382                return NULL;
2383        for (;;) {
2384                struct sp_node *w = NULL;
2385                struct rb_node *prev = rb_prev(n);
2386                if (!prev)
2387                        break;
2388                w = rb_entry(prev, struct sp_node, nd);
2389                if (w->end <= start)
2390                        break;
2391                n = prev;
2392        }
2393        return rb_entry(n, struct sp_node, nd);
2394}
2395
2396/*
2397 * Insert a new shared policy into the list.  Caller holds sp->lock for
2398 * writing.
2399 */
2400static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2401{
2402        struct rb_node **p = &sp->root.rb_node;
2403        struct rb_node *parent = NULL;
2404        struct sp_node *nd;
2405
2406        while (*p) {
2407                parent = *p;
2408                nd = rb_entry(parent, struct sp_node, nd);
2409                if (new->start < nd->start)
2410                        p = &(*p)->rb_left;
2411                else if (new->end > nd->end)
2412                        p = &(*p)->rb_right;
2413                else
2414                        BUG();
2415        }
2416        rb_link_node(&new->nd, parent, p);
2417        rb_insert_color(&new->nd, &sp->root);
2418        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2419                 new->policy ? new->policy->mode : 0);
2420}
2421
2422/* Find shared policy intersecting idx */
2423struct mempolicy *
2424mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2425{
2426        struct mempolicy *pol = NULL;
2427        struct sp_node *sn;
2428
2429        if (!sp->root.rb_node)
2430                return NULL;
2431        read_lock(&sp->lock);
2432        sn = sp_lookup(sp, idx, idx+1);
2433        if (sn) {
2434                mpol_get(sn->policy);
2435                pol = sn->policy;
2436        }
2437        read_unlock(&sp->lock);
2438        return pol;
2439}
2440
2441static void sp_free(struct sp_node *n)
2442{
2443        mpol_put(n->policy);
2444        kmem_cache_free(sn_cache, n);
2445}
2446
2447/**
2448 * mpol_misplaced - check whether current page node is valid in policy
2449 *
2450 * @page: page to be checked
2451 * @vma: vm area where page mapped
2452 * @addr: virtual address where page mapped
2453 *
2454 * Lookup current policy node id for vma,addr and "compare to" page's
2455 * node id.  Policy determination "mimics" alloc_page_vma().
2456 * Called from fault path where we know the vma and faulting address.
2457 *
2458 * Return: -1 if the page is in a node that is valid for this policy, or a
2459 * suitable node ID to allocate a replacement page from.
2460 */
2461int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2462{
2463        struct mempolicy *pol;
2464        struct zoneref *z;
2465        int curnid = page_to_nid(page);
2466        unsigned long pgoff;
2467        int thiscpu = raw_smp_processor_id();
2468        int thisnid = cpu_to_node(thiscpu);
2469        int polnid = NUMA_NO_NODE;
2470        int ret = -1;
2471
2472        pol = get_vma_policy(vma, addr);
2473        if (!(pol->flags & MPOL_F_MOF))
2474                goto out;
2475
2476        switch (pol->mode) {
2477        case MPOL_INTERLEAVE:
2478                pgoff = vma->vm_pgoff;
2479                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2480                polnid = offset_il_node(pol, pgoff);
2481                break;
2482
2483        case MPOL_PREFERRED:
2484                if (pol->flags & MPOL_F_LOCAL)
2485                        polnid = numa_node_id();
2486                else
2487                        polnid = pol->v.preferred_node;
2488                break;
2489
2490        case MPOL_BIND:
2491                /* Optimize placement among multiple nodes via NUMA balancing */
2492                if (pol->flags & MPOL_F_MORON) {
2493                        if (node_isset(thisnid, pol->v.nodes))
2494                                break;
2495                        goto out;
2496                }
2497
2498                /*
2499                 * allows binding to multiple nodes.
2500                 * use current page if in policy nodemask,
2501                 * else select nearest allowed node, if any.
2502                 * If no allowed nodes, use current [!misplaced].
2503                 */
2504                if (node_isset(curnid, pol->v.nodes))
2505                        goto out;
2506                z = first_zones_zonelist(
2507                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2508                                gfp_zone(GFP_HIGHUSER),
2509                                &pol->v.nodes);
2510                polnid = zone_to_nid(z->zone);
2511                break;
2512
2513        default:
2514                BUG();
2515        }
2516
2517        /* Migrate the page towards the node whose CPU is referencing it */
2518        if (pol->flags & MPOL_F_MORON) {
2519                polnid = thisnid;
2520
2521                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2522                        goto out;
2523        }
2524
2525        if (curnid != polnid)
2526                ret = polnid;
2527out:
2528        mpol_cond_put(pol);
2529
2530        return ret;
2531}
2532
2533/*
2534 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2535 * dropped after task->mempolicy is set to NULL so that any allocation done as
2536 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2537 * policy.
2538 */
2539void mpol_put_task_policy(struct task_struct *task)
2540{
2541        struct mempolicy *pol;
2542
2543        task_lock(task);
2544        pol = task->mempolicy;
2545        task->mempolicy = NULL;
2546        task_unlock(task);
2547        mpol_put(pol);
2548}
2549
2550static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2551{
2552        pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2553        rb_erase(&n->nd, &sp->root);
2554        sp_free(n);
2555}
2556
2557static void sp_node_init(struct sp_node *node, unsigned long start,
2558                        unsigned long end, struct mempolicy *pol)
2559{
2560        node->start = start;
2561        node->end = end;
2562        node->policy = pol;
2563}
2564
2565static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2566                                struct mempolicy *pol)
2567{
2568        struct sp_node *n;
2569        struct mempolicy *newpol;
2570
2571        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2572        if (!n)
2573                return NULL;
2574
2575        newpol = mpol_dup(pol);
2576        if (IS_ERR(newpol)) {
2577                kmem_cache_free(sn_cache, n);
2578                return NULL;
2579        }
2580        newpol->flags |= MPOL_F_SHARED;
2581        sp_node_init(n, start, end, newpol);
2582
2583        return n;
2584}
2585
2586/* Replace a policy range. */
2587static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2588                                 unsigned long end, struct sp_node *new)
2589{
2590        struct sp_node *n;
2591        struct sp_node *n_new = NULL;
2592        struct mempolicy *mpol_new = NULL;
2593        int ret = 0;
2594
2595restart:
2596        write_lock(&sp->lock);
2597        n = sp_lookup(sp, start, end);
2598        /* Take care of old policies in the same range. */
2599        while (n && n->start < end) {
2600                struct rb_node *next = rb_next(&n->nd);
2601                if (n->start >= start) {
2602                        if (n->end <= end)
2603                                sp_delete(sp, n);
2604                        else
2605                                n->start = end;
2606                } else {
2607                        /* Old policy spanning whole new range. */
2608                        if (n->end > end) {
2609                                if (!n_new)
2610                                        goto alloc_new;
2611
2612                                *mpol_new = *n->policy;
2613                                atomic_set(&mpol_new->refcnt, 1);
2614                                sp_node_init(n_new, end, n->end, mpol_new);
2615                                n->end = start;
2616                                sp_insert(sp, n_new);
2617                                n_new = NULL;
2618                                mpol_new = NULL;
2619                                break;
2620                        } else
2621                                n->end = start;
2622                }
2623                if (!next)
2624                        break;
2625                n = rb_entry(next, struct sp_node, nd);
2626        }
2627        if (new)
2628                sp_insert(sp, new);
2629        write_unlock(&sp->lock);
2630        ret = 0;
2631
2632err_out:
2633        if (mpol_new)
2634                mpol_put(mpol_new);
2635        if (n_new)
2636                kmem_cache_free(sn_cache, n_new);
2637
2638        return ret;
2639
2640alloc_new:
2641        write_unlock(&sp->lock);
2642        ret = -ENOMEM;
2643        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2644        if (!n_new)
2645                goto err_out;
2646        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2647        if (!mpol_new)
2648                goto err_out;
2649        goto restart;
2650}
2651
2652/**
2653 * mpol_shared_policy_init - initialize shared policy for inode
2654 * @sp: pointer to inode shared policy
2655 * @mpol:  struct mempolicy to install
2656 *
2657 * Install non-NULL @mpol in inode's shared policy rb-tree.
2658 * On entry, the current task has a reference on a non-NULL @mpol.
2659 * This must be released on exit.
2660 * This is called at get_inode() calls and we can use GFP_KERNEL.
2661 */
2662void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2663{
2664        int ret;
2665
2666        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2667        rwlock_init(&sp->lock);
2668
2669        if (mpol) {
2670                struct vm_area_struct pvma;
2671                struct mempolicy *new;
2672                NODEMASK_SCRATCH(scratch);
2673
2674                if (!scratch)
2675                        goto put_mpol;
2676                /* contextualize the tmpfs mount point mempolicy */
2677                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2678                if (IS_ERR(new))
2679                        goto free_scratch; /* no valid nodemask intersection */
2680
2681                task_lock(current);
2682                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2683                task_unlock(current);
2684                if (ret)
2685                        goto put_new;
2686
2687                /* Create pseudo-vma that contains just the policy */
2688                vma_init(&pvma, NULL);
2689                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2690                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2691
2692put_new:
2693                mpol_put(new);                  /* drop initial ref */
2694free_scratch:
2695                NODEMASK_SCRATCH_FREE(scratch);
2696put_mpol:
2697                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2698        }
2699}
2700
2701int mpol_set_shared_policy(struct shared_policy *info,
2702                        struct vm_area_struct *vma, struct mempolicy *npol)
2703{
2704        int err;
2705        struct sp_node *new = NULL;
2706        unsigned long sz = vma_pages(vma);
2707
2708        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2709                 vma->vm_pgoff,
2710                 sz, npol ? npol->mode : -1,
2711                 npol ? npol->flags : -1,
2712                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2713
2714        if (npol) {
2715                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2716                if (!new)
2717                        return -ENOMEM;
2718        }
2719        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2720        if (err && new)
2721                sp_free(new);
2722        return err;
2723}
2724
2725/* Free a backing policy store on inode delete. */
2726void mpol_free_shared_policy(struct shared_policy *p)
2727{
2728        struct sp_node *n;
2729        struct rb_node *next;
2730
2731        if (!p->root.rb_node)
2732                return;
2733        write_lock(&p->lock);
2734        next = rb_first(&p->root);
2735        while (next) {
2736                n = rb_entry(next, struct sp_node, nd);
2737                next = rb_next(&n->nd);
2738                sp_delete(p, n);
2739        }
2740        write_unlock(&p->lock);
2741}
2742
2743#ifdef CONFIG_NUMA_BALANCING
2744static int __initdata numabalancing_override;
2745
2746static void __init check_numabalancing_enable(void)
2747{
2748        bool numabalancing_default = false;
2749
2750        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2751                numabalancing_default = true;
2752
2753        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2754        if (numabalancing_override)
2755                set_numabalancing_state(numabalancing_override == 1);
2756
2757        if (num_online_nodes() > 1 && !numabalancing_override) {
2758                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2759                        numabalancing_default ? "Enabling" : "Disabling");
2760                set_numabalancing_state(numabalancing_default);
2761        }
2762}
2763
2764static int __init setup_numabalancing(char *str)
2765{
2766        int ret = 0;
2767        if (!str)
2768                goto out;
2769
2770        if (!strcmp(str, "enable")) {
2771                numabalancing_override = 1;
2772                ret = 1;
2773        } else if (!strcmp(str, "disable")) {
2774                numabalancing_override = -1;
2775                ret = 1;
2776        }
2777out:
2778        if (!ret)
2779                pr_warn("Unable to parse numa_balancing=\n");
2780
2781        return ret;
2782}
2783__setup("numa_balancing=", setup_numabalancing);
2784#else
2785static inline void __init check_numabalancing_enable(void)
2786{
2787}
2788#endif /* CONFIG_NUMA_BALANCING */
2789
2790/* assumes fs == KERNEL_DS */
2791void __init numa_policy_init(void)
2792{
2793        nodemask_t interleave_nodes;
2794        unsigned long largest = 0;
2795        int nid, prefer = 0;
2796
2797        policy_cache = kmem_cache_create("numa_policy",
2798                                         sizeof(struct mempolicy),
2799                                         0, SLAB_PANIC, NULL);
2800
2801        sn_cache = kmem_cache_create("shared_policy_node",
2802                                     sizeof(struct sp_node),
2803                                     0, SLAB_PANIC, NULL);
2804
2805        for_each_node(nid) {
2806                preferred_node_policy[nid] = (struct mempolicy) {
2807                        .refcnt = ATOMIC_INIT(1),
2808                        .mode = MPOL_PREFERRED,
2809                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2810                        .v = { .preferred_node = nid, },
2811                };
2812        }
2813
2814        /*
2815         * Set interleaving policy for system init. Interleaving is only
2816         * enabled across suitably sized nodes (default is >= 16MB), or
2817         * fall back to the largest node if they're all smaller.
2818         */
2819        nodes_clear(interleave_nodes);
2820        for_each_node_state(nid, N_MEMORY) {
2821                unsigned long total_pages = node_present_pages(nid);
2822
2823                /* Preserve the largest node */
2824                if (largest < total_pages) {
2825                        largest = total_pages;
2826                        prefer = nid;
2827                }
2828
2829                /* Interleave this node? */
2830                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2831                        node_set(nid, interleave_nodes);
2832        }
2833
2834        /* All too small, use the largest */
2835        if (unlikely(nodes_empty(interleave_nodes)))
2836                node_set(prefer, interleave_nodes);
2837
2838        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2839                pr_err("%s: interleaving failed\n", __func__);
2840
2841        check_numabalancing_enable();
2842}
2843
2844/* Reset policy of current process to default */
2845void numa_default_policy(void)
2846{
2847        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2848}
2849
2850/*
2851 * Parse and format mempolicy from/to strings
2852 */
2853
2854/*
2855 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2856 */
2857static const char * const policy_modes[] =
2858{
2859        [MPOL_DEFAULT]    = "default",
2860        [MPOL_PREFERRED]  = "prefer",
2861        [MPOL_BIND]       = "bind",
2862        [MPOL_INTERLEAVE] = "interleave",
2863        [MPOL_LOCAL]      = "local",
2864};
2865
2866
2867#ifdef CONFIG_TMPFS
2868/**
2869 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2870 * @str:  string containing mempolicy to parse
2871 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2872 *
2873 * Format of input:
2874 *      <mode>[=<flags>][:<nodelist>]
2875 *
2876 * On success, returns 0, else 1
2877 */
2878int mpol_parse_str(char *str, struct mempolicy **mpol)
2879{
2880        struct mempolicy *new = NULL;
2881        unsigned short mode_flags;
2882        nodemask_t nodes;
2883        char *nodelist = strchr(str, ':');
2884        char *flags = strchr(str, '=');
2885        int err = 1, mode;
2886
2887        if (flags)
2888                *flags++ = '\0';        /* terminate mode string */
2889
2890        if (nodelist) {
2891                /* NUL-terminate mode or flags string */
2892                *nodelist++ = '\0';
2893                if (nodelist_parse(nodelist, nodes))
2894                        goto out;
2895                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2896                        goto out;
2897        } else
2898                nodes_clear(nodes);
2899
2900        mode = match_string(policy_modes, MPOL_MAX, str);
2901        if (mode < 0)
2902                goto out;
2903
2904        switch (mode) {
2905        case MPOL_PREFERRED:
2906                /*
2907                 * Insist on a nodelist of one node only, although later
2908                 * we use first_node(nodes) to grab a single node, so here
2909                 * nodelist (or nodes) cannot be empty.
2910                 */
2911                if (nodelist) {
2912                        char *rest = nodelist;
2913                        while (isdigit(*rest))
2914                                rest++;
2915                        if (*rest)
2916                                goto out;
2917                        if (nodes_empty(nodes))
2918                                goto out;
2919                }
2920                break;
2921        case MPOL_INTERLEAVE:
2922                /*
2923                 * Default to online nodes with memory if no nodelist
2924                 */
2925                if (!nodelist)
2926                        nodes = node_states[N_MEMORY];
2927                break;
2928        case MPOL_LOCAL:
2929                /*
2930                 * Don't allow a nodelist;  mpol_new() checks flags
2931                 */
2932                if (nodelist)
2933                        goto out;
2934                mode = MPOL_PREFERRED;
2935                break;
2936        case MPOL_DEFAULT:
2937                /*
2938                 * Insist on a empty nodelist
2939                 */
2940                if (!nodelist)
2941                        err = 0;
2942                goto out;
2943        case MPOL_BIND:
2944                /*
2945                 * Insist on a nodelist
2946                 */
2947                if (!nodelist)
2948                        goto out;
2949        }
2950
2951        mode_flags = 0;
2952        if (flags) {
2953                /*
2954                 * Currently, we only support two mutually exclusive
2955                 * mode flags.
2956                 */
2957                if (!strcmp(flags, "static"))
2958                        mode_flags |= MPOL_F_STATIC_NODES;
2959                else if (!strcmp(flags, "relative"))
2960                        mode_flags |= MPOL_F_RELATIVE_NODES;
2961                else
2962                        goto out;
2963        }
2964
2965        new = mpol_new(mode, mode_flags, &nodes);
2966        if (IS_ERR(new))
2967                goto out;
2968
2969        /*
2970         * Save nodes for mpol_to_str() to show the tmpfs mount options
2971         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2972         */
2973        if (mode != MPOL_PREFERRED)
2974                new->v.nodes = nodes;
2975        else if (nodelist)
2976                new->v.preferred_node = first_node(nodes);
2977        else
2978                new->flags |= MPOL_F_LOCAL;
2979
2980        /*
2981         * Save nodes for contextualization: this will be used to "clone"
2982         * the mempolicy in a specific context [cpuset] at a later time.
2983         */
2984        new->w.user_nodemask = nodes;
2985
2986        err = 0;
2987
2988out:
2989        /* Restore string for error message */
2990        if (nodelist)
2991                *--nodelist = ':';
2992        if (flags)
2993                *--flags = '=';
2994        if (!err)
2995                *mpol = new;
2996        return err;
2997}
2998#endif /* CONFIG_TMPFS */
2999
3000/**
3001 * mpol_to_str - format a mempolicy structure for printing
3002 * @buffer:  to contain formatted mempolicy string
3003 * @maxlen:  length of @buffer
3004 * @pol:  pointer to mempolicy to be formatted
3005 *
3006 * Convert @pol into a string.  If @buffer is too short, truncate the string.
3007 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3008 * longest flag, "relative", and to display at least a few node ids.
3009 */
3010void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3011{
3012        char *p = buffer;
3013        nodemask_t nodes = NODE_MASK_NONE;
3014        unsigned short mode = MPOL_DEFAULT;
3015        unsigned short flags = 0;
3016
3017        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3018                mode = pol->mode;
3019                flags = pol->flags;
3020        }
3021
3022        switch (mode) {
3023        case MPOL_DEFAULT:
3024                break;
3025        case MPOL_PREFERRED:
3026                if (flags & MPOL_F_LOCAL)
3027                        mode = MPOL_LOCAL;
3028                else
3029                        node_set(pol->v.preferred_node, nodes);
3030                break;
3031        case MPOL_BIND:
3032        case MPOL_INTERLEAVE:
3033                nodes = pol->v.nodes;
3034                break;
3035        default:
3036                WARN_ON_ONCE(1);
3037                snprintf(p, maxlen, "unknown");
3038                return;
3039        }
3040
3041        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3042
3043        if (flags & MPOL_MODE_FLAGS) {
3044                p += snprintf(p, buffer + maxlen - p, "=");
3045
3046                /*
3047                 * Currently, the only defined flags are mutually exclusive
3048                 */
3049                if (flags & MPOL_F_STATIC_NODES)
3050                        p += snprintf(p, buffer + maxlen - p, "static");
3051                else if (flags & MPOL_F_RELATIVE_NODES)
3052                        p += snprintf(p, buffer + maxlen - p, "relative");
3053        }
3054
3055        if (!nodes_empty(nodes))
3056                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3057                               nodemask_pr_args(&nodes));
3058}
3059