linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints about the node(s) on which
   9 * memory should be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy a per-process
  20 *                counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                and going to the last. It would be better if bind truly
  26 *                restricted the allocation to the specified memory nodes instead.
  27 *
  28 * preferred      Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use the default policy. This implies that
  49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
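
/*
 * Illustrative userspace sketch (not part of the original file): the policies
 * described above are requested through the set_mempolicy(2) and mbind(2)
 * system calls, shown here via the libnuma <numaif.h> wrappers.  "buf" and
 * "len" are assumed to be a page-aligned mapping obtained elsewhere (e.g.
 * from mmap()); error handling is omitted.
 *
 *	#include <numaif.h>
 *
 *	unsigned long interleave_mask = (1UL << 0) | (1UL << 1);
 *	unsigned long bind_mask = 1UL << 0;
 *
 *	// interleave this task's future allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
 *		      sizeof(interleave_mask) * 8);
 *
 *	// restrict an existing range to node 0 and migrate what is there
 *	mbind(buf, len, MPOL_BIND, &bind_mask,
 *	      sizeof(bind_mask) * 8, MPOL_MF_MOVE);
 */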
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel does not always handle that gracefully.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
 108/* Internal flags */
 109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 111
 112static struct kmem_cache *policy_cache;
 113static struct kmem_cache *sn_cache;
 114
 115/* Highest zone. A specific allocation for a zone below that is not
 116   policied. */
 117enum zone_type policy_zone = 0;
 118
 119/*
 120 * run-time system-wide default policy => local allocation
 121 */
 122static struct mempolicy default_policy = {
 123        .refcnt = ATOMIC_INIT(1), /* never free it */
 124        .mode = MPOL_PREFERRED,
 125        .flags = MPOL_F_LOCAL,
 126};
 127
 128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130/**
 131 * numa_map_to_online_node - Find closest online node
 132 * @node: Node id to start the search
 133 *
 134 * Lookup the next closest node by distance if @node is not online.
 135 */
 136int numa_map_to_online_node(int node)
 137{
 138        int min_node;
 139
 140        if (node == NUMA_NO_NODE)
 141                node = 0;
 142
 143        min_node = node;
 144        if (!node_online(node)) {
 145                int min_dist = INT_MAX, dist, n;
 146
 147                for_each_online_node(n) {
 148                        dist = node_distance(node, n);
 149                        if (dist < min_dist) {
 150                                min_dist = dist;
 151                                min_node = n;
 152                        }
 153                }
 154        }
 155
 156        return min_node;
 157}
 158EXPORT_SYMBOL_GPL(numa_map_to_online_node);
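
/*
 * A minimal usage sketch (hypothetical caller, not from the original file):
 * a driver that wants a sensible node for its allocations when the device's
 * home node may be memoryless or offline could do
 *
 *	nid = numa_map_to_online_node(dev_to_node(dev));
 *
 * and then pass "nid" to a node-aware allocator such as kzalloc_node().
 */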
 159
 160struct mempolicy *get_task_policy(struct task_struct *p)
 161{
 162        struct mempolicy *pol = p->mempolicy;
 163        int node;
 164
 165        if (pol)
 166                return pol;
 167
 168        node = numa_node_id();
 169        if (node != NUMA_NO_NODE) {
 170                pol = &preferred_node_policy[node];
 171                /* preferred_node_policy is not initialised early in boot */
 172                if (pol->mode)
 173                        return pol;
 174        }
 175
 176        return &default_policy;
 177}
 178
 179static const struct mempolicy_operations {
 180        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 181        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 182} mpol_ops[MPOL_MAX];
 183
 184static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 185{
 186        return pol->flags & MPOL_MODE_FLAGS;
 187}
 188
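/*
 * Fold and map a user-supplied "relative" nodemask onto the nodes the task
 * is actually allowed to use.  A worked example (added for illustration):
 * with a relative mask of {0,2} and an allowed set of {4,5,6} (weight 3),
 * nodes_fold() leaves {0,2} unchanged and nodes_onto() maps bit 0 to the
 * 0th allowed node and bit 2 to the 2nd, giving {4,6}.
 */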
 189static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 190                                   const nodemask_t *rel)
 191{
 192        nodemask_t tmp;
 193        nodes_fold(tmp, *orig, nodes_weight(*rel));
 194        nodes_onto(*ret, tmp, *rel);
 195}
 196
 197static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 198{
 199        if (nodes_empty(*nodes))
 200                return -EINVAL;
 201        pol->v.nodes = *nodes;
 202        return 0;
 203}
 204
 205static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 206{
 207        if (!nodes)
 208                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 209        else if (nodes_empty(*nodes))
 210                return -EINVAL;                 /*  no allowed nodes */
 211        else
 212                pol->v.preferred_node = first_node(*nodes);
 213        return 0;
 214}
 215
 216static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 217{
 218        if (nodes_empty(*nodes))
 219                return -EINVAL;
 220        pol->v.nodes = *nodes;
 221        return 0;
 222}
 223
 224/*
 225 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 226 * any, for the new policy.  mpol_new() has already validated the nodes
 227 * parameter with respect to the policy mode and flags.  But, we need to
 228 * handle an empty nodemask with MPOL_PREFERRED here.
 229 *
 230 * Must be called holding task's alloc_lock to protect task's mems_allowed
 231 * and mempolicy.  May also be called holding the mmap_lock for write.
 232 */
 233static int mpol_set_nodemask(struct mempolicy *pol,
 234                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 235{
 236        int ret;
 237
 238        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 239        if (pol == NULL)
 240                return 0;
 241        /* Check N_MEMORY */
 242        nodes_and(nsc->mask1,
 243                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 244
 245        VM_BUG_ON(!nodes);
 246        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 247                nodes = NULL;   /* explicit local allocation */
 248        else {
 249                if (pol->flags & MPOL_F_RELATIVE_NODES)
 250                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 251                else
 252                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 253
 254                if (mpol_store_user_nodemask(pol))
 255                        pol->w.user_nodemask = *nodes;
 256                else
 257                        pol->w.cpuset_mems_allowed =
 258                                                cpuset_current_mems_allowed;
 259        }
 260
 261        if (nodes)
 262                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 263        else
 264                ret = mpol_ops[pol->mode].create(pol, NULL);
 265        return ret;
 266}
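
/*
 * A minimal sketch of the expected pairing (following do_set_mempolicy()
 * further below): mpol_new() builds the policy, then mpol_set_nodemask()
 * is called under the task's alloc_lock with a scratch nodemask:
 *
 *	new = mpol_new(mode, flags, nodes);
 *	task_lock(current);
 *	ret = mpol_set_nodemask(new, nodes, scratch);
 *	task_unlock(current);
 */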
 267
 268/*
 269 * This function just creates a new policy, does some checks and simple
 270 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 271 */
 272static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 273                                  nodemask_t *nodes)
 274{
 275        struct mempolicy *policy;
 276
 277        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 278                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 279
 280        if (mode == MPOL_DEFAULT) {
 281                if (nodes && !nodes_empty(*nodes))
 282                        return ERR_PTR(-EINVAL);
 283                return NULL;
 284        }
 285        VM_BUG_ON(!nodes);
 286
 287        /*
 288         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 289         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 290         * All other modes require a valid pointer to a non-empty nodemask.
 291         */
 292        if (mode == MPOL_PREFERRED) {
 293                if (nodes_empty(*nodes)) {
 294                        if (((flags & MPOL_F_STATIC_NODES) ||
 295                             (flags & MPOL_F_RELATIVE_NODES)))
 296                                return ERR_PTR(-EINVAL);
 297                }
 298        } else if (mode == MPOL_LOCAL) {
 299                if (!nodes_empty(*nodes) ||
 300                    (flags & MPOL_F_STATIC_NODES) ||
 301                    (flags & MPOL_F_RELATIVE_NODES))
 302                        return ERR_PTR(-EINVAL);
 303                mode = MPOL_PREFERRED;
 304        } else if (nodes_empty(*nodes))
 305                return ERR_PTR(-EINVAL);
 306        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 307        if (!policy)
 308                return ERR_PTR(-ENOMEM);
 309        atomic_set(&policy->refcnt, 1);
 310        policy->mode = mode;
 311        policy->flags = flags;
 312
 313        return policy;
 314}
 315
 316/* Slow path of a mpol destructor. */
 317void __mpol_put(struct mempolicy *p)
 318{
 319        if (!atomic_dec_and_test(&p->refcnt))
 320                return;
 321        kmem_cache_free(policy_cache, p);
 322}
 323
 324static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 325{
 326}
 327
 328static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 329{
 330        nodemask_t tmp;
 331
 332        if (pol->flags & MPOL_F_STATIC_NODES)
 333                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 334        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 335                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 336        else {
 337                nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
 338                                                                *nodes);
 339                pol->w.cpuset_mems_allowed = *nodes;
 340        }
 341
 342        if (nodes_empty(tmp))
 343                tmp = *nodes;
 344
 345        pol->v.nodes = tmp;
 346}
 347
 348static void mpol_rebind_preferred(struct mempolicy *pol,
 349                                                const nodemask_t *nodes)
 350{
 351        nodemask_t tmp;
 352
 353        if (pol->flags & MPOL_F_STATIC_NODES) {
 354                int node = first_node(pol->w.user_nodemask);
 355
 356                if (node_isset(node, *nodes)) {
 357                        pol->v.preferred_node = node;
 358                        pol->flags &= ~MPOL_F_LOCAL;
 359                } else
 360                        pol->flags |= MPOL_F_LOCAL;
 361        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 362                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 363                pol->v.preferred_node = first_node(tmp);
 364        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 365                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 366                                                   pol->w.cpuset_mems_allowed,
 367                                                   *nodes);
 368                pol->w.cpuset_mems_allowed = *nodes;
 369        }
 370}
 371
 372/*
 373 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 374 *
 375 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 376 * policies are protected by task->mems_allowed_seq to prevent a premature
 377 * OOM/allocation failure due to parallel nodemask modification.
 378 */
 379static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 380{
 381        if (!pol)
 382                return;
 383        if (!mpol_store_user_nodemask(pol) &&
 384            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 385                return;
 386
 387        mpol_ops[pol->mode].rebind(pol, newmask);
 388}
 389
 390/*
 391 * Wrapper for mpol_rebind_policy() that just requires task
 392 * pointer, and updates task mempolicy.
 393 *
 394 * Called with task's alloc_lock held.
 395 */
 396
 397void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 398{
 399        mpol_rebind_policy(tsk->mempolicy, new);
 400}
 401
 402/*
 403 * Rebind each vma in mm to new nodemask.
 404 *
 405 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 406 */
 407
 408void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 409{
 410        struct vm_area_struct *vma;
 411
 412        mmap_write_lock(mm);
 413        for (vma = mm->mmap; vma; vma = vma->vm_next)
 414                mpol_rebind_policy(vma->vm_policy, new);
 415        mmap_write_unlock(mm);
 416}
 417
 418static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 419        [MPOL_DEFAULT] = {
 420                .rebind = mpol_rebind_default,
 421        },
 422        [MPOL_INTERLEAVE] = {
 423                .create = mpol_new_interleave,
 424                .rebind = mpol_rebind_nodemask,
 425        },
 426        [MPOL_PREFERRED] = {
 427                .create = mpol_new_preferred,
 428                .rebind = mpol_rebind_preferred,
 429        },
 430        [MPOL_BIND] = {
 431                .create = mpol_new_bind,
 432                .rebind = mpol_rebind_nodemask,
 433        },
 434};
 435
 436static int migrate_page_add(struct page *page, struct list_head *pagelist,
 437                                unsigned long flags);
 438
 439struct queue_pages {
 440        struct list_head *pagelist;
 441        unsigned long flags;
 442        nodemask_t *nmask;
 443        struct vm_area_struct *prev;
 444};
 445
 446/*
 447 * Check if the page's nid is in qp->nmask.
 448 *
 449 * If MPOL_MF_INVERT is set in qp->flags, the check is inverted: the nid
 450 * must not be in qp->nmask.
 451 */
 452static inline bool queue_pages_required(struct page *page,
 453                                        struct queue_pages *qp)
 454{
 455        int nid = page_to_nid(page);
 456        unsigned long flags = qp->flags;
 457
 458        return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 459}
 460
 461/*
 462 * queue_pages_pmd() has four possible return values:
 463 * 0 - pages are placed on the right node or queued successfully.
 464 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 465 *     specified.
 466 * 2 - THP was split.
 467 * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was specified
 468 *        and an existing page was already on a node that does not follow the
 469 *        policy.
 470 */
 471static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 472                                unsigned long end, struct mm_walk *walk)
 473{
 474        int ret = 0;
 475        struct page *page;
 476        struct queue_pages *qp = walk->private;
 477        unsigned long flags;
 478
 479        if (unlikely(is_pmd_migration_entry(*pmd))) {
 480                ret = -EIO;
 481                goto unlock;
 482        }
 483        page = pmd_page(*pmd);
 484        if (is_huge_zero_page(page)) {
 485                spin_unlock(ptl);
 486                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
 487                ret = 2;
 488                goto out;
 489        }
 490        if (!queue_pages_required(page, qp))
 491                goto unlock;
 492
 493        flags = qp->flags;
 494        /* go to thp migration */
 495        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 496                if (!vma_migratable(walk->vma) ||
 497                    migrate_page_add(page, qp->pagelist, flags)) {
 498                        ret = 1;
 499                        goto unlock;
 500                }
 501        } else
 502                ret = -EIO;
 503unlock:
 504        spin_unlock(ptl);
 505out:
 506        return ret;
 507}
 508
 509/*
 510 * Scan through the pages, checking whether they meet certain conditions,
 511 * and move them to the pagelist if they do.
 512 *
 513 * queue_pages_pte_range() has three possible return values:
 514 * 0 - pages are placed on the right node or queued successfully.
 515 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 516 *     specified.
 517 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 518 *        on a node that does not follow the policy.
 519 */
 520static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 521                        unsigned long end, struct mm_walk *walk)
 522{
 523        struct vm_area_struct *vma = walk->vma;
 524        struct page *page;
 525        struct queue_pages *qp = walk->private;
 526        unsigned long flags = qp->flags;
 527        int ret;
 528        bool has_unmovable = false;
 529        pte_t *pte, *mapped_pte;
 530        spinlock_t *ptl;
 531
 532        ptl = pmd_trans_huge_lock(pmd, vma);
 533        if (ptl) {
 534                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
 535                if (ret != 2)
 536                        return ret;
 537        }
 538        /* THP was split, fall through to pte walk */
 539
 540        if (pmd_trans_unstable(pmd))
 541                return 0;
 542
 543        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 544        for (; addr != end; pte++, addr += PAGE_SIZE) {
 545                if (!pte_present(*pte))
 546                        continue;
 547                page = vm_normal_page(vma, addr, *pte);
 548                if (!page)
 549                        continue;
 550                /*
 551                 * vm_normal_page() filters out zero pages, but there might
 552                 * still be PageReserved pages to skip, perhaps in a VDSO.
 553                 */
 554                if (PageReserved(page))
 555                        continue;
 556                if (!queue_pages_required(page, qp))
 557                        continue;
 558                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 559                        /* MPOL_MF_STRICT must be specified if we get here */
 560                        if (!vma_migratable(vma)) {
 561                                has_unmovable = true;
 562                                break;
 563                        }
 564
 565                        /*
 566                         * Do not abort immediately since there may be
 567                         * temporarily off-LRU pages in the range.  Still
 568                         * need to migrate the other LRU pages.
 569                         */
 570                        if (migrate_page_add(page, qp->pagelist, flags))
 571                                has_unmovable = true;
 572                } else
 573                        break;
 574        }
 575        pte_unmap_unlock(mapped_pte, ptl);
 576        cond_resched();
 577
 578        if (has_unmovable)
 579                return 1;
 580
 581        return addr != end ? -EIO : 0;
 582}
 583
 584static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 585                               unsigned long addr, unsigned long end,
 586                               struct mm_walk *walk)
 587{
 588#ifdef CONFIG_HUGETLB_PAGE
 589        struct queue_pages *qp = walk->private;
 590        unsigned long flags = qp->flags;
 591        struct page *page;
 592        spinlock_t *ptl;
 593        pte_t entry;
 594
 595        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 596        entry = huge_ptep_get(pte);
 597        if (!pte_present(entry))
 598                goto unlock;
 599        page = pte_page(entry);
 600        if (!queue_pages_required(page, qp))
 601                goto unlock;
 602        /* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
 603        if (flags & (MPOL_MF_MOVE_ALL) ||
 604            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 605                isolate_huge_page(page, qp->pagelist);
 606unlock:
 607        spin_unlock(ptl);
 608#else
 609        BUG();
 610#endif
 611        return 0;
 612}
 613
 614#ifdef CONFIG_NUMA_BALANCING
 615/*
 616 * This is used to mark a range of virtual addresses as inaccessible.
 617 * The protections are later cleared by a NUMA hinting fault. Depending on
 618 * these faults, pages may be migrated for better NUMA placement.
 619 *
 620 * This assumes that NUMA faults are handled using PROT_NONE. If
 621 * an architecture makes a different choice, it will need further
 622 * changes to the core.
 623 */
 624unsigned long change_prot_numa(struct vm_area_struct *vma,
 625                        unsigned long addr, unsigned long end)
 626{
 627        int nr_updated;
 628
 629        nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 630        if (nr_updated)
 631                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 632
 633        return nr_updated;
 634}
 635#else
 636static unsigned long change_prot_numa(struct vm_area_struct *vma,
 637                        unsigned long addr, unsigned long end)
 638{
 639        return 0;
 640}
 641#endif /* CONFIG_NUMA_BALANCING */
 642
 643static int queue_pages_test_walk(unsigned long start, unsigned long end,
 644                                struct mm_walk *walk)
 645{
 646        struct vm_area_struct *vma = walk->vma;
 647        struct queue_pages *qp = walk->private;
 648        unsigned long endvma = vma->vm_end;
 649        unsigned long flags = qp->flags;
 650
 651        /* range check first */
 652        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 653                if (!vma->vm_next && vma->vm_end < end)
 654                        return -EFAULT;
 655                if (qp->prev && qp->prev->vm_end < vma->vm_start)
 656                        return -EFAULT;
 657        }
 658
 659        qp->prev = vma;
 660
 661        /*
 662         * Need to check MPOL_MF_STRICT so that -EIO can be returned if
 663         * possible, regardless of vma_migratable
 664         */
 665        if (!vma_migratable(vma) &&
 666            !(flags & MPOL_MF_STRICT))
 667                return 1;
 668
 669        if (endvma > end)
 670                endvma = end;
 671        if (vma->vm_start > start)
 672                start = vma->vm_start;
 673
 674        if (flags & MPOL_MF_LAZY) {
 675                /* Similar to task_numa_work, skip inaccessible VMAs */
 676                if (!is_vm_hugetlb_page(vma) &&
 677                        (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 678                        !(vma->vm_flags & VM_MIXEDMAP))
 679                        change_prot_numa(vma, start, endvma);
 680                return 1;
 681        }
 682
 683        /* queue pages from current vma */
 684        if (flags & MPOL_MF_VALID)
 685                return 0;
 686        return 1;
 687}
 688
 689static const struct mm_walk_ops queue_pages_walk_ops = {
 690        .hugetlb_entry          = queue_pages_hugetlb,
 691        .pmd_entry              = queue_pages_pte_range,
 692        .test_walk              = queue_pages_test_walk,
 693};
 694
 695/*
 696 * Walk through page tables and collect pages to be migrated.
 697 *
 698 * If pages found in a given range are on a set of nodes (determined by
 699 * @nodes and @flags), they are isolated and queued to the pagelist, which
 700 * is passed via @private.
 701 *
 702 * queue_pages_range() has three possible return values:
 703 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 704 *     specified.
 705 * 0 - pages queued successfully or no misplaced page.
 706 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
 707 *         memory range specified by nodemask and maxnode points outside
 708 *         the accessible address space (-EFAULT)
 709 */
 710static int
 711queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 712                nodemask_t *nodes, unsigned long flags,
 713                struct list_head *pagelist)
 714{
 715        struct queue_pages qp = {
 716                .pagelist = pagelist,
 717                .flags = flags,
 718                .nmask = nodes,
 719                .prev = NULL,
 720        };
 721
 722        return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 723}
 724
 725/*
 726 * Apply policy to a single VMA.
 727 * This must be called with the mmap_lock held for writing.
 728 */
 729static int vma_replace_policy(struct vm_area_struct *vma,
 730                                                struct mempolicy *pol)
 731{
 732        int err;
 733        struct mempolicy *old;
 734        struct mempolicy *new;
 735
 736        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 737                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 738                 vma->vm_ops, vma->vm_file,
 739                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 740
 741        new = mpol_dup(pol);
 742        if (IS_ERR(new))
 743                return PTR_ERR(new);
 744
 745        if (vma->vm_ops && vma->vm_ops->set_policy) {
 746                err = vma->vm_ops->set_policy(vma, new);
 747                if (err)
 748                        goto err_out;
 749        }
 750
 751        old = vma->vm_policy;
 752        vma->vm_policy = new; /* protected by mmap_lock */
 753        mpol_put(old);
 754
 755        return 0;
 756 err_out:
 757        mpol_put(new);
 758        return err;
 759}
 760
 761/* Step 2: apply policy to a range and do splits. */
 762static int mbind_range(struct mm_struct *mm, unsigned long start,
 763                       unsigned long end, struct mempolicy *new_pol)
 764{
 765        struct vm_area_struct *next;
 766        struct vm_area_struct *prev;
 767        struct vm_area_struct *vma;
 768        int err = 0;
 769        pgoff_t pgoff;
 770        unsigned long vmstart;
 771        unsigned long vmend;
 772
 773        vma = find_vma(mm, start);
 774        if (!vma || vma->vm_start > start)
 775                return -EFAULT;
 776
 777        prev = vma->vm_prev;
 778        if (start > vma->vm_start)
 779                prev = vma;
 780
 781        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 782                next = vma->vm_next;
 783                vmstart = max(start, vma->vm_start);
 784                vmend   = min(end, vma->vm_end);
 785
 786                if (mpol_equal(vma_policy(vma), new_pol))
 787                        continue;
 788
 789                pgoff = vma->vm_pgoff +
 790                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 791                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 792                                 vma->anon_vma, vma->vm_file, pgoff,
 793                                 new_pol, vma->vm_userfaultfd_ctx);
 794                if (prev) {
 795                        vma = prev;
 796                        next = vma->vm_next;
 797                        if (mpol_equal(vma_policy(vma), new_pol))
 798                                continue;
 799                        /* vma_merge() joined vma && vma->next, case 8 */
 800                        goto replace;
 801                }
 802                if (vma->vm_start != vmstart) {
 803                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 804                        if (err)
 805                                goto out;
 806                }
 807                if (vma->vm_end != vmend) {
 808                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 809                        if (err)
 810                                goto out;
 811                }
 812 replace:
 813                err = vma_replace_policy(vma, new_pol);
 814                if (err)
 815                        goto out;
 816        }
 817
 818 out:
 819        return err;
 820}
 821
 822/* Set the process memory policy */
 823static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 824                             nodemask_t *nodes)
 825{
 826        struct mempolicy *new, *old;
 827        NODEMASK_SCRATCH(scratch);
 828        int ret;
 829
 830        if (!scratch)
 831                return -ENOMEM;
 832
 833        new = mpol_new(mode, flags, nodes);
 834        if (IS_ERR(new)) {
 835                ret = PTR_ERR(new);
 836                goto out;
 837        }
 838
 839        task_lock(current);
 840        ret = mpol_set_nodemask(new, nodes, scratch);
 841        if (ret) {
 842                task_unlock(current);
 843                mpol_put(new);
 844                goto out;
 845        }
 846        old = current->mempolicy;
 847        current->mempolicy = new;
 848        if (new && new->mode == MPOL_INTERLEAVE)
 849                current->il_prev = MAX_NUMNODES-1;
 850        task_unlock(current);
 851        mpol_put(old);
 852        ret = 0;
 853out:
 854        NODEMASK_SCRATCH_FREE(scratch);
 855        return ret;
 856}
 857
 858/*
 859 * Return the nodemask of a policy for a get_mempolicy() query
 860 *
 861 * Called with task's alloc_lock held
 862 */
 863static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 864{
 865        nodes_clear(*nodes);
 866        if (p == &default_policy)
 867                return;
 868
 869        switch (p->mode) {
 870        case MPOL_BIND:
 871                /* Fall through */
 872        case MPOL_INTERLEAVE:
 873                *nodes = p->v.nodes;
 874                break;
 875        case MPOL_PREFERRED:
 876                if (!(p->flags & MPOL_F_LOCAL))
 877                        node_set(p->v.preferred_node, *nodes);
 878                /* else return empty node mask for local allocation */
 879                break;
 880        default:
 881                BUG();
 882        }
 883}
 884
 885static int lookup_node(struct mm_struct *mm, unsigned long addr)
 886{
 887        struct page *p = NULL;
 888        int err;
 889
 890        int locked = 1;
 891        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 892        if (err > 0) {
 893                err = page_to_nid(p);
 894                put_page(p);
 895        }
 896        if (locked)
 897                mmap_read_unlock(mm);
 898        return err;
 899}
 900
 901/* Retrieve NUMA policy */
 902static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 903                             unsigned long addr, unsigned long flags)
 904{
 905        int err;
 906        struct mm_struct *mm = current->mm;
 907        struct vm_area_struct *vma = NULL;
 908        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 909
 910        if (flags &
 911                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 912                return -EINVAL;
 913
 914        if (flags & MPOL_F_MEMS_ALLOWED) {
 915                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 916                        return -EINVAL;
 917                *policy = 0;    /* just so it's initialized */
 918                task_lock(current);
 919                *nmask  = cpuset_current_mems_allowed;
 920                task_unlock(current);
 921                return 0;
 922        }
 923
 924        if (flags & MPOL_F_ADDR) {
 925                /*
 926                 * Do NOT fall back to task policy if the
 927                 * vma/shared policy at addr is NULL.  We
 928                 * want to return MPOL_DEFAULT in this case.
 929                 */
 930                mmap_read_lock(mm);
 931                vma = find_vma_intersection(mm, addr, addr+1);
 932                if (!vma) {
 933                        mmap_read_unlock(mm);
 934                        return -EFAULT;
 935                }
 936                if (vma->vm_ops && vma->vm_ops->get_policy)
 937                        pol = vma->vm_ops->get_policy(vma, addr);
 938                else
 939                        pol = vma->vm_policy;
 940        } else if (addr)
 941                return -EINVAL;
 942
 943        if (!pol)
 944                pol = &default_policy;  /* indicates default behavior */
 945
 946        if (flags & MPOL_F_NODE) {
 947                if (flags & MPOL_F_ADDR) {
 948                        /*
 949                         * Take a refcount on the mpol, lookup_node()
 950                         * will drop the mmap_lock, so after calling
 951                         * lookup_node() only "pol" remains valid, "vma"
 952                         * is stale.
 953                         */
 954                        pol_refcount = pol;
 955                        vma = NULL;
 956                        mpol_get(pol);
 957                        err = lookup_node(mm, addr);
 958                        if (err < 0)
 959                                goto out;
 960                        *policy = err;
 961                } else if (pol == current->mempolicy &&
 962                                pol->mode == MPOL_INTERLEAVE) {
 963                        *policy = next_node_in(current->il_prev, pol->v.nodes);
 964                } else {
 965                        err = -EINVAL;
 966                        goto out;
 967                }
 968        } else {
 969                *policy = pol == &default_policy ? MPOL_DEFAULT :
 970                                                pol->mode;
 971                /*
 972                 * Internal mempolicy flags must be masked off before exposing
 973                 * the policy to userspace.
 974                 */
 975                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 976        }
 977
 978        err = 0;
 979        if (nmask) {
 980                if (mpol_store_user_nodemask(pol)) {
 981                        *nmask = pol->w.user_nodemask;
 982                } else {
 983                        task_lock(current);
 984                        get_policy_nodemask(pol, nmask);
 985                        task_unlock(current);
 986                }
 987        }
 988
 989 out:
 990        mpol_cond_put(pol);
 991        if (vma)
 992                mmap_read_unlock(mm);
 993        if (pol_refcount)
 994                mpol_put(pol_refcount);
 995        return err;
 996}
 997
 998#ifdef CONFIG_MIGRATION
 999/*
1000 * page migration; THP tail pages can be passed.
1001 */
1002static int migrate_page_add(struct page *page, struct list_head *pagelist,
1003                                unsigned long flags)
1004{
1005        struct page *head = compound_head(page);
1006        /*
1007         * Avoid migrating a page that is shared with others.
1008         */
1009        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1010                if (!isolate_lru_page(head)) {
1011                        list_add_tail(&head->lru, pagelist);
1012                        mod_node_page_state(page_pgdat(head),
1013                                NR_ISOLATED_ANON + page_is_file_lru(head),
1014                                thp_nr_pages(head));
1015                } else if (flags & MPOL_MF_STRICT) {
1016                        /*
1017                         * Non-movable page may reach here.  And, there may be
1018                         * temporary off LRU pages or non-LRU movable pages.
1019                         * Treat them as unmovable pages since they can't be
1020                         * isolated, so they can't be moved at the moment.  It
1021                         * should return -EIO for this case too.
1022                         */
1023                        return -EIO;
1024                }
1025        }
1026
1027        return 0;
1028}
1029
1030/*
1031 * Migrate pages from one node to a target node.
1032 * Returns error or the number of pages not migrated.
1033 */
1034static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1035                           int flags)
1036{
1037        nodemask_t nmask;
1038        LIST_HEAD(pagelist);
1039        int err = 0;
1040        struct migration_target_control mtc = {
1041                .nid = dest,
1042                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1043        };
1044
1045        nodes_clear(nmask);
1046        node_set(source, nmask);
1047
1048        /*
1049         * This does not "check" the range but isolates all pages that
1050         * need migration.  Between passing in the full user address
1051         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1052         */
1053        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1054        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1055                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1056
1057        if (!list_empty(&pagelist)) {
1058                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1059                                (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1060                if (err)
1061                        putback_movable_pages(&pagelist);
1062        }
1063
1064        return err;
1065}
1066
1067/*
1068 * Move pages between the two nodesets so as to preserve the physical
1069 * layout as much as possible.
1070 *
1071 * Returns the number of pages that could not be moved.
1072 */
1073int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1074                     const nodemask_t *to, int flags)
1075{
1076        int busy = 0;
1077        int err;
1078        nodemask_t tmp;
1079
1080        err = migrate_prep();
1081        if (err)
1082                return err;
1083
1084        mmap_read_lock(mm);
1085
1086        /*
1087         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1088         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1089         * bit in 'tmp', and return that <source, dest> pair for migration.
1090         * The pair of nodemasks 'to' and 'from' define the map.
1091         *
1092         * If no pair of bits is found that way, fall back to picking some
1093         * pair of 'source' and 'dest' bits that are not the same.  If the
1094         * 'source' and 'dest' bits are the same, this represents a node
1095         * that will be migrating to itself, so no pages need move.
1096         *
1097         * If no bits are left in 'tmp', or if all remaining bits left
1098         * in 'tmp' correspond to the same bit in 'to', return false
1099         * (nothing left to migrate).
1100         *
1101         * This lets us pick a pair of nodes to migrate between, such that
1102         * if possible the dest node is not already occupied by some other
1103         * source node, minimizing the risk of overloading the memory on a
1104         * node that would happen if we migrated incoming memory to a node
1105         * before migrating outgoing memory sourced from that same node.
1106         *
1107         * A single scan of tmp is sufficient.  As we go, we remember the
1108         * most recent <s, d> pair that moved (s != d).  If we find a pair
1109         * that not only moved, but what's better, moved to an empty slot
1110         * (d is not set in tmp), then we break out immediately with that pair.
1111         * Otherwise, when we finish scanning tmp, we at least have the
1112         * most recent <s, d> pair that moved.  If we get all the way through
1113         * the scan of tmp without finding any node that moved, much less
1114         * moved to an empty node, then there is nothing left worth migrating.
1115         */
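
        /*
         * A short worked example of the selection above (added for
         * illustration): with from = {0,1} and to = {1,2}, the first scan
         * settles on the pair 1 -> 2 because node 2 is not a remaining
         * source, so node 1 is drained into the empty node first; the
         * second scan then moves 0 -> 1 into the space just vacated.
         */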
1116
1117        tmp = *from;
1118        while (!nodes_empty(tmp)) {
1119                int s, d;
1120                int source = NUMA_NO_NODE;
1121                int dest = 0;
1122
1123                for_each_node_mask(s, tmp) {
1124
1125                        /*
1126                         * do_migrate_pages() tries to maintain the relative
1127                         * node relationship of the pages established between
1128                         * threads and memory areas.
1129                         *
1130                         * However, if the number of source nodes is not equal to
1131                         * the number of destination nodes, we cannot preserve
1132                         * this relative node relationship.  In that case, skip
1133                         * copying memory from a node that is in the destination
1134                         * mask.
1135                         *
1136                         * Example: [2,3,4] -> [3,4,5] moves everything.
1137                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1138                         */
1139
1140                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1141                                                (node_isset(s, *to)))
1142                                continue;
1143
1144                        d = node_remap(s, *from, *to);
1145                        if (s == d)
1146                                continue;
1147
1148                        source = s;     /* Node moved. Memorize */
1149                        dest = d;
1150
1151                        /* dest not in remaining from nodes? */
1152                        if (!node_isset(dest, tmp))
1153                                break;
1154                }
1155                if (source == NUMA_NO_NODE)
1156                        break;
1157
1158                node_clear(source, tmp);
1159                err = migrate_to_node(mm, source, dest, flags);
1160                if (err > 0)
1161                        busy += err;
1162                if (err < 0)
1163                        break;
1164        }
1165        mmap_read_unlock(mm);
1166        if (err < 0)
1167                return err;
1168        return busy;
1169
1170}
1171
1172/*
1173 * Allocate a new page for page migration based on vma policy.
1174 * Start by assuming the page is mapped by the same vma that contains @start.
1175 * Search forward from there, if not.  N.B., this assumes that the
1176 * list of pages handed to migrate_pages()--which is how we get here--
1177 * is in virtual address order.
1178 */
1179static struct page *new_page(struct page *page, unsigned long start)
1180{
1181        struct vm_area_struct *vma;
1182        unsigned long uninitialized_var(address);
1183
1184        vma = find_vma(current->mm, start);
1185        while (vma) {
1186                address = page_address_in_vma(page, vma);
1187                if (address != -EFAULT)
1188                        break;
1189                vma = vma->vm_next;
1190        }
1191
1192        if (PageHuge(page)) {
1193                return alloc_huge_page_vma(page_hstate(compound_head(page)),
1194                                vma, address);
1195        } else if (PageTransHuge(page)) {
1196                struct page *thp;
1197
1198                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1199                                         HPAGE_PMD_ORDER);
1200                if (!thp)
1201                        return NULL;
1202                prep_transhuge_page(thp);
1203                return thp;
1204        }
1205        /*
1206         * if !vma, alloc_page_vma() will use task or system default policy
1207         */
1208        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1209                        vma, address);
1210}
1211#else
1212
1213static int migrate_page_add(struct page *page, struct list_head *pagelist,
1214                                unsigned long flags)
1215{
1216        return -EIO;
1217}
1218
1219int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1220                     const nodemask_t *to, int flags)
1221{
1222        return -ENOSYS;
1223}
1224
1225static struct page *new_page(struct page *page, unsigned long start)
1226{
1227        return NULL;
1228}
1229#endif
1230
1231static long do_mbind(unsigned long start, unsigned long len,
1232                     unsigned short mode, unsigned short mode_flags,
1233                     nodemask_t *nmask, unsigned long flags)
1234{
1235        struct mm_struct *mm = current->mm;
1236        struct mempolicy *new;
1237        unsigned long end;
1238        int err;
1239        int ret;
1240        LIST_HEAD(pagelist);
1241
1242        if (flags & ~(unsigned long)MPOL_MF_VALID)
1243                return -EINVAL;
1244        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1245                return -EPERM;
1246
1247        if (start & ~PAGE_MASK)
1248                return -EINVAL;
1249
1250        if (mode == MPOL_DEFAULT)
1251                flags &= ~MPOL_MF_STRICT;
1252
1253        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1254        end = start + len;
1255
1256        if (end < start)
1257                return -EINVAL;
1258        if (end == start)
1259                return 0;
1260
1261        new = mpol_new(mode, mode_flags, nmask);
1262        if (IS_ERR(new))
1263                return PTR_ERR(new);
1264
1265        if (flags & MPOL_MF_LAZY)
1266                new->flags |= MPOL_F_MOF;
1267
1268        /*
1269         * If we are using the default policy then operation
1270         * on discontinuous address spaces is okay after all
1271         */
1272        if (!new)
1273                flags |= MPOL_MF_DISCONTIG_OK;
1274
1275        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1276                 start, start + len, mode, mode_flags,
1277                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1278
1279        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1280
1281                err = migrate_prep();
1282                if (err)
1283                        goto mpol_out;
1284        }
1285        {
1286                NODEMASK_SCRATCH(scratch);
1287                if (scratch) {
1288                        mmap_write_lock(mm);
1289                        task_lock(current);
1290                        err = mpol_set_nodemask(new, nmask, scratch);
1291                        task_unlock(current);
1292                        if (err)
1293                                mmap_write_unlock(mm);
1294                } else
1295                        err = -ENOMEM;
1296                NODEMASK_SCRATCH_FREE(scratch);
1297        }
1298        if (err)
1299                goto mpol_out;
1300
1301        ret = queue_pages_range(mm, start, end, nmask,
1302                          flags | MPOL_MF_INVERT, &pagelist);
1303
1304        if (ret < 0) {
1305                err = ret;
1306                goto up_out;
1307        }
1308
1309        err = mbind_range(mm, start, end, new);
1310
1311        if (!err) {
1312                int nr_failed = 0;
1313
1314                if (!list_empty(&pagelist)) {
1315                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1316                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
1317                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1318                        if (nr_failed)
1319                                putback_movable_pages(&pagelist);
1320                }
1321
1322                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1323                        err = -EIO;
1324        } else {
1325up_out:
1326                if (!list_empty(&pagelist))
1327                        putback_movable_pages(&pagelist);
1328        }
1329
1330        mmap_write_unlock(mm);
1331mpol_out:
1332        mpol_put(new);
1333        return err;
1334}
1335
1336/*
1337 * User space interface with variable sized bitmaps for nodelists.
1338 */
1339
1340/* Copy a node mask from user space. */
1341static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1342                     unsigned long maxnode)
1343{
1344        unsigned long k;
1345        unsigned long t;
1346        unsigned long nlongs;
1347        unsigned long endmask;
1348
1349        --maxnode;
1350        nodes_clear(*nodes);
1351        if (maxnode == 0 || !nmask)
1352                return 0;
1353        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1354                return -EINVAL;
1355
1356        nlongs = BITS_TO_LONGS(maxnode);
1357        if ((maxnode % BITS_PER_LONG) == 0)
1358                endmask = ~0UL;
1359        else
1360                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1361
1362        /*
1363         * When the user specified more nodes than supported, just check
1364         * that the unsupported part is all zero.
1365         *
1366         * If maxnode has more longs than MAX_NUMNODES, check
1367         * the bits in that area first, and then go through to
1368         * check the remaining bits, which are equal to or bigger than MAX_NUMNODES.
1369         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1370         */
1371        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1372                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1373                        if (get_user(t, nmask + k))
1374                                return -EFAULT;
1375                        if (k == nlongs - 1) {
1376                                if (t & endmask)
1377                                        return -EINVAL;
1378                        } else if (t)
1379                                return -EINVAL;
1380                }
1381                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1382                endmask = ~0UL;
1383        }
1384
1385        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1386                unsigned long valid_mask = endmask;
1387
1388                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1389                if (get_user(t, nmask + nlongs - 1))
1390                        return -EFAULT;
1391                if (t & valid_mask)
1392                        return -EINVAL;
1393        }
1394
1395        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1396                return -EFAULT;
1397        nodes_addr(*nodes)[nlongs-1] &= endmask;
1398        return 0;
1399}
1400
1401/* Copy a kernel node mask to user space */
1402static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1403                              nodemask_t *nodes)
1404{
1405        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1406        unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1407
1408        if (copy > nbytes) {
1409                if (copy > PAGE_SIZE)
1410                        return -EINVAL;
1411                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1412                        return -EFAULT;
1413                copy = nbytes;
1414        }
1415        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1416}
1417
1418static long kernel_mbind(unsigned long start, unsigned long len,
1419                         unsigned long mode, const unsigned long __user *nmask,
1420                         unsigned long maxnode, unsigned int flags)
1421{
1422        nodemask_t nodes;
1423        int err;
1424        unsigned short mode_flags;
1425
1426        start = untagged_addr(start);
1427        mode_flags = mode & MPOL_MODE_FLAGS;
1428        mode &= ~MPOL_MODE_FLAGS;
1429        if (mode >= MPOL_MAX)
1430                return -EINVAL;
1431        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1432            (mode_flags & MPOL_F_RELATIVE_NODES))
1433                return -EINVAL;
1434        err = get_nodes(&nodes, nmask, maxnode);
1435        if (err)
1436                return err;
1437        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1438}
1439
1440SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1441                unsigned long, mode, const unsigned long __user *, nmask,
1442                unsigned long, maxnode, unsigned int, flags)
1443{
1444        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1445}
1446
1447/* Set the process memory policy */
1448static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1449                                 unsigned long maxnode)
1450{
1451        int err;
1452        nodemask_t nodes;
1453        unsigned short flags;
1454
1455        flags = mode & MPOL_MODE_FLAGS;
1456        mode &= ~MPOL_MODE_FLAGS;
1457        if ((unsigned int)mode >= MPOL_MAX)
1458                return -EINVAL;
1459        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1460                return -EINVAL;
1461        err = get_nodes(&nodes, nmask, maxnode);
1462        if (err)
1463                return err;
1464        return do_set_mempolicy(mode, flags, &nodes);
1465}
1466
1467SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1468                unsigned long, maxnode)
1469{
1470        return kernel_set_mempolicy(mode, nmask, maxnode);
1471}
1472
1473static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1474                                const unsigned long __user *old_nodes,
1475                                const unsigned long __user *new_nodes)
1476{
1477        struct mm_struct *mm = NULL;
1478        struct task_struct *task;
1479        nodemask_t task_nodes;
1480        int err;
1481        nodemask_t *old;
1482        nodemask_t *new;
1483        NODEMASK_SCRATCH(scratch);
1484
1485        if (!scratch)
1486                return -ENOMEM;
1487
1488        old = &scratch->mask1;
1489        new = &scratch->mask2;
1490
1491        err = get_nodes(old, old_nodes, maxnode);
1492        if (err)
1493                goto out;
1494
1495        err = get_nodes(new, new_nodes, maxnode);
1496        if (err)
1497                goto out;
1498
1499        /* Find the mm_struct */
1500        rcu_read_lock();
1501        task = pid ? find_task_by_vpid(pid) : current;
1502        if (!task) {
1503                rcu_read_unlock();
1504                err = -ESRCH;
1505                goto out;
1506        }
1507        get_task_struct(task);
1508
1509        err = -EINVAL;
1510
1511        /*
1512         * Check if this process has the right to modify the specified process.
1513         * Use the regular "ptrace_may_access()" checks.
1514         */
1515        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1516                rcu_read_unlock();
1517                err = -EPERM;
1518                goto out_put;
1519        }
1520        rcu_read_unlock();
1521
1522        task_nodes = cpuset_mems_allowed(task);
1523        /* Is the user allowed to access the target nodes? */
1524        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1525                err = -EPERM;
1526                goto out_put;
1527        }
1528
1529        task_nodes = cpuset_mems_allowed(current);
1530        nodes_and(*new, *new, task_nodes);
1531        if (nodes_empty(*new))
1532                goto out_put;
1533
1534        nodes_and(*new, *new, node_states[N_MEMORY]);
1535        if (nodes_empty(*new))
1536                goto out_put;
1537
1538        err = security_task_movememory(task);
1539        if (err)
1540                goto out_put;
1541
1542        mm = get_task_mm(task);
1543        put_task_struct(task);
1544
1545        if (!mm) {
1546                err = -EINVAL;
1547                goto out;
1548        }
1549
1550        err = do_migrate_pages(mm, old, new,
1551                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1552
1553        mmput(mm);
1554out:
1555        NODEMASK_SCRATCH_FREE(scratch);
1556
1557        return err;
1558
1559out_put:
1560        put_task_struct(task);
1561        goto out;
1562
1563}
1564
1565SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1566                const unsigned long __user *, old_nodes,
1567                const unsigned long __user *, new_nodes)
1568{
1569        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1570}
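
/*
 * Example: an illustrative userspace sketch of migrate_pages(2) via the
 * <numaif.h> wrapper, moving a task's pages from node 0 to node 1. The
 * helper name is an assumption; the caller needs the privileges checked
 * above (or must target its own cpuset-allowed nodes).
 *
 *	#include <numaif.h>
 *	#include <sys/types.h>
 *
 *	static long move_pages_node0_to_node1(pid_t pid)
 *	{
 *		unsigned long old_nodes = 1UL << 0;
 *		unsigned long new_nodes = 1UL << 1;
 *
 *		return migrate_pages(pid, sizeof(old_nodes) * 8,
 *				     &old_nodes, &new_nodes);
 *	}
 */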
1571
1572
1573/* Retrieve NUMA policy */
1574static int kernel_get_mempolicy(int __user *policy,
1575                                unsigned long __user *nmask,
1576                                unsigned long maxnode,
1577                                unsigned long addr,
1578                                unsigned long flags)
1579{
1580        int err;
1581        int uninitialized_var(pval);
1582        nodemask_t nodes;
1583
1584        addr = untagged_addr(addr);
1585
1586        if (nmask != NULL && maxnode < nr_node_ids)
1587                return -EINVAL;
1588
1589        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1590
1591        if (err)
1592                return err;
1593
1594        if (policy && put_user(pval, policy))
1595                return -EFAULT;
1596
1597        if (nmask)
1598                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1599
1600        return err;
1601}
1602
1603SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1604                unsigned long __user *, nmask, unsigned long, maxnode,
1605                unsigned long, addr, unsigned long, flags)
1606{
1607        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1608}
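
/*
 * Example: an illustrative userspace sketch that asks which node backs a
 * given address, using the MPOL_F_NODE | MPOL_F_ADDR flags handled by
 * do_get_mempolicy(). The helper name is an assumption.
 *
 *	#include <numaif.h>
 *
 *	// On success, *node holds the node id of the page backing addr.
 *	static long node_of_address(void *addr, int *node)
 *	{
 *		return get_mempolicy(node, NULL, 0, addr,
 *				     MPOL_F_NODE | MPOL_F_ADDR);
 *	}
 */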
1609
1610#ifdef CONFIG_COMPAT
1611
1612COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1613                       compat_ulong_t __user *, nmask,
1614                       compat_ulong_t, maxnode,
1615                       compat_ulong_t, addr, compat_ulong_t, flags)
1616{
1617        long err;
1618        unsigned long __user *nm = NULL;
1619        unsigned long nr_bits, alloc_size;
1620        DECLARE_BITMAP(bm, MAX_NUMNODES);
1621
1622        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1623        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1624
1625        if (nmask)
1626                nm = compat_alloc_user_space(alloc_size);
1627
1628        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1629
1630        if (!err && nmask) {
1631                unsigned long copy_size;
1632                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1633                err = copy_from_user(bm, nm, copy_size);
1634                /* ensure entire bitmap is zeroed */
1635                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1636                err |= compat_put_bitmap(nmask, bm, nr_bits);
1637        }
1638
1639        return err;
1640}
1641
1642COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1643                       compat_ulong_t, maxnode)
1644{
1645        unsigned long __user *nm = NULL;
1646        unsigned long nr_bits, alloc_size;
1647        DECLARE_BITMAP(bm, MAX_NUMNODES);
1648
1649        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1650        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1651
1652        if (nmask) {
1653                if (compat_get_bitmap(bm, nmask, nr_bits))
1654                        return -EFAULT;
1655                nm = compat_alloc_user_space(alloc_size);
1656                if (copy_to_user(nm, bm, alloc_size))
1657                        return -EFAULT;
1658        }
1659
1660        return kernel_set_mempolicy(mode, nm, nr_bits+1);
1661}
1662
1663COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1664                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1665                       compat_ulong_t, maxnode, compat_ulong_t, flags)
1666{
1667        unsigned long __user *nm = NULL;
1668        unsigned long nr_bits, alloc_size;
1669        nodemask_t bm;
1670
1671        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1672        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1673
1674        if (nmask) {
1675                if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1676                        return -EFAULT;
1677                nm = compat_alloc_user_space(alloc_size);
1678                if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1679                        return -EFAULT;
1680        }
1681
1682        return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1683}
1684
1685COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1686                       compat_ulong_t, maxnode,
1687                       const compat_ulong_t __user *, old_nodes,
1688                       const compat_ulong_t __user *, new_nodes)
1689{
1690        unsigned long __user *old = NULL;
1691        unsigned long __user *new = NULL;
1692        nodemask_t tmp_mask;
1693        unsigned long nr_bits;
1694        unsigned long size;
1695
1696        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1697        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1698        if (old_nodes) {
1699                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1700                        return -EFAULT;
1701                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1702                if (new_nodes)
1703                        new = old + size / sizeof(unsigned long);
1704                if (copy_to_user(old, nodes_addr(tmp_mask), size))
1705                        return -EFAULT;
1706        }
1707        if (new_nodes) {
1708                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1709                        return -EFAULT;
1710                if (new == NULL)
1711                        new = compat_alloc_user_space(size);
1712                if (copy_to_user(new, nodes_addr(tmp_mask), size))
1713                        return -EFAULT;
1714        }
1715        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1716}
1717
1718#endif /* CONFIG_COMPAT */
1719
1720struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1721                                                unsigned long addr)
1722{
1723        struct mempolicy *pol = NULL;
1724
1725        if (vma) {
1726                if (vma->vm_ops && vma->vm_ops->get_policy) {
1727                        pol = vma->vm_ops->get_policy(vma, addr);
1728                } else if (vma->vm_policy) {
1729                        pol = vma->vm_policy;
1730
1731                        /*
1732                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1733                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1734                         * count on these policies which will be dropped by
1735                         * mpol_cond_put() later
1736                         */
1737                        if (mpol_needs_cond_ref(pol))
1738                                mpol_get(pol);
1739                }
1740        }
1741
1742        return pol;
1743}
1744
1745/*
1746 * get_vma_policy(@vma, @addr)
1747 * @vma: virtual memory area whose policy is sought
1748 * @addr: address in @vma for shared policy lookup
1749 *
1750 * Returns effective policy for a VMA at specified address.
1751 * Falls back to current->mempolicy or system default policy, as necessary.
1752 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1753 * count--added by the get_policy() vm_op, as appropriate--to protect against
1754 * freeing by another task.  It is the caller's responsibility to free the
1755 * extra reference for shared policies.
1756 */
1757static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1758                                                unsigned long addr)
1759{
1760        struct mempolicy *pol = __get_vma_policy(vma, addr);
1761
1762        if (!pol)
1763                pol = get_task_policy(current);
1764
1765        return pol;
1766}
1767
1768bool vma_policy_mof(struct vm_area_struct *vma)
1769{
1770        struct mempolicy *pol;
1771
1772        if (vma->vm_ops && vma->vm_ops->get_policy) {
1773                bool ret = false;
1774
1775                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1776                if (pol && (pol->flags & MPOL_F_MOF))
1777                        ret = true;
1778                mpol_cond_put(pol);
1779
1780                return ret;
1781        }
1782
1783        pol = vma->vm_policy;
1784        if (!pol)
1785                pol = get_task_policy(current);
1786
1787        return pol->flags & MPOL_F_MOF;
1788}
1789
1790static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1791{
1792        enum zone_type dynamic_policy_zone = policy_zone;
1793
1794        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1795
1796        /*
1797         * if policy->v.nodes has movable memory only,
1798         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1799         *
1800         * policy->v.nodes is intersected with node_states[N_MEMORY],
1801         * so if the following test fails, it implies that
1802         * policy->v.nodes has movable memory only.
1803         */
1804        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1805                dynamic_policy_zone = ZONE_MOVABLE;
1806
1807        return zone >= dynamic_policy_zone;
1808}
1809
1810/*
1811 * Return a nodemask representing a mempolicy for filtering nodes for
1812 * page allocation
1813 */
1814static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1815{
1816        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1817        if (unlikely(policy->mode == MPOL_BIND) &&
1818                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1819                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1820                return &policy->v.nodes;
1821
1822        return NULL;
1823}
1824
1825/* Return the node id preferred by the given mempolicy, or the given id */
1826static int policy_node(gfp_t gfp, struct mempolicy *policy,
1827                                                                int nd)
1828{
1829        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1830                nd = policy->v.preferred_node;
1831        else {
1832                /*
1833                 * __GFP_THISNODE shouldn't even be used with the bind policy
1834                 * because we might easily break the expectation to stay on the
1835                 * requested node and not break the policy.
1836                 */
1837                WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1838        }
1839
1840        return nd;
1841}
1842
1843/* Do dynamic interleaving for a process */
1844static unsigned interleave_nodes(struct mempolicy *policy)
1845{
1846        unsigned next;
1847        struct task_struct *me = current;
1848
1849        next = next_node_in(me->il_prev, policy->v.nodes);
1850        if (next < MAX_NUMNODES)
1851                me->il_prev = next;
1852        return next;
1853}
1854
1855/*
1856 * Depending on the memory policy, provide a node from which to allocate the
1857 * next slab entry.
1858 */
1859unsigned int mempolicy_slab_node(void)
1860{
1861        struct mempolicy *policy;
1862        int node = numa_mem_id();
1863
1864        if (in_interrupt())
1865                return node;
1866
1867        policy = current->mempolicy;
1868        if (!policy || policy->flags & MPOL_F_LOCAL)
1869                return node;
1870
1871        switch (policy->mode) {
1872        case MPOL_PREFERRED:
1873                /*
1874                 * handled MPOL_F_LOCAL above
1875                 */
1876                return policy->v.preferred_node;
1877
1878        case MPOL_INTERLEAVE:
1879                return interleave_nodes(policy);
1880
1881        case MPOL_BIND: {
1882                struct zoneref *z;
1883
1884                /*
1885                 * Follow bind policy behavior and start allocation at the
1886                 * first node.
1887                 */
1888                struct zonelist *zonelist;
1889                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1890                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1891                z = first_zones_zonelist(zonelist, highest_zoneidx,
1892                                                        &policy->v.nodes);
1893                return z->zone ? zone_to_nid(z->zone) : node;
1894        }
1895
1896        default:
1897                BUG();
1898        }
1899}
1900
1901/*
1902 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1903 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1904 * number of present nodes.
1905 */
1906static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1907{
1908        unsigned nnodes = nodes_weight(pol->v.nodes);
1909        unsigned target;
1910        int i;
1911        int nid;
1912
1913        if (!nnodes)
1914                return numa_node_id();
1915        target = (unsigned int)n % nnodes;
1916        nid = first_node(pol->v.nodes);
1917        for (i = 0; i < target; i++)
1918                nid = next_node(nid, pol->v.nodes);
1919        return nid;
1920}
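
/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} and n = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the walk starts at node 0 and
 * advances once, returning node 2.
 */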
1921
1922/* Determine a node number for interleave */
1923static inline unsigned interleave_nid(struct mempolicy *pol,
1924                 struct vm_area_struct *vma, unsigned long addr, int shift)
1925{
1926        if (vma) {
1927                unsigned long off;
1928
1929                /*
1930                 * for small pages, there is no difference between
1931                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1932                 * for huge pages, since vm_pgoff is in units of small
1933                 * pages, we need to shift off the always 0 bits to get
1934                 * a useful offset.
1935                 */
1936                BUG_ON(shift < PAGE_SHIFT);
1937                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1938                off += (addr - vma->vm_start) >> shift;
1939                return offset_il_node(pol, off);
1940        } else
1941                return interleave_nodes(pol);
1942}
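
/*
 * Worked example (illustrative, assuming 4KB base pages and a 2MB huge
 * page, i.e. shift = 21): for a VMA with vm_pgoff = 1024 (4MB into the
 * backing object) and a fault 6MB past vm_start, off = (1024 >> 9) +
 * (6MB >> 21) = 2 + 3 = 5, which is then handed to offset_il_node().
 */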
1943
1944#ifdef CONFIG_HUGETLBFS
1945/*
1946 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1947 * @vma: virtual memory area whose policy is sought
1948 * @addr: address in @vma for shared policy lookup and interleave policy
1949 * @gfp_flags: for requested zone
1950 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1951 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1952 *
1953 * Returns a nid suitable for a huge page allocation and a pointer
1954 * to the struct mempolicy for conditional unref after allocation.
1955 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1956 * @nodemask for filtering the zonelist.
1957 *
1958 * Must be protected by read_mems_allowed_begin()
1959 */
1960int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1961                                struct mempolicy **mpol, nodemask_t **nodemask)
1962{
1963        int nid;
1964
1965        *mpol = get_vma_policy(vma, addr);
1966        *nodemask = NULL;       /* assume !MPOL_BIND */
1967
1968        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1969                nid = interleave_nid(*mpol, vma, addr,
1970                                        huge_page_shift(hstate_vma(vma)));
1971        } else {
1972                nid = policy_node(gfp_flags, *mpol, numa_node_id());
1973                if ((*mpol)->mode == MPOL_BIND)
1974                        *nodemask = &(*mpol)->v.nodes;
1975        }
1976        return nid;
1977}
1978
1979/*
1980 * init_nodemask_of_mempolicy
1981 *
1982 * If the current task's mempolicy is "default" [NULL], return 'false'
1983 * to indicate default policy.  Otherwise, extract the policy nodemask
1984 * for 'bind' or 'interleave' policy into the argument nodemask, or
1985 * initialize the argument nodemask to contain the single node for
1986 * 'preferred' or 'local' policy and return 'true' to indicate presence
1987 * of non-default mempolicy.
1988 *
1989 * We don't bother with reference counting the mempolicy [mpol_get/put]
1990 * because the current task is examining its own mempolicy and a task's
1991 * mempolicy is only ever changed by the task itself.
1992 *
1993 * N.B., it is the caller's responsibility to free a returned nodemask.
1994 */
1995bool init_nodemask_of_mempolicy(nodemask_t *mask)
1996{
1997        struct mempolicy *mempolicy;
1998        int nid;
1999
2000        if (!(mask && current->mempolicy))
2001                return false;
2002
2003        task_lock(current);
2004        mempolicy = current->mempolicy;
2005        switch (mempolicy->mode) {
2006        case MPOL_PREFERRED:
2007                if (mempolicy->flags & MPOL_F_LOCAL)
2008                        nid = numa_node_id();
2009                else
2010                        nid = mempolicy->v.preferred_node;
2011                init_nodemask_of_node(mask, nid);
2012                break;
2013
2014        case MPOL_BIND:
2015                /* Fall through */
2016        case MPOL_INTERLEAVE:
2017                *mask = mempolicy->v.nodes;
2018                break;
2019
2020        default:
2021                BUG();
2022        }
2023        task_unlock(current);
2024
2025        return true;
2026}
2027#endif
2028
2029/*
2030 * mempolicy_nodemask_intersects
2031 *
2032 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2033 * policy.  Otherwise, check for intersection between mask and the policy
2034 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
2035 * policy, always return true since it may allocate elsewhere on fallback.
2036 *
2037 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2038 */
2039bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2040                                        const nodemask_t *mask)
2041{
2042        struct mempolicy *mempolicy;
2043        bool ret = true;
2044
2045        if (!mask)
2046                return ret;
2047        task_lock(tsk);
2048        mempolicy = tsk->mempolicy;
2049        if (!mempolicy)
2050                goto out;
2051
2052        switch (mempolicy->mode) {
2053        case MPOL_PREFERRED:
2054                /*
2055                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
2056                 * allocate from; they may fall back to other nodes under OOM.
2057                 * Thus, it's possible for tsk to have allocated memory from
2058                 * nodes in mask.
2059                 */
2060                break;
2061        case MPOL_BIND:
2062        case MPOL_INTERLEAVE:
2063                ret = nodes_intersects(mempolicy->v.nodes, *mask);
2064                break;
2065        default:
2066                BUG();
2067        }
2068out:
2069        task_unlock(tsk);
2070        return ret;
2071}
2072
2073/* Allocate a page in interleaved policy.
2074   Own path because it needs to do special accounting. */
2075static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2076                                        unsigned nid)
2077{
2078        struct page *page;
2079
2080        page = __alloc_pages(gfp, order, nid);
2081        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2082        if (!static_branch_likely(&vm_numa_stat_key))
2083                return page;
2084        if (page && page_to_nid(page) == nid) {
2085                preempt_disable();
2086                __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2087                preempt_enable();
2088        }
2089        return page;
2090}
2091
2092/**
2093 *      alloc_pages_vma - Allocate a page for a VMA.
2094 *
2095 *      @gfp:
2096 *      %GFP_USER    user allocation.
2097 *      %GFP_KERNEL  kernel allocations,
2098 *      %GFP_HIGHMEM highmem/user allocations,
2099 *      %GFP_FS      allocation should not call back into a file system.
2100 *      %GFP_ATOMIC  don't sleep.
2101 *
2102 *      @order:Order of the GFP allocation.
2103 *      @vma:  Pointer to VMA or NULL if not available.
2104 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2105 *      @node: Which node to prefer for allocation (modulo policy).
2106 *      @hugepage: for hugepages try only the preferred node if possible
2107 *
2108 *      This function allocates a page from the kernel page pool and applies
2109 *      a NUMA policy associated with the VMA or the current process.
2110 *      When VMA is not NULL caller must read-lock the mmap_lock of the
2111 *      mm_struct of the VMA to prevent it from going away. Should be used for
2112 *      all allocations for pages that will be mapped into user space. Returns
2113 *      NULL when no page can be allocated.
2114 */
2115struct page *
2116alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2117                unsigned long addr, int node, bool hugepage)
2118{
2119        struct mempolicy *pol;
2120        struct page *page;
2121        int preferred_nid;
2122        nodemask_t *nmask;
2123
2124        pol = get_vma_policy(vma, addr);
2125
2126        if (pol->mode == MPOL_INTERLEAVE) {
2127                unsigned nid;
2128
2129                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2130                mpol_cond_put(pol);
2131                page = alloc_page_interleave(gfp, order, nid);
2132                goto out;
2133        }
2134
2135        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2136                int hpage_node = node;
2137
2138                /*
2139                 * For hugepage allocation and non-interleave policy which
2140                 * allows the current node (or other explicitly preferred
2141                 * node) we only try to allocate from the current/preferred
2142                 * node and don't fall back to other nodes, as the cost of
2143                 * remote accesses would likely offset THP benefits.
2144                 *
2145                 * If the policy is interleave, or does not allow the current
2146                 * node in its nodemask, we allocate the standard way.
2147                 */
2148                if (pol->mode == MPOL_PREFERRED &&
2149                                                !(pol->flags & MPOL_F_LOCAL))
2150                        hpage_node = pol->v.preferred_node;
2151
2152                nmask = policy_nodemask(gfp, pol);
2153                if (!nmask || node_isset(hpage_node, *nmask)) {
2154                        mpol_cond_put(pol);
2155                        /*
2156                         * We cannot invoke reclaim if __GFP_THISNODE
2157                         * is set. Invoking reclaim with
2158                         * __GFP_THISNODE set, would cause THP
2159                         * allocations to trigger heavy swapping
2160                         * even though there may be tons of free memory
2161                         * (including potentially plenty of THP
2162                         * already available in the buddy) on all the
2163                         * other NUMA nodes.
2164                         *
2165                         * At most we could invoke compaction when
2166                         * __GFP_THISNODE is set (but we would need to
2167                         * refrain from invoking reclaim even if
2168                         * compaction returned COMPACT_SKIPPED because
2169                         * there wasn't enough memory for compaction
2170                         * to succeed). For now just avoid
2171                         * __GFP_THISNODE instead of limiting the
2172                         * allocation path to a strict and single
2173                         * compaction invocation.
2174                         *
2175                         * Supposedly if direct reclaim was enabled by
2176                         * the caller, the app prefers THP regardless
2177                         * of the node it comes from so this would be
2178                         * more desirable behavior than only
2179                         * providing THP originating from the local
2180                         * node in that case.
2181                         */
2182                        if (!(gfp & __GFP_DIRECT_RECLAIM))
2183                                gfp |= __GFP_THISNODE;
2184                        page = __alloc_pages_node(hpage_node, gfp, order);
2185                        goto out;
2186                }
2187        }
2188
2189        nmask = policy_nodemask(gfp, pol);
2190        preferred_nid = policy_node(gfp, pol, node);
2191        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2192        mpol_cond_put(pol);
2193out:
2194        return page;
2195}
2196EXPORT_SYMBOL(alloc_pages_vma);
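
/*
 * Example: a minimal sketch of how a fault-path caller might use this,
 * roughly mirroring the alloc_page_vma() helper in gfp.h; the surrounding
 * fault-handler context (vma, vmf) is assumed.
 *
 *	struct page *page;
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
 *			       numa_node_id(), false);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */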
2197
2198/**
2199 *      alloc_pages_current - Allocate pages.
2200 *
2201 *      @gfp:
2202 *              %GFP_USER   user allocation,
2203 *              %GFP_KERNEL kernel allocation,
2204 *              %GFP_HIGHMEM highmem allocation,
2205 *              %GFP_FS     don't call back into a file system.
2206 *              %GFP_ATOMIC don't sleep.
2207 *      @order: Power of two of allocation size in pages. 0 is a single page.
2208 *
2209 *      Allocate a page from the kernel page pool.  When not in
2210 *      interrupt context, apply the current process NUMA policy.
2211 *      Returns NULL when no page can be allocated.
2212 */
2213struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2214{
2215        struct mempolicy *pol = &default_policy;
2216        struct page *page;
2217
2218        if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2219                pol = get_task_policy(current);
2220
2221        /*
2222         * No reference counting needed for current->mempolicy
2223         * nor system default_policy
2224         */
2225        if (pol->mode == MPOL_INTERLEAVE)
2226                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2227        else
2228                page = __alloc_pages_nodemask(gfp, order,
2229                                policy_node(gfp, pol, numa_node_id()),
2230                                policy_nodemask(gfp, pol));
2231
2232        return page;
2233}
2234EXPORT_SYMBOL(alloc_pages_current);
2235
2236int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2237{
2238        struct mempolicy *pol = mpol_dup(vma_policy(src));
2239
2240        if (IS_ERR(pol))
2241                return PTR_ERR(pol);
2242        dst->vm_policy = pol;
2243        return 0;
2244}
2245
2246/*
2247 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2248 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2249 * with the mems_allowed returned by cpuset_mems_allowed().  This
2250 * keeps mempolicies cpuset relative after its cpuset moves.  See
2251 * further kernel/cpuset.c update_nodemask().
2252 *
2253 * current's mempolicy may be rebound by the other task (the task that changes
2254 * the cpuset's mems), so we needn't do rebind work for the current task.
2255 */
2256
2257/* Slow path of a mempolicy duplicate */
2258struct mempolicy *__mpol_dup(struct mempolicy *old)
2259{
2260        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2261
2262        if (!new)
2263                return ERR_PTR(-ENOMEM);
2264
2265        /* task's mempolicy is protected by alloc_lock */
2266        if (old == current->mempolicy) {
2267                task_lock(current);
2268                *new = *old;
2269                task_unlock(current);
2270        } else
2271                *new = *old;
2272
2273        if (current_cpuset_is_being_rebound()) {
2274                nodemask_t mems = cpuset_mems_allowed(current);
2275                mpol_rebind_policy(new, &mems);
2276        }
2277        atomic_set(&new->refcnt, 1);
2278        return new;
2279}
2280
2281/* Slow path of a mempolicy comparison */
2282bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2283{
2284        if (!a || !b)
2285                return false;
2286        if (a->mode != b->mode)
2287                return false;
2288        if (a->flags != b->flags)
2289                return false;
2290        if (mpol_store_user_nodemask(a))
2291                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2292                        return false;
2293
2294        switch (a->mode) {
2295        case MPOL_BIND:
2296                /* Fall through */
2297        case MPOL_INTERLEAVE:
2298                return !!nodes_equal(a->v.nodes, b->v.nodes);
2299        case MPOL_PREFERRED:
2300                /* a's ->flags is the same as b's */
2301                if (a->flags & MPOL_F_LOCAL)
2302                        return true;
2303                return a->v.preferred_node == b->v.preferred_node;
2304        default:
2305                BUG();
2306                return false;
2307        }
2308}
2309
2310/*
2311 * Shared memory backing store policy support.
2312 *
2313 * Remember policies even when nobody has shared memory mapped.
2314 * The policies are kept in Red-Black tree linked from the inode.
2315 * They are protected by the sp->lock rwlock, which should be held
2316 * for any accesses to the tree.
2317 */
2318
2319/*
2320 * lookup first element intersecting start-end.  Caller holds sp->lock for
2321 * reading or for writing
2322 */
2323static struct sp_node *
2324sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2325{
2326        struct rb_node *n = sp->root.rb_node;
2327
2328        while (n) {
2329                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2330
2331                if (start >= p->end)
2332                        n = n->rb_right;
2333                else if (end <= p->start)
2334                        n = n->rb_left;
2335                else
2336                        break;
2337        }
2338        if (!n)
2339                return NULL;
2340        for (;;) {
2341                struct sp_node *w = NULL;
2342                struct rb_node *prev = rb_prev(n);
2343                if (!prev)
2344                        break;
2345                w = rb_entry(prev, struct sp_node, nd);
2346                if (w->end <= start)
2347                        break;
2348                n = prev;
2349        }
2350        return rb_entry(n, struct sp_node, nd);
2351}
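
/*
 * Worked example (illustrative): if the tree holds the ranges [0,4) and
 * [6,10), then sp_lookup(sp, 3, 8) first descends to [6,10), which
 * intersects [3,8), then steps back via rb_prev() to [0,4) because its
 * end (4) is still above start (3), and returns the [0,4) node, i.e. the
 * lowest range intersecting the request.
 */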
2352
2353/*
2354 * Insert a new shared policy into the list.  Caller holds sp->lock for
2355 * writing.
2356 */
2357static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2358{
2359        struct rb_node **p = &sp->root.rb_node;
2360        struct rb_node *parent = NULL;
2361        struct sp_node *nd;
2362
2363        while (*p) {
2364                parent = *p;
2365                nd = rb_entry(parent, struct sp_node, nd);
2366                if (new->start < nd->start)
2367                        p = &(*p)->rb_left;
2368                else if (new->end > nd->end)
2369                        p = &(*p)->rb_right;
2370                else
2371                        BUG();
2372        }
2373        rb_link_node(&new->nd, parent, p);
2374        rb_insert_color(&new->nd, &sp->root);
2375        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2376                 new->policy ? new->policy->mode : 0);
2377}
2378
2379/* Find shared policy intersecting idx */
2380struct mempolicy *
2381mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2382{
2383        struct mempolicy *pol = NULL;
2384        struct sp_node *sn;
2385
2386        if (!sp->root.rb_node)
2387                return NULL;
2388        read_lock(&sp->lock);
2389        sn = sp_lookup(sp, idx, idx+1);
2390        if (sn) {
2391                mpol_get(sn->policy);
2392                pol = sn->policy;
2393        }
2394        read_unlock(&sp->lock);
2395        return pol;
2396}
2397
2398static void sp_free(struct sp_node *n)
2399{
2400        mpol_put(n->policy);
2401        kmem_cache_free(sn_cache, n);
2402}
2403
2404/**
2405 * mpol_misplaced - check whether current page node is valid in policy
2406 *
2407 * @page: page to be checked
2408 * @vma: vm area where page mapped
2409 * @addr: virtual address where page mapped
2410 *
2411 * Lookup current policy node id for vma,addr and "compare to" page's
2412 * node id.
2413 *
2414 * Returns:
2415 *      -1      - not misplaced, page is in the right node
2416 *      node    - node id where the page should be
2417 *
2418 * Policy determination "mimics" alloc_page_vma().
2419 * Called from fault path where we know the vma and faulting address.
2420 */
2421int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2422{
2423        struct mempolicy *pol;
2424        struct zoneref *z;
2425        int curnid = page_to_nid(page);
2426        unsigned long pgoff;
2427        int thiscpu = raw_smp_processor_id();
2428        int thisnid = cpu_to_node(thiscpu);
2429        int polnid = NUMA_NO_NODE;
2430        int ret = -1;
2431
2432        pol = get_vma_policy(vma, addr);
2433        if (!(pol->flags & MPOL_F_MOF))
2434                goto out;
2435
2436        switch (pol->mode) {
2437        case MPOL_INTERLEAVE:
2438                pgoff = vma->vm_pgoff;
2439                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2440                polnid = offset_il_node(pol, pgoff);
2441                break;
2442
2443        case MPOL_PREFERRED:
2444                if (pol->flags & MPOL_F_LOCAL)
2445                        polnid = numa_node_id();
2446                else
2447                        polnid = pol->v.preferred_node;
2448                break;
2449
2450        case MPOL_BIND:
2451
2452                /*
2453                 * allows binding to multiple nodes.
2454                 * use current page if in policy nodemask,
2455                 * else select nearest allowed node, if any.
2456                 * If no allowed nodes, use current [!misplaced].
2457                 */
2458                if (node_isset(curnid, pol->v.nodes))
2459                        goto out;
2460                z = first_zones_zonelist(
2461                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2462                                gfp_zone(GFP_HIGHUSER),
2463                                &pol->v.nodes);
2464                polnid = zone_to_nid(z->zone);
2465                break;
2466
2467        default:
2468                BUG();
2469        }
2470
2471        /* Migrate the page towards the node whose CPU is referencing it */
2472        if (pol->flags & MPOL_F_MORON) {
2473                polnid = thisnid;
2474
2475                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2476                        goto out;
2477        }
2478
2479        if (curnid != polnid)
2480                ret = polnid;
2481out:
2482        mpol_cond_put(pol);
2483
2484        return ret;
2485}
2486
2487/*
2488 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2489 * dropped after task->mempolicy is set to NULL so that any allocation done as
2490 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2491 * policy.
2492 */
2493void mpol_put_task_policy(struct task_struct *task)
2494{
2495        struct mempolicy *pol;
2496
2497        task_lock(task);
2498        pol = task->mempolicy;
2499        task->mempolicy = NULL;
2500        task_unlock(task);
2501        mpol_put(pol);
2502}
2503
2504static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2505{
2506        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2507        rb_erase(&n->nd, &sp->root);
2508        sp_free(n);
2509}
2510
2511static void sp_node_init(struct sp_node *node, unsigned long start,
2512                        unsigned long end, struct mempolicy *pol)
2513{
2514        node->start = start;
2515        node->end = end;
2516        node->policy = pol;
2517}
2518
2519static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2520                                struct mempolicy *pol)
2521{
2522        struct sp_node *n;
2523        struct mempolicy *newpol;
2524
2525        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2526        if (!n)
2527                return NULL;
2528
2529        newpol = mpol_dup(pol);
2530        if (IS_ERR(newpol)) {
2531                kmem_cache_free(sn_cache, n);
2532                return NULL;
2533        }
2534        newpol->flags |= MPOL_F_SHARED;
2535        sp_node_init(n, start, end, newpol);
2536
2537        return n;
2538}
2539
2540/* Replace a policy range. */
2541static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2542                                 unsigned long end, struct sp_node *new)
2543{
2544        struct sp_node *n;
2545        struct sp_node *n_new = NULL;
2546        struct mempolicy *mpol_new = NULL;
2547        int ret = 0;
2548
2549restart:
2550        write_lock(&sp->lock);
2551        n = sp_lookup(sp, start, end);
2552        /* Take care of old policies in the same range. */
2553        while (n && n->start < end) {
2554                struct rb_node *next = rb_next(&n->nd);
2555                if (n->start >= start) {
2556                        if (n->end <= end)
2557                                sp_delete(sp, n);
2558                        else
2559                                n->start = end;
2560                } else {
2561                        /* Old policy spanning whole new range. */
2562                        if (n->end > end) {
2563                                if (!n_new)
2564                                        goto alloc_new;
2565
2566                                *mpol_new = *n->policy;
2567                                atomic_set(&mpol_new->refcnt, 1);
2568                                sp_node_init(n_new, end, n->end, mpol_new);
2569                                n->end = start;
2570                                sp_insert(sp, n_new);
2571                                n_new = NULL;
2572                                mpol_new = NULL;
2573                                break;
2574                        } else
2575                                n->end = start;
2576                }
2577                if (!next)
2578                        break;
2579                n = rb_entry(next, struct sp_node, nd);
2580        }
2581        if (new)
2582                sp_insert(sp, new);
2583        write_unlock(&sp->lock);
2584        ret = 0;
2585
2586err_out:
2587        if (mpol_new)
2588                mpol_put(mpol_new);
2589        if (n_new)
2590                kmem_cache_free(sn_cache, n_new);
2591
2592        return ret;
2593
2594alloc_new:
2595        write_unlock(&sp->lock);
2596        ret = -ENOMEM;
2597        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2598        if (!n_new)
2599                goto err_out;
2600        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2601        if (!mpol_new)
2602                goto err_out;
2603        goto restart;
2604}
2605
2606/**
2607 * mpol_shared_policy_init - initialize shared policy for inode
2608 * @sp: pointer to inode shared policy
2609 * @mpol:  struct mempolicy to install
2610 *
2611 * Install non-NULL @mpol in inode's shared policy rb-tree.
2612 * On entry, the current task has a reference on a non-NULL @mpol.
2613 * This must be released on exit.
2614 * This is called at get_inode() time, so we can use GFP_KERNEL.
2615 */
2616void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2617{
2618        int ret;
2619
2620        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2621        rwlock_init(&sp->lock);
2622
2623        if (mpol) {
2624                struct vm_area_struct pvma;
2625                struct mempolicy *new;
2626                NODEMASK_SCRATCH(scratch);
2627
2628                if (!scratch)
2629                        goto put_mpol;
2630                /* contextualize the tmpfs mount point mempolicy */
2631                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2632                if (IS_ERR(new))
2633                        goto free_scratch; /* no valid nodemask intersection */
2634
2635                task_lock(current);
2636                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2637                task_unlock(current);
2638                if (ret)
2639                        goto put_new;
2640
2641                /* Create pseudo-vma that contains just the policy */
2642                memset(&pvma, 0, sizeof(struct vm_area_struct));
2643                vma_init(&pvma, NULL);
2644                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2645                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2646
2647put_new:
2648                mpol_put(new);                  /* drop initial ref */
2649free_scratch:
2650                NODEMASK_SCRATCH_FREE(scratch);
2651put_mpol:
2652                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2653        }
2654}
2655
2656int mpol_set_shared_policy(struct shared_policy *info,
2657                        struct vm_area_struct *vma, struct mempolicy *npol)
2658{
2659        int err;
2660        struct sp_node *new = NULL;
2661        unsigned long sz = vma_pages(vma);
2662
2663        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2664                 vma->vm_pgoff,
2665                 sz, npol ? npol->mode : -1,
2666                 npol ? npol->flags : -1,
2667                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2668
2669        if (npol) {
2670                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2671                if (!new)
2672                        return -ENOMEM;
2673        }
2674        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2675        if (err && new)
2676                sp_free(new);
2677        return err;
2678}
2679
2680/* Free a backing policy store on inode delete. */
2681void mpol_free_shared_policy(struct shared_policy *p)
2682{
2683        struct sp_node *n;
2684        struct rb_node *next;
2685
2686        if (!p->root.rb_node)
2687                return;
2688        write_lock(&p->lock);
2689        next = rb_first(&p->root);
2690        while (next) {
2691                n = rb_entry(next, struct sp_node, nd);
2692                next = rb_next(&n->nd);
2693                sp_delete(p, n);
2694        }
2695        write_unlock(&p->lock);
2696}
2697
2698#ifdef CONFIG_NUMA_BALANCING
2699static int __initdata numabalancing_override;
2700
2701static void __init check_numabalancing_enable(void)
2702{
2703        bool numabalancing_default = false;
2704
2705        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2706                numabalancing_default = true;
2707
2708        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2709        if (numabalancing_override)
2710                set_numabalancing_state(numabalancing_override == 1);
2711
2712        if (num_online_nodes() > 1 && !numabalancing_override) {
2713                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2714                        numabalancing_default ? "Enabling" : "Disabling");
2715                set_numabalancing_state(numabalancing_default);
2716        }
2717}
2718
2719static int __init setup_numabalancing(char *str)
2720{
2721        int ret = 0;
2722        if (!str)
2723                goto out;
2724
2725        if (!strcmp(str, "enable")) {
2726                numabalancing_override = 1;
2727                ret = 1;
2728        } else if (!strcmp(str, "disable")) {
2729                numabalancing_override = -1;
2730                ret = 1;
2731        }
2732out:
2733        if (!ret)
2734                pr_warn("Unable to parse numa_balancing=\n");
2735
2736        return ret;
2737}
2738__setup("numa_balancing=", setup_numabalancing);
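
/*
 * Example: automatic NUMA balancing can be forced on or off from the boot
 * command line with "numa_balancing=enable" or "numa_balancing=disable";
 * at runtime the kernel.numa_balancing sysctl serves the same purpose.
 */
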
2739#else
2740static inline void __init check_numabalancing_enable(void)
2741{
2742}
2743#endif /* CONFIG_NUMA_BALANCING */
2744
2745/* assumes fs == KERNEL_DS */
2746void __init numa_policy_init(void)
2747{
2748        nodemask_t interleave_nodes;
2749        unsigned long largest = 0;
2750        int nid, prefer = 0;
2751
2752        policy_cache = kmem_cache_create("numa_policy",
2753                                         sizeof(struct mempolicy),
2754                                         0, SLAB_PANIC, NULL);
2755
2756        sn_cache = kmem_cache_create("shared_policy_node",
2757                                     sizeof(struct sp_node),
2758                                     0, SLAB_PANIC, NULL);
2759
2760        for_each_node(nid) {
2761                preferred_node_policy[nid] = (struct mempolicy) {
2762                        .refcnt = ATOMIC_INIT(1),
2763                        .mode = MPOL_PREFERRED,
2764                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2765                        .v = { .preferred_node = nid, },
2766                };
2767        }
2768
2769        /*
2770         * Set interleaving policy for system init. Interleaving is only
2771         * enabled across suitably sized nodes (default is >= 16MB), or
2772         * we fall back to the largest node if they're all smaller.
2773         */
2774        nodes_clear(interleave_nodes);
2775        for_each_node_state(nid, N_MEMORY) {
2776                unsigned long total_pages = node_present_pages(nid);
2777
2778                /* Preserve the largest node */
2779                if (largest < total_pages) {
2780                        largest = total_pages;
2781                        prefer = nid;
2782                }
2783
2784                /* Interleave this node? */
2785                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2786                        node_set(nid, interleave_nodes);
2787        }
2788
2789        /* All too small, use the largest */
2790        if (unlikely(nodes_empty(interleave_nodes)))
2791                node_set(prefer, interleave_nodes);
2792
2793        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2794                pr_err("%s: interleaving failed\n", __func__);
2795
2796        check_numabalancing_enable();
2797}
2798
2799/* Reset policy of current process to default */
2800void numa_default_policy(void)
2801{
2802        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2803}
2804
2805/*
2806 * Parse and format mempolicy from/to strings
2807 */
2808
2809/*
2810 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2811 */
2812static const char * const policy_modes[] =
2813{
2814        [MPOL_DEFAULT]    = "default",
2815        [MPOL_PREFERRED]  = "prefer",
2816        [MPOL_BIND]       = "bind",
2817        [MPOL_INTERLEAVE] = "interleave",
2818        [MPOL_LOCAL]      = "local",
2819};
2820
2821
2822#ifdef CONFIG_TMPFS
2823/**
2824 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2825 * @str:  string containing mempolicy to parse
2826 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2827 *
2828 * Format of input:
2829 *      <mode>[=<flags>][:<nodelist>]
2830 *
2831 * On success, returns 0, else 1
2832 */
2833int mpol_parse_str(char *str, struct mempolicy **mpol)
2834{
2835        struct mempolicy *new = NULL;
2836        unsigned short mode;
2837        unsigned short mode_flags;
2838        nodemask_t nodes;
2839        char *nodelist = strchr(str, ':');
2840        char *flags = strchr(str, '=');
2841        int err = 1;
2842
2843        if (flags)
2844                *flags++ = '\0';        /* terminate mode string */
2845
2846        if (nodelist) {
2847                /* NUL-terminate mode or flags string */
2848                *nodelist++ = '\0';
2849                if (nodelist_parse(nodelist, nodes))
2850                        goto out;
2851                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2852                        goto out;
2853        } else
2854                nodes_clear(nodes);
2855
2856        for (mode = 0; mode < MPOL_MAX; mode++) {
2857                if (!strcmp(str, policy_modes[mode])) {
2858                        break;
2859                }
2860        }
2861        if (mode >= MPOL_MAX)
2862                goto out;
2863
2864        switch (mode) {
2865        case MPOL_PREFERRED:
2866                /*
2867                 * Insist on a nodelist of one node only, although later
2868                 * we use first_node(nodes) to grab a single node, so here
2869                 * nodelist (or nodes) cannot be empty.
2870                 */
2871                if (nodelist) {
2872                        char *rest = nodelist;
2873                        while (isdigit(*rest))
2874                                rest++;
2875                        if (*rest)
2876                                goto out;
2877                        if (nodes_empty(nodes))
2878                                goto out;
2879                }
2880                break;
2881        case MPOL_INTERLEAVE:
2882                /*
2883                 * Default to online nodes with memory if no nodelist
2884                 */
2885                if (!nodelist)
2886                        nodes = node_states[N_MEMORY];
2887                break;
2888        case MPOL_LOCAL:
2889                /*
2890                 * Don't allow a nodelist;  mpol_new() checks flags
2891                 */
2892                if (nodelist)
2893                        goto out;
2894                mode = MPOL_PREFERRED;
2895                break;
2896        case MPOL_DEFAULT:
2897                /*
2898                 * Insist on an empty nodelist
2899                 */
2900                if (!nodelist)
2901                        err = 0;
2902                goto out;
2903        case MPOL_BIND:
2904                /*
2905                 * Insist on a nodelist
2906                 */
2907                if (!nodelist)
2908                        goto out;
2909        }
2910
2911        mode_flags = 0;
2912        if (flags) {
2913                /*
2914                 * Currently, we only support two mutually exclusive
2915                 * mode flags.
2916                 */
2917                if (!strcmp(flags, "static"))
2918                        mode_flags |= MPOL_F_STATIC_NODES;
2919                else if (!strcmp(flags, "relative"))
2920                        mode_flags |= MPOL_F_RELATIVE_NODES;
2921                else
2922                        goto out;
2923        }
2924
2925        new = mpol_new(mode, mode_flags, &nodes);
2926        if (IS_ERR(new))
2927                goto out;
2928
2929        /*
2930         * Save nodes for mpol_to_str() to show the tmpfs mount options
2931         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2932         */
2933        if (mode != MPOL_PREFERRED)
2934                new->v.nodes = nodes;
2935        else if (nodelist)
2936                new->v.preferred_node = first_node(nodes);
2937        else
2938                new->flags |= MPOL_F_LOCAL;
2939
2940        /*
2941         * Save nodes for contextualization: this will be used to "clone"
2942         * the mempolicy in a specific context [cpuset] at a later time.
2943         */
2944        new->w.user_nodemask = nodes;
2945
2946        err = 0;
2947
2948out:
2949        /* Restore string for error message */
2950        if (nodelist)
2951                *--nodelist = ':';
2952        if (flags)
2953                *--flags = '=';
2954        if (!err)
2955                *mpol = new;
2956        return err;
2957}
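
/*
 * Example: the strings accepted here are what tmpfs passes in from its
 * "mpol=" mount option, e.g. (illustrative, assuming the named nodes
 * exist and have memory):
 *
 *	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 *	mount -t tmpfs -o mpol=prefer=static:1 tmpfs /mnt
 *	mount -t tmpfs -o mpol=bind:0,2 tmpfs /mnt
 *	mount -t tmpfs -o mpol=local tmpfs /mnt
 */
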
2958#endif /* CONFIG_TMPFS */
2959
2960/**
2961 * mpol_to_str - format a mempolicy structure for printing
2962 * @buffer:  to contain formatted mempolicy string
2963 * @maxlen:  length of @buffer
2964 * @pol:  pointer to mempolicy to be formatted
2965 *
2966 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2967 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2968 * longest flag, "relative", and to display at least a few node ids.
2969 */
2970void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2971{
2972        char *p = buffer;
2973        nodemask_t nodes = NODE_MASK_NONE;
2974        unsigned short mode = MPOL_DEFAULT;
2975        unsigned short flags = 0;
2976
2977        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2978                mode = pol->mode;
2979                flags = pol->flags;
2980        }
2981
2982        switch (mode) {
2983        case MPOL_DEFAULT:
2984                break;
2985        case MPOL_PREFERRED:
2986                if (flags & MPOL_F_LOCAL)
2987                        mode = MPOL_LOCAL;
2988                else
2989                        node_set(pol->v.preferred_node, nodes);
2990                break;
2991        case MPOL_BIND:
2992        case MPOL_INTERLEAVE:
2993                nodes = pol->v.nodes;
2994                break;
2995        default:
2996                WARN_ON_ONCE(1);
2997                snprintf(p, maxlen, "unknown");
2998                return;
2999        }
3000
3001        p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3002
3003        if (flags & MPOL_MODE_FLAGS) {
3004                p += snprintf(p, buffer + maxlen - p, "=");
3005
3006                /*
3007                 * Currently, the only defined flags are mutually exclusive
3008                 */
3009                if (flags & MPOL_F_STATIC_NODES)
3010                        p += snprintf(p, buffer + maxlen - p, "static");
3011                else if (flags & MPOL_F_RELATIVE_NODES)
3012                        p += snprintf(p, buffer + maxlen - p, "relative");
3013        }
3014
3015        if (!nodes_empty(nodes))
3016                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3017                               nodemask_pr_args(&nodes));
3018}
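
/*
 * Example outputs (illustrative): an interleave policy over nodes 0-3 is
 * formatted as "interleave:0-3", a bind policy with MPOL_F_STATIC_NODES
 * over nodes 1 and 3 as "bind=static:1,3", a preferred policy on node 2
 * as "prefer:2", and a local preferred policy simply as "local".
 */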
3019