LXR linux/mm/memcontrol.c

   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* memcontrol.c - Memory Controller
   3 *
   4 * Copyright IBM Corporation, 2007
   5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   6 *
   7 * Copyright 2007 OpenVZ SWsoft Inc
   8 * Author: Pavel Emelianov <xemul@openvz.org>
   9 *
  10 * Memory thresholds
  11 * Copyright (C) 2009 Nokia Corporation
  12 * Author: Kirill A. Shutemov
  13 *
  14 * Kernel Memory Controller
  15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  16 * Authors: Glauber Costa and Suleiman Souhlal
  17 *
  18 * Native page reclaim
  19 * Charge lifetime sanitation
  20 * Lockless page tracking & accounting
  21 * Unified hierarchy configuration model
  22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  23 */
  24
  25#include <linux/page_counter.h>
  26#include <linux/memcontrol.h>
  27#include <linux/cgroup.h>
  28#include <linux/pagewalk.h>
  29#include <linux/sched/mm.h>
  30#include <linux/shmem_fs.h>
  31#include <linux/hugetlb.h>
  32#include <linux/pagemap.h>
  33#include <linux/vm_event_item.h>
  34#include <linux/smp.h>
  35#include <linux/page-flags.h>
  36#include <linux/backing-dev.h>
  37#include <linux/bit_spinlock.h>
  38#include <linux/rcupdate.h>
  39#include <linux/limits.h>
  40#include <linux/export.h>
  41#include <linux/mutex.h>
  42#include <linux/rbtree.h>
  43#include <linux/slab.h>
  44#include <linux/swap.h>
  45#include <linux/swapops.h>
  46#include <linux/spinlock.h>
  47#include <linux/eventfd.h>
  48#include <linux/poll.h>
  49#include <linux/sort.h>
  50#include <linux/fs.h>
  51#include <linux/seq_file.h>
  52#include <linux/vmpressure.h>
  53#include <linux/mm_inline.h>
  54#include <linux/swap_cgroup.h>
  55#include <linux/cpu.h>
  56#include <linux/oom.h>
  57#include <linux/lockdep.h>
  58#include <linux/file.h>
  59#include <linux/tracehook.h>
  60#include <linux/psi.h>
  61#include <linux/seq_buf.h>
  62#include "internal.h"
  63#include <net/sock.h>
  64#include <net/ip.h>
  65#include "slab.h"
  66
  67#include <linux/uaccess.h>
  68
  69#include <trace/events/vmscan.h>
  70
  71struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  72EXPORT_SYMBOL(memory_cgrp_subsys);
  73
  74struct mem_cgroup *root_mem_cgroup __read_mostly;
  75
  76#define MEM_CGROUP_RECLAIM_RETRIES      5
  77
  78/* Socket memory accounting disabled? */
  79static bool cgroup_memory_nosocket;
  80
  81/* Kernel memory accounting disabled? */
  82static bool cgroup_memory_nokmem;
  83
  84/* Whether the swap controller is active */
  85#ifdef CONFIG_MEMCG_SWAP
  86int do_swap_account __read_mostly;
  87#else
  88#define do_swap_account         0
  89#endif
  90
  91#ifdef CONFIG_CGROUP_WRITEBACK
  92static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
  93#endif
  94
  95/* Whether legacy memory+swap accounting is active */
  96static bool do_memsw_account(void)
  97{
  98        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
  99}
 100
 101#define THRESHOLDS_EVENTS_TARGET 128
 102#define SOFTLIMIT_EVENTS_TARGET 1024
 103
 104/*
 105 * Cgroups above their limits are maintained in a RB-Tree, independent of
 106 * their hierarchy representation
 107 */
 108
 109struct mem_cgroup_tree_per_node {
 110        struct rb_root rb_root;
 111        struct rb_node *rb_rightmost;
 112        spinlock_t lock;
 113};
 114
 115struct mem_cgroup_tree {
 116        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 117};
 118
 119static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 120
 121/* for OOM */
 122struct mem_cgroup_eventfd_list {
 123        struct list_head list;
 124        struct eventfd_ctx *eventfd;
 125};
 126
 127/*
 128 * cgroup_event represents events which userspace want to receive.
 129 */
 130struct mem_cgroup_event {
 131        /*
 132         * memcg which the event belongs to.
 133         */
 134        struct mem_cgroup *memcg;
 135        /*
 136         * eventfd to signal userspace about the event.
 137         */
 138        struct eventfd_ctx *eventfd;
 139        /*
 140         * Each of these stored in a list by the cgroup.
 141         */
 142        struct list_head list;
 143        /*
 144         * register_event() callback will be used to add new userspace
 145         * waiter for changes related to this event.  Use eventfd_signal()
 146         * on eventfd to send notification to userspace.
 147         */
 148        int (*register_event)(struct mem_cgroup *memcg,
 149                              struct eventfd_ctx *eventfd, const char *args);
 150        /*
 151         * unregister_event() callback will be called when userspace closes
 152         * the eventfd or on cgroup removing.  This callback must be set,
 153         * if you want provide notification functionality.
 154         */
 155        void (*unregister_event)(struct mem_cgroup *memcg,
 156                                 struct eventfd_ctx *eventfd);
 157        /*
 158         * All fields below needed to unregister event when
 159         * userspace closes eventfd.
 160         */
 161        poll_table pt;
 162        wait_queue_head_t *wqh;
 163        wait_queue_entry_t wait;
 164        struct work_struct remove;
 165};
 166
 167static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 168static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 169
 170/* Stuffs for move charges at task migration. */
 171/*
 172 * Types of charges to be moved.
 173 */
 174#define MOVE_ANON       0x1U
 175#define MOVE_FILE       0x2U
 176#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 177
 178/* "mc" and its members are protected by cgroup_mutex */
 179static struct move_charge_struct {
 180        spinlock_t        lock; /* for from, to */
 181        struct mm_struct  *mm;
 182        struct mem_cgroup *from;
 183        struct mem_cgroup *to;
 184        unsigned long flags;
 185        unsigned long precharge;
 186        unsigned long moved_charge;
 187        unsigned long moved_swap;
 188        struct task_struct *moving_task;        /* a task moving charges */
 189        wait_queue_head_t waitq;                /* a waitq for other context */
 190} mc = {
 191        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 192        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 193};
 194
 195/*
 196 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 197 * limit reclaim to prevent infinite loops, if they ever occur.
 198 */
 199#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 200#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 201
 202enum charge_type {
 203        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 204        MEM_CGROUP_CHARGE_TYPE_ANON,
 205        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 206        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 207        NR_CHARGE_TYPE,
 208};
 209
 210/* for encoding cft->private value on file */
 211enum res_type {
 212        _MEM,
 213        _MEMSWAP,
 214        _OOM_TYPE,
 215        _KMEM,
 216        _TCP,
 217};
 218
 219#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 220#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 221#define MEMFILE_ATTR(val)       ((val) & 0xffff)
 222/* Used for OOM nofiier */
 223#define OOM_CONTROL             (0)
 224
 225/*
 226 * Iteration constructs for visiting all cgroups (under a tree).  If
 227 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 228 * be used for reference counting.
 229 */
 230#define for_each_mem_cgroup_tree(iter, root)            \
 231        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 232             iter != NULL;                              \
 233             iter = mem_cgroup_iter(root, iter, NULL))
 234
 235#define for_each_mem_cgroup(iter)                       \
 236        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 237             iter != NULL;                              \
 238             iter = mem_cgroup_iter(NULL, iter, NULL))
 239
 240static inline bool should_force_charge(void)
 241{
 242        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 243                (current->flags & PF_EXITING);
 244}
 245
 246/* Some nice accessors for the vmpressure. */
 247struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 248{
 249        if (!memcg)
 250                memcg = root_mem_cgroup;
 251        return &memcg->vmpressure;
 252}
 253
 254struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 255{
 256        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 257}
 258
 259#ifdef CONFIG_MEMCG_KMEM
 260/*
 261 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 262 * The main reason for not using cgroup id for this:
 263 *  this works better in sparse environments, where we have a lot of memcgs,
 264 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 265 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 266 *  200 entry array for that.
 267 *
 268 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 269 * will double each time we have to increase it.
 270 */
 271static DEFINE_IDA(memcg_cache_ida);
 272int memcg_nr_cache_ids;
 273
 274/* Protects memcg_nr_cache_ids */
 275static DECLARE_RWSEM(memcg_cache_ids_sem);
 276
 277void memcg_get_cache_ids(void)
 278{
 279        down_read(&memcg_cache_ids_sem);
 280}
 281
 282void memcg_put_cache_ids(void)
 283{
 284        up_read(&memcg_cache_ids_sem);
 285}
 286
 287/*
 288 * MIN_SIZE is different than 1, because we would like to avoid going through
 289 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 290 * cgroups is a reasonable guess. In the future, it could be a parameter or
 291 * tunable, but that is strictly not necessary.
 292 *
 293 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 294 * this constant directly from cgroup, but it is understandable that this is
 295 * better kept as an internal representation in cgroup.c. In any case, the
 296 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 297 * increase ours as well if it increases.
 298 */
 299#define MEMCG_CACHES_MIN_SIZE 4
 300#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 301
 302/*
 303 * A lot of the calls to the cache allocation functions are expected to be
 304 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 305 * conditional to this static branch, we'll have to allow modules that does
 306 * kmem_cache_alloc and the such to see this symbol as well
 307 */
 308DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 309EXPORT_SYMBOL(memcg_kmem_enabled_key);
 310
 311struct workqueue_struct *memcg_kmem_cache_wq;
 312#endif
 313
 314static int memcg_shrinker_map_size;
 315static DEFINE_MUTEX(memcg_shrinker_map_mutex);
 316
 317static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
 318{
 319        kvfree(container_of(head, struct memcg_shrinker_map, rcu));
 320}
 321
 322static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 323                                         int size, int old_size)
 324{
 325        struct memcg_shrinker_map *new, *old;
 326        int nid;
 327
 328        lockdep_assert_held(&memcg_shrinker_map_mutex);
 329
 330        for_each_node(nid) {
 331                old = rcu_dereference_protected(
 332                        mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
 333                /* Not yet online memcg */
 334                if (!old)
 335                        return 0;
 336
 337                new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
 338                if (!new)
 339                        return -ENOMEM;
 340
 341                /* Set all old bits, clear all new bits */
 342                memset(new->map, (int)0xff, old_size);
 343                memset((void *)new->map + old_size, 0, size - old_size);
 344
 345                rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
 346                call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
 347        }
 348
 349        return 0;
 350}
 351
 352static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
 353{
 354        struct mem_cgroup_per_node *pn;
 355        struct memcg_shrinker_map *map;
 356        int nid;
 357
 358        if (mem_cgroup_is_root(memcg))
 359                return;
 360
 361        for_each_node(nid) {
 362                pn = mem_cgroup_nodeinfo(memcg, nid);
 363                map = rcu_dereference_protected(pn->shrinker_map, true);
 364                if (map)
 365                        kvfree(map);
 366                rcu_assign_pointer(pn->shrinker_map, NULL);
 367        }
 368}
 369
 370static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
 371{
 372        struct memcg_shrinker_map *map;
 373        int nid, size, ret = 0;
 374
 375        if (mem_cgroup_is_root(memcg))
 376                return 0;
 377
 378        mutex_lock(&memcg_shrinker_map_mutex);
 379        size = memcg_shrinker_map_size;
 380        for_each_node(nid) {
 381                map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
 382                if (!map) {
 383                        memcg_free_shrinker_maps(memcg);
 384                        ret = -ENOMEM;
 385                        break;
 386                }
 387                rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
 388        }
 389        mutex_unlock(&memcg_shrinker_map_mutex);
 390
 391        return ret;
 392}
 393
 394int memcg_expand_shrinker_maps(int new_id)
 395{
 396        int size, old_size, ret = 0;
 397        struct mem_cgroup *memcg;
 398
 399        size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
 400        old_size = memcg_shrinker_map_size;
 401        if (size <= old_size)
 402                return 0;
 403
 404        mutex_lock(&memcg_shrinker_map_mutex);
 405        if (!root_mem_cgroup)
 406                goto unlock;
 407
 408        for_each_mem_cgroup(memcg) {
 409                if (mem_cgroup_is_root(memcg))
 410                        continue;
 411                ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
 412                if (ret)
 413                        goto unlock;
 414        }
 415unlock:
 416        if (!ret)
 417                memcg_shrinker_map_size = size;
 418        mutex_unlock(&memcg_shrinker_map_mutex);
 419        return ret;
 420}
 421
 422void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 423{
 424        if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 425                struct memcg_shrinker_map *map;
 426
 427                rcu_read_lock();
 428                map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
 429                /* Pairs with smp mb in shrink_slab() */
 430                smp_mb__before_atomic();
 431                set_bit(shrinker_id, map->map);
 432                rcu_read_unlock();
 433        }
 434}
 435
 436/**
 437 * mem_cgroup_css_from_page - css of the memcg associated with a page
 438 * @page: page of interest
 439 *
 440 * If memcg is bound to the default hierarchy, css of the memcg associated
 441 * with @page is returned.  The returned css remains associated with @page
 442 * until it is released.
 443 *
 444 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 445 * is returned.
 446 */
 447struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 448{
 449        struct mem_cgroup *memcg;
 450
 451        memcg = page->mem_cgroup;
 452
 453        if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 454                memcg = root_mem_cgroup;
 455
 456        return &memcg->css;
 457}
 458
 459/**
 460 * page_cgroup_ino - return inode number of the memcg a page is charged to
 461 * @page: the page
 462 *
 463 * Look up the closest online ancestor of the memory cgroup @page is charged to
 464 * and return its inode number or 0 if @page is not charged to any cgroup. It
 465 * is safe to call this function without holding a reference to @page.
 466 *
 467 * Note, this function is inherently racy, because there is nothing to prevent
 468 * the cgroup inode from getting torn down and potentially reallocated a moment
 469 * after page_cgroup_ino() returns, so it only should be used by callers that
 470 * do not care (such as procfs interfaces).
 471 */
 472ino_t page_cgroup_ino(struct page *page)
 473{
 474        struct mem_cgroup *memcg;
 475        unsigned long ino = 0;
 476
 477        rcu_read_lock();
 478        if (PageSlab(page) && !PageTail(page))
 479                memcg = memcg_from_slab_page(page);
 480        else
 481                memcg = READ_ONCE(page->mem_cgroup);
 482        while (memcg && !(memcg->css.flags & CSS_ONLINE))
 483                memcg = parent_mem_cgroup(memcg);
 484        if (memcg)
 485                ino = cgroup_ino(memcg->css.cgroup);
 486        rcu_read_unlock();
 487        return ino;
 488}
 489
 490static struct mem_cgroup_per_node *
 491mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 492{
 493        int nid = page_to_nid(page);
 494
 495        return memcg->nodeinfo[nid];
 496}
 497
 498static struct mem_cgroup_tree_per_node *
 499soft_limit_tree_node(int nid)
 500{
 501        return soft_limit_tree.rb_tree_per_node[nid];
 502}
 503
 504static struct mem_cgroup_tree_per_node *
 505soft_limit_tree_from_page(struct page *page)
 506{
 507        int nid = page_to_nid(page);
 508
 509        return soft_limit_tree.rb_tree_per_node[nid];
 510}
 511
 512static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 513                                         struct mem_cgroup_tree_per_node *mctz,
 514                                         unsigned long new_usage_in_excess)
 515{
 516        struct rb_node **p = &mctz->rb_root.rb_node;
 517        struct rb_node *parent = NULL;
 518        struct mem_cgroup_per_node *mz_node;
 519        bool rightmost = true;
 520
 521        if (mz->on_tree)
 522                return;
 523
 524        mz->usage_in_excess = new_usage_in_excess;
 525        if (!mz->usage_in_excess)
 526                return;
 527        while (*p) {
 528                parent = *p;
 529                mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 530                                        tree_node);
 531                if (mz->usage_in_excess < mz_node->usage_in_excess) {
 532                        p = &(*p)->rb_left;
 533                        rightmost = false;
 534                }
 535
 536                /*
 537                 * We can't avoid mem cgroups that are over their soft
 538                 * limit by the same amount
 539                 */
 540                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 541                        p = &(*p)->rb_right;
 542        }
 543
 544        if (rightmost)
 545                mctz->rb_rightmost = &mz->tree_node;
 546
 547        rb_link_node(&mz->tree_node, parent, p);
 548        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 549        mz->on_tree = true;
 550}
 551
 552static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 553                                         struct mem_cgroup_tree_per_node *mctz)
 554{
 555        if (!mz->on_tree)
 556                return;
 557
 558        if (&mz->tree_node == mctz->rb_rightmost)
 559                mctz->rb_rightmost = rb_prev(&mz->tree_node);
 560
 561        rb_erase(&mz->tree_node, &mctz->rb_root);
 562        mz->on_tree = false;
 563}
 564
 565static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 566                                       struct mem_cgroup_tree_per_node *mctz)
 567{
 568        unsigned long flags;
 569
 570        spin_lock_irqsave(&mctz->lock, flags);
 571        __mem_cgroup_remove_exceeded(mz, mctz);
 572        spin_unlock_irqrestore(&mctz->lock, flags);
 573}
 574
 575static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 576{
 577        unsigned long nr_pages = page_counter_read(&memcg->memory);
 578        unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 579        unsigned long excess = 0;
 580
 581        if (nr_pages > soft_limit)
 582                excess = nr_pages - soft_limit;
 583
 584        return excess;
 585}
 586
 587static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 588{
 589        unsigned long excess;
 590        struct mem_cgroup_per_node *mz;
 591        struct mem_cgroup_tree_per_node *mctz;
 592
 593        mctz = soft_limit_tree_from_page(page);
 594        if (!mctz)
 595                return;
 596        /*
 597         * Necessary to update all ancestors when hierarchy is used.
 598         * because their event counter is not touched.
 599         */
 600        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 601                mz = mem_cgroup_page_nodeinfo(memcg, page);
 602                excess = soft_limit_excess(memcg);
 603                /*
 604                 * We have to update the tree if mz is on RB-tree or
 605                 * mem is over its softlimit.
 606                 */
 607                if (excess || mz->on_tree) {
 608                        unsigned long flags;
 609
 610                        spin_lock_irqsave(&mctz->lock, flags);
 611                        /* if on-tree, remove it */
 612                        if (mz->on_tree)
 613                                __mem_cgroup_remove_exceeded(mz, mctz);
 614                        /*
 615                         * Insert again. mz->usage_in_excess will be updated.
 616                         * If excess is 0, no tree ops.
 617                         */
 618                        __mem_cgroup_insert_exceeded(mz, mctz, excess);
 619                        spin_unlock_irqrestore(&mctz->lock, flags);
 620                }
 621        }
 622}
 623
 624static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 625{
 626        struct mem_cgroup_tree_per_node *mctz;
 627        struct mem_cgroup_per_node *mz;
 628        int nid;
 629
 630        for_each_node(nid) {
 631                mz = mem_cgroup_nodeinfo(memcg, nid);
 632                mctz = soft_limit_tree_node(nid);
 633                if (mctz)
 634                        mem_cgroup_remove_exceeded(mz, mctz);
 635        }
 636}
 637
 638static struct mem_cgroup_per_node *
 639__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 640{
 641        struct mem_cgroup_per_node *mz;
 642
 643retry:
 644        mz = NULL;
 645        if (!mctz->rb_rightmost)
 646                goto done;              /* Nothing to reclaim from */
 647
 648        mz = rb_entry(mctz->rb_rightmost,
 649                      struct mem_cgroup_per_node, tree_node);
 650        /*
 651         * Remove the node now but someone else can add it back,
 652         * we will to add it back at the end of reclaim to its correct
 653         * position in the tree.
 654         */
 655        __mem_cgroup_remove_exceeded(mz, mctz);
 656        if (!soft_limit_excess(mz->memcg) ||
 657            !css_tryget_online(&mz->memcg->css))
 658                goto retry;
 659done:
 660        return mz;
 661}
 662
 663static struct mem_cgroup_per_node *
 664mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 665{
 666        struct mem_cgroup_per_node *mz;
 667
 668        spin_lock_irq(&mctz->lock);
 669        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 670        spin_unlock_irq(&mctz->lock);
 671        return mz;
 672}
 673
 674/**
 675 * __mod_memcg_state - update cgroup memory statistics
 676 * @memcg: the memory cgroup
 677 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 678 * @val: delta to add to the counter, can be negative
 679 */
 680void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 681{
 682        long x;
 683
 684        if (mem_cgroup_disabled())
 685                return;
 686
 687        x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 688        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 689                struct mem_cgroup *mi;
 690
 691                /*
 692                 * Batch local counters to keep them in sync with
 693                 * the hierarchical ones.
 694                 */
 695                __this_cpu_add(memcg->vmstats_local->stat[idx], x);
 696                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 697                        atomic_long_add(x, &mi->vmstats[idx]);
 698                x = 0;
 699        }
 700        __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
 701}
 702
 703static struct mem_cgroup_per_node *
 704parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
 705{
 706        struct mem_cgroup *parent;
 707
 708        parent = parent_mem_cgroup(pn->memcg);
 709        if (!parent)
 710                return NULL;
 711        return mem_cgroup_nodeinfo(parent, nid);
 712}
 713
 714/**
 715 * __mod_lruvec_state - update lruvec memory statistics
 716 * @lruvec: the lruvec
 717 * @idx: the stat item
 718 * @val: delta to add to the counter, can be negative
 719 *
 720 * The lruvec is the intersection of the NUMA node and a cgroup. This
 721 * function updates the all three counters that are affected by a
 722 * change of state at this level: per-node, per-cgroup, per-lruvec.
 723 */
 724void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 725                        int val)
 726{
 727        pg_data_t *pgdat = lruvec_pgdat(lruvec);
 728        struct mem_cgroup_per_node *pn;
 729        struct mem_cgroup *memcg;
 730        long x;
 731
 732        /* Update node */
 733        __mod_node_page_state(pgdat, idx, val);
 734
 735        if (mem_cgroup_disabled())
 736                return;
 737
 738        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 739        memcg = pn->memcg;
 740
 741        /* Update memcg */
 742        __mod_memcg_state(memcg, idx, val);
 743
 744        /* Update lruvec */
 745        __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 746
 747        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 748        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 749                struct mem_cgroup_per_node *pi;
 750
 751                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 752                        atomic_long_add(x, &pi->lruvec_stat[idx]);
 753                x = 0;
 754        }
 755        __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 756}
 757
 758void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 759{
 760        struct page *page = virt_to_head_page(p);
 761        pg_data_t *pgdat = page_pgdat(page);
 762        struct mem_cgroup *memcg;
 763        struct lruvec *lruvec;
 764
 765        rcu_read_lock();
 766        memcg = memcg_from_slab_page(page);
 767
 768        /* Untracked pages have no memcg, no lruvec. Update only the node */
 769        if (!memcg || memcg == root_mem_cgroup) {
 770                __mod_node_page_state(pgdat, idx, val);
 771        } else {
 772                lruvec = mem_cgroup_lruvec(memcg, pgdat);
 773                __mod_lruvec_state(lruvec, idx, val);
 774        }
 775        rcu_read_unlock();
 776}
 777
 778/**
 779 * __count_memcg_events - account VM events in a cgroup
 780 * @memcg: the memory cgroup
 781 * @idx: the event item
 782 * @count: the number of events that occured
 783 */
 784void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 785                          unsigned long count)
 786{
 787        unsigned long x;
 788
 789        if (mem_cgroup_disabled())
 790                return;
 791
 792        x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 793        if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 794                struct mem_cgroup *mi;
 795
 796                /*
 797                 * Batch local counters to keep them in sync with
 798                 * the hierarchical ones.
 799                 */
 800                __this_cpu_add(memcg->vmstats_local->events[idx], x);
 801                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 802                        atomic_long_add(x, &mi->vmevents[idx]);
 803                x = 0;
 804        }
 805        __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
 806}
 807
 808static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 809{
 810        return atomic_long_read(&memcg->vmevents[event]);
 811}
 812
 813static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 814{
 815        long x = 0;
 816        int cpu;
 817
 818        for_each_possible_cpu(cpu)
 819                x += per_cpu(memcg->vmstats_local->events[event], cpu);
 820        return x;
 821}
 822
 823static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 824                                         struct page *page,
 825                                         bool compound, int nr_pages)
 826{
 827        /*
 828         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 829         * counted as CACHE even if it's on ANON LRU.
 830         */
 831        if (PageAnon(page))
 832                __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
 833        else {
 834                __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
 835                if (PageSwapBacked(page))
 836                        __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
 837        }
 838
 839        if (compound) {
 840                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 841                __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
 842        }
 843
 844        /* pagein of a big page is an event. So, ignore page size */
 845        if (nr_pages > 0)
 846                __count_memcg_events(memcg, PGPGIN, 1);
 847        else {
 848                __count_memcg_events(memcg, PGPGOUT, 1);
 849                nr_pages = -nr_pages; /* for event */
 850        }
 851
 852        __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 853}
 854
 855static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 856                                       enum mem_cgroup_events_target target)
 857{
 858        unsigned long val, next;
 859
 860        val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 861        next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 862        /* from time_after() in jiffies.h */
 863        if ((long)(next - val) < 0) {
 864                switch (target) {
 865                case MEM_CGROUP_TARGET_THRESH:
 866                        next = val + THRESHOLDS_EVENTS_TARGET;
 867                        break;
 868                case MEM_CGROUP_TARGET_SOFTLIMIT:
 869                        next = val + SOFTLIMIT_EVENTS_TARGET;
 870                        break;
 871                default:
 872                        break;
 873                }
 874                __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 875                return true;
 876        }
 877        return false;
 878}
 879
 880/*
 881 * Check events in order.
 882 *
 883 */
 884static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 885{
 886        /* threshold event is triggered in finer grain than soft limit */
 887        if (unlikely(mem_cgroup_event_ratelimit(memcg,
 888                                                MEM_CGROUP_TARGET_THRESH))) {
 889                bool do_softlimit;
 890
 891                do_softlimit = mem_cgroup_event_ratelimit(memcg,
 892                                                MEM_CGROUP_TARGET_SOFTLIMIT);
 893                mem_cgroup_threshold(memcg);
 894                if (unlikely(do_softlimit))
 895                        mem_cgroup_update_tree(memcg, page);
 896        }
 897}
 898
 899struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 900{
 901        /*
 902         * mm_update_next_owner() may clear mm->owner to NULL
 903         * if it races with swapoff, page migration, etc.
 904         * So this can be called with p == NULL.
 905         */
 906        if (unlikely(!p))
 907                return NULL;
 908
 909        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 910}
 911EXPORT_SYMBOL(mem_cgroup_from_task);
 912
 913/**
 914 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 915 * @mm: mm from which memcg should be extracted. It can be NULL.
 916 *
 917 * Obtain a reference on mm->memcg and returns it if successful. Otherwise
 918 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 919 * returned.
 920 */
 921struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 922{
 923        struct mem_cgroup *memcg;
 924
 925        if (mem_cgroup_disabled())
 926                return NULL;
 927
 928        rcu_read_lock();
 929        do {
 930                /*
 931                 * Page cache insertions can happen withou an
 932                 * actual mm context, e.g. during disk probing
 933                 * on boot, loopback IO, acct() writes etc.
 934                 */
 935                if (unlikely(!mm))
 936                        memcg = root_mem_cgroup;
 937                else {
 938                        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 939                        if (unlikely(!memcg))
 940                                memcg = root_mem_cgroup;
 941                }
 942        } while (!css_tryget(&memcg->css));
 943        rcu_read_unlock();
 944        return memcg;
 945}
 946EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 947
 948/**
 949 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 950 * @page: page from which memcg should be extracted.
 951 *
 952 * Obtain a reference on page->memcg and returns it if successful. Otherwise
 953 * root_mem_cgroup is returned.
 954 */
 955struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 956{
 957        struct mem_cgroup *memcg = page->mem_cgroup;
 958
 959        if (mem_cgroup_disabled())
 960                return NULL;
 961
 962        rcu_read_lock();
 963        if (!memcg || !css_tryget_online(&memcg->css))
 964                memcg = root_mem_cgroup;
 965        rcu_read_unlock();
 966        return memcg;
 967}
 968EXPORT_SYMBOL(get_mem_cgroup_from_page);
 969
 970/**
 971 * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
 972 */
 973static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
 974{
 975        if (unlikely(current->active_memcg)) {
 976                struct mem_cgroup *memcg = root_mem_cgroup;
 977
 978                rcu_read_lock();
 979                if (css_tryget_online(&current->active_memcg->css))
 980                        memcg = current->active_memcg;
 981                rcu_read_unlock();
 982                return memcg;
 983        }
 984        return get_mem_cgroup_from_mm(current->mm);
 985}
 986
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root, NULL selects root_mem_cgroup
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.  %NULL is also
 * returned when the memory controller is disabled.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 * The reference held on @prev is dropped here, unless @prev is @root.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node: the walk position is then shared through the per-node
 * iterator of @root instead of being derived from @prev.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	/* Only assigned and used when @reclaim is non-NULL. */
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	/* Full walks simply continue after the previously returned memcg. */
	if (prev && !reclaim)
		pos = prev;

	/*
	 * Without hierarchy the walk consists of @root alone: hand it out
	 * on the first call and finish the round-trip on the next one.
	 */
	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter;

		/*
		 * The shared iterator has moved on to another generation
		 * since this walk recorded it: end this walk.
		 */
		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		/* Pick up the shared position and pin it with a reference. */
		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		/* Could not pin this css, try the next descendant. */
		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		/* Drop the reference taken when the position was picked up. */
		if (pos)
			css_put(&pos->css);

		/*
		 * A NULL result ends the round-trip and starts a new
		 * generation; a walk's first call records the generation
		 * it belongs to in the cookie.
		 */
		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	/* @root is never reference counted by the walk, see above. */
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
1113
1114/**
1115 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1116 * @root: hierarchy root
1117 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1118 */
1119void mem_cgroup_iter_break(struct mem_cgroup *root,
1120                           struct mem_cgroup *prev)
1121{
1122        if (!root)
1123                root = root_mem_cgroup;
1124        if (prev && prev != root)
1125                css_put(&prev->css);
1126}
1127
1128static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1129                                        struct mem_cgroup *dead_memcg)
1130{
1131        struct mem_cgroup_reclaim_iter *iter;
1132        struct mem_cgroup_per_node *mz;
1133        int nid;
1134
1135        for_each_node(nid) {
1136                mz = mem_cgroup_nodeinfo(from, nid);
1137                iter = &mz->iter;
1138                cmpxchg(&iter->position, dead_memcg, NULL);
1139        }
1140}
1141
1142static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1143{
1144        struct mem_cgroup *memcg = dead_memcg;
1145        struct mem_cgroup *last;
1146
1147        do {
1148                __invalidate_reclaim_iterators(memcg, dead_memcg);
1149                last = memcg;
1150        } while ((memcg = parent_mem_cgroup(memcg)));
1151
1152        /*
1153         * When cgruop1 non-hierarchy mode is used,
1154         * parent_mem_cgroup() does not walk all the way up to the
1155         * cgroup root (root_mem_cgroup). So we have to handle
1156         * dead_memcg from cgroup root separately.
1157         */
1158        if (last != root_mem_cgroup)
1159                __invalidate_reclaim_iterators(root_mem_cgroup,
1160                                                dead_memcg);
1161}
1162
1163/**
1164 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1165 * @memcg: hierarchy root
1166 * @fn: function to call for each task
1167 * @arg: argument passed to @fn
1168 *
1169 * This function iterates over tasks attached to @memcg or to any of its
1170 * descendants and calls @fn for each task. If @fn returns a non-zero
1171 * value, the function breaks the iteration loop and returns the value.
1172 * Otherwise, it will iterate over all tasks and return 0.
1173 *
1174 * This function must not be called for the root memory cgroup.
1175 */
1176int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1177                          int (*fn)(struct task_struct *, void *), void *arg)
1178{
1179        struct mem_cgroup *iter;
1180        int ret = 0;
1181
1182        BUG_ON(memcg == root_mem_cgroup);
1183
1184        for_each_mem_cgroup_tree(iter, memcg) {
1185                struct css_task_iter it;
1186                struct task_struct *task;
1187
1188                css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1189                while (!ret && (task = css_task_iter_next(&it)))
1190                        ret = fn(task, arg);
1191                css_task_iter_end(&it);
1192                if (ret) {
1193                        mem_cgroup_iter_break(memcg, iter);
1194                        break;
1195                }
1196        }
1197        return ret;
1198}
1199
1200/**
1201 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1202 * @page: the page
1203 * @pgdat: pgdat of the page
1204 *
1205 * This function is only safe when following the LRU page isolation
1206 * and putback protocol: the LRU lock must be held, and the page must
1207 * either be PageLRU() or the caller must have isolated/allocated it.
1208 */
1209struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1210{
1211        struct mem_cgroup_per_node *mz;
1212        struct mem_cgroup *memcg;
1213        struct lruvec *lruvec;
1214
1215        if (mem_cgroup_disabled()) {
1216                lruvec = &pgdat->__lruvec;
1217                goto out;
1218        }
1219
1220        memcg = page->mem_cgroup;
1221        /*
1222         * Swapcache readahead pages are added to the LRU - and
1223         * possibly migrated - before they are charged.
1224         */
1225        if (!memcg)
1226                memcg = root_mem_cgroup;
1227
1228        mz = mem_cgroup_page_nodeinfo(memcg, page);
1229        lruvec = &mz->lruvec;
1230out:
1231        /*
1232         * Since a node can be onlined after the mem_cgroup was created,
1233         * we have to be prepared to initialize lruvec->zone here;
1234         * and if offlined then reonlined, we need to reinitialize it.
1235         */
1236        if (unlikely(lruvec->pgdat != pgdat))
1237                lruvec->pgdat = pgdat;
1238        return lruvec;
1239}
1240
1241/**
1242 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1243 * @lruvec: mem_cgroup per zone lru vector
1244 * @lru: index of lru list the page is sitting on
1245 * @zid: zone id of the accounted pages
1246 * @nr_pages: positive when adding or negative when removing
1247 *
1248 * This function must be called under lru_lock, just before a page is added
1249 * to or just after a page is removed from an lru list (that ordering being
1250 * so as to allow it to check that lru_size 0 is consistent with list_empty).
1251 */
1252void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1253                                int zid, int nr_pages)
1254{
1255        struct mem_cgroup_per_node *mz;
1256        unsigned long *lru_size;
1257        long size;
1258
1259        if (mem_cgroup_disabled())
1260                return;
1261
1262        mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1263        lru_size = &mz->lru_zone_size[zid][lru];
1264
1265        if (nr_pages < 0)
1266                *lru_size += nr_pages;
1267
1268        size = *lru_size;
1269        if (WARN_ONCE(size < 0,
1270                "%s(%p, %d, %d): lru_size %ld\n",
1271                __func__, lruvec, lru, nr_pages, size)) {
1272                VM_BUG_ON(1);
1273                *lru_size = 0;
1274        }
1275
1276        if (nr_pages > 0)
1277                *lru_size += nr_pages;
1278}
1279
1280/**
1281 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1282 * @memcg: the memory cgroup
1283 *
1284 * Returns the maximum amount of memory @mem can be charged with, in
1285 * pages.
1286 */
1287static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1288{
1289        unsigned long margin = 0;
1290        unsigned long count;
1291        unsigned long limit;
1292
1293        count = page_counter_read(&memcg->memory);
1294        limit = READ_ONCE(memcg->memory.max);
1295        if (count < limit)
1296                margin = limit - count;
1297
1298        if (do_memsw_account()) {
1299                count = page_counter_read(&memcg->memsw);
1300                limit = READ_ONCE(memcg->memsw.max);
1301                if (count <= limit)
1302                        margin = min(margin, limit - count);
1303                else
1304                        margin = 0;
1305        }
1306
1307        return margin;
1308}
1309
1310/*
1311 * A routine for checking "mem" is under move_account() or not.
1312 *
1313 * Checking a cgroup is mc.from or mc.to or under hierarchy of
1314 * moving cgroups. This is for waiting at high-memory pressure
1315 * caused by "move".
1316 */
1317static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1318{
1319        struct mem_cgroup *from;
1320        struct mem_cgroup *to;
1321        bool ret = false;
1322        /*
1323         * Unlike task_move routines, we access mc.to, mc.from not under
1324         * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1325         */
1326        spin_lock(&mc.lock);
1327        from = mc.from;
1328        to = mc.to;
1329        if (!from)
1330                goto unlock;
1331
1332        ret = mem_cgroup_is_descendant(from, memcg) ||
1333                mem_cgroup_is_descendant(to, memcg);
1334unlock:
1335        spin_unlock(&mc.lock);
1336        return ret;
1337}
1338
1339static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1340{
1341        if (mc.moving_task && current != mc.moving_task) {
1342                if (mem_cgroup_under_move(memcg)) {
1343                        DEFINE_WAIT(wait);
1344                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1345                        /* moving charge context might have finished. */
1346                        if (mc.moving_task)
1347                                schedule();
1348                        finish_wait(&mc.waitq, &wait);
1349                        return true;
1350                }
1351        }
1352        return false;
1353}
1354
1355static char *memory_stat_format(struct mem_cgroup *memcg)
1356{
1357        struct seq_buf s;
1358        int i;
1359
1360        seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1361        if (!s.buffer)
1362                return NULL;
1363
1364        /*
1365         * Provide statistics on the state of the memory subsystem as
1366         * well as cumulative event counters that show past behavior.
1367         *
1368         * This list is ordered following a combination of these gradients:
1369         * 1) generic big picture -> specifics and details
1370         * 2) reflecting userspace activity -> reflecting kernel heuristics
1371         *
1372         * Current memory state:
1373         */
1374
1375        seq_buf_printf(&s, "anon %llu\n",
1376                       (u64)memcg_page_state(memcg, MEMCG_RSS) *
1377                       PAGE_SIZE);
1378        seq_buf_printf(&s, "file %llu\n",
1379                       (u64)memcg_page_state(memcg, MEMCG_CACHE) *
1380                       PAGE_SIZE);
1381        seq_buf_printf(&s, "kernel_stack %llu\n",
1382                       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1383                       1024);
1384        seq_buf_printf(&s, "slab %llu\n",
1385                       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1386                             memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1387                       PAGE_SIZE);
1388        seq_buf_printf(&s, "sock %llu\n",
1389                       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1390                       PAGE_SIZE);
1391
1392        seq_buf_printf(&s, "shmem %llu\n",
1393                       (u64)memcg_page_state(memcg, NR_SHMEM) *
1394                       PAGE_SIZE);
1395        seq_buf_printf(&s, "file_mapped %llu\n",
1396                       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1397                       PAGE_SIZE);
1398        seq_buf_printf(&s, "file_dirty %llu\n",
1399                       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1400                       PAGE_SIZE);
1401        seq_buf_printf(&s, "file_writeback %llu\n",
1402                       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1403                       PAGE_SIZE);
1404
1405        /*
1406         * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
1407         * with the NR_ANON_THP vm counter, but right now it's a pain in the
1408         * arse because it requires migrating the work out of rmap to a place
1409         * where the page->mem_cgroup is set up and stable.
1410         */
1411        seq_buf_printf(&s, "anon_thp %llu\n",
1412                       (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
1413                       PAGE_SIZE);
1414
1415        for (i = 0; i < NR_LRU_LISTS; i++)
1416                seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
1417                               (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1418                               PAGE_SIZE);
1419
1420        seq_buf_printf(&s, "slab_reclaimable %llu\n",
1421                       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1422                       PAGE_SIZE);
1423        seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1424                       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1425                       PAGE_SIZE);
1426
1427        /* Accumulated memory events */
1428
1429        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1430                       memcg_events(memcg, PGFAULT));
1431        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1432                       memcg_events(memcg, PGMAJFAULT));
1433
1434        seq_buf_printf(&s, "workingset_refault %lu\n",
1435                       memcg_page_state(memcg, WORKINGSET_REFAULT));
1436        seq_buf_printf(&s, "workingset_activate %lu\n",
1437                       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1438        seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1439                       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1440
1441        seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
1442                       memcg_events(memcg, PGREFILL));
1443        seq_buf_printf(&s, "pgscan %lu\n",
1444                       memcg_events(memcg, PGSCAN_KSWAPD) +
1445                       memcg_events(memcg, PGSCAN_DIRECT));
1446        seq_buf_printf(&s, "pgsteal %lu\n",
1447                       memcg_events(memcg, PGSTEAL_KSWAPD) +
1448                       memcg_events(memcg, PGSTEAL_DIRECT));
1449        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1450                       memcg_events(memcg, PGACTIVATE));
1451        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1452                       memcg_events(memcg, PGDEACTIVATE));
1453        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1454                       memcg_events(memcg, PGLAZYFREE));
1455        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1456                       memcg_events(memcg, PGLAZYFREED));
1457
1458#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1459        seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1460                       memcg_events(memcg, THP_FAULT_ALLOC));
1461        seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1462                       memcg_events(memcg, THP_COLLAPSE_ALLOC));
1463#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1464
1465        /* The above should easily fit into one page */
1466        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1467
1468        return s.buffer;
1469}
1470
/*
 * Shift @x left by (PAGE_SHIFT - 10).  The OOM report code below uses it
 * to turn page counts into the kB values it prints.
 */
#define K(x) ((x) << (PAGE_SHIFT-10))
1472/**
1473 * mem_cgroup_print_oom_context: Print OOM information relevant to
1474 * memory controller.
1475 * @memcg: The memory cgroup that went over limit
1476 * @p: Task that is going to be killed
1477 *
1478 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1479 * enabled
1480 */
1481void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1482{
1483        rcu_read_lock();
1484
1485        if (memcg) {
1486                pr_cont(",oom_memcg=");
1487                pr_cont_cgroup_path(memcg->css.cgroup);
1488        } else
1489                pr_cont(",global_oom");
1490        if (p) {
1491                pr_cont(",task_memcg=");
1492                pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1493        }
1494        rcu_read_unlock();
1495}
1496
1497/**
1498 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1499 * memory controller.
1500 * @memcg: The memory cgroup that went over limit
1501 */
1502void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1503{
1504        char *buf;
1505
1506        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1507                K((u64)page_counter_read(&memcg->memory)),
1508                K((u64)memcg->memory.max), memcg->memory.failcnt);
1509        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1510                pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1511                        K((u64)page_counter_read(&memcg->swap)),
1512                        K((u64)memcg->swap.max), memcg->swap.failcnt);
1513        else {
1514                pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1515                        K((u64)page_counter_read(&memcg->memsw)),
1516                        K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1517                pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1518                        K((u64)page_counter_read(&memcg->kmem)),
1519                        K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1520        }
1521
1522        pr_info("Memory cgroup stats for ");
1523        pr_cont_cgroup_path(memcg->css.cgroup);
1524        pr_cont(":");
1525        buf = memory_stat_format(memcg);
1526        if (!buf)
1527                return;
1528        pr_info("%s", buf);
1529        kfree(buf);
1530}
1531
1532/*
1533 * Return the memory (and swap, if configured) limit for a memcg.
1534 */
1535unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1536{
1537        unsigned long max;
1538
1539        max = memcg->memory.max;
1540        if (mem_cgroup_swappiness(memcg)) {
1541                unsigned long memsw_max;
1542                unsigned long swap_max;
1543
1544                memsw_max = memcg->memsw.max;
1545                swap_max = memcg->swap.max;
1546                swap_max = min(swap_max, (unsigned long)total_swap_pages);
1547                max = min(max + swap_max, memsw_max);
1548        }
1549        return max;
1550}
1551
1552unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1553{
1554        return page_counter_read(&memcg->memory);
1555}
1556
1557static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1558                                     int order)
1559{
1560        struct oom_control oc = {
1561                .zonelist = NULL,
1562                .nodemask = NULL,
1563                .memcg = memcg,
1564                .gfp_mask = gfp_mask,
1565                .order = order,
1566        };
1567        bool ret;
1568
1569        if (mutex_lock_killable(&oom_lock))
1570                return true;
1571        /*
1572         * A few threads which were not waiting at mutex_lock_killable() can
1573         * fail to bail out. Therefore, check again after holding oom_lock.
1574         */
1575        ret = should_force_charge() || out_of_memory(&oc);
1576        mutex_unlock(&oom_lock);
1577        return ret;
1578}
1579
1580static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1581                                   pg_data_t *pgdat,
1582                                   gfp_t gfp_mask,
1583                                   unsigned long *total_scanned)
1584{
1585        struct mem_cgroup *victim = NULL;
1586        int total = 0;
1587        int loop = 0;
1588        unsigned long excess;
1589        unsigned long nr_scanned;
1590        struct mem_cgroup_reclaim_cookie reclaim = {
1591                .pgdat = pgdat,
1592        };
1593
1594        excess = soft_limit_excess(root_memcg);
1595
1596        while (1) {
1597                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1598                if (!victim) {
1599                        loop++;
1600                        if (loop >= 2) {
1601                                /*
1602                                 * If we have not been able to reclaim
1603                                 * anything, it might because there are
1604                                 * no reclaimable pages under this hierarchy
1605                                 */
1606                                if (!total)
1607                                        break;
1608                                /*
1609                                 * We want to do more targeted reclaim.
1610                                 * excess >> 2 is not to excessive so as to
1611                                 * reclaim too much, nor too less that we keep
1612                                 * coming back to reclaim from this cgroup
1613                                 */
1614                                if (total >= (excess >> 2) ||
1615                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1616                                        break;
1617                        }
1618                        continue;
1619                }
1620                total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1621                                        pgdat, &nr_scanned);
1622                *total_scanned += nr_scanned;
1623                if (!soft_limit_excess(root_memcg))
1624                        break;
1625        }
1626        mem_cgroup_iter_break(root_memcg, victim);
1627        return total;
1628}
1629
#ifdef CONFIG_LOCKDEP
/*
 * Lockdep map passed to mutex_acquire()/mutex_release() by
 * mem_cgroup_oom_trylock() and mem_cgroup_oom_unlock(), so that the
 * hierarchical memcg OOM lock is visible to lockdep.
 */
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

/*
 * Held while the oom_lock and under_oom fields of the memcgs in a
 * hierarchy are updated by the OOM lock/mark helpers below.
 */
static DEFINE_SPINLOCK(memcg_oom_lock);
1637
1638/*
1639 * Check OOM-Killer is already running under our hierarchy.
1640 * If someone is running, return false.
1641 */
1642static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1643{
1644        struct mem_cgroup *iter, *failed = NULL;
1645
1646        spin_lock(&memcg_oom_lock);
1647
1648        for_each_mem_cgroup_tree(iter, memcg) {
1649                if (iter->oom_lock) {
1650                        /*
1651                         * this subtree of our hierarchy is already locked
1652                         * so we cannot give a lock.
1653                         */
1654                        failed = iter;
1655                        mem_cgroup_iter_break(memcg, iter);
1656                        break;
1657                } else
1658                        iter->oom_lock = true;
1659        }
1660
1661        if (failed) {
1662                /*
1663                 * OK, we failed to lock the whole subtree so we have
1664                 * to clean up what we set up to the failing subtree
1665                 */
1666                for_each_mem_cgroup_tree(iter, memcg) {
1667                        if (iter == failed) {
1668                                mem_cgroup_iter_break(memcg, iter);
1669                                break;
1670                        }
1671                        iter->oom_lock = false;
1672                }
1673        } else
1674                mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1675
1676        spin_unlock(&memcg_oom_lock);
1677
1678        return !failed;
1679}
1680
1681static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1682{
1683        struct mem_cgroup *iter;
1684
1685        spin_lock(&memcg_oom_lock);
1686        mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1687        for_each_mem_cgroup_tree(iter, memcg)
1688                iter->oom_lock = false;
1689        spin_unlock(&memcg_oom_lock);
1690}
1691
1692static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1693{
1694        struct mem_cgroup *iter;
1695
1696        spin_lock(&memcg_oom_lock);
1697        for_each_mem_cgroup_tree(iter, memcg)
1698                iter->under_oom++;
1699        spin_unlock(&memcg_oom_lock);
1700}
1701
1702static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1703{
1704        struct mem_cgroup *iter;
1705
1706        /*
1707         * When a new child is created while the hierarchy is under oom,
1708         * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1709         */
1710        spin_lock(&memcg_oom_lock);
1711        for_each_mem_cgroup_tree(iter, memcg)
1712                if (iter->under_oom > 0)
1713                        iter->under_oom--;
1714        spin_unlock(&memcg_oom_lock);
1715}
1716
/* Wait queue that memcg_oom_recover() wakes up. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Wait queue entry paired with a memcg.  memcg_oom_wake_function()
 * recovers this structure from the embedded entry to decide whether a
 * wakeup for a given memcg concerns this waiter.
 */
struct oom_wait_info {
	struct mem_cgroup *memcg;	/* memcg compared against the waker's */
	wait_queue_entry_t	wait;	/* entry handed to the wake function */
};
1723
1724static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1725        unsigned mode, int sync, void *arg)
1726{
1727        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1728        struct mem_cgroup *oom_wait_memcg;
1729        struct oom_wait_info *oom_wait_info;
1730
1731        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1732        oom_wait_memcg = oom_wait_info->memcg;
1733
1734        if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1735            !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1736                return 0;
1737        return autoremove_wake_function(wait, mode, sync, arg);
1738}
1739
1740static void memcg_oom_recover(struct mem_cgroup *memcg)
1741{
1742        /*
1743         * For the following lockless ->under_oom test, the only required
1744         * guarantee is that it must see the state asserted by an OOM when
1745         * this function is called as a result of userland actions
1746         * triggered by the notification of the OOM.  This is trivially
1747         * achieved by invoking mem_cgroup_mark_under_oom() before
1748         * triggering notification.
1749         */
1750        if (memcg && memcg->under_oom)
1751                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1752}
1753
/* Result of mem_cgroup_oom(), consumed by the charge path. */
enum oom_status {
	OOM_SUCCESS,	/* the OOM killer was invoked and reported progress */
	OOM_FAILED,	/* the OOM killer was invoked but found nothing to do */
	OOM_ASYNC,	/* OOM context saved in current; handled later */
	OOM_SKIPPED,	/* no OOM handling was attempted */
};
1760
1761static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1762{
1763        enum oom_status ret;
1764        bool locked;
1765
1766        if (order > PAGE_ALLOC_COSTLY_ORDER)
1767                return OOM_SKIPPED;
1768
1769        memcg_memory_event(memcg, MEMCG_OOM);
1770
1771        /*
1772         * We are in the middle of the charge context here, so we
1773         * don't want to block when potentially sitting on a callstack
1774         * that holds all kinds of filesystem and mm locks.
1775         *
1776         * cgroup1 allows disabling the OOM killer and waiting for outside
1777         * handling until the charge can succeed; remember the context and put
1778         * the task to sleep at the end of the page fault when all locks are
1779         * released.
1780         *
1781         * On the other hand, in-kernel OOM killer allows for an async victim
1782         * memory reclaim (oom_reaper) and that means that we are not solely
1783         * relying on the oom victim to make a forward progress and we can
1784         * invoke the oom killer here.
1785         *
1786         * Please note that mem_cgroup_out_of_memory might fail to find a
1787         * victim and then we have to bail out from the charge path.
1788         */
1789        if (memcg->oom_kill_disable) {
1790                if (!current->in_user_fault)
1791                        return OOM_SKIPPED;
1792                css_get(&memcg->css);
1793                current->memcg_in_oom = memcg;
1794                current->memcg_oom_gfp_mask = mask;
1795                current->memcg_oom_order = order;
1796
1797                return OOM_ASYNC;
1798        }
1799
1800        mem_cgroup_mark_under_oom(memcg);
1801
1802        locked = mem_cgroup_oom_trylock(memcg);
1803
1804        if (locked)
1805                mem_cgroup_oom_notify(memcg);
1806
1807        mem_cgroup_unmark_under_oom(memcg);
1808        if (mem_cgroup_out_of_memory(memcg, mask, order))
1809                ret = OOM_SUCCESS;
1810        else
1811                ret = OOM_FAILED;
1812
1813        if (locked)
1814                mem_cgroup_oom_unlock(memcg);
1815
1816        return ret;
1817}
1818
1819/**
1820 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1821 * @handle: actually kill/wait or just clean up the OOM state
1822 *
1823 * This has to be called at the end of a page fault if the memcg OOM
1824 * handler was enabled.
1825 *
1826 * Memcg supports userspace OOM handling where failed allocations must
1827 * sleep on a waitqueue until the userspace task resolves the
1828 * situation.  Sleeping directly in the charge context with all kinds
1829 * of locks held is not a good idea, instead we remember an OOM state
1830 * in the task and mem_cgroup_oom_synchronize() has to be called at
1831 * the end of the page fault to complete the OOM handling.
1832 *
1833 * Returns %true if an ongoing memcg OOM situation was detected and
1834 * completed, %false otherwise.
1835 */
1836bool mem_cgroup_oom_synchronize(bool handle)
1837{
1838        struct mem_cgroup *memcg = current->memcg_in_oom;
1839        struct oom_wait_info owait;
1840        bool locked;
1841
1842        /* OOM is global, do not handle */
1843        if (!memcg)
1844                return false;
1845
1846        if (!handle)
1847                goto cleanup;
1848
1849        owait.memcg = memcg;
1850        owait.wait.flags = 0;
1851        owait.wait.func = memcg_oom_wake_function;
1852        owait.wait.private = current;
1853        INIT_LIST_HEAD(&owait.wait.entry);
1854
1855        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1856        mem_cgroup_mark_under_oom(memcg);
1857
1858        locked = mem_cgroup_oom_trylock(memcg);
1859
1860        if (locked)
1861                mem_cgroup_oom_notify(memcg);
1862
1863        if (locked && !memcg->oom_kill_disable) {
1864                mem_cgroup_unmark_under_oom(memcg);
1865                finish_wait(&memcg_oom_waitq, &owait.wait);
1866                mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1867                                         current->memcg_oom_order);
1868        } else {
1869                schedule();
1870                mem_cgroup_unmark_under_oom(memcg);
1871                finish_wait(&memcg_oom_waitq, &owait.wait);
1872        }
1873
1874        if (locked) {
1875                mem_cgroup_oom_unlock(memcg);
1876                /*
1877                 * There is no guarantee that an OOM-lock contender
1878                 * sees the wakeups triggered by the OOM kill
1879                 * uncharges.  Wake any sleepers explicitely.
1880                 */
1881                memcg_oom_recover(memcg);
1882        }
1883cleanup:
1884        current->memcg_in_oom = NULL;
1885        css_put(&memcg->css);
1886        return true;
1887}
1888
1889/**
1890 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1891 * @victim: task to be killed by the OOM killer
1892 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1893 *
1894 * Returns a pointer to a memory cgroup, which has to be cleaned up
1895 * by killing all belonging OOM-killable tasks.
1896 *
1897 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1898 */
1899struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1900                                            struct mem_cgroup *oom_domain)
1901{
1902        struct mem_cgroup *oom_group = NULL;
1903        struct mem_cgroup *memcg;
1904
1905        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1906                return NULL;
1907
1908        if (!oom_domain)
1909                oom_domain = root_mem_cgroup;
1910
1911        rcu_read_lock();
1912
1913        memcg = mem_cgroup_from_task(victim);
1914        if (memcg == root_mem_cgroup)
1915                goto out;
1916
1917        /*
1918         * Traverse the memory cgroup hierarchy from the victim task's
1919         * cgroup up to the OOMing cgroup (or root) to find the
1920         * highest-level memory cgroup with oom.group set.
1921         */
1922        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1923                if (memcg->oom_group)
1924                        oom_group = memcg;
1925
1926                if (memcg == oom_domain)
1927                        break;
1928        }
1929
1930        if (oom_group)
1931                css_get(&oom_group->css);
1932out:
1933        rcu_read_unlock();
1934
1935        return oom_group;
1936}
1937
1938void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1939{
1940        pr_info("Tasks in ");
1941        pr_cont_cgroup_path(memcg->css.cgroup);
1942        pr_cont(" are going to be killed due to memory.oom.group set\n");
1943}
1944
1945/**
1946 * lock_page_memcg - lock a page->mem_cgroup binding
1947 * @page: the page
1948 *
1949 * This function protects unlocked LRU pages from being moved to
1950 * another cgroup.
1951 *
1952 * It ensures lifetime of the returned memcg. Caller is responsible
1953 * for the lifetime of the page; __unlock_page_memcg() is available
1954 * when @page might get freed inside the locked section.
1955 */
1956struct mem_cgroup *lock_page_memcg(struct page *page)
1957{
1958        struct mem_cgroup *memcg;
1959        unsigned long flags;
1960
1961        /*
1962         * The RCU lock is held throughout the transaction.  The fast
1963         * path can get away without acquiring the memcg->move_lock
1964         * because page moving starts with an RCU grace period.
1965         *
1966         * The RCU lock also protects the memcg from being freed when
1967         * the page state that is going to change is the only thing
1968         * preventing the page itself from being freed. E.g. writeback
1969         * doesn't hold a page reference and relies on PG_writeback to
1970         * keep off truncation, migration and so forth.
1971         */
1972        rcu_read_lock();
1973
1974        if (mem_cgroup_disabled())
1975                return NULL;
1976again:
1977        memcg = page->mem_cgroup;
1978        if (unlikely(!memcg))
1979                return NULL;
1980
1981        if (atomic_read(&memcg->moving_account) <= 0)
1982                return memcg;
1983
1984        spin_lock_irqsave(&memcg->move_lock, flags);
1985        if (memcg != page->mem_cgroup) {
1986                spin_unlock_irqrestore(&memcg->move_lock, flags);
1987                goto again;
1988        }
1989
1990        /*
1991         * When charge migration first begins, we can have locked and
1992         * unlocked page stat updates happening concurrently.  Track
1993         * the task who has the lock for unlock_page_memcg().
1994         */
1995        memcg->move_lock_task = current;
1996        memcg->move_lock_flags = flags;
1997
1998        return memcg;
1999}
2000EXPORT_SYMBOL(lock_page_memcg);

2001
2002/**
2003 * __unlock_page_memcg - unlock and unpin a memcg
2004 * @memcg: the memcg
2005 *
2006 * Unlock and unpin a memcg returned by lock_page_memcg().
2007 */
2008void __unlock_page_memcg(struct mem_cgroup *memcg)
2009{
2010        if (memcg && memcg->move_lock_task == current) {
2011                unsigned long flags = memcg->move_lock_flags;
2012
2013                memcg->move_lock_task = NULL;
2014                memcg->move_lock_flags = 0;
2015
2016                spin_unlock_irqrestore(&memcg->move_lock, flags);
2017        }
2018
2019        rcu_read_unlock();
2020}
2021
2022/**
2023 * unlock_page_memcg - unlock a page->mem_cgroup binding
2024 * @page: the page
2025 */
2026void unlock_page_memcg(struct page *page)
2027{
2028        __unlock_page_memcg(page->mem_cgroup);
2029}
2030EXPORT_SYMBOL(unlock_page_memcg);
2031
2032struct memcg_stock_pcp {
2033        struct mem_cgroup *cached; /* this never be root cgroup */
2034        unsigned int nr_pages;
2035        struct work_struct work;
2036        unsigned long flags;
2037#define FLUSHING_CACHED_CHARGE  0
2038};
2039static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2040static DEFINE_MUTEX(percpu_charge_mutex);
2041
2042/**
2043 * consume_stock: Try to consume stocked charge on this cpu.
2044 * @memcg: memcg to consume from.
2045 * @nr_pages: how many pages to charge.
2046 *
2047 * The charges will only happen if @memcg matches the current cpu's memcg
2048 * stock, and at least @nr_pages are available in that stock.  Failure to
2049 * service an allocation will refill the stock.
2050 *
2051 * returns true if successful, false otherwise.
2052 */
2053static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2054{
2055        struct memcg_stock_pcp *stock;
2056        unsigned long flags;
2057        bool ret = false;
2058
2059        if (nr_pages > MEMCG_CHARGE_BATCH)
2060                return ret;
2061
2062        local_irq_save(flags);
2063
2064        stock = this_cpu_ptr(&memcg_stock);
2065        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2066                stock->nr_pages -= nr_pages;
2067                ret = true;
2068        }
2069
2070        local_irq_restore(flags);
2071
2072        return ret;
2073}
2074
2075/*
2076 * Returns stocks cached in percpu and reset cached information.
2077 */
2078static void drain_stock(struct memcg_stock_pcp *stock)
2079{
2080        struct mem_cgroup *old = stock->cached;
2081
2082        if (stock->nr_pages) {
2083                page_counter_uncharge(&old->memory, stock->nr_pages);
2084                if (do_memsw_account())
2085                        page_counter_uncharge(&old->memsw, stock->nr_pages);
2086                css_put_many(&old->css, stock->nr_pages);
2087                stock->nr_pages = 0;
2088        }
2089        stock->cached = NULL;
2090}
2091
2092static void drain_local_stock(struct work_struct *dummy)
2093{
2094        struct memcg_stock_pcp *stock;
2095        unsigned long flags;
2096
2097        /*
2098         * The only protection from memory hotplug vs. drain_stock races is
2099         * that we always operate on local CPU stock here with IRQ disabled
2100         */
2101        local_irq_save(flags);
2102
2103        stock = this_cpu_ptr(&memcg_stock);
2104        drain_stock(stock);
2105        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2106
2107        local_irq_restore(flags);
2108}
2109
2110/*
2111 * Cache charges(val) to local per_cpu area.
2112 * This will be consumed by consume_stock() function, later.
2113 */
2114static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2115{
2116        struct memcg_stock_pcp *stock;
2117        unsigned long flags;
2118
2119        local_irq_save(flags);
2120
2121        stock = this_cpu_ptr(&memcg_stock);
2122        if (stock->cached != memcg) { /* reset if necessary */
2123                drain_stock(stock);
2124                stock->cached = memcg;
2125        }
2126        stock->nr_pages += nr_pages;
2127
2128        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2129                drain_stock(stock);
2130
2131        local_irq_restore(flags);
2132}
2133
2134/*
2135 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2136 * of the hierarchy under it.
2137 */
2138static void drain_all_stock(struct mem_cgroup *root_memcg)
2139{
2140        int cpu, curcpu;
2141
2142        /* If someone's already draining, avoid adding running more workers. */
2143        if (!mutex_trylock(&percpu_charge_mutex))
2144                return;
2145        /*
2146         * Notify other cpus that system-wide "drain" is running
2147         * We do not care about races with the cpu hotplug because cpu down
2148         * as well as workers from this path always operate on the local
2149         * per-cpu data. CPU up doesn't touch memcg_stock at all.
2150         */
2151        curcpu = get_cpu();
2152        for_each_online_cpu(cpu) {
2153                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2154                struct mem_cgroup *memcg;
2155                bool flush = false;
2156
2157                rcu_read_lock();
2158                memcg = stock->cached;
2159                if (memcg && stock->nr_pages &&
2160                    mem_cgroup_is_descendant(memcg, root_memcg))
2161                        flush = true;
2162                rcu_read_unlock();
2163
2164                if (flush &&
2165                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2166                        if (cpu == curcpu)
2167                                drain_local_stock(&stock->work);
2168                        else
2169                                schedule_work_on(cpu, &stock->work);
2170                }
2171        }
2172        put_cpu();
2173        mutex_unlock(&percpu_charge_mutex);
2174}
2175
2176static int memcg_hotplug_cpu_dead(unsigned int cpu)
2177{
2178        struct memcg_stock_pcp *stock;
2179        struct mem_cgroup *memcg, *mi;
2180
2181        stock = &per_cpu(memcg_stock, cpu);
2182        drain_stock(stock);
2183
2184        for_each_mem_cgroup(memcg) {
2185                int i;
2186
2187                for (i = 0; i < MEMCG_NR_STAT; i++) {
2188                        int nid;
2189                        long x;
2190
2191                        x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2192                        if (x)
2193                                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2194                                        atomic_long_add(x, &memcg->vmstats[i]);
2195
2196                        if (i >= NR_VM_NODE_STAT_ITEMS)
2197                                continue;
2198
2199                        for_each_node(nid) {
2200                                struct mem_cgroup_per_node *pn;
2201
2202                                pn = mem_cgroup_nodeinfo(memcg, nid);
2203                                x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2204                                if (x)
2205                                        do {
2206                                                atomic_long_add(x, &pn->lruvec_stat[i]);
2207                                        } while ((pn = parent_nodeinfo(pn, nid)));
2208                        }
2209                }
2210
2211                for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2212                        long x;
2213
2214                        x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2215                        if (x)
2216                                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2217                                        atomic_long_add(x, &memcg->vmevents[i]);
2218                }
2219        }
2220
2221        return 0;
2222}
2223
2224static void reclaim_high(struct mem_cgroup *memcg,
2225                         unsigned int nr_pages,
2226                         gfp_t gfp_mask)
2227{
2228        do {
2229                if (page_counter_read(&memcg->memory) <= memcg->high)
2230                        continue;
2231                memcg_memory_event(memcg, MEMCG_HIGH);
2232                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2233        } while ((memcg = parent_mem_cgroup(memcg)));
2234}
2235
2236static void high_work_func(struct work_struct *work)
2237{
2238        struct mem_cgroup *memcg;
2239
2240        memcg = container_of(work, struct mem_cgroup, high_work);
2241        reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2242}
2243
2244/*
2245 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2246 * enough to still cause a significant slowdown in most cases, while still
2247 * allowing diagnostics and tracing to proceed without becoming stuck.
2248 */
2249#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2250
2251/*
2252 * When calculating the delay, we use these either side of the exponentiation to
2253 * maintain precision and scale to a reasonable number of jiffies (see the table
2254 * below).
2255 *
2256 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2257 *   overage ratio to a delay.
2258 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2259 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
2260 *   to produce a reasonable delay curve.
2261 *
2262 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2263 * reasonable delay curve compared to precision-adjusted overage, not
2264 * penalising heavily at first, but still making sure that growth beyond the
2265 * limit penalises misbehaving cgroups by slowing them down exponentially. For
2266 * example, with a high of 100 megabytes:
2267 *
2268 *  +-------+------------------------+
2269 *  | usage | time to allocate in ms |
2270 *  +-------+------------------------+
2271 *  | 100M  |                      0 |
2272 *  | 101M  |                      6 |
2273 *  | 102M  |                     25 |
2274 *  | 103M  |                     57 |
2275 *  | 104M  |                    102 |
2276 *  | 105M  |                    159 |
2277 *  | 106M  |                    230 |
2278 *  | 107M  |                    313 |
2279 *  | 108M  |                    409 |
2280 *  | 109M  |                    518 |
2281 *  | 110M  |                    639 |
2282 *  | 111M  |                    774 |
2283 *  | 112M  |                    921 |
2284 *  | 113M  |                   1081 |
2285 *  | 114M  |                   1254 |
2286 *  | 115M  |                   1439 |
2287 *  | 116M  |                   1638 |
2288 *  | 117M  |                   1849 |
2289 *  | 118M  |                   2000 |
2290 *  | 119M  |                   2000 |
2291 *  | 120M  |                   2000 |
2292 *  +-------+------------------------+
2293 */
2294 #define MEMCG_DELAY_PRECISION_SHIFT 20
2295 #define MEMCG_DELAY_SCALING_SHIFT 14
2296
2297/*
2298 * Scheduled by try_charge() to be executed from the userland return path
2299 * and reclaims memory over the high limit.
2300 */
2301void mem_cgroup_handle_over_high(void)
2302{
2303        unsigned long usage, high, clamped_high;
2304        unsigned long pflags;
2305        unsigned long penalty_jiffies, overage;
2306        unsigned int nr_pages = current->memcg_nr_pages_over_high;
2307        struct mem_cgroup *memcg;
2308
2309        if (likely(!nr_pages))
2310                return;
2311
2312        memcg = get_mem_cgroup_from_mm(current->mm);
2313        reclaim_high(memcg, nr_pages, GFP_KERNEL);
2314        current->memcg_nr_pages_over_high = 0;
2315
2316        /*
2317         * memory.high is breached and reclaim is unable to keep up. Throttle
2318         * allocators proactively to slow down excessive growth.
2319         *
2320         * We use overage compared to memory.high to calculate the number of
2321         * jiffies to sleep (penalty_jiffies). Ideally this value should be
2322         * fairly lenient on small overages, and increasingly harsh when the
2323         * memcg in question makes it clear that it has no intention of stopping
2324         * its crazy behaviour, so we exponentially increase the delay based on
2325         * overage amount.
2326         */
2327
2328        usage = page_counter_read(&memcg->memory);
2329        high = READ_ONCE(memcg->high);
2330
2331        if (usage <= high)
2332                goto out;
2333
2334        /*
2335         * Prevent division by 0 in overage calculation by acting as if it was a
2336         * threshold of 1 page
2337         */
2338        clamped_high = max(high, 1UL);
2339
2340        overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
2341                          clamped_high);
2342
2343        penalty_jiffies = ((u64)overage * overage * HZ)
2344                >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
2345
2346        /*
2347         * Factor in the task's own contribution to the overage, such that four
2348         * N-sized allocations are throttled approximately the same as one
2349         * 4N-sized allocation.
2350         *
2351         * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2352         * larger the current charge patch is than that.
2353         */
2354        penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2355
2356        /*
2357         * Clamp the max delay per usermode return so as to still keep the
2358         * application moving forwards and also permit diagnostics, albeit
2359         * extremely slowly.
2360         */
2361        penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2362
2363        /*
2364         * Don't sleep if the amount of jiffies this memcg owes us is so low
2365         * that it's not even worth doing, in an attempt to be nice to those who
2366         * go only a small amount over their memory.high value and maybe haven't
2367         * been aggressively reclaimed enough yet.
2368         */
2369        if (penalty_jiffies <= HZ / 100)
2370                goto out;
2371
2372        /*
2373         * If we exit early, we're guaranteed to die (since
2374         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2375         * need to account for any ill-begotten jiffies to pay them off later.
2376         */
2377        psi_memstall_enter(&pflags);
2378        schedule_timeout_killable(penalty_jiffies);
2379        psi_memstall_leave(&pflags);
2380
2381out:
2382        css_put(&memcg->css);
2383}
2384
2385static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2386                      unsigned int nr_pages)
2387{
2388        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2389        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2390        struct mem_cgroup *mem_over_limit;
2391        struct page_counter *counter;
2392        unsigned long nr_reclaimed;
2393        bool may_swap = true;
2394        bool drained = false;
2395        enum oom_status oom_status;
2396
2397        if (mem_cgroup_is_root(memcg))
2398                return 0;
2399retry:
2400        if (consume_stock(memcg, nr_pages))
2401                return 0;
2402
2403        if (!do_memsw_account() ||
2404            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2405                if (page_counter_try_charge(&memcg->memory, batch, &counter))
2406                        goto done_restock;
2407                if (do_memsw_account())
2408                        page_counter_uncharge(&memcg->memsw, batch);
2409                mem_over_limit = mem_cgroup_from_counter(counter, memory);
2410        } else {
2411                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2412                may_swap = false;
2413        }
2414
2415        if (batch > nr_pages) {
2416                batch = nr_pages;
2417                goto retry;
2418        }
2419
2420        /*
2421         * Memcg doesn't have a dedicated reserve for atomic
2422         * allocations. But like the global atomic pool, we need to
2423         * put the burden of reclaim on regular allocation requests
2424         * and let these go through as privileged allocations.
2425         */
2426        if (gfp_mask & __GFP_ATOMIC)
2427                goto force;
2428
2429        /*
2430         * Unlike in global OOM situations, memcg is not in a physical
2431         * memory shortage.  Allow dying and OOM-killed tasks to
2432         * bypass the last charges so that they can exit quickly and
2433         * free their memory.
2434         */
2435        if (unlikely(should_force_charge()))
2436                goto force;
2437
2438        /*
2439         * Prevent unbounded recursion when reclaim operations need to
2440         * allocate memory. This might exceed the limits temporarily,
2441         * but we prefer facilitating memory reclaim and getting back
2442         * under the limit over triggering OOM kills in these cases.
2443         */
2444        if (unlikely(current->flags & PF_MEMALLOC))
2445                goto force;
2446
2447        if (unlikely(task_in_memcg_oom(current)))
2448                goto nomem;
2449
2450        if (!gfpflags_allow_blocking(gfp_mask))
2451                goto nomem;
2452
2453        memcg_memory_event(mem_over_limit, MEMCG_MAX);
2454
2455        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2456                                                    gfp_mask, may_swap);
2457
2458        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2459                goto retry;
2460
2461        if (!drained) {
2462                drain_all_stock(mem_over_limit);
2463                drained = true;
2464                goto retry;
2465        }
2466
2467        if (gfp_mask & __GFP_NORETRY)
2468                goto nomem;
2469        /*
2470         * Even though the limit is exceeded at this point, reclaim
2471         * may have been able to free some pages.  Retry the charge
2472         * before killing the task.
2473         *
2474         * Only for regular pages, though: huge pages are rather
2475         * unlikely to succeed so close to the limit, and we fall back
2476         * to regular pages anyway in case of failure.
2477         */
2478        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2479                goto retry;
2480        /*
2481         * At task move, charge accounts can be doubly counted. So, it's
2482         * better to wait until the end of task_move if something is going on.
2483         */
2484        if (mem_cgroup_wait_acct_move(mem_over_limit))
2485                goto retry;
2486
2487        if (nr_retries--)
2488                goto retry;
2489
2490        if (gfp_mask & __GFP_RETRY_MAYFAIL)
2491                goto nomem;
2492
2493        if (gfp_mask & __GFP_NOFAIL)
2494                goto force;
2495
2496        if (fatal_signal_pending(current))
2497                goto force;
2498
2499        /*
2500         * keep retrying as long as the memcg oom killer is able to make
2501         * a forward progress or bypass the charge if the oom killer
2502         * couldn't make any progress.
2503         */
2504        oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2505                       get_order(nr_pages * PAGE_SIZE));
2506        switch (oom_status) {
2507        case OOM_SUCCESS:
2508                nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2509                goto retry;
2510        case OOM_FAILED:
2511                goto force;
2512        default:
2513                goto nomem;
2514        }
2515nomem:
2516        if (!(gfp_mask & __GFP_NOFAIL))
2517                return -ENOMEM;
2518force:
2519        /*
2520         * The allocation either can't fail or will lead to more memory
2521         * being freed very soon.  Allow memory usage go over the limit
2522         * temporarily by force charging it.
2523         */
2524        page_counter_charge(&memcg->memory, nr_pages);
2525        if (do_memsw_account())
2526                page_counter_charge(&memcg->memsw, nr_pages);
2527        css_get_many(&memcg->css, nr_pages);
2528
2529        return 0;
2530
2531done_restock:
2532        css_get_many(&memcg->css, batch);
2533        if (batch > nr_pages)
2534                refill_stock(memcg, batch - nr_pages);
2535
2536        /*
2537         * If the hierarchy is above the normal consumption range, schedule
2538         * reclaim on returning to userland.  We can perform reclaim here
2539         * if __GFP_RECLAIM but let's always punt for simplicity and so that
2540         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2541         * not recorded as it most likely matches current's and won't
2542         * change in the meantime.  As high limit is checked again before
2543         * reclaim, the cost of mismatch is negligible.
2544         */
2545        do {
2546                if (page_counter_read(&memcg->memory) > memcg->high) {
2547                        /* Don't bother a random interrupted task */
2548                        if (in_interrupt()) {
2549                                schedule_work(&memcg->high_work);
2550                                break;
2551                        }
2552                        current->memcg_nr_pages_over_high += batch;
2553                        set_notify_resume(current);
2554                        break;
2555                }
2556        } while ((memcg = parent_mem_cgroup(memcg)));
2557
2558        return 0;
2559}
2560
2561static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2562{
2563        if (mem_cgroup_is_root(memcg))
2564                return;
2565
2566        page_counter_uncharge(&memcg->memory, nr_pages);
2567        if (do_memsw_account())
2568                page_counter_uncharge(&memcg->memsw, nr_pages);
2569
2570        css_put_many(&memcg->css, nr_pages);
2571}
2572
2573static void lock_page_lru(struct page *page, int *isolated)
2574{
2575        pg_data_t *pgdat = page_pgdat(page);
2576
2577        spin_lock_irq(&pgdat->lru_lock);
2578        if (PageLRU(page)) {
2579                struct lruvec *lruvec;
2580
2581                lruvec = mem_cgroup_page_lruvec(page, pgdat);
2582                ClearPageLRU(page);
2583                del_page_from_lru_list(page, lruvec, page_lru(page));
2584                *isolated = 1;
2585        } else
2586                *isolated = 0;
2587}
2588
2589static void unlock_page_lru(struct page *page, int isolated)
2590{
2591        pg_data_t *pgdat = page_pgdat(page);
2592
2593        if (isolated) {
2594                struct lruvec *lruvec;
2595
2596                lruvec = mem_cgroup_page_lruvec(page, pgdat);
2597                VM_BUG_ON_PAGE(PageLRU(page), page);
2598                SetPageLRU(page);
2599                add_page_to_lru_list(page, lruvec, page_lru(page));
2600        }
2601        spin_unlock_irq(&pgdat->lru_lock);
2602}
2603
2604static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2605                          bool lrucare)
2606{
2607        int isolated;
2608
2609        VM_BUG_ON_PAGE(page->mem_cgroup, page);
2610
2611        /*
2612         * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2613         * may already be on some other mem_cgroup's LRU.  Take care of it.
2614         */
2615        if (lrucare)
2616                lock_page_lru(page, &isolated);
2617
2618        /*
2619         * Nobody should be changing or seriously looking at
2620         * page->mem_cgroup at this point:
2621         *
2622         * - the page is uncharged
2623         *
2624         * - the page is off-LRU
2625         *
2626         * - an anonymous fault has exclusive page access, except for
2627         *   a locked page table
2628         *
2629         * - a page cache insertion, a swapin fault, or a migration
2630         *   have the page locked
2631         */
2632        page->mem_cgroup = memcg;
2633
2634        if (lrucare)
2635                unlock_page_lru(page, isolated);
2636}
2637
2638#ifdef CONFIG_MEMCG_KMEM
2639static int memcg_alloc_cache_id(void)
2640{
2641        int id, size;
2642        int err;
2643
2644        id = ida_simple_get(&memcg_cache_ida,
2645                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2646        if (id < 0)
2647                return id;
2648
2649        if (id < memcg_nr_cache_ids)
2650                return id;
2651
2652        /*
2653         * There's no space for the new id in memcg_caches arrays,
2654         * so we have to grow them.
2655         */
2656        down_write(&memcg_cache_ids_sem);
2657
2658        size = 2 * (id + 1);
2659        if (size < MEMCG_CACHES_MIN_SIZE)
2660                size = MEMCG_CACHES_MIN_SIZE;
2661        else if (size > MEMCG_CACHES_MAX_SIZE)
2662                size = MEMCG_CACHES_MAX_SIZE;
2663
2664        err = memcg_update_all_caches(size);
2665        if (!err)
2666                err = memcg_update_all_list_lrus(size);
2667        if (!err)
2668                memcg_nr_cache_ids = size;
2669
2670        up_write(&memcg_cache_ids_sem);
2671
2672        if (err) {
2673                ida_simple_remove(&memcg_cache_ida, id);
2674                return err;
2675        }
2676        return id;
2677}
2678
2679static void memcg_free_cache_id(int id)
2680{
2681        ida_simple_remove(&memcg_cache_ida, id);
2682}
2683
2684struct memcg_kmem_cache_create_work {
2685        struct mem_cgroup *memcg;
2686        struct kmem_cache *cachep;
2687        struct work_struct work;
2688};
2689
2690static void memcg_kmem_cache_create_func(struct work_struct *w)
2691{
2692        struct memcg_kmem_cache_create_work *cw =
2693                container_of(w, struct memcg_kmem_cache_create_work, work);
2694        struct mem_cgroup *memcg = cw->memcg;
2695        struct kmem_cache *cachep = cw->cachep;
2696
2697        memcg_create_kmem_cache(memcg, cachep);
2698
2699        css_put(&memcg->css);
2700        kfree(cw);
2701}
2702
2703/*
2704 * Enqueue the creation of a per-memcg kmem_cache.
2705 */
2706static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2707                                               struct kmem_cache *cachep)
2708{
2709        struct memcg_kmem_cache_create_work *cw;
2710
2711        if (!css_tryget_online(&memcg->css))
2712                return;
2713
2714        cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2715        if (!cw)
2716                return;
2717
2718        cw->memcg = memcg;
2719        cw->cachep = cachep;
2720        INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2721
2722        queue_work(memcg_kmem_cache_wq, &cw->work);
2723}
2724
2725static inline bool memcg_kmem_bypass(void)
2726{
2727        if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2728                return true;
2729        return false;
2730}
2731
2732/**
2733 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2734 * @cachep: the original global kmem cache
2735 *
2736 * Return the kmem_cache we're supposed to use for a slab allocation.
2737 * We try to use the current memcg's version of the cache.
2738 *
2739 * If the cache does not exist yet, if we are the first user of it, we
2740 * create it asynchronously in a workqueue and let the current allocation
2741 * go through with the original cache.
2742 *
2743 * This function takes a reference to the cache it returns to assure it
2744 * won't get destroyed while we are working with it. Once the caller is
2745 * done with it, memcg_kmem_put_cache() must be called to release the
2746 * reference.
2747 */
2748struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2749{
2750        struct mem_cgroup *memcg;
2751        struct kmem_cache *memcg_cachep;
2752        struct memcg_cache_array *arr;
2753        int kmemcg_id;
2754
2755        VM_BUG_ON(!is_root_cache(cachep));
2756
2757        if (memcg_kmem_bypass())
2758                return cachep;
2759
2760        rcu_read_lock();
2761
2762        if (unlikely(current->active_memcg))
2763                memcg = current->active_memcg;
2764        else
2765                memcg = mem_cgroup_from_task(current);
2766
2767        if (!memcg || memcg == root_mem_cgroup)
2768                goto out_unlock;
2769
2770        kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2771        if (kmemcg_id < 0)
2772                goto out_unlock;
2773
2774        arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2775
2776        /*
2777         * Make sure we will access the up-to-date value. The code updating
2778         * memcg_caches issues a write barrier to match the data dependency
2779         * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
2780         */
2781        memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2782
2783        /*
2784         * If we are in a safe context (can wait, and not in interrupt
2785         * context), we could be be predictable and return right away.
2786         * This would guarantee that the allocation being performed
2787         * already belongs in the new cache.
2788         *
2789         * However, there are some clashes that can arrive from locking.
2790         * For instance, because we acquire the slab_mutex while doing
2791         * memcg_create_kmem_cache, this means no further allocation
2792         * could happen with the slab_mutex held. So it's better to
2793         * defer everything.
2794         *
2795         * If the memcg is dying or memcg_cache is about to be released,
2796         * don't bother creating new kmem_caches. Because memcg_cachep
2797         * is ZEROed as the fist step of kmem offlining, we don't need
2798         * percpu_ref_tryget_live() here. css_tryget_online() check in
2799         * memcg_schedule_kmem_cache_create() will prevent us from
2800         * creation of a new kmem_cache.
2801         */
2802        if (unlikely(!memcg_cachep))
2803                memcg_schedule_kmem_cache_create(memcg, cachep);
2804        else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2805                cachep = memcg_cachep;
2806out_unlock:
2807        rcu_read_unlock();
2808        return cachep;
2809}
2810
2811/**
2812 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2813 * @cachep: the cache returned by memcg_kmem_get_cache
2814 */
2815void memcg_kmem_put_cache(struct kmem_cache *cachep)
2816{
2817        if (!is_root_cache(cachep))
2818                percpu_ref_put(&cachep->memcg_params.refcnt);
2819}
2820
2821/**
2822 * __memcg_kmem_charge_memcg: charge a kmem page
2823 * @page: page to charge
2824 * @gfp: reclaim mode
2825 * @order: allocation order
2826 * @memcg: memory cgroup to charge
2827 *
2828 * Returns 0 on success, an error code on failure.
2829 */
2830int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2831                            struct mem_cgroup *memcg)
2832{
2833        unsigned int nr_pages = 1 << order;
2834        struct page_counter *counter;
2835        int ret;
2836
2837        ret = try_charge(memcg, gfp, nr_pages);
2838        if (ret)
2839                return ret;
2840
2841        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2842            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2843
2844                /*
2845                 * Enforce __GFP_NOFAIL allocation because callers are not
2846                 * prepared to see failures and likely do not have any failure
2847                 * handling code.
2848                 */
2849                if (gfp & __GFP_NOFAIL) {
2850                        page_counter_charge(&memcg->kmem, nr_pages);
2851                        return 0;
2852                }
2853                cancel_charge(memcg, nr_pages);
2854                return -ENOMEM;
2855        }
2856        return 0;
2857}
2858
2859/**
2860 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
2861 * @page: page to charge
2862 * @gfp: reclaim mode
2863 * @order: allocation order
2864 *
2865 * Returns 0 on success, an error code on failure.
2866 */
2867int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2868{
2869        struct mem_cgroup *memcg;
2870        int ret = 0;
2871
2872        if (memcg_kmem_bypass())
2873                return 0;
2874
2875        memcg = get_mem_cgroup_from_current();
2876        if (!mem_cgroup_is_root(memcg)) {
2877                ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2878                if (!ret) {
2879                        page->mem_cgroup = memcg;
2880                        __SetPageKmemcg(page);
2881                }
2882        }
2883        css_put(&memcg->css);
2884        return ret;
2885}
2886
2887/**
2888 * __memcg_kmem_uncharge_memcg: uncharge a kmem page
2889 * @memcg: memcg to uncharge
2890 * @nr_pages: number of pages to uncharge
2891 */
2892void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
2893                                 unsigned int nr_pages)
2894{
2895        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2896                page_counter_uncharge(&memcg->kmem, nr_pages);
2897
2898        page_counter_uncharge(&memcg->memory, nr_pages);
2899        if (do_memsw_account())
2900                page_counter_uncharge(&memcg->memsw, nr_pages);
2901}
2902/**
2903 * __memcg_kmem_uncharge: uncharge a kmem page
2904 * @page: page to uncharge
2905 * @order: allocation order
2906 */
2907void __memcg_kmem_uncharge(struct page *page, int order)
2908{
2909        struct mem_cgroup *memcg = page->mem_cgroup;
2910        unsigned int nr_pages = 1 << order;
2911
2912        if (!memcg)
2913                return;
2914
2915        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2916        __memcg_kmem_uncharge_memcg(memcg, nr_pages);
2917        page->mem_cgroup = NULL;
2918
2919        /* slab pages do not have PageKmemcg flag set */
2920        if (PageKmemcg(page))
2921                __ClearPageKmemcg(page);
2922
2923        css_put_many(&memcg->css, nr_pages);
2924}
2925#endif /* CONFIG_MEMCG_KMEM */
2926
2927#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2928
2929/*
2930 * Because tail pages are not marked as "used", set it. We're under
2931 * pgdat->lru_lock and migration entries setup in all page mappings.
2932 */
2933void mem_cgroup_split_huge_fixup(struct page *head)
2934{
2935        int i;
2936
2937        if (mem_cgroup_disabled())
2938                return;
2939
2940        for (i = 1; i < HPAGE_PMD_NR; i++)
2941                head[i].mem_cgroup = head->mem_cgroup;
2942
2943        __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2944}
2945#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2946
2947#ifdef CONFIG_MEMCG_SWAP
2948/**
2949 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2950 * @entry: swap entry to be moved
2951 * @from:  mem_cgroup which the entry is moved from
2952 * @to:  mem_cgroup which the entry is moved to
2953 *
2954 * It succeeds only when the swap_cgroup's record for this entry is the same
2955 * as the mem_cgroup's id of @from.
2956 *
2957 * Returns 0 on success, -EINVAL on failure.
2958 *
2959 * The caller must have charged to @to, IOW, called page_counter_charge() about
2960 * both res and memsw, and called css_get().
2961 */
2962static int mem_cgroup_move_swap_account(swp_entry_t entry,
2963                                struct mem_cgroup *from, struct mem_cgroup *to)
2964{
2965        unsigned short old_id, new_id;
2966
2967        old_id = mem_cgroup_id(from);
2968        new_id = mem_cgroup_id(to);
2969
2970        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2971                mod_memcg_state(from, MEMCG_SWAP, -1);
2972                mod_memcg_state(to, MEMCG_SWAP, 1);
2973                return 0;
2974        }
2975        return -EINVAL;
2976}
2977#else
2978static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2979                                struct mem_cgroup *from, struct mem_cgroup *to)
2980{
2981        return -EINVAL;
2982}
2983#endif
2984
2985static DEFINE_MUTEX(memcg_max_mutex);
2986
2987static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2988                                 unsigned long max, bool memsw)
2989{
2990        bool enlarge = false;
2991        bool drained = false;
2992        int ret;
2993        bool limits_invariant;
2994        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2995
2996        do {
2997                if (signal_pending(current)) {
2998                        ret = -EINTR;
2999                        break;
3000                }

3001
3002                mutex_lock(&memcg_max_mutex);
3003                /*
3004                 * Make sure that the new limit (memsw or memory limit) doesn't
3005                 * break our basic invariant rule memory.max <= memsw.max.
3006                 */
3007                limits_invariant = memsw ? max >= memcg->memory.max :
3008                                           max <= memcg->memsw.max;
3009                if (!limits_invariant) {
3010                        mutex_unlock(&memcg_max_mutex);
3011                        ret = -EINVAL;
3012                        break;
3013                }
3014                if (max > counter->max)
3015                        enlarge = true;
3016                ret = page_counter_set_max(counter, max);
3017                mutex_unlock(&memcg_max_mutex);
3018
3019                if (!ret)
3020                        break;
3021
3022                if (!drained) {
3023                        drain_all_stock(memcg);
3024                        drained = true;
3025                        continue;
3026                }
3027
3028                if (!try_to_free_mem_cgroup_pages(memcg, 1,
3029                                        GFP_KERNEL, !memsw)) {
3030                        ret = -EBUSY;
3031                        break;
3032                }
3033        } while (true);
3034
3035        if (!ret && enlarge)
3036                memcg_oom_recover(memcg);
3037
3038        return ret;
3039}
3040
3041unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3042                                            gfp_t gfp_mask,
3043                                            unsigned long *total_scanned)
3044{
3045        unsigned long nr_reclaimed = 0;
3046        struct mem_cgroup_per_node *mz, *next_mz = NULL;
3047        unsigned long reclaimed;
3048        int loop = 0;
3049        struct mem_cgroup_tree_per_node *mctz;
3050        unsigned long excess;
3051        unsigned long nr_scanned;
3052
3053        if (order > 0)
3054                return 0;
3055
3056        mctz = soft_limit_tree_node(pgdat->node_id);
3057
3058        /*
3059         * Do not even bother to check the largest node if the root
3060         * is empty. Do it lockless to prevent lock bouncing. Races
3061         * are acceptable as soft limit is best effort anyway.
3062         */
3063        if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3064                return 0;
3065
3066        /*
3067         * This loop can run a while, specially if mem_cgroup's continuously
3068         * keep exceeding their soft limit and putting the system under
3069         * pressure
3070         */
3071        do {
3072                if (next_mz)
3073                        mz = next_mz;
3074                else
3075                        mz = mem_cgroup_largest_soft_limit_node(mctz);
3076                if (!mz)
3077                        break;
3078
3079                nr_scanned = 0;
3080                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3081                                                    gfp_mask, &nr_scanned);
3082                nr_reclaimed += reclaimed;
3083                *total_scanned += nr_scanned;
3084                spin_lock_irq(&mctz->lock);
3085                __mem_cgroup_remove_exceeded(mz, mctz);
3086
3087                /*
3088                 * If we failed to reclaim anything from this memory cgroup
3089                 * it is time to move on to the next cgroup
3090                 */
3091                next_mz = NULL;
3092                if (!reclaimed)
3093                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3094
3095                excess = soft_limit_excess(mz->memcg);
3096                /*
3097                 * One school of thought says that we should not add
3098                 * back the node to the tree if reclaim returns 0.
3099                 * But our reclaim could return 0, simply because due
3100                 * to priority we are exposing a smaller subset of
3101                 * memory to reclaim from. Consider this as a longer
3102                 * term TODO.
3103                 */
3104                /* If excess == 0, no tree ops */
3105                __mem_cgroup_insert_exceeded(mz, mctz, excess);
3106                spin_unlock_irq(&mctz->lock);
3107                css_put(&mz->memcg->css);
3108                loop++;
3109                /*
3110                 * Could not reclaim anything and there are no more
3111                 * mem cgroups to try or we seem to be looping without
3112                 * reclaiming anything.
3113                 */
3114                if (!nr_reclaimed &&
3115                        (next_mz == NULL ||
3116                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3117                        break;
3118        } while (!nr_reclaimed);
3119        if (next_mz)
3120                css_put(&next_mz->memcg->css);
3121        return nr_reclaimed;
3122}
3123
3124/*
3125 * Test whether @memcg has children, dead or alive.  Note that this
3126 * function doesn't care whether @memcg has use_hierarchy enabled and
3127 * returns %true if there are child csses according to the cgroup
3128 * hierarchy.  Testing use_hierarchy is the caller's responsiblity.
3129 */
3130static inline bool memcg_has_children(struct mem_cgroup *memcg)
3131{
3132        bool ret;
3133
3134        rcu_read_lock();
3135        ret = css_next_child(NULL, &memcg->css);
3136        rcu_read_unlock();
3137        return ret;
3138}
3139
3140/*
3141 * Reclaims as many pages from the given memcg as possible.
3142 *
3143 * Caller is responsible for holding css reference for memcg.
3144 */
3145static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3146{
3147        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3148
3149        /* we call try-to-free pages for make this cgroup empty */
3150        lru_add_drain_all();
3151
3152        drain_all_stock(memcg);
3153
3154        /* try to free all pages in this cgroup */
3155        while (nr_retries && page_counter_read(&memcg->memory)) {
3156                int progress;
3157
3158                if (signal_pending(current))
3159                        return -EINTR;
3160
3161                progress = try_to_free_mem_cgroup_pages(memcg, 1,
3162                                                        GFP_KERNEL, true);
3163                if (!progress) {
3164                        nr_retries--;
3165                        /* maybe some writeback is necessary */
3166                        congestion_wait(BLK_RW_ASYNC, HZ/10);
3167                }
3168
3169        }
3170
3171        return 0;
3172}
3173
3174static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3175                                            char *buf, size_t nbytes,
3176                                            loff_t off)
3177{
3178        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3179
3180        if (mem_cgroup_is_root(memcg))
3181                return -EINVAL;
3182        return mem_cgroup_force_empty(memcg) ?: nbytes;
3183}
3184
3185static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3186                                     struct cftype *cft)
3187{
3188        return mem_cgroup_from_css(css)->use_hierarchy;
3189}
3190
3191static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3192                                      struct cftype *cft, u64 val)
3193{
3194        int retval = 0;
3195        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3196        struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3197
3198        if (memcg->use_hierarchy == val)
3199                return 0;
3200
3201        /*
3202         * If parent's use_hierarchy is set, we can't make any modifications
3203         * in the child subtrees. If it is unset, then the change can
3204         * occur, provided the current cgroup has no children.
3205         *
3206         * For the root cgroup, parent_mem is NULL, we allow value to be
3207         * set if there are no children.
3208         */
3209        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3210                                (val == 1 || val == 0)) {
3211                if (!memcg_has_children(memcg))
3212                        memcg->use_hierarchy = val;
3213                else
3214                        retval = -EBUSY;
3215        } else
3216                retval = -EINVAL;
3217
3218        return retval;
3219}
3220
3221static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3222{
3223        unsigned long val;
3224
3225        if (mem_cgroup_is_root(memcg)) {
3226                val = memcg_page_state(memcg, MEMCG_CACHE) +
3227                        memcg_page_state(memcg, MEMCG_RSS);
3228                if (swap)
3229                        val += memcg_page_state(memcg, MEMCG_SWAP);
3230        } else {
3231                if (!swap)
3232                        val = page_counter_read(&memcg->memory);
3233                else
3234                        val = page_counter_read(&memcg->memsw);
3235        }
3236        return val;
3237}
3238
/* Attribute selectors, extracted from cft->private with MEMFILE_ATTR(). */
enum {
        RES_USAGE,              /* current usage */
        RES_LIMIT,              /* counter->max */
        RES_MAX_USAGE,          /* counter->watermark */
        RES_FAILCNT,            /* counter->failcnt */
        RES_SOFT_LIMIT,         /* memcg->soft_limit */
};
3246
3247static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3248                               struct cftype *cft)
3249{
3250        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3251        struct page_counter *counter;
3252
3253        switch (MEMFILE_TYPE(cft->private)) {
3254        case _MEM:
3255                counter = &memcg->memory;
3256                break;
3257        case _MEMSWAP:
3258                counter = &memcg->memsw;
3259                break;
3260        case _KMEM:
3261                counter = &memcg->kmem;
3262                break;
3263        case _TCP:
3264                counter = &memcg->tcpmem;
3265                break;
3266        default:
3267                BUG();
3268        }
3269
3270        switch (MEMFILE_ATTR(cft->private)) {
3271        case RES_USAGE:
3272                if (counter == &memcg->memory)
3273                        return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3274                if (counter == &memcg->memsw)
3275                        return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3276                return (u64)page_counter_read(counter) * PAGE_SIZE;
3277        case RES_LIMIT:
3278                return (u64)counter->max * PAGE_SIZE;
3279        case RES_MAX_USAGE:
3280                return (u64)counter->watermark * PAGE_SIZE;
3281        case RES_FAILCNT:
3282                return counter->failcnt;
3283        case RES_SOFT_LIMIT:
3284                return (u64)memcg->soft_limit * PAGE_SIZE;
3285        default:
3286                BUG();
3287        }
3288}
3289
3290static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3291{
3292        unsigned long stat[MEMCG_NR_STAT] = {0};
3293        struct mem_cgroup *mi;
3294        int node, cpu, i;
3295
3296        for_each_online_cpu(cpu)
3297                for (i = 0; i < MEMCG_NR_STAT; i++)
3298                        stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3299
3300        for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3301                for (i = 0; i < MEMCG_NR_STAT; i++)
3302                        atomic_long_add(stat[i], &mi->vmstats[i]);
3303
3304        for_each_node(node) {
3305                struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3306                struct mem_cgroup_per_node *pi;
3307
3308                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3309                        stat[i] = 0;
3310
3311                for_each_online_cpu(cpu)
3312                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3313                                stat[i] += per_cpu(
3314                                        pn->lruvec_stat_cpu->count[i], cpu);
3315
3316                for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3317                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3318                                atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3319        }
3320}
3321
3322static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3323{
3324        unsigned long events[NR_VM_EVENT_ITEMS];
3325        struct mem_cgroup *mi;
3326        int cpu, i;
3327
3328        for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3329                events[i] = 0;
3330
3331        for_each_online_cpu(cpu)
3332                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3333                        events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3334                                             cpu);
3335
3336        for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3337                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3338                        atomic_long_add(events[i], &mi->vmevents[i]);
3339}
3340
3341#ifdef CONFIG_MEMCG_KMEM
3342static int memcg_online_kmem(struct mem_cgroup *memcg)
3343{
3344        int memcg_id;
3345
3346        if (cgroup_memory_nokmem)
3347                return 0;
3348
3349        BUG_ON(memcg->kmemcg_id >= 0);
3350        BUG_ON(memcg->kmem_state);
3351
3352        memcg_id = memcg_alloc_cache_id();
3353        if (memcg_id < 0)
3354                return memcg_id;
3355
3356        static_branch_inc(&memcg_kmem_enabled_key);
3357        /*
3358         * A memory cgroup is considered kmem-online as soon as it gets
3359         * kmemcg_id. Setting the id after enabling static branching will
3360         * guarantee no one starts accounting before all call sites are
3361         * patched.
3362         */
3363        memcg->kmemcg_id = memcg_id;
3364        memcg->kmem_state = KMEM_ONLINE;
3365        INIT_LIST_HEAD(&memcg->kmem_caches);
3366
3367        return 0;
3368}
3369
3370static void memcg_offline_kmem(struct mem_cgroup *memcg)
3371{
3372        struct cgroup_subsys_state *css;
3373        struct mem_cgroup *parent, *child;
3374        int kmemcg_id;
3375
3376        if (memcg->kmem_state != KMEM_ONLINE)
3377                return;
3378        /*
3379         * Clear the online state before clearing memcg_caches array
3380         * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3381         * guarantees that no cache will be created for this cgroup
3382         * after we are done (see memcg_create_kmem_cache()).
3383         */
3384        memcg->kmem_state = KMEM_ALLOCATED;
3385
3386        parent = parent_mem_cgroup(memcg);
3387        if (!parent)
3388                parent = root_mem_cgroup;
3389
3390        /*
3391         * Deactivate and reparent kmem_caches.
3392         */
3393        memcg_deactivate_kmem_caches(memcg, parent);
3394
3395        kmemcg_id = memcg->kmemcg_id;
3396        BUG_ON(kmemcg_id < 0);
3397
3398        /*
3399         * Change kmemcg_id of this cgroup and all its descendants to the
3400         * parent's id, and then move all entries from this cgroup's list_lrus
3401         * to ones of the parent. After we have finished, all list_lrus
3402         * corresponding to this cgroup are guaranteed to remain empty. The
3403         * ordering is imposed by list_lru_node->lock taken by
3404         * memcg_drain_all_list_lrus().
3405         */
3406        rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3407        css_for_each_descendant_pre(css, &memcg->css) {
3408                child = mem_cgroup_from_css(css);
3409                BUG_ON(child->kmemcg_id != kmemcg_id);
3410                child->kmemcg_id = parent->kmemcg_id;
3411                if (!memcg->use_hierarchy)
3412                        break;
3413        }
3414        rcu_read_unlock();
3415
3416        memcg_drain_all_list_lrus(kmemcg_id, parent);
3417
3418        memcg_free_cache_id(kmemcg_id);
3419}
3420
3421static void memcg_free_kmem(struct mem_cgroup *memcg)
3422{
3423        /* css_alloc() failed, offlining didn't happen */
3424        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3425                memcg_offline_kmem(memcg);
3426
3427        if (memcg->kmem_state == KMEM_ALLOCATED) {
3428                WARN_ON(!list_empty(&memcg->kmem_caches));
3429                static_branch_dec(&memcg_kmem_enabled_key);
3430        }
3431}
3432#else
/* Without CONFIG_MEMCG_KMEM there is nothing to bring online: succeed. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
        return 0;
}

/* Without CONFIG_MEMCG_KMEM there is nothing to take offline. */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}

/* Without CONFIG_MEMCG_KMEM there is nothing to free. */
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
3443#endif /* CONFIG_MEMCG_KMEM */
3444
3445static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3446                                 unsigned long max)
3447{
3448        int ret;
3449
3450        mutex_lock(&memcg_max_mutex);
3451        ret = page_counter_set_max(&memcg->kmem, max);
3452        mutex_unlock(&memcg_max_mutex);
3453        return ret;
3454}
3455
3456static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3457{
3458        int ret;
3459
3460        mutex_lock(&memcg_max_mutex);
3461
3462        ret = page_counter_set_max(&memcg->tcpmem, max);
3463        if (ret)
3464                goto out;
3465
3466        if (!memcg->tcpmem_active) {
3467                /*
3468                 * The active flag needs to be written after the static_key
3469                 * update. This is what guarantees that the socket activation
3470                 * function is the last one to run. See mem_cgroup_sk_alloc()
3471                 * for details, and note that we don't mark any socket as
3472                 * belonging to this memcg until that flag is up.
3473                 *
3474                 * We need to do this, because static_keys will span multiple
3475                 * sites, but we can't control their order. If we mark a socket
3476                 * as accounted, but the accounting functions are not patched in
3477                 * yet, we'll lose accounting.
3478                 *
3479                 * We never race with the readers in mem_cgroup_sk_alloc(),
3480                 * because when this value change, the code to process it is not
3481                 * patched in yet.
3482                 */
3483                static_branch_inc(&memcg_sockets_enabled_key);
3484                memcg->tcpmem_active = true;
3485        }
3486out:
3487        mutex_unlock(&memcg_max_mutex);
3488        return ret;
3489}
3490
3491/*
3492 * The user of this function is...
3493 * RES_LIMIT.
3494 */
3495static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3496                                char *buf, size_t nbytes, loff_t off)
3497{
3498        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3499        unsigned long nr_pages;
3500        int ret;
3501
3502        buf = strstrip(buf);
3503        ret = page_counter_memparse(buf, "-1", &nr_pages);
3504        if (ret)
3505                return ret;
3506
3507        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3508        case RES_LIMIT:
3509                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3510                        ret = -EINVAL;
3511                        break;
3512                }
3513                switch (MEMFILE_TYPE(of_cft(of)->private)) {
3514                case _MEM:
3515                        ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3516                        break;
3517                case _MEMSWAP:
3518                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3519                        break;
3520                case _KMEM:
3521                        pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3522                                     "Please report your usecase to linux-mm@kvack.org if you "
3523                                     "depend on this functionality.\n");
3524                        ret = memcg_update_kmem_max(memcg, nr_pages);
3525                        break;
3526                case _TCP:
3527                        ret = memcg_update_tcp_max(memcg, nr_pages);
3528                        break;
3529                }
3530                break;
3531        case RES_SOFT_LIMIT:
3532                memcg->soft_limit = nr_pages;
3533                ret = 0;
3534                break;
3535        }
3536        return ret ?: nbytes;
3537}
3538
3539static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3540                                size_t nbytes, loff_t off)
3541{
3542        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3543        struct page_counter *counter;
3544
3545        switch (MEMFILE_TYPE(of_cft(of)->private)) {
3546        case _MEM:
3547                counter = &memcg->memory;
3548                break;
3549        case _MEMSWAP:
3550                counter = &memcg->memsw;
3551                break;
3552        case _KMEM:
3553                counter = &memcg->kmem;
3554                break;
3555        case _TCP:
3556                counter = &memcg->tcpmem;
3557                break;
3558        default:
3559                BUG();
3560        }
3561
3562        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3563        case RES_MAX_USAGE:
3564                page_counter_reset_watermark(counter);
3565                break;
3566        case RES_FAILCNT:
3567                counter->failcnt = 0;
3568                break;
3569        default:
3570                BUG();
3571        }
3572
3573        return nbytes;
3574}
3575
3576static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3577                                        struct cftype *cft)
3578{
3579        return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3580}
3581
3582#ifdef CONFIG_MMU
3583static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3584                                        struct cftype *cft, u64 val)
3585{
3586        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3587
3588        if (val & ~MOVE_MASK)
3589                return -EINVAL;
3590
3591        /*
3592         * No kind of locking is needed in here, because ->can_attach() will
3593         * check this value once in the beginning of the process, and then carry
3594         * on with stale data. This means that changes to this value will only
3595         * affect task migrations starting after the change.
3596         */
3597        memcg->move_charge_at_immigrate = val;
3598        return 0;
3599}
3600#else
3601static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3602                                        struct cftype *cft, u64 val)
3603{
3604        return -ENOSYS;
3605}
3606#endif
3607
3608#ifdef CONFIG_NUMA
3609
/* LRU list bitmasks, one bit per enum lru_list value. */
#define LRU_ALL_FILE (BIT(LRU_ACTIVE_FILE) | BIT(LRU_INACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_ACTIVE_ANON) | BIT(LRU_INACTIVE_ANON))
/* Every list: the low NR_LRU_LISTS bits set. */
#define LRU_ALL      ((1 << NR_LRU_LISTS) - 1)
3613
3614static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3615                                           int nid, unsigned int lru_mask)
3616{
3617        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3618        unsigned long nr = 0;
3619        enum lru_list lru;
3620
3621        VM_BUG_ON((unsigned)nid >= nr_node_ids);
3622
3623        for_each_lru(lru) {
3624                if (!(BIT(lru) & lru_mask))
3625                        continue;
3626                nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3627        }
3628        return nr;
3629}
3630
3631static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3632                                             unsigned int lru_mask)
3633{
3634        unsigned long nr = 0;
3635        enum lru_list lru;
3636
3637        for_each_lru(lru) {
3638                if (!(BIT(lru) & lru_mask))
3639                        continue;
3640                nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3641        }
3642        return nr;
3643}
3644
3645static int memcg_numa_stat_show(struct seq_file *m, void *v)
3646{
3647        struct numa_stat {
3648                const char *name;
3649                unsigned int lru_mask;
3650        };
3651
3652        static const struct numa_stat stats[] = {
3653                { "total", LRU_ALL },
3654                { "file", LRU_ALL_FILE },
3655                { "anon", LRU_ALL_ANON },
3656                { "unevictable", BIT(LRU_UNEVICTABLE) },
3657        };
3658        const struct numa_stat *stat;
3659        int nid;
3660        unsigned long nr;
3661        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3662
3663        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3664                nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3665                seq_printf(m, "%s=%lu", stat->name, nr);
3666                for_each_node_state(nid, N_MEMORY) {
3667                        nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3668                                                          stat->lru_mask);
3669                        seq_printf(m, " N%d=%lu", nid, nr);
3670                }
3671                seq_putc(m, '\n');
3672        }
3673
3674        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3675                struct mem_cgroup *iter;
3676
3677                nr = 0;
3678                for_each_mem_cgroup_tree(iter, memcg)
3679                        nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3680                seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3681                for_each_node_state(nid, N_MEMORY) {
3682                        nr = 0;
3683                        for_each_mem_cgroup_tree(iter, memcg)
3684                                nr += mem_cgroup_node_nr_lru_pages(
3685                                        iter, nid, stat->lru_mask);
3686                        seq_printf(m, " N%d=%lu", nid, nr);
3687                }
3688                seq_putc(m, '\n');
3689        }
3690
3691        return 0;
3692}
3693#endif /* CONFIG_NUMA */
3694
3695static const unsigned int memcg1_stats[] = {
3696        MEMCG_CACHE,
3697        MEMCG_RSS,
3698        MEMCG_RSS_HUGE,
3699        NR_SHMEM,
3700        NR_FILE_MAPPED,
3701        NR_FILE_DIRTY,
3702        NR_WRITEBACK,
3703        MEMCG_SWAP,
3704};
3705
/*
 * Labels for the entries of memcg1_stats[], index for index.  Keep both
 * tables in the same order and of the same length.
 */
static const char *const memcg1_stat_names[] = {
        "cache",                /* MEMCG_CACHE */
        "rss",                  /* MEMCG_RSS */
        "rss_huge",             /* MEMCG_RSS_HUGE */
        "shmem",                /* NR_SHMEM */
        "mapped_file",          /* NR_FILE_MAPPED */
        "dirty",                /* NR_FILE_DIRTY */
        "writeback",            /* NR_WRITEBACK */
        "swap",                 /* MEMCG_SWAP */
};
3716
3717/* Universal VM events cgroup1 shows, original sort order */
3718static const unsigned int memcg1_events[] = {
3719        PGPGIN,
3720        PGPGOUT,
3721        PGFAULT,
3722        PGMAJFAULT,
3723};
3724
3725static int memcg_stat_show(struct seq_file *m, void *v)
3726{
3727        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3728        unsigned long memory, memsw;
3729        struct mem_cgroup *mi;
3730        unsigned int i;
3731
3732        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3733
3734        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3735                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3736                        continue;
3737                seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3738                           memcg_page_state_local(memcg, memcg1_stats[i]) *
3739                           PAGE_SIZE);
3740        }
3741
3742        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3743                seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3744                           memcg_events_local(memcg, memcg1_events[i]));
3745
3746        for (i = 0; i < NR_LRU_LISTS; i++)
3747                seq_printf(m, "%s %lu\n", lru_list_name(i),
3748                           memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3749                           PAGE_SIZE);
3750
3751        /* Hierarchical information */
3752        memory = memsw = PAGE_COUNTER_MAX;
3753        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3754                memory = min(memory, mi->memory.max);
3755                memsw = min(memsw, mi->memsw.max);
3756        }
3757        seq_printf(m, "hierarchical_memory_limit %llu\n",
3758                   (u64)memory * PAGE_SIZE);
3759        if (do_memsw_account())
3760                seq_printf(m, "hierarchical_memsw_limit %llu\n",
3761                           (u64)memsw * PAGE_SIZE);
3762
3763        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3764                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3765                        continue;
3766                seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3767                           (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3768                           PAGE_SIZE);
3769        }
3770
3771        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3772                seq_printf(m, "total_%s %llu\n",
3773                           vm_event_name(memcg1_events[i]),
3774                           (u64)memcg_events(memcg, memcg1_events[i]));
3775
3776        for (i = 0; i < NR_LRU_LISTS; i++)
3777                seq_printf(m, "total_%s %llu\n", lru_list_name(i),
3778                           (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3779                           PAGE_SIZE);
3780
3781#ifdef CONFIG_DEBUG_VM
3782        {
3783                pg_data_t *pgdat;
3784                struct mem_cgroup_per_node *mz;
3785                struct zone_reclaim_stat *rstat;
3786                unsigned long recent_rotated[2] = {0, 0};
3787                unsigned long recent_scanned[2] = {0, 0};
3788
3789                for_each_online_pgdat(pgdat) {
3790                        mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3791                        rstat = &mz->lruvec.reclaim_stat;
3792
3793                        recent_rotated[0] += rstat->recent_rotated[0];
3794                        recent_rotated[1] += rstat->recent_rotated[1];
3795                        recent_scanned[0] += rstat->recent_scanned[0];
3796                        recent_scanned[1] += rstat->recent_scanned[1];
3797                }
3798                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3799                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3800                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3801                seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3802        }
3803#endif
3804
3805        return 0;
3806}
3807
3808static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3809                                      struct cftype *cft)
3810{
3811        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3812
3813        return mem_cgroup_swappiness(memcg);
3814}
3815
3816static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3817                                       struct cftype *cft, u64 val)
3818{
3819        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3820
3821        if (val > 100)
3822                return -EINVAL;
3823
3824        if (css->parent)
3825                memcg->swappiness = val;
3826        else
3827                vm_swappiness = val;
3828
3829        return 0;
3830}
3831
3832static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3833{
3834        struct mem_cgroup_threshold_ary *t;
3835        unsigned long usage;
3836        int i;
3837
3838        rcu_read_lock();
3839        if (!swap)
3840                t = rcu_dereference(memcg->thresholds.primary);
3841        else
3842                t = rcu_dereference(memcg->memsw_thresholds.primary);
3843
3844        if (!t)
3845                goto unlock;
3846
3847        usage = mem_cgroup_usage(memcg, swap);
3848
3849        /*
3850         * current_threshold points to threshold just below or equal to usage.
3851         * If it's not true, a threshold was crossed after last
3852         * call of __mem_cgroup_threshold().
3853         */
3854        i = t->current_threshold;
3855
3856        /*
3857         * Iterate backward over array of thresholds starting from
3858         * current_threshold and check if a threshold is crossed.
3859         * If none of thresholds below usage is crossed, we read
3860         * only one element of the array here.
3861         */
3862        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3863                eventfd_signal(t->entries[i].eventfd, 1);
3864
3865        /* i = current_threshold + 1 */
3866        i++;
3867
3868        /*
3869         * Iterate forward over array of thresholds starting from
3870         * current_threshold+1 and check if a threshold is crossed.
3871         * If none of thresholds above usage is crossed, we read
3872         * only one element of the array here.
3873         */
3874        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3875                eventfd_signal(t->entries[i].eventfd, 1);
3876
3877        /* Update current_threshold */
3878        t->current_threshold = i - 1;
3879unlock:
3880        rcu_read_unlock();
3881}
3882
3883static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3884{
3885        while (memcg) {
3886                __mem_cgroup_threshold(memcg, false);
3887                if (do_memsw_account())
3888                        __mem_cgroup_threshold(memcg, true);
3889
3890                memcg = parent_mem_cgroup(memcg);
3891        }
3892}
3893
3894static int compare_thresholds(const void *a, const void *b)
3895{
3896        const struct mem_cgroup_threshold *_a = a;
3897        const struct mem_cgroup_threshold *_b = b;
3898
3899        if (_a->threshold > _b->threshold)
3900                return 1;
3901
3902        if (_a->threshold < _b->threshold)
3903                return -1;
3904
3905        return 0;
3906}
3907
3908static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3909{
3910        struct mem_cgroup_eventfd_list *ev;
3911
3912        spin_lock(&memcg_oom_lock);
3913
3914        list_for_each_entry(ev, &memcg->oom_notify, list)
3915                eventfd_signal(ev->eventfd, 1);
3916
3917        spin_unlock(&memcg_oom_lock);
3918        return 0;
3919}
3920
3921static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3922{
3923        struct mem_cgroup *iter;
3924
3925        for_each_mem_cgroup_tree(iter, memcg)
3926                mem_cgroup_oom_notify_cb(iter);
3927}
3928
3929static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3930        struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3931{
3932        struct mem_cgroup_thresholds *thresholds;
3933        struct mem_cgroup_threshold_ary *new;
3934        unsigned long threshold;
3935        unsigned long usage;
3936        int i, size, ret;
3937
3938        ret = page_counter_memparse(args, "-1", &threshold);
3939        if (ret)
3940                return ret;
3941
3942        mutex_lock(&memcg->thresholds_lock);
3943
3944        if (type == _MEM) {
3945                thresholds = &memcg->thresholds;
3946                usage = mem_cgroup_usage(memcg, false);
3947        } else if (type == _MEMSWAP) {
3948                thresholds = &memcg->memsw_thresholds;
3949                usage = mem_cgroup_usage(memcg, true);
3950        } else
3951                BUG();
3952
3953        /* Check if a threshold crossed before adding a new one */
3954        if (thresholds->primary)
3955                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3956
3957        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3958
3959        /* Allocate memory for new array of thresholds */
3960        new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
3961        if (!new) {
3962                ret = -ENOMEM;
3963                goto unlock;
3964        }
3965        new->size = size;
3966
3967        /* Copy thresholds (if any) to new array */
3968        if (thresholds->primary) {
3969                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3970                                sizeof(struct mem_cgroup_threshold));
3971        }
3972
3973        /* Add new threshold */
3974        new->entries[size - 1].eventfd = eventfd;
3975        new->entries[size - 1].threshold = threshold;
3976
3977        /* Sort thresholds. Registering of new threshold isn't time-critical */
3978        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3979                        compare_thresholds, NULL);
3980
3981        /* Find current threshold */
3982        new->current_threshold = -1;
3983        for (i = 0; i < size; i++) {
3984                if (new->entries[i].threshold <= usage) {
3985                        /*
3986                         * new->current_threshold will not be used until
3987                         * rcu_assign_pointer(), so it's safe to increment
3988                         * it here.
3989                         */
3990                        ++new->current_threshold;
3991                } else
3992                        break;
3993        }
3994
3995        /* Free old spare buffer and save old primary buffer as spare */
3996        kfree(thresholds->spare);
3997        thresholds->spare = thresholds->primary;
3998
3999        rcu_assign_pointer(thresholds->primary, new);
4000

4001        /* To be sure that nobody uses thresholds */
4002        synchronize_rcu();
4003
4004unlock:
4005        mutex_unlock(&memcg->thresholds_lock);
4006
4007        return ret;
4008}
4009
4010static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4011        struct eventfd_ctx *eventfd, const char *args)
4012{
4013        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4014}
4015
4016static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4017        struct eventfd_ctx *eventfd, const char *args)
4018{
4019        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4020}
4021
4022static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4023        struct eventfd_ctx *eventfd, enum res_type type)
4024{
4025        struct mem_cgroup_thresholds *thresholds;
4026        struct mem_cgroup_threshold_ary *new;
4027        unsigned long usage;
4028        int i, j, size;
4029
4030        mutex_lock(&memcg->thresholds_lock);
4031
4032        if (type == _MEM) {
4033                thresholds = &memcg->thresholds;
4034                usage = mem_cgroup_usage(memcg, false);
4035        } else if (type == _MEMSWAP) {
4036                thresholds = &memcg->memsw_thresholds;
4037                usage = mem_cgroup_usage(memcg, true);
4038        } else
4039                BUG();
4040
4041        if (!thresholds->primary)
4042                goto unlock;
4043
4044        /* Check if a threshold crossed before removing */
4045        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4046
4047        /* Calculate new number of threshold */
4048        size = 0;
4049        for (i = 0; i < thresholds->primary->size; i++) {
4050                if (thresholds->primary->entries[i].eventfd != eventfd)
4051                        size++;
4052        }
4053
4054        new = thresholds->spare;
4055
4056        /* Set thresholds array to NULL if we don't have thresholds */
4057        if (!size) {
4058                kfree(new);
4059                new = NULL;
4060                goto swap_buffers;
4061        }
4062
4063        new->size = size;
4064
4065        /* Copy thresholds and find current threshold */
4066        new->current_threshold = -1;
4067        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4068                if (thresholds->primary->entries[i].eventfd == eventfd)
4069                        continue;
4070
4071                new->entries[j] = thresholds->primary->entries[i];
4072                if (new->entries[j].threshold <= usage) {
4073                        /*
4074                         * new->current_threshold will not be used
4075                         * until rcu_assign_pointer(), so it's safe to increment
4076                         * it here.
4077                         */
4078                        ++new->current_threshold;
4079                }
4080                j++;
4081        }
4082
4083swap_buffers:
4084        /* Swap primary and spare array */
4085        thresholds->spare = thresholds->primary;
4086
4087        rcu_assign_pointer(thresholds->primary, new);
4088
4089        /* To be sure that nobody uses thresholds */
4090        synchronize_rcu();
4091
4092        /* If all events are unregistered, free the spare array */
4093        if (!new) {
4094                kfree(thresholds->spare);
4095                thresholds->spare = NULL;
4096        }
4097unlock:
4098        mutex_unlock(&memcg->thresholds_lock);
4099}
4100
4101static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4102        struct eventfd_ctx *eventfd)
4103{
4104        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4105}
4106
4107static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4108        struct eventfd_ctx *eventfd)
4109{
4110        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4111}
4112
4113static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4114        struct eventfd_ctx *eventfd, const char *args)
4115{
4116        struct mem_cgroup_eventfd_list *event;
4117
4118        event = kmalloc(sizeof(*event), GFP_KERNEL);
4119        if (!event)
4120                return -ENOMEM;
4121
4122        spin_lock(&memcg_oom_lock);
4123
4124        event->eventfd = eventfd;
4125        list_add(&event->list, &memcg->oom_notify);
4126
4127        /* already in OOM ? */
4128        if (memcg->under_oom)
4129                eventfd_signal(eventfd, 1);
4130        spin_unlock(&memcg_oom_lock);
4131
4132        return 0;
4133}
4134
4135static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4136        struct eventfd_ctx *eventfd)
4137{
4138        struct mem_cgroup_eventfd_list *ev, *tmp;
4139
4140        spin_lock(&memcg_oom_lock);
4141
4142        list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4143                if (ev->eventfd == eventfd) {
4144                        list_del(&ev->list);
4145                        kfree(ev);
4146                }
4147        }
4148
4149        spin_unlock(&memcg_oom_lock);
4150}
4151
4152static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4153{
4154        struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4155
4156        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4157        seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4158        seq_printf(sf, "oom_kill %lu\n",
4159                   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4160        return 0;
4161}
4162
4163static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4164        struct cftype *cft, u64 val)
4165{
4166        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4167
4168        /* cannot set to root cgroup and only 0 and 1 are allowed */
4169        if (!css->parent || !((val == 0) || (val == 1)))
4170                return -EINVAL;
4171
4172        memcg->oom_kill_disable = val;
4173        if (!val)
4174                memcg_oom_recover(memcg);
4175
4176        return 0;
4177}
4178
4179#ifdef CONFIG_CGROUP_WRITEBACK
4180
4181#include <trace/events/writeback.h>
4182
4183static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4184{
4185        return wb_domain_init(&memcg->cgwb_domain, gfp);
4186}
4187
4188static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4189{
4190        wb_domain_exit(&memcg->cgwb_domain);
4191}
4192
4193static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4194{
4195        wb_domain_size_changed(&memcg->cgwb_domain);
4196}
4197
4198struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4199{
4200        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4201
4202        if (!memcg->css.parent)
4203                return NULL;
4204
4205        return &memcg->cgwb_domain;
4206}
4207
4208/*
4209 * idx can be of type enum memcg_stat_item or node_stat_item.
4210 * Keep in sync with memcg_exact_page().
4211 */
4212static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4213{
4214        long x = atomic_long_read(&memcg->vmstats[idx]);
4215        int cpu;
4216
4217        for_each_online_cpu(cpu)
4218                x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4219        if (x < 0)
4220                x = 0;
4221        return x;
4222}
4223
4224/**
4225 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4226 * @wb: bdi_writeback in question
4227 * @pfilepages: out parameter for number of file pages
4228 * @pheadroom: out parameter for number of allocatable pages according to memcg
4229 * @pdirty: out parameter for number of dirty pages
4230 * @pwriteback: out parameter for number of pages under writeback
4231 *
4232 * Determine the numbers of file, headroom, dirty, and writeback pages in
4233 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4234 * is a bit more involved.
4235 *
4236 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4237 * headroom is calculated as the lowest headroom of itself and the
4238 * ancestors.  Note that this doesn't consider the actual amount of
4239 * available memory in the system.  The caller should further cap
4240 * *@pheadroom accordingly.
4241 */
4242void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4243                         unsigned long *pheadroom, unsigned long *pdirty,
4244                         unsigned long *pwriteback)
4245{
4246        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4247        struct mem_cgroup *parent;
4248
4249        *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4250
4251        /* this should eventually include NR_UNSTABLE_NFS */
4252        *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4253        *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4254                        memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4255        *pheadroom = PAGE_COUNTER_MAX;
4256
4257        while ((parent = parent_mem_cgroup(memcg))) {
4258                unsigned long ceiling = min(memcg->memory.max, memcg->high);
4259                unsigned long used = page_counter_read(&memcg->memory);
4260
4261                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4262                memcg = parent;
4263        }
4264}
4265
4266/*
4267 * Foreign dirty flushing
4268 *
4269 * There's an inherent mismatch between memcg and writeback.  The former
4270 * tracks ownership per-page while the latter per-inode.  This was a
4271 * deliberate design decision because honoring per-page ownership in the
4272 * writeback path is complicated, may lead to higher CPU and IO overheads
4273 * and deemed unnecessary given that write-sharing an inode across
4274 * different cgroups isn't a common use-case.
4275 *
4276 * Combined with inode majority-writer ownership switching, this works well
4277 * enough in most cases but there are some pathological cases.  For
4278 * example, let's say there are two cgroups A and B which keep writing to
4279 * different but confined parts of the same inode.  B owns the inode and
4280 * A's memory is limited far below B's.  A's dirty ratio can rise enough to
4281 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4282 * triggering background writeback.  A will be slowed down without a way to
4283 * make writeback of the dirty pages happen.
4284 *
4285 * Conditions like the above can lead to a cgroup getting repeatedly and
4286 * severely throttled after making some progress after each
4287 * dirty_expire_interval while the underlying IO device is almost
4288 * completely idle.
4289 *
4290 * Solving this problem completely requires matching the ownership tracking
4291 * granularities between memcg and writeback in either direction.  However,
4292 * the more egregious behaviors can be avoided by simply remembering the
4293 * most recent foreign dirtying events and initiating remote flushes on
4294 * them when local writeback isn't enough to keep the memory clean enough.
4295 *
4296 * The following two functions implement such mechanism.  When a foreign
4297 * page - a page whose memcg and writeback ownerships don't match - is
4298 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4299 * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
4300 * decides that the memcg needs to sleep due to high dirty ratio, it calls
4301 * mem_cgroup_flush_foreign() which queues writeback on the recorded
4302 * foreign bdi_writebacks which haven't expired.  Both the numbers of
4303 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4304 * limited to MEMCG_CGWB_FRN_CNT.
4305 *
4306 * The mechanism only remembers IDs and doesn't hold any object references.
4307 * As being wrong occasionally doesn't matter, updates and accesses to the
4308 * records are lockless and racy.
4309 */
4310void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4311                                             struct bdi_writeback *wb)
4312{
4313        struct mem_cgroup *memcg = page->mem_cgroup;
4314        struct memcg_cgwb_frn *frn;
4315        u64 now = get_jiffies_64();
4316        u64 oldest_at = now;
4317        int oldest = -1;
4318        int i;
4319
4320        trace_track_foreign_dirty(page, wb);
4321
4322        /*
4323         * Pick the slot to use.  If there is already a slot for @wb, keep
4324         * using it.  If not replace the oldest one which isn't being
4325         * written out.
4326         */
4327        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4328                frn = &memcg->cgwb_frn[i];
4329                if (frn->bdi_id == wb->bdi->id &&
4330                    frn->memcg_id == wb->memcg_css->id)
4331                        break;
4332                if (time_before64(frn->at, oldest_at) &&
4333                    atomic_read(&frn->done.cnt) == 1) {
4334                        oldest = i;
4335                        oldest_at = frn->at;
4336                }
4337        }
4338
4339        if (i < MEMCG_CGWB_FRN_CNT) {
4340                /*
4341                 * Re-using an existing one.  Update timestamp lazily to
4342                 * avoid making the cacheline hot.  We want them to be
4343                 * reasonably up-to-date and significantly shorter than
4344                 * dirty_expire_interval as that's what expires the record.
4345                 * Use the shorter of 1s and dirty_expire_interval / 8.
4346                 */
4347                unsigned long update_intv =
4348                        min_t(unsigned long, HZ,
4349                              msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4350
4351                if (time_before64(frn->at, now - update_intv))
4352                        frn->at = now;
4353        } else if (oldest >= 0) {
4354                /* replace the oldest free one */
4355                frn = &memcg->cgwb_frn[oldest];
4356                frn->bdi_id = wb->bdi->id;
4357                frn->memcg_id = wb->memcg_css->id;
4358                frn->at = now;
4359        }
4360}
4361
4362/* issue foreign writeback flushes for recorded foreign dirtying events */
4363void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4364{
4365        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4366        unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4367        u64 now = jiffies_64;
4368        int i;
4369
4370        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4371                struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4372
4373                /*
4374                 * If the record is older than dirty_expire_interval,
4375                 * writeback on it has already started.  No need to kick it
4376                 * off again.  Also, don't start a new one if there's
4377                 * already one in flight.
4378                 */
4379                if (time_after64(frn->at, now - intv) &&
4380                    atomic_read(&frn->done.cnt) == 1) {
4381                        frn->at = 0;
4382                        trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4383                        cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4384                                               WB_REASON_FOREIGN_FLUSH,
4385                                               &frn->done);
4386                }
4387        }
4388}
4389
4390#else   /* CONFIG_CGROUP_WRITEBACK */
4391
4392static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4393{
4394        return 0;
4395}
4396
4397static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4398{
4399}
4400
4401static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4402{
4403}
4404
4405#endif  /* CONFIG_CGROUP_WRITEBACK */
4406
4407/*
4408 * DO NOT USE IN NEW FILES.
4409 *
4410 * "cgroup.event_control" implementation.
4411 *
4412 * This is way over-engineered.  It tries to support fully configurable
4413 * events for each user.  Such level of flexibility is completely
4414 * unnecessary especially in the light of the planned unified hierarchy.
4415 *
4416 * Please deprecate this and replace with something simpler if at all
4417 * possible.
4418 */
4419
4420/*
4421 * Unregister event and free resources.
4422 *
4423 * Gets called from workqueue.
4424 */
4425static void memcg_event_remove(struct work_struct *work)
4426{
4427        struct mem_cgroup_event *event =
4428                container_of(work, struct mem_cgroup_event, remove);
4429        struct mem_cgroup *memcg = event->memcg;
4430
4431        remove_wait_queue(event->wqh, &event->wait);
4432
4433        event->unregister_event(memcg, event->eventfd);
4434
4435        /* Notify userspace the event is going away. */
4436        eventfd_signal(event->eventfd, 1);
4437
4438        eventfd_ctx_put(event->eventfd);
4439        kfree(event);
4440        css_put(&memcg->css);
4441}
4442
4443/*
4444 * Gets called on EPOLLHUP on eventfd when user closes it.
4445 *
4446 * Called with wqh->lock held and interrupts disabled.
4447 */
4448static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4449                            int sync, void *key)
4450{
4451        struct mem_cgroup_event *event =
4452                container_of(wait, struct mem_cgroup_event, wait);
4453        struct mem_cgroup *memcg = event->memcg;
4454        __poll_t flags = key_to_poll(key);
4455
4456        if (flags & EPOLLHUP) {
4457                /*
4458                 * If the event has been detached at cgroup removal, we
4459                 * can simply return knowing the other side will cleanup
4460                 * for us.
4461                 *
4462                 * We can't race against event freeing since the other
4463                 * side will require wqh->lock via remove_wait_queue(),
4464                 * which we hold.
4465                 */
4466                spin_lock(&memcg->event_list_lock);
4467                if (!list_empty(&event->list)) {
4468                        list_del_init(&event->list);
4469                        /*
4470                         * We are in atomic context, but cgroup_event_remove()
4471                         * may sleep, so we have to call it in workqueue.
4472                         */
4473                        schedule_work(&event->remove);
4474                }
4475                spin_unlock(&memcg->event_list_lock);
4476        }
4477
4478        return 0;
4479}
4480
4481static void memcg_event_ptable_queue_proc(struct file *file,
4482                wait_queue_head_t *wqh, poll_table *pt)
4483{
4484        struct mem_cgroup_event *event =
4485                container_of(pt, struct mem_cgroup_event, pt);
4486
4487        event->wqh = wqh;
4488        add_wait_queue(wqh, &event->wait);
4489}
4490
4491/*
4492 * DO NOT USE IN NEW FILES.
4493 *
4494 * Parse input and register new cgroup event handler.
4495 *
4496 * Input must be in format '<event_fd> <control_fd> <args>'.
4497 * Interpretation of args is defined by control file implementation.
4498 */
4499static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4500                                         char *buf, size_t nbytes, loff_t off)
4501{
4502        struct cgroup_subsys_state *css = of_css(of);
4503        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4504        struct mem_cgroup_event *event;
4505        struct cgroup_subsys_state *cfile_css;
4506        unsigned int efd, cfd;
4507        struct fd efile;
4508        struct fd cfile;
4509        const char *name;
4510        char *endp;
4511        int ret;
4512
4513        buf = strstrip(buf);
4514
4515        efd = simple_strtoul(buf, &endp, 10);
4516        if (*endp != ' ')
4517                return -EINVAL;
4518        buf = endp + 1;
4519
4520        cfd = simple_strtoul(buf, &endp, 10);
4521        if ((*endp != ' ') && (*endp != '\0'))
4522                return -EINVAL;
4523        buf = endp + 1;
4524
4525        event = kzalloc(sizeof(*event), GFP_KERNEL);
4526        if (!event)
4527                return -ENOMEM;
4528
4529        event->memcg = memcg;
4530        INIT_LIST_HEAD(&event->list);
4531        init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4532        init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4533        INIT_WORK(&event->remove, memcg_event_remove);
4534
4535        efile = fdget(efd);
4536        if (!efile.file) {
4537                ret = -EBADF;
4538                goto out_kfree;
4539        }
4540
4541        event->eventfd = eventfd_ctx_fileget(efile.file);
4542        if (IS_ERR(event->eventfd)) {
4543                ret = PTR_ERR(event->eventfd);
4544                goto out_put_efile;
4545        }
4546
4547        cfile = fdget(cfd);
4548        if (!cfile.file) {
4549                ret = -EBADF;
4550                goto out_put_eventfd;
4551        }
4552
4553        /* the process need read permission on control file */
4554        /* AV: shouldn't we check that it's been opened for read instead? */
4555        ret = inode_permission(file_inode(cfile.file), MAY_READ);
4556        if (ret < 0)
4557                goto out_put_cfile;
4558
4559        /*
4560         * Determine the event callbacks and set them in @event.  This used
4561         * to be done via struct cftype but cgroup core no longer knows
4562         * about these events.  The following is crude but the whole thing
4563         * is for compatibility anyway.
4564         *
4565         * DO NOT ADD NEW FILES.
4566         */
4567        name = cfile.file->f_path.dentry->d_name.name;
4568
4569        if (!strcmp(name, "memory.usage_in_bytes")) {
4570                event->register_event = mem_cgroup_usage_register_event;
4571                event->unregister_event = mem_cgroup_usage_unregister_event;
4572        } else if (!strcmp(name, "memory.oom_control")) {
4573                event->register_event = mem_cgroup_oom_register_event;
4574                event->unregister_event = mem_cgroup_oom_unregister_event;
4575        } else if (!strcmp(name, "memory.pressure_level")) {
4576                event->register_event = vmpressure_register_event;
4577                event->unregister_event = vmpressure_unregister_event;
4578        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4579                event->register_event = memsw_cgroup_usage_register_event;
4580                event->unregister_event = memsw_cgroup_usage_unregister_event;
4581        } else {
4582                ret = -EINVAL;
4583                goto out_put_cfile;
4584        }
4585
4586        /*
4587         * Verify @cfile should belong to @css.  Also, remaining events are
4588         * automatically removed on cgroup destruction but the removal is
4589         * asynchronous, so take an extra ref on @css.
4590         */
4591        cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4592                                               &memory_cgrp_subsys);
4593        ret = -EINVAL;
4594        if (IS_ERR(cfile_css))
4595                goto out_put_cfile;
4596        if (cfile_css != css) {
4597                css_put(cfile_css);
4598                goto out_put_cfile;
4599        }
4600
4601        ret = event->register_event(memcg, event->eventfd, buf);
4602        if (ret)
4603                goto out_put_css;
4604
4605        vfs_poll(efile.file, &event->pt);
4606
4607        spin_lock(&memcg->event_list_lock);
4608        list_add(&event->list, &memcg->event_list);
4609        spin_unlock(&memcg->event_list_lock);
4610
4611        fdput(cfile);
4612        fdput(efile);
4613
4614        return nbytes;
4615
4616out_put_css:
4617        css_put(css);
4618out_put_cfile:
4619        fdput(cfile);
4620out_put_eventfd:
4621        eventfd_ctx_put(event->eventfd);
4622out_put_efile:
4623        fdput(efile);
4624out_kfree:
4625        kfree(event);
4626
4627        return ret;
4628}
4629
4630static struct cftype mem_cgroup_legacy_files[] = {
4631        {
4632                .name = "usage_in_bytes",
4633                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4634                .read_u64 = mem_cgroup_read_u64,
4635        },
4636        {
4637                .name = "max_usage_in_bytes",
4638                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4639                .write = mem_cgroup_reset,
4640                .read_u64 = mem_cgroup_read_u64,
4641        },
4642        {
4643                .name = "limit_in_bytes",
4644                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4645                .write = mem_cgroup_write,
4646                .read_u64 = mem_cgroup_read_u64,
4647        },
4648        {
4649                .name = "soft_limit_in_bytes",
4650                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4651                .write = mem_cgroup_write,
4652                .read_u64 = mem_cgroup_read_u64,
4653        },
4654        {
4655                .name = "failcnt",
4656                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4657                .write = mem_cgroup_reset,
4658                .read_u64 = mem_cgroup_read_u64,
4659        },
4660        {
4661                .name = "stat",
4662                .seq_show = memcg_stat_show,
4663        },
4664        {
4665                .name = "force_empty",
4666                .write = mem_cgroup_force_empty_write,
4667        },
4668        {
4669                .name = "use_hierarchy",
4670                .write_u64 = mem_cgroup_hierarchy_write,
4671                .read_u64 = mem_cgroup_hierarchy_read,
4672        },
4673        {
4674                .name = "cgroup.event_control",         /* XXX: for compat */
4675                .write = memcg_write_event_control,
4676                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4677        },
4678        {
4679                .name = "swappiness",
4680                .read_u64 = mem_cgroup_swappiness_read,
4681                .write_u64 = mem_cgroup_swappiness_write,
4682        },
4683        {
4684                .name = "move_charge_at_immigrate",
4685                .read_u64 = mem_cgroup_move_charge_read,
4686                .write_u64 = mem_cgroup_move_charge_write,
4687        },
4688        {
4689                .name = "oom_control",
4690                .seq_show = mem_cgroup_oom_control_read,
4691                .write_u64 = mem_cgroup_oom_control_write,
4692                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4693        },
4694        {
4695                .name = "pressure_level",
4696        },
4697#ifdef CONFIG_NUMA
4698        {
4699                .name = "numa_stat",
4700                .seq_show = memcg_numa_stat_show,
4701        },
4702#endif
4703        {
4704                .name = "kmem.limit_in_bytes",
4705                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4706                .write = mem_cgroup_write,
4707                .read_u64 = mem_cgroup_read_u64,
4708        },
4709        {
4710                .name = "kmem.usage_in_bytes",
4711                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4712                .read_u64 = mem_cgroup_read_u64,
4713        },
4714        {
4715                .name = "kmem.failcnt",
4716                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4717                .write = mem_cgroup_reset,
4718                .read_u64 = mem_cgroup_read_u64,
4719        },
4720        {
4721                .name = "kmem.max_usage_in_bytes",
4722                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4723                .write = mem_cgroup_reset,
4724                .read_u64 = mem_cgroup_read_u64,
4725        },
4726#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4727        {
4728                .name = "kmem.slabinfo",
4729                .seq_start = memcg_slab_start,
4730                .seq_next = memcg_slab_next,
4731                .seq_stop = memcg_slab_stop,
4732                .seq_show = memcg_slab_show,
4733        },
4734#endif
4735        {
4736                .name = "kmem.tcp.limit_in_bytes",
4737                .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4738                .write = mem_cgroup_write,
4739                .read_u64 = mem_cgroup_read_u64,
4740        },
4741        {
4742                .name = "kmem.tcp.usage_in_bytes",
4743                .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4744                .read_u64 = mem_cgroup_read_u64,
4745        },
4746        {
4747                .name = "kmem.tcp.failcnt",
4748                .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4749                .write = mem_cgroup_reset,
4750                .read_u64 = mem_cgroup_read_u64,
4751        },
4752        {
4753                .name = "kmem.tcp.max_usage_in_bytes",
4754                .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4755                .write = mem_cgroup_reset,
4756                .read_u64 = mem_cgroup_read_u64,
4757        },
4758        { },    /* terminate */
4759};
4760
4761/*
4762 * Private memory cgroup IDR
4763 *
4764 * Swap-out records and page cache shadow entries need to store memcg
4765 * references in constrained space, so we maintain an ID space that is
4766 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4767 * memory-controlled cgroups to 64k.
4768 *
4769 * However, there usually are many references to the offline CSS after
4770 * the cgroup has been destroyed, such as page cache or reclaimable
4771 * slab objects, that don't need to hang on to the ID. We want to keep
4772 * those dead CSS from occupying IDs, or we might quickly exhaust the
4773 * relatively small ID space and prevent the creation of new cgroups
4774 * even when there are much fewer than 64k cgroups - possibly none.
4775 *
4776 * Maintain a private 16-bit ID space for memcg, and allow the ID to
4777 * be freed and recycled when it's no longer needed, which is usually
4778 * when the CSS is offlined.
4779 *
4780 * The only exception to that are records of swapped out tmpfs/shmem
4781 * pages that need to be attributed to live ancestors on swapin. But
4782 * those references are manageable from userspace.
4783 */
4784
4785static DEFINE_IDR(mem_cgroup_idr);
4786
4787static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4788{
4789        if (memcg->id.id > 0) {
4790                idr_remove(&mem_cgroup_idr, memcg->id.id);
4791                memcg->id.id = 0;
4792        }
4793}
4794
4795static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4796{
4797        refcount_add(n, &memcg->id.ref);
4798}
4799
4800static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4801{
4802        if (refcount_sub_and_test(n, &memcg->id.ref)) {
4803                mem_cgroup_id_remove(memcg);
4804
4805                /* Memcg ID pins CSS */
4806                css_put(&memcg->css);
4807        }
4808}
4809
4810static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4811{
4812        mem_cgroup_id_put_many(memcg, 1);
4813}
4814
4815/**
4816 * mem_cgroup_from_id - look up a memcg from a memcg id
4817 * @id: the memcg id to look up
4818 *
4819 * Caller must hold rcu_read_lock().
4820 */
4821struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4822{
4823        WARN_ON_ONCE(!rcu_read_lock_held());
4824        return idr_find(&mem_cgroup_idr, id);
4825}
4826
4827static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4828{
4829        struct mem_cgroup_per_node *pn;
4830        int tmp = node;
4831        /*
4832         * This routine is called against possible nodes.
4833         * But it's BUG to call kmalloc() against offline node.
4834         *
4835         * TODO: this routine can waste much memory for nodes which will
4836         *       never be onlined. It's better to use memory hotplug callback
4837         *       function.
4838         */
4839        if (!node_state(node, N_NORMAL_MEMORY))
4840                tmp = -1;
4841        pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4842        if (!pn)
4843                return 1;
4844
4845        pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4846        if (!pn->lruvec_stat_local) {
4847                kfree(pn);
4848                return 1;
4849        }
4850
4851        pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4852        if (!pn->lruvec_stat_cpu) {
4853                free_percpu(pn->lruvec_stat_local);
4854                kfree(pn);
4855                return 1;
4856        }
4857
4858        lruvec_init(&pn->lruvec);
4859        pn->usage_in_excess = 0;
4860        pn->on_tree = false;
4861        pn->memcg = memcg;
4862
4863        memcg->nodeinfo[node] = pn;
4864        return 0;
4865}
4866
4867static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4868{
4869        struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4870
4871        if (!pn)
4872                return;
4873
4874        free_percpu(pn->lruvec_stat_cpu);
4875        free_percpu(pn->lruvec_stat_local);
4876        kfree(pn);
4877}
4878
4879static void __mem_cgroup_free(struct mem_cgroup *memcg)
4880{
4881        int node;
4882
4883        for_each_node(node)
4884                free_mem_cgroup_per_node_info(memcg, node);
4885        free_percpu(memcg->vmstats_percpu);
4886        free_percpu(memcg->vmstats_local);
4887        kfree(memcg);
4888}
4889
/*
 * Full teardown of @memcg: shut down its writeback domain, flush the
 * percpu counters and then free the memory.
 */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
	memcg_wb_domain_exit(memcg);

	/*
	 * The percpu vmstats and vmevents have to be flushed before the
	 * memory goes away so that the values seen on the parent's and all
	 * ancestor levels stay correct.
	 */
	memcg_flush_percpu_vmstats(memcg);
	memcg_flush_percpu_vmevents(memcg);

	__mem_cgroup_free(memcg);
}
4901
4902static struct mem_cgroup *mem_cgroup_alloc(void)
4903{
4904        struct mem_cgroup *memcg;
4905        unsigned int size;
4906        int node;
4907        int __maybe_unused i;
4908
4909        size = sizeof(struct mem_cgroup);
4910        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4911
4912        memcg = kzalloc(size, GFP_KERNEL);
4913        if (!memcg)
4914                return NULL;
4915
4916        memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4917                                 1, MEM_CGROUP_ID_MAX,
4918                                 GFP_KERNEL);
4919        if (memcg->id.id < 0)
4920                goto fail;
4921
4922        memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
4923        if (!memcg->vmstats_local)
4924                goto fail;
4925
4926        memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
4927        if (!memcg->vmstats_percpu)
4928                goto fail;
4929
4930        for_each_node(node)
4931                if (alloc_mem_cgroup_per_node_info(memcg, node))
4932                        goto fail;
4933
4934        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4935                goto fail;
4936
4937        INIT_WORK(&memcg->high_work, high_work_func);
4938        INIT_LIST_HEAD(&memcg->oom_notify);
4939        mutex_init(&memcg->thresholds_lock);
4940        spin_lock_init(&memcg->move_lock);
4941        vmpressure_init(&memcg->vmpressure);
4942        INIT_LIST_HEAD(&memcg->event_list);
4943        spin_lock_init(&memcg->event_list_lock);
4944        memcg->socket_pressure = jiffies;
4945#ifdef CONFIG_MEMCG_KMEM
4946        memcg->kmemcg_id = -1;
4947#endif
4948#ifdef CONFIG_CGROUP_WRITEBACK
4949        INIT_LIST_HEAD(&memcg->cgwb_list);
4950        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
4951                memcg->cgwb_frn[i].done =
4952                        __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
4953#endif
4954#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4955        spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
4956        INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
4957        memcg->deferred_split_queue.split_queue_len = 0;
4958#endif
4959        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4960        return memcg;
4961fail:
4962        mem_cgroup_id_remove(memcg);
4963        __mem_cgroup_free(memcg);
4964        return NULL;
4965}
4966
4967static struct cgroup_subsys_state * __ref
4968mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4969{
4970        struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4971        struct mem_cgroup *memcg;
4972        long error = -ENOMEM;
4973
4974        memcg = mem_cgroup_alloc();
4975        if (!memcg)
4976                return ERR_PTR(error);
4977
4978        memcg->high = PAGE_COUNTER_MAX;
4979        memcg->soft_limit = PAGE_COUNTER_MAX;
4980        if (parent) {
4981                memcg->swappiness = mem_cgroup_swappiness(parent);
4982                memcg->oom_kill_disable = parent->oom_kill_disable;
4983        }
4984        if (parent && parent->use_hierarchy) {
4985                memcg->use_hierarchy = true;
4986                page_counter_init(&memcg->memory, &parent->memory);
4987                page_counter_init(&memcg->swap, &parent->swap);
4988                page_counter_init(&memcg->memsw, &parent->memsw);
4989                page_counter_init(&memcg->kmem, &parent->kmem);
4990                page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4991        } else {
4992                page_counter_init(&memcg->memory, NULL);
4993                page_counter_init(&memcg->swap, NULL);
4994                page_counter_init(&memcg->memsw, NULL);
4995                page_counter_init(&memcg->kmem, NULL);
4996                page_counter_init(&memcg->tcpmem, NULL);
4997                /*
4998                 * Deeper hierachy with use_hierarchy == false doesn't make
4999                 * much sense so let cgroup subsystem know about this
5000                 * unfortunate state in our controller.

5001                 */
5002                if (parent != root_mem_cgroup)
5003                        memory_cgrp_subsys.broken_hierarchy = true;
5004        }
5005
5006        /* The following stuff does not apply to the root */
5007        if (!parent) {
5008#ifdef CONFIG_MEMCG_KMEM
5009                INIT_LIST_HEAD(&memcg->kmem_caches);
5010#endif
5011                root_mem_cgroup = memcg;
5012                return &memcg->css;
5013        }
5014
5015        error = memcg_online_kmem(memcg);
5016        if (error)
5017                goto fail;
5018
5019        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5020                static_branch_inc(&memcg_sockets_enabled_key);
5021
5022        return &memcg->css;
5023fail:
5024        mem_cgroup_id_remove(memcg);
5025        mem_cgroup_free(memcg);
5026        return ERR_PTR(-ENOMEM);
5027}
5028
5029static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5030{
5031        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5032
5033        /*
5034         * A memcg must be visible for memcg_expand_shrinker_maps()
5035         * by the time the maps are allocated. So, we allocate maps
5036         * here, when for_each_mem_cgroup() can't skip it.
5037         */
5038        if (memcg_alloc_shrinker_maps(memcg)) {
5039                mem_cgroup_id_remove(memcg);
5040                return -ENOMEM;
5041        }
5042
5043        /* Online state pins memcg ID, memcg ID pins CSS */
5044        refcount_set(&memcg->id.ref, 1);
5045        css_get(css);
5046        return 0;
5047}
5048
5049static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5050{
5051        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5052        struct mem_cgroup_event *event, *tmp;
5053
5054        /*
5055         * Unregister events and notify userspace.
5056         * Notify userspace about cgroup removing only after rmdir of cgroup
5057         * directory to avoid race between userspace and kernelspace.
5058         */
5059        spin_lock(&memcg->event_list_lock);
5060        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5061                list_del_init(&event->list);
5062                schedule_work(&event->remove);
5063        }
5064        spin_unlock(&memcg->event_list_lock);
5065
5066        page_counter_set_min(&memcg->memory, 0);
5067        page_counter_set_low(&memcg->memory, 0);
5068
5069        memcg_offline_kmem(memcg);
5070        wb_memcg_offline(memcg);
5071
5072        drain_all_stock(memcg);
5073
5074        mem_cgroup_id_put(memcg);
5075}
5076
/* css released callback: invalidate the reclaim iterators for this memcg. */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	invalidate_reclaim_iterators(mem_cgroup_from_css(css));
}
5083
5084static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5085{
5086        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5087        int __maybe_unused i;
5088
5089#ifdef CONFIG_CGROUP_WRITEBACK
5090        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5091                wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5092#endif
5093        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5094                static_branch_dec(&memcg_sockets_enabled_key);
5095
5096        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5097                static_branch_dec(&memcg_sockets_enabled_key);
5098
5099        vmpressure_cleanup(&memcg->vmpressure);
5100        cancel_work_sync(&memcg->high_work);
5101        mem_cgroup_remove_from_trees(memcg);
5102        memcg_free_shrinker_maps(memcg);
5103        memcg_free_kmem(memcg);
5104        mem_cgroup_free(memcg);
5105}
5106
5107/**
5108 * mem_cgroup_css_reset - reset the states of a mem_cgroup
5109 * @css: the target css
5110 *
5111 * Reset the states of the mem_cgroup associated with @css.  This is
5112 * invoked when the userland requests disabling on the default hierarchy
5113 * but the memcg is pinned through dependency.  The memcg should stop
5114 * applying policies and should revert to the vanilla state as it may be
5115 * made visible again.
5116 *
5117 * The current implementation only resets the essential configurations.
5118 * This needs to be expanded to cover all the visible parts.
5119 */
5120static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5121{
5122        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5123
5124        page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5125        page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5126        page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5127        page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5128        page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5129        page_counter_set_min(&memcg->memory, 0);
5130        page_counter_set_low(&memcg->memory, 0);
5131        memcg->high = PAGE_COUNTER_MAX;
5132        memcg->soft_limit = PAGE_COUNTER_MAX;
5133        memcg_wb_domain_size_changed(memcg);
5134}
5135
5136#ifdef CONFIG_MMU
5137/* Handlers for move charge at task migration. */
5138static int mem_cgroup_do_precharge(unsigned long count)
5139{
5140        int ret;
5141
5142        /* Try a single bulk charge without reclaim first, kswapd may wake */
5143        ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5144        if (!ret) {
5145                mc.precharge += count;
5146                return ret;
5147        }
5148
5149        /* Try charges one by one with reclaim, but do not retry */
5150        while (count--) {
5151                ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5152                if (ret)
5153                        return ret;
5154                mc.precharge++;
5155                cond_resched();
5156        }
5157        return 0;
5158}
5159
5160union mc_target {
5161        struct page     *page;
5162        swp_entry_t     ent;
5163};
5164
/* Classification of a page table entry for charge moving. */
enum mc_target_type {
        MC_TARGET_NONE   = 0,   /* nothing to move */
        MC_TARGET_PAGE   = 1,   /* a page charged to mc.from */
        MC_TARGET_SWAP   = 2,   /* a swap entry charged to mc.from */
        MC_TARGET_DEVICE = 3,   /* like PAGE, but MEMORY_DEVICE_PRIVATE */
};
5171
5172static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5173                                                unsigned long addr, pte_t ptent)
5174{
5175        struct page *page = vm_normal_page(vma, addr, ptent);
5176
5177        if (!page || !page_mapped(page))
5178                return NULL;
5179        if (PageAnon(page)) {
5180                if (!(mc.flags & MOVE_ANON))
5181                        return NULL;
5182        } else {
5183                if (!(mc.flags & MOVE_FILE))
5184                        return NULL;
5185        }
5186        if (!get_page_unless_zero(page))
5187                return NULL;
5188
5189        return page;
5190}
5191
5192#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5193static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5194                        pte_t ptent, swp_entry_t *entry)
5195{
5196        struct page *page = NULL;
5197        swp_entry_t ent = pte_to_swp_entry(ptent);
5198
5199        if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5200                return NULL;
5201
5202        /*
5203         * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
5204         * a device and because they are not accessible by CPU they are store
5205         * as special swap entry in the CPU page table.
5206         */
5207        if (is_device_private_entry(ent)) {
5208                page = device_private_entry_to_page(ent);
5209                /*
5210                 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
5211                 * a refcount of 1 when free (unlike normal page)
5212                 */
5213                if (!page_ref_add_unless(page, 1, 1))
5214                        return NULL;
5215                return page;
5216        }
5217
5218        /*
5219         * Because lookup_swap_cache() updates some statistics counter,
5220         * we call find_get_page() with swapper_space directly.
5221         */
5222        page = find_get_page(swap_address_space(ent), swp_offset(ent));
5223        if (do_memsw_account())
5224                entry->val = ent.val;
5225
5226        return page;
5227}
5228#else
5229static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5230                        pte_t ptent, swp_entry_t *entry)
5231{
5232        return NULL;
5233}
5234#endif
5235
5236static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5237                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
5238{
5239        struct page *page = NULL;
5240        struct address_space *mapping;
5241        pgoff_t pgoff;
5242
5243        if (!vma->vm_file) /* anonymous vma */
5244                return NULL;
5245        if (!(mc.flags & MOVE_FILE))
5246                return NULL;
5247
5248        mapping = vma->vm_file->f_mapping;
5249        pgoff = linear_page_index(vma, addr);
5250
5251        /* page is moved even if it's not RSS of this task(page-faulted). */
5252#ifdef CONFIG_SWAP
5253        /* shmem/tmpfs may report page out on swap: account for that too. */
5254        if (shmem_mapping(mapping)) {
5255                page = find_get_entry(mapping, pgoff);
5256                if (xa_is_value(page)) {
5257                        swp_entry_t swp = radix_to_swp_entry(page);
5258                        if (do_memsw_account())
5259                                *entry = swp;
5260                        page = find_get_page(swap_address_space(swp),
5261                                             swp_offset(swp));
5262                }
5263        } else
5264                page = find_get_page(mapping, pgoff);
5265#else
5266        page = find_get_page(mapping, pgoff);
5267#endif
5268        return page;
5269}
5270
5271/**
5272 * mem_cgroup_move_account - move account of the page
5273 * @page: the page
5274 * @compound: charge the page as compound or small page
5275 * @from: mem_cgroup which the page is moved from.
5276 * @to: mem_cgroup which the page is moved to. @from != @to.
5277 *
5278 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
5279 *
5280 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
5281 * from old cgroup.
5282 */
5283static int mem_cgroup_move_account(struct page *page,
5284                                   bool compound,
5285                                   struct mem_cgroup *from,
5286                                   struct mem_cgroup *to)
5287{
5288        struct lruvec *from_vec, *to_vec;
5289        struct pglist_data *pgdat;
5290        unsigned long flags;
5291        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5292        int ret;
5293        bool anon;
5294
5295        VM_BUG_ON(from == to);
5296        VM_BUG_ON_PAGE(PageLRU(page), page);
5297        VM_BUG_ON(compound && !PageTransHuge(page));
5298
5299        /*
5300         * Prevent mem_cgroup_migrate() from looking at
5301         * page->mem_cgroup of its source page while we change it.
5302         */
5303        ret = -EBUSY;
5304        if (!trylock_page(page))
5305                goto out;
5306
5307        ret = -EINVAL;
5308        if (page->mem_cgroup != from)
5309                goto out_unlock;
5310
5311        anon = PageAnon(page);
5312
5313        pgdat = page_pgdat(page);
5314        from_vec = mem_cgroup_lruvec(from, pgdat);
5315        to_vec = mem_cgroup_lruvec(to, pgdat);
5316
5317        spin_lock_irqsave(&from->move_lock, flags);
5318
5319        if (!anon && page_mapped(page)) {
5320                __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5321                __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5322        }
5323
5324        /*
5325         * move_lock grabbed above and caller set from->moving_account, so
5326         * mod_memcg_page_state will serialize updates to PageDirty.
5327         * So mapping should be stable for dirty pages.
5328         */
5329        if (!anon && PageDirty(page)) {
5330                struct address_space *mapping = page_mapping(page);
5331
5332                if (mapping_cap_account_dirty(mapping)) {
5333                        __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
5334                        __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
5335                }
5336        }
5337
5338        if (PageWriteback(page)) {
5339                __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5340                __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5341        }
5342
5343#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5344        if (compound && !list_empty(page_deferred_list(page))) {
5345                spin_lock(&from->deferred_split_queue.split_queue_lock);
5346                list_del_init(page_deferred_list(page));
5347                from->deferred_split_queue.split_queue_len--;
5348                spin_unlock(&from->deferred_split_queue.split_queue_lock);
5349        }
5350#endif
5351        /*
5352         * It is safe to change page->mem_cgroup here because the page
5353         * is referenced, charged, and isolated - we can't race with
5354         * uncharging, charging, migration, or LRU putback.
5355         */
5356
5357        /* caller should have done css_get */
5358        page->mem_cgroup = to;
5359
5360#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5361        if (compound && list_empty(page_deferred_list(page))) {
5362                spin_lock(&to->deferred_split_queue.split_queue_lock);
5363                list_add_tail(page_deferred_list(page),
5364                              &to->deferred_split_queue.split_queue);
5365                to->deferred_split_queue.split_queue_len++;
5366                spin_unlock(&to->deferred_split_queue.split_queue_lock);
5367        }
5368#endif
5369
5370        spin_unlock_irqrestore(&from->move_lock, flags);
5371
5372        ret = 0;
5373
5374        local_irq_disable();
5375        mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5376        memcg_check_events(to, page);
5377        mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5378        memcg_check_events(from, page);
5379        local_irq_enable();
5380out_unlock:
5381        unlock_page(page);
5382out:
5383        return ret;
5384}
5385
5386/**
5387 * get_mctgt_type - get target type of moving charge
5388 * @vma: the vma the pte to be checked belongs
5389 * @addr: the address corresponding to the pte to be checked
5390 * @ptent: the pte to be checked
5391 * @target: the pointer the target page or swap ent will be stored(can be NULL)
5392 *
5393 * Returns
5394 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
5395 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5396 *     move charge. if @target is not NULL, the page is stored in target->page
5397 *     with extra refcnt got(Callers should handle it).
5398 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5399 *     target for charge migration. if @target is not NULL, the entry is stored
5400 *     in target->ent.
5401 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
5402 *     (so ZONE_DEVICE page and thus not on the lru).
5403 *     For now we such page is charge like a regular page would be as for all
5404 *     intent and purposes it is just special memory taking the place of a
5405 *     regular page.
5406 *
5407 *     See Documentations/vm/hmm.txt and include/linux/hmm.h
5408 *
5409 * Called with pte lock held.
5410 */
5411
5412static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5413                unsigned long addr, pte_t ptent, union mc_target *target)
5414{
5415        struct page *page = NULL;
5416        enum mc_target_type ret = MC_TARGET_NONE;
5417        swp_entry_t ent = { .val = 0 };
5418
5419        if (pte_present(ptent))
5420                page = mc_handle_present_pte(vma, addr, ptent);
5421        else if (is_swap_pte(ptent))
5422                page = mc_handle_swap_pte(vma, ptent, &ent);
5423        else if (pte_none(ptent))
5424                page = mc_handle_file_pte(vma, addr, ptent, &ent);
5425
5426        if (!page && !ent.val)
5427                return ret;
5428        if (page) {
5429                /*
5430                 * Do only loose check w/o serialization.
5431                 * mem_cgroup_move_account() checks the page is valid or
5432                 * not under LRU exclusion.
5433                 */
5434                if (page->mem_cgroup == mc.from) {
5435                        ret = MC_TARGET_PAGE;
5436                        if (is_device_private_page(page))
5437                                ret = MC_TARGET_DEVICE;
5438                        if (target)
5439                                target->page = page;
5440                }
5441                if (!ret || !target)
5442                        put_page(page);
5443        }
5444        /*
5445         * There is a swap entry and a page doesn't exist or isn't charged.
5446         * But we cannot move a tail-page in a THP.
5447         */
5448        if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5449            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5450                ret = MC_TARGET_SWAP;
5451                if (target)
5452                        target->ent = ent;
5453        }
5454        return ret;
5455}
5456
5457#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5458/*
5459 * We don't consider PMD mapped swapping or file mapped pages because THP does
5460 * not support them for now.
5461 * Caller should make sure that pmd_trans_huge(pmd) is true.
5462 */
5463static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5464                unsigned long addr, pmd_t pmd, union mc_target *target)
5465{
5466        struct page *page = NULL;
5467        enum mc_target_type ret = MC_TARGET_NONE;
5468
5469        if (unlikely(is_swap_pmd(pmd))) {
5470                VM_BUG_ON(thp_migration_supported() &&
5471                                  !is_pmd_migration_entry(pmd));
5472                return ret;
5473        }
5474        page = pmd_page(pmd);
5475        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5476        if (!(mc.flags & MOVE_ANON))
5477                return ret;
5478        if (page->mem_cgroup == mc.from) {
5479                ret = MC_TARGET_PAGE;
5480                if (target) {
5481                        get_page(page);
5482                        target->page = page;
5483                }
5484        }
5485        return ret;
5486}
5487#else
5488static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5489                unsigned long addr, pmd_t pmd, union mc_target *target)
5490{
5491        return MC_TARGET_NONE;
5492}
5493#endif
5494
5495static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5496                                        unsigned long addr, unsigned long end,
5497                                        struct mm_walk *walk)
5498{
5499        struct vm_area_struct *vma = walk->vma;
5500        pte_t *pte;
5501        spinlock_t *ptl;
5502
5503        ptl = pmd_trans_huge_lock(pmd, vma);
5504        if (ptl) {
5505                /*
5506                 * Note their can not be MC_TARGET_DEVICE for now as we do not
5507                 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5508                 * this might change.
5509                 */
5510                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5511                        mc.precharge += HPAGE_PMD_NR;
5512                spin_unlock(ptl);
5513                return 0;
5514        }
5515
5516        if (pmd_trans_unstable(pmd))
5517                return 0;
5518        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5519        for (; addr != end; pte++, addr += PAGE_SIZE)
5520                if (get_mctgt_type(vma, addr, *pte, NULL))
5521                        mc.precharge++; /* increment precharge temporarily */
5522        pte_unmap_unlock(pte - 1, ptl);
5523        cond_resched();
5524
5525        return 0;
5526}
5527
5528static const struct mm_walk_ops precharge_walk_ops = {
5529        .pmd_entry      = mem_cgroup_count_precharge_pte_range,
5530};
5531
5532static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5533{
5534        unsigned long precharge;
5535
5536        down_read(&mm->mmap_sem);
5537        walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5538        up_read(&mm->mmap_sem);
5539
5540        precharge = mc.precharge;
5541        mc.precharge = 0;
5542
5543        return precharge;
5544}
5545
5546static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5547{
5548        unsigned long precharge = mem_cgroup_count_precharge(mm);
5549
5550        VM_BUG_ON(mc.moving_task);
5551        mc.moving_task = current;
5552        return mem_cgroup_do_precharge(precharge);
5553}
5554
5555/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5556static void __mem_cgroup_clear_mc(void)
5557{
5558        struct mem_cgroup *from = mc.from;
5559        struct mem_cgroup *to = mc.to;
5560
5561        /* we must uncharge all the leftover precharges from mc.to */
5562        if (mc.precharge) {
5563                cancel_charge(mc.to, mc.precharge);
5564                mc.precharge = 0;
5565        }
5566        /*
5567         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5568         * we must uncharge here.
5569         */
5570        if (mc.moved_charge) {
5571                cancel_charge(mc.from, mc.moved_charge);
5572                mc.moved_charge = 0;
5573        }
5574        /* we must fixup refcnts and charges */
5575        if (mc.moved_swap) {
5576                /* uncharge swap account from the old cgroup */
5577                if (!mem_cgroup_is_root(mc.from))
5578                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5579
5580                mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5581
5582                /*
5583                 * we charged both to->memory and to->memsw, so we
5584                 * should uncharge to->memory.
5585                 */
5586                if (!mem_cgroup_is_root(mc.to))
5587                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5588
5589                mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5590                css_put_many(&mc.to->css, mc.moved_swap);
5591
5592                mc.moved_swap = 0;
5593        }
5594        memcg_oom_recover(from);
5595        memcg_oom_recover(to);
5596        wake_up_all(&mc.waitq);
5597}
5598
5599static void mem_cgroup_clear_mc(void)
5600{
5601        struct mm_struct *mm = mc.mm;
5602
5603        /*
5604         * we must clear moving_task before waking up waiters at the end of
5605         * task migration.
5606         */
5607        mc.moving_task = NULL;
5608        __mem_cgroup_clear_mc();
5609        spin_lock(&mc.lock);
5610        mc.from = NULL;
5611        mc.to = NULL;
5612        mc.mm = NULL;
5613        spin_unlock(&mc.lock);
5614
5615        mmput(mm);
5616}
5617
5618static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5619{
5620        struct cgroup_subsys_state *css;
5621        struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
5622        struct mem_cgroup *from;
5623        struct task_struct *leader, *p;
5624        struct mm_struct *mm;
5625        unsigned long move_flags;
5626        int ret = 0;
5627
5628        /* charge immigration isn't supported on the default hierarchy */
5629        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5630                return 0;
5631
5632        /*
5633         * Multi-process migrations only happen on the default hierarchy
5634         * where charge immigration is not used.  Perform charge
5635         * immigration if @tset contains a leader and whine if there are
5636         * multiple.
5637         */
5638        p = NULL;
5639        cgroup_taskset_for_each_leader(leader, css, tset) {
5640                WARN_ON_ONCE(p);
5641                p = leader;
5642                memcg = mem_cgroup_from_css(css);
5643        }
5644        if (!p)
5645                return 0;
5646
5647        /*
5648         * We are now commited to this value whatever it is. Changes in this
5649         * tunable will only affect upcoming migrations, not the current one.
5650         * So we need to save it, and keep it going.
5651         */
5652        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5653        if (!move_flags)
5654                return 0;
5655
5656        from = mem_cgroup_from_task(p);
5657
5658        VM_BUG_ON(from == memcg);
5659
5660        mm = get_task_mm(p);
5661        if (!mm)
5662                return 0;
5663        /* We move charges only when we move a owner of the mm */
5664        if (mm->owner == p) {
5665                VM_BUG_ON(mc.from);
5666                VM_BUG_ON(mc.to);
5667                VM_BUG_ON(mc.precharge);
5668                VM_BUG_ON(mc.moved_charge);
5669                VM_BUG_ON(mc.moved_swap);
5670
5671                spin_lock(&mc.lock);
5672                mc.mm = mm;
5673                mc.from = from;
5674                mc.to = memcg;
5675                mc.flags = move_flags;
5676                spin_unlock(&mc.lock);
5677                /* We set mc.moving_task later */
5678
5679                ret = mem_cgroup_precharge_mc(mm);
5680                if (ret)
5681                        mem_cgroup_clear_mc();
5682        } else {
5683                mmput(mm);
5684        }
5685        return ret;
5686}
5687
5688static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5689{
5690        if (mc.to)
5691                mem_cgroup_clear_mc();
5692}
5693
5694static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5695                                unsigned long addr, unsigned long end,
5696                                struct mm_walk *walk)
5697{
5698        int ret = 0;
5699        struct vm_area_struct *vma = walk->vma;
5700        pte_t *pte;
5701        spinlock_t *ptl;
5702        enum mc_target_type target_type;
5703        union mc_target target;
5704        struct page *page;
5705
5706        ptl = pmd_trans_huge_lock(pmd, vma);
5707        if (ptl) {
5708                if (mc.precharge < HPAGE_PMD_NR) {
5709                        spin_unlock(ptl);
5710                        return 0;
5711                }
5712                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5713                if (target_type == MC_TARGET_PAGE) {
5714                        page = target.page;
5715                        if (!isolate_lru_page(page)) {
5716                                if (!mem_cgroup_move_account(page, true,
5717                                                             mc.from, mc.to)) {
5718                                        mc.precharge -= HPAGE_PMD_NR;
5719                                        mc.moved_charge += HPAGE_PMD_NR;
5720                                }
5721                                putback_lru_page(page);
5722                        }
5723                        put_page(page);
5724                } else if (target_type == MC_TARGET_DEVICE) {
5725                        page = target.page;
5726                        if (!mem_cgroup_move_account(page, true,
5727                                                     mc.from, mc.to)) {
5728                                mc.precharge -= HPAGE_PMD_NR;
5729                                mc.moved_charge += HPAGE_PMD_NR;
5730                        }
5731                        put_page(page);
5732                }
5733                spin_unlock(ptl);
5734                return 0;
5735        }
5736
5737        if (pmd_trans_unstable(pmd))
5738                return 0;
5739retry:
5740        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5741        for (; addr != end; addr += PAGE_SIZE) {
5742                pte_t ptent = *(pte++);
5743                bool device = false;
5744                swp_entry_t ent;
5745
5746                if (!mc.precharge)
5747                        break;
5748
5749                switch (get_mctgt_type(vma, addr, ptent, &target)) {
5750                case MC_TARGET_DEVICE:
5751                        device = true;
5752                        /* fall through */
5753                case MC_TARGET_PAGE:
5754                        page = target.page;
5755                        /*
5756                         * We can have a part of the split pmd here. Moving it
5757                         * can be done but it would be too convoluted so simply
5758                         * ignore such a partial THP and keep it in original
5759                         * memcg. There should be somebody mapping the head.
5760                         */
5761                        if (PageTransCompound(page))
5762                                goto put;
5763                        if (!device && isolate_lru_page(page))
5764                                goto put;
5765                        if (!mem_cgroup_move_account(page, false,
5766                                                mc.from, mc.to)) {
5767                                mc.precharge--;
5768                                /* we uncharge from mc.from later. */
5769                                mc.moved_charge++;
5770                        }
5771                        if (!device)
5772                                putback_lru_page(page);
5773put:                    /* get_mctgt_type() gets the page */
5774                        put_page(page);
5775                        break;
5776                case MC_TARGET_SWAP:
5777                        ent = target.ent;
5778                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5779                                mc.precharge--;
5780                                /* we fixup refcnts and charges later. */
5781                                mc.moved_swap++;
5782                        }
5783                        break;
5784                default:
5785                        break;
5786                }
5787        }
5788        pte_unmap_unlock(pte - 1, ptl);
5789        cond_resched();
5790
5791        if (addr != end) {
5792                /*
5793                 * We have consumed all precharges we got in can_attach().
5794                 * We try charge one by one, but don't do any additional
5795                 * charges to mc.to if we have failed in charge once in attach()
5796                 * phase.
5797                 */
5798                ret = mem_cgroup_do_precharge(1);
5799                if (!ret)
5800                        goto retry;
5801        }
5802
5803        return ret;
5804}
5805
5806static const struct mm_walk_ops charge_walk_ops = {
5807        .pmd_entry      = mem_cgroup_move_charge_pte_range,
5808};
5809
5810static void mem_cgroup_move_charge(void)
5811{
5812        lru_add_drain_all();
5813        /*
5814         * Signal lock_page_memcg() to take the memcg's move_lock
5815         * while we're moving its pages to another memcg. Then wait
5816         * for already started RCU-only updates to finish.
5817         */
5818        atomic_inc(&mc.from->moving_account);
5819        synchronize_rcu();
5820retry:
5821        if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
5822                /*
5823                 * Someone who are holding the mmap_sem might be waiting in
5824                 * waitq. So we cancel all extra charges, wake up all waiters,
5825                 * and retry. Because we cancel precharges, we might not be able
5826                 * to move enough charges, but moving charge is a best-effort
5827                 * feature anyway, so it wouldn't be a big problem.
5828                 */
5829                __mem_cgroup_clear_mc();
5830                cond_resched();
5831                goto retry;
5832        }
5833        /*
5834         * When we have consumed all precharges and failed in doing
5835         * additional charge, the page walk just aborts.
5836         */
5837        walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
5838                        NULL);
5839
5840        up_read(&mc.mm->mmap_sem);
5841        atomic_dec(&mc.from->moving_account);
5842}
5843
5844static void mem_cgroup_move_task(void)
5845{
5846        if (mc.to) {
5847                mem_cgroup_move_charge();
5848                mem_cgroup_clear_mc();
5849        }
5850}
5851#else   /* !CONFIG_MMU */
/* Charge moving needs an MMU; without one these hooks are no-ops. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
        /* Nothing to prepare, attaching always succeeds. */
        return 0;
}

static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
        /* Nothing was set up, so nothing to undo. */
}

static void mem_cgroup_move_task(void)
{
        /* No charges to move. */
}
5862#endif
5863
5864/*
5865 * Cgroup retains root cgroups across [un]mount cycles making it necessary
5866 * to verify whether we're attached to the default hierarchy on each mount
5867 * attempt.
5868 */
5869static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5870{
5871        /*
5872         * use_hierarchy is forced on the default hierarchy.  cgroup core
5873         * guarantees that @root doesn't have any children, so turning it
5874         * on for the root memcg is enough.
5875         */
5876        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5877                root_mem_cgroup->use_hierarchy = true;
5878        else
5879                root_mem_cgroup->use_hierarchy = false;
5880}
5881
5882static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
5883{
5884        if (value == PAGE_COUNTER_MAX)
5885                seq_puts(m, "max\n");
5886        else
5887                seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
5888
5889        return 0;
5890}
5891
5892static u64 memory_current_read(struct cgroup_subsys_state *css,
5893                               struct cftype *cft)
5894{
5895        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5896
5897        return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5898}
5899
5900static int memory_min_show(struct seq_file *m, void *v)
5901{
5902        return seq_puts_memcg_tunable(m,
5903                READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
5904}
5905
5906static ssize_t memory_min_write(struct kernfs_open_file *of,
5907                                char *buf, size_t nbytes, loff_t off)
5908{
5909        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5910        unsigned long min;
5911        int err;
5912
5913        buf = strstrip(buf);
5914        err = page_counter_memparse(buf, "max", &min);
5915        if (err)
5916                return err;
5917
5918        page_counter_set_min(&memcg->memory, min);
5919
5920        return nbytes;
5921}
5922
5923static int memory_low_show(struct seq_file *m, void *v)
5924{
5925        return seq_puts_memcg_tunable(m,
5926                READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
5927}
5928
5929static ssize_t memory_low_write(struct kernfs_open_file *of,
5930                                char *buf, size_t nbytes, loff_t off)
5931{
5932        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5933        unsigned long low;
5934        int err;
5935
5936        buf = strstrip(buf);
5937        err = page_counter_memparse(buf, "max", &low);
5938        if (err)
5939                return err;
5940
5941        page_counter_set_low(&memcg->memory, low);
5942
5943        return nbytes;
5944}
5945
5946static int memory_high_show(struct seq_file *m, void *v)
5947{
5948        return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
5949}
5950
5951static ssize_t memory_high_write(struct kernfs_open_file *of,
5952                                 char *buf, size_t nbytes, loff_t off)
5953{
5954        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5955        unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
5956        bool drained = false;
5957        unsigned long high;
5958        int err;
5959
5960        buf = strstrip(buf);
5961        err = page_counter_memparse(buf, "max", &high);
5962        if (err)
5963                return err;
5964
5965        memcg->high = high;
5966
5967        for (;;) {
5968                unsigned long nr_pages = page_counter_read(&memcg->memory);
5969                unsigned long reclaimed;
5970
5971                if (nr_pages <= high)
5972                        break;
5973
5974                if (signal_pending(current))
5975                        break;
5976
5977                if (!drained) {
5978                        drain_all_stock(memcg);
5979                        drained = true;
5980                        continue;
5981                }
5982
5983                reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5984                                                         GFP_KERNEL, true);
5985
5986                if (!reclaimed && !nr_retries--)
5987                        break;
5988        }
5989
5990        return nbytes;
5991}
5992
5993static int memory_max_show(struct seq_file *m, void *v)
5994{
5995        return seq_puts_memcg_tunable(m,
5996                READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
5997}
5998
/*
 * Write handler for the "max" cgroup file.
 *
 * Strips and parses @buf with page_counter_memparse() (the keyword "max" is
 * accepted), stores the result in memcg->memory.max, and then loops until
 * usage is at or below the new value, a signal is pending, or the OOM path
 * reports that it could not make progress.
 *
 * Returns @nbytes on success, or the error from page_counter_memparse().
 */
5999static ssize_t memory_max_write(struct kernfs_open_file *of,
6000                                char *buf, size_t nbytes, loff_t off)

6001{
6002        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        /* Budget of reclaim passes that are allowed to free nothing. */
6003        unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6004        bool drained = false;
6005        unsigned long max;
6006        int err;
6007
6008        buf = strstrip(buf);
6009        err = page_counter_memparse(buf, "max", &max);
6010        if (err)
6011                return err;
6012
        /* Install the new limit first; the loop below enforces it. */
6013        xchg(&memcg->memory.max, max);
6014
6015        for (;;) {
6016                unsigned long nr_pages = page_counter_read(&memcg->memory);
6017
6018                if (nr_pages <= max)
6019                        break;
6020
                /* Give up (leaving usage above max) if the writer is signalled. */
6021                if (signal_pending(current))
6022                        break;
6023
                /* Drain once, then re-read usage before reclaiming. */
6024                if (!drained) {
6025                        drain_all_stock(memcg);
6026                        drained = true;
6027                        continue;
6028                }
6029
                /*
                 * Reclaim the excess; only a pass that frees nothing uses up
                 * one of the nr_reclaims attempts.
                 */
6030                if (nr_reclaims) {
6031                        if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6032                                                          GFP_KERNEL, true))
6033                                nr_reclaims--;
6034                        continue;
6035                }
6036
                /*
                 * Reclaim budget exhausted: record an OOM event and invoke the
                 * memcg OOM path; stop once it returns false.
                 */
6037                memcg_memory_event(memcg, MEMCG_OOM);
6038                if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6039                        break;
6040        }
6041
6042        memcg_wb_domain_size_changed(memcg);
6043        return nbytes;
6044}
6045
6046static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6047{
6048        seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6049        seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6050        seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6051        seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6052        seq_printf(m, "oom_kill %lu\n",
6053                   atomic_long_read(&events[MEMCG_OOM_KILL]));
6054}
6055
6056static int memory_events_show(struct seq_file *m, void *v)
6057{
6058        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6059
6060        __memory_events_show(m, memcg->memory_events);
6061        return 0;
6062}
6063
6064static int memory_events_local_show(struct seq_file *m, void *v)
6065{
6066        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6067
6068        __memory_events_show(m, memcg->memory_events_local);
6069        return 0;
6070}
6071
6072static int memory_stat_show(struct seq_file *m, void *v)
6073{
6074        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6075        char *buf;
6076
6077        buf = memory_stat_format(memcg);
6078        if (!buf)
6079                return -ENOMEM;
6080        seq_puts(m, buf);
6081        kfree(buf);
6082        return 0;
6083}
6084
6085static int memory_oom_group_show(struct seq_file *m, void *v)
6086{
6087        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6088
6089        seq_printf(m, "%d\n", memcg->oom_group);
6090
6091        return 0;
6092}
6093
6094static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6095                                      char *buf, size_t nbytes, loff_t off)
6096{
6097        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6098        int ret, oom_group;
6099
6100        buf = strstrip(buf);
6101        if (!buf)
6102                return -EINVAL;
6103
6104        ret = kstrtoint(buf, 0, &oom_group);
6105        if (ret)
6106                return ret;
6107
6108        if (oom_group != 0 && oom_group != 1)
6109                return -EINVAL;
6110
6111        memcg->oom_group = oom_group;
6112
6113        return nbytes;
6114}
6115
/*
 * Control files of the memory controller that memory_cgrp_subsys installs
 * as its .dfl_cftypes.  Every entry is flagged CFTYPE_NOT_ON_ROOT; the
 * array is terminated by an empty entry.
 */
6116static struct cftype memory_files[] = {
6117        {
6118                .name = "current",
6119                .flags = CFTYPE_NOT_ON_ROOT,
6120                .read_u64 = memory_current_read,
6121        },
6122        {
6123                .name = "min",
6124                .flags = CFTYPE_NOT_ON_ROOT,
6125                .seq_show = memory_min_show,
6126                .write = memory_min_write,
6127        },
6128        {
6129                .name = "low",
6130                .flags = CFTYPE_NOT_ON_ROOT,
6131                .seq_show = memory_low_show,
6132                .write = memory_low_write,
6133        },
6134        {
6135                .name = "high",
6136                .flags = CFTYPE_NOT_ON_ROOT,
6137                .seq_show = memory_high_show,
6138                .write = memory_high_write,
6139        },
6140        {
6141                .name = "max",
6142                .flags = CFTYPE_NOT_ON_ROOT,
6143                .seq_show = memory_max_show,
6144                .write = memory_max_write,
6145        },
        /*
         * The two events files additionally record a file_offset into
         * struct mem_cgroup (events_file / events_local_file).
         */
6146        {
6147                .name = "events",
6148                .flags = CFTYPE_NOT_ON_ROOT,
6149                .file_offset = offsetof(struct mem_cgroup, events_file),
6150                .seq_show = memory_events_show,
6151        },
6152        {
6153                .name = "events.local",
6154                .flags = CFTYPE_NOT_ON_ROOT,
6155                .file_offset = offsetof(struct mem_cgroup, events_local_file),
6156                .seq_show = memory_events_local_show,
6157        },
6158        {
6159                .name = "stat",
6160                .flags = CFTYPE_NOT_ON_ROOT,
6161                .seq_show = memory_stat_show,
6162        },
        /* The only entry here that is also flagged CFTYPE_NS_DELEGATABLE. */
6163        {
6164                .name = "oom.group",
6165                .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6166                .seq_show = memory_oom_group_show,
6167                .write = memory_oom_group_write,
6168        },
6169        { }     /* terminate */
6170};
6171
/*
 * cgroup subsystem descriptor for the memory controller: wires the css
 * lifecycle and task-attach callbacks to the mem_cgroup_* implementations
 * and selects the control-file tables (memory_files as .dfl_cftypes,
 * mem_cgroup_legacy_files as .legacy_cftypes).
 */
6172struct cgroup_subsys memory_cgrp_subsys = {
6173        .css_alloc = mem_cgroup_css_alloc,
6174        .css_online = mem_cgroup_css_online,
6175        .css_offline = mem_cgroup_css_offline,
6176        .css_released = mem_cgroup_css_released,
6177        .css_free = mem_cgroup_css_free,
6178        .css_reset = mem_cgroup_css_reset,
6179        .can_attach = mem_cgroup_can_attach,
6180        .cancel_attach = mem_cgroup_cancel_attach,
6181        .post_attach = mem_cgroup_move_task,
6182        .bind = mem_cgroup_bind,
6183        .dfl_cftypes = memory_files,
6184        .legacy_cftypes = mem_cgroup_legacy_files,
6185        .early_init = 0,
6186};
6187
6188/**
6189 * mem_cgroup_protected - check if memory consumption is in the normal range
6190 * @root: the top ancestor of the sub-tree being checked
6191 * @memcg: the memory cgroup to check
6192 *
6193 * WARNING: This function is not stateless! It can only be used as part
6194 *          of a top-down tree iteration, not for isolated queries.
6195 *
6196 * Returns one of the following:
6197 *   MEMCG_PROT_NONE: cgroup memory is not protected
6198 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is
6199 *     an unprotected supply of reclaimable memory from other cgroups.
6200 *   MEMCG_PROT_MIN: cgroup memory is protected
6201 *
6202 * @root is exclusive; it is never protected when looked at directly
6203 *
6204 * To provide a proper hierarchical behavior, effective memory.min/low values
6205 * are used. Below is the description of how effective memory.low is calculated.
6206 * Effective memory.min values are calculated in the same way.
6207 *
6208 * Effective memory.low is always equal or less than the original memory.low.
6209 * If there is no memory.low overcommitment (which is always true for
6210 * top-level memory cgroups), these two values are equal.
6211 * Otherwise, it's a part of parent's effective memory.low,
6212 * calculated as a cgroup's memory.low usage divided by sum of sibling's
6213 * memory.low usages, where memory.low usage is the size of actually
6214 * protected memory.
6215 *
6216 *                                             low_usage
6217 * elow = min( memory.low, parent->elow * ------------------ ),
6218 *                                        siblings_low_usage
6219 *
6220 *             | memory.current, if memory.current < memory.low
6221 * low_usage = |
6222 *             | 0, otherwise.
6223 *
6224 *
6225 * Such definition of the effective memory.low provides the expected
6226 * hierarchical behavior: parent's memory.low value is limiting
6227 * children, unprotected memory is reclaimed first and cgroups,
6228 * which are not using their guarantee do not affect actual memory
6229 * distribution.
6230 *
6231 * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
6232 *
6233 *     A      A/memory.low = 2G, A/memory.current = 6G
6234 *    //\\
6235 *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
6236 *            C/memory.low = 1G  C/memory.current = 2G
6237 *            D/memory.low = 0   D/memory.current = 2G
6238 *            E/memory.low = 10G E/memory.current = 0
6239 *
6240 * and the memory pressure is applied, the following memory distribution
6241 * is expected (approximately):
6242 *
6243 *     A/memory.current = 2G
6244 *
6245 *     B/memory.current = 1.3G
6246 *     C/memory.current = 0.6G
6247 *     D/memory.current = 0
6248 *     E/memory.current = 0
6249 *
6250 * These calculations require constant tracking of the actual low usages
6251 * (see propagate_protected_usage()), as well as recursive calculation of
6252 * effective memory.low values. But as we do call mem_cgroup_protected()
6253 * path for each memory cgroup top-down from the reclaim,
6254 * it's possible to optimize this part, and save calculated elow
6255 * for next usage. This part is intentionally racy, but it's ok,
6256 * as memory.low is a best-effort mechanism.
6257 */
6258enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
6259                                                struct mem_cgroup *memcg)
6260{
6261        struct mem_cgroup *parent;
6262        unsigned long emin, parent_emin;
6263        unsigned long elow, parent_elow;
6264        unsigned long usage;
6265
6266        if (mem_cgroup_disabled())
6267                return MEMCG_PROT_NONE;
6268
        /* A NULL @root stands for root_mem_cgroup; @root itself is never protected. */
6269        if (!root)
6270                root = root_mem_cgroup;
6271        if (memcg == root)
6272                return MEMCG_PROT_NONE;
6273
        /* A cgroup with no usage has nothing to protect. */
6274        usage = page_counter_read(&memcg->memory);
6275        if (!usage)
6276                return MEMCG_PROT_NONE;
6277
        /* Start from the configured values; they may be reduced below. */
6278        emin = memcg->memory.min;
6279        elow = memcg->memory.low;
6280
6281        parent = parent_mem_cgroup(memcg);
6282        /* No parent means a non-hierarchical mode on v1 memcg */
6283        if (!parent)
6284                return MEMCG_PROT_NONE;
6285
        /* Direct children of @root keep their configured min/low as effective values. */
6286        if (parent == root)
6287                goto exit;
6288
        /*
         * Effective min: never more than the parent's effective min and,
         * when both are non-zero, never more than this cgroup's share
         * (min_usage / siblings_min_usage) of the parent's effective min.
         */
6289        parent_emin = READ_ONCE(parent->memory.emin);
6290        emin = min(emin, parent_emin);
6291        if (emin && parent_emin) {
6292                unsigned long min_usage, siblings_min_usage;
6293
6294                min_usage = min(usage, memcg->memory.min);
6295                siblings_min_usage = atomic_long_read(
6296                        &parent->memory.children_min_usage);
6297
6298                if (min_usage && siblings_min_usage)
6299                        emin = min(emin, parent_emin * min_usage /
6300                                   siblings_min_usage);
6301        }
6302
        /* Effective low: same computation as for min, using the low values. */
6303        parent_elow = READ_ONCE(parent->memory.elow);
6304        elow = min(elow, parent_elow);
6305        if (elow && parent_elow) {
6306                unsigned long low_usage, siblings_low_usage;
6307
6308                low_usage = min(usage, memcg->memory.low);
6309                siblings_low_usage = atomic_long_read(
6310                        &parent->memory.children_low_usage);
6311
6312                if (low_usage && siblings_low_usage)
6313                        elow = min(elow, parent_elow * low_usage /
6314                                   siblings_low_usage);
6315        }
6316
6317exit:
        /*
         * Cache the effective values on the counter; a later call for a
         * child reads them back through parent->memory.emin/elow above.
         */
6318        memcg->memory.emin = emin;
6319        memcg->memory.elow = elow;
6320
        /* min protection takes precedence over low protection. */
6321        if (usage <= emin)
6322                return MEMCG_PROT_MIN;
6323        else if (usage <= elow)
6324                return MEMCG_PROT_LOW;
6325        else
6326                return MEMCG_PROT_NONE;
6327}
6328
6329/**
6330 * mem_cgroup_try_charge - try charging a page
6331 * @page: page to charge
6332 * @mm: mm context of the victim
6333 * @gfp_mask: reclaim mode
6334 * @memcgp: charged memcg return
6335 * @compound: charge the page as compound or small page
6336 *
6337 * Try to charge @page to the memcg that @mm belongs to, reclaiming
6338 * pages according to @gfp_mask if necessary.
6339 *
6340 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
6341 * Otherwise, an error code is returned.
6342 *
6343 * After page->mapping has been set up, the caller must finalize the
6344 * charge with mem_cgroup_commit_charge().  Or abort the transaction
6345 * with mem_cgroup_cancel_charge() in case page instantiation fails.
6346 */
6347int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6348                          gfp_t gfp_mask, struct mem_cgroup **memcgp,
6349                          bool compound)
6350{
6351        struct mem_cgroup *memcg = NULL;
        /* A non-compound charge always accounts exactly one page. */
6352        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6353        int ret = 0;
6354
        /* Disabled controller: report success with *memcgp == NULL. */
6355        if (mem_cgroup_disabled())
6356                goto out;
6357
6358        if (PageSwapCache(page)) {
6359                /*
6360                 * Every swap fault against a single page tries to charge the
6361                 * page, bail as early as possible.  shmem_unuse() encounters
6362                 * already charged pages, too.  The USED bit is protected by
6363                 * the page lock, which serializes swap cache removal, which
6364                 * in turn serializes uncharging.
6365                 */
6366                VM_BUG_ON_PAGE(!PageLocked(page), page);
                /* Already charged: success, again with *memcgp == NULL. */
6367                if (compound_head(page)->mem_cgroup)
6368                        goto out;
6369
                /*
                 * With swap accounting, prefer the memcg recorded for the
                 * swap entry, but only if an online reference can be taken.
                 */
6370                if (do_swap_account) {
6371                        swp_entry_t ent = { .val = page_private(page), };
6372                        unsigned short id = lookup_swap_cgroup_id(ent);
6373
6374                        rcu_read_lock();
6375                        memcg = mem_cgroup_from_id(id);
6376                        if (memcg && !css_tryget_online(&memcg->css))
6377                                memcg = NULL;
6378                        rcu_read_unlock();
6379                }
6380        }
6381
        /* Otherwise fall back to the memcg that @mm belongs to. */
6382        if (!memcg)
6383                memcg = get_mem_cgroup_from_mm(mm);
6384
6385        ret = try_charge(memcg, gfp_mask, nr_pages);
6386
        /*
         * Drop the lookup reference (from css_tryget_online() above or,
         * presumably, from get_mem_cgroup_from_mm() -- confirm there).
         */
6387        css_put(&memcg->css);
6388out:
        /* *memcgp is written on every path, including when ret != 0. */
6389        *memcgp = memcg;
6390        return ret;
6391}
6392
6393int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6394                          gfp_t gfp_mask, struct mem_cgroup **memcgp,
6395                          bool compound)
6396{
6397        struct mem_cgroup *memcg;
6398        int ret;
6399
6400        ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6401        memcg = *memcgp;
6402        mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6403        return ret;
6404}
6405
6406/**
6407 * mem_cgroup_commit_charge - commit a page charge
6408 * @page: page to charge
6409 * @memcg: memcg to charge the page to
6410 * @lrucare: page might be on LRU already
6411 * @compound: charge the page as compound or small page
6412 *
6413 * Finalize a charge transaction started by mem_cgroup_try_charge(),
6414 * after page->mapping has been set up.  This must happen atomically
6415 * as part of the page instantiation, i.e. under the page table lock
6416 * for anonymous pages, under the page lock for page and swap cache.
6417 *
6418 * In addition, the page must not be on the LRU during the commit, to
6419 * prevent racing with task migration.  If it might be, use @lrucare.
6420 *
6421 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6422 */
6423void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6424                              bool lrucare, bool compound)
6425{
        /* A non-compound charge always accounts exactly one page. */
6426        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6427
        /* Caller contract: mapping is set up, and LRU pages require @lrucare. */
6428        VM_BUG_ON_PAGE(!page->mapping, page);
6429        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6430
6431        if (mem_cgroup_disabled())
6432                return;
6433        /*
6434         * Swap faults will attempt to charge the same page multiple
6435         * times.  But reuse_swap_page() might have removed the page
6436         * from swapcache already, so we can't check PageSwapCache().
6437         */
6438        if (!memcg)
6439                return;
6440
6441        commit_charge(page, memcg, lrucare);
6442
        /* Statistics update and event check run with local IRQs disabled. */
6443        local_irq_disable();
6444        mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6445        memcg_check_events(memcg, page);
6446        local_irq_enable();
6447
6448        if (do_memsw_account() && PageSwapCache(page)) {
6449                swp_entry_t entry = { .val = page_private(page) };
6450                /*
6451                 * The swap entry might not get freed for a long time,
6452                 * let's not wait for it.  The page already received a
6453                 * memory+swap charge, drop the swap entry duplicate.
6454                 */
6455                mem_cgroup_uncharge_swap(entry, nr_pages);
6456        }
6457}
6458
6459/**
6460 * mem_cgroup_cancel_charge - cancel a page charge
6461 * @page: page to charge
6462 * @memcg: memcg to charge the page to
6463 * @compound: charge the page as compound or small page
6464 *
6465 * Cancel a charge transaction started by mem_cgroup_try_charge().
6466 */
6467void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6468                bool compound)
6469{
6470        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6471
6472        if (mem_cgroup_disabled())
6473                return;
6474        /*
6475         * Swap faults will attempt to charge the same page multiple
6476         * times.  But reuse_swap_page() might have removed the page
6477         * from swapcache already, so we can't check PageSwapCache().
6478         */
6479        if (!memcg)
6480                return;
6481
6482        cancel_charge(memcg, nr_pages);
6483}
6484
/*
 * Accumulator for batched uncharging: uncharge_page() adds pages that
 * belong to the same memcg, uncharge_batch() applies the totals.
 */
6485struct uncharge_gather {
6486        struct mem_cgroup *memcg;       /* memcg the gathered pages belong to */
6487        unsigned long pgpgout;          /* gathered non-kmem pages, one per (possibly huge) page */
6488        unsigned long nr_anon;          /* base pages of PageAnon pages */
6489        unsigned long nr_file;          /* base pages of non-anon, non-kmem pages */
6490        unsigned long nr_kmem;          /* base pages of PageKmemcg pages */
6491        unsigned long nr_huge;          /* base pages of PageTransHuge pages (also in nr_anon/nr_file) */
6492        unsigned long nr_shmem;         /* part of nr_file that is PageSwapBacked */
6493        struct page *dummy_page;        /* last page gathered; handed to memcg_check_events() */
6494};
6495
6496static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6497{
6498        memset(ug, 0, sizeof(*ug));
6499}
6500
/*
 * Apply one gathered batch to ug->memcg: uncharge the page counters, adjust
 * the per-memcg statistics and drop the css references.
 */
6501static void uncharge_batch(const struct uncharge_gather *ug)
6502{
        /*
         * nr_huge and nr_shmem are not added here: uncharge_page() counts
         * those pages in nr_anon/nr_file as well.
         */
6503        unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
6504        unsigned long flags;
6505
        /* The root memcg's page counters are left untouched. */
6506        if (!mem_cgroup_is_root(ug->memcg)) {
6507                page_counter_uncharge(&ug->memcg->memory, nr_pages);
6508                if (do_memsw_account())
6509                        page_counter_uncharge(&ug->memcg->memsw, nr_pages);
                /* The separate kmem counter is only uncharged off the default hierarchy. */
6510                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6511                        page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6512                memcg_oom_recover(ug->memcg);
6513        }
6514
        /* The __-prefixed stat helpers and the per-cpu add run with IRQs off. */
6515        local_irq_save(flags);
6516        __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6517        __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6518        __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6519        __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6520        __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6521        __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
6522        memcg_check_events(ug->memcg, ug->dummy_page);
6523        local_irq_restore(flags);
6524
        /* Drop nr_pages css references, except on the root memcg. */
6525        if (!mem_cgroup_is_root(ug->memcg))
6526                css_put_many(&ug->memcg->css, nr_pages);
6527}
6528
/*
 * Add @page to the batch in @ug and clear page->mem_cgroup.  If @page belongs
 * to a different memcg than the current batch, the current batch is flushed
 * with uncharge_batch() first.  Pages without a memcg are ignored.
 */
6529static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6530{
6531        VM_BUG_ON_PAGE(PageLRU(page), page);
6532        VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6533                        !PageHWPoison(page) , page);
6534
6535        if (!page->mem_cgroup)
6536                return;
6537
6538        /*
6539         * Nobody should be changing or seriously looking at
6540         * page->mem_cgroup at this point, we have fully
6541         * exclusive access to the page.
6542         */
6543
        /* A batch covers a single memcg: flush and restart when it changes. */
6544        if (ug->memcg != page->mem_cgroup) {
6545                if (ug->memcg) {
6546                        uncharge_batch(ug);
6547                        uncharge_gather_clear(ug);
6548                }
6549                ug->memcg = page->mem_cgroup;
6550        }
6551
6552        if (!PageKmemcg(page)) {
6553                unsigned int nr_pages = 1;
6554
                /* Huge pages are counted in nr_huge and in nr_anon/nr_file. */
6555                if (PageTransHuge(page)) {
6556                        nr_pages = compound_nr(page);
6557                        ug->nr_huge += nr_pages;
6558                }
6559                if (PageAnon(page))
6560                        ug->nr_anon += nr_pages;
6561                else {
6562                        ug->nr_file += nr_pages;
6563                        if (PageSwapBacked(page))
6564                                ug->nr_shmem += nr_pages;
6565                }
                /* pgpgout counts pages, not base pages. */
6566                ug->pgpgout++;
6567        } else {
6568                ug->nr_kmem += compound_nr(page);
6569                __ClearPageKmemcg(page);
6570        }
6571
        /* Remember a page for the memcg_check_events() call in uncharge_batch(). */
6572        ug->dummy_page = page;
6573        page->mem_cgroup = NULL;
6574}
6575
/*
 * Uncharge every page linked through page->lru on @page_list, batching
 * consecutive pages of the same memcg.  The list must not be empty: the
 * loop body runs at least once (mem_cgroup_uncharge_list() checks
 * list_empty() before calling this).
 */
6576static void uncharge_list(struct list_head *page_list)
6577{
6578        struct uncharge_gather ug;
6579        struct list_head *next;
6580
6581        uncharge_gather_clear(&ug);
6582
6583        /*
6584         * Note that the list can be a single page->lru; hence the
6585         * do-while loop instead of a simple list_for_each_entry().
6586         */
6587        next = page_list->next;
6588        do {
6589                struct page *page;
6590
6591                page = list_entry(next, struct page, lru);
                /* Fetch the successor before the page is processed. */
6592                next = page->lru.next;
6593
6594                uncharge_page(page, &ug);
6595        } while (next != page_list);
6596
        /* Flush the last batch, if any page was actually charged. */
6597        if (ug.memcg)
6598                uncharge_batch(&ug);
6599}
6600
6601/**
6602 * mem_cgroup_uncharge - uncharge a page
6603 * @page: page to uncharge
6604 *
6605 * Uncharge a page previously charged with mem_cgroup_try_charge() and
6606 * mem_cgroup_commit_charge().
6607 */
6608void mem_cgroup_uncharge(struct page *page)
6609{
6610        struct uncharge_gather ug;
6611
6612        if (mem_cgroup_disabled())
6613                return;
6614
6615        /* Don't touch page->lru of any random page, pre-check: */
6616        if (!page->mem_cgroup)
6617                return;
6618
6619        uncharge_gather_clear(&ug);
6620        uncharge_page(page, &ug);
6621        uncharge_batch(&ug);
6622}
6623
6624/**
6625 * mem_cgroup_uncharge_list - uncharge a list of pages
6626 * @page_list: list of pages to uncharge
6627 *
6628 * Uncharge a list of pages previously charged with
6629 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
6630 */
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
        /* Nothing to do if the controller is disabled or the list is empty. */
        if (mem_cgroup_disabled() || list_empty(page_list))
                return;

        uncharge_list(page_list);
}
6639
6640/**
6641 * mem_cgroup_migrate - charge a page's replacement
6642 * @oldpage: currently circulating page
6643 * @newpage: replacement page
6644 *
6645 * Charge @newpage as a replacement page for @oldpage. @oldpage will
6646 * be uncharged upon free.
6647 *
6648 * Both pages must be locked, @newpage->mapping must be set up.
6649 */
6650void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6651{
6652        struct mem_cgroup *memcg;
6653        unsigned int nr_pages;
6654        bool compound;
6655        unsigned long flags;
6656
        /* Both pages locked, and of the same kind (anon-ness, huge-ness). */
6657        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6658        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6659        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6660        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6661                       newpage);
6662
6663        if (mem_cgroup_disabled())
6664                return;
6665
6666        /* Page cache replacement: new page already charged? */
6667        if (newpage->mem_cgroup)
6668                return;
6669
6670        /* Swapcache readahead pages can get replaced before being charged */
6671        memcg = oldpage->mem_cgroup;
6672        if (!memcg)
6673                return;
6674
6675        /* Force-charge the new page. The old one will be freed soon */
6676        compound = PageTransHuge(newpage);
6677        nr_pages = compound ? hpage_nr_pages(newpage) : 1;
6678
        /*
         * page_counter_charge() is used rather than a try-charge, so this
         * path has no failure case to handle.  nr_pages css references are
         * taken alongside the charge.
         */
6679        page_counter_charge(&memcg->memory, nr_pages);
6680        if (do_memsw_account())
6681                page_counter_charge(&memcg->memsw, nr_pages);
6682        css_get_many(&memcg->css, nr_pages);
6683
        /* lrucare is false here. */
6684        commit_charge(newpage, memcg, false);
6685
        /* Statistics update and event check run with local IRQs disabled. */
6686        local_irq_save(flags);
6687        mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
6688        memcg_check_events(memcg, newpage);
6689        local_irq_restore(flags);
6690}
6691
/*
 * Static key, defined in its "false" state and exported.
 * NOTE(review): presumably the key behind mem_cgroup_sockets_enabled --
 * confirm against the header that defines that macro.
 */
6692DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6693EXPORT_SYMBOL(memcg_sockets_enabled_key);
6694
/*
 * Associate @sk with a memcg.  If sk->sk_memcg is already set, an extra css
 * reference is taken on it.  Otherwise sk->sk_memcg is set to the current
 * task's memcg, unless that is the root memcg, socket accounting is not
 * active for it on the legacy hierarchy, or no online reference can be
 * taken; in those cases sk->sk_memcg is left as it was.
 */
6695void mem_cgroup_sk_alloc(struct sock *sk)
6696{
6697        struct mem_cgroup *memcg;
6698
6699        if (!mem_cgroup_sockets_enabled)
6700                return;
6701
6702        /*
6703         * Socket cloning can throw us here with sk_memcg already
6704         * filled. It won't however, necessarily happen from
6705         * process context. So the test for root memcg given
6706         * the current task's memcg won't help us in this case.
6707         *
6708         * Respecting the original socket's memcg is a better
6709         * decision in this case.
6710         */
6711        if (sk->sk_memcg) {
6712                css_get(&sk->sk_memcg->css);
6713                return;
6714        }
6715
        /* The task's memcg is looked up and pinned under rcu_read_lock(). */
6716        rcu_read_lock();
6717        memcg = mem_cgroup_from_task(current);
        /* Sockets of tasks in the root memcg are not associated. */
6718        if (memcg == root_mem_cgroup)
6719                goto out;
        /* Off the default hierarchy, tcpmem accounting must be active. */
6720        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6721                goto out;
        /* The reference taken here is dropped in mem_cgroup_sk_free(). */
6722        if (css_tryget_online(&memcg->css))
6723                sk->sk_memcg = memcg;
6724out:
6725        rcu_read_unlock();
6726}
6727
6728void mem_cgroup_sk_free(struct sock *sk)
6729{
6730        if (sk->sk_memcg)
6731                css_put(&sk->sk_memcg->css);
6732}
6733
6734/**
6735 * mem_cgroup_charge_skmem - charge socket memory
6736 * @memcg: memcg to charge
6737 * @nr_pages: number of pages to charge
6738 *
6739 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
6740 * @memcg's configured limit, %false if the charge had to be forced.
6741 */
6742bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6743{
6744        gfp_t gfp_mask = GFP_KERNEL;
6745
        /*
         * Legacy hierarchy: account against the dedicated tcpmem counter.
         * If the try-charge fails, the pages are charged anyway and the
         * tcpmem_pressure flag is set; a successful try-charge clears it.
         */
6746        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6747                struct page_counter *fail;
6748
6749                if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6750                        memcg->tcpmem_pressure = 0;
6751                        return true;
6752                }
6753                page_counter_charge(&memcg->tcpmem, nr_pages);
6754                memcg->tcpmem_pressure = 1;
6755                return false;
6756        }
6757
6758        /* Don't block in the packet receive path */
6759        if (in_softirq())
6760                gfp_mask = GFP_NOWAIT;
6761
        /*
         * MEMCG_SOCK is bumped up front and stays bumped on both outcomes:
         * if the first charge fails, the pages are still charged below with
         * __GFP_NOFAIL.
         */
6762        mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6763
6764        if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6765                return true;
6766
        /* Forced charge; false tells the caller the charge did not fit. */
6767        try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6768        return false;
6769}
6770
6771/**
6772 * mem_cgroup_uncharge_skmem - uncharge socket memory
6773 * @memcg: memcg to uncharge
6774 * @nr_pages: number of pages to uncharge
6775 */
6776void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6777{
6778        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6779                page_counter_uncharge(&memcg->tcpmem, nr_pages);
6780                return;
6781        }
6782
6783        mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6784
6785        refill_stock(memcg, nr_pages);
6786}
6787
6788static int __init cgroup_memory(char *s)
6789{
6790        char *token;
6791
6792        while ((token = strsep(&s, ",")) != NULL) {
6793                if (!*token)
6794                        continue;
6795                if (!strcmp(token, "nosocket"))
6796                        cgroup_memory_nosocket = true;
6797                if (!strcmp(token, "nokmem"))
6798                        cgroup_memory_nokmem = true;
6799        }
6800        return 0;
6801}
6802__setup("cgroup.memory=", cgroup_memory);
6803
6804/*
6805 * subsys_initcall() for memory controller.
6806 *
6807 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
6808 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
6809 * basically everything that doesn't depend on a specific mem_cgroup structure
6810 * should be initialized from here.
6811 */
6812static int __init mem_cgroup_init(void)
6813{
6814        int cpu, node;
6815
6816#ifdef CONFIG_MEMCG_KMEM
6817        /*
6818         * Kmem cache creation is mostly done with the slab_mutex held,
6819         * so use a workqueue with limited concurrency to avoid stalling
6820         * all worker threads in case lots of cgroups are created and
6821         * destroyed simultaneously.
6822         */
6823        memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6824        BUG_ON(!memcg_kmem_cache_wq);
6825#endif
6826
6827        cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6828                                  memcg_hotplug_cpu_dead);
6829
6830        for_each_possible_cpu(cpu)
6831                INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6832                          drain_local_stock);
6833
6834        for_each_node(node) {
6835                struct mem_cgroup_tree_per_node *rtpn;
6836
6837                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6838                                    node_online(node) ? node : NUMA_NO_NODE);
6839
6840                rtpn->rb_root = RB_ROOT;
6841                rtpn->rb_rightmost = NULL;
6842                spin_lock_init(&rtpn->lock);
6843                soft_limit_tree.rb_tree_per_node[node] = rtpn;
6844        }
6845
6846        return 0;
6847}
6848subsys_initcall(mem_cgroup_init);
6849
6850#ifdef CONFIG_MEMCG_SWAP
/*
 * Take an ID reference on @memcg or, if its ID refcount has already dropped
 * to zero, on the closest ancestor whose refcount can still be incremented.
 * Returns the memcg on which the reference was taken.
 */
6851static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6852{
6853        while (!refcount_inc_not_zero(&memcg->id.ref)) {
6854                /*
6855                 * The root cgroup cannot be destroyed, so its refcount must
6856                 * always be >= 1.
6857                 */
6858                if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
6859                        VM_BUG_ON(1);
6860                        break;
6861                }
                /* Walk up; a memcg without a parent falls back to the root. */
6862                memcg = parent_mem_cgroup(memcg);
6863                if (!memcg)
6864                        memcg = root_mem_cgroup;
6865        }
6866        return memcg;
6867}
6868
6869/**
6870 * mem_cgroup_swapout - transfer a memsw charge to swap
6871 * @page: page whose memsw charge to transfer
6872 * @entry: swap entry to move the charge to
6873 *
6874 * Transfer the memsw charge of @page to @entry.
6875 */
6876void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6877{
6878        struct mem_cgroup *memcg, *swap_memcg;
6879        unsigned int nr_entries;
6880        unsigned short oldid;
6881
6882        VM_BUG_ON_PAGE(PageLRU(page), page);
6883        VM_BUG_ON_PAGE(page_count(page), page);
6884
6885        if (!do_memsw_account())
6886                return;
6887
6888        memcg = page->mem_cgroup;
6889
6890        /* Readahead page, never charged */
6891        if (!memcg)
6892                return;
6893
6894        /*
6895         * In case the memcg owning these pages has been offlined and doesn't
6896         * have an ID allocated to it anymore, charge the closest online
6897         * ancestor for the swap instead and transfer the memory+swap charge.
6898         */
6899        swap_memcg = mem_cgroup_id_get_online(memcg);
6900        nr_entries = hpage_nr_pages(page);
6901        /* Get references for the tail pages, too */
6902        if (nr_entries > 1)
6903                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
6904        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
6905                                   nr_entries);
6906        VM_BUG_ON_PAGE(oldid, page);
6907        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
6908
6909        page->mem_cgroup = NULL;
6910
6911        if (!mem_cgroup_is_root(memcg))
6912                page_counter_uncharge(&memcg->memory, nr_entries);
6913
6914        if (memcg != swap_memcg) {
6915                if (!mem_cgroup_is_root(swap_memcg))
6916                        page_counter_charge(&swap_memcg->memsw, nr_entries);
6917                page_counter_uncharge(&memcg->memsw, nr_entries);
6918        }
6919
6920        /*
6921         * Interrupts should be disabled here because the caller holds the
6922         * i_pages lock which is taken with interrupts-off. It is
6923         * important here to have the interrupts disabled because it is the
6924         * only synchronisation we have for updating the per-CPU variables.
6925         */
6926        VM_BUG_ON(!irqs_disabled());
6927        mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6928                                     -nr_entries);
6929        memcg_check_events(memcg, page);
6930
6931        if (!mem_cgroup_is_root(memcg))
6932                css_put_many(&memcg->css, nr_entries);
6933}
6934
6935/**
6936 * mem_cgroup_try_charge_swap - try charging swap space for a page
6937 * @page: page being added to swap
6938 * @entry: swap entry to charge
6939 *
6940 * Try to charge @page's memcg for the swap space at @entry.
6941 *
6942 * Returns 0 on success, -ENOMEM on failure.
6943 */
6944int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
6945{
6946        unsigned int nr_pages = hpage_nr_pages(page);
6947        struct page_counter *counter;
6948        struct mem_cgroup *memcg;
6949        unsigned short oldid;
6950
6951        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
6952                return 0;
6953
6954        memcg = page->mem_cgroup;
6955
6956        /* Readahead page, never charged */
6957        if (!memcg)
6958                return 0;
6959
6960        if (!entry.val) {
6961                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6962                return 0;
6963        }
6964
6965        memcg = mem_cgroup_id_get_online(memcg);
6966
6967        if (!mem_cgroup_is_root(memcg) &&
6968            !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
6969                memcg_memory_event(memcg, MEMCG_SWAP_MAX);
6970                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6971                mem_cgroup_id_put(memcg);
6972                return -ENOMEM;
6973        }
6974
6975        /* Get references for the tail pages, too */
6976        if (nr_pages > 1)
6977                mem_cgroup_id_get_many(memcg, nr_pages - 1);
6978        oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
6979        VM_BUG_ON_PAGE(oldid, page);
6980        mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
6981
6982        return 0;
6983}
6984
6985/**
6986 * mem_cgroup_uncharge_swap - uncharge swap space
6987 * @entry: swap entry to uncharge
6988 * @nr_pages: the amount of swap space to uncharge
6989 */
6990void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
6991{
6992        struct mem_cgroup *memcg;
6993        unsigned short id;
6994
6995        if (!do_swap_account)
6996                return;
6997
6998        id = swap_cgroup_record(entry, 0, nr_pages);
6999        rcu_read_lock();
7000        memcg = mem_cgroup_from_id(id);

7001        if (memcg) {
7002                if (!mem_cgroup_is_root(memcg)) {
7003                        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7004                                page_counter_uncharge(&memcg->swap, nr_pages);
7005                        else
7006                                page_counter_uncharge(&memcg->memsw, nr_pages);
7007                }
7008                mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7009                mem_cgroup_id_put_many(memcg, nr_pages);
7010        }
7011        rcu_read_unlock();
7012}
7013
7014long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7015{
7016        long nr_swap_pages = get_nr_swap_pages();
7017
7018        if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7019                return nr_swap_pages;
7020        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7021                nr_swap_pages = min_t(long, nr_swap_pages,
7022                                      READ_ONCE(memcg->swap.max) -
7023                                      page_counter_read(&memcg->swap));
7024        return nr_swap_pages;
7025}
7026
7027bool mem_cgroup_swap_full(struct page *page)
7028{
7029        struct mem_cgroup *memcg;
7030
7031        VM_BUG_ON_PAGE(!PageLocked(page), page);
7032
7033        if (vm_swap_full())
7034                return true;
7035        if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7036                return false;
7037
7038        memcg = page->mem_cgroup;
7039        if (!memcg)
7040                return false;
7041
7042        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7043                if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7044                        return true;
7045
7046        return false;
7047}
7048
7049/* for remember boot option*/
7050#ifdef CONFIG_MEMCG_SWAP_ENABLED
7051static int really_do_swap_account __initdata = 1;
7052#else
7053static int really_do_swap_account __initdata;
7054#endif
7055
7056static int __init enable_swap_account(char *s)
7057{
7058        if (!strcmp(s, "1"))
7059                really_do_swap_account = 1;
7060        else if (!strcmp(s, "0"))
7061                really_do_swap_account = 0;
7062        return 1;
7063}
7064__setup("swapaccount=", enable_swap_account);
7065
7066static u64 swap_current_read(struct cgroup_subsys_state *css,
7067                             struct cftype *cft)
7068{
7069        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7070
7071        return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7072}
7073
7074static int swap_max_show(struct seq_file *m, void *v)
7075{
7076        return seq_puts_memcg_tunable(m,
7077                READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7078}
7079
7080static ssize_t swap_max_write(struct kernfs_open_file *of,
7081                              char *buf, size_t nbytes, loff_t off)
7082{
7083        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7084        unsigned long max;
7085        int err;
7086
7087        buf = strstrip(buf);
7088        err = page_counter_memparse(buf, "max", &max);
7089        if (err)
7090                return err;
7091
7092        xchg(&memcg->swap.max, max);
7093
7094        return nbytes;
7095}
7096
7097static int swap_events_show(struct seq_file *m, void *v)
7098{
7099        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7100
7101        seq_printf(m, "max %lu\n",
7102                   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7103        seq_printf(m, "fail %lu\n",
7104                   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7105
7106        return 0;
7107}
7108
7109static struct cftype swap_files[] = {
7110        {
7111                .name = "swap.current",
7112                .flags = CFTYPE_NOT_ON_ROOT,
7113                .read_u64 = swap_current_read,
7114        },
7115        {
7116                .name = "swap.max",
7117                .flags = CFTYPE_NOT_ON_ROOT,
7118                .seq_show = swap_max_show,
7119                .write = swap_max_write,
7120        },
7121        {
7122                .name = "swap.events",
7123                .flags = CFTYPE_NOT_ON_ROOT,
7124                .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7125                .seq_show = swap_events_show,
7126        },
7127        { }     /* terminate */
7128};
7129
7130static struct cftype memsw_cgroup_files[] = {
7131        {
7132                .name = "memsw.usage_in_bytes",
7133                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7134                .read_u64 = mem_cgroup_read_u64,
7135        },
7136        {
7137                .name = "memsw.max_usage_in_bytes",
7138                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7139                .write = mem_cgroup_reset,
7140                .read_u64 = mem_cgroup_read_u64,
7141        },
7142        {
7143                .name = "memsw.limit_in_bytes",
7144                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7145                .write = mem_cgroup_write,
7146                .read_u64 = mem_cgroup_read_u64,
7147        },
7148        {
7149                .name = "memsw.failcnt",
7150                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7151                .write = mem_cgroup_reset,
7152                .read_u64 = mem_cgroup_read_u64,
7153        },
7154        { },    /* terminate */
7155};
7156
7157static int __init mem_cgroup_swap_init(void)
7158{
7159        if (!mem_cgroup_disabled() && really_do_swap_account) {
7160                do_swap_account = 1;
7161                WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
7162                                               swap_files));
7163                WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
7164                                                  memsw_cgroup_files));
7165        }
7166        return 0;
7167}
7168subsys_initcall(mem_cgroup_swap_init);
7169
7170#endif /* CONFIG_MEMCG_SWAP */
7171