linux/mm/memcontrol.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* memcontrol.c - Memory Controller
   3 *
   4 * Copyright IBM Corporation, 2007
   5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   6 *
   7 * Copyright 2007 OpenVZ SWsoft Inc
   8 * Author: Pavel Emelianov <xemul@openvz.org>
   9 *
  10 * Memory thresholds
  11 * Copyright (C) 2009 Nokia Corporation
  12 * Author: Kirill A. Shutemov
  13 *
  14 * Kernel Memory Controller
  15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  16 * Authors: Glauber Costa and Suleiman Souhlal
  17 *
  18 * Native page reclaim
  19 * Charge lifetime sanitation
  20 * Lockless page tracking & accounting
  21 * Unified hierarchy configuration model
  22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  23 */
  24
  25#include <linux/page_counter.h>
  26#include <linux/memcontrol.h>
  27#include <linux/cgroup.h>
  28#include <linux/pagewalk.h>
  29#include <linux/sched/mm.h>
  30#include <linux/shmem_fs.h>
  31#include <linux/hugetlb.h>
  32#include <linux/pagemap.h>
  33#include <linux/vm_event_item.h>
  34#include <linux/smp.h>
  35#include <linux/page-flags.h>
  36#include <linux/backing-dev.h>
  37#include <linux/bit_spinlock.h>
  38#include <linux/rcupdate.h>
  39#include <linux/limits.h>
  40#include <linux/export.h>
  41#include <linux/mutex.h>
  42#include <linux/rbtree.h>
  43#include <linux/slab.h>
  44#include <linux/swap.h>
  45#include <linux/swapops.h>
  46#include <linux/spinlock.h>
  47#include <linux/eventfd.h>
  48#include <linux/poll.h>
  49#include <linux/sort.h>
  50#include <linux/fs.h>
  51#include <linux/seq_file.h>
  52#include <linux/vmpressure.h>
  53#include <linux/mm_inline.h>
  54#include <linux/swap_cgroup.h>
  55#include <linux/cpu.h>
  56#include <linux/oom.h>
  57#include <linux/lockdep.h>
  58#include <linux/file.h>
  59#include <linux/tracehook.h>
  60#include <linux/psi.h>
  61#include <linux/seq_buf.h>
  62#include "internal.h"
  63#include <net/sock.h>
  64#include <net/ip.h>
  65#include "slab.h"
  66
  67#include <linux/uaccess.h>
  68
  69#include <trace/events/vmscan.h>
  70
  71struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  72EXPORT_SYMBOL(memory_cgrp_subsys);
  73
  74struct mem_cgroup *root_mem_cgroup __read_mostly;
  75
  76#define MEM_CGROUP_RECLAIM_RETRIES      5
  77
  78/* Socket memory accounting disabled? */
  79static bool cgroup_memory_nosocket;
  80
  81/* Kernel memory accounting disabled? */
  82static bool cgroup_memory_nokmem;
  83
  84/* Whether the swap controller is active */
  85#ifdef CONFIG_MEMCG_SWAP
  86bool cgroup_memory_noswap __read_mostly;
  87#else
  88#define cgroup_memory_noswap            1
  89#endif
  90
  91#ifdef CONFIG_CGROUP_WRITEBACK
  92static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
  93#endif
  94
  95/* Whether legacy memory+swap accounting is active */
  96static bool do_memsw_account(void)
  97{
  98        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
  99}
 100
 101#define THRESHOLDS_EVENTS_TARGET 128
 102#define SOFTLIMIT_EVENTS_TARGET 1024
 103
 104/*
 105 * Cgroups above their limits are maintained in an RB-tree, independent of
 106 * their hierarchy representation.
 107 */
 108
 109struct mem_cgroup_tree_per_node {
 110        struct rb_root rb_root;
 111        struct rb_node *rb_rightmost;
 112        spinlock_t lock;
 113};
 114
 115struct mem_cgroup_tree {
 116        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 117};
 118
 119static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 120
 121/* for OOM */
 122struct mem_cgroup_eventfd_list {
 123        struct list_head list;
 124        struct eventfd_ctx *eventfd;
 125};
 126
 127/*
 128 * cgroup_event represents events which userspace wants to receive.
 129 */
 130struct mem_cgroup_event {
 131        /*
 132         * memcg which the event belongs to.
 133         */
 134        struct mem_cgroup *memcg;
 135        /*
 136         * eventfd to signal userspace about the event.
 137         */
 138        struct eventfd_ctx *eventfd;
 139        /*
 140         * Each of these is stored in a list by the cgroup.
 141         */
 142        struct list_head list;
 143        /*
 144         * The register_event() callback will be used to add a new userspace
 145         * waiter for changes related to this event.  Use eventfd_signal()
 146         * on the eventfd to send a notification to userspace.
 147         */
 148        int (*register_event)(struct mem_cgroup *memcg,
 149                              struct eventfd_ctx *eventfd, const char *args);
 150        /*
 151         * The unregister_event() callback will be called when userspace closes
 152         * the eventfd or when the cgroup is removed.  This callback must be
 153         * set if you want to provide notification functionality.
 154         */
 155        void (*unregister_event)(struct mem_cgroup *memcg,
 156                                 struct eventfd_ctx *eventfd);
 157        /*
 158         * All fields below are needed to unregister the event when
 159         * userspace closes the eventfd.
 160         */
 161        poll_table pt;
 162        wait_queue_head_t *wqh;
 163        wait_queue_entry_t wait;
 164        struct work_struct remove;
 165};
 166
 167static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 168static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 169
 170/* Stuff for moving charges at task migration. */
 171/*
 172 * Types of charges to be moved.
 173 */
 174#define MOVE_ANON       0x1U
 175#define MOVE_FILE       0x2U
 176#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 177
 178/* "mc" and its members are protected by cgroup_mutex */
 179static struct move_charge_struct {
 180        spinlock_t        lock; /* for from, to */
 181        struct mm_struct  *mm;
 182        struct mem_cgroup *from;
 183        struct mem_cgroup *to;
 184        unsigned long flags;
 185        unsigned long precharge;
 186        unsigned long moved_charge;
 187        unsigned long moved_swap;
 188        struct task_struct *moving_task;        /* a task moving charges */
 189        wait_queue_head_t waitq;                /* a waitq for other context */
 190} mc = {
 191        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 192        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 193};
 194
 195/*
 196 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 197 * limit reclaim to prevent infinite loops, if they ever occur.
 198 */
 199#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 200#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 201
 202enum charge_type {
 203        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 204        MEM_CGROUP_CHARGE_TYPE_ANON,
 205        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 206        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 207        NR_CHARGE_TYPE,
 208};
 209
 210/* for encoding cft->private value on file */
 211enum res_type {
 212        _MEM,
 213        _MEMSWAP,
 214        _OOM_TYPE,
 215        _KMEM,
 216        _TCP,
 217};
 218
 219#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 220#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 221#define MEMFILE_ATTR(val)       ((val) & 0xffff)
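    /*
     * cft->private thus carries the res_type in bits 31..16 and the
     * attribute (e.g. OOM_CONTROL below) in bits 15..0.
     */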
 222/* Used for OOM notifier */
 223#define OOM_CONTROL             (0)
 224
 225/*
 226 * Iteration constructs for visiting all cgroups (under a tree).  If
 227 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 228 * be used for reference counting.
 229 */
 230#define for_each_mem_cgroup_tree(iter, root)            \
 231        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 232             iter != NULL;                              \
 233             iter = mem_cgroup_iter(root, iter, NULL))
 234
 235#define for_each_mem_cgroup(iter)                       \
 236        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 237             iter != NULL;                              \
 238             iter = mem_cgroup_iter(NULL, iter, NULL))
 239
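    /*
     * True for tasks that are already dying: OOM victims, tasks with a
     * fatal signal pending, and exiting tasks. Their charges are forced
     * through rather than triggering further OOM handling.
     */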
 240static inline bool should_force_charge(void)
 241{
 242        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 243                (current->flags & PF_EXITING);
 244}
 245
 246/* Some nice accessors for the vmpressure. */
 247struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 248{
 249        if (!memcg)
 250                memcg = root_mem_cgroup;
 251        return &memcg->vmpressure;
 252}
 253
 254struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 255{
 256        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 257}
 258
 259#ifdef CONFIG_MEMCG_KMEM
 260/*
 261 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 262 * The main reason for not using the cgroup id for this is that it works
 263 * better in sparse environments, where we have a lot of memcgs but only
 264 * a few of them are kmem-limited. For instance, with 200 memcgs of which
 265 * only the 200th is kmem-limited, indexing by cgroup id would require a
 266 * 200-entry array.
 267 *
 268 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 269 * will double each time we have to increase it.
 270 */
 271static DEFINE_IDA(memcg_cache_ida);
 272int memcg_nr_cache_ids;
 273
 274/* Protects memcg_nr_cache_ids */
 275static DECLARE_RWSEM(memcg_cache_ids_sem);
 276
 277void memcg_get_cache_ids(void)
 278{
 279        down_read(&memcg_cache_ids_sem);
 280}
 281
 282void memcg_put_cache_ids(void)
 283{
 284        up_read(&memcg_cache_ids_sem);
 285}
 286
 287/*
 288 * MIN_SIZE is different from 1, because we would like to avoid going through
 289 * the alloc/free process all the time. On a small machine, 4 kmem-limited
 290 * cgroups is a reasonable guess. In the future, it could be a parameter or
 291 * tunable, but that is not strictly necessary.
 292 *
 293 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 294 * this constant directly from cgroup, but it is understandable that this is
 295 * better kept as an internal representation in cgroup.c. In any case, the
 296 * cgrp_id space is not getting any smaller, and we don't necessarily have
 297 * to increase ours as well if it increases.
 298 */
 299#define MEMCG_CACHES_MIN_SIZE 4
 300#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 301
 302/*
 303 * A lot of the calls to the cache allocation functions are expected to be
 304 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 305 * conditional on this static branch, we have to allow modules that do
 306 * kmem_cache_alloc and the like to see this symbol as well.
 307 */
 308DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 309EXPORT_SYMBOL(memcg_kmem_enabled_key);
 310
 311struct workqueue_struct *memcg_kmem_cache_wq;
 312#endif
 313
 314static int memcg_shrinker_map_size;
 315static DEFINE_MUTEX(memcg_shrinker_map_mutex);
 316
 317static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
 318{
 319        kvfree(container_of(head, struct memcg_shrinker_map, rcu));
 320}
 321
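    /*
     * Replace each node's shrinker bitmap with a larger one. All bits
     * covered by the old map are set rather than copied, so that a bit
     * set concurrently via memcg_set_shrinker_bit() cannot be lost; the
     * new tail is cleared. The old map is freed after an RCU grace
     * period.
     */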
 322static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 323                                         int size, int old_size)
 324{
 325        struct memcg_shrinker_map *new, *old;
 326        int nid;
 327
 328        lockdep_assert_held(&memcg_shrinker_map_mutex);
 329
 330        for_each_node(nid) {
 331                old = rcu_dereference_protected(
 332                        mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
 333                /* Not yet online memcg */
 334                if (!old)
 335                        return 0;
 336
 337                new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 338                if (!new)
 339                        return -ENOMEM;
 340
 341                /* Set all old bits, clear all new bits */
 342                memset(new->map, (int)0xff, old_size);
 343                memset((void *)new->map + old_size, 0, size - old_size);
 344
 345                rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
 346                call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
 347        }
 348
 349        return 0;
 350}
 351
 352static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
 353{
 354        struct mem_cgroup_per_node *pn;
 355        struct memcg_shrinker_map *map;
 356        int nid;
 357
 358        if (mem_cgroup_is_root(memcg))
 359                return;
 360
 361        for_each_node(nid) {
 362                pn = mem_cgroup_nodeinfo(memcg, nid);
 363                map = rcu_dereference_protected(pn->shrinker_map, true);
 364                if (map)
 365                        kvfree(map);
 366                rcu_assign_pointer(pn->shrinker_map, NULL);
 367        }
 368}
 369
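    /*
     * Allocate the per-node shrinker bitmaps for a freshly created memcg,
     * sized according to the current memcg_shrinker_map_size. The root
     * memcg does not use per-memcg shrinker maps and is skipped.
     */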
 370static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
 371{
 372        struct memcg_shrinker_map *map;
 373        int nid, size, ret = 0;
 374
 375        if (mem_cgroup_is_root(memcg))
 376                return 0;
 377
 378        mutex_lock(&memcg_shrinker_map_mutex);
 379        size = memcg_shrinker_map_size;
 380        for_each_node(nid) {
 381                map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 382                if (!map) {
 383                        memcg_free_shrinker_maps(memcg);
 384                        ret = -ENOMEM;
 385                        break;
 386                }
 387                rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
 388        }
 389        mutex_unlock(&memcg_shrinker_map_mutex);
 390
 391        return ret;
 392}
 393
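    /*
     * Make sure every memcg's bitmaps can hold @new_id: if the required
     * size exceeds the current memcg_shrinker_map_size, grow it and
     * reallocate the maps of every memcg accordingly.
     */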
 394int memcg_expand_shrinker_maps(int new_id)
 395{
 396        int size, old_size, ret = 0;
 397        struct mem_cgroup *memcg;
 398
 399        size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
 400        old_size = memcg_shrinker_map_size;
 401        if (size <= old_size)
 402                return 0;
 403
 404        mutex_lock(&memcg_shrinker_map_mutex);
 405        if (!root_mem_cgroup)
 406                goto unlock;
 407
 408        for_each_mem_cgroup(memcg) {
 409                if (mem_cgroup_is_root(memcg))
 410                        continue;
 411                ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
 412                if (ret) {
 413                        mem_cgroup_iter_break(NULL, memcg);
 414                        goto unlock;
 415                }
 416        }
 417unlock:
 418        if (!ret)
 419                memcg_shrinker_map_size = size;
 420        mutex_unlock(&memcg_shrinker_map_mutex);
 421        return ret;
 422}
 423
 424void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 425{
 426        if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 427                struct memcg_shrinker_map *map;
 428
 429                rcu_read_lock();
 430                map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
 431                /* Pairs with smp mb in shrink_slab() */
 432                smp_mb__before_atomic();
 433                set_bit(shrinker_id, map->map);
 434                rcu_read_unlock();
 435        }
 436}
 437
 438/**
 439 * mem_cgroup_css_from_page - css of the memcg associated with a page
 440 * @page: page of interest
 441 *
 442 * If memcg is bound to the default hierarchy, css of the memcg associated
 443 * with @page is returned.  The returned css remains associated with @page
 444 * until it is released.
 445 *
 446 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 447 * is returned.
 448 */
 449struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 450{
 451        struct mem_cgroup *memcg;
 452
 453        memcg = page->mem_cgroup;
 454
 455        if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 456                memcg = root_mem_cgroup;
 457
 458        return &memcg->css;
 459}
 460
 461/**
 462 * page_cgroup_ino - return inode number of the memcg a page is charged to
 463 * @page: the page
 464 *
 465 * Look up the closest online ancestor of the memory cgroup @page is charged to
 466 * and return its inode number or 0 if @page is not charged to any cgroup. It
 467 * is safe to call this function without holding a reference to @page.
 468 *
 469 * Note, this function is inherently racy, because there is nothing to prevent
 470 * the cgroup inode from getting torn down and potentially reallocated a moment
 471 * after page_cgroup_ino() returns, so it should only be used by callers that
 472 * do not care (such as procfs interfaces).
 473 */
 474ino_t page_cgroup_ino(struct page *page)
 475{
 476        struct mem_cgroup *memcg;
 477        unsigned long ino = 0;
 478
 479        rcu_read_lock();
 480        if (PageSlab(page) && !PageTail(page))
 481                memcg = memcg_from_slab_page(page);
 482        else
 483                memcg = READ_ONCE(page->mem_cgroup);
 484        while (memcg && !(memcg->css.flags & CSS_ONLINE))
 485                memcg = parent_mem_cgroup(memcg);
 486        if (memcg)
 487                ino = cgroup_ino(memcg->css.cgroup);
 488        rcu_read_unlock();
 489        return ino;
 490}
 491
 492static struct mem_cgroup_per_node *
 493mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 494{
 495        int nid = page_to_nid(page);
 496
 497        return memcg->nodeinfo[nid];
 498}
 499
 500static struct mem_cgroup_tree_per_node *
 501soft_limit_tree_node(int nid)
 502{
 503        return soft_limit_tree.rb_tree_per_node[nid];
 504}
 505
 506static struct mem_cgroup_tree_per_node *
 507soft_limit_tree_from_page(struct page *page)
 508{
 509        int nid = page_to_nid(page);
 510
 511        return soft_limit_tree.rb_tree_per_node[nid];
 512}
 513
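    /*
     * Insert @mz into the per-node soft limit tree, ordered by
     * usage_in_excess. The rightmost (largest excess) node is cached so
     * that mem_cgroup_largest_soft_limit_node() can find it without
     * walking the tree. Nothing is done if @mz is already on the tree or
     * the new excess is zero.
     */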
 514static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 515                                         struct mem_cgroup_tree_per_node *mctz,
 516                                         unsigned long new_usage_in_excess)
 517{
 518        struct rb_node **p = &mctz->rb_root.rb_node;
 519        struct rb_node *parent = NULL;
 520        struct mem_cgroup_per_node *mz_node;
 521        bool rightmost = true;
 522
 523        if (mz->on_tree)
 524                return;
 525
 526        mz->usage_in_excess = new_usage_in_excess;
 527        if (!mz->usage_in_excess)
 528                return;
 529        while (*p) {
 530                parent = *p;
 531                mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 532                                        tree_node);
 533                if (mz->usage_in_excess < mz_node->usage_in_excess) {
 534                        p = &(*p)->rb_left;
 535                        rightmost = false;
 536                }
 537
 538                /*
 539                 * We can't avoid mem cgroups that are over their soft
 540                 * limit by the same amount
 541                 */
 542                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 543                        p = &(*p)->rb_right;
 544        }
 545
 546        if (rightmost)
 547                mctz->rb_rightmost = &mz->tree_node;
 548
 549        rb_link_node(&mz->tree_node, parent, p);
 550        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 551        mz->on_tree = true;
 552}
 553
 554static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 555                                         struct mem_cgroup_tree_per_node *mctz)
 556{
 557        if (!mz->on_tree)
 558                return;
 559
 560        if (&mz->tree_node == mctz->rb_rightmost)
 561                mctz->rb_rightmost = rb_prev(&mz->tree_node);
 562
 563        rb_erase(&mz->tree_node, &mctz->rb_root);
 564        mz->on_tree = false;
 565}
 566
 567static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 568                                       struct mem_cgroup_tree_per_node *mctz)
 569{
 570        unsigned long flags;
 571
 572        spin_lock_irqsave(&mctz->lock, flags);
 573        __mem_cgroup_remove_exceeded(mz, mctz);
 574        spin_unlock_irqrestore(&mctz->lock, flags);
 575}
 576
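    /*
     * Number of pages by which @memcg's usage currently exceeds its soft
     * limit, or 0 if it is at or below the limit.
     */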
 577static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 578{
 579        unsigned long nr_pages = page_counter_read(&memcg->memory);
 580        unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 581        unsigned long excess = 0;
 582
 583        if (nr_pages > soft_limit)
 584                excess = nr_pages - soft_limit;
 585
 586        return excess;
 587}
 588
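    /*
     * Reposition @memcg and each of its ancestors in the soft limit tree
     * of @page's node according to their current soft limit excess.
     */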
 589static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 590{
 591        unsigned long excess;
 592        struct mem_cgroup_per_node *mz;
 593        struct mem_cgroup_tree_per_node *mctz;
 594
 595        mctz = soft_limit_tree_from_page(page);
 596        if (!mctz)
 597                return;
 598        /*
 599         * Necessary to update all ancestors when hierarchy is used,
 600         * because their event counter is not touched.
 601         */
 602        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 603                mz = mem_cgroup_page_nodeinfo(memcg, page);
 604                excess = soft_limit_excess(memcg);
 605                /*
 606                 * We have to update the tree if mz is on the RB-tree or
 607                 * the memcg is over its soft limit.
 608                 */
 609                if (excess || mz->on_tree) {
 610                        unsigned long flags;
 611
 612                        spin_lock_irqsave(&mctz->lock, flags);
 613                        /* if on-tree, remove it */
 614                        if (mz->on_tree)
 615                                __mem_cgroup_remove_exceeded(mz, mctz);
 616                        /*
 617                         * Insert again. mz->usage_in_excess will be updated.
 618                         * If excess is 0, no tree ops.
 619                         */
 620                        __mem_cgroup_insert_exceeded(mz, mctz, excess);
 621                        spin_unlock_irqrestore(&mctz->lock, flags);
 622                }
 623        }
 624}
 625
 626static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 627{
 628        struct mem_cgroup_tree_per_node *mctz;
 629        struct mem_cgroup_per_node *mz;
 630        int nid;
 631
 632        for_each_node(nid) {
 633                mz = mem_cgroup_nodeinfo(memcg, nid);
 634                mctz = soft_limit_tree_node(nid);
 635                if (mctz)
 636                        mem_cgroup_remove_exceeded(mz, mctz);
 637        }
 638}
 639
 640static struct mem_cgroup_per_node *
 641__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 642{
 643        struct mem_cgroup_per_node *mz;
 644
 645retry:
 646        mz = NULL;
 647        if (!mctz->rb_rightmost)
 648                goto done;              /* Nothing to reclaim from */
 649
 650        mz = rb_entry(mctz->rb_rightmost,
 651                      struct mem_cgroup_per_node, tree_node);
 652        /*
 653         * Remove the node now, but someone else can add it back;
 654         * we will add it back at the end of reclaim to its correct
 655         * position in the tree.
 656         */
 657        __mem_cgroup_remove_exceeded(mz, mctz);
 658        if (!soft_limit_excess(mz->memcg) ||
 659            !css_tryget(&mz->memcg->css))
 660                goto retry;
 661done:
 662        return mz;
 663}
 664
 665static struct mem_cgroup_per_node *
 666mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 667{
 668        struct mem_cgroup_per_node *mz;
 669
 670        spin_lock_irq(&mctz->lock);
 671        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 672        spin_unlock_irq(&mctz->lock);
 673        return mz;
 674}
 675
 676/**
 677 * __mod_memcg_state - update cgroup memory statistics
 678 * @memcg: the memory cgroup
 679 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 680 * @val: delta to add to the counter, can be negative
 681 */
 682void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 683{
 684        long x;
 685
 686        if (mem_cgroup_disabled())
 687                return;
 688
 689        x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 690        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 691                struct mem_cgroup *mi;
 692
 693                /*
 694                 * Batch local counters to keep them in sync with
 695                 * the hierarchical ones.
 696                 */
 697                __this_cpu_add(memcg->vmstats_local->stat[idx], x);
 698                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 699                        atomic_long_add(x, &mi->vmstats[idx]);
 700                x = 0;
 701        }
 702        __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
 703}
 704
 705static struct mem_cgroup_per_node *
 706parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
 707{
 708        struct mem_cgroup *parent;
 709
 710        parent = parent_mem_cgroup(pn->memcg);
 711        if (!parent)
 712                return NULL;
 713        return mem_cgroup_nodeinfo(parent, nid);
 714}
 715
 716/**
 717 * __mod_lruvec_state - update lruvec memory statistics
 718 * @lruvec: the lruvec
 719 * @idx: the stat item
 720 * @val: delta to add to the counter, can be negative
 721 *
 722 * The lruvec is the intersection of the NUMA node and a cgroup. This
 723 * function updates all three counters that are affected by a
 724 * change of state at this level: per-node, per-cgroup, per-lruvec.
 725 */
 726void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 727                        int val)
 728{
 729        pg_data_t *pgdat = lruvec_pgdat(lruvec);
 730        struct mem_cgroup_per_node *pn;
 731        struct mem_cgroup *memcg;
 732        long x;
 733
 734        /* Update node */
 735        __mod_node_page_state(pgdat, idx, val);
 736
 737        if (mem_cgroup_disabled())
 738                return;
 739
 740        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 741        memcg = pn->memcg;
 742
 743        /* Update memcg */
 744        __mod_memcg_state(memcg, idx, val);
 745
 746        /* Update lruvec */
 747        __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 748
 749        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 750        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
 751                struct mem_cgroup_per_node *pi;
 752
 753                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 754                        atomic_long_add(x, &pi->lruvec_stat[idx]);
 755                x = 0;
 756        }
 757        __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 758}
 759
 760void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 761{
 762        pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 763        struct mem_cgroup *memcg;
 764        struct lruvec *lruvec;
 765
 766        rcu_read_lock();
 767        memcg = mem_cgroup_from_obj(p);
 768
 769        /* Untracked pages have no memcg, no lruvec. Update only the node */
 770        if (!memcg || memcg == root_mem_cgroup) {
 771                __mod_node_page_state(pgdat, idx, val);
 772        } else {
 773                lruvec = mem_cgroup_lruvec(memcg, pgdat);
 774                __mod_lruvec_state(lruvec, idx, val);
 775        }
 776        rcu_read_unlock();
 777}
 778
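    /*
     * Update a memcg counter for the object @p, if @p is charged to a
     * memcg at all; otherwise the update is silently dropped.
     */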
 779void mod_memcg_obj_state(void *p, int idx, int val)
 780{
 781        struct mem_cgroup *memcg;
 782
 783        rcu_read_lock();
 784        memcg = mem_cgroup_from_obj(p);
 785        if (memcg)
 786                mod_memcg_state(memcg, idx, val);
 787        rcu_read_unlock();
 788}
 789
 790/**
 791 * __count_memcg_events - account VM events in a cgroup
 792 * @memcg: the memory cgroup
 793 * @idx: the event item
 794 * @count: the number of events that occurred
 795 */
 796void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 797                          unsigned long count)
 798{
 799        unsigned long x;
 800
 801        if (mem_cgroup_disabled())
 802                return;
 803
 804        x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 805        if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 806                struct mem_cgroup *mi;
 807
 808                /*
 809                 * Batch local counters to keep them in sync with
 810                 * the hierarchical ones.
 811                 */
 812                __this_cpu_add(memcg->vmstats_local->events[idx], x);
 813                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 814                        atomic_long_add(x, &mi->vmevents[idx]);
 815                x = 0;
 816        }
 817        __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
 818}
 819
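    /*
     * Hierarchical event count, including all descendants (see the
     * propagation loop in __count_memcg_events()). memcg_events_local()
     * below sums only this memcg's own per-cpu counters.
     */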
 820static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 821{
 822        return atomic_long_read(&memcg->vmevents[event]);
 823}
 824
 825static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 826{
 827        long x = 0;
 828        int cpu;
 829
 830        for_each_possible_cpu(cpu)
 831                x += per_cpu(memcg->vmstats_local->events[event], cpu);
 832        return x;
 833}
 834
 835static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 836                                         struct page *page,
 837                                         int nr_pages)
 838{
 839        /* A pagein of a large page counts as one event, so ignore the page size */
 840        if (nr_pages > 0)
 841                __count_memcg_events(memcg, PGPGIN, 1);
 842        else {
 843                __count_memcg_events(memcg, PGPGOUT, 1);
 844                nr_pages = -nr_pages; /* for event */
 845        }
 846
 847        __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 848}
 849
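    /*
     * Returns true when enough page events have accumulated on this CPU
     * since the last time @target fired, and re-arms the target for the
     * next interval.
     */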
 850static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 851                                       enum mem_cgroup_events_target target)
 852{
 853        unsigned long val, next;
 854
 855        val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 856        next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 857        /* from time_after() in jiffies.h */
 858        if ((long)(next - val) < 0) {
 859                switch (target) {
 860                case MEM_CGROUP_TARGET_THRESH:
 861                        next = val + THRESHOLDS_EVENTS_TARGET;
 862                        break;
 863                case MEM_CGROUP_TARGET_SOFTLIMIT:
 864                        next = val + SOFTLIMIT_EVENTS_TARGET;
 865                        break;
 866                default:
 867                        break;
 868                }
 869                __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 870                return true;
 871        }
 872        return false;
 873}
 874
 875/*
 876 * Check events in order: thresholds are checked at a finer grain
 877 * than the soft limit tree updates.
 878 */
 879static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 880{
 881        /* threshold event is triggered in finer grain than soft limit */
 882        if (unlikely(mem_cgroup_event_ratelimit(memcg,
 883                                                MEM_CGROUP_TARGET_THRESH))) {
 884                bool do_softlimit;
 885
 886                do_softlimit = mem_cgroup_event_ratelimit(memcg,
 887                                                MEM_CGROUP_TARGET_SOFTLIMIT);
 888                mem_cgroup_threshold(memcg);
 889                if (unlikely(do_softlimit))
 890                        mem_cgroup_update_tree(memcg, page);
 891        }
 892}
 893
 894struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 895{
 896        /*
 897         * mm_update_next_owner() may clear mm->owner to NULL
 898         * if it races with swapoff, page migration, etc.
 899         * So this can be called with p == NULL.
 900         */
 901        if (unlikely(!p))
 902                return NULL;
 903
 904        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 905}
 906EXPORT_SYMBOL(mem_cgroup_from_task);
 907
 908/**
 909 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 910 * @mm: mm from which memcg should be extracted. It can be NULL.
 911 *
 912 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 913 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 914 * returned.
 915 */
 916struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 917{
 918        struct mem_cgroup *memcg;
 919
 920        if (mem_cgroup_disabled())
 921                return NULL;
 922
 923        rcu_read_lock();
 924        do {
 925                /*
 926                 * Page cache insertions can happen without an
 927                 * actual mm context, e.g. during disk probing
 928                 * on boot, loopback IO, acct() writes etc.
 929                 */
 930                if (unlikely(!mm))
 931                        memcg = root_mem_cgroup;
 932                else {
 933                        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 934                        if (unlikely(!memcg))
 935                                memcg = root_mem_cgroup;
 936                }
 937        } while (!css_tryget(&memcg->css));
 938        rcu_read_unlock();
 939        return memcg;
 940}
 941EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 942
 943/**
 944 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 945 * @page: page from which memcg should be extracted.
 946 *
 947 * Obtain a reference on page->memcg and return it if successful. Otherwise
 948 * root_mem_cgroup is returned.
 949 */
 950struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 951{
 952        struct mem_cgroup *memcg = page->mem_cgroup;
 953
 954        if (mem_cgroup_disabled())
 955                return NULL;
 956
 957        rcu_read_lock();
 958        /* Page should not get uncharged and freed memcg under us. */
 959        if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
 960                memcg = root_mem_cgroup;
 961        rcu_read_unlock();
 962        return memcg;
 963}
 964EXPORT_SYMBOL(get_mem_cgroup_from_page);
 965
 966/*
 967 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
 968 */
 969static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
 970{
 971        if (unlikely(current->active_memcg)) {
 972                struct mem_cgroup *memcg;
 973
 974                rcu_read_lock();
 975                /* current->active_memcg must hold a ref. */
 976                if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
 977                        memcg = root_mem_cgroup;
 978                else
 979                        memcg = current->active_memcg;
 980                rcu_read_unlock();
 981                return memcg;
 982        }
 983        return get_mem_cgroup_from_mm(current->mm);
 984}
 985
 986/**
 987 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 988 * @root: hierarchy root
 989 * @prev: previously returned memcg, NULL on first invocation
 990 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 991 *
 992 * Returns references to children of the hierarchy below @root, or
 993 * @root itself, or %NULL after a full round-trip.
 994 *
 995 * Caller must pass the return value in @prev on subsequent
 996 * invocations for reference counting, or use mem_cgroup_iter_break()
 997 * to cancel a hierarchy walk before the round-trip is complete.
 998 *
 999 * Reclaimers can specify a node and a priority level in @reclaim to
1000 * divide up the memcgs in the hierarchy among all concurrent
1001 * reclaimers operating on the same node and priority.
1002 */
1003struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1004                                   struct mem_cgroup *prev,
1005                                   struct mem_cgroup_reclaim_cookie *reclaim)
1006{
1007        struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1008        struct cgroup_subsys_state *css = NULL;
1009        struct mem_cgroup *memcg = NULL;
1010        struct mem_cgroup *pos = NULL;
1011
1012        if (mem_cgroup_disabled())
1013                return NULL;
1014
1015        if (!root)
1016                root = root_mem_cgroup;
1017
1018        if (prev && !reclaim)
1019                pos = prev;
1020
1021        if (!root->use_hierarchy && root != root_mem_cgroup) {
1022                if (prev)
1023                        goto out;
1024                return root;
1025        }
1026
1027        rcu_read_lock();
1028
1029        if (reclaim) {
1030                struct mem_cgroup_per_node *mz;
1031
1032                mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1033                iter = &mz->iter;
1034
1035                if (prev && reclaim->generation != iter->generation)
1036                        goto out_unlock;
1037
1038                while (1) {
1039                        pos = READ_ONCE(iter->position);
1040                        if (!pos || css_tryget(&pos->css))
1041                                break;
1042                        /*
1043                         * css reference reached zero, so iter->position will
1044                         * be cleared by ->css_released. However, we should not
1045                         * rely on this happening soon, because ->css_released
1046                         * is called from a work queue, and by busy-waiting we
1047                         * might block it. So we clear iter->position right
1048                         * away.
1049                         */
1050                        (void)cmpxchg(&iter->position, pos, NULL);
1051                }
1052        }
1053
1054        if (pos)
1055                css = &pos->css;
1056
1057        for (;;) {
1058                css = css_next_descendant_pre(css, &root->css);
1059                if (!css) {
1060                        /*
1061                         * Reclaimers share the hierarchy walk, and a
1062                         * new one might jump in right at the end of
1063                         * the hierarchy - make sure they see at least
1064                         * one group and restart from the beginning.
1065                         */
1066                        if (!prev)
1067                                continue;
1068                        break;
1069                }
1070
1071                /*
1072                 * Verify the css and acquire a reference.  The root
1073                 * is provided by the caller, so we know it's alive
1074                 * and kicking, and don't take an extra reference.
1075                 */
1076                memcg = mem_cgroup_from_css(css);
1077
1078                if (css == &root->css)
1079                        break;
1080
1081                if (css_tryget(css))
1082                        break;
1083
1084                memcg = NULL;
1085        }
1086
1087        if (reclaim) {
1088                /*
1089                 * The position could have already been updated by a competing
1090                 * thread, so check that the value hasn't changed since we read
1091                 * it to avoid reclaiming from the same cgroup twice.
1092                 */
1093                (void)cmpxchg(&iter->position, pos, memcg);
1094
1095                if (pos)
1096                        css_put(&pos->css);
1097
1098                if (!memcg)
1099                        iter->generation++;
1100                else if (!prev)
1101                        reclaim->generation = iter->generation;
1102        }
1103
1104out_unlock:
1105        rcu_read_unlock();
1106out:
1107        if (prev && prev != root)
1108                css_put(&prev->css);
1109
1110        return memcg;
1111}
1112
1113/**
1114 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1115 * @root: hierarchy root
1116 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1117 */
1118void mem_cgroup_iter_break(struct mem_cgroup *root,
1119                           struct mem_cgroup *prev)
1120{
1121        if (!root)
1122                root = root_mem_cgroup;
1123        if (prev && prev != root)
1124                css_put(&prev->css);
1125}
1126
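    /*
     * Drop @dead_memcg from every per-node reclaim iterator cached in
     * @from, so that mem_cgroup_iter() never hands out a stale position.
     */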
1127static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1128                                        struct mem_cgroup *dead_memcg)
1129{
1130        struct mem_cgroup_reclaim_iter *iter;
1131        struct mem_cgroup_per_node *mz;
1132        int nid;
1133
1134        for_each_node(nid) {
1135                mz = mem_cgroup_nodeinfo(from, nid);
1136                iter = &mz->iter;
1137                cmpxchg(&iter->position, dead_memcg, NULL);
1138        }
1139}
1140
1141static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1142{
1143        struct mem_cgroup *memcg = dead_memcg;
1144        struct mem_cgroup *last;
1145
1146        do {
1147                __invalidate_reclaim_iterators(memcg, dead_memcg);
1148                last = memcg;
1149        } while ((memcg = parent_mem_cgroup(memcg)));
1150
1151        /*
1152         * When cgroup1 non-hierarchy mode is used,
1153         * parent_mem_cgroup() does not walk all the way up to the
1154         * cgroup root (root_mem_cgroup). So we have to handle
1155         * dead_memcg from cgroup root separately.
1156         */
1157        if (last != root_mem_cgroup)
1158                __invalidate_reclaim_iterators(root_mem_cgroup,
1159                                                dead_memcg);
1160}
1161
1162/**
1163 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1164 * @memcg: hierarchy root
1165 * @fn: function to call for each task
1166 * @arg: argument passed to @fn
1167 *
1168 * This function iterates over tasks attached to @memcg or to any of its
1169 * descendants and calls @fn for each task. If @fn returns a non-zero
1170 * value, the function breaks the iteration loop and returns the value.
1171 * Otherwise, it will iterate over all tasks and return 0.
1172 *
1173 * This function must not be called for the root memory cgroup.
1174 */
1175int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1176                          int (*fn)(struct task_struct *, void *), void *arg)
1177{
1178        struct mem_cgroup *iter;
1179        int ret = 0;
1180
1181        BUG_ON(memcg == root_mem_cgroup);
1182
1183        for_each_mem_cgroup_tree(iter, memcg) {
1184                struct css_task_iter it;
1185                struct task_struct *task;
1186
1187                css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1188                while (!ret && (task = css_task_iter_next(&it)))
1189                        ret = fn(task, arg);
1190                css_task_iter_end(&it);
1191                if (ret) {
1192                        mem_cgroup_iter_break(memcg, iter);
1193                        break;
1194                }
1195        }
1196        return ret;
1197}
1198
1199/**
1200 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1201 * @page: the page
1202 * @pgdat: pgdat of the page
1203 *
1204 * This function relies on page->mem_cgroup being stable - see the
1205 * access rules in commit_charge().
1206 */
1207struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1208{
1209        struct mem_cgroup_per_node *mz;
1210        struct mem_cgroup *memcg;
1211        struct lruvec *lruvec;
1212
1213        if (mem_cgroup_disabled()) {
1214                lruvec = &pgdat->__lruvec;
1215                goto out;
1216        }
1217
1218        memcg = page->mem_cgroup;
1219        /*
1220         * Swapcache readahead pages are added to the LRU - and
1221         * possibly migrated - before they are charged.
1222         */
1223        if (!memcg)
1224                memcg = root_mem_cgroup;
1225
1226        mz = mem_cgroup_page_nodeinfo(memcg, page);
1227        lruvec = &mz->lruvec;
1228out:
1229        /*
1230         * Since a node can be onlined after the mem_cgroup was created,
1231         * we have to be prepared to initialize lruvec->pgdat here;
1232         * and if offlined then reonlined, we need to reinitialize it.
1233         */
1234        if (unlikely(lruvec->pgdat != pgdat))
1235                lruvec->pgdat = pgdat;
1236        return lruvec;
1237}
1238
1239/**
1240 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1241 * @lruvec: mem_cgroup per zone lru vector
1242 * @lru: index of lru list the page is sitting on
1243 * @zid: zone id of the accounted pages
1244 * @nr_pages: positive when adding or negative when removing
1245 *
1246 * This function must be called under lru_lock, just before a page is added
1247 * to or just after a page is removed from an lru list (that ordering being
1248 * so as to allow it to check that lru_size 0 is consistent with list_empty).
1249 */
1250void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1251                                int zid, int nr_pages)
1252{
1253        struct mem_cgroup_per_node *mz;
1254        unsigned long *lru_size;
1255        long size;
1256
1257        if (mem_cgroup_disabled())
1258                return;
1259
1260        mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1261        lru_size = &mz->lru_zone_size[zid][lru];
1262
1263        if (nr_pages < 0)
1264                *lru_size += nr_pages;
1265
1266        size = *lru_size;
1267        if (WARN_ONCE(size < 0,
1268                "%s(%p, %d, %d): lru_size %ld\n",
1269                __func__, lruvec, lru, nr_pages, size)) {
1270                VM_BUG_ON(1);
1271                *lru_size = 0;
1272        }
1273
1274        if (nr_pages > 0)
1275                *lru_size += nr_pages;
1276}
1277
1278/**
1279 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1280 * @memcg: the memory cgroup
1281 *
1282 * Returns the maximum amount of memory @memcg can be charged with, in
1283 * pages.
1284 */
1285static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1286{
1287        unsigned long margin = 0;
1288        unsigned long count;
1289        unsigned long limit;
1290
1291        count = page_counter_read(&memcg->memory);
1292        limit = READ_ONCE(memcg->memory.max);
1293        if (count < limit)
1294                margin = limit - count;
1295
1296        if (do_memsw_account()) {
1297                count = page_counter_read(&memcg->memsw);
1298                limit = READ_ONCE(memcg->memsw.max);
1299                if (count < limit)
1300                        margin = min(margin, limit - count);
1301                else
1302                        margin = 0;
1303        }
1304
1305        return margin;
1306}
1307
1308/*
1309 * A routine for checking whether "memcg" is under move_account() or not.
1310 *
1311 * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the
1312 * moving cgroups. This is used for waiting at high memory pressure
1313 * caused by a charge move.
1314 */
1315static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1316{
1317        struct mem_cgroup *from;
1318        struct mem_cgroup *to;
1319        bool ret = false;
1320        /*
1321         * Unlike the task_move routines, we access mc.to and mc.from without
1322         * the mutual exclusion of cgroup_mutex. Here, we take the spinlock instead.
1323         */
1324        spin_lock(&mc.lock);
1325        from = mc.from;
1326        to = mc.to;
1327        if (!from)
1328                goto unlock;
1329
1330        ret = mem_cgroup_is_descendant(from, memcg) ||
1331                mem_cgroup_is_descendant(to, memcg);
1332unlock:
1333        spin_unlock(&mc.lock);
1334        return ret;
1335}
1336
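    /*
     * If a charge move that involves @memcg is in flight, sleep until the
     * mover wakes up mc.waitq. Returns true if we waited.
     */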
1337static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1338{
1339        if (mc.moving_task && current != mc.moving_task) {
1340                if (mem_cgroup_under_move(memcg)) {
1341                        DEFINE_WAIT(wait);
1342                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1343                        /* moving charge context might have finished. */
1344                        if (mc.moving_task)
1345                                schedule();
1346                        finish_wait(&mc.waitq, &wait);
1347                        return true;
1348                }
1349        }
1350        return false;
1351}
1352
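    /*
     * Format the memory statistics of @memcg into a freshly allocated
     * buffer. Returns NULL if the allocation fails; otherwise the caller
     * must kfree() the returned buffer.
     */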
1353static char *memory_stat_format(struct mem_cgroup *memcg)
1354{
1355        struct seq_buf s;
1356        int i;
1357
1358        seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1359        if (!s.buffer)
1360                return NULL;
1361
1362        /*
1363         * Provide statistics on the state of the memory subsystem as
1364         * well as cumulative event counters that show past behavior.
1365         *
1366         * This list is ordered following a combination of these gradients:
1367         * 1) generic big picture -> specifics and details
1368         * 2) reflecting userspace activity -> reflecting kernel heuristics
1369         *
1370         * Current memory state:
1371         */
1372
1373        seq_buf_printf(&s, "anon %llu\n",
1374                       (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
1375                       PAGE_SIZE);
1376        seq_buf_printf(&s, "file %llu\n",
1377                       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
1378                       PAGE_SIZE);
1379        seq_buf_printf(&s, "kernel_stack %llu\n",
1380                       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1381                       1024);
1382        seq_buf_printf(&s, "slab %llu\n",
1383                       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1384                             memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1385                       PAGE_SIZE);
1386        seq_buf_printf(&s, "sock %llu\n",
1387                       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1388                       PAGE_SIZE);
1389
1390        seq_buf_printf(&s, "shmem %llu\n",
1391                       (u64)memcg_page_state(memcg, NR_SHMEM) *
1392                       PAGE_SIZE);
1393        seq_buf_printf(&s, "file_mapped %llu\n",
1394                       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1395                       PAGE_SIZE);
1396        seq_buf_printf(&s, "file_dirty %llu\n",
1397                       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1398                       PAGE_SIZE);
1399        seq_buf_printf(&s, "file_writeback %llu\n",
1400                       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1401                       PAGE_SIZE);
1402
1403#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1404        seq_buf_printf(&s, "anon_thp %llu\n",
1405                       (u64)memcg_page_state(memcg, NR_ANON_THPS) *
1406                       HPAGE_PMD_SIZE);
1407#endif
1408
1409        for (i = 0; i < NR_LRU_LISTS; i++)
1410                seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
1411                               (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1412                               PAGE_SIZE);
1413
1414        seq_buf_printf(&s, "slab_reclaimable %llu\n",
1415                       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1416                       PAGE_SIZE);
1417        seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1418                       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1419                       PAGE_SIZE);
1420
1421        /* Accumulated memory events */
1422
1423        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1424                       memcg_events(memcg, PGFAULT));
1425        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1426                       memcg_events(memcg, PGMAJFAULT));
1427
1428        seq_buf_printf(&s, "workingset_refault %lu\n",
1429                       memcg_page_state(memcg, WORKINGSET_REFAULT));
1430        seq_buf_printf(&s, "workingset_activate %lu\n",
1431                       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1432        seq_buf_printf(&s, "workingset_restore %lu\n",
1433                       memcg_page_state(memcg, WORKINGSET_RESTORE));
1434        seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1435                       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1436
1437        seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
1438                       memcg_events(memcg, PGREFILL));
1439        seq_buf_printf(&s, "pgscan %lu\n",
1440                       memcg_events(memcg, PGSCAN_KSWAPD) +
1441                       memcg_events(memcg, PGSCAN_DIRECT));
1442        seq_buf_printf(&s, "pgsteal %lu\n",
1443                       memcg_events(memcg, PGSTEAL_KSWAPD) +
1444                       memcg_events(memcg, PGSTEAL_DIRECT));
1445        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1446                       memcg_events(memcg, PGACTIVATE));
1447        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1448                       memcg_events(memcg, PGDEACTIVATE));
1449        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1450                       memcg_events(memcg, PGLAZYFREE));
1451        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1452                       memcg_events(memcg, PGLAZYFREED));
1453
1454#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1455        seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1456                       memcg_events(memcg, THP_FAULT_ALLOC));
1457        seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1458                       memcg_events(memcg, THP_COLLAPSE_ALLOC));
1459#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1460
1461        /* The above should easily fit into one page */
1462        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1463
1464        return s.buffer;
1465}
1466
1467#define K(x) ((x) << (PAGE_SHIFT-10))
1468/**
1469 * mem_cgroup_print_oom_context: Print OOM information relevant to
1470 * memory controller.
1471 * @memcg: The memory cgroup that went over limit
1472 * @p: Task that is going to be killed
1473 *
1474 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1475 * enabled
1476 */
1477void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1478{
1479        rcu_read_lock();
1480
1481        if (memcg) {
1482                pr_cont(",oom_memcg=");
1483                pr_cont_cgroup_path(memcg->css.cgroup);
1484        } else
1485                pr_cont(",global_oom");
1486        if (p) {
1487                pr_cont(",task_memcg=");
1488                pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1489        }
1490        rcu_read_unlock();
1491}
1492
1493/**
1494 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1495 * memory controller.
1496 * @memcg: The memory cgroup that went over limit
1497 */
1498void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1499{
1500        char *buf;
1501
1502        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1503                K((u64)page_counter_read(&memcg->memory)),
1504                K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1505        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1506                pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1507                        K((u64)page_counter_read(&memcg->swap)),
1508                        K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1509        else {
1510                pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1511                        K((u64)page_counter_read(&memcg->memsw)),
1512                        K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1513                pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1514                        K((u64)page_counter_read(&memcg->kmem)),
1515                        K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1516        }
1517
1518        pr_info("Memory cgroup stats for ");
1519        pr_cont_cgroup_path(memcg->css.cgroup);
1520        pr_cont(":");
1521        buf = memory_stat_format(memcg);
1522        if (!buf)
1523                return;
1524        pr_info("%s", buf);
1525        kfree(buf);
1526}
1527
1528/*
1529 * Return the memory (and swap, if configured) limit for a memcg.
1530 */
1531unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1532{
1533        unsigned long max;
1534
1535        max = READ_ONCE(memcg->memory.max);
1536        if (mem_cgroup_swappiness(memcg)) {
1537                unsigned long memsw_max;
1538                unsigned long swap_max;
1539
1540                memsw_max = memcg->memsw.max;
1541                swap_max = READ_ONCE(memcg->swap.max);
1542                swap_max = min(swap_max, (unsigned long)total_swap_pages);
1543                max = min(max + swap_max, memsw_max);
1544        }
1545        return max;
1546}
1547
1548unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1549{
1550        return page_counter_read(&memcg->memory);
1551}
1552
1553static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1554                                     int order)
1555{
1556        struct oom_control oc = {
1557                .zonelist = NULL,
1558                .nodemask = NULL,
1559                .memcg = memcg,
1560                .gfp_mask = gfp_mask,
1561                .order = order,
1562        };
1563        bool ret;
1564
1565        if (mutex_lock_killable(&oom_lock))
1566                return true;
1567        /*
1568         * A few threads which were not waiting at mutex_lock_killable() can
1569         * fail to bail out. Therefore, check again after holding oom_lock.
1570         */
1571        ret = should_force_charge() || out_of_memory(&oc);
1572        mutex_unlock(&oom_lock);
1573        return ret;
1574}
1575
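/*
 * Walk the subtree under @root_memcg with mem_cgroup_iter() and shrink one
 * victim at a time until the soft limit excess of @root_memcg is gone,
 * nothing can be reclaimed, or enough full passes have been made.  Returns
 * the number of pages reclaimed.
 */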
1576static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1577                                   pg_data_t *pgdat,
1578                                   gfp_t gfp_mask,
1579                                   unsigned long *total_scanned)
1580{
1581        struct mem_cgroup *victim = NULL;
1582        int total = 0;
1583        int loop = 0;
1584        unsigned long excess;
1585        unsigned long nr_scanned;
1586        struct mem_cgroup_reclaim_cookie reclaim = {
1587                .pgdat = pgdat,
1588        };
1589
1590        excess = soft_limit_excess(root_memcg);
1591
1592        while (1) {
1593                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1594                if (!victim) {
1595                        loop++;
1596                        if (loop >= 2) {
1597                                /*
1598                                 * If we have not been able to reclaim
1599                                 * anything, it might be because there are
1600                                 * no reclaimable pages under this hierarchy
1601                                 */
1602                                if (!total)
1603                                        break;
1604                                /*
1605                                 * We want to do more targeted reclaim.
1606                                 * excess >> 2 is not too excessive, so that we don't
1607                                 * reclaim too much, nor too little, so that we don't
1608                                 * keep coming back to reclaim from this cgroup.
1609                                 */
1610                                if (total >= (excess >> 2) ||
1611                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1612                                        break;
1613                        }
1614                        continue;
1615                }
1616                total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1617                                        pgdat, &nr_scanned);
1618                *total_scanned += nr_scanned;
1619                if (!soft_limit_excess(root_memcg))
1620                        break;
1621        }
1622        mem_cgroup_iter_break(root_memcg, victim);
1623        return total;
1624}
1625
1626#ifdef CONFIG_LOCKDEP
1627static struct lockdep_map memcg_oom_lock_dep_map = {
1628        .name = "memcg_oom_lock",
1629};
1630#endif
1631
1632static DEFINE_SPINLOCK(memcg_oom_lock);
1633
1634/*
1635 * Check whether the OOM killer is already running under our hierarchy.
1636 * If someone else is already running it, return false.
1637 */
1638static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1639{
1640        struct mem_cgroup *iter, *failed = NULL;
1641
1642        spin_lock(&memcg_oom_lock);
1643
1644        for_each_mem_cgroup_tree(iter, memcg) {
1645                if (iter->oom_lock) {
1646                        /*
1647                         * This subtree of our hierarchy is already locked,
1648                         * so we cannot grant the lock.
1649                         */
1650                        failed = iter;
1651                        mem_cgroup_iter_break(memcg, iter);
1652                        break;
1653                } else
1654                        iter->oom_lock = true;
1655        }
1656
1657        if (failed) {
1658                /*
1659                 * OK, we failed to lock the whole subtree, so we have to
1660                 * clean up what we already set up, up to the failing subtree.
1661                 */
1662                for_each_mem_cgroup_tree(iter, memcg) {
1663                        if (iter == failed) {
1664                                mem_cgroup_iter_break(memcg, iter);
1665                                break;
1666                        }
1667                        iter->oom_lock = false;
1668                }
1669        } else
1670                mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1671
1672        spin_unlock(&memcg_oom_lock);
1673
1674        return !failed;
1675}
1676
1677static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1678{
1679        struct mem_cgroup *iter;
1680
1681        spin_lock(&memcg_oom_lock);
1682        mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1683        for_each_mem_cgroup_tree(iter, memcg)
1684                iter->oom_lock = false;
1685        spin_unlock(&memcg_oom_lock);
1686}
1687
1688static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1689{
1690        struct mem_cgroup *iter;
1691
1692        spin_lock(&memcg_oom_lock);
1693        for_each_mem_cgroup_tree(iter, memcg)
1694                iter->under_oom++;
1695        spin_unlock(&memcg_oom_lock);
1696}
1697
1698static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1699{
1700        struct mem_cgroup *iter;
1701
1702        /*
1703         * When a new child is created while the hierarchy is under oom,
1704         * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1705         */
1706        spin_lock(&memcg_oom_lock);
1707        for_each_mem_cgroup_tree(iter, memcg)
1708                if (iter->under_oom > 0)
1709                        iter->under_oom--;
1710        spin_unlock(&memcg_oom_lock);
1711}
1712
1713static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1714
1715struct oom_wait_info {
1716        struct mem_cgroup *memcg;
1717        wait_queue_entry_t      wait;
1718};
1719
1720static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1721        unsigned mode, int sync, void *arg)
1722{
1723        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1724        struct mem_cgroup *oom_wait_memcg;
1725        struct oom_wait_info *oom_wait_info;
1726
1727        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1728        oom_wait_memcg = oom_wait_info->memcg;
1729
1730        if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1731            !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1732                return 0;
1733        return autoremove_wake_function(wait, mode, sync, arg);
1734}
1735
1736static void memcg_oom_recover(struct mem_cgroup *memcg)
1737{
1738        /*
1739         * For the following lockless ->under_oom test, the only required
1740         * guarantee is that it must see the state asserted by an OOM when
1741         * this function is called as a result of userland actions
1742         * triggered by the notification of the OOM.  This is trivially
1743         * achieved by invoking mem_cgroup_mark_under_oom() before
1744         * triggering notification.
1745         */
1746        if (memcg && memcg->under_oom)
1747                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1748}
1749
1750enum oom_status {
1751        OOM_SUCCESS,
1752        OOM_FAILED,
1753        OOM_ASYNC,
1754        OOM_SKIPPED
1755};
1756
1757static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1758{
1759        enum oom_status ret;
1760        bool locked;
1761
1762        if (order > PAGE_ALLOC_COSTLY_ORDER)
1763                return OOM_SKIPPED;
1764
1765        memcg_memory_event(memcg, MEMCG_OOM);
1766
1767        /*
1768         * We are in the middle of the charge context here, so we
1769         * don't want to block when potentially sitting on a callstack
1770         * that holds all kinds of filesystem and mm locks.
1771         *
1772         * cgroup1 allows disabling the OOM killer and waiting for outside
1773         * handling until the charge can succeed; remember the context and put
1774         * the task to sleep at the end of the page fault when all locks are
1775         * released.
1776         *
1777         * On the other hand, in-kernel OOM killer allows for an async victim
1778         * memory reclaim (oom_reaper) and that means that we are not solely
1779         * relying on the oom victim to make forward progress and we can
1780         * invoke the oom killer here.
1781         *
1782         * Please note that mem_cgroup_out_of_memory might fail to find a
1783         * victim and then we have to bail out from the charge path.
1784         */
1785        if (memcg->oom_kill_disable) {
1786                if (!current->in_user_fault)
1787                        return OOM_SKIPPED;
1788                css_get(&memcg->css);
1789                current->memcg_in_oom = memcg;
1790                current->memcg_oom_gfp_mask = mask;
1791                current->memcg_oom_order = order;
1792
1793                return OOM_ASYNC;
1794        }
1795
1796        mem_cgroup_mark_under_oom(memcg);
1797
1798        locked = mem_cgroup_oom_trylock(memcg);
1799
1800        if (locked)
1801                mem_cgroup_oom_notify(memcg);
1802
1803        mem_cgroup_unmark_under_oom(memcg);
1804        if (mem_cgroup_out_of_memory(memcg, mask, order))
1805                ret = OOM_SUCCESS;
1806        else
1807                ret = OOM_FAILED;
1808
1809        if (locked)
1810                mem_cgroup_oom_unlock(memcg);
1811
1812        return ret;
1813}
1814
1815/**
1816 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1817 * @handle: actually kill/wait or just clean up the OOM state
1818 *
1819 * This has to be called at the end of a page fault if the memcg OOM
1820 * handler was enabled.
1821 *
1822 * Memcg supports userspace OOM handling where failed allocations must
1823 * sleep on a waitqueue until the userspace task resolves the
1824 * situation.  Sleeping directly in the charge context with all kinds
1825 * of locks held is not a good idea, instead we remember an OOM state
1826 * in the task and mem_cgroup_oom_synchronize() has to be called at
1827 * the end of the page fault to complete the OOM handling.
1828 *
1829 * Returns %true if an ongoing memcg OOM situation was detected and
1830 * completed, %false otherwise.
1831 */
1832bool mem_cgroup_oom_synchronize(bool handle)
1833{
1834        struct mem_cgroup *memcg = current->memcg_in_oom;
1835        struct oom_wait_info owait;
1836        bool locked;
1837
1838        /* OOM is global, do not handle */
1839        if (!memcg)
1840                return false;
1841
1842        if (!handle)
1843                goto cleanup;
1844
1845        owait.memcg = memcg;
1846        owait.wait.flags = 0;
1847        owait.wait.func = memcg_oom_wake_function;
1848        owait.wait.private = current;
1849        INIT_LIST_HEAD(&owait.wait.entry);
1850
1851        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1852        mem_cgroup_mark_under_oom(memcg);
1853
1854        locked = mem_cgroup_oom_trylock(memcg);
1855
1856        if (locked)
1857                mem_cgroup_oom_notify(memcg);
1858
1859        if (locked && !memcg->oom_kill_disable) {
1860                mem_cgroup_unmark_under_oom(memcg);
1861                finish_wait(&memcg_oom_waitq, &owait.wait);
1862                mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1863                                         current->memcg_oom_order);
1864        } else {
1865                schedule();
1866                mem_cgroup_unmark_under_oom(memcg);
1867                finish_wait(&memcg_oom_waitq, &owait.wait);
1868        }
1869
1870        if (locked) {
1871                mem_cgroup_oom_unlock(memcg);
1872                /*
1873                 * There is no guarantee that an OOM-lock contender
1874                 * sees the wakeups triggered by the OOM kill
1875                 * uncharges.  Wake any sleepers explicitly.
1876                 */
1877                memcg_oom_recover(memcg);
1878        }
1879cleanup:
1880        current->memcg_in_oom = NULL;
1881        css_put(&memcg->css);
1882        return true;
1883}
1884
1885/**
1886 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1887 * @victim: task to be killed by the OOM killer
1888 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1889 *
1890 * Returns a pointer to a memory cgroup, which has to be cleaned up
1891 * by killing all belonging OOM-killable tasks.
1892 *
1893 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1894 */
1895struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1896                                            struct mem_cgroup *oom_domain)
1897{
1898        struct mem_cgroup *oom_group = NULL;
1899        struct mem_cgroup *memcg;
1900
1901        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1902                return NULL;
1903
1904        if (!oom_domain)
1905                oom_domain = root_mem_cgroup;
1906
1907        rcu_read_lock();
1908
1909        memcg = mem_cgroup_from_task(victim);
1910        if (memcg == root_mem_cgroup)
1911                goto out;
1912
1913        /*
1914         * If the victim task has been asynchronously moved to a different
1915         * memory cgroup, we might end up killing tasks outside oom_domain.
1916         * In this case it's better to ignore memory.group.oom.
1917         */
1918        if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1919                goto out;
1920
1921        /*
1922         * Traverse the memory cgroup hierarchy from the victim task's
1923         * cgroup up to the OOMing cgroup (or root) to find the
1924         * highest-level memory cgroup with oom.group set.
1925         */
1926        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1927                if (memcg->oom_group)
1928                        oom_group = memcg;
1929
1930                if (memcg == oom_domain)
1931                        break;
1932        }
1933
1934        if (oom_group)
1935                css_get(&oom_group->css);
1936out:
1937        rcu_read_unlock();
1938
1939        return oom_group;
1940}
1941
1942void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1943{
1944        pr_info("Tasks in ");
1945        pr_cont_cgroup_path(memcg->css.cgroup);
1946        pr_cont(" are going to be killed due to memory.oom.group set\n");
1947}
1948
1949/**
1950 * lock_page_memcg - lock a page->mem_cgroup binding
1951 * @page: the page
1952 *
1953 * This function protects unlocked LRU pages from being moved to
1954 * another cgroup.
1955 *
1956 * It ensures lifetime of the returned memcg. Caller is responsible
1957 * for the lifetime of the page; __unlock_page_memcg() is available
1958 * when @page might get freed inside the locked section.
1959 */
1960struct mem_cgroup *lock_page_memcg(struct page *page)
1961{
1962        struct page *head = compound_head(page); /* rmap on tail pages */
1963        struct mem_cgroup *memcg;
1964        unsigned long flags;
1965
1966        /*
1967         * The RCU lock is held throughout the transaction.  The fast
1968         * path can get away without acquiring the memcg->move_lock
1969         * because page moving starts with an RCU grace period.
1970         *
1971         * The RCU lock also protects the memcg from being freed when
1972         * the page state that is going to change is the only thing
1973         * preventing the page itself from being freed. E.g. writeback
1974         * doesn't hold a page reference and relies on PG_writeback to
1975         * keep off truncation, migration and so forth.
1976         */
1977        rcu_read_lock();
1978
1979        if (mem_cgroup_disabled())
1980                return NULL;
1981again:
1982        memcg = head->mem_cgroup;
1983        if (unlikely(!memcg))
1984                return NULL;
1985
1986        if (atomic_read(&memcg->moving_account) <= 0)
1987                return memcg;
1988
1989        spin_lock_irqsave(&memcg->move_lock, flags);
1990        if (memcg != head->mem_cgroup) {
1991                spin_unlock_irqrestore(&memcg->move_lock, flags);
1992                goto again;
1993        }
1994
1995        /*
1996         * When charge migration first begins, we can have locked and
1997         * unlocked page stat updates happening concurrently.  Track
1998         * the task that holds the lock for unlock_page_memcg().
1999         */
2000        memcg->move_lock_task = current;
2001        memcg->move_lock_flags = flags;
2002
2003        return memcg;
2004}
2005EXPORT_SYMBOL(lock_page_memcg);
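/*
 * Illustrative usage sketch for lock_page_memcg()/unlock_page_memcg() (not
 * code from this file): callers bracket an unlocked page-state update so
 * the page cannot move to another cgroup in between, e.g.
 *
 *	lock_page_memcg(page);
 *	if (TestClearPageDirty(page))
 *		dec_lruvec_page_state(page, NR_FILE_DIRTY);
 *	unlock_page_memcg(page);
 */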
2006
2007/**
2008 * __unlock_page_memcg - unlock and unpin a memcg
2009 * @memcg: the memcg
2010 *
2011 * Unlock and unpin a memcg returned by lock_page_memcg().
2012 */
2013void __unlock_page_memcg(struct mem_cgroup *memcg)
2014{
2015        if (memcg && memcg->move_lock_task == current) {
2016                unsigned long flags = memcg->move_lock_flags;
2017
2018                memcg->move_lock_task = NULL;
2019                memcg->move_lock_flags = 0;
2020
2021                spin_unlock_irqrestore(&memcg->move_lock, flags);
2022        }
2023
2024        rcu_read_unlock();
2025}
2026
2027/**
2028 * unlock_page_memcg - unlock a page->mem_cgroup binding
2029 * @page: the page
2030 */
2031void unlock_page_memcg(struct page *page)
2032{
2033        struct page *head = compound_head(page);
2034
2035        __unlock_page_memcg(head->mem_cgroup);
2036}
2037EXPORT_SYMBOL(unlock_page_memcg);
2038
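/*
 * Per-cpu charge cache: try_charge() charges the page counters in batches
 * and parks the surplus here via refill_stock(), so that subsequent charges
 * on the same CPU for the same memcg can be served locklessly by
 * consume_stock().  drain_stock() and drain_all_stock() return the cached
 * charges to the page counters.
 */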
2039struct memcg_stock_pcp {
2040        struct mem_cgroup *cached; /* this is never the root cgroup */
2041        unsigned int nr_pages;
2042        struct work_struct work;
2043        unsigned long flags;
2044#define FLUSHING_CACHED_CHARGE  0
2045};
2046static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2047static DEFINE_MUTEX(percpu_charge_mutex);
2048
2049/**
2050 * consume_stock: Try to consume stocked charge on this cpu.
2051 * @memcg: memcg to consume from.
2052 * @nr_pages: how many pages to charge.
2053 *
2054 * The charges will only happen if @memcg matches the current cpu's memcg
2055 * stock, and at least @nr_pages are available in that stock.  Failure to
2056 * service an allocation will refill the stock.
2057 *
2058 * Returns true if successful, false otherwise.
2059 */
2060static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2061{
2062        struct memcg_stock_pcp *stock;
2063        unsigned long flags;
2064        bool ret = false;
2065
2066        if (nr_pages > MEMCG_CHARGE_BATCH)
2067                return ret;
2068
2069        local_irq_save(flags);
2070
2071        stock = this_cpu_ptr(&memcg_stock);
2072        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2073                stock->nr_pages -= nr_pages;
2074                ret = true;
2075        }
2076
2077        local_irq_restore(flags);
2078
2079        return ret;
2080}
2081
2082/*
2083 * Return the cached percpu charges to the page counters and reset the cache.
2084 */
2085static void drain_stock(struct memcg_stock_pcp *stock)
2086{
2087        struct mem_cgroup *old = stock->cached;
2088
2089        if (stock->nr_pages) {
2090                page_counter_uncharge(&old->memory, stock->nr_pages);
2091                if (do_memsw_account())
2092                        page_counter_uncharge(&old->memsw, stock->nr_pages);
2093                css_put_many(&old->css, stock->nr_pages);
2094                stock->nr_pages = 0;
2095        }
2096        stock->cached = NULL;
2097}
2098
2099static void drain_local_stock(struct work_struct *dummy)
2100{
2101        struct memcg_stock_pcp *stock;
2102        unsigned long flags;
2103
2104        /*
2105         * The only protection from memory hotplug vs. drain_stock races is
2106         * that we always operate on local CPU stock here with IRQ disabled
2107         */
2108        local_irq_save(flags);
2109
2110        stock = this_cpu_ptr(&memcg_stock);
2111        drain_stock(stock);
2112        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2113
2114        local_irq_restore(flags);
2115}
2116
2117/*
2118 * Cache charges (nr_pages) in the local per-cpu area.
2119 * They will be consumed by consume_stock() later.
2120 */
2121static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2122{
2123        struct memcg_stock_pcp *stock;
2124        unsigned long flags;
2125
2126        local_irq_save(flags);
2127
2128        stock = this_cpu_ptr(&memcg_stock);
2129        if (stock->cached != memcg) { /* reset if necessary */
2130                drain_stock(stock);
2131                stock->cached = memcg;
2132        }
2133        stock->nr_pages += nr_pages;
2134
2135        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2136                drain_stock(stock);
2137
2138        local_irq_restore(flags);
2139}
2140
2141/*
2142 * Drain all per-CPU charge caches for the given root_memcg and the
2143 * subtree of the hierarchy under it.
2144 */
2145static void drain_all_stock(struct mem_cgroup *root_memcg)
2146{
2147        int cpu, curcpu;
2148
2149        /* If someone's already draining, avoid adding more workers. */
2150        if (!mutex_trylock(&percpu_charge_mutex))
2151                return;
2152        /*
2153         * Notify other cpus that system-wide "drain" is running
2154         * We do not care about races with the cpu hotplug because cpu down
2155         * as well as workers from this path always operate on the local
2156         * per-cpu data. CPU up doesn't touch memcg_stock at all.
2157         */
2158        curcpu = get_cpu();
2159        for_each_online_cpu(cpu) {
2160                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2161                struct mem_cgroup *memcg;
2162                bool flush = false;
2163
2164                rcu_read_lock();
2165                memcg = stock->cached;
2166                if (memcg && stock->nr_pages &&
2167                    mem_cgroup_is_descendant(memcg, root_memcg))
2168                        flush = true;
2169                rcu_read_unlock();
2170
2171                if (flush &&
2172                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2173                        if (cpu == curcpu)
2174                                drain_local_stock(&stock->work);
2175                        else
2176                                schedule_work_on(cpu, &stock->work);
2177                }
2178        }
2179        put_cpu();
2180        mutex_unlock(&percpu_charge_mutex);
2181}
2182
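/*
 * CPU hotplug (dead) callback: drain the dead CPU's charge stock and fold
 * its per-cpu vmstats, lruvec stats and vmevents into the atomic totals of
 * each memcg and its ancestors.
 */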
2183static int memcg_hotplug_cpu_dead(unsigned int cpu)
2184{
2185        struct memcg_stock_pcp *stock;
2186        struct mem_cgroup *memcg, *mi;
2187
2188        stock = &per_cpu(memcg_stock, cpu);
2189        drain_stock(stock);
2190
2191        for_each_mem_cgroup(memcg) {
2192                int i;
2193
2194                for (i = 0; i < MEMCG_NR_STAT; i++) {
2195                        int nid;
2196                        long x;
2197
2198                        x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2199                        if (x)
2200                                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2201                                        atomic_long_add(x, &mi->vmstats[i]);
2202
2203                        if (i >= NR_VM_NODE_STAT_ITEMS)
2204                                continue;
2205
2206                        for_each_node(nid) {
2207                                struct mem_cgroup_per_node *pn;
2208
2209                                pn = mem_cgroup_nodeinfo(memcg, nid);
2210                                x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2211                                if (x)
2212                                        do {
2213                                                atomic_long_add(x, &pn->lruvec_stat[i]);
2214                                        } while ((pn = parent_nodeinfo(pn, nid)));
2215                        }
2216                }
2217
2218                for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2219                        long x;
2220
2221                        x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2222                        if (x)
2223                                for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2224                                        atomic_long_add(x, &mi->vmevents[i]);
2225                }
2226        }
2227
2228        return 0;
2229}
2230
2231static void reclaim_high(struct mem_cgroup *memcg,
2232                         unsigned int nr_pages,
2233                         gfp_t gfp_mask)
2234{
2235        do {
2236                if (page_counter_read(&memcg->memory) <=
2237                    READ_ONCE(memcg->memory.high))
2238                        continue;
2239                memcg_memory_event(memcg, MEMCG_HIGH);
2240                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2241        } while ((memcg = parent_mem_cgroup(memcg)) &&
2242                 !mem_cgroup_is_root(memcg));
2243}
2244
2245static void high_work_func(struct work_struct *work)
2246{
2247        struct mem_cgroup *memcg;
2248
2249        memcg = container_of(work, struct mem_cgroup, high_work);
2250        reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2251}
2252
2253/*
2254 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2255 * enough to still cause a significant slowdown in most cases, while still
2256 * allowing diagnostics and tracing to proceed without becoming stuck.
2257 */
2258#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2259
2260/*
2261 * When calculating the delay, we use these on either side of the exponentiation to
2262 * maintain precision and scale to a reasonable number of jiffies (see the table
2263 * below).
2264 *
2265 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2266 *   overage ratio to a delay.
2267 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2268 *   proposed penalty in order to reduce it to a reasonable number of jiffies, and
2269 *   to produce a reasonable delay curve.
2270 *
2271 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2272 * reasonable delay curve compared to precision-adjusted overage, not
2273 * penalising heavily at first, but still making sure that growth beyond the
2274 * limit penalises misbehaving cgroups by slowing them down exponentially. For
2275 * example, with a high of 100 megabytes:
2276 *
2277 *  +-------+------------------------+
2278 *  | usage | time to allocate in ms |
2279 *  +-------+------------------------+
2280 *  | 100M  |                      0 |
2281 *  | 101M  |                      6 |
2282 *  | 102M  |                     25 |
2283 *  | 103M  |                     57 |
2284 *  | 104M  |                    102 |
2285 *  | 105M  |                    159 |
2286 *  | 106M  |                    230 |
2287 *  | 107M  |                    313 |
2288 *  | 108M  |                    409 |
2289 *  | 109M  |                    518 |
2290 *  | 110M  |                    639 |
2291 *  | 111M  |                    774 |
2292 *  | 112M  |                    921 |
2293 *  | 113M  |                   1081 |
2294 *  | 114M  |                   1254 |
2295 *  | 115M  |                   1439 |
2296 *  | 116M  |                   1638 |
2297 *  | 117M  |                   1849 |
2298 *  | 118M  |                   2000 |
2299 *  | 119M  |                   2000 |
2300 *  | 120M  |                   2000 |
2301 *  +-------+------------------------+
2302 */
2303#define MEMCG_DELAY_PRECISION_SHIFT 20
2304#define MEMCG_DELAY_SCALING_SHIFT 14
2305
2306static u64 calculate_overage(unsigned long usage, unsigned long high)
2307{
2308        u64 overage;
2309
2310        if (usage <= high)
2311                return 0;
2312
2313        /*
2314         * Prevent division by 0 in overage calculation by acting as if
2315         * it was a threshold of 1 page
2316         */
2317        high = max(high, 1UL);
2318
2319        overage = usage - high;
2320        overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2321        return div64_u64(overage, high);
2322}
2323
2324static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2325{
2326        u64 overage, max_overage = 0;
2327
2328        do {
2329                overage = calculate_overage(page_counter_read(&memcg->memory),
2330                                            READ_ONCE(memcg->memory.high));
2331                max_overage = max(overage, max_overage);
2332        } while ((memcg = parent_mem_cgroup(memcg)) &&
2333                 !mem_cgroup_is_root(memcg));
2334
2335        return max_overage;
2336}
2337
2338static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2339{
2340        u64 overage, max_overage = 0;
2341
2342        do {
2343                overage = calculate_overage(page_counter_read(&memcg->swap),
2344                                            READ_ONCE(memcg->swap.high));
2345                if (overage)
2346                        memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2347                max_overage = max(overage, max_overage);
2348        } while ((memcg = parent_mem_cgroup(memcg)) &&
2349                 !mem_cgroup_is_root(memcg));
2350
2351        return max_overage;
2352}
2353
2354/*
2355 * Get the number of jiffies for which we should penalise a mischievous cgroup that
2356 * is exceeding its memory.high, checking both it and its ancestors.
2357 */
2358static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2359                                          unsigned int nr_pages,
2360                                          u64 max_overage)
2361{
2362        unsigned long penalty_jiffies;
2363
2364        if (!max_overage)
2365                return 0;
2366
2367        /*
2368         * We use overage compared to memory.high to calculate the number of
2369         * jiffies to sleep (penalty_jiffies). Ideally this value should be
2370         * fairly lenient on small overages, and increasingly harsh when the
2371         * memcg in question makes it clear that it has no intention of stopping
2372         * its crazy behaviour, so we exponentially increase the delay based on
2373         * overage amount.
2374         */
2375        penalty_jiffies = max_overage * max_overage * HZ;
2376        penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2377        penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2378
2379        /*
2380         * Factor in the task's own contribution to the overage, such that four
2381         * N-sized allocations are throttled approximately the same as one
2382         * 4N-sized allocation.
2383         *
2384         * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2385         * larger the current charge batch is than that.
2386         */
2387        return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2388}
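/*
 * Worked example for the math above (illustrative only, assuming HZ == 1000
 * and a charge of exactly MEMCG_CHARGE_BATCH pages): with memory.high at
 * 100M and usage at 110M, calculate_overage() returns about
 * 0.1 << MEMCG_DELAY_PRECISION_SHIFT ~= 104857, and
 * 104857 * 104857 * 1000 >> (20 + 14) ~= 640 jiffies, roughly the 639ms
 * listed for 110M in the table above, before the clamp to
 * MEMCG_MAX_HIGH_DELAY_JIFFIES in mem_cgroup_handle_over_high().
 */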
2389
2390/*
2391 * Scheduled by try_charge() to be executed from the userland return path
2392 * and reclaims memory over the high limit.
2393 */
2394void mem_cgroup_handle_over_high(void)
2395{
2396        unsigned long penalty_jiffies;
2397        unsigned long pflags;
2398        unsigned int nr_pages = current->memcg_nr_pages_over_high;
2399        struct mem_cgroup *memcg;
2400
2401        if (likely(!nr_pages))
2402                return;
2403
2404        memcg = get_mem_cgroup_from_mm(current->mm);
2405        reclaim_high(memcg, nr_pages, GFP_KERNEL);
2406        current->memcg_nr_pages_over_high = 0;
2407
2408        /*
2409         * memory.high is breached and reclaim is unable to keep up. Throttle
2410         * allocators proactively to slow down excessive growth.
2411         */
2412        penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2413                                               mem_find_max_overage(memcg));
2414
2415        penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2416                                                swap_find_max_overage(memcg));
2417
2418        /*
2419         * Clamp the max delay per usermode return so as to still keep the
2420         * application moving forwards and also permit diagnostics, albeit
2421         * extremely slowly.
2422         */
2423        penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2424
2425        /*
2426         * Don't sleep if the amount of jiffies this memcg owes us is so low
2427         * that it's not even worth doing, in an attempt to be nice to those who
2428         * go only a small amount over their memory.high value and maybe haven't
2429         * been aggressively reclaimed enough yet.
2430         */
2431        if (penalty_jiffies <= HZ / 100)
2432                goto out;
2433
2434        /*
2435         * If we exit early, we're guaranteed to die (since
2436         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2437         * need to account for any ill-begotten jiffies to pay them off later.
2438         */
2439        psi_memstall_enter(&pflags);
2440        schedule_timeout_killable(penalty_jiffies);
2441        psi_memstall_leave(&pflags);
2442
2443out:
2444        css_put(&memcg->css);
2445}
2446
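/*
 * try_charge() below implements the core charge path: consume the per-cpu
 * stock if possible, otherwise charge the page counters in batches of
 * MEMCG_CHARGE_BATCH pages, falling back to direct reclaim, stock draining
 * and finally the memcg OOM killer when the limit cannot be met.  The
 * comments inside the function describe each step of the retry loop.
 */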
2447static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2448                      unsigned int nr_pages)
2449{
2450        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2451        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2452        struct mem_cgroup *mem_over_limit;
2453        struct page_counter *counter;
2454        unsigned long nr_reclaimed;
2455        bool may_swap = true;
2456        bool drained = false;
2457        enum oom_status oom_status;
2458
2459        if (mem_cgroup_is_root(memcg))
2460                return 0;
2461retry:
2462        if (consume_stock(memcg, nr_pages))
2463                return 0;
2464
2465        if (!do_memsw_account() ||
2466            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2467                if (page_counter_try_charge(&memcg->memory, batch, &counter))
2468                        goto done_restock;
2469                if (do_memsw_account())
2470                        page_counter_uncharge(&memcg->memsw, batch);
2471                mem_over_limit = mem_cgroup_from_counter(counter, memory);
2472        } else {
2473                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2474                may_swap = false;
2475        }
2476
2477        if (batch > nr_pages) {
2478                batch = nr_pages;
2479                goto retry;
2480        }
2481
2482        /*
2483         * Memcg doesn't have a dedicated reserve for atomic
2484         * allocations. But like the global atomic pool, we need to
2485         * put the burden of reclaim on regular allocation requests
2486         * and let these go through as privileged allocations.
2487         */
2488        if (gfp_mask & __GFP_ATOMIC)
2489                goto force;
2490
2491        /*
2492         * Unlike in global OOM situations, memcg is not in a physical
2493         * memory shortage.  Allow dying and OOM-killed tasks to
2494         * bypass the last charges so that they can exit quickly and
2495         * free their memory.
2496         */
2497        if (unlikely(should_force_charge()))
2498                goto force;
2499
2500        /*
2501         * Prevent unbounded recursion when reclaim operations need to
2502         * allocate memory. This might exceed the limits temporarily,
2503         * but we prefer facilitating memory reclaim and getting back
2504         * under the limit over triggering OOM kills in these cases.
2505         */
2506        if (unlikely(current->flags & PF_MEMALLOC))
2507                goto force;
2508
2509        if (unlikely(task_in_memcg_oom(current)))
2510                goto nomem;
2511
2512        if (!gfpflags_allow_blocking(gfp_mask))
2513                goto nomem;
2514
2515        memcg_memory_event(mem_over_limit, MEMCG_MAX);
2516
2517        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2518                                                    gfp_mask, may_swap);
2519
2520        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2521                goto retry;
2522
2523        if (!drained) {
2524                drain_all_stock(mem_over_limit);
2525                drained = true;
2526                goto retry;
2527        }
2528
2529        if (gfp_mask & __GFP_NORETRY)
2530                goto nomem;
2531        /*
2532         * Even though the limit is exceeded at this point, reclaim
2533         * may have been able to free some pages.  Retry the charge
2534         * before killing the task.
2535         *
2536         * Only for regular pages, though: huge pages are rather
2537         * unlikely to succeed so close to the limit, and we fall back
2538         * to regular pages anyway in case of failure.
2539         */
2540        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2541                goto retry;
2542        /*
2543         * At task move, charge accounts can be doubly counted. So, it's
2544         * better to wait until the end of task_move if something is going on.
2545         */
2546        if (mem_cgroup_wait_acct_move(mem_over_limit))
2547                goto retry;
2548
2549        if (nr_retries--)
2550                goto retry;
2551
2552        if (gfp_mask & __GFP_RETRY_MAYFAIL)
2553                goto nomem;
2554
2555        if (gfp_mask & __GFP_NOFAIL)
2556                goto force;
2557
2558        if (fatal_signal_pending(current))
2559                goto force;
2560
2561        /*
2562         * Keep retrying as long as the memcg oom killer is able to make
2563         * forward progress, or bypass the charge if the oom killer
2564         * couldn't make any progress.
2565         */
2566        oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2567                       get_order(nr_pages * PAGE_SIZE));
2568        switch (oom_status) {
2569        case OOM_SUCCESS:
2570                nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2571                goto retry;
2572        case OOM_FAILED:
2573                goto force;
2574        default:
2575                goto nomem;
2576        }
2577nomem:
2578        if (!(gfp_mask & __GFP_NOFAIL))
2579                return -ENOMEM;
2580force:
2581        /*
2582         * The allocation either can't fail or will lead to more memory
2583         * being freed very soon.  Allow memory usage go over the limit
2584         * temporarily by force charging it.
2585         */
2586        page_counter_charge(&memcg->memory, nr_pages);
2587        if (do_memsw_account())
2588                page_counter_charge(&memcg->memsw, nr_pages);
2589        css_get_many(&memcg->css, nr_pages);
2590
2591        return 0;
2592
2593done_restock:
2594        css_get_many(&memcg->css, batch);
2595        if (batch > nr_pages)
2596                refill_stock(memcg, batch - nr_pages);
2597
2598        /*
2599         * If the hierarchy is above the normal consumption range, schedule
2600         * reclaim on returning to userland.  We can perform reclaim here
2601         * if __GFP_RECLAIM but let's always punt for simplicity and so that
2602         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2603         * not recorded as it most likely matches current's and won't
2604         * change in the meantime.  As high limit is checked again before
2605         * reclaim, the cost of mismatch is negligible.
2606         */
2607        do {
2608                bool mem_high, swap_high;
2609
2610                mem_high = page_counter_read(&memcg->memory) >
2611                        READ_ONCE(memcg->memory.high);
2612                swap_high = page_counter_read(&memcg->swap) >
2613                        READ_ONCE(memcg->swap.high);
2614
2615                /* Don't bother a random interrupted task */
2616                if (in_interrupt()) {
2617                        if (mem_high) {
2618                                schedule_work(&memcg->high_work);
2619                                break;
2620                        }
2621                        continue;
2622                }
2623
2624                if (mem_high || swap_high) {
2625                        /*
2626                         * The allocating tasks in this cgroup will need to do
2627                         * reclaim or be throttled to prevent further growth
2628                         * of the memory or swap footprints.
2629                         *
2630                         * Target some best-effort fairness between the tasks,
2631                         * and distribute reclaim work and delay penalties
2632                         * based on how much each task is actually allocating.
2633                         */
2634                        current->memcg_nr_pages_over_high += batch;
2635                        set_notify_resume(current);
2636                        break;
2637                }
2638        } while ((memcg = parent_mem_cgroup(memcg)));
2639
2640        return 0;
2641}
2642
2643#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2644static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2645{
2646        if (mem_cgroup_is_root(memcg))
2647                return;
2648
2649        page_counter_uncharge(&memcg->memory, nr_pages);
2650        if (do_memsw_account())
2651                page_counter_uncharge(&memcg->memsw, nr_pages);
2652
2653        css_put_many(&memcg->css, nr_pages);
2654}
2655#endif
2656
2657static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2658{
2659        VM_BUG_ON_PAGE(page->mem_cgroup, page);
2660        /*
2661         * Any of the following ensures page->mem_cgroup stability:
2662         *
2663         * - the page lock
2664         * - LRU isolation
2665         * - lock_page_memcg()
2666         * - exclusive reference
2667         */
2668        page->mem_cgroup = memcg;
2669}
2670
2671#ifdef CONFIG_MEMCG_KMEM
2672/*
2673 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2674 *
2675 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2676 * cgroup_mutex, etc.
2677 */
2678struct mem_cgroup *mem_cgroup_from_obj(void *p)
2679{
2680        struct page *page;
2681
2682        if (mem_cgroup_disabled())
2683                return NULL;
2684
2685        page = virt_to_head_page(p);
2686
2687        /*
2688         * Slab pages don't have page->mem_cgroup set because corresponding
2689         * kmem caches can be reparented during their lifetime. That's why
2690         * memcg_from_slab_page() should be used instead.
2691         */
2692        if (PageSlab(page))
2693                return memcg_from_slab_page(page);
2694
2695        /* All other pages use page->mem_cgroup */
2696        return page->mem_cgroup;
2697}
2698
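/*
 * Allocate a new kmemcg id.  If it does not fit into the current
 * memcg_caches arrays, grow them first (roughly doubling the size, clamped
 * to [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE]).
 */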
2699static int memcg_alloc_cache_id(void)
2700{
2701        int id, size;
2702        int err;
2703
2704        id = ida_simple_get(&memcg_cache_ida,
2705                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2706        if (id < 0)
2707                return id;
2708
2709        if (id < memcg_nr_cache_ids)
2710                return id;
2711
2712        /*
2713         * There's no space for the new id in memcg_caches arrays,
2714         * so we have to grow them.
2715         */
2716        down_write(&memcg_cache_ids_sem);
2717
2718        size = 2 * (id + 1);
2719        if (size < MEMCG_CACHES_MIN_SIZE)
2720                size = MEMCG_CACHES_MIN_SIZE;
2721        else if (size > MEMCG_CACHES_MAX_SIZE)
2722                size = MEMCG_CACHES_MAX_SIZE;
2723
2724        err = memcg_update_all_caches(size);
2725        if (!err)
2726                err = memcg_update_all_list_lrus(size);
2727        if (!err)
2728                memcg_nr_cache_ids = size;
2729
2730        up_write(&memcg_cache_ids_sem);
2731
2732        if (err) {
2733                ida_simple_remove(&memcg_cache_ida, id);
2734                return err;
2735        }
2736        return id;
2737}
2738
2739static void memcg_free_cache_id(int id)
2740{
2741        ida_simple_remove(&memcg_cache_ida, id);
2742}
2743
2744struct memcg_kmem_cache_create_work {
2745        struct mem_cgroup *memcg;
2746        struct kmem_cache *cachep;
2747        struct work_struct work;
2748};
2749
2750static void memcg_kmem_cache_create_func(struct work_struct *w)
2751{
2752        struct memcg_kmem_cache_create_work *cw =
2753                container_of(w, struct memcg_kmem_cache_create_work, work);
2754        struct mem_cgroup *memcg = cw->memcg;
2755        struct kmem_cache *cachep = cw->cachep;
2756
2757        memcg_create_kmem_cache(memcg, cachep);
2758
2759        css_put(&memcg->css);
2760        kfree(cw);
2761}
2762
2763/*
2764 * Enqueue the creation of a per-memcg kmem_cache.
2765 */
2766static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2767                                               struct kmem_cache *cachep)
2768{
2769        struct memcg_kmem_cache_create_work *cw;
2770
2771        if (!css_tryget_online(&memcg->css))
2772                return;
2773
2774        cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2775        if (!cw) {
2776                css_put(&memcg->css);
2777                return;
2778        }
2779
2780        cw->memcg = memcg;
2781        cw->cachep = cachep;
2782        INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2783
2784        queue_work(memcg_kmem_cache_wq, &cw->work);
2785}
2786
2787static inline bool memcg_kmem_bypass(void)
2788{
2789        if (in_interrupt())
2790                return true;
2791
2792        /* Allow remote memcg charging in kthread contexts. */
2793        if ((!current->mm || (current->flags & PF_KTHREAD)) &&
2794             !current->active_memcg)
2795                return true;
2796        return false;
2797}
2798
2799/**
2800 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2801 * @cachep: the original global kmem cache
2802 *
2803 * Return the kmem_cache we're supposed to use for a slab allocation.
2804 * We try to use the current memcg's version of the cache.
2805 *
2806 * If the cache does not exist yet and we are the first user of it, we
2807 * create it asynchronously in a workqueue and let the current allocation
2808 * go through with the original cache.
2809 *
2810 * This function takes a reference to the cache it returns to assure it
2811 * won't get destroyed while we are working with it. Once the caller is
2812 * done with it, memcg_kmem_put_cache() must be called to release the
2813 * reference.
2814 */
2815struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2816{
2817        struct mem_cgroup *memcg;
2818        struct kmem_cache *memcg_cachep;
2819        struct memcg_cache_array *arr;
2820        int kmemcg_id;
2821
2822        VM_BUG_ON(!is_root_cache(cachep));
2823
2824        if (memcg_kmem_bypass())
2825                return cachep;
2826
2827        rcu_read_lock();
2828
2829        if (unlikely(current->active_memcg))
2830                memcg = current->active_memcg;
2831        else
2832                memcg = mem_cgroup_from_task(current);
2833
2834        if (!memcg || memcg == root_mem_cgroup)
2835                goto out_unlock;
2836
2837        kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2838        if (kmemcg_id < 0)
2839                goto out_unlock;
2840
2841        arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2842
2843        /*
2844         * Make sure we will access the up-to-date value. The code updating
2845         * memcg_caches issues a write barrier to match the data dependency
2846         * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
2847         */
2848        memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2849
2850        /*
2851         * If we are in a safe context (can wait, and not in interrupt
2852         * context), we could be predictable and return right away.
2853         * This would guarantee that the allocation being performed
2854         * already belongs in the new cache.
2855         *
2856         * However, there are some clashes that can arise from locking.
2857         * For instance, because we acquire the slab_mutex while doing
2858         * memcg_create_kmem_cache, this means no further allocation
2859         * could happen with the slab_mutex held. So it's better to
2860         * defer everything.
2861         *
2862         * If the memcg is dying or memcg_cache is about to be released,
2863         * don't bother creating new kmem_caches. Because memcg_cachep
2864         * is ZEROed as the first step of kmem offlining, we don't need
2865         * percpu_ref_tryget_live() here. css_tryget_online() check in
2866         * memcg_schedule_kmem_cache_create() will prevent us from
2867         * creation of a new kmem_cache.
2868         */
2869        if (unlikely(!memcg_cachep))
2870                memcg_schedule_kmem_cache_create(memcg, cachep);
2871        else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2872                cachep = memcg_cachep;
2873out_unlock:
2874        rcu_read_unlock();
2875        return cachep;
2876}
2877
2878/**
2879 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2880 * @cachep: the cache returned by memcg_kmem_get_cache
2881 */
2882void memcg_kmem_put_cache(struct kmem_cache *cachep)
2883{
2884        if (!is_root_cache(cachep))
2885                percpu_ref_put(&cachep->memcg_params.refcnt);
2886}
2887
2888/**
2889 * __memcg_kmem_charge: charge a number of kernel pages to a memcg
2890 * @memcg: memory cgroup to charge
2891 * @gfp: reclaim mode
2892 * @nr_pages: number of pages to charge
2893 *
2894 * Returns 0 on success, an error code on failure.
2895 */
2896int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
2897                        unsigned int nr_pages)
2898{
2899        struct page_counter *counter;
2900        int ret;
2901
2902        ret = try_charge(memcg, gfp, nr_pages);
2903        if (ret)
2904                return ret;
2905
2906        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2907            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2908
2909                /*
2910                 * Enforce __GFP_NOFAIL allocation because callers are not
2911                 * prepared to see failures and likely do not have any failure
2912                 * handling code.
2913                 */
2914                if (gfp & __GFP_NOFAIL) {
2915                        page_counter_charge(&memcg->kmem, nr_pages);
2916                        return 0;
2917                }
2918                cancel_charge(memcg, nr_pages);
2919                return -ENOMEM;
2920        }
2921        return 0;
2922}
2923
2924/**
2925 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
2926 * @memcg: memcg to uncharge
2927 * @nr_pages: number of pages to uncharge
2928 */
2929void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
2930{
2931        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2932                page_counter_uncharge(&memcg->kmem, nr_pages);
2933
2934        page_counter_uncharge(&memcg->memory, nr_pages);
2935        if (do_memsw_account())
2936                page_counter_uncharge(&memcg->memsw, nr_pages);
2937}
2938
2939/**
2940 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2941 * @page: page to charge
2942 * @gfp: reclaim mode
2943 * @order: allocation order
2944 *
2945 * Returns 0 on success, an error code on failure.
2946 */
2947int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2948{
2949        struct mem_cgroup *memcg;
2950        int ret = 0;
2951
2952        if (memcg_kmem_bypass())
2953                return 0;
2954
2955        memcg = get_mem_cgroup_from_current();
2956        if (!mem_cgroup_is_root(memcg)) {
2957                ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
2958                if (!ret) {
2959                        page->mem_cgroup = memcg;
2960                        __SetPageKmemcg(page);
2961                }
2962        }
2963        css_put(&memcg->css);
2964        return ret;
2965}
2966
2967/**
2968 * __memcg_kmem_uncharge_page: uncharge a kmem page
2969 * @page: page to uncharge
2970 * @order: allocation order
2971 */
2972void __memcg_kmem_uncharge_page(struct page *page, int order)
2973{
2974        struct mem_cgroup *memcg = page->mem_cgroup;
2975        unsigned int nr_pages = 1 << order;
2976
2977        if (!memcg)
2978                return;
2979
2980        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2981        __memcg_kmem_uncharge(memcg, nr_pages);
2982        page->mem_cgroup = NULL;
2983
2984        /* slab pages do not have PageKmemcg flag set */
2985        if (PageKmemcg(page))
2986                __ClearPageKmemcg(page);
2987
2988        css_put_many(&memcg->css, nr_pages);
2989}
2990#endif /* CONFIG_MEMCG_KMEM */
2991
2992#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2993
2994/*
2995 * Tail pages do not have their mem_cgroup set, so copy it from the head page.
2996 * We're holding pgdat->lru_lock and migration entries are set up in all mappings.
2997 */
2998void mem_cgroup_split_huge_fixup(struct page *head)
2999{
3000        int i;
3001
3002        if (mem_cgroup_disabled())
3003                return;
3004
3005        for (i = 1; i < HPAGE_PMD_NR; i++)
3006                head[i].mem_cgroup = head->mem_cgroup;
3007}
3008#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3009
3010#ifdef CONFIG_MEMCG_SWAP
3011/**
3012 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3013 * @entry: swap entry to be moved
3014 * @from:  mem_cgroup which the entry is moved from
3015 * @to:  mem_cgroup which the entry is moved to
3016 *
3017 * It succeeds only when the swap_cgroup's record for this entry is the same
3018 * as the mem_cgroup's id of @from.
3019 *
3020 * Returns 0 on success, -EINVAL on failure.
3021 *
3022 * The caller must have charged to @to, IOW, called page_counter_charge() on
3023 * both the memory and memsw counters, and called css_get().
3024 */
3025static int mem_cgroup_move_swap_account(swp_entry_t entry,
3026                                struct mem_cgroup *from, struct mem_cgroup *to)
3027{
3028        unsigned short old_id, new_id;
3029
3030        old_id = mem_cgroup_id(from);
3031        new_id = mem_cgroup_id(to);
3032
3033        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3034                mod_memcg_state(from, MEMCG_SWAP, -1);
3035                mod_memcg_state(to, MEMCG_SWAP, 1);
3036                return 0;
3037        }
3038        return -EINVAL;
3039}
3040#else
3041static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3042                                struct mem_cgroup *from, struct mem_cgroup *to)
3043{
3044        return -EINVAL;
3045}
3046#endif
3047
3048static DEFINE_MUTEX(memcg_max_mutex);
3049
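/*
 * Update memory.max (or memsw.max when @memsw is true) to @max, reclaiming
 * from the cgroup as long as progress is being made.  Fails with -EINVAL if
 * the new value would violate memory.max <= memsw.max, with -EBUSY if
 * reclaim stops making progress, or with -EINTR on a pending signal.
 */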
3050static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3051                                 unsigned long max, bool memsw)
3052{
3053        bool enlarge = false;
3054        bool drained = false;
3055        int ret;
3056        bool limits_invariant;
3057        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3058
3059        do {
3060                if (signal_pending(current)) {
3061                        ret = -EINTR;
3062                        break;
3063                }
3064
3065                mutex_lock(&memcg_max_mutex);
3066                /*
3067                 * Make sure that the new limit (memsw or memory limit) doesn't
3068                 * break our basic invariant rule memory.max <= memsw.max.
3069                 */
3070                limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3071                                           max <= memcg->memsw.max;
3072                if (!limits_invariant) {
3073                        mutex_unlock(&memcg_max_mutex);
3074                        ret = -EINVAL;
3075                        break;
3076                }
3077                if (max > counter->max)
3078                        enlarge = true;
3079                ret = page_counter_set_max(counter, max);
3080                mutex_unlock(&memcg_max_mutex);
3081
3082                if (!ret)
3083                        break;
3084
3085                if (!drained) {
3086                        drain_all_stock(memcg);
3087                        drained = true;
3088                        continue;
3089                }
3090
3091                if (!try_to_free_mem_cgroup_pages(memcg, 1,
3092                                        GFP_KERNEL, !memsw)) {
3093                        ret = -EBUSY;
3094                        break;
3095                }
3096        } while (true);
3097
3098        if (!ret && enlarge)
3099                memcg_oom_recover(memcg);
3100
3101        return ret;
3102}
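/*
 * Illustrative usage sketch (not part of the kernel): the retry loop above
 * backs the cgroup v1 limit files.  Assuming a v1 memory hierarchy mounted
 * at /sys/fs/cgroup/memory and a hypothetical group "job", shrinking it
 * might look like:
 *
 *	echo 512M > /sys/fs/cgroup/memory/job/memory.limit_in_bytes
 *
 * The write returns -EINVAL if it would break memory.max <= memsw.max
 * (e.g. raising memory.limit_in_bytes above memory.memsw.limit_in_bytes),
 * -EBUSY if reclaim cannot bring usage below the new limit, and -EINTR if
 * a signal arrives while retrying.
 */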
3103
3104unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3105                                            gfp_t gfp_mask,
3106                                            unsigned long *total_scanned)
3107{
3108        unsigned long nr_reclaimed = 0;
3109        struct mem_cgroup_per_node *mz, *next_mz = NULL;
3110        unsigned long reclaimed;
3111        int loop = 0;
3112        struct mem_cgroup_tree_per_node *mctz;
3113        unsigned long excess;
3114        unsigned long nr_scanned;
3115
3116        if (order > 0)
3117                return 0;
3118
3119        mctz = soft_limit_tree_node(pgdat->node_id);
3120
3121        /*
3122         * Do not even bother to check the largest node if the root
3123         * is empty. Do it lockless to prevent lock bouncing. Races
3124         * are acceptable as soft limit is best effort anyway.
3125         */
3126        if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3127                return 0;
3128
3129        /*
3130         * This loop can run for a while, especially if mem_cgroups continuously
3131         * keep exceeding their soft limit and putting the system under
3132         * pressure.
3133         */
3134        do {
3135                if (next_mz)
3136                        mz = next_mz;
3137                else
3138                        mz = mem_cgroup_largest_soft_limit_node(mctz);
3139                if (!mz)
3140                        break;
3141
3142                nr_scanned = 0;
3143                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3144                                                    gfp_mask, &nr_scanned);
3145                nr_reclaimed += reclaimed;
3146                *total_scanned += nr_scanned;
3147                spin_lock_irq(&mctz->lock);
3148                __mem_cgroup_remove_exceeded(mz, mctz);
3149
3150                /*
3151                 * If we failed to reclaim anything from this memory cgroup,
3152                 * it is time to move on to the next cgroup.
3153                 */
3154                next_mz = NULL;
3155                if (!reclaimed)
3156                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3157
3158                excess = soft_limit_excess(mz->memcg);
3159                /*
3160                 * One school of thought says that we should not add
3161                 * back the node to the tree if reclaim returns 0.
3162                 * But our reclaim could return 0 simply because, due
3163                 * to priority, we are exposing a smaller subset of
3164                 * memory to reclaim from. Consider this a longer-term
3165                 * TODO.
3166                 */
3167                /* If excess == 0, no tree ops */
3168                __mem_cgroup_insert_exceeded(mz, mctz, excess);
3169                spin_unlock_irq(&mctz->lock);
3170                css_put(&mz->memcg->css);
3171                loop++;
3172                /*
3173                 * Could not reclaim anything and there are no more
3174                 * mem cgroups to try or we seem to be looping without
3175                 * reclaiming anything.
3176                 */
3177                if (!nr_reclaimed &&
3178                        (next_mz == NULL ||
3179                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3180                        break;
3181        } while (!nr_reclaimed);
3182        if (next_mz)
3183                css_put(&next_mz->memcg->css);
3184        return nr_reclaimed;
3185}
3186
3187/*
3188 * Test whether @memcg has children, dead or alive.  Note that this
3189 * function doesn't care whether @memcg has use_hierarchy enabled and
3190 * returns %true if there are child csses according to the cgroup
3191 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
3192 */
3193static inline bool memcg_has_children(struct mem_cgroup *memcg)
3194{
3195        bool ret;
3196
3197        rcu_read_lock();
3198        ret = css_next_child(NULL, &memcg->css);
3199        rcu_read_unlock();
3200        return ret;
3201}
3202
3203/*
3204 * Reclaims as many pages from the given memcg as possible.
3205 *
3206 * Caller is responsible for holding css reference for memcg.
3207 */
3208static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3209{
3210        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3211
3212        /* we call try-to-free pages to make this cgroup empty */
3213        lru_add_drain_all();
3214
3215        drain_all_stock(memcg);
3216
3217        /* try to free all pages in this cgroup */
3218        while (nr_retries && page_counter_read(&memcg->memory)) {
3219                int progress;
3220
3221                if (signal_pending(current))
3222                        return -EINTR;
3223
3224                progress = try_to_free_mem_cgroup_pages(memcg, 1,
3225                                                        GFP_KERNEL, true);
3226                if (!progress) {
3227                        nr_retries--;
3228                        /* maybe some writeback is necessary */
3229                        congestion_wait(BLK_RW_ASYNC, HZ/10);
3230                }
3231
3232        }
3233
3234        return 0;
3235}
3236
3237static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3238                                            char *buf, size_t nbytes,
3239                                            loff_t off)
3240{
3241        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3242
3243        if (mem_cgroup_is_root(memcg))
3244                return -EINVAL;
3245        return mem_cgroup_force_empty(memcg) ?: nbytes;
3246}
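/*
 * Illustrative usage sketch (not part of the kernel): "memory.force_empty"
 * is a legacy v1 knob; writing anything to it (conventionally
 * "echo 0 > memory.force_empty") drains the per-cpu charge stock and calls
 * try_to_free_mem_cgroup_pages() until the charge counter reads zero or
 * the retries are used up.  It is rejected on the root cgroup and returns
 * -EINTR if interrupted by a signal.
 */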
3247
3248static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3249                                     struct cftype *cft)
3250{
3251        return mem_cgroup_from_css(css)->use_hierarchy;
3252}
3253
3254static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3255                                      struct cftype *cft, u64 val)
3256{
3257        int retval = 0;
3258        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3259        struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3260
3261        if (memcg->use_hierarchy == val)
3262                return 0;
3263
3264        /*
3265         * If parent's use_hierarchy is set, we can't make any modifications
3266         * in the child subtrees. If it is unset, then the change can
3267         * occur, provided the current cgroup has no children.
3268         *
3269         * For the root cgroup, parent_memcg is NULL, and we allow the value
3270         * to be set if there are no children.
3271         */
3272        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3273                                (val == 1 || val == 0)) {
3274                if (!memcg_has_children(memcg))
3275                        memcg->use_hierarchy = val;
3276                else
3277                        retval = -EBUSY;
3278        } else
3279                retval = -EINVAL;
3280
3281        return retval;
3282}
3283
3284static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3285{
3286        unsigned long val;
3287
3288        if (mem_cgroup_is_root(memcg)) {
3289                val = memcg_page_state(memcg, NR_FILE_PAGES) +
3290                        memcg_page_state(memcg, NR_ANON_MAPPED);
3291                if (swap)
3292                        val += memcg_page_state(memcg, MEMCG_SWAP);
3293        } else {
3294                if (!swap)
3295                        val = page_counter_read(&memcg->memory);
3296                else
3297                        val = page_counter_read(&memcg->memsw);
3298        }
3299        return val;
3300}
3301
3302enum {
3303        RES_USAGE,
3304        RES_LIMIT,
3305        RES_MAX_USAGE,
3306        RES_FAILCNT,
3307        RES_SOFT_LIMIT,
3308};
3309
3310static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3311                               struct cftype *cft)
3312{
3313        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3314        struct page_counter *counter;
3315
3316        switch (MEMFILE_TYPE(cft->private)) {
3317        case _MEM:
3318                counter = &memcg->memory;
3319                break;
3320        case _MEMSWAP:
3321                counter = &memcg->memsw;
3322                break;
3323        case _KMEM:
3324                counter = &memcg->kmem;
3325                break;
3326        case _TCP:
3327                counter = &memcg->tcpmem;
3328                break;
3329        default:
3330                BUG();
3331        }
3332
3333        switch (MEMFILE_ATTR(cft->private)) {
3334        case RES_USAGE:
3335                if (counter == &memcg->memory)
3336                        return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3337                if (counter == &memcg->memsw)
3338                        return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3339                return (u64)page_counter_read(counter) * PAGE_SIZE;
3340        case RES_LIMIT:
3341                return (u64)counter->max * PAGE_SIZE;
3342        case RES_MAX_USAGE:
3343                return (u64)counter->watermark * PAGE_SIZE;
3344        case RES_FAILCNT:
3345                return counter->failcnt;
3346        case RES_SOFT_LIMIT:
3347                return (u64)memcg->soft_limit * PAGE_SIZE;
3348        default:
3349                BUG();
3350        }
3351}
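/*
 * Note on the encoding above (illustrative; see the MEMFILE_* helpers
 * defined earlier in this file): cft->private packs a counter type in the
 * high bits and a RES_* attribute in the low bits, so a cftype initialized
 * with something like MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE) makes this
 * handler pick memcg->memsw and report its usage in bytes.
 */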
3352
3353static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3354{
3355        unsigned long stat[MEMCG_NR_STAT] = {0};
3356        struct mem_cgroup *mi;
3357        int node, cpu, i;
3358
3359        for_each_online_cpu(cpu)
3360                for (i = 0; i < MEMCG_NR_STAT; i++)
3361                        stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3362
3363        for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3364                for (i = 0; i < MEMCG_NR_STAT; i++)
3365                        atomic_long_add(stat[i], &mi->vmstats[i]);
3366
3367        for_each_node(node) {
3368                struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3369                struct mem_cgroup_per_node *pi;
3370
3371                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3372                        stat[i] = 0;
3373
3374                for_each_online_cpu(cpu)
3375                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3376                                stat[i] += per_cpu(
3377                                        pn->lruvec_stat_cpu->count[i], cpu);
3378
3379                for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3380                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3381                                atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3382        }
3383}
3384
3385static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3386{
3387        unsigned long events[NR_VM_EVENT_ITEMS];
3388        struct mem_cgroup *mi;
3389        int cpu, i;
3390
3391        for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3392                events[i] = 0;
3393
3394        for_each_online_cpu(cpu)
3395                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3396                        events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3397                                             cpu);
3398
3399        for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3400                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3401                        atomic_long_add(events[i], &mi->vmevents[i]);
3402}
3403
3404#ifdef CONFIG_MEMCG_KMEM
3405static int memcg_online_kmem(struct mem_cgroup *memcg)
3406{
3407        int memcg_id;
3408
3409        if (cgroup_memory_nokmem)
3410                return 0;
3411
3412        BUG_ON(memcg->kmemcg_id >= 0);
3413        BUG_ON(memcg->kmem_state);
3414
3415        memcg_id = memcg_alloc_cache_id();
3416        if (memcg_id < 0)
3417                return memcg_id;
3418
3419        static_branch_inc(&memcg_kmem_enabled_key);
3420        /*
3421         * A memory cgroup is considered kmem-online as soon as it gets
3422         * A memory cgroup is considered kmem-online as soon as it gets a
3423         * guarantee no one starts accounting before all call sites are
3424         * patched.
3425         */
3426        memcg->kmemcg_id = memcg_id;
3427        memcg->kmem_state = KMEM_ONLINE;
3428        INIT_LIST_HEAD(&memcg->kmem_caches);
3429
3430        return 0;
3431}
3432
3433static void memcg_offline_kmem(struct mem_cgroup *memcg)
3434{
3435        struct cgroup_subsys_state *css;
3436        struct mem_cgroup *parent, *child;
3437        int kmemcg_id;
3438
3439        if (memcg->kmem_state != KMEM_ONLINE)
3440                return;
3441        /*
3442         * Clear the online state before clearing memcg_caches array
3443         * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3444         * guarantees that no cache will be created for this cgroup
3445         * after we are done (see memcg_create_kmem_cache()).
3446         */
3447        memcg->kmem_state = KMEM_ALLOCATED;
3448
3449        parent = parent_mem_cgroup(memcg);
3450        if (!parent)
3451                parent = root_mem_cgroup;
3452
3453        /*
3454         * Deactivate and reparent kmem_caches.
3455         */
3456        memcg_deactivate_kmem_caches(memcg, parent);
3457
3458        kmemcg_id = memcg->kmemcg_id;
3459        BUG_ON(kmemcg_id < 0);
3460
3461        /*
3462         * Change kmemcg_id of this cgroup and all its descendants to the
3463         * parent's id, and then move all entries from this cgroup's list_lrus
3464         * to the parent's. After we have finished, all list_lrus
3465         * corresponding to this cgroup are guaranteed to remain empty. The
3466         * ordering is imposed by list_lru_node->lock taken by
3467         * memcg_drain_all_list_lrus().
3468         */
3469        rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3470        css_for_each_descendant_pre(css, &memcg->css) {
3471                child = mem_cgroup_from_css(css);
3472                BUG_ON(child->kmemcg_id != kmemcg_id);
3473                child->kmemcg_id = parent->kmemcg_id;
3474                if (!memcg->use_hierarchy)
3475                        break;
3476        }
3477        rcu_read_unlock();
3478
3479        memcg_drain_all_list_lrus(kmemcg_id, parent);
3480
3481        memcg_free_cache_id(kmemcg_id);
3482}
3483
3484static void memcg_free_kmem(struct mem_cgroup *memcg)
3485{
3486        /* css_alloc() failed, offlining didn't happen */
3487        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3488                memcg_offline_kmem(memcg);
3489
3490        if (memcg->kmem_state == KMEM_ALLOCATED) {
3491                WARN_ON(!list_empty(&memcg->kmem_caches));
3492                static_branch_dec(&memcg_kmem_enabled_key);
3493        }
3494}
3495#else
3496static int memcg_online_kmem(struct mem_cgroup *memcg)
3497{
3498        return 0;
3499}
3500static void memcg_offline_kmem(struct mem_cgroup *memcg)
3501{
3502}
3503static void memcg_free_kmem(struct mem_cgroup *memcg)
3504{
3505}
3506#endif /* CONFIG_MEMCG_KMEM */
3507
3508static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3509                                 unsigned long max)
3510{
3511        int ret;
3512
3513        mutex_lock(&memcg_max_mutex);
3514        ret = page_counter_set_max(&memcg->kmem, max);
3515        mutex_unlock(&memcg_max_mutex);
3516        return ret;
3517}
3518
3519static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3520{
3521        int ret;
3522
3523        mutex_lock(&memcg_max_mutex);
3524
3525        ret = page_counter_set_max(&memcg->tcpmem, max);
3526        if (ret)
3527                goto out;
3528
3529        if (!memcg->tcpmem_active) {
3530                /*
3531                 * The active flag needs to be written after the static_key
3532                 * update. This is what guarantees that the socket activation
3533                 * function is the last one to run. See mem_cgroup_sk_alloc()
3534                 * for details, and note that we don't mark any socket as
3535                 * belonging to this memcg until that flag is up.
3536                 *
3537                 * We need to do this, because static_keys will span multiple
3538                 * sites, but we can't control their order. If we mark a socket
3539                 * as accounted, but the accounting functions are not patched in
3540                 * yet, we'll lose accounting.
3541                 *
3542                 * We never race with the readers in mem_cgroup_sk_alloc(),
3543                 * because when this value changes, the code to process it is not
3544                 * patched in yet.
3545                 */
3546                static_branch_inc(&memcg_sockets_enabled_key);
3547                memcg->tcpmem_active = true;
3548        }
3549out:
3550        mutex_unlock(&memcg_max_mutex);
3551        return ret;
3552}
3553
3554/*
3555 * The users of this function are the write handlers for
3556 * RES_LIMIT and RES_SOFT_LIMIT.
3557 */
3558static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3559                                char *buf, size_t nbytes, loff_t off)
3560{
3561        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3562        unsigned long nr_pages;
3563        int ret;
3564
3565        buf = strstrip(buf);
3566        ret = page_counter_memparse(buf, "-1", &nr_pages);
3567        if (ret)
3568                return ret;
3569
3570        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3571        case RES_LIMIT:
3572                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3573                        ret = -EINVAL;
3574                        break;
3575                }
3576                switch (MEMFILE_TYPE(of_cft(of)->private)) {
3577                case _MEM:
3578                        ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3579                        break;
3580                case _MEMSWAP:
3581                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3582                        break;
3583                case _KMEM:
3584                        pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3585                                     "Please report your usecase to linux-mm@kvack.org if you "
3586                                     "depend on this functionality.\n");
3587                        ret = memcg_update_kmem_max(memcg, nr_pages);
3588                        break;
3589                case _TCP:
3590                        ret = memcg_update_tcp_max(memcg, nr_pages);
3591                        break;
3592                }
3593                break;
3594        case RES_SOFT_LIMIT:
3595                memcg->soft_limit = nr_pages;
3596                ret = 0;
3597                break;
3598        }
3599        return ret ?: nbytes;
3600}
3601
3602static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3603                                size_t nbytes, loff_t off)
3604{
3605        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3606        struct page_counter *counter;
3607
3608        switch (MEMFILE_TYPE(of_cft(of)->private)) {
3609        case _MEM:
3610                counter = &memcg->memory;
3611                break;
3612        case _MEMSWAP:
3613                counter = &memcg->memsw;
3614                break;
3615        case _KMEM:
3616                counter = &memcg->kmem;
3617                break;
3618        case _TCP:
3619                counter = &memcg->tcpmem;
3620                break;
3621        default:
3622                BUG();
3623        }
3624
3625        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3626        case RES_MAX_USAGE:
3627                page_counter_reset_watermark(counter);
3628                break;
3629        case RES_FAILCNT:
3630                counter->failcnt = 0;
3631                break;
3632        default:
3633                BUG();
3634        }
3635
3636        return nbytes;
3637}
3638
3639static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3640                                        struct cftype *cft)
3641{
3642        return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3643}
3644
3645#ifdef CONFIG_MMU
3646static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3647                                        struct cftype *cft, u64 val)
3648{
3649        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3650
3651        if (val & ~MOVE_MASK)
3652                return -EINVAL;
3653
3654        /*
3655         * No kind of locking is needed in here, because ->can_attach() will
3656         * check this value once at the beginning of the process, and then carry
3657         * on with stale data. This means that changes to this value will only
3658         * affect task migrations starting after the change.
3659         */
3660        memcg->move_charge_at_immigrate = val;
3661        return 0;
3662}
3663#else
3664static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3665                                        struct cftype *cft, u64 val)
3666{
3667        return -ENOSYS;
3668}
3669#endif
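/*
 * Illustrative usage sketch (not part of the kernel): on cgroup v1 the
 * value written to "memory.move_charge_at_immigrate" is a bitmask of the
 * MOVE_* flags defined earlier in this file (anon and file charges), e.g.
 *
 *	echo 3 > memory.move_charge_at_immigrate
 *
 * asks that both anonymous and file charges follow a task when it is
 * migrated into the group; bits outside MOVE_MASK are rejected with
 * -EINVAL.
 */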
3670
3671#ifdef CONFIG_NUMA
3672
3673#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3674#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3675#define LRU_ALL      ((1 << NR_LRU_LISTS) - 1)
3676
3677static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3678                                int nid, unsigned int lru_mask, bool tree)
3679{
3680        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3681        unsigned long nr = 0;
3682        enum lru_list lru;
3683
3684        VM_BUG_ON((unsigned)nid >= nr_node_ids);
3685
3686        for_each_lru(lru) {
3687                if (!(BIT(lru) & lru_mask))
3688                        continue;
3689                if (tree)
3690                        nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3691                else
3692                        nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3693        }
3694        return nr;
3695}
3696
3697static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3698                                             unsigned int lru_mask,
3699                                             bool tree)
3700{
3701        unsigned long nr = 0;
3702        enum lru_list lru;
3703
3704        for_each_lru(lru) {
3705                if (!(BIT(lru) & lru_mask))
3706                        continue;
3707                if (tree)
3708                        nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3709                else
3710                        nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3711        }
3712        return nr;
3713}
3714
3715static int memcg_numa_stat_show(struct seq_file *m, void *v)
3716{
3717        struct numa_stat {
3718                const char *name;
3719                unsigned int lru_mask;
3720        };
3721
3722        static const struct numa_stat stats[] = {
3723                { "total", LRU_ALL },
3724                { "file", LRU_ALL_FILE },
3725                { "anon", LRU_ALL_ANON },
3726                { "unevictable", BIT(LRU_UNEVICTABLE) },
3727        };
3728        const struct numa_stat *stat;
3729        int nid;
3730        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3731
3732        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3733                seq_printf(m, "%s=%lu", stat->name,
3734                           mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3735                                                   false));
3736                for_each_node_state(nid, N_MEMORY)
3737                        seq_printf(m, " N%d=%lu", nid,
3738                                   mem_cgroup_node_nr_lru_pages(memcg, nid,
3739                                                        stat->lru_mask, false));
3740                seq_putc(m, '\n');
3741        }
3742
3743        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3744
3745                seq_printf(m, "hierarchical_%s=%lu", stat->name,
3746                           mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3747                                                   true));
3748                for_each_node_state(nid, N_MEMORY)
3749                        seq_printf(m, " N%d=%lu", nid,
3750                                   mem_cgroup_node_nr_lru_pages(memcg, nid,
3751                                                        stat->lru_mask, true));
3752                seq_putc(m, '\n');
3753        }
3754
3755        return 0;
3756}
3757#endif /* CONFIG_NUMA */
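/*
 * Illustrative output sketch (values made up, format taken from the
 * seq_printf calls above): reading "memory.numa_stat" yields one line per
 * LRU mask, local counts first and hierarchical counts second, all in
 * pages, e.g.
 *
 *	total=2048 N0=1024 N1=1024
 *	file=1536 N0=768 N1=768
 *	anon=512 N0=256 N1=256
 *	unevictable=0 N0=0 N1=0
 *	hierarchical_total=4096 N0=2048 N1=2048
 *	...
 */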
3758
3759static const unsigned int memcg1_stats[] = {
3760        NR_FILE_PAGES,
3761        NR_ANON_MAPPED,
3762#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3763        NR_ANON_THPS,
3764#endif
3765        NR_SHMEM,
3766        NR_FILE_MAPPED,
3767        NR_FILE_DIRTY,
3768        NR_WRITEBACK,
3769        MEMCG_SWAP,
3770};
3771
3772static const char *const memcg1_stat_names[] = {
3773        "cache",
3774        "rss",
3775#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3776        "rss_huge",
3777#endif
3778        "shmem",
3779        "mapped_file",
3780        "dirty",
3781        "writeback",
3782        "swap",
3783};
3784
3785/* Universal VM events cgroup1 shows, original sort order */
3786static const unsigned int memcg1_events[] = {
3787        PGPGIN,
3788        PGPGOUT,
3789        PGFAULT,
3790        PGMAJFAULT,
3791};
3792
3793static int memcg_stat_show(struct seq_file *m, void *v)
3794{
3795        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3796        unsigned long memory, memsw;
3797        struct mem_cgroup *mi;
3798        unsigned int i;
3799
3800        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3801
3802        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3803                unsigned long nr;
3804
3805                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3806                        continue;
3807                nr = memcg_page_state_local(memcg, memcg1_stats[i]);
3808#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3809                if (memcg1_stats[i] == NR_ANON_THPS)
3810                        nr *= HPAGE_PMD_NR;
3811#endif
3812                seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
3813        }
3814
3815        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3816                seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3817                           memcg_events_local(memcg, memcg1_events[i]));
3818
3819        for (i = 0; i < NR_LRU_LISTS; i++)
3820                seq_printf(m, "%s %lu\n", lru_list_name(i),
3821                           memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3822                           PAGE_SIZE);
3823
3824        /* Hierarchical information */
3825        memory = memsw = PAGE_COUNTER_MAX;
3826        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3827                memory = min(memory, READ_ONCE(mi->memory.max));
3828                memsw = min(memsw, READ_ONCE(mi->memsw.max));
3829        }
3830        seq_printf(m, "hierarchical_memory_limit %llu\n",
3831                   (u64)memory * PAGE_SIZE);
3832        if (do_memsw_account())
3833                seq_printf(m, "hierarchical_memsw_limit %llu\n",
3834                           (u64)memsw * PAGE_SIZE);
3835
3836        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3837                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3838                        continue;
3839                seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3840                           (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3841                           PAGE_SIZE);
3842        }
3843
3844        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3845                seq_printf(m, "total_%s %llu\n",
3846                           vm_event_name(memcg1_events[i]),
3847                           (u64)memcg_events(memcg, memcg1_events[i]));
3848
3849        for (i = 0; i < NR_LRU_LISTS; i++)
3850                seq_printf(m, "total_%s %llu\n", lru_list_name(i),
3851                           (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3852                           PAGE_SIZE);
3853
3854#ifdef CONFIG_DEBUG_VM
3855        {
3856                pg_data_t *pgdat;
3857                struct mem_cgroup_per_node *mz;
3858                unsigned long anon_cost = 0;
3859                unsigned long file_cost = 0;
3860
3861                for_each_online_pgdat(pgdat) {
3862                        mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3863
3864                        anon_cost += mz->lruvec.anon_cost;
3865                        file_cost += mz->lruvec.file_cost;
3866                }
3867                seq_printf(m, "anon_cost %lu\n", anon_cost);
3868                seq_printf(m, "file_cost %lu\n", file_cost);
3869        }
3870#endif
3871
3872        return 0;
3873}
3874
3875static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3876                                      struct cftype *cft)
3877{
3878        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3879
3880        return mem_cgroup_swappiness(memcg);
3881}
3882
3883static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3884                                       struct cftype *cft, u64 val)
3885{
3886        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3887
3888        if (val > 100)
3889                return -EINVAL;
3890
3891        if (css->parent)
3892                memcg->swappiness = val;
3893        else
3894                vm_swappiness = val;
3895
3896        return 0;
3897}
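/*
 * Illustrative usage sketch (not part of the kernel): "memory.swappiness"
 * accepts 0-100, e.g. "echo 10 > memory.swappiness" biases reclaim for
 * that group towards page cache.  Writing the root cgroup's file updates
 * the global vm_swappiness instead, as implemented above.
 */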
3898
3899static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3900{
3901        struct mem_cgroup_threshold_ary *t;
3902        unsigned long usage;
3903        int i;
3904
3905        rcu_read_lock();
3906        if (!swap)
3907                t = rcu_dereference(memcg->thresholds.primary);
3908        else
3909                t = rcu_dereference(memcg->memsw_thresholds.primary);
3910
3911        if (!t)
3912                goto unlock;
3913
3914        usage = mem_cgroup_usage(memcg, swap);
3915
3916        /*
3917         * current_threshold points to the threshold just below or equal to usage.
3918         * If that is not the case, a threshold was crossed after the last
3919         * call of __mem_cgroup_threshold().
3920         */
3921        i = t->current_threshold;
3922
3923        /*
3924         * Iterate backward over the array of thresholds starting from
3925         * current_threshold and check if a threshold is crossed.
3926         * If none of the thresholds below usage is crossed, we read
3927         * only one element of the array here.
3928         */
3929        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3930                eventfd_signal(t->entries[i].eventfd, 1);
3931
3932        /* i = current_threshold + 1 */
3933        i++;
3934
3935        /*
3936         * Iterate forward over the array of thresholds starting from
3937         * current_threshold+1 and check if a threshold is crossed.
3938         * If none of the thresholds above usage is crossed, we read
3939         * only one element of the array here.
3940         */
3941        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3942                eventfd_signal(t->entries[i].eventfd, 1);
3943
3944        /* Update current_threshold */
3945        t->current_threshold = i - 1;
3946unlock:
3947        rcu_read_unlock();
3948}
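/*
 * Worked example (illustrative only): with registered thresholds
 * {4M, 8M, 16M} and a previous usage of 6M, current_threshold is 0 (the
 * 4M entry).  If usage rises to 10M, the backward scan signals nothing,
 * the forward scan signals the 8M eventfd, and current_threshold becomes
 * 1.  If usage then drops to 3M, the backward scan signals the 8M and 4M
 * eventfds and current_threshold ends up at -1.
 */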
3949
3950static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3951{
3952        while (memcg) {
3953                __mem_cgroup_threshold(memcg, false);
3954                if (do_memsw_account())
3955                        __mem_cgroup_threshold(memcg, true);
3956
3957                memcg = parent_mem_cgroup(memcg);
3958        }
3959}
3960
3961static int compare_thresholds(const void *a, const void *b)
3962{
3963        const struct mem_cgroup_threshold *_a = a;
3964        const struct mem_cgroup_threshold *_b = b;
3965
3966        if (_a->threshold > _b->threshold)
3967                return 1;
3968
3969        if (_a->threshold < _b->threshold)
3970                return -1;
3971
3972        return 0;
3973}
3974
3975static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3976{
3977        struct mem_cgroup_eventfd_list *ev;
3978
3979        spin_lock(&memcg_oom_lock);
3980
3981        list_for_each_entry(ev, &memcg->oom_notify, list)
3982                eventfd_signal(ev->eventfd, 1);
3983
3984        spin_unlock(&memcg_oom_lock);
3985        return 0;
3986}
3987
3988static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3989{
3990        struct mem_cgroup *iter;
3991
3992        for_each_mem_cgroup_tree(iter, memcg)
3993                mem_cgroup_oom_notify_cb(iter);
3994}
3995
3996static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3997        struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3998{
3999        struct mem_cgroup_thresholds *thresholds;
4000        struct mem_cgroup_threshold_ary *new;
4001        unsigned long threshold;
4002        unsigned long usage;
4003        int i, size, ret;
4004
4005        ret = page_counter_memparse(args, "-1", &threshold);
4006        if (ret)
4007                return ret;
4008
4009        mutex_lock(&memcg->thresholds_lock);
4010
4011        if (type == _MEM) {
4012                thresholds = &memcg->thresholds;
4013                usage = mem_cgroup_usage(memcg, false);
4014        } else if (type == _MEMSWAP) {
4015                thresholds = &memcg->memsw_thresholds;
4016                usage = mem_cgroup_usage(memcg, true);
4017        } else
4018                BUG();
4019
4020        /* Check if a threshold was crossed before adding a new one */
4021        if (thresholds->primary)
4022                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4023
4024        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4025
4026        /* Allocate memory for new array of thresholds */
4027        new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4028        if (!new) {
4029                ret = -ENOMEM;
4030                goto unlock;
4031        }
4032        new->size = size;
4033
4034        /* Copy thresholds (if any) to new array */
4035        if (thresholds->primary) {
4036                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4037                                sizeof(struct mem_cgroup_threshold));
4038        }
4039
4040        /* Add new threshold */
4041        new->entries[size - 1].eventfd = eventfd;
4042        new->entries[size - 1].threshold = threshold;
4043
4044        /* Sort thresholds. Registering a new threshold isn't time-critical */
4045        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4046                        compare_thresholds, NULL);
4047
4048        /* Find current threshold */
4049        new->current_threshold = -1;
4050        for (i = 0; i < size; i++) {
4051                if (new->entries[i].threshold <= usage) {
4052                        /*
4053                         * new->current_threshold will not be used until
4054                         * rcu_assign_pointer(), so it's safe to increment
4055                         * it here.
4056                         */
4057                        ++new->current_threshold;
4058                } else
4059                        break;
4060        }
4061
4062        /* Free old spare buffer and save old primary buffer as spare */
4063        kfree(thresholds->spare);
4064        thresholds->spare = thresholds->primary;
4065
4066        rcu_assign_pointer(thresholds->primary, new);
4067
4068        /* To be sure that nobody uses thresholds */
4069        synchronize_rcu();
4070
4071unlock:
4072        mutex_unlock(&memcg->thresholds_lock);
4073
4074        return ret;
4075}
4076
4077static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4078        struct eventfd_ctx *eventfd, const char *args)
4079{
4080        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4081}
4082
4083static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4084        struct eventfd_ctx *eventfd, const char *args)
4085{
4086        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4087}
4088
4089static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4090        struct eventfd_ctx *eventfd, enum res_type type)
4091{
4092        struct mem_cgroup_thresholds *thresholds;
4093        struct mem_cgroup_threshold_ary *new;
4094        unsigned long usage;
4095        int i, j, size, entries;
4096
4097        mutex_lock(&memcg->thresholds_lock);
4098
4099        if (type == _MEM) {
4100                thresholds = &memcg->thresholds;
4101                usage = mem_cgroup_usage(memcg, false);
4102        } else if (type == _MEMSWAP) {
4103                thresholds = &memcg->memsw_thresholds;
4104                usage = mem_cgroup_usage(memcg, true);
4105        } else
4106                BUG();
4107
4108        if (!thresholds->primary)
4109                goto unlock;
4110
4111        /* Check if a threshold was crossed before removing */
4112        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4113
4114        /* Calculate the new number of thresholds */
4115        size = entries = 0;
4116        for (i = 0; i < thresholds->primary->size; i++) {
4117                if (thresholds->primary->entries[i].eventfd != eventfd)
4118                        size++;
4119                else
4120                        entries++;
4121        }
4122
4123        new = thresholds->spare;
4124
4125        /* If no items related to eventfd have been cleared, nothing to do */
4126        if (!entries)
4127                goto unlock;
4128
4129        /* Set thresholds array to NULL if we don't have thresholds */
4130        if (!size) {
4131                kfree(new);
4132                new = NULL;
4133                goto swap_buffers;
4134        }
4135
4136        new->size = size;
4137
4138        /* Copy thresholds and find current threshold */
4139        new->current_threshold = -1;
4140        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4141                if (thresholds->primary->entries[i].eventfd == eventfd)
4142                        continue;
4143
4144                new->entries[j] = thresholds->primary->entries[i];
4145                if (new->entries[j].threshold <= usage) {
4146                        /*
4147                         * new->current_threshold will not be used
4148                         * until rcu_assign_pointer(), so it's safe to increment
4149                         * it here.
4150                         */
4151                        ++new->current_threshold;
4152                }
4153                j++;
4154        }
4155
4156swap_buffers:
4157        /* Swap primary and spare array */
4158        thresholds->spare = thresholds->primary;
4159
4160        rcu_assign_pointer(thresholds->primary, new);
4161
4162        /* To be sure that nobody uses thresholds */
4163        synchronize_rcu();
4164
4165        /* If all events are unregistered, free the spare array */
4166        if (!new) {
4167                kfree(thresholds->spare);
4168                thresholds->spare = NULL;
4169        }
4170unlock:
4171        mutex_unlock(&memcg->thresholds_lock);
4172}
4173
4174static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4175        struct eventfd_ctx *eventfd)
4176{
4177        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4178}
4179
4180static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4181        struct eventfd_ctx *eventfd)
4182{
4183        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4184}
4185
4186static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4187        struct eventfd_ctx *eventfd, const char *args)
4188{
4189        struct mem_cgroup_eventfd_list *event;
4190
4191        event = kmalloc(sizeof(*event), GFP_KERNEL);
4192        if (!event)
4193                return -ENOMEM;
4194
4195        spin_lock(&memcg_oom_lock);
4196
4197        event->eventfd = eventfd;
4198        list_add(&event->list, &memcg->oom_notify);
4199
4200        /* already in OOM ? */
4201        if (memcg->under_oom)
4202                eventfd_signal(eventfd, 1);
4203        spin_unlock(&memcg_oom_lock);
4204
4205        return 0;
4206}
4207
4208static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4209        struct eventfd_ctx *eventfd)
4210{
4211        struct mem_cgroup_eventfd_list *ev, *tmp;
4212
4213        spin_lock(&memcg_oom_lock);
4214
4215        list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4216                if (ev->eventfd == eventfd) {
4217                        list_del(&ev->list);
4218                        kfree(ev);
4219                }
4220        }
4221
4222        spin_unlock(&memcg_oom_lock);
4223}
4224
4225static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4226{
4227        struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4228
4229        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4230        seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4231        seq_printf(sf, "oom_kill %lu\n",
4232                   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4233        return 0;
4234}
4235
4236static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4237        struct cftype *cft, u64 val)
4238{
4239        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4240
4241        /* cannot be set on the root cgroup and only 0 and 1 are allowed */
4242        if (!css->parent || !((val == 0) || (val == 1)))
4243                return -EINVAL;
4244
4245        memcg->oom_kill_disable = val;
4246        if (!val)
4247                memcg_oom_recover(memcg);
4248
4249        return 0;
4250}
4251
4252#ifdef CONFIG_CGROUP_WRITEBACK
4253
4254#include <trace/events/writeback.h>
4255
4256static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4257{
4258        return wb_domain_init(&memcg->cgwb_domain, gfp);
4259}
4260
4261static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4262{
4263        wb_domain_exit(&memcg->cgwb_domain);
4264}
4265
4266static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4267{
4268        wb_domain_size_changed(&memcg->cgwb_domain);
4269}
4270
4271struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4272{
4273        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4274
4275        if (!memcg->css.parent)
4276                return NULL;
4277
4278        return &memcg->cgwb_domain;
4279}
4280
4281/*
4282 * idx can be of type enum memcg_stat_item or node_stat_item.
4283 * Keep in sync with memcg_exact_page().
4284 */
4285static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4286{
4287        long x = atomic_long_read(&memcg->vmstats[idx]);
4288        int cpu;
4289
4290        for_each_online_cpu(cpu)
4291                x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4292        if (x < 0)
4293                x = 0;
4294        return x;
4295}
4296
4297/**
4298 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4299 * @wb: bdi_writeback in question
4300 * @pfilepages: out parameter for number of file pages
4301 * @pheadroom: out parameter for number of allocatable pages according to memcg
4302 * @pdirty: out parameter for number of dirty pages
4303 * @pwriteback: out parameter for number of pages under writeback
4304 *
4305 * Determine the numbers of file, headroom, dirty, and writeback pages in
4306 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4307 * is a bit more involved.
4308 *
4309 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4310 * headroom is calculated as the lowest headroom of itself and the
4311 * ancestors.  Note that this doesn't consider the actual amount of
4312 * available memory in the system.  The caller should further cap
4313 * *@pheadroom accordingly.
4314 */
4315void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4316                         unsigned long *pheadroom, unsigned long *pdirty,
4317                         unsigned long *pwriteback)
4318{
4319        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4320        struct mem_cgroup *parent;
4321
4322        *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4323
4324        *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4325        *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4326                        memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4327        *pheadroom = PAGE_COUNTER_MAX;
4328
4329        while ((parent = parent_mem_cgroup(memcg))) {
4330                unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4331                                            READ_ONCE(memcg->memory.high));
4332                unsigned long used = page_counter_read(&memcg->memory);
4333
4334                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4335                memcg = parent;
4336        }
4337}
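/*
 * Worked example (illustrative only): take a child with max=512M,
 * high=256M and used=200M under a parent with max=1G, high unset and
 * used=768M.  The child contributes min(512M, 256M) - 200M = 56M of
 * headroom, the parent contributes 1G - 768M = 256M, so *pheadroom ends
 * up as 56M (before the caller caps it against globally available
 * memory).
 */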
4338
4339/*
4340 * Foreign dirty flushing
4341 *
4342 * There's an inherent mismatch between memcg and writeback.  The former
4343 * tracks ownership per-page while the latter per-inode.  This was a
4344 * deliberate design decision because honoring per-page ownership in the
4345 * writeback path is complicated, may lead to higher CPU and IO overheads
4346 * and deemed unnecessary given that write-sharing an inode across
4347 * different cgroups isn't a common use-case.
4348 *
4349 * Combined with inode majority-writer ownership switching, this works well
4350 * enough in most cases but there are some pathological cases.  For
4351 * example, let's say there are two cgroups A and B which keep writing to
4352 * different but confined parts of the same inode.  B owns the inode and
4353 * A's memory is limited far below B's.  A's dirty ratio can rise enough to
4354 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4355 * triggering background writeback.  A will be slowed down without a way to
4356 * make writeback of the dirty pages happen.
4357 *
4358 * Conditions like the above can lead to a cgroup getting repeatedly and
4359 * severely throttled after making some progress after each
4360 * dirty_expire_interval while the underlying IO device is almost
4361 * completely idle.
4362 *
4363 * Solving this problem completely requires matching the ownership tracking
4364 * granularities between memcg and writeback in either direction.  However,
4365 * the more egregious behaviors can be avoided by simply remembering the
4366 * most recent foreign dirtying events and initiating remote flushes on
4367 * them when local writeback isn't enough to keep the memory clean enough.
4368 *
4369 * The following two functions implement such mechanism.  When a foreign
4370 * page - a page whose memcg and writeback ownerships don't match - is
4371 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4372 * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
4373 * decides that the memcg needs to sleep due to high dirty ratio, it calls
4374 * mem_cgroup_flush_foreign() which queues writeback on the recorded
4375 * foreign bdi_writebacks which haven't expired.  Both the numbers of
4376 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4377 * limited to MEMCG_CGWB_FRN_CNT.
4378 *
4379 * The mechanism only remembers IDs and doesn't hold any object references.
4380 * As being wrong occasionally doesn't matter, updates and accesses to the
4381 * records are lockless and racy.
4382 */
4383void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4384                                             struct bdi_writeback *wb)
4385{
4386        struct mem_cgroup *memcg = page->mem_cgroup;
4387        struct memcg_cgwb_frn *frn;
4388        u64 now = get_jiffies_64();
4389        u64 oldest_at = now;
4390        int oldest = -1;
4391        int i;
4392
4393        trace_track_foreign_dirty(page, wb);
4394
4395        /*
4396         * Pick the slot to use.  If there is already a slot for @wb, keep
4397         * using it.  If not, replace the oldest one which isn't being
4398         * written out.
4399         */
4400        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4401                frn = &memcg->cgwb_frn[i];
4402                if (frn->bdi_id == wb->bdi->id &&
4403                    frn->memcg_id == wb->memcg_css->id)
4404                        break;
4405                if (time_before64(frn->at, oldest_at) &&
4406                    atomic_read(&frn->done.cnt) == 1) {
4407                        oldest = i;
4408                        oldest_at = frn->at;
4409                }
4410        }
4411
4412        if (i < MEMCG_CGWB_FRN_CNT) {
4413                /*
4414                 * Re-using an existing one.  Update timestamp lazily to
4415                 * avoid making the cacheline hot.  We want them to be
4416                 * reasonably up-to-date and significantly shorter than
4417                 * dirty_expire_interval as that's what expires the record.
4418                 * Use the shorter of 1s and dirty_expire_interval / 8.
4419                 */
4420                unsigned long update_intv =
4421                        min_t(unsigned long, HZ,
4422                              msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4423
4424                if (time_before64(frn->at, now - update_intv))
4425                        frn->at = now;
4426        } else if (oldest >= 0) {
4427                /* replace the oldest free one */
4428                frn = &memcg->cgwb_frn[oldest];
4429                frn->bdi_id = wb->bdi->id;
4430                frn->memcg_id = wb->memcg_css->id;
4431                frn->at = now;
4432        }
4433}
4434
4435/* issue foreign writeback flushes for recorded foreign dirtying events */
4436void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4437{
4438        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4439        unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4440        u64 now = jiffies_64;
4441        int i;
4442
4443        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4444                struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4445
4446                /*
4447                 * If the record is older than dirty_expire_interval,
4448                 * writeback on it has already started.  No need to kick it
4449                 * off again.  Also, don't start a new one if there's
4450                 * already one in flight.
4451                 */
4452                if (time_after64(frn->at, now - intv) &&
4453                    atomic_read(&frn->done.cnt) == 1) {
4454                        frn->at = 0;
4455                        trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4456                        cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4457                                               WB_REASON_FOREIGN_FLUSH,
4458                                               &frn->done);
4459                }
4460        }
4461}
4462
4463#else   /* CONFIG_CGROUP_WRITEBACK */
4464
4465static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4466{
4467        return 0;
4468}
4469
4470static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4471{
4472}
4473
4474static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4475{
4476}
4477
4478#endif  /* CONFIG_CGROUP_WRITEBACK */
4479
4480/*
4481 * DO NOT USE IN NEW FILES.
4482 *
4483 * "cgroup.event_control" implementation.
4484 *
4485 * This is way over-engineered.  It tries to support fully configurable
4486 * events for each user.  Such a level of flexibility is completely
4487 * unnecessary, especially in the light of the planned unified hierarchy.
4488 *
4489 * Please deprecate this and replace with something simpler if at all
4490 * possible.
4491 */
4492
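/*
 * Illustrative registration sketch (not part of the kernel; the fd names
 * are made up): a v1 user creates an eventfd, opens the control file and
 * writes both descriptors plus the arguments to cgroup.event_control:
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("memory.usage_in_bytes", O_RDONLY);
 *	dprintf(ecfd, "%d %d 8M", efd, cfd);
 *
 * where ecfd is an open fd of this group's cgroup.event_control file.  A
 * read() on efd then blocks until the 8M usage threshold is crossed in
 * either direction, or until the event is torn down on cgroup removal
 * (see memcg_event_remove() below).
 */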
4493/*
4494 * Unregister event and free resources.
4495 *
4496 * Gets called from workqueue.
4497 */
4498static void memcg_event_remove(struct work_struct *work)
4499{
4500        struct mem_cgroup_event *event =
4501                container_of(work, struct mem_cgroup_event, remove);
4502        struct mem_cgroup *memcg = event->memcg;
4503
4504        remove_wait_queue(event->wqh, &event->wait);
4505
4506        event->unregister_event(memcg, event->eventfd);
4507
4508        /* Notify userspace the event is going away. */
4509        eventfd_signal(event->eventfd, 1);
4510
4511        eventfd_ctx_put(event->eventfd);
4512        kfree(event);
4513        css_put(&memcg->css);
4514}
4515
4516/*
4517 * Gets called on EPOLLHUP on eventfd when user closes it.
4518 *
4519 * Called with wqh->lock held and interrupts disabled.
4520 */
4521static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4522                            int sync, void *key)
4523{
4524        struct mem_cgroup_event *event =
4525                container_of(wait, struct mem_cgroup_event, wait);
4526        struct mem_cgroup *memcg = event->memcg;
4527        __poll_t flags = key_to_poll(key);
4528
4529        if (flags & EPOLLHUP) {
4530                /*
4531                 * If the event has been detached at cgroup removal, we
4532                 * can simply return knowing the other side will clean up
4533                 * for us.
4534                 *
4535                 * We can't race against event freeing since the other
4536                 * side will require wqh->lock via remove_wait_queue(),
4537                 * which we hold.
4538                 */
4539                spin_lock(&memcg->event_list_lock);
4540                if (!list_empty(&event->list)) {
4541                        list_del_init(&event->list);
4542                        /*
4543                         * We are in atomic context, but memcg_event_remove()
4544                         * may sleep, so we have to call it from a workqueue.
4545                         */
4546                        schedule_work(&event->remove);
4547                }
4548                spin_unlock(&memcg->event_list_lock);
4549        }
4550
4551        return 0;
4552}
4553
4554static void memcg_event_ptable_queue_proc(struct file *file,
4555                wait_queue_head_t *wqh, poll_table *pt)
4556{
4557        struct mem_cgroup_event *event =
4558                container_of(pt, struct mem_cgroup_event, pt);
4559
4560        event->wqh = wqh;
4561        add_wait_queue(wqh, &event->wait);
4562}
4563
4564/*
4565 * DO NOT USE IN NEW FILES.
4566 *
4567 * Parse input and register new cgroup event handler.
4568 *
4569 * Input must be in format '<event_fd> <control_fd> <args>'.
4570 * Interpretation of args is defined by control file implementation.
4571 */
4572static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4573                                         char *buf, size_t nbytes, loff_t off)
4574{
4575        struct cgroup_subsys_state *css = of_css(of);
4576        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4577        struct mem_cgroup_event *event;
4578        struct cgroup_subsys_state *cfile_css;
4579        unsigned int efd, cfd;
4580        struct fd efile;
4581        struct fd cfile;
4582        const char *name;
4583        char *endp;
4584        int ret;
4585
4586        buf = strstrip(buf);
4587
4588        efd = simple_strtoul(buf, &endp, 10);
4589        if (*endp != ' ')
4590                return -EINVAL;
4591        buf = endp + 1;
4592
4593        cfd = simple_strtoul(buf, &endp, 10);
4594        if ((*endp != ' ') && (*endp != '\0'))
4595                return -EINVAL;
4596        buf = endp + 1;
4597
4598        event = kzalloc(sizeof(*event), GFP_KERNEL);
4599        if (!event)
4600                return -ENOMEM;
4601
4602        event->memcg = memcg;
4603        INIT_LIST_HEAD(&event->list);
4604        init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4605        init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4606        INIT_WORK(&event->remove, memcg_event_remove);
4607
4608        efile = fdget(efd);
4609        if (!efile.file) {
4610                ret = -EBADF;
4611                goto out_kfree;
4612        }
4613
4614        event->eventfd = eventfd_ctx_fileget(efile.file);
4615        if (IS_ERR(event->eventfd)) {
4616                ret = PTR_ERR(event->eventfd);
4617                goto out_put_efile;
4618        }
4619
4620        cfile = fdget(cfd);
4621        if (!cfile.file) {
4622                ret = -EBADF;
4623                goto out_put_eventfd;
4624        }
4625
4626        /* the process needs read permission on the control file */
4627        /* AV: shouldn't we check that it's been opened for read instead? */
4628        ret = inode_permission(file_inode(cfile.file), MAY_READ);
4629        if (ret < 0)
4630                goto out_put_cfile;
4631
4632        /*
4633         * Determine the event callbacks and set them in @event.  This used
4634         * to be done via struct cftype but cgroup core no longer knows
4635         * about these events.  The following is crude but the whole thing
4636         * is for compatibility anyway.
4637         *
4638         * DO NOT ADD NEW FILES.
4639         */
4640        name = cfile.file->f_path.dentry->d_name.name;
4641
4642        if (!strcmp(name, "memory.usage_in_bytes")) {
4643                event->register_event = mem_cgroup_usage_register_event;
4644                event->unregister_event = mem_cgroup_usage_unregister_event;
4645        } else if (!strcmp(name, "memory.oom_control")) {
4646                event->register_event = mem_cgroup_oom_register_event;
4647                event->unregister_event = mem_cgroup_oom_unregister_event;
4648        } else if (!strcmp(name, "memory.pressure_level")) {
4649                event->register_event = vmpressure_register_event;
4650                event->unregister_event = vmpressure_unregister_event;
4651        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4652                event->register_event = memsw_cgroup_usage_register_event;
4653                event->unregister_event = memsw_cgroup_usage_unregister_event;
4654        } else {
4655                ret = -EINVAL;
4656                goto out_put_cfile;
4657        }
4658
4659        /*
4660         * Verify that @cfile belongs to @css.  Also, remaining events are
4661         * automatically removed on cgroup destruction but the removal is
4662         * asynchronous, so take an extra ref on @css.
4663         */
4664        cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4665                                               &memory_cgrp_subsys);
4666        ret = -EINVAL;
4667        if (IS_ERR(cfile_css))
4668                goto out_put_cfile;
4669        if (cfile_css != css) {
4670                css_put(cfile_css);
4671                goto out_put_cfile;
4672        }
4673
4674        ret = event->register_event(memcg, event->eventfd, buf);
4675        if (ret)
4676                goto out_put_css;
4677
4678        vfs_poll(efile.file, &event->pt);
4679
4680        spin_lock(&memcg->event_list_lock);
4681        list_add(&event->list, &memcg->event_list);
4682        spin_unlock(&memcg->event_list_lock);
4683
4684        fdput(cfile);
4685        fdput(efile);
4686
4687        return nbytes;
4688
4689out_put_css:
4690        css_put(css);
4691out_put_cfile:
4692        fdput(cfile);
4693out_put_eventfd:
4694        eventfd_ctx_put(event->eventfd);
4695out_put_efile:
4696        fdput(efile);
4697out_kfree:
4698        kfree(event);
4699
4700        return ret;
4701}
4702
4703static struct cftype mem_cgroup_legacy_files[] = {
4704        {
4705                .name = "usage_in_bytes",
4706                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4707                .read_u64 = mem_cgroup_read_u64,
4708        },
4709        {
4710                .name = "max_usage_in_bytes",
4711                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4712                .write = mem_cgroup_reset,
4713                .read_u64 = mem_cgroup_read_u64,
4714        },
4715        {
4716                .name = "limit_in_bytes",
4717                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4718                .write = mem_cgroup_write,
4719                .read_u64 = mem_cgroup_read_u64,
4720        },
4721        {
4722                .name = "soft_limit_in_bytes",
4723                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4724                .write = mem_cgroup_write,
4725                .read_u64 = mem_cgroup_read_u64,
4726        },
4727        {
4728                .name = "failcnt",
4729                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4730                .write = mem_cgroup_reset,
4731                .read_u64 = mem_cgroup_read_u64,
4732        },
4733        {
4734                .name = "stat",
4735                .seq_show = memcg_stat_show,
4736        },
4737        {
4738                .name = "force_empty",
4739                .write = mem_cgroup_force_empty_write,
4740        },
4741        {
4742                .name = "use_hierarchy",
4743                .write_u64 = mem_cgroup_hierarchy_write,
4744                .read_u64 = mem_cgroup_hierarchy_read,
4745        },
4746        {
4747                .name = "cgroup.event_control",         /* XXX: for compat */
4748                .write = memcg_write_event_control,
4749                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4750        },
4751        {
4752                .name = "swappiness",
4753                .read_u64 = mem_cgroup_swappiness_read,
4754                .write_u64 = mem_cgroup_swappiness_write,
4755        },
4756        {
4757                .name = "move_charge_at_immigrate",
4758                .read_u64 = mem_cgroup_move_charge_read,
4759                .write_u64 = mem_cgroup_move_charge_write,
4760        },
4761        {
4762                .name = "oom_control",
4763                .seq_show = mem_cgroup_oom_control_read,
4764                .write_u64 = mem_cgroup_oom_control_write,
4765                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4766        },
4767        {
4768                .name = "pressure_level",
4769        },
4770#ifdef CONFIG_NUMA
4771        {
4772                .name = "numa_stat",
4773                .seq_show = memcg_numa_stat_show,
4774        },
4775#endif
4776        {
4777                .name = "kmem.limit_in_bytes",
4778                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4779                .write = mem_cgroup_write,
4780                .read_u64 = mem_cgroup_read_u64,
4781        },
4782        {
4783                .name = "kmem.usage_in_bytes",
4784                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4785                .read_u64 = mem_cgroup_read_u64,
4786        },
4787        {
4788                .name = "kmem.failcnt",
4789                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4790                .write = mem_cgroup_reset,
4791                .read_u64 = mem_cgroup_read_u64,
4792        },
4793        {
4794                .name = "kmem.max_usage_in_bytes",
4795                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4796                .write = mem_cgroup_reset,
4797                .read_u64 = mem_cgroup_read_u64,
4798        },
4799#if defined(CONFIG_MEMCG_KMEM) && \
4800        (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4801        {
4802                .name = "kmem.slabinfo",
4803                .seq_start = memcg_slab_start,
4804                .seq_next = memcg_slab_next,
4805                .seq_stop = memcg_slab_stop,
4806                .seq_show = memcg_slab_show,
4807        },
4808#endif
4809        {
4810                .name = "kmem.tcp.limit_in_bytes",
4811                .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4812                .write = mem_cgroup_write,
4813                .read_u64 = mem_cgroup_read_u64,
4814        },
4815        {
4816                .name = "kmem.tcp.usage_in_bytes",
4817                .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4818                .read_u64 = mem_cgroup_read_u64,
4819        },
4820        {
4821                .name = "kmem.tcp.failcnt",
4822                .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4823                .write = mem_cgroup_reset,
4824                .read_u64 = mem_cgroup_read_u64,
4825        },
4826        {
4827                .name = "kmem.tcp.max_usage_in_bytes",
4828                .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4829                .write = mem_cgroup_reset,
4830                .read_u64 = mem_cgroup_read_u64,
4831        },
4832        { },    /* terminate */
4833};
4834
4835/*
4836 * Private memory cgroup IDR
4837 *
4838 * Swap-out records and page cache shadow entries need to store memcg
4839 * references in constrained space, so we maintain an ID space that is
4840 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4841 * memory-controlled cgroups to 64k.
4842 *
4843 * However, there usually are many references to the offline CSS after
4844 * the cgroup has been destroyed, such as page cache or reclaimable
4845 * slab objects, that don't need to hang on to the ID. We want to keep
4846 * those dead CSS from occupying IDs, or we might quickly exhaust the
4847 * relatively small ID space and prevent the creation of new cgroups
4848 * even when there are far fewer than 64k cgroups - possibly none.
4849 *
4850 * Maintain a private 16-bit ID space for memcg, and allow the ID to
4851 * be freed and recycled when it's no longer needed, which is usually
4852 * when the CSS is offlined.
4853 *
4854 * The only exception to that is the records of swapped out tmpfs/shmem
4855 * pages that need to be attributed to live ancestors on swapin. But
4856 * those references are manageable from userspace.
4857 */
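/*
 * In sketch form, this is how a stored 16-bit ID is typically turned back
 * into a pinned memcg reference; the css_tryget_online() step is one way a
 * caller can guard against the group having already gone away:
 *
 *	struct mem_cgroup *memcg;
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();
 */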
4858
4859static DEFINE_IDR(mem_cgroup_idr);
4860
4861static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4862{
4863        if (memcg->id.id > 0) {
4864                idr_remove(&mem_cgroup_idr, memcg->id.id);
4865                memcg->id.id = 0;
4866        }
4867}
4868
4869static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
4870                                                  unsigned int n)
4871{
4872        refcount_add(n, &memcg->id.ref);
4873}
4874
4875static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4876{
4877        if (refcount_sub_and_test(n, &memcg->id.ref)) {
4878                mem_cgroup_id_remove(memcg);
4879
4880                /* Memcg ID pins CSS */
4881                css_put(&memcg->css);
4882        }
4883}
4884
4885static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4886{
4887        mem_cgroup_id_put_many(memcg, 1);
4888}
4889
4890/**
4891 * mem_cgroup_from_id - look up a memcg from a memcg id
4892 * @id: the memcg id to look up
4893 *
4894 * Caller must hold rcu_read_lock().
4895 */
4896struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4897{
4898        WARN_ON_ONCE(!rcu_read_lock_held());
4899        return idr_find(&mem_cgroup_idr, id);
4900}
4901
4902static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4903{
4904        struct mem_cgroup_per_node *pn;
4905        int tmp = node;
4906        /*
4907         * This routine is called for each possible node.
4908         * But it is a BUG to call kmalloc() against an offline node.
4909         *
4910         * TODO: this routine can waste a lot of memory for nodes which will
4911         *       never be onlined. It would be better to use a memory hotplug
4912         *       callback function.
4913         */
4914        if (!node_state(node, N_NORMAL_MEMORY))
4915                tmp = -1;
4916        pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4917        if (!pn)
4918                return 1;
4919
4920        pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4921        if (!pn->lruvec_stat_local) {
4922                kfree(pn);
4923                return 1;
4924        }
4925
4926        pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4927        if (!pn->lruvec_stat_cpu) {
4928                free_percpu(pn->lruvec_stat_local);
4929                kfree(pn);
4930                return 1;
4931        }
4932
4933        lruvec_init(&pn->lruvec);
4934        pn->usage_in_excess = 0;
4935        pn->on_tree = false;
4936        pn->memcg = memcg;
4937
4938        memcg->nodeinfo[node] = pn;
4939        return 0;
4940}
4941
4942static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4943{
4944        struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4945
4946        if (!pn)
4947                return;
4948
4949        free_percpu(pn->lruvec_stat_cpu);
4950        free_percpu(pn->lruvec_stat_local);
4951        kfree(pn);
4952}
4953
4954static void __mem_cgroup_free(struct mem_cgroup *memcg)
4955{
4956        int node;
4957
4958        for_each_node(node)
4959                free_mem_cgroup_per_node_info(memcg, node);
4960        free_percpu(memcg->vmstats_percpu);
4961        free_percpu(memcg->vmstats_local);
4962        kfree(memcg);
4963}
4964
4965static void mem_cgroup_free(struct mem_cgroup *memcg)
4966{
4967        memcg_wb_domain_exit(memcg);
4968        /*
4969         * Flush percpu vmstats and vmevents to guarantee that the values are
4970         * correct at the parent and all ancestor levels.
4971         */
4972        memcg_flush_percpu_vmstats(memcg);
4973        memcg_flush_percpu_vmevents(memcg);
4974        __mem_cgroup_free(memcg);
4975}
4976
4977static struct mem_cgroup *mem_cgroup_alloc(void)
4978{
4979        struct mem_cgroup *memcg;
4980        unsigned int size;
4981        int node;
4982        int __maybe_unused i;
4983        long error = -ENOMEM;
4984
4985        size = sizeof(struct mem_cgroup);
4986        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4987
4988        memcg = kzalloc(size, GFP_KERNEL);
4989        if (!memcg)
4990                return ERR_PTR(error);
4991
4992        memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4993                                 1, MEM_CGROUP_ID_MAX,
4994                                 GFP_KERNEL);
4995        if (memcg->id.id < 0) {
4996                error = memcg->id.id;
4997                goto fail;
4998        }
4999
5000        memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
5001        if (!memcg->vmstats_local)
5002                goto fail;
5003
5004        memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
5005        if (!memcg->vmstats_percpu)
5006                goto fail;
5007
5008        for_each_node(node)
5009                if (alloc_mem_cgroup_per_node_info(memcg, node))
5010                        goto fail;
5011
5012        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5013                goto fail;
5014
5015        INIT_WORK(&memcg->high_work, high_work_func);
5016        INIT_LIST_HEAD(&memcg->oom_notify);
5017        mutex_init(&memcg->thresholds_lock);
5018        spin_lock_init(&memcg->move_lock);
5019        vmpressure_init(&memcg->vmpressure);
5020        INIT_LIST_HEAD(&memcg->event_list);
5021        spin_lock_init(&memcg->event_list_lock);
5022        memcg->socket_pressure = jiffies;
5023#ifdef CONFIG_MEMCG_KMEM
5024        memcg->kmemcg_id = -1;
5025#endif
5026#ifdef CONFIG_CGROUP_WRITEBACK
5027        INIT_LIST_HEAD(&memcg->cgwb_list);
5028        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5029                memcg->cgwb_frn[i].done =
5030                        __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5031#endif
5032#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5033        spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5034        INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5035        memcg->deferred_split_queue.split_queue_len = 0;
5036#endif
5037        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5038        return memcg;
5039fail:
5040        mem_cgroup_id_remove(memcg);
5041        __mem_cgroup_free(memcg);
5042        return ERR_PTR(error);
5043}
5044
5045static struct cgroup_subsys_state * __ref
5046mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5047{
5048        struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5049        struct mem_cgroup *memcg;
5050        long error = -ENOMEM;
5051
5052        memcg = mem_cgroup_alloc();
5053        if (IS_ERR(memcg))
5054                return ERR_CAST(memcg);
5055
5056        page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5057        memcg->soft_limit = PAGE_COUNTER_MAX;
5058        page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5059        if (parent) {
5060                memcg->swappiness = mem_cgroup_swappiness(parent);
5061                memcg->oom_kill_disable = parent->oom_kill_disable;
5062        }
5063        if (parent && parent->use_hierarchy) {
5064                memcg->use_hierarchy = true;
5065                page_counter_init(&memcg->memory, &parent->memory);
5066                page_counter_init(&memcg->swap, &parent->swap);
5067                page_counter_init(&memcg->memsw, &parent->memsw);
5068                page_counter_init(&memcg->kmem, &parent->kmem);
5069                page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5070        } else {
5071                page_counter_init(&memcg->memory, NULL);
5072                page_counter_init(&memcg->swap, NULL);
5073                page_counter_init(&memcg->memsw, NULL);
5074                page_counter_init(&memcg->kmem, NULL);
5075                page_counter_init(&memcg->tcpmem, NULL);
5076                /*
5077                 * A deeper hierarchy with use_hierarchy == false doesn't make
5078                 * much sense, so let the cgroup subsystem know about this
5079                 * unfortunate state in our controller.
5080                 */
5081                if (parent != root_mem_cgroup)
5082                        memory_cgrp_subsys.broken_hierarchy = true;
5083        }
5084
5085        /* The following stuff does not apply to the root */
5086        if (!parent) {
5087#ifdef CONFIG_MEMCG_KMEM
5088                INIT_LIST_HEAD(&memcg->kmem_caches);
5089#endif
5090                root_mem_cgroup = memcg;
5091                return &memcg->css;
5092        }
5093
5094        error = memcg_online_kmem(memcg);
5095        if (error)
5096                goto fail;
5097
5098        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5099                static_branch_inc(&memcg_sockets_enabled_key);
5100
5101        return &memcg->css;
5102fail:
5103        mem_cgroup_id_remove(memcg);
5104        mem_cgroup_free(memcg);
5105        return ERR_PTR(error);
5106}
5107
5108static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5109{
5110        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5111
5112        /*
5113         * A memcg must be visible for memcg_expand_shrinker_maps()
5114         * by the time the maps are allocated. So, we allocate the maps
5115         * here, where for_each_mem_cgroup() can no longer skip it.
5116         */
5117        if (memcg_alloc_shrinker_maps(memcg)) {
5118                mem_cgroup_id_remove(memcg);
5119                return -ENOMEM;
5120        }
5121
5122        /* Online state pins memcg ID, memcg ID pins CSS */
5123        refcount_set(&memcg->id.ref, 1);
5124        css_get(css);
5125        return 0;
5126}
5127
5128static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5129{
5130        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5131        struct mem_cgroup_event *event, *tmp;
5132
5133        /*
5134         * Unregister events and notify userspace.
5135         * Notify userspace about cgroup removal only after rmdir of the cgroup
5136         * directory, to avoid races between userspace and kernelspace.
5137         */
5138        spin_lock(&memcg->event_list_lock);
5139        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5140                list_del_init(&event->list);
5141                schedule_work(&event->remove);
5142        }
5143        spin_unlock(&memcg->event_list_lock);
5144
5145        page_counter_set_min(&memcg->memory, 0);
5146        page_counter_set_low(&memcg->memory, 0);
5147
5148        memcg_offline_kmem(memcg);
5149        wb_memcg_offline(memcg);
5150
5151        drain_all_stock(memcg);
5152
5153        mem_cgroup_id_put(memcg);
5154}
5155
5156static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5157{
5158        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5159
5160        invalidate_reclaim_iterators(memcg);
5161}
5162
5163static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5164{
5165        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5166        int __maybe_unused i;
5167
5168#ifdef CONFIG_CGROUP_WRITEBACK
5169        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5170                wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5171#endif
5172        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5173                static_branch_dec(&memcg_sockets_enabled_key);
5174
5175        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5176                static_branch_dec(&memcg_sockets_enabled_key);
5177
5178        vmpressure_cleanup(&memcg->vmpressure);
5179        cancel_work_sync(&memcg->high_work);
5180        mem_cgroup_remove_from_trees(memcg);
5181        memcg_free_shrinker_maps(memcg);
5182        memcg_free_kmem(memcg);
5183        mem_cgroup_free(memcg);
5184}
5185
5186/**
5187 * mem_cgroup_css_reset - reset the states of a mem_cgroup
5188 * @css: the target css
5189 *
5190 * Reset the states of the mem_cgroup associated with @css.  This is
5191 * invoked when the userland requests disabling on the default hierarchy
5192 * but the memcg is pinned through dependency.  The memcg should stop
5193 * applying policies and should revert to the vanilla state as it may be
5194 * made visible again.
5195 *
5196 * The current implementation only resets the essential configurations.
5197 * This needs to be expanded to cover all the visible parts.
5198 */
5199static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5200{
5201        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5202
5203        page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5204        page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5205        page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5206        page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5207        page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5208        page_counter_set_min(&memcg->memory, 0);
5209        page_counter_set_low(&memcg->memory, 0);
5210        page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5211        memcg->soft_limit = PAGE_COUNTER_MAX;
5212        page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5213        memcg_wb_domain_size_changed(memcg);
5214}
5215
5216#ifdef CONFIG_MMU
5217/* Handlers for move charge at task migration. */
5218static int mem_cgroup_do_precharge(unsigned long count)
5219{
5220        int ret;
5221
5222        /* Try a single bulk charge without reclaim first, kswapd may wake */
5223        ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5224        if (!ret) {
5225                mc.precharge += count;
5226                return ret;
5227        }
5228
5229        /* Try charges one by one with reclaim, but do not retry */
5230        while (count--) {
5231                ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5232                if (ret)
5233                        return ret;
5234                mc.precharge++;
5235                cond_resched();
5236        }
5237        return 0;
5238}
5239
5240union mc_target {
5241        struct page     *page;
5242        swp_entry_t     ent;
5243};
5244
5245enum mc_target_type {
5246        MC_TARGET_NONE = 0,
5247        MC_TARGET_PAGE,
5248        MC_TARGET_SWAP,
5249        MC_TARGET_DEVICE,
5250};
5251
5252static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5253                                                unsigned long addr, pte_t ptent)
5254{
5255        struct page *page = vm_normal_page(vma, addr, ptent);
5256
5257        if (!page || !page_mapped(page))
5258                return NULL;
5259        if (PageAnon(page)) {
5260                if (!(mc.flags & MOVE_ANON))
5261                        return NULL;
5262        } else {
5263                if (!(mc.flags & MOVE_FILE))
5264                        return NULL;
5265        }
5266        if (!get_page_unless_zero(page))
5267                return NULL;
5268
5269        return page;
5270}
5271
5272#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5273static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5274                        pte_t ptent, swp_entry_t *entry)
5275{
5276        struct page *page = NULL;
5277        swp_entry_t ent = pte_to_swp_entry(ptent);
5278
5279        if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5280                return NULL;
5281
5282        /*
5283         * Handle MEMORY_DEVICE_PRIVATE entries: these are ZONE_DEVICE pages
5284         * belonging to a device. Because they are not accessible by the CPU,
5285         * they are stored as special swap entries in the CPU page table.
5286         */
5287        if (is_device_private_entry(ent)) {
5288                page = device_private_entry_to_page(ent);
5289                /*
5290                 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
5291                 * a refcount of 1 when free (unlike a normal page).
5292                 */
5293                if (!page_ref_add_unless(page, 1, 1))
5294                        return NULL;
5295                return page;
5296        }
5297
5298        /*
5299         * Because lookup_swap_cache() updates some statistics counters,
5300         * we call find_get_page() with swapper_space directly.
5301         */
5302        page = find_get_page(swap_address_space(ent), swp_offset(ent));
5303        entry->val = ent.val;
5304
5305        return page;
5306}
5307#else
5308static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5309                        pte_t ptent, swp_entry_t *entry)
5310{
5311        return NULL;
5312}
5313#endif
5314
5315static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5316                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
5317{
5318        struct page *page = NULL;
5319        struct address_space *mapping;
5320        pgoff_t pgoff;
5321
5322        if (!vma->vm_file) /* anonymous vma */
5323                return NULL;
5324        if (!(mc.flags & MOVE_FILE))
5325                return NULL;
5326
5327        mapping = vma->vm_file->f_mapping;
5328        pgoff = linear_page_index(vma, addr);
5329
5330        /* The page is moved even if it's not RSS of this task (not faulted in yet). */
5331#ifdef CONFIG_SWAP
5332        /* shmem/tmpfs may report page out on swap: account for that too. */
5333        if (shmem_mapping(mapping)) {
5334                page = find_get_entry(mapping, pgoff);
5335                if (xa_is_value(page)) {
5336                        swp_entry_t swp = radix_to_swp_entry(page);
5337                        *entry = swp;
5338                        page = find_get_page(swap_address_space(swp),
5339                                             swp_offset(swp));
5340                }
5341        } else
5342                page = find_get_page(mapping, pgoff);
5343#else
5344        page = find_get_page(mapping, pgoff);
5345#endif
5346        return page;
5347}
5348
5349/**
5350 * mem_cgroup_move_account - move account of the page
5351 * @page: the page
5352 * @compound: charge the page as compound or small page
5353 * @from: mem_cgroup which the page is moved from.
5354 * @to: mem_cgroup which the page is moved to. @from != @to.
5355 *
5356 * The caller must make sure the page is not on LRU (isolate_lru_page() is useful).
5357 *
5358 * This function doesn't "charge" the new cgroup and doesn't "uncharge" the
5359 * old cgroup.
5360 */
5361static int mem_cgroup_move_account(struct page *page,
5362                                   bool compound,
5363                                   struct mem_cgroup *from,
5364                                   struct mem_cgroup *to)
5365{
5366        struct lruvec *from_vec, *to_vec;
5367        struct pglist_data *pgdat;
5368        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5369        int ret;
5370
5371        VM_BUG_ON(from == to);
5372        VM_BUG_ON_PAGE(PageLRU(page), page);
5373        VM_BUG_ON(compound && !PageTransHuge(page));
5374
5375        /*
5376         * Prevent mem_cgroup_migrate() from looking at
5377         * page->mem_cgroup of its source page while we change it.
5378         */
5379        ret = -EBUSY;
5380        if (!trylock_page(page))
5381                goto out;
5382
5383        ret = -EINVAL;
5384        if (page->mem_cgroup != from)
5385                goto out_unlock;
5386
5387        pgdat = page_pgdat(page);
5388        from_vec = mem_cgroup_lruvec(from, pgdat);
5389        to_vec = mem_cgroup_lruvec(to, pgdat);
5390
5391        lock_page_memcg(page);
5392
5393        if (PageAnon(page)) {
5394                if (page_mapped(page)) {
5395                        __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5396                        __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5397                        if (PageTransHuge(page)) {
5398                                __mod_lruvec_state(from_vec, NR_ANON_THPS,
5399                                                   -nr_pages);
5400                                __mod_lruvec_state(to_vec, NR_ANON_THPS,
5401                                                   nr_pages);
5402                        }
5403
5404                }
5405        } else {
5406                __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5407                __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5408
5409                if (PageSwapBacked(page)) {
5410                        __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5411                        __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5412                }
5413
5414                if (page_mapped(page)) {
5415                        __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5416                        __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5417                }
5418
5419                if (PageDirty(page)) {
5420                        struct address_space *mapping = page_mapping(page);
5421
5422                        if (mapping_cap_account_dirty(mapping)) {
5423                                __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5424                                                   -nr_pages);
5425                                __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5426                                                   nr_pages);
5427                        }
5428                }
5429        }
5430
5431        if (PageWriteback(page)) {
5432                __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5433                __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5434        }
5435
5436        /*
5437         * All state has been migrated, let's switch to the new memcg.
5438         *
5439         * It is safe to change page->mem_cgroup here because the page
5440         * is referenced, charged, isolated, and locked: we can't race
5441         * with (un)charging, migration, LRU putback, or anything else
5442         * that would rely on a stable page->mem_cgroup.
5443         *
5444         * Note that lock_page_memcg is a memcg lock, not a page lock,
5445         * to save space. As soon as we switch page->mem_cgroup to a
5446         * new memcg that isn't locked, the above state can change
5447         * concurrently again. Make sure we're truly done with it.
5448         */
5449        smp_mb();
5450
5451        page->mem_cgroup = to;  /* caller should have done css_get */
5452
5453        __unlock_page_memcg(from);
5454
5455        ret = 0;
5456
5457        local_irq_disable();
5458        mem_cgroup_charge_statistics(to, page, nr_pages);
5459        memcg_check_events(to, page);
5460        mem_cgroup_charge_statistics(from, page, -nr_pages);
5461        memcg_check_events(from, page);
5462        local_irq_enable();
5463out_unlock:
5464        unlock_page(page);
5465out:
5466        return ret;
5467}
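/*
 * In sketch form, the calling pattern expected by mem_cgroup_move_account()
 * (the real callers are in mem_cgroup_move_charge_pte_range() further down):
 *
 *	if (!isolate_lru_page(page)) {
 *		if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) {
 *			... bookkeeping for a successful move ...
 *		}
 *		putback_lru_page(page);
 *	}
 */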
5468
5469/**
5470 * get_mctgt_type - get target type of moving charge
5471 * @vma: the vma the pte to be checked belongs
5472 * @addr: the address corresponding to the pte to be checked
5473 * @ptent: the pte to be checked
5474 * @target: the pointer in which the target page or swap entry is stored (can be NULL)
5475 *
5476 * Returns
5477 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
5478 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5479 *     move charge. If @target is not NULL, the page is stored in target->page
5480 *     with an extra refcount taken (callers should handle it).
5481 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5482 *     target for charge migration. If @target is not NULL, the entry is stored
5483 *     in target->ent.
5484 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PRIVATE
5485 *     (so a ZONE_DEVICE page and thus not on the LRU).
5486 *     For now such a page is charged like a regular page would be, as for all
5487 *     intents and purposes it is just special memory taking the place of a
5488 *     regular page.
5489 *
5490 *     See Documentation/vm/hmm.rst and include/linux/hmm.h
5491 *
5492 * Called with pte lock held.
5493 */
5494
5495static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5496                unsigned long addr, pte_t ptent, union mc_target *target)
5497{
5498        struct page *page = NULL;
5499        enum mc_target_type ret = MC_TARGET_NONE;
5500        swp_entry_t ent = { .val = 0 };
5501
5502        if (pte_present(ptent))
5503                page = mc_handle_present_pte(vma, addr, ptent);
5504        else if (is_swap_pte(ptent))
5505                page = mc_handle_swap_pte(vma, ptent, &ent);
5506        else if (pte_none(ptent))
5507                page = mc_handle_file_pte(vma, addr, ptent, &ent);
5508
5509        if (!page && !ent.val)
5510                return ret;
5511        if (page) {
5512                /*
5513                 * Do only a loose check w/o serialization.
5514                 * mem_cgroup_move_account() checks whether the page is valid
5515                 * under LRU exclusion.
5516                 */
5517                if (page->mem_cgroup == mc.from) {
5518                        ret = MC_TARGET_PAGE;
5519                        if (is_device_private_page(page))
5520                                ret = MC_TARGET_DEVICE;
5521                        if (target)
5522                                target->page = page;
5523                }
5524                if (!ret || !target)
5525                        put_page(page);
5526        }
5527        /*
5528         * There is a swap entry and the page doesn't exist or isn't charged.
5529         * But we cannot move a tail-page in a THP.
5530         */
5531        if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5532            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5533                ret = MC_TARGET_SWAP;
5534                if (target)
5535                        target->ent = ent;
5536        }
5537        return ret;
5538}
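/*
 * A condensed sketch of how a caller consumes get_mctgt_type()'s result
 * (the full version is mem_cgroup_move_charge_pte_range() further down);
 * the extra reference taken on an MC_TARGET_PAGE page must be dropped:
 *
 *	switch (get_mctgt_type(vma, addr, ptent, &target)) {
 *	case MC_TARGET_PAGE:
 *		... move the charge of target.page ...
 *		put_page(target.page);
 *		break;
 *	case MC_TARGET_SWAP:
 *		... move the swap charge of target.ent ...
 *		break;
 *	default:
 *		break;
 *	}
 */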
5539
5540#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5541/*
5542 * We don't consider PMD mapped swapping or file mapped pages because THP does
5543 * not support them for now.
5544 * Caller should make sure that pmd_trans_huge(pmd) is true.
5545 */
5546static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5547                unsigned long addr, pmd_t pmd, union mc_target *target)
5548{
5549        struct page *page = NULL;
5550        enum mc_target_type ret = MC_TARGET_NONE;
5551
5552        if (unlikely(is_swap_pmd(pmd))) {
5553                VM_BUG_ON(thp_migration_supported() &&
5554                                  !is_pmd_migration_entry(pmd));
5555                return ret;
5556        }
5557        page = pmd_page(pmd);
5558        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5559        if (!(mc.flags & MOVE_ANON))
5560                return ret;
5561        if (page->mem_cgroup == mc.from) {
5562                ret = MC_TARGET_PAGE;
5563                if (target) {
5564                        get_page(page);
5565                        target->page = page;
5566                }
5567        }
5568        return ret;
5569}
5570#else
5571static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5572                unsigned long addr, pmd_t pmd, union mc_target *target)
5573{
5574        return MC_TARGET_NONE;
5575}
5576#endif
5577
5578static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5579                                        unsigned long addr, unsigned long end,
5580                                        struct mm_walk *walk)
5581{
5582        struct vm_area_struct *vma = walk->vma;
5583        pte_t *pte;
5584        spinlock_t *ptl;
5585
5586        ptl = pmd_trans_huge_lock(pmd, vma);
5587        if (ptl) {
5588                /*
5589                 * Note there cannot be MC_TARGET_DEVICE for now, as we do not
5590                 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
5591                 * this might change.
5592                 */
5593                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5594                        mc.precharge += HPAGE_PMD_NR;
5595                spin_unlock(ptl);
5596                return 0;
5597        }
5598
5599        if (pmd_trans_unstable(pmd))
5600                return 0;
5601        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5602        for (; addr != end; pte++, addr += PAGE_SIZE)
5603                if (get_mctgt_type(vma, addr, *pte, NULL))
5604                        mc.precharge++; /* increment precharge temporarily */
5605        pte_unmap_unlock(pte - 1, ptl);
5606        cond_resched();
5607
5608        return 0;
5609}
5610
5611static const struct mm_walk_ops precharge_walk_ops = {
5612        .pmd_entry      = mem_cgroup_count_precharge_pte_range,
5613};
5614
5615static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5616{
5617        unsigned long precharge;
5618
5619        mmap_read_lock(mm);
5620        walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5621        mmap_read_unlock(mm);
5622
5623        precharge = mc.precharge;
5624        mc.precharge = 0;
5625
5626        return precharge;
5627}
5628
5629static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5630{
5631        unsigned long precharge = mem_cgroup_count_precharge(mm);
5632
5633        VM_BUG_ON(mc.moving_task);
5634        mc.moving_task = current;
5635        return mem_cgroup_do_precharge(precharge);
5636}
5637
5638/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5639static void __mem_cgroup_clear_mc(void)
5640{
5641        struct mem_cgroup *from = mc.from;
5642        struct mem_cgroup *to = mc.to;
5643
5644        /* we must uncharge all the leftover precharges from mc.to */
5645        if (mc.precharge) {
5646                cancel_charge(mc.to, mc.precharge);
5647                mc.precharge = 0;
5648        }
5649        /*
5650         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5651         * we must uncharge here.
5652         */
5653        if (mc.moved_charge) {
5654                cancel_charge(mc.from, mc.moved_charge);
5655                mc.moved_charge = 0;
5656        }
5657        /* we must fix up refcnts and charges */
5658        if (mc.moved_swap) {
5659                /* uncharge swap account from the old cgroup */
5660                if (!mem_cgroup_is_root(mc.from))
5661                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5662
5663                mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5664
5665                /*
5666                 * we charged both to->memory and to->memsw, so we
5667                 * should uncharge to->memory.
5668                 */
5669                if (!mem_cgroup_is_root(mc.to))
5670                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5671
5672                css_put_many(&mc.to->css, mc.moved_swap);
5673
5674                mc.moved_swap = 0;
5675        }
5676        memcg_oom_recover(from);
5677        memcg_oom_recover(to);
5678        wake_up_all(&mc.waitq);
5679}
5680
5681static void mem_cgroup_clear_mc(void)
5682{
5683        struct mm_struct *mm = mc.mm;
5684
5685        /*
5686         * we must clear moving_task before waking up waiters at the end of
5687         * task migration.
5688         */
5689        mc.moving_task = NULL;
5690        __mem_cgroup_clear_mc();
5691        spin_lock(&mc.lock);
5692        mc.from = NULL;
5693        mc.to = NULL;
5694        mc.mm = NULL;
5695        spin_unlock(&mc.lock);
5696
5697        mmput(mm);
5698}
5699
5700static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5701{
5702        struct cgroup_subsys_state *css;
5703        struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
5704        struct mem_cgroup *from;
5705        struct task_struct *leader, *p;
5706        struct mm_struct *mm;
5707        unsigned long move_flags;
5708        int ret = 0;
5709
5710        /* charge immigration isn't supported on the default hierarchy */
5711        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5712                return 0;
5713
5714        /*
5715         * Multi-process migrations only happen on the default hierarchy
5716         * where charge immigration is not used.  Perform charge
5717         * immigration if @tset contains a leader and whine if there are
5718         * multiple.
5719         */
5720        p = NULL;
5721        cgroup_taskset_for_each_leader(leader, css, tset) {
5722                WARN_ON_ONCE(p);
5723                p = leader;
5724                memcg = mem_cgroup_from_css(css);
5725        }
5726        if (!p)
5727                return 0;
5728
5729        /*
5730         * We are now committed to this value, whatever it is. Changes in this
5731         * tunable will only affect upcoming migrations, not the current one.
5732         * So we need to save it, and keep it going.
5733         */
5734        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5735        if (!move_flags)
5736                return 0;
5737
5738        from = mem_cgroup_from_task(p);
5739
5740        VM_BUG_ON(from == memcg);
5741
5742        mm = get_task_mm(p);
5743        if (!mm)
5744                return 0;
5745        /* We move charges only when we move an owner of the mm */
5746        if (mm->owner == p) {
5747                VM_BUG_ON(mc.from);
5748                VM_BUG_ON(mc.to);
5749                VM_BUG_ON(mc.precharge);
5750                VM_BUG_ON(mc.moved_charge);
5751                VM_BUG_ON(mc.moved_swap);
5752
5753                spin_lock(&mc.lock);
5754                mc.mm = mm;
5755                mc.from = from;
5756                mc.to = memcg;
5757                mc.flags = move_flags;
5758                spin_unlock(&mc.lock);
5759                /* We set mc.moving_task later */
5760
5761                ret = mem_cgroup_precharge_mc(mm);
5762                if (ret)
5763                        mem_cgroup_clear_mc();
5764        } else {
5765                mmput(mm);
5766        }
5767        return ret;
5768}
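/*
 * The userspace side of this, in sketch form (a v1 hierarchy mounted at
 * /sys/fs/cgroup/memory is assumed; the group name "dst" and pid are only
 * examples): enable anon and file charge moving in the destination group,
 * then migrate the task, which exercises the can_attach path above.
 *
 *	int mfd = open("/sys/fs/cgroup/memory/dst/memory.move_charge_at_immigrate",
 *		       O_WRONLY);
 *	int pfd = open("/sys/fs/cgroup/memory/dst/cgroup.procs", O_WRONLY);
 *
 *	write(mfd, "3", 1);		/* bit 0: anon, bit 1: file */
 *	dprintf(pfd, "%d", pid);	/* the migration performs the move */
 */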
5769
5770static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5771{
5772        if (mc.to)
5773                mem_cgroup_clear_mc();
5774}
5775
5776static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5777                                unsigned long addr, unsigned long end,
5778                                struct mm_walk *walk)
5779{
5780        int ret = 0;
5781        struct vm_area_struct *vma = walk->vma;
5782        pte_t *pte;
5783        spinlock_t *ptl;
5784        enum mc_target_type target_type;
5785        union mc_target target;
5786        struct page *page;
5787
5788        ptl = pmd_trans_huge_lock(pmd, vma);
5789        if (ptl) {
5790                if (mc.precharge < HPAGE_PMD_NR) {
5791                        spin_unlock(ptl);
5792                        return 0;
5793                }
5794                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5795                if (target_type == MC_TARGET_PAGE) {
5796                        page = target.page;
5797                        if (!isolate_lru_page(page)) {
5798                                if (!mem_cgroup_move_account(page, true,
5799                                                             mc.from, mc.to)) {
5800                                        mc.precharge -= HPAGE_PMD_NR;
5801                                        mc.moved_charge += HPAGE_PMD_NR;
5802                                }
5803                                putback_lru_page(page);
5804                        }
5805                        put_page(page);
5806                } else if (target_type == MC_TARGET_DEVICE) {
5807                        page = target.page;
5808                        if (!mem_cgroup_move_account(page, true,
5809                                                     mc.from, mc.to)) {
5810                                mc.precharge -= HPAGE_PMD_NR;
5811                                mc.moved_charge += HPAGE_PMD_NR;
5812                        }
5813                        put_page(page);
5814                }
5815                spin_unlock(ptl);
5816                return 0;
5817        }
5818
5819        if (pmd_trans_unstable(pmd))
5820                return 0;
5821retry:
5822        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5823        for (; addr != end; addr += PAGE_SIZE) {
5824                pte_t ptent = *(pte++);
5825                bool device = false;
5826                swp_entry_t ent;
5827
5828                if (!mc.precharge)
5829                        break;
5830
5831                switch (get_mctgt_type(vma, addr, ptent, &target)) {
5832                case MC_TARGET_DEVICE:
5833                        device = true;
5834                        fallthrough;
5835                case MC_TARGET_PAGE:
5836                        page = target.page;
5837                        /*
5838                         * We can have a part of the split pmd here. Moving it
5839                         * can be done, but it would be too convoluted, so simply
5840                         * ignore such a partial THP and keep it in the original
5841                         * memcg. There should be somebody mapping the head.
5842                         */
5843                        if (PageTransCompound(page))
5844                                goto put;
5845                        if (!device && isolate_lru_page(page))
5846                                goto put;
5847                        if (!mem_cgroup_move_account(page, false,
5848                                                mc.from, mc.to)) {
5849                                mc.precharge--;
5850                                /* we uncharge from mc.from later. */
5851                                mc.moved_charge++;
5852                        }
5853                        if (!device)
5854                                putback_lru_page(page);
5855put:                    /* get_mctgt_type() gets the page */
5856                        put_page(page);
5857                        break;
5858                case MC_TARGET_SWAP:
5859                        ent = target.ent;
5860                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5861                                mc.precharge--;
5862                                mem_cgroup_id_get_many(mc.to, 1);
5863                                /* we fixup other refcnts and charges later. */
5864                                mc.moved_swap++;
5865                        }
5866                        break;
5867                default:
5868                        break;
5869                }
5870        }
5871        pte_unmap_unlock(pte - 1, ptl);
5872        cond_resched();
5873
5874        if (addr != end) {
5875                /*
5876                 * We have consumed all precharges we got in can_attach().
5877                 * We try to charge one by one, but don't do any additional
5878                 * charges to mc.to if we have already failed to charge once in
5879                 * the attach() phase.
5880                 */
5881                ret = mem_cgroup_do_precharge(1);
5882                if (!ret)
5883                        goto retry;
5884        }
5885
5886        return ret;
5887}
5888
5889static const struct mm_walk_ops charge_walk_ops = {
5890        .pmd_entry      = mem_cgroup_move_charge_pte_range,
5891};
5892
5893static void mem_cgroup_move_charge(void)
5894{
5895        lru_add_drain_all();
5896        /*
5897         * Signal lock_page_memcg() to take the memcg's move_lock
5898         * while we're moving its pages to another memcg. Then wait
5899         * for already started RCU-only updates to finish.
5900         */
5901        atomic_inc(&mc.from->moving_account);
5902        synchronize_rcu();
5903retry:
5904        if (unlikely(!mmap_read_trylock(mc.mm))) {
5905                /*
5906                 * Someone who is holding the mmap_lock might be waiting on the
5907                 * waitq. So we cancel all extra charges, wake up all waiters,
5908                 * and retry. Because we cancel precharges, we might not be able
5909                 * to move enough charges, but moving charge is a best-effort
5910                 * feature anyway, so it wouldn't be a big problem.
5911                 */
5912                __mem_cgroup_clear_mc();
5913                cond_resched();
5914                goto retry;
5915        }
5916        /*
5917         * When we have consumed all precharges and failed to do an
5918         * additional charge, the page walk just aborts.
5919         */
5920        walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
5921                        NULL);
5922
5923        mmap_read_unlock(mc.mm);
5924        atomic_dec(&mc.from->moving_account);
5925}
5926
5927static void mem_cgroup_move_task(void)
5928{
5929        if (mc.to) {
5930                mem_cgroup_move_charge();
5931                mem_cgroup_clear_mc();
5932        }
5933}
5934#else   /* !CONFIG_MMU */
5935static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5936{
5937        return 0;
5938}
5939static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5940{
5941}
5942static void mem_cgroup_move_task(void)
5943{
5944}
5945#endif
5946
5947/*
5948 * Cgroup retains root cgroups across [un]mount cycles making it necessary
5949 * to verify whether we're attached to the default hierarchy on each mount
5950 * attempt.
5951 */
5952static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5953{
5954        /*
5955         * use_hierarchy is forced on the default hierarchy.  cgroup core
5956         * guarantees that @root doesn't have any children, so turning it
5957         * on for the root memcg is enough.
5958         */
5959        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5960                root_mem_cgroup->use_hierarchy = true;
5961        else
5962                root_mem_cgroup->use_hierarchy = false;
5963}
5964
5965static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
5966{
5967        if (value == PAGE_COUNTER_MAX)
5968                seq_puts(m, "max\n");
5969        else
5970                seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
5971
5972        return 0;
5973}
5974
5975static u64 memory_current_read(struct cgroup_subsys_state *css,
5976                               struct cftype *cft)
5977{
5978        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5979
5980        return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5981}
5982
5983static int memory_min_show(struct seq_file *m, void *v)
5984{
5985        return seq_puts_memcg_tunable(m,
5986                READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
5987}
5988
5989static ssize_t memory_min_write(struct kernfs_open_file *of,
5990                                char *buf, size_t nbytes, loff_t off)
5991{
5992        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5993        unsigned long min;
5994        int err;
5995
5996        buf = strstrip(buf);
5997        err = page_counter_memparse(buf, "max", &min);
5998        if (err)
5999                return err;
6000
6001        page_counter_set_min(&memcg->memory, min);
6002
6003        return nbytes;
6004}
6005
6006static int memory_low_show(struct seq_file *m, void *v)
6007{
6008        return seq_puts_memcg_tunable(m,
6009                READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6010}
6011
6012static ssize_t memory_low_write(struct kernfs_open_file *of,
6013                                char *buf, size_t nbytes, loff_t off)
6014{
6015        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6016        unsigned long low;
6017        int err;
6018
6019        buf = strstrip(buf);
6020        err = page_counter_memparse(buf, "max", &low);
6021        if (err)
6022                return err;
6023
6024        page_counter_set_low(&memcg->memory, low);
6025
6026        return nbytes;
6027}
6028
6029static int memory_high_show(struct seq_file *m, void *v)
6030{
6031        return seq_puts_memcg_tunable(m,
6032                READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6033}
6034
6035static ssize_t memory_high_write(struct kernfs_open_file *of,
6036                                 char *buf, size_t nbytes, loff_t off)
6037{
6038        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6039        unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
6040        bool drained = false;
6041        unsigned long high;
6042        int err;
6043
6044        buf = strstrip(buf);
6045        err = page_counter_memparse(buf, "max", &high);
6046        if (err)
6047                return err;
6048
6049        page_counter_set_high(&memcg->memory, high);
6050
6051        for (;;) {
6052                unsigned long nr_pages = page_counter_read(&memcg->memory);
6053                unsigned long reclaimed;
6054
6055                if (nr_pages <= high)
6056                        break;
6057
6058                if (signal_pending(current))
6059                        break;
6060
6061                if (!drained) {
6062                        drain_all_stock(memcg);
6063                        drained = true;
6064                        continue;
6065                }
6066
6067                reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6068                                                         GFP_KERNEL, true);
6069
6070                if (!reclaimed && !nr_retries--)
6071                        break;
6072        }
6073
6074        return nbytes;
6075}
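/*
 * From userspace this is an ordinary write to the cgroup2 "memory.high"
 * file; the path and value are only examples (page_counter_memparse() also
 * accepts "max"):
 *
 *	int fd = open("/sys/fs/cgroup/grp/memory.high", O_WRONLY);
 *
 *	write(fd, "512M", 4);
 *
 * The write returns only once usage has been reclaimed below the new high
 * boundary, the retry budget above is exhausted, or a signal is pending.
 */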
6076
6077static int memory_max_show(struct seq_file *m, void *v)
6078{
6079        return seq_puts_memcg_tunable(m,
6080                READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6081}
6082
6083static ssize_t memory_max_write(struct kernfs_open_file *of,
6084                                char *buf, size_t nbytes, loff_t off)
6085{
6086        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6087        unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6088        bool drained = false;
6089        unsigned long max;
6090        int err;
6091
6092        buf = strstrip(buf);
6093        err = page_counter_memparse(buf, "max", &max);
6094        if (err)
6095                return err;
6096
6097        xchg(&memcg->memory.max, max);
6098
6099        for (;;) {
6100                unsigned long nr_pages = page_counter_read(&memcg->memory);
6101
6102                if (nr_pages <= max)
6103                        break;
6104
6105                if (signal_pending(current))
6106                        break;
6107
6108                if (!drained) {
6109                        drain_all_stock(memcg);
6110                        drained = true;
6111                        continue;
6112                }
6113
6114                if (nr_reclaims) {
6115                        if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6116                                                          GFP_KERNEL, true))
6117                                nr_reclaims--;
6118                        continue;
6119                }
6120
6121                memcg_memory_event(memcg, MEMCG_OOM);
6122                if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6123                        break;
6124        }
6125
6126        memcg_wb_domain_size_changed(memcg);
6127        return nbytes;
6128}
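/*
 * For illustration only (not kernel code): both memory.high and memory.max
 * are written from userspace as a byte count, optionally with a K/M/G
 * suffix (parsed by page_counter_memparse() above), or as the literal
 * string "max".  The difference between the two write handlers: lowering
 * memory.high only reclaims towards the new target and never OOM-kills,
 * while lowering memory.max keeps reclaiming and, once the retries are
 * exhausted, invokes the cgroup OOM killer until usage fits.  A
 * hypothetical shell session (the cgroup path is made up):
 *
 *	echo 512M > /sys/fs/cgroup/mygroup/memory.high
 *	echo 1G   > /sys/fs/cgroup/mygroup/memory.max
 *	echo max  > /sys/fs/cgroup/mygroup/memory.max
 */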
6129
6130static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6131{
6132        seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6133        seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6134        seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6135        seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6136        seq_printf(m, "oom_kill %lu\n",
6137                   atomic_long_read(&events[MEMCG_OOM_KILL]));
6138}
6139
6140static int memory_events_show(struct seq_file *m, void *v)
6141{
6142        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6143
6144        __memory_events_show(m, memcg->memory_events);
6145        return 0;
6146}
6147
6148static int memory_events_local_show(struct seq_file *m, void *v)
6149{
6150        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6151
6152        __memory_events_show(m, memcg->memory_events_local);
6153        return 0;
6154}
6155
6156static int memory_stat_show(struct seq_file *m, void *v)
6157{
6158        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6159        char *buf;
6160
6161        buf = memory_stat_format(memcg);
6162        if (!buf)
6163                return -ENOMEM;
6164        seq_puts(m, buf);
6165        kfree(buf);
6166        return 0;
6167}
6168
6169static int memory_oom_group_show(struct seq_file *m, void *v)
6170{
6171        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6172
6173        seq_printf(m, "%d\n", memcg->oom_group);
6174
6175        return 0;
6176}
6177
6178static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6179                                      char *buf, size_t nbytes, loff_t off)
6180{
6181        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6182        int ret, oom_group;
6183
6184        buf = strstrip(buf);
6185        if (!buf)
6186                return -EINVAL;
6187
6188        ret = kstrtoint(buf, 0, &oom_group);
6189        if (ret)
6190                return ret;
6191
6192        if (oom_group != 0 && oom_group != 1)
6193                return -EINVAL;
6194
6195        memcg->oom_group = oom_group;
6196
6197        return nbytes;
6198}
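/*
 * For illustration only (not kernel code): memory.oom.group accepts "0" or
 * "1".  Writing 1 asks the OOM killer to treat the cgroup as an indivisible
 * workload and kill all of its tasks together, e.g. (path is made up):
 *
 *	echo 1 > /sys/fs/cgroup/mygroup/memory.oom.group
 */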
6199
6200static struct cftype memory_files[] = {
6201        {
6202                .name = "current",
6203                .flags = CFTYPE_NOT_ON_ROOT,
6204                .read_u64 = memory_current_read,
6205        },
6206        {
6207                .name = "min",
6208                .flags = CFTYPE_NOT_ON_ROOT,
6209                .seq_show = memory_min_show,
6210                .write = memory_min_write,
6211        },
6212        {
6213                .name = "low",
6214                .flags = CFTYPE_NOT_ON_ROOT,
6215                .seq_show = memory_low_show,
6216                .write = memory_low_write,
6217        },
6218        {
6219                .name = "high",
6220                .flags = CFTYPE_NOT_ON_ROOT,
6221                .seq_show = memory_high_show,
6222                .write = memory_high_write,
6223        },
6224        {
6225                .name = "max",
6226                .flags = CFTYPE_NOT_ON_ROOT,
6227                .seq_show = memory_max_show,
6228                .write = memory_max_write,
6229        },
6230        {
6231                .name = "events",
6232                .flags = CFTYPE_NOT_ON_ROOT,
6233                .file_offset = offsetof(struct mem_cgroup, events_file),
6234                .seq_show = memory_events_show,
6235        },
6236        {
6237                .name = "events.local",
6238                .flags = CFTYPE_NOT_ON_ROOT,
6239                .file_offset = offsetof(struct mem_cgroup, events_local_file),
6240                .seq_show = memory_events_local_show,
6241        },
6242        {
6243                .name = "stat",
6244                .seq_show = memory_stat_show,
6245        },
6246        {
6247                .name = "oom.group",
6248                .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6249                .seq_show = memory_oom_group_show,
6250                .write = memory_oom_group_write,
6251        },
6252        { }     /* terminate */
6253};
6254
6255struct cgroup_subsys memory_cgrp_subsys = {
6256        .css_alloc = mem_cgroup_css_alloc,
6257        .css_online = mem_cgroup_css_online,
6258        .css_offline = mem_cgroup_css_offline,
6259        .css_released = mem_cgroup_css_released,
6260        .css_free = mem_cgroup_css_free,
6261        .css_reset = mem_cgroup_css_reset,
6262        .can_attach = mem_cgroup_can_attach,
6263        .cancel_attach = mem_cgroup_cancel_attach,
6264        .post_attach = mem_cgroup_move_task,
6265        .bind = mem_cgroup_bind,
6266        .dfl_cftypes = memory_files,
6267        .legacy_cftypes = mem_cgroup_legacy_files,
6268        .early_init = 0,
6269};
6270
6271/*
6272 * This function calculates an individual cgroup's effective
6273 * protection which is derived from its own memory.min/low, its
6274 * parent's and siblings' settings, as well as the actual memory
6275 * distribution in the tree.
6276 *
6277 * The following rules apply to the effective protection values:
6278 *
6279 * 1. At the first level of reclaim, effective protection is equal to
6280 *    the declared protection in memory.min and memory.low.
6281 *
6282 * 2. To enable safe delegation of the protection configuration, at
6283 *    subsequent levels the effective protection is capped to the
6284 *    parent's effective protection.
6285 *
6286 * 3. To make complex and dynamic subtrees easier to configure, the
6287 *    user is allowed to overcommit the declared protection at a given
6288 *    level. If that is the case, the parent's effective protection is
6289 *    distributed to the children in proportion to how much protection
6290 *    they have declared and how much of it they are utilizing.
6291 *
6292 *    This makes distribution proportional, but also work-conserving:
6293 *    if one cgroup claims much more protection than the memory it uses,
6294 *    the unused remainder is available to its siblings.
6295 *
6296 * 4. Conversely, when the declared protection is undercommitted at a
6297 *    given level, the distribution of the larger parental protection
6298 *    budget is NOT proportional. A cgroup's protection from a sibling
6299 *    is capped to its own memory.min/low setting.
6300 *
6301 * 5. However, to allow protecting recursive subtrees from each other
6302 *    without having to declare each individual cgroup's fixed share
6303 *    of the ancestor's claim to protection, any unutilized -
6304 *    "floating" - protection from up the tree is distributed in
6305 *    proportion to each cgroup's *usage*. This makes the protection
6306 *    neutral wrt sibling cgroups and lets them compete freely over
6307 *    the shared parental protection budget, but it protects the
6308 *    subtree as a whole from neighboring subtrees.
6309 *
6310 * Note that 4. and 5. are not in conflict: 4. is about protecting
6311 * against immediate siblings whereas 5. is about protecting against
6312 * neighboring subtrees.
6313 */
6314static unsigned long effective_protection(unsigned long usage,
6315                                          unsigned long parent_usage,
6316                                          unsigned long setting,
6317                                          unsigned long parent_effective,
6318                                          unsigned long siblings_protected)
6319{
6320        unsigned long protected;
6321        unsigned long ep;
6322
6323        protected = min(usage, setting);
6324        /*
6325         * If all cgroups at this level combined claim and use more
6326         * protection than what the parent affords them, distribute
6327         * shares in proportion to utilization.
6328         *
6329         * We are using actual utilization rather than the statically
6330         * claimed protection in order to be work-conserving: claimed
6331         * but unused protection is available to siblings that would
6332         * otherwise get a smaller chunk than what they claimed.
6333         */
6334        if (siblings_protected > parent_effective)
6335                return protected * parent_effective / siblings_protected;
6336
6337        /*
6338         * Ok, utilized protection of all children is within what the
6339         * parent affords them, so we know whatever this child claims
6340         * and utilizes is effectively protected.
6341         *
6342         * If there is unprotected usage beyond this value, reclaim
6343         * will apply pressure in proportion to that amount.
6344         *
6345         * If there is unutilized protection, the cgroup will be fully
6346         * shielded from reclaim, but we do return a smaller value for
6347         * protection than what the group could enjoy in theory. This
6348         * is okay. With the overcommit distribution above, effective
6349         * protection is always dependent on how memory is actually
6350         * consumed among the siblings anyway.
6351         */
6352        ep = protected;
6353
6354        /*
6355         * If the children aren't claiming (all of) the protection
6356         * afforded to them by the parent, distribute the remainder in
6357         * proportion to the (unprotected) memory of each cgroup. That
6358         * way, cgroups that aren't explicitly prioritized wrt each
6359         * other compete freely over the allowance, but they are
6360         * collectively protected from neighboring trees.
6361         *
6362         * We're using unprotected memory for the weight so that if
6363         * some cgroups DO claim explicit protection, we don't protect
6364         * the same bytes twice.
6365         *
6366         * Check both usage and parent_usage against the respective
6367         * protected values. One should imply the other, but they
6368         * aren't read atomically - make sure the division is sane.
6369         */
6370        if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6371                return ep;
6372        if (parent_effective > siblings_protected &&
6373            parent_usage > siblings_protected &&
6374            usage > protected) {
6375                unsigned long unclaimed;
6376
6377                unclaimed = parent_effective - siblings_protected;
6378                unclaimed *= usage - protected;
6379                unclaimed /= parent_usage - siblings_protected;
6380
6381                ep += unclaimed;
6382        }
6383
6384        return ep;
6385}
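/*
 * A worked example of the overcommit case above, with made-up numbers:
 * say the parent's effective protection is 10G and it has two children,
 * A with memory.min=8G using 6G and B with memory.min=6G using 7G.  Then
 * protected(A) = min(6G, 8G) = 6G and protected(B) = min(7G, 6G) = 6G,
 * so siblings_protected = 12G exceeds the parent's 10G budget and each
 * child is scaled down proportionally:
 *
 *	ep(A) = 6G * 10G / 12G = 5G
 *	ep(B) = 6G * 10G / 12G = 5G
 *
 * With CGRP_ROOT_MEMORY_RECURSIVE_PROT on an *under*committed level
 * (say parent_effective=10G, siblings_protected=2G, parent_usage=8G,
 * and a child with protected=1G using 5G), the floating remainder is
 * handed out in proportion to unprotected usage:
 *
 *	unclaimed = (10G - 2G) * (5G - 1G) / (8G - 2G) ~= 5.3G
 *	ep        = 1G + 5.3G                          ~= 6.3G
 */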
6386
6387/**
6388 * mem_cgroup_protected - check if memory consumption is in the normal range
6389 * @root: the top ancestor of the sub-tree being checked
6390 * @memcg: the memory cgroup to check
6391 *
6392 * WARNING: This function is not stateless! It can only be used as part
6393 *          of a top-down tree iteration, not for isolated queries.
6394 *
6395 * Returns one of the following:
6396 *   MEMCG_PROT_NONE: cgroup memory is not protected
6397 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is
6398 *     an unprotected supply of reclaimable memory from other cgroups.
6399 *   MEMCG_PROT_MIN: cgroup memory is protected
6400 */
6401enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
6402                                                struct mem_cgroup *memcg)
6403{
6404        unsigned long usage, parent_usage;
6405        struct mem_cgroup *parent;
6406
6407        if (mem_cgroup_disabled())
6408                return MEMCG_PROT_NONE;
6409
6410        if (!root)
6411                root = root_mem_cgroup;
6412        if (memcg == root)
6413                return MEMCG_PROT_NONE;
6414
6415        usage = page_counter_read(&memcg->memory);
6416        if (!usage)
6417                return MEMCG_PROT_NONE;
6418
6419        parent = parent_mem_cgroup(memcg);
6420        /* No parent means a non-hierarchical mode on v1 memcg */
6421        if (!parent)
6422                return MEMCG_PROT_NONE;
6423
6424        if (parent == root) {
6425                memcg->memory.emin = READ_ONCE(memcg->memory.min);
6426                memcg->memory.elow = READ_ONCE(memcg->memory.low);
6427                goto out;
6428        }
6429
6430        parent_usage = page_counter_read(&parent->memory);
6431
6432        WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6433                        READ_ONCE(memcg->memory.min),
6434                        READ_ONCE(parent->memory.emin),
6435                        atomic_long_read(&parent->memory.children_min_usage)));
6436
6437        WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6438                        READ_ONCE(memcg->memory.low),
6439                        READ_ONCE(parent->memory.elow),
6440                        atomic_long_read(&parent->memory.children_low_usage)));
6441
6442out:
6443        if (usage <= memcg->memory.emin)
6444                return MEMCG_PROT_MIN;
6445        else if (usage <= memcg->memory.elow)
6446                return MEMCG_PROT_LOW;
6447        else
6448                return MEMCG_PROT_NONE;
6449}
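/*
 * A rough, simplified sketch (not the verbatim mm/vmscan.c code) of how a
 * top-down reclaim walk is expected to consume these return values:
 *
 *	switch (mem_cgroup_protected(target_memcg, memcg)) {
 *	case MEMCG_PROT_MIN:
 *		continue;	// hard protection, skip the group
 *	case MEMCG_PROT_LOW:
 *		// soft protection: skip while unprotected memory
 *		// remains elsewhere, otherwise record the breach
 *		memcg_memory_event(memcg, MEMCG_LOW);
 *		break;
 *	case MEMCG_PROT_NONE:
 *		break;
 *	}
 */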
6450
6451/**
6452 * mem_cgroup_charge - charge a newly allocated page to a cgroup
6453 * @page: page to charge
6454 * @mm: mm context of the victim
6455 * @gfp_mask: reclaim mode
6456 *
6457 * Try to charge @page to the memcg that @mm belongs to, reclaiming
6458 * pages according to @gfp_mask if necessary.
6459 *
6460 * Returns 0 on success. Otherwise, an error code is returned.
6461 */
6462int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
6463{
6464        unsigned int nr_pages = hpage_nr_pages(page);
6465        struct mem_cgroup *memcg = NULL;
6466        int ret = 0;
6467
6468        if (mem_cgroup_disabled())
6469                goto out;
6470
6471        if (PageSwapCache(page)) {
6472                swp_entry_t ent = { .val = page_private(page), };
6473                unsigned short id;
6474
6475                /*
6476                 * Every swap fault against a single page tries to charge the
6477                 * page, bail as early as possible.  shmem_unuse() encounters
6478                 * already charged pages, too.  page->mem_cgroup is protected
6479                 * by the page lock, which serializes swap cache removal, which
6480                 * in turn serializes uncharging.
6481                 */
6482                VM_BUG_ON_PAGE(!PageLocked(page), page);
6483                if (compound_head(page)->mem_cgroup)
6484                        goto out;
6485
6486                id = lookup_swap_cgroup_id(ent);
6487                rcu_read_lock();
6488                memcg = mem_cgroup_from_id(id);
6489                if (memcg && !css_tryget_online(&memcg->css))
6490                        memcg = NULL;
6491                rcu_read_unlock();
6492        }
6493
6494        if (!memcg)
6495                memcg = get_mem_cgroup_from_mm(mm);
6496
6497        ret = try_charge(memcg, gfp_mask, nr_pages);
6498        if (ret)
6499                goto out_put;
6500
6501        commit_charge(page, memcg);
6502
6503        local_irq_disable();
6504        mem_cgroup_charge_statistics(memcg, page, nr_pages);
6505        memcg_check_events(memcg, page);
6506        local_irq_enable();
6507
6508        if (PageSwapCache(page)) {
6509                swp_entry_t entry = { .val = page_private(page) };
6510                /*
6511                 * The swap entry might not get freed for a long time,
6512                 * let's not wait for it.  The page already received a
6513                 * memory+swap charge, drop the swap entry duplicate.
6514                 */
6515                mem_cgroup_uncharge_swap(entry, nr_pages);
6516        }
6517
6518out_put:
6519        css_put(&memcg->css);
6520out:
6521        return ret;
6522}
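/*
 * A minimal sketch of the expected call pattern (the surrounding fault
 * handling is hypothetical and heavily simplified): charge a freshly
 * allocated page before it is mapped or put on the LRU, and treat a
 * failed charge as an out-of-memory condition for the faulting task.
 *
 *	page = alloc_page(GFP_HIGHUSER_MOVABLE);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) {
 *		put_page(page);
 *		return VM_FAULT_OOM;
 *	}
 *	// ... map the page and add it to the LRU ...
 */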
6523
6524struct uncharge_gather {
6525        struct mem_cgroup *memcg;
6526        unsigned long nr_pages;
6527        unsigned long pgpgout;
6528        unsigned long nr_kmem;
6529        struct page *dummy_page;
6530};
6531
6532static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6533{
6534        memset(ug, 0, sizeof(*ug));
6535}
6536
6537static void uncharge_batch(const struct uncharge_gather *ug)
6538{
6539        unsigned long flags;
6540
6541        if (!mem_cgroup_is_root(ug->memcg)) {
6542                page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
6543                if (do_memsw_account())
6544                        page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
6545                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6546                        page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6547                memcg_oom_recover(ug->memcg);
6548        }
6549
6550        local_irq_save(flags);
6551        __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6552        __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
6553        memcg_check_events(ug->memcg, ug->dummy_page);
6554        local_irq_restore(flags);
6555
6556        if (!mem_cgroup_is_root(ug->memcg))
6557                css_put_many(&ug->memcg->css, ug->nr_pages);
6558}
6559
6560static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6561{
6562        unsigned long nr_pages;
6563
6564        VM_BUG_ON_PAGE(PageLRU(page), page);
6565
6566        if (!page->mem_cgroup)
6567                return;
6568
6569        /*
6570         * Nobody should be changing or seriously looking at
6571         * page->mem_cgroup at this point, we have fully
6572         * exclusive access to the page.
6573         */
6574
6575        if (ug->memcg != page->mem_cgroup) {
6576                if (ug->memcg) {
6577                        uncharge_batch(ug);
6578                        uncharge_gather_clear(ug);
6579                }
6580                ug->memcg = page->mem_cgroup;
6581        }
6582
6583        nr_pages = compound_nr(page);
6584        ug->nr_pages += nr_pages;
6585
6586        if (!PageKmemcg(page)) {
6587                ug->pgpgout++;
6588        } else {
6589                ug->nr_kmem += nr_pages;
6590                __ClearPageKmemcg(page);
6591        }
6592
6593        ug->dummy_page = page;
6594        page->mem_cgroup = NULL;
6595}
6596
6597static void uncharge_list(struct list_head *page_list)
6598{
6599        struct uncharge_gather ug;
6600        struct list_head *next;
6601
6602        uncharge_gather_clear(&ug);
6603
6604        /*
6605         * Note that the list can be a single page->lru; hence the
6606         * do-while loop instead of a simple list_for_each_entry().
6607         */
6608        next = page_list->next;
6609        do {
6610                struct page *page;
6611
6612                page = list_entry(next, struct page, lru);
6613                next = page->lru.next;
6614
6615                uncharge_page(page, &ug);
6616        } while (next != page_list);
6617
6618        if (ug.memcg)
6619                uncharge_batch(&ug);
6620}
6621
6622/**
6623 * mem_cgroup_uncharge - uncharge a page
6624 * @page: page to uncharge
6625 *
6626 * Uncharge a page previously charged with mem_cgroup_charge().
6627 */
6628void mem_cgroup_uncharge(struct page *page)
6629{
6630        struct uncharge_gather ug;
6631
6632        if (mem_cgroup_disabled())
6633                return;
6634
6635        /* Don't touch page->lru of any random page, pre-check: */
6636        if (!page->mem_cgroup)
6637                return;
6638
6639        uncharge_gather_clear(&ug);
6640        uncharge_page(page, &ug);
6641        uncharge_batch(&ug);
6642}
6643
6644/**
6645 * mem_cgroup_uncharge_list - uncharge a list of pages
6646 * @page_list: list of pages to uncharge
6647 *
6648 * Uncharge a list of pages previously charged with
6649 * mem_cgroup_charge().
6650 */
6651void mem_cgroup_uncharge_list(struct list_head *page_list)
6652{
6653        if (mem_cgroup_disabled())
6654                return;
6655
6656        if (!list_empty(page_list))
6657                uncharge_list(page_list);
6658}
6659
6660/**
6661 * mem_cgroup_migrate - charge a page's replacement
6662 * @oldpage: currently circulating page
6663 * @newpage: replacement page
6664 *
6665 * Charge @newpage as a replacement page for @oldpage. @oldpage will
6666 * be uncharged upon free.
6667 *
6668 * Both pages must be locked, @newpage->mapping must be set up.
6669 */
6670void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6671{
6672        struct mem_cgroup *memcg;
6673        unsigned int nr_pages;
6674        unsigned long flags;
6675
6676        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6677        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6678        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6679        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6680                       newpage);
6681
6682        if (mem_cgroup_disabled())
6683                return;
6684
6685        /* Page cache replacement: new page already charged? */
6686        if (newpage->mem_cgroup)
6687                return;
6688
6689        /* Swapcache readahead pages can get replaced before being charged */
6690        memcg = oldpage->mem_cgroup;
6691        if (!memcg)
6692                return;
6693
6694        /* Force-charge the new page. The old one will be freed soon */
6695        nr_pages = hpage_nr_pages(newpage);
6696
6697        page_counter_charge(&memcg->memory, nr_pages);
6698        if (do_memsw_account())
6699                page_counter_charge(&memcg->memsw, nr_pages);
6700        css_get_many(&memcg->css, nr_pages);
6701
6702        commit_charge(newpage, memcg);
6703
6704        local_irq_save(flags);
6705        mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
6706        memcg_check_events(memcg, newpage);
6707        local_irq_restore(flags);
6708}
6709
6710DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6711EXPORT_SYMBOL(memcg_sockets_enabled_key);
6712
6713void mem_cgroup_sk_alloc(struct sock *sk)
6714{
6715        struct mem_cgroup *memcg;
6716
6717        if (!mem_cgroup_sockets_enabled)
6718                return;
6719
6720        /* Do not associate the sock with an unrelated interrupted task's memcg. */
6721        if (in_interrupt())
6722                return;
6723
6724        rcu_read_lock();
6725        memcg = mem_cgroup_from_task(current);
6726        if (memcg == root_mem_cgroup)
6727                goto out;
6728        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6729                goto out;
6730        if (css_tryget(&memcg->css))
6731                sk->sk_memcg = memcg;
6732out:
6733        rcu_read_unlock();
6734}
6735
6736void mem_cgroup_sk_free(struct sock *sk)
6737{
6738        if (sk->sk_memcg)
6739                css_put(&sk->sk_memcg->css);
6740}
6741
6742/**
6743 * mem_cgroup_charge_skmem - charge socket memory
6744 * @memcg: memcg to charge
6745 * @nr_pages: number of pages to charge
6746 *
6747 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
6748 * @memcg's configured limit, %false if the charge had to be forced.
6749 */
6750bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6751{
6752        gfp_t gfp_mask = GFP_KERNEL;
6753
6754        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6755                struct page_counter *fail;
6756
6757                if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6758                        memcg->tcpmem_pressure = 0;
6759                        return true;
6760                }
6761                page_counter_charge(&memcg->tcpmem, nr_pages);
6762                memcg->tcpmem_pressure = 1;
6763                return false;
6764        }
6765
6766        /* Don't block in the packet receive path */
6767        if (in_softirq())
6768                gfp_mask = GFP_NOWAIT;
6769
6770        mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6771
6772        if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6773                return true;
6774
6775        try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6776        return false;
6777}
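/*
 * A simplified sketch of how the socket memory accounting code in net/ is
 * expected to use the return value (the caller shown here is illustrative,
 * not a verbatim copy): a %false return means the charge only succeeded by
 * force, so the caller should back off and suppress further allocation.
 *
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages))
 *		goto suppress_allocation;
 */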
6778
6779/**
6780 * mem_cgroup_uncharge_skmem - uncharge socket memory
6781 * @memcg: memcg to uncharge
6782 * @nr_pages: number of pages to uncharge
6783 */
6784void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6785{
6786        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6787                page_counter_uncharge(&memcg->tcpmem, nr_pages);
6788                return;
6789        }
6790
6791        mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6792
6793        refill_stock(memcg, nr_pages);
6794}
6795
6796static int __init cgroup_memory(char *s)
6797{
6798        char *token;
6799
6800        while ((token = strsep(&s, ",")) != NULL) {
6801                if (!*token)
6802                        continue;
6803                if (!strcmp(token, "nosocket"))
6804                        cgroup_memory_nosocket = true;
6805                if (!strcmp(token, "nokmem"))
6806                        cgroup_memory_nokmem = true;
6807        }
6808        return 0;
6809}
6810__setup("cgroup.memory=", cgroup_memory);
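/*
 * Example boot command line usage of the option parsed above (illustrative):
 *
 *	cgroup.memory=nosocket,nokmem
 *
 * disables socket memory accounting and kernel memory accounting while
 * leaving the rest of the memory controller enabled.
 */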
6811
6812/*
6813 * subsys_initcall() for memory controller.
6814 *
6815 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
6816 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
6817 * basically everything that doesn't depend on a specific mem_cgroup structure
6818 * should be initialized from here.
6819 */
6820static int __init mem_cgroup_init(void)
6821{
6822        int cpu, node;
6823
6824#ifdef CONFIG_MEMCG_KMEM
6825        /*
6826         * Kmem cache creation is mostly done with the slab_mutex held,
6827         * so use a workqueue with limited concurrency to avoid stalling
6828         * all worker threads in case lots of cgroups are created and
6829         * destroyed simultaneously.
6830         */
6831        memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6832        BUG_ON(!memcg_kmem_cache_wq);
6833#endif
6834
6835        cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6836                                  memcg_hotplug_cpu_dead);
6837
6838        for_each_possible_cpu(cpu)
6839                INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6840                          drain_local_stock);
6841
6842        for_each_node(node) {
6843                struct mem_cgroup_tree_per_node *rtpn;
6844
6845                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6846                                    node_online(node) ? node : NUMA_NO_NODE);
6847
6848                rtpn->rb_root = RB_ROOT;
6849                rtpn->rb_rightmost = NULL;
6850                spin_lock_init(&rtpn->lock);
6851                soft_limit_tree.rb_tree_per_node[node] = rtpn;
6852        }
6853
6854        return 0;
6855}
6856subsys_initcall(mem_cgroup_init);
6857
6858#ifdef CONFIG_MEMCG_SWAP
6859static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6860{
6861        while (!refcount_inc_not_zero(&memcg->id.ref)) {
6862                /*
6863                 * The root cgroup cannot be destroyed, so its refcount must
6864                 * always be >= 1.
6865                 */
6866                if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
6867                        VM_BUG_ON(1);
6868                        break;
6869                }
6870                memcg = parent_mem_cgroup(memcg);
6871                if (!memcg)
6872                        memcg = root_mem_cgroup;
6873        }
6874        return memcg;
6875}
6876
6877/**
6878 * mem_cgroup_swapout - transfer a memsw charge to swap
6879 * @page: page whose memsw charge to transfer
6880 * @entry: swap entry to move the charge to
6881 *
6882 * Transfer the memsw charge of @page to @entry.
6883 */
6884void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6885{
6886        struct mem_cgroup *memcg, *swap_memcg;
6887        unsigned int nr_entries;
6888        unsigned short oldid;
6889
6890        VM_BUG_ON_PAGE(PageLRU(page), page);
6891        VM_BUG_ON_PAGE(page_count(page), page);
6892
6893        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6894                return;
6895
6896        memcg = page->mem_cgroup;
6897
6898        /* Readahead page, never charged */
6899        if (!memcg)
6900                return;
6901
6902        /*
6903         * In case the memcg owning these pages has been offlined and doesn't
6904         * have an ID allocated to it anymore, charge the closest online
6905         * ancestor for the swap instead and transfer the memory+swap charge.
6906         */
6907        swap_memcg = mem_cgroup_id_get_online(memcg);
6908        nr_entries = hpage_nr_pages(page);
6909        /* Get references for the tail pages, too */
6910        if (nr_entries > 1)
6911                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
6912        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
6913                                   nr_entries);
6914        VM_BUG_ON_PAGE(oldid, page);
6915        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
6916
6917        page->mem_cgroup = NULL;
6918
6919        if (!mem_cgroup_is_root(memcg))
6920                page_counter_uncharge(&memcg->memory, nr_entries);
6921
6922        if (!cgroup_memory_noswap && memcg != swap_memcg) {
6923                if (!mem_cgroup_is_root(swap_memcg))
6924                        page_counter_charge(&swap_memcg->memsw, nr_entries);
6925                page_counter_uncharge(&memcg->memsw, nr_entries);
6926        }
6927
6928        /*
6929         * Interrupts should be disabled here because the caller holds the
6930         * i_pages lock, which is taken with interrupts off. Keeping
6931         * interrupts disabled matters because it is the only
6932         * synchronisation we have for updating the per-CPU variables.
6933         */
6934        VM_BUG_ON(!irqs_disabled());
6935        mem_cgroup_charge_statistics(memcg, page, -nr_entries);
6936        memcg_check_events(memcg, page);
6937
6938        if (!mem_cgroup_is_root(memcg))
6939                css_put_many(&memcg->css, nr_entries);
6940}
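/*
 * A rough sketch of the expected caller (simplified, not verbatim): page
 * reclaim invokes this with the i_pages lock held, just before the page is
 * deleted from the swap cache:
 *
 *	swp_entry_t swap = { .val = page_private(page) };
 *
 *	mem_cgroup_swapout(page, swap);
 *	__delete_from_swap_cache(page, swap);
 */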
6941
6942/**
6943 * mem_cgroup_try_charge_swap - try charging swap space for a page
6944 * @page: page being added to swap
6945 * @entry: swap entry to charge
6946 *
6947 * Try to charge @page's memcg for the swap space at @entry.
6948 *
6949 * Returns 0 on success, -ENOMEM on failure.
6950 */
6951int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
6952{
6953        unsigned int nr_pages = hpage_nr_pages(page);
6954        struct page_counter *counter;
6955        struct mem_cgroup *memcg;
6956        unsigned short oldid;
6957
6958        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
6959                return 0;
6960
6961        memcg = page->mem_cgroup;
6962
6963        /* Readahead page, never charged */
6964        if (!memcg)
6965                return 0;
6966
6967        if (!entry.val) {
6968                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6969                return 0;
6970        }
6971
6972        memcg = mem_cgroup_id_get_online(memcg);
6973
6974        if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
6975            !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
6976                memcg_memory_event(memcg, MEMCG_SWAP_MAX);
6977                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6978                mem_cgroup_id_put(memcg);
6979                return -ENOMEM;
6980        }
6981
6982        /* Get references for the tail pages, too */
6983        if (nr_pages > 1)
6984                mem_cgroup_id_get_many(memcg, nr_pages - 1);
6985        oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
6986        VM_BUG_ON_PAGE(oldid, page);
6987        mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
6988
6989        return 0;
6990}
6991
6992/**
6993 * mem_cgroup_uncharge_swap - uncharge swap space
6994 * @entry: swap entry to uncharge
6995 * @nr_pages: the amount of swap space to uncharge
6996 */
6997void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
6998{
6999        struct mem_cgroup *memcg;
7000        unsigned short id;
7001
7002        id = swap_cgroup_record(entry, 0, nr_pages);
7003        rcu_read_lock();
7004        memcg = mem_cgroup_from_id(id);
7005        if (memcg) {
7006                if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7007                        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7008                                page_counter_uncharge(&memcg->swap, nr_pages);
7009                        else
7010                                page_counter_uncharge(&memcg->memsw, nr_pages);
7011                }
7012                mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7013                mem_cgroup_id_put_many(memcg, nr_pages);
7014        }
7015        rcu_read_unlock();
7016}
7017
7018long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7019{
7020        long nr_swap_pages = get_nr_swap_pages();
7021
7022        if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7023                return nr_swap_pages;
7024        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7025                nr_swap_pages = min_t(long, nr_swap_pages,
7026                                      READ_ONCE(memcg->swap.max) -
7027                                      page_counter_read(&memcg->swap));
7028        return nr_swap_pages;
7029}
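/*
 * Worked example with made-up numbers: if the system has 4G of free swap
 * and the cgroup walked up to the root has swap.max=1G with 256M of swap
 * already charged (its ancestors being unlimited), the result is
 *
 *	min(4G, 1G - 256M) = 768M
 *
 * i.e. the tightest swap headroom anywhere on the path to the root.
 */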
7030
7031bool mem_cgroup_swap_full(struct page *page)
7032{
7033        struct mem_cgroup *memcg;
7034
7035        VM_BUG_ON_PAGE(!PageLocked(page), page);
7036
7037        if (vm_swap_full())
7038                return true;
7039        if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7040                return false;
7041
7042        memcg = page->mem_cgroup;
7043        if (!memcg)
7044                return false;
7045
7046        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7047                unsigned long usage = page_counter_read(&memcg->swap);
7048
7049                if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7050                    usage * 2 >= READ_ONCE(memcg->swap.max))
7051                        return true;
7052        }
7053
7054        return false;
7055}
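/*
 * Worked example: with swap.max=1G, a cgroup whose swap usage reaches 512M
 * satisfies usage * 2 >= max, so mem_cgroup_swap_full() reports its pages'
 * swap as effectively full and callers such as the swap-in and reclaim
 * paths free the underlying swap slots more eagerly, just as vm_swap_full()
 * triggers globally.
 */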
7056
7057static int __init setup_swap_account(char *s)
7058{
7059        if (!strcmp(s, "1"))
7060                cgroup_memory_noswap = 0;
7061        else if (!strcmp(s, "0"))
7062                cgroup_memory_noswap = 1;
7063        return 1;
7064}
7065__setup("swapaccount=", setup_swap_account);
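/*
 * Example usage of the boot parameter parsed above (illustrative): booting
 * with
 *
 *	swapaccount=0
 *
 * sets cgroup_memory_noswap and turns the swap accounting machinery off,
 * while swapaccount=1 explicitly keeps it enabled.
 */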
7066
7067static u64 swap_current_read(struct cgroup_subsys_state *css,
7068                             struct cftype *cft)
7069{
7070        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7071
7072        return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7073}
7074
7075static int swap_high_show(struct seq_file *m, void *v)
7076{
7077        return seq_puts_memcg_tunable(m,
7078                READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7079}
7080
7081static ssize_t swap_high_write(struct kernfs_open_file *of,
7082                               char *buf, size_t nbytes, loff_t off)
7083{
7084        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7085        unsigned long high;
7086        int err;
7087
7088        buf = strstrip(buf);
7089        err = page_counter_memparse(buf, "max", &high);
7090        if (err)
7091                return err;
7092
7093        page_counter_set_high(&memcg->swap, high);
7094
7095        return nbytes;
7096}
7097
7098static int swap_max_show(struct seq_file *m, void *v)
7099{
7100        return seq_puts_memcg_tunable(m,
7101                READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7102}
7103
7104static ssize_t swap_max_write(struct kernfs_open_file *of,
7105                              char *buf, size_t nbytes, loff_t off)
7106{
7107        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7108        unsigned long max;
7109        int err;
7110
7111        buf = strstrip(buf);
7112        err = page_counter_memparse(buf, "max", &max);
7113        if (err)
7114                return err;
7115
7116        xchg(&memcg->swap.max, max);
7117
7118        return nbytes;
7119}
7120
7121static int swap_events_show(struct seq_file *m, void *v)
7122{
7123        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7124
7125        seq_printf(m, "high %lu\n",
7126                   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7127        seq_printf(m, "max %lu\n",
7128                   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7129        seq_printf(m, "fail %lu\n",
7130                   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7131
7132        return 0;
7133}
7134
7135static struct cftype swap_files[] = {
7136        {
7137                .name = "swap.current",
7138                .flags = CFTYPE_NOT_ON_ROOT,
7139                .read_u64 = swap_current_read,
7140        },
7141        {
7142                .name = "swap.high",
7143                .flags = CFTYPE_NOT_ON_ROOT,
7144                .seq_show = swap_high_show,
7145                .write = swap_high_write,
7146        },
7147        {
7148                .name = "swap.max",
7149                .flags = CFTYPE_NOT_ON_ROOT,
7150                .seq_show = swap_max_show,
7151                .write = swap_max_write,
7152        },
7153        {
7154                .name = "swap.events",
7155                .flags = CFTYPE_NOT_ON_ROOT,
7156                .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7157                .seq_show = swap_events_show,
7158        },
7159        { }     /* terminate */
7160};
7161
7162static struct cftype memsw_files[] = {
7163        {
7164                .name = "memsw.usage_in_bytes",
7165                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7166                .read_u64 = mem_cgroup_read_u64,
7167        },
7168        {
7169                .name = "memsw.max_usage_in_bytes",
7170                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7171                .write = mem_cgroup_reset,
7172                .read_u64 = mem_cgroup_read_u64,
7173        },
7174        {
7175                .name = "memsw.limit_in_bytes",
7176                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7177                .write = mem_cgroup_write,
7178                .read_u64 = mem_cgroup_read_u64,
7179        },
7180        {
7181                .name = "memsw.failcnt",
7182                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7183                .write = mem_cgroup_reset,
7184                .read_u64 = mem_cgroup_read_u64,
7185        },
7186        { },    /* terminate */
7187};
7188
7189/*
7190 * If mem_cgroup_swap_init() were implemented as a subsys_initcall()
7191 * instead of a core_initcall(), cgroup_memory_noswap could remain false
7192 * even when memcg is disabled via the "cgroup_disable=memory" boot
7193 * parameter. That may result in a premature OOPS inside
7194 * mem_cgroup_get_nr_swap_pages() in corner cases.
7195 */
7196static int __init mem_cgroup_swap_init(void)
7197{
7198        /* No memory control -> no swap control */
7199        if (mem_cgroup_disabled())
7200                cgroup_memory_noswap = true;
7201
7202        if (cgroup_memory_noswap)
7203                return 0;
7204
7205        WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7206        WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7207
7208        return 0;
7209}
7210core_initcall(mem_cgroup_swap_init);
7211
7212#endif /* CONFIG_MEMCG_SWAP */
7213