linux/mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * Memory thresholds
  10 * Copyright (C) 2009 Nokia Corporation
  11 * Author: Kirill A. Shutemov
  12 *
  13 * Kernel Memory Controller
  14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15 * Authors: Glauber Costa and Suleiman Souhlal
  16 *
  17 * Native page reclaim
  18 * Charge lifetime sanitation
  19 * Lockless page tracking & accounting
  20 * Unified hierarchy configuration model
  21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  22 *
  23 * This program is free software; you can redistribute it and/or modify
  24 * it under the terms of the GNU General Public License as published by
  25 * the Free Software Foundation; either version 2 of the License, or
  26 * (at your option) any later version.
  27 *
  28 * This program is distributed in the hope that it will be useful,
  29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  31 * GNU General Public License for more details.
  32 */
  33
  34#include <linux/page_counter.h>
  35#include <linux/memcontrol.h>
  36#include <linux/cgroup.h>
  37#include <linux/mm.h>
  38#include <linux/hugetlb.h>
  39#include <linux/pagemap.h>
  40#include <linux/smp.h>
  41#include <linux/page-flags.h>
  42#include <linux/backing-dev.h>
  43#include <linux/bit_spinlock.h>
  44#include <linux/rcupdate.h>
  45#include <linux/limits.h>
  46#include <linux/export.h>
  47#include <linux/mutex.h>
  48#include <linux/rbtree.h>
  49#include <linux/slab.h>
  50#include <linux/swap.h>
  51#include <linux/swapops.h>
  52#include <linux/spinlock.h>
  53#include <linux/eventfd.h>
  54#include <linux/poll.h>
  55#include <linux/sort.h>
  56#include <linux/fs.h>
  57#include <linux/seq_file.h>
  58#include <linux/vmpressure.h>
  59#include <linux/mm_inline.h>
  60#include <linux/swap_cgroup.h>
  61#include <linux/cpu.h>
  62#include <linux/oom.h>
  63#include <linux/lockdep.h>
  64#include <linux/file.h>
  65#include <linux/tracehook.h>
  66#include "internal.h"
  67#include <net/sock.h>
  68#include <net/ip.h>
  69#include "slab.h"
  70
  71#include <asm/uaccess.h>
  72
  73#include <trace/events/vmscan.h>
  74
  75struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  76EXPORT_SYMBOL(memory_cgrp_subsys);
  77
  78struct mem_cgroup *root_mem_cgroup __read_mostly;
  79
  80#define MEM_CGROUP_RECLAIM_RETRIES      5
  81
  82/* Socket memory accounting disabled? */
  83static bool cgroup_memory_nosocket;
  84
  85/* Kernel memory accounting disabled? */
  86static bool cgroup_memory_nokmem;
  87
  88/* Whether the swap controller is active */
  89#ifdef CONFIG_MEMCG_SWAP
  90int do_swap_account __read_mostly;
  91#else
  92#define do_swap_account         0
  93#endif
  94
  95/* Whether legacy memory+swap accounting is active */
  96static bool do_memsw_account(void)
  97{
  98        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
  99}
 100
 101static const char * const mem_cgroup_stat_names[] = {
 102        "cache",
 103        "rss",
 104        "rss_huge",
 105        "mapped_file",
 106        "dirty",
 107        "writeback",
 108        "swap",
 109};
 110
 111static const char * const mem_cgroup_events_names[] = {
 112        "pgpgin",
 113        "pgpgout",
 114        "pgfault",
 115        "pgmajfault",
 116};
 117
 118static const char * const mem_cgroup_lru_names[] = {
 119        "inactive_anon",
 120        "active_anon",
 121        "inactive_file",
 122        "active_file",
 123        "unevictable",
 124};
 125
 126#define THRESHOLDS_EVENTS_TARGET 128
 127#define SOFTLIMIT_EVENTS_TARGET 1024
 128#define NUMAINFO_EVENTS_TARGET  1024
 129
 130/*
  131 * Cgroups above their soft limits are maintained in an RB-tree, independent
  132 * of their hierarchy representation.
 133 */
 134
 135struct mem_cgroup_tree_per_node {
 136        struct rb_root rb_root;
 137        spinlock_t lock;
 138};
 139
 140struct mem_cgroup_tree {
 141        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 142};
 143
 144static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 145
 146/* for OOM */
 147struct mem_cgroup_eventfd_list {
 148        struct list_head list;
 149        struct eventfd_ctx *eventfd;
 150};
 151
 152/*
  153 * cgroup_event represents events which userspace wants to receive.
 154 */
 155struct mem_cgroup_event {
 156        /*
 157         * memcg which the event belongs to.
 158         */
 159        struct mem_cgroup *memcg;
 160        /*
 161         * eventfd to signal userspace about the event.
 162         */
 163        struct eventfd_ctx *eventfd;
 164        /*
  165         * Each of these is stored in a list by the cgroup.
 166         */
 167        struct list_head list;
 168        /*
  169         * The register_event() callback is used to add a new userspace
  170         * waiter for changes related to this event.  Use eventfd_signal()
  171         * on the eventfd to send a notification to userspace.
 172         */
 173        int (*register_event)(struct mem_cgroup *memcg,
 174                              struct eventfd_ctx *eventfd, const char *args);
 175        /*
  176         * The unregister_event() callback is called when userspace closes
  177         * the eventfd or when the cgroup is removed.  This callback must be
  178         * set if you want to provide notification functionality.
 179         */
 180        void (*unregister_event)(struct mem_cgroup *memcg,
 181                                 struct eventfd_ctx *eventfd);
 182        /*
  183         * All fields below are needed to unregister the event when
  184         * userspace closes the eventfd.
 185         */
 186        poll_table pt;
 187        wait_queue_head_t *wqh;
 188        wait_queue_t wait;
 189        struct work_struct remove;
 190};
 191
 192static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 193static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 194
  195/* Stuff for moving charges at task migration. */
 196/*
 197 * Types of charges to be moved.
 198 */
 199#define MOVE_ANON       0x1U
 200#define MOVE_FILE       0x2U
 201#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 202
 203/* "mc" and its members are protected by cgroup_mutex */
 204static struct move_charge_struct {
 205        spinlock_t        lock; /* for from, to */
 206        struct mm_struct  *mm;
 207        struct mem_cgroup *from;
 208        struct mem_cgroup *to;
 209        unsigned long flags;
 210        unsigned long precharge;
 211        unsigned long moved_charge;
 212        unsigned long moved_swap;
 213        struct task_struct *moving_task;        /* a task moving charges */
 214        wait_queue_head_t waitq;                /* a waitq for other context */
 215} mc = {
 216        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 217        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 218};
 219
 220/*
  221 * Maximum loops in soft limit reclaim (mem_cgroup_soft_reclaim()), used
  222 * to prevent infinite loops, should they ever occur.
 223 */
 224#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 225#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 226
 227enum charge_type {
 228        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 229        MEM_CGROUP_CHARGE_TYPE_ANON,
 230        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 231        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 232        NR_CHARGE_TYPE,
 233};
 234
 235/* for encoding cft->private value on file */
 236enum res_type {
 237        _MEM,
 238        _MEMSWAP,
 239        _OOM_TYPE,
 240        _KMEM,
 241        _TCP,
 242};
 243
 244#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 245#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 246#define MEMFILE_ATTR(val)       ((val) & 0xffff)
  247/* Used for OOM notifier */
 248#define OOM_CONTROL             (0)
 249
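/*
 * Illustrative sketch of how the MEMFILE_* macros above are meant to be
 * used (RES_LIMIT stands in for whatever attribute constant the cftype
 * handlers use; the exact name is an assumption here):
 *
 *	priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
 *	MEMFILE_TYPE(priv);	// recovers _MEMSWAP from bits 16..31
 *	MEMFILE_ATTR(priv);	// recovers RES_LIMIT from bits 0..15
 */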
 250/* Some nice accessors for the vmpressure. */
 251struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 252{
 253        if (!memcg)
 254                memcg = root_mem_cgroup;
 255        return &memcg->vmpressure;
 256}
 257
 258struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 259{
 260        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 261}
 262
 263static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 264{
 265        return (memcg == root_mem_cgroup);
 266}
 267
 268#ifndef CONFIG_SLOB
 269/*
 270 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  271 * The main reason for not using the cgroup id for this is that it works
  272 * better in sparse environments, where we have a lot of memcgs but only a
  273 * few of them are kmem-limited. For instance, if we had 200 memcgs and
  274 * only the 200th was kmem-limited, we would still need a 200-entry array
  275 * for that.
 276 *
 277 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 278 * will double each time we have to increase it.
 279 */
 280static DEFINE_IDA(memcg_cache_ida);
 281int memcg_nr_cache_ids;
 282
 283/* Protects memcg_nr_cache_ids */
 284static DECLARE_RWSEM(memcg_cache_ids_sem);
 285
 286void memcg_get_cache_ids(void)
 287{
 288        down_read(&memcg_cache_ids_sem);
 289}
 290
 291void memcg_put_cache_ids(void)
 292{
 293        up_read(&memcg_cache_ids_sem);
 294}
 295
 296/*
  297 * MIN_SIZE is not 1 because we would like to avoid going through the
  298 * alloc/free process all the time. On a small machine, 4 kmem-limited
  299 * cgroups is a reasonable guess. In the future, it could be a parameter or
  300 * tunable, but that is not strictly necessary.
 301 *
 302 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 303 * this constant directly from cgroup, but it is understandable that this is
 304 * better kept as an internal representation in cgroup.c. In any case, the
 305 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 306 * increase ours as well if it increases.
 307 */
 308#define MEMCG_CACHES_MIN_SIZE 4
 309#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 310
 311/*
 312 * A lot of the calls to the cache allocation functions are expected to be
 313 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  314 * conditional on this static branch, we have to allow modules that do
  315 * kmem_cache_alloc and the like to see this symbol as well.
 316 */
 317DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 318EXPORT_SYMBOL(memcg_kmem_enabled_key);
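/*
 * A minimal sketch of how this key is consumed; the real helper lives in
 * the memcontrol header and may differ in detail.  Hot allocation paths
 * test the static branch so that configurations without kmem accounting
 * pay almost nothing:
 *
 *	static inline bool memcg_kmem_enabled(void)
 *	{
 *		return static_branch_unlikely(&memcg_kmem_enabled_key);
 *	}
 */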
 319
 320#endif /* !CONFIG_SLOB */
 321
 322/**
 323 * mem_cgroup_css_from_page - css of the memcg associated with a page
 324 * @page: page of interest
 325 *
 326 * If memcg is bound to the default hierarchy, css of the memcg associated
 327 * with @page is returned.  The returned css remains associated with @page
 328 * until it is released.
 329 *
 330 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 331 * is returned.
 332 */
 333struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 334{
 335        struct mem_cgroup *memcg;
 336
 337        memcg = page->mem_cgroup;
 338
 339        if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 340                memcg = root_mem_cgroup;
 341
 342        return &memcg->css;
 343}
 344
 345/**
 346 * page_cgroup_ino - return inode number of the memcg a page is charged to
 347 * @page: the page
 348 *
 349 * Look up the closest online ancestor of the memory cgroup @page is charged to
 350 * and return its inode number or 0 if @page is not charged to any cgroup. It
 351 * is safe to call this function without holding a reference to @page.
 352 *
 353 * Note, this function is inherently racy, because there is nothing to prevent
 354 * the cgroup inode from getting torn down and potentially reallocated a moment
 355 * after page_cgroup_ino() returns, so it only should be used by callers that
 356 * do not care (such as procfs interfaces).
 357 */
 358ino_t page_cgroup_ino(struct page *page)
 359{
 360        struct mem_cgroup *memcg;
 361        unsigned long ino = 0;
 362
 363        rcu_read_lock();
 364        memcg = READ_ONCE(page->mem_cgroup);
 365        while (memcg && !(memcg->css.flags & CSS_ONLINE))
 366                memcg = parent_mem_cgroup(memcg);
 367        if (memcg)
 368                ino = cgroup_ino(memcg->css.cgroup);
 369        rcu_read_unlock();
 370        return ino;
 371}
 372
 373static struct mem_cgroup_per_node *
 374mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 375{
 376        int nid = page_to_nid(page);
 377
 378        return memcg->nodeinfo[nid];
 379}
 380
 381static struct mem_cgroup_tree_per_node *
 382soft_limit_tree_node(int nid)
 383{
 384        return soft_limit_tree.rb_tree_per_node[nid];
 385}
 386
 387static struct mem_cgroup_tree_per_node *
 388soft_limit_tree_from_page(struct page *page)
 389{
 390        int nid = page_to_nid(page);
 391
 392        return soft_limit_tree.rb_tree_per_node[nid];
 393}
 394
 395static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 396                                         struct mem_cgroup_tree_per_node *mctz,
 397                                         unsigned long new_usage_in_excess)
 398{
 399        struct rb_node **p = &mctz->rb_root.rb_node;
 400        struct rb_node *parent = NULL;
 401        struct mem_cgroup_per_node *mz_node;
 402
 403        if (mz->on_tree)
 404                return;
 405
 406        mz->usage_in_excess = new_usage_in_excess;
 407        if (!mz->usage_in_excess)
 408                return;
 409        while (*p) {
 410                parent = *p;
 411                mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 412                                        tree_node);
 413                if (mz->usage_in_excess < mz_node->usage_in_excess)
 414                        p = &(*p)->rb_left;
 415                /*
 416                 * We can't avoid mem cgroups that are over their soft
 417                 * limit by the same amount
 418                 */
 419                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 420                        p = &(*p)->rb_right;
 421        }
 422        rb_link_node(&mz->tree_node, parent, p);
 423        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 424        mz->on_tree = true;
 425}
 426
 427static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 428                                         struct mem_cgroup_tree_per_node *mctz)
 429{
 430        if (!mz->on_tree)
 431                return;
 432        rb_erase(&mz->tree_node, &mctz->rb_root);
 433        mz->on_tree = false;
 434}
 435
 436static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 437                                       struct mem_cgroup_tree_per_node *mctz)
 438{
 439        unsigned long flags;
 440
 441        spin_lock_irqsave(&mctz->lock, flags);
 442        __mem_cgroup_remove_exceeded(mz, mctz);
 443        spin_unlock_irqrestore(&mctz->lock, flags);
 444}
 445
 446static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 447{
 448        unsigned long nr_pages = page_counter_read(&memcg->memory);
 449        unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 450        unsigned long excess = 0;
 451
 452        if (nr_pages > soft_limit)
 453                excess = nr_pages - soft_limit;
 454
 455        return excess;
 456}
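/*
 * Worked example (illustrative numbers): with a usage of 1536 pages and a
 * soft limit of 1024 pages, soft_limit_excess() returns 512.  At or below
 * the soft limit it returns 0, which keeps the memcg off the soft limit
 * tree (see __mem_cgroup_insert_exceeded()).
 */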
 457
 458static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 459{
 460        unsigned long excess;
 461        struct mem_cgroup_per_node *mz;
 462        struct mem_cgroup_tree_per_node *mctz;
 463
 464        mctz = soft_limit_tree_from_page(page);
 465        /*
  466         * It is necessary to update all ancestors when the hierarchy is
  467         * used, because their event counters are not touched.
 468         */
 469        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 470                mz = mem_cgroup_page_nodeinfo(memcg, page);
 471                excess = soft_limit_excess(memcg);
 472                /*
 473                 * We have to update the tree if mz is on RB-tree or
 474                 * mem is over its softlimit.
 475                 */
 476                if (excess || mz->on_tree) {
 477                        unsigned long flags;
 478
 479                        spin_lock_irqsave(&mctz->lock, flags);
 480                        /* if on-tree, remove it */
 481                        if (mz->on_tree)
 482                                __mem_cgroup_remove_exceeded(mz, mctz);
 483                        /*
 484                         * Insert again. mz->usage_in_excess will be updated.
 485                         * If excess is 0, no tree ops.
 486                         */
 487                        __mem_cgroup_insert_exceeded(mz, mctz, excess);
 488                        spin_unlock_irqrestore(&mctz->lock, flags);
 489                }
 490        }
 491}
 492
 493static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 494{
 495        struct mem_cgroup_tree_per_node *mctz;
 496        struct mem_cgroup_per_node *mz;
 497        int nid;
 498
 499        for_each_node(nid) {
 500                mz = mem_cgroup_nodeinfo(memcg, nid);
 501                mctz = soft_limit_tree_node(nid);
 502                mem_cgroup_remove_exceeded(mz, mctz);
 503        }
 504}
 505
 506static struct mem_cgroup_per_node *
 507__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 508{
 509        struct rb_node *rightmost = NULL;
 510        struct mem_cgroup_per_node *mz;
 511
 512retry:
 513        mz = NULL;
 514        rightmost = rb_last(&mctz->rb_root);
 515        if (!rightmost)
 516                goto done;              /* Nothing to reclaim from */
 517
 518        mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
 519        /*
  520         * Remove the node now but someone else can add it back;
  521         * we will add it back at the end of reclaim to its correct
  522         * position in the tree.
 523         */
 524        __mem_cgroup_remove_exceeded(mz, mctz);
 525        if (!soft_limit_excess(mz->memcg) ||
 526            !css_tryget_online(&mz->memcg->css))
 527                goto retry;
 528done:
 529        return mz;
 530}
 531
 532static struct mem_cgroup_per_node *
 533mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 534{
 535        struct mem_cgroup_per_node *mz;
 536
 537        spin_lock_irq(&mctz->lock);
 538        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 539        spin_unlock_irq(&mctz->lock);
 540        return mz;
 541}
 542
 543/*
  544 * Return the page count for a single (non-recursive) @memcg.
  545 *
  546 * Implementation note: reading percpu statistics for memcg.
  547 *
  548 * Both vmstat[] and percpu_counter use thresholds and periodic
  549 * synchronization to implement a "quick" read. There is a trade-off
  550 * between reading cost and precision of the value, so we could
  551 * implement a similar periodic synchronization for memcg's counters.
  552 *
  553 * But this _read() function is currently used for the user interface.
  554 * Users account memory usage per memory cgroup and always require an
  555 * exact value. Even if we provided a quick-and-fuzzy read, we would
  556 * still have to visit all online cpus and sum the values, so for now
  557 * the extra synchronization is not implemented (it exists only for
  558 * cpu hotplug).
  559 *
  560 * If kernel-internal users could make do with an inexact value, and
  561 * reading all per-cpu values became a bottleneck in some common
  562 * workload, thresholds and synchronization as in vmstat[] should be implemented.
 563 */
 564static unsigned long
 565mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 566{
 567        long val = 0;
 568        int cpu;
 569
 570        /* Per-cpu values can be negative, use a signed accumulator */
 571        for_each_possible_cpu(cpu)
 572                val += per_cpu(memcg->stat->count[idx], cpu);
 573        /*
 574         * Summing races with updates, so val may be negative.  Avoid exposing
 575         * transient negative values.
 576         */
 577        if (val < 0)
 578                val = 0;
 579        return val;
 580}
 581
 582static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 583                                            enum mem_cgroup_events_index idx)
 584{
 585        unsigned long val = 0;
 586        int cpu;
 587
 588        for_each_possible_cpu(cpu)
 589                val += per_cpu(memcg->stat->events[idx], cpu);
 590        return val;
 591}
 592
 593static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 594                                         struct page *page,
 595                                         bool compound, int nr_pages)
 596{
 597        /*
 598         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 599         * counted as CACHE even if it's on ANON LRU.
 600         */
 601        if (PageAnon(page))
 602                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 603                                nr_pages);
 604        else
 605                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 606                                nr_pages);
 607
 608        if (compound) {
 609                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 610                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 611                                nr_pages);
 612        }
 613
  614        /* a pagein of a huge page counts as one event, so ignore the page size */
 615        if (nr_pages > 0)
 616                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 617        else {
 618                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 619                nr_pages = -nr_pages; /* for event */
 620        }
 621
 622        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 623}
 624
 625unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 626                                           int nid, unsigned int lru_mask)
 627{
 628        unsigned long nr = 0;
 629        struct mem_cgroup_per_node *mz;
 630        enum lru_list lru;
 631
 632        VM_BUG_ON((unsigned)nid >= nr_node_ids);
 633
 634        for_each_lru(lru) {
 635                if (!(BIT(lru) & lru_mask))
 636                        continue;
 637                mz = mem_cgroup_nodeinfo(memcg, nid);
 638                nr += mz->lru_size[lru];
 639        }
 640        return nr;
 641}
 642
 643static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 644                        unsigned int lru_mask)
 645{
 646        unsigned long nr = 0;
 647        int nid;
 648
 649        for_each_node_state(nid, N_MEMORY)
 650                nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 651        return nr;
 652}
 653
 654static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 655                                       enum mem_cgroup_events_target target)
 656{
 657        unsigned long val, next;
 658
 659        val = __this_cpu_read(memcg->stat->nr_page_events);
 660        next = __this_cpu_read(memcg->stat->targets[target]);
 661        /* from time_after() in jiffies.h */
 662        if ((long)next - (long)val < 0) {
 663                switch (target) {
 664                case MEM_CGROUP_TARGET_THRESH:
 665                        next = val + THRESHOLDS_EVENTS_TARGET;
 666                        break;
 667                case MEM_CGROUP_TARGET_SOFTLIMIT:
 668                        next = val + SOFTLIMIT_EVENTS_TARGET;
 669                        break;
 670                case MEM_CGROUP_TARGET_NUMAINFO:
 671                        next = val + NUMAINFO_EVENTS_TARGET;
 672                        break;
 673                default:
 674                        break;
 675                }
 676                __this_cpu_write(memcg->stat->targets[target], next);
 677                return true;
 678        }
 679        return false;
 680}
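/*
 * Worked example (illustrative numbers): with THRESHOLDS_EVENTS_TARGET of
 * 128, suppose this cpu's nr_page_events is 130 and the stored target for
 * MEM_CGROUP_TARGET_THRESH is 128.  Then (long)128 - (long)130 < 0, so the
 * target fires: the next target becomes 130 + 128 = 258 and the function
 * returns true.  Until another 128 page events accumulate on this cpu, it
 * returns false.
 */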
 681
 682/*
  683 * Check events in order: the threshold target fires most often, the
  684 * soft limit and numainfo targets less frequently.
 685 */
 686static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 687{
 688        /* threshold event is triggered in finer grain than soft limit */
 689        if (unlikely(mem_cgroup_event_ratelimit(memcg,
 690                                                MEM_CGROUP_TARGET_THRESH))) {
 691                bool do_softlimit;
 692                bool do_numainfo __maybe_unused;
 693
 694                do_softlimit = mem_cgroup_event_ratelimit(memcg,
 695                                                MEM_CGROUP_TARGET_SOFTLIMIT);
 696#if MAX_NUMNODES > 1
 697                do_numainfo = mem_cgroup_event_ratelimit(memcg,
 698                                                MEM_CGROUP_TARGET_NUMAINFO);
 699#endif
 700                mem_cgroup_threshold(memcg);
 701                if (unlikely(do_softlimit))
 702                        mem_cgroup_update_tree(memcg, page);
 703#if MAX_NUMNODES > 1
 704                if (unlikely(do_numainfo))
 705                        atomic_inc(&memcg->numainfo_events);
 706#endif
 707        }
 708}
 709
 710struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 711{
 712        /*
 713         * mm_update_next_owner() may clear mm->owner to NULL
 714         * if it races with swapoff, page migration, etc.
 715         * So this can be called with p == NULL.
 716         */
 717        if (unlikely(!p))
 718                return NULL;
 719
 720        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 721}
 722EXPORT_SYMBOL(mem_cgroup_from_task);
 723
 724static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 725{
 726        struct mem_cgroup *memcg = NULL;
 727
 728        rcu_read_lock();
 729        do {
 730                /*
  731                 * Page cache insertions can happen without an
 732                 * actual mm context, e.g. during disk probing
 733                 * on boot, loopback IO, acct() writes etc.
 734                 */
 735                if (unlikely(!mm))
 736                        memcg = root_mem_cgroup;
 737                else {
 738                        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 739                        if (unlikely(!memcg))
 740                                memcg = root_mem_cgroup;
 741                }
 742        } while (!css_tryget_online(&memcg->css));
 743        rcu_read_unlock();
 744        return memcg;
 745}
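/*
 * Note for callers (a sketch of the expected usage): the returned memcg
 * carries a reference taken with css_tryget_online(), so each successful
 * call must eventually be paired with css_put(&memcg->css) once the
 * charge or other operation is finished.
 */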
 746
 747/**
 748 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 749 * @root: hierarchy root
 750 * @prev: previously returned memcg, NULL on first invocation
 751 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 752 *
 753 * Returns references to children of the hierarchy below @root, or
 754 * @root itself, or %NULL after a full round-trip.
 755 *
 756 * Caller must pass the return value in @prev on subsequent
 757 * invocations for reference counting, or use mem_cgroup_iter_break()
 758 * to cancel a hierarchy walk before the round-trip is complete.
 759 *
  760 * Reclaimers can specify a node and a priority level in @reclaim to
  761 * divide up the memcgs in the hierarchy among all concurrent
  762 * reclaimers operating on the same node and priority.
 763 */
 764struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 765                                   struct mem_cgroup *prev,
 766                                   struct mem_cgroup_reclaim_cookie *reclaim)
 767{
 768        struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 769        struct cgroup_subsys_state *css = NULL;
 770        struct mem_cgroup *memcg = NULL;
 771        struct mem_cgroup *pos = NULL;
 772
 773        if (mem_cgroup_disabled())
 774                return NULL;
 775
 776        if (!root)
 777                root = root_mem_cgroup;
 778
 779        if (prev && !reclaim)
 780                pos = prev;
 781
 782        if (!root->use_hierarchy && root != root_mem_cgroup) {
 783                if (prev)
 784                        goto out;
 785                return root;
 786        }
 787
 788        rcu_read_lock();
 789
 790        if (reclaim) {
 791                struct mem_cgroup_per_node *mz;
 792
 793                mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
 794                iter = &mz->iter[reclaim->priority];
 795
 796                if (prev && reclaim->generation != iter->generation)
 797                        goto out_unlock;
 798
 799                while (1) {
 800                        pos = READ_ONCE(iter->position);
 801                        if (!pos || css_tryget(&pos->css))
 802                                break;
 803                        /*
 804                         * css reference reached zero, so iter->position will
 805                         * be cleared by ->css_released. However, we should not
 806                         * rely on this happening soon, because ->css_released
 807                         * is called from a work queue, and by busy-waiting we
 808                         * might block it. So we clear iter->position right
 809                         * away.
 810                         */
 811                        (void)cmpxchg(&iter->position, pos, NULL);
 812                }
 813        }
 814
 815        if (pos)
 816                css = &pos->css;
 817
 818        for (;;) {
 819                css = css_next_descendant_pre(css, &root->css);
 820                if (!css) {
 821                        /*
 822                         * Reclaimers share the hierarchy walk, and a
 823                         * new one might jump in right at the end of
 824                         * the hierarchy - make sure they see at least
 825                         * one group and restart from the beginning.
 826                         */
 827                        if (!prev)
 828                                continue;
 829                        break;
 830                }
 831
 832                /*
 833                 * Verify the css and acquire a reference.  The root
 834                 * is provided by the caller, so we know it's alive
 835                 * and kicking, and don't take an extra reference.
 836                 */
 837                memcg = mem_cgroup_from_css(css);
 838
 839                if (css == &root->css)
 840                        break;
 841
 842                if (css_tryget(css))
 843                        break;
 844
 845                memcg = NULL;
 846        }
 847
 848        if (reclaim) {
 849                /*
 850                 * The position could have already been updated by a competing
 851                 * thread, so check that the value hasn't changed since we read
 852                 * it to avoid reclaiming from the same cgroup twice.
 853                 */
 854                (void)cmpxchg(&iter->position, pos, memcg);
 855
 856                if (pos)
 857                        css_put(&pos->css);
 858
 859                if (!memcg)
 860                        iter->generation++;
 861                else if (!prev)
 862                        reclaim->generation = iter->generation;
 863        }
 864
 865out_unlock:
 866        rcu_read_unlock();
 867out:
 868        if (prev && prev != root)
 869                css_put(&prev->css);
 870
 871        return memcg;
 872}
 873
 874/**
 875 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 876 * @root: hierarchy root
 877 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 878 */
 879void mem_cgroup_iter_break(struct mem_cgroup *root,
 880                           struct mem_cgroup *prev)
 881{
 882        if (!root)
 883                root = root_mem_cgroup;
 884        if (prev && prev != root)
 885                css_put(&prev->css);
 886}
 887
 888static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 889{
 890        struct mem_cgroup *memcg = dead_memcg;
 891        struct mem_cgroup_reclaim_iter *iter;
 892        struct mem_cgroup_per_node *mz;
 893        int nid;
 894        int i;
 895
 896        while ((memcg = parent_mem_cgroup(memcg))) {
 897                for_each_node(nid) {
 898                        mz = mem_cgroup_nodeinfo(memcg, nid);
 899                        for (i = 0; i <= DEF_PRIORITY; i++) {
 900                                iter = &mz->iter[i];
 901                                cmpxchg(&iter->position,
 902                                        dead_memcg, NULL);
 903                        }
 904                }
 905        }
 906}
 907
 908/*
 909 * Iteration constructs for visiting all cgroups (under a tree).  If
 910 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 911 * be used for reference counting.
 912 */
 913#define for_each_mem_cgroup_tree(iter, root)            \
 914        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 915             iter != NULL;                              \
 916             iter = mem_cgroup_iter(root, iter, NULL))
 917
 918#define for_each_mem_cgroup(iter)                       \
 919        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 920             iter != NULL;                              \
 921             iter = mem_cgroup_iter(NULL, iter, NULL))
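/*
 * Usage sketch (some_condition() is a hypothetical predicate): a walk
 * that bails out early must drop the iterator reference explicitly.
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 */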
 922
 923/**
 924 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 925 * @page: the page
  926 * @pgdat: pgdat of the page
 927 *
 928 * This function is only safe when following the LRU page isolation
 929 * and putback protocol: the LRU lock must be held, and the page must
 930 * either be PageLRU() or the caller must have isolated/allocated it.
 931 */
 932struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 933{
 934        struct mem_cgroup_per_node *mz;
 935        struct mem_cgroup *memcg;
 936        struct lruvec *lruvec;
 937
 938        if (mem_cgroup_disabled()) {
 939                lruvec = &pgdat->lruvec;
 940                goto out;
 941        }
 942
 943        memcg = page->mem_cgroup;
 944        /*
 945         * Swapcache readahead pages are added to the LRU - and
 946         * possibly migrated - before they are charged.
 947         */
 948        if (!memcg)
 949                memcg = root_mem_cgroup;
 950
 951        mz = mem_cgroup_page_nodeinfo(memcg, page);
 952        lruvec = &mz->lruvec;
 953out:
 954        /*
 955         * Since a node can be onlined after the mem_cgroup was created,
  956         * we have to be prepared to initialize lruvec->pgdat here;
 957         * and if offlined then reonlined, we need to reinitialize it.
 958         */
 959        if (unlikely(lruvec->pgdat != pgdat))
 960                lruvec->pgdat = pgdat;
 961        return lruvec;
 962}
 963
 964/**
 965 * mem_cgroup_update_lru_size - account for adding or removing an lru page
  966 * @lruvec: mem_cgroup per node lru vector
 967 * @lru: index of lru list the page is sitting on
 968 * @nr_pages: positive when adding or negative when removing
 969 *
 970 * This function must be called under lru_lock, just before a page is added
 971 * to or just after a page is removed from an lru list (that ordering being
 972 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 973 */
 974void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 975                                int nr_pages)
 976{
 977        struct mem_cgroup_per_node *mz;
 978        unsigned long *lru_size;
 979        long size;
 980        bool empty;
 981
 982        if (mem_cgroup_disabled())
 983                return;
 984
 985        mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 986        lru_size = mz->lru_size + lru;
 987        empty = list_empty(lruvec->lists + lru);
 988
 989        if (nr_pages < 0)
 990                *lru_size += nr_pages;
 991
 992        size = *lru_size;
 993        if (WARN_ONCE(size < 0 || empty != !size,
 994                "%s(%p, %d, %d): lru_size %ld but %sempty\n",
 995                __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
 996                VM_BUG_ON(1);
 997                *lru_size = 0;
 998        }
 999
1000        if (nr_pages > 0)
1001                *lru_size += nr_pages;
1002}
1003
1004bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1005{
1006        struct mem_cgroup *task_memcg;
1007        struct task_struct *p;
1008        bool ret;
1009
1010        p = find_lock_task_mm(task);
1011        if (p) {
1012                task_memcg = get_mem_cgroup_from_mm(p->mm);
1013                task_unlock(p);
1014        } else {
1015                /*
1016                 * All threads may have already detached their mm's, but the oom
1017                 * killer still needs to detect if they have already been oom
1018                 * killed to prevent needlessly killing additional tasks.
1019                 */
1020                rcu_read_lock();
1021                task_memcg = mem_cgroup_from_task(task);
1022                css_get(&task_memcg->css);
1023                rcu_read_unlock();
1024        }
1025        ret = mem_cgroup_is_descendant(task_memcg, memcg);
1026        css_put(&task_memcg->css);
1027        return ret;
1028}
1029
1030/**
1031 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1032 * @memcg: the memory cgroup
1033 *
 1034 * Returns the maximum amount of memory @memcg can be charged with, in
1035 * pages.
1036 */
1037static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1038{
1039        unsigned long margin = 0;
1040        unsigned long count;
1041        unsigned long limit;
1042
1043        count = page_counter_read(&memcg->memory);
1044        limit = READ_ONCE(memcg->memory.limit);
1045        if (count < limit)
1046                margin = limit - count;
1047
1048        if (do_memsw_account()) {
1049                count = page_counter_read(&memcg->memsw);
1050                limit = READ_ONCE(memcg->memsw.limit);
1051                if (count <= limit)
1052                        margin = min(margin, limit - count);
1053                else
1054                        margin = 0;
1055        }
1056
1057        return margin;
1058}
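/*
 * Worked example (illustrative numbers): with memory usage at 700 pages
 * against a limit of 1000, the memory margin is 300.  If memsw usage is
 * also 700 against a memsw limit of 900, the combined margin shrinks to
 * min(300, 200) = 200 pages.
 */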
1059
1060/*
 1061 * A routine for checking whether "memcg" is under move_account() or not.
 1062 *
 1063 * Checks whether a cgroup is mc.from or mc.to, or in the hierarchy of
 1064 * the moving cgroups. This is used for waiting out the high memory
 1065 * pressure caused by a "move".
1066 */
1067static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1068{
1069        struct mem_cgroup *from;
1070        struct mem_cgroup *to;
1071        bool ret = false;
1072        /*
1073         * Unlike task_move routines, we access mc.to, mc.from not under
1074         * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1075         */
1076        spin_lock(&mc.lock);
1077        from = mc.from;
1078        to = mc.to;
1079        if (!from)
1080                goto unlock;
1081
1082        ret = mem_cgroup_is_descendant(from, memcg) ||
1083                mem_cgroup_is_descendant(to, memcg);
1084unlock:
1085        spin_unlock(&mc.lock);
1086        return ret;
1087}
1088
1089static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1090{
1091        if (mc.moving_task && current != mc.moving_task) {
1092                if (mem_cgroup_under_move(memcg)) {
1093                        DEFINE_WAIT(wait);
1094                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1095                        /* moving charge context might have finished. */
1096                        if (mc.moving_task)
1097                                schedule();
1098                        finish_wait(&mc.waitq, &wait);
1099                        return true;
1100                }
1101        }
1102        return false;
1103}
1104
1105#define K(x) ((x) << (PAGE_SHIFT-10))
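/* With the common PAGE_SHIFT of 12, K(x) is x << 2, i.e. a page count expressed in KiB. */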
1106/**
1107 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1108 * @memcg: The memory cgroup that went over limit
1109 * @p: Task that is going to be killed
1110 *
1111 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1112 * enabled
1113 */
1114void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1115{
1116        struct mem_cgroup *iter;
1117        unsigned int i;
1118
1119        rcu_read_lock();
1120
1121        if (p) {
1122                pr_info("Task in ");
1123                pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1124                pr_cont(" killed as a result of limit of ");
1125        } else {
1126                pr_info("Memory limit reached of cgroup ");
1127        }
1128
1129        pr_cont_cgroup_path(memcg->css.cgroup);
1130        pr_cont("\n");
1131
1132        rcu_read_unlock();
1133
1134        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1135                K((u64)page_counter_read(&memcg->memory)),
1136                K((u64)memcg->memory.limit), memcg->memory.failcnt);
1137        pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1138                K((u64)page_counter_read(&memcg->memsw)),
1139                K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1140        pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1141                K((u64)page_counter_read(&memcg->kmem)),
1142                K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1143
1144        for_each_mem_cgroup_tree(iter, memcg) {
1145                pr_info("Memory cgroup stats for ");
1146                pr_cont_cgroup_path(iter->css.cgroup);
1147                pr_cont(":");
1148
1149                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1150                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1151                                continue;
1152                        pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
1153                                K(mem_cgroup_read_stat(iter, i)));
1154                }
1155
1156                for (i = 0; i < NR_LRU_LISTS; i++)
1157                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1158                                K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1159
1160                pr_cont("\n");
1161        }
1162}
1163
1164/*
 1165 * This function returns the number of memcgs in the hierarchy tree.
 1166 * It returns 1 (the memcg itself) if there are no children.
1167 */
1168static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1169{
1170        int num = 0;
1171        struct mem_cgroup *iter;
1172
1173        for_each_mem_cgroup_tree(iter, memcg)
1174                num++;
1175        return num;
1176}
1177
1178/*
1179 * Return the memory (and swap, if configured) limit for a memcg.
1180 */
1181static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1182{
1183        unsigned long limit;
1184
1185        limit = memcg->memory.limit;
1186        if (mem_cgroup_swappiness(memcg)) {
1187                unsigned long memsw_limit;
1188                unsigned long swap_limit;
1189
1190                memsw_limit = memcg->memsw.limit;
1191                swap_limit = memcg->swap.limit;
1192                swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
1193                limit = min(limit + swap_limit, memsw_limit);
1194        }
1195        return limit;
1196}
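/*
 * Worked example (illustrative numbers): with memory.limit = 1000,
 * swap.limit = 400, total_swap_pages = 300 and memsw.limit = 1200, the
 * effective limit is min(1000 + min(400, 300), 1200) = 1200 pages.  With
 * swappiness 0, the swap terms are ignored and the limit is simply 1000.
 */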
1197
1198static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1199                                     int order)
1200{
1201        struct oom_control oc = {
1202                .zonelist = NULL,
1203                .nodemask = NULL,
1204                .memcg = memcg,
1205                .gfp_mask = gfp_mask,
1206                .order = order,
1207        };
1208        struct mem_cgroup *iter;
1209        unsigned long chosen_points = 0;
1210        unsigned long totalpages;
1211        unsigned int points = 0;
1212        struct task_struct *chosen = NULL;
1213
1214        mutex_lock(&oom_lock);
1215
1216        /*
1217         * If current has a pending SIGKILL or is exiting, then automatically
1218         * select it.  The goal is to allow it to allocate so that it may
1219         * quickly exit and free its memory.
1220         */
1221        if (task_will_free_mem(current)) {
1222                mark_oom_victim(current);
1223                wake_oom_reaper(current);
1224                goto unlock;
1225        }
1226
1227        check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
1228        totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1229        for_each_mem_cgroup_tree(iter, memcg) {
1230                struct css_task_iter it;
1231                struct task_struct *task;
1232
1233                css_task_iter_start(&iter->css, &it);
1234                while ((task = css_task_iter_next(&it))) {
1235                        switch (oom_scan_process_thread(&oc, task)) {
1236                        case OOM_SCAN_SELECT:
1237                                if (chosen)
1238                                        put_task_struct(chosen);
1239                                chosen = task;
1240                                chosen_points = ULONG_MAX;
1241                                get_task_struct(chosen);
1242                                /* fall through */
1243                        case OOM_SCAN_CONTINUE:
1244                                continue;
1245                        case OOM_SCAN_ABORT:
1246                                css_task_iter_end(&it);
1247                                mem_cgroup_iter_break(memcg, iter);
1248                                if (chosen)
1249                                        put_task_struct(chosen);
1250                                /* Set a dummy value to return "true". */
1251                                chosen = (void *) 1;
1252                                goto unlock;
1253                        case OOM_SCAN_OK:
1254                                break;
1255                        };
1256                        points = oom_badness(task, memcg, NULL, totalpages);
1257                        if (!points || points < chosen_points)
1258                                continue;
1259                        /* Prefer thread group leaders for display purposes */
1260                        if (points == chosen_points &&
1261                            thread_group_leader(chosen))
1262                                continue;
1263
1264                        if (chosen)
1265                                put_task_struct(chosen);
1266                        chosen = task;
1267                        chosen_points = points;
1268                        get_task_struct(chosen);
1269                }
1270                css_task_iter_end(&it);
1271        }
1272
1273        if (chosen) {
1274                points = chosen_points * 1000 / totalpages;
1275                oom_kill_process(&oc, chosen, points, totalpages,
1276                                 "Memory cgroup out of memory");
1277        }
1278unlock:
1279        mutex_unlock(&oom_lock);
1280        return chosen;
1281}
1282
1283#if MAX_NUMNODES > 1
1284
1285/**
1286 * test_mem_cgroup_node_reclaimable
1287 * @memcg: the target memcg
1288 * @nid: the node ID to be checked.
 1289 * @noswap: specify true here if the user wants file-only information.
1290 *
1291 * This function returns whether the specified memcg contains any
1292 * reclaimable pages on a node. Returns true if there are any reclaimable
1293 * pages in the node.
1294 */
1295static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1296                int nid, bool noswap)
1297{
1298        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1299                return true;
1300        if (noswap || !total_swap_pages)
1301                return false;
1302        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1303                return true;
1304        return false;
1305
1306}
1307
1308/*
1309 * Always updating the nodemask is not very good - even if we have an empty
1310 * list or the wrong list here, we can start from some node and traverse all
1311 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1312 *
1313 */
1314static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1315{
1316        int nid;
1317        /*
1318         * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1319         * pagein/pageout changes since the last update.
1320         */
1321        if (!atomic_read(&memcg->numainfo_events))
1322                return;
1323        if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1324                return;
1325
1326        /* make a nodemask where this memcg uses memory from */
1327        memcg->scan_nodes = node_states[N_MEMORY];
1328
1329        for_each_node_mask(nid, node_states[N_MEMORY]) {
1330
1331                if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1332                        node_clear(nid, memcg->scan_nodes);
1333        }
1334
1335        atomic_set(&memcg->numainfo_events, 0);
1336        atomic_set(&memcg->numainfo_updating, 0);
1337}
1338
1339/*
 1340 * Select a node to start reclaim from. Because all we need is to reduce
 1341 * the usage counter, starting from anywhere is OK. Reclaiming from the
 1342 * current node has both pros and cons.
 1343 *
 1344 * Freeing memory from the current node means freeing memory from a node
 1345 * which we are using or have used, so it may hurt the LRU. And if several
 1346 * threads hit their limits, they will contend on that node. But freeing
 1347 * from a remote node costs more for reclaim because of memory latency.
 1348 *
 1349 * For now, we use round-robin. A better algorithm is welcome.
1350 */
1351int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1352{
1353        int node;
1354
1355        mem_cgroup_may_update_nodemask(memcg);
1356        node = memcg->last_scanned_node;
1357
1358        node = next_node_in(node, memcg->scan_nodes);
1359        /*
 1360         * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
1361         * last time it really checked all the LRUs due to rate limiting.
1362         * Fallback to the current node in that case for simplicity.
1363         */
1364        if (unlikely(node == MAX_NUMNODES))
1365                node = numa_node_id();
1366
1367        memcg->last_scanned_node = node;
1368        return node;
1369}
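/*
 * Example (illustrative): with scan_nodes = {0, 2, 3} and
 * last_scanned_node == 0, next_node_in() yields 2; the following calls
 * yield 3 and then wrap back to 0.
 */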
1370#else
1371int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1372{
1373        return 0;
1374}
1375#endif
1376
1377static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1378                                   pg_data_t *pgdat,
1379                                   gfp_t gfp_mask,
1380                                   unsigned long *total_scanned)
1381{
1382        struct mem_cgroup *victim = NULL;
1383        int total = 0;
1384        int loop = 0;
1385        unsigned long excess;
1386        unsigned long nr_scanned;
1387        struct mem_cgroup_reclaim_cookie reclaim = {
1388                .pgdat = pgdat,
1389                .priority = 0,
1390        };
1391
1392        excess = soft_limit_excess(root_memcg);
1393
1394        while (1) {
1395                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1396                if (!victim) {
1397                        loop++;
1398                        if (loop >= 2) {
1399                                /*
1400                                 * If we have not been able to reclaim
 1401                                 * anything, it might be because there are
 1402                                 * no reclaimable pages under this hierarchy.
1403                                 */
1404                                if (!total)
1405                                        break;
1406                                /*
1407                                 * We want to do more targeted reclaim.
 1408                                 * excess >> 2 is not too excessive, so we do
 1409                                 * not reclaim too much, nor too little, so we
 1410                                 * do not keep coming back to this cgroup.
1411                                 */
1412                                if (total >= (excess >> 2) ||
1413                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1414                                        break;
1415                        }
1416                        continue;
1417                }
1418                total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1419                                        pgdat, &nr_scanned);
1420                *total_scanned += nr_scanned;
1421                if (!soft_limit_excess(root_memcg))
1422                        break;
1423        }
1424        mem_cgroup_iter_break(root_memcg, victim);
1425        return total;
1426}
1427
1428#ifdef CONFIG_LOCKDEP
1429static struct lockdep_map memcg_oom_lock_dep_map = {
1430        .name = "memcg_oom_lock",
1431};
1432#endif
1433
1434static DEFINE_SPINLOCK(memcg_oom_lock);
1435
1436/*
 1437 * Check whether the OOM killer is already running under our hierarchy.
 1438 * If someone else is already running it, return false.
1439 */
1440static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1441{
1442        struct mem_cgroup *iter, *failed = NULL;
1443
1444        spin_lock(&memcg_oom_lock);
1445
1446        for_each_mem_cgroup_tree(iter, memcg) {
1447                if (iter->oom_lock) {
1448                        /*
 1449                         * This subtree of our hierarchy is already locked,
 1450                         * so we cannot take the lock.
1451                         */
1452                        failed = iter;
1453                        mem_cgroup_iter_break(memcg, iter);
1454                        break;
1455                } else
1456                        iter->oom_lock = true;
1457        }
1458
1459        if (failed) {
1460                /*
1461                 * OK, we failed to lock the whole subtree so we have
 1462                 * to clean up what we set up, up to the failing subtree.
1463                 */
1464                for_each_mem_cgroup_tree(iter, memcg) {
1465                        if (iter == failed) {
1466                                mem_cgroup_iter_break(memcg, iter);
1467                                break;
1468                        }
1469                        iter->oom_lock = false;
1470                }
1471        } else
1472                mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1473
1474        spin_unlock(&memcg_oom_lock);
1475
1476        return !failed;
1477}
1478
1479static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1480{
1481        struct mem_cgroup *iter;
1482
1483        spin_lock(&memcg_oom_lock);
1484        mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1485        for_each_mem_cgroup_tree(iter, memcg)
1486                iter->oom_lock = false;
1487        spin_unlock(&memcg_oom_lock);
1488}
1489
1490static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1491{
1492        struct mem_cgroup *iter;
1493
1494        spin_lock(&memcg_oom_lock);
1495        for_each_mem_cgroup_tree(iter, memcg)
1496                iter->under_oom++;
1497        spin_unlock(&memcg_oom_lock);
1498}
1499
1500static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1501{
1502        struct mem_cgroup *iter;
1503
1504        /*
1505         * When a new child is created while the hierarchy is under oom,
1506         * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1507         */
1508        spin_lock(&memcg_oom_lock);
1509        for_each_mem_cgroup_tree(iter, memcg)
1510                if (iter->under_oom > 0)
1511                        iter->under_oom--;
1512        spin_unlock(&memcg_oom_lock);
1513}
1514
1515static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1516
1517struct oom_wait_info {
1518        struct mem_cgroup *memcg;
1519        wait_queue_t    wait;
1520};
1521
1522static int memcg_oom_wake_function(wait_queue_t *wait,
1523        unsigned mode, int sync, void *arg)
1524{
1525        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1526        struct mem_cgroup *oom_wait_memcg;
1527        struct oom_wait_info *oom_wait_info;
1528
1529        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1530        oom_wait_memcg = oom_wait_info->memcg;
1531
1532        if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1533            !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1534                return 0;
1535        return autoremove_wake_function(wait, mode, sync, arg);
1536}
1537
1538static void memcg_oom_recover(struct mem_cgroup *memcg)
1539{
1540        /*
1541         * For the following lockless ->under_oom test, the only required
1542         * guarantee is that it must see the state asserted by an OOM when
1543         * this function is called as a result of userland actions
1544         * triggered by the notification of the OOM.  This is trivially
1545         * achieved by invoking mem_cgroup_mark_under_oom() before
1546         * triggering notification.
1547         */
1548        if (memcg && memcg->under_oom)
1549                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1550}
1551
1552static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1553{
1554        if (!current->memcg_may_oom)
1555                return;
1556        /*
1557         * We are in the middle of the charge context here, so we
1558         * don't want to block when potentially sitting on a callstack
1559         * that holds all kinds of filesystem and mm locks.
1560         *
1561         * Also, the caller may handle a failed allocation gracefully
1562         * (like optional page cache readahead) and so an OOM killer
1563         * invocation might not even be necessary.
1564         *
1565         * That's why we don't do anything here except remember the
1566         * OOM context and then deal with it at the end of the page
1567         * fault when the stack is unwound, the locks are released,
1568         * and when we know whether the fault was overall successful.
1569         */
1570        css_get(&memcg->css);
1571        current->memcg_in_oom = memcg;
1572        current->memcg_oom_gfp_mask = mask;
1573        current->memcg_oom_order = order;
1574}
1575
1576/**
1577 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1578 * @handle: actually kill/wait or just clean up the OOM state
1579 *
1580 * This has to be called at the end of a page fault if the memcg OOM
1581 * handler was enabled.
1582 *
1583 * Memcg supports userspace OOM handling where failed allocations must
1584 * sleep on a waitqueue until the userspace task resolves the
1585 * situation.  Sleeping directly in the charge context with all kinds
1586 * of locks held is not a good idea, instead we remember an OOM state
1587 * in the task and mem_cgroup_oom_synchronize() has to be called at
1588 * the end of the page fault to complete the OOM handling.
1589 *
1590 * Returns %true if an ongoing memcg OOM situation was detected and
1591 * completed, %false otherwise.
1592 */
1593bool mem_cgroup_oom_synchronize(bool handle)
1594{
1595        struct mem_cgroup *memcg = current->memcg_in_oom;
1596        struct oom_wait_info owait;
1597        bool locked;
1598
1599        /* OOM is global, do not handle */
1600        if (!memcg)
1601                return false;
1602
1603        if (!handle || oom_killer_disabled)
1604                goto cleanup;
1605
1606        owait.memcg = memcg;
1607        owait.wait.flags = 0;
1608        owait.wait.func = memcg_oom_wake_function;
1609        owait.wait.private = current;
1610        INIT_LIST_HEAD(&owait.wait.task_list);
1611
1612        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1613        mem_cgroup_mark_under_oom(memcg);
1614
1615        locked = mem_cgroup_oom_trylock(memcg);
1616
1617        if (locked)
1618                mem_cgroup_oom_notify(memcg);
1619
1620        if (locked && !memcg->oom_kill_disable) {
1621                mem_cgroup_unmark_under_oom(memcg);
1622                finish_wait(&memcg_oom_waitq, &owait.wait);
1623                mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1624                                         current->memcg_oom_order);
1625        } else {
1626                schedule();
1627                mem_cgroup_unmark_under_oom(memcg);
1628                finish_wait(&memcg_oom_waitq, &owait.wait);
1629        }
1630
1631        if (locked) {
1632                mem_cgroup_oom_unlock(memcg);
1633                /*
1634                 * There is no guarantee that an OOM-lock contender
1635                 * sees the wakeups triggered by the OOM kill
1636                 * uncharges.  Wake any sleepers explicitly.
1637                 */
1638                memcg_oom_recover(memcg);
1639        }
1640cleanup:
1641        current->memcg_in_oom = NULL;
1642        css_put(&memcg->css);
1643        return true;
1644}
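
    /*
     * Sketch of the overall flow (the exact page fault call sites are
     * assumed here for illustration):
     *
     *	try_charge() hits the limit -> mem_cgroup_oom() records the context
     *	page fault completes:
     *		fault failed with VM_FAULT_OOM -> mem_cgroup_oom_synchronize(true)
     *		otherwise                      -> mem_cgroup_oom_synchronize(false)
     */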
1645
1646/**
1647 * lock_page_memcg - lock a page->mem_cgroup binding
1648 * @page: the page
1649 *
1650 * This function protects unlocked LRU pages from being moved to
1651 * another cgroup and stabilizes their page->mem_cgroup binding.
1652 */
1653void lock_page_memcg(struct page *page)
1654{
1655        struct mem_cgroup *memcg;
1656        unsigned long flags;
1657
1658        /*
1659         * The RCU lock is held throughout the transaction.  The fast
1660         * path can get away without acquiring the memcg->move_lock
1661         * because page moving starts with an RCU grace period.
1662         */
1663        rcu_read_lock();
1664
1665        if (mem_cgroup_disabled())
1666                return;
1667again:
1668        memcg = page->mem_cgroup;
1669        if (unlikely(!memcg))
1670                return;
1671
1672        if (atomic_read(&memcg->moving_account) <= 0)
1673                return;
1674
1675        spin_lock_irqsave(&memcg->move_lock, flags);
1676        if (memcg != page->mem_cgroup) {
1677                spin_unlock_irqrestore(&memcg->move_lock, flags);
1678                goto again;
1679        }
1680
1681        /*
1682         * When charge migration first begins, we can have locked and
1683         * unlocked page stat updates happening concurrently.  Track
1684         * the task that holds the lock for unlock_page_memcg().
1685         */
1686        memcg->move_lock_task = current;
1687        memcg->move_lock_flags = flags;
1688
1689        return;
1690}
1691EXPORT_SYMBOL(lock_page_memcg);
1692
1693/**
1694 * unlock_page_memcg - unlock a page->mem_cgroup binding
1695 * @page: the page
1696 */
1697void unlock_page_memcg(struct page *page)
1698{
1699        struct mem_cgroup *memcg = page->mem_cgroup;
1700
1701        if (memcg && memcg->move_lock_task == current) {
1702                unsigned long flags = memcg->move_lock_flags;
1703
1704                memcg->move_lock_task = NULL;
1705                memcg->move_lock_flags = 0;
1706
1707                spin_unlock_irqrestore(&memcg->move_lock, flags);
1708        }
1709
1710        rcu_read_unlock();
1711}
1712EXPORT_SYMBOL(unlock_page_memcg);
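
    /*
     * Typical usage, a minimal sketch (the statistics update in the middle
     * stands in for whatever per-memcg page state the caller maintains):
     *
     *	lock_page_memcg(page);
     *	if (page->mem_cgroup)
     *		... update page->mem_cgroup based counters ...
     *	unlock_page_memcg(page);
     */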
1713
1714/*
1715 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
1716 * TODO: bigger batches may be necessary on big irons.
1717 */
1718#define CHARGE_BATCH    32U
1719struct memcg_stock_pcp {
1720        struct mem_cgroup *cached; /* this is never the root cgroup */
1721        unsigned int nr_pages;
1722        struct work_struct work;
1723        unsigned long flags;
1724#define FLUSHING_CACHED_CHARGE  0
1725};
1726static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1727static DEFINE_MUTEX(percpu_charge_mutex);
1728
1729/**
1730 * consume_stock: Try to consume stocked charge on this cpu.
1731 * @memcg: memcg to consume from.
1732 * @nr_pages: how many pages to charge.
1733 *
1734 * The charge only happens if @memcg matches the current cpu's cached memcg
1735 * and at least @nr_pages are available in that stock.  If the stock cannot
1736 * service the allocation, try_charge() refills it after charging the counters.
1737 *
1738 * Returns %true if successful, %false otherwise.
1739 */
1740static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1741{
1742        struct memcg_stock_pcp *stock;
1743        unsigned long flags;
1744        bool ret = false;
1745
1746        if (nr_pages > CHARGE_BATCH)
1747                return ret;
1748
1749        local_irq_save(flags);
1750
1751        stock = this_cpu_ptr(&memcg_stock);
1752        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1753                stock->nr_pages -= nr_pages;
1754                ret = true;
1755        }
1756
1757        local_irq_restore(flags);
1758
1759        return ret;
1760}
1761
1762/*
1763 * Return the stocked charge to the page counters and reset the cached memcg.
1764 */
1765static void drain_stock(struct memcg_stock_pcp *stock)
1766{
1767        struct mem_cgroup *old = stock->cached;
1768
1769        if (stock->nr_pages) {
1770                page_counter_uncharge(&old->memory, stock->nr_pages);
1771                if (do_memsw_account())
1772                        page_counter_uncharge(&old->memsw, stock->nr_pages);
1773                css_put_many(&old->css, stock->nr_pages);
1774                stock->nr_pages = 0;
1775        }
1776        stock->cached = NULL;
1777}
1778
1779static void drain_local_stock(struct work_struct *dummy)
1780{
1781        struct memcg_stock_pcp *stock;
1782        unsigned long flags;
1783
1784        local_irq_save(flags);
1785
1786        stock = this_cpu_ptr(&memcg_stock);
1787        drain_stock(stock);
1788        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1789
1790        local_irq_restore(flags);
1791}
1792
1793/*
1794 * Cache @nr_pages worth of charge in the local per-cpu area.
1795 * It will be consumed by consume_stock() later.
1796 */
1797static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1798{
1799        struct memcg_stock_pcp *stock;
1800        unsigned long flags;
1801
1802        local_irq_save(flags);
1803
1804        stock = this_cpu_ptr(&memcg_stock);
1805        if (stock->cached != memcg) { /* reset if necessary */
1806                drain_stock(stock);
1807                stock->cached = memcg;
1808        }
1809        stock->nr_pages += nr_pages;
1810
1811        local_irq_restore(flags);
1812}
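
    /*
     * How the stock is used by the charge path (see try_charge() below;
     * a descriptive sketch, not additional code):
     *
     *	if (consume_stock(memcg, nr_pages))	-> fast path, no counters touched
     *		return 0;
     *	page_counter_try_charge(..., batch)	-> charge a whole batch instead
     *	refill_stock(memcg, batch - nr_pages)	-> stash the surplus for next time
     */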
1813
1814/*
1815 * Drain all per-CPU charge caches for the given root_memcg, i.e. for the
1816 * whole subtree of the hierarchy under it.
1817 */
1818static void drain_all_stock(struct mem_cgroup *root_memcg)
1819{
1820        int cpu, curcpu;
1821
1822        /* If someone's already draining, avoid scheduling more workers. */
1823        if (!mutex_trylock(&percpu_charge_mutex))
1824                return;
1825        /* Notify other cpus that system-wide "drain" is running */
1826        get_online_cpus();
1827        curcpu = get_cpu();
1828        for_each_online_cpu(cpu) {
1829                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1830                struct mem_cgroup *memcg;
1831
1832                memcg = stock->cached;
1833                if (!memcg || !stock->nr_pages)
1834                        continue;
1835                if (!mem_cgroup_is_descendant(memcg, root_memcg))
1836                        continue;
1837                if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1838                        if (cpu == curcpu)
1839                                drain_local_stock(&stock->work);
1840                        else
1841                                schedule_work_on(cpu, &stock->work);
1842                }
1843        }
1844        put_cpu();
1845        put_online_cpus();
1846        mutex_unlock(&percpu_charge_mutex);
1847}
1848
1849static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
1850                                        unsigned long action,
1851                                        void *hcpu)
1852{
1853        int cpu = (unsigned long)hcpu;
1854        struct memcg_stock_pcp *stock;
1855
1856        if (action == CPU_ONLINE)
1857                return NOTIFY_OK;
1858
1859        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1860                return NOTIFY_OK;
1861
1862        stock = &per_cpu(memcg_stock, cpu);
1863        drain_stock(stock);
1864        return NOTIFY_OK;
1865}
1866
1867static void reclaim_high(struct mem_cgroup *memcg,
1868                         unsigned int nr_pages,
1869                         gfp_t gfp_mask)
1870{
1871        do {
1872                if (page_counter_read(&memcg->memory) <= memcg->high)
1873                        continue;
1874                mem_cgroup_events(memcg, MEMCG_HIGH, 1);
1875                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
1876        } while ((memcg = parent_mem_cgroup(memcg)));
1877}
1878
1879static void high_work_func(struct work_struct *work)
1880{
1881        struct mem_cgroup *memcg;
1882
1883        memcg = container_of(work, struct mem_cgroup, high_work);
1884        reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
1885}
1886
1887/*
1888 * Scheduled by try_charge() to be executed from the userland return path,
1889 * where it reclaims the memory charged over the high limit.
1890 */
1891void mem_cgroup_handle_over_high(void)
1892{
1893        unsigned int nr_pages = current->memcg_nr_pages_over_high;
1894        struct mem_cgroup *memcg;
1895
1896        if (likely(!nr_pages))
1897                return;
1898
1899        memcg = get_mem_cgroup_from_mm(current->mm);
1900        reclaim_high(memcg, nr_pages, GFP_KERNEL);
1901        css_put(&memcg->css);
1902        current->memcg_nr_pages_over_high = 0;
1903}
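
    /*
     * Sketch of the deferral mechanism (the resume hook is assumed to be
     * tracehook_notify_resume(), hence the include above):
     *
     *	try_charge():
     *		current->memcg_nr_pages_over_high += batch;
     *		set_notify_resume(current);
     *	return to userland:
     *		tracehook_notify_resume() -> mem_cgroup_handle_over_high()
     */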
1904
1905static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
1906                      unsigned int nr_pages)
1907{
1908        unsigned int batch = max(CHARGE_BATCH, nr_pages);
1909        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1910        struct mem_cgroup *mem_over_limit;
1911        struct page_counter *counter;
1912        unsigned long nr_reclaimed;
1913        bool may_swap = true;
1914        bool drained = false;
1915
1916        if (mem_cgroup_is_root(memcg))
1917                return 0;
1918retry:
1919        if (consume_stock(memcg, nr_pages))
1920                return 0;
1921
1922        if (!do_memsw_account() ||
1923            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
1924                if (page_counter_try_charge(&memcg->memory, batch, &counter))
1925                        goto done_restock;
1926                if (do_memsw_account())
1927                        page_counter_uncharge(&memcg->memsw, batch);
1928                mem_over_limit = mem_cgroup_from_counter(counter, memory);
1929        } else {
1930                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
1931                may_swap = false;
1932        }
1933
1934        if (batch > nr_pages) {
1935                batch = nr_pages;
1936                goto retry;
1937        }
1938
1939        /*
1940         * Unlike in global OOM situations, memcg is not in a physical
1941         * memory shortage.  Allow dying and OOM-killed tasks to
1942         * bypass the last charges so that they can exit quickly and
1943         * free their memory.
1944         */
1945        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
1946                     fatal_signal_pending(current) ||
1947                     current->flags & PF_EXITING))
1948                goto force;
1949
1950        if (unlikely(task_in_memcg_oom(current)))
1951                goto nomem;
1952
1953        if (!gfpflags_allow_blocking(gfp_mask))
1954                goto nomem;
1955
1956        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
1957
1958        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
1959                                                    gfp_mask, may_swap);
1960
1961        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1962                goto retry;
1963
1964        if (!drained) {
1965                drain_all_stock(mem_over_limit);
1966                drained = true;
1967                goto retry;
1968        }
1969
1970        if (gfp_mask & __GFP_NORETRY)
1971                goto nomem;
1972        /*
1973         * Even though the limit is exceeded at this point, reclaim
1974         * may have been able to free some pages.  Retry the charge
1975         * before killing the task.
1976         *
1977         * Only for regular pages, though: huge pages are rather
1978         * unlikely to succeed so close to the limit, and we fall back
1979         * to regular pages anyway in case of failure.
1980         */
1981        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
1982                goto retry;
1983        /*
1984         * During task move, charges can be temporarily double-counted, so it
1985         * is better to wait until the move finishes if one is in progress.
1986         */
1987        if (mem_cgroup_wait_acct_move(mem_over_limit))
1988                goto retry;
1989
1990        if (nr_retries--)
1991                goto retry;
1992
1993        if (gfp_mask & __GFP_NOFAIL)
1994                goto force;
1995
1996        if (fatal_signal_pending(current))
1997                goto force;
1998
1999        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2000
2001        mem_cgroup_oom(mem_over_limit, gfp_mask,
2002                       get_order(nr_pages * PAGE_SIZE));
2003nomem:
2004        if (!(gfp_mask & __GFP_NOFAIL))
2005                return -ENOMEM;
2006force:
2007        /*
2008         * The allocation either can't fail or will lead to more memory
2009         * being freed very soon.  Allow memory usage to go over the limit
2010         * temporarily by force charging it.
2011         */
2012        page_counter_charge(&memcg->memory, nr_pages);
2013        if (do_memsw_account())
2014                page_counter_charge(&memcg->memsw, nr_pages);
2015        css_get_many(&memcg->css, nr_pages);
2016
2017        return 0;
2018
2019done_restock:
2020        css_get_many(&memcg->css, batch);
2021        if (batch > nr_pages)
2022                refill_stock(memcg, batch - nr_pages);
2023
2024        /*
2025         * If the hierarchy is above the normal consumption range, schedule
2026         * reclaim on returning to userland.  We can perform reclaim here
2027         * if __GFP_RECLAIM but let's always punt for simplicity and so that
2028         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2029         * not recorded as it most likely matches current's and won't
2030         * change in the meantime.  As high limit is checked again before
2031         * reclaim, the cost of mismatch is negligible.
2032         */
2033        do {
2034                if (page_counter_read(&memcg->memory) > memcg->high) {
2035                        /* Don't bother a random interrupted task */
2036                        if (in_interrupt()) {
2037                                schedule_work(&memcg->high_work);
2038                                break;
2039                        }
2040                        current->memcg_nr_pages_over_high += batch;
2041                        set_notify_resume(current);
2042                        break;
2043                }
2044        } while ((memcg = parent_mem_cgroup(memcg)));
2045
2046        return 0;
2047}
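
    /*
     * Descriptive summary of try_charge() above: a stock hit or a successful
     * page_counter charge returns 0; otherwise the function tries direct
     * reclaim, draining the per-cpu stocks and waiting for task moves, with
     * a bounded number of retries.  Dying tasks and __GFP_NOFAIL requests
     * are force-charged over the limit; everything else sets up the memcg
     * OOM context and fails with -ENOMEM.
     */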
2048
2049static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2050{
2051        if (mem_cgroup_is_root(memcg))
2052                return;
2053
2054        page_counter_uncharge(&memcg->memory, nr_pages);
2055        if (do_memsw_account())
2056                page_counter_uncharge(&memcg->memsw, nr_pages);
2057
2058        css_put_many(&memcg->css, nr_pages);
2059}
2060
2061static void lock_page_lru(struct page *page, int *isolated)
2062{
2063        struct zone *zone = page_zone(page);
2064
2065        spin_lock_irq(zone_lru_lock(zone));
2066        if (PageLRU(page)) {
2067                struct lruvec *lruvec;
2068
2069                lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2070                ClearPageLRU(page);
2071                del_page_from_lru_list(page, lruvec, page_lru(page));
2072                *isolated = 1;
2073        } else
2074                *isolated = 0;
2075}
2076
2077static void unlock_page_lru(struct page *page, int isolated)
2078{
2079        struct zone *zone = page_zone(page);
2080
2081        if (isolated) {
2082                struct lruvec *lruvec;
2083
2084                lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2085                VM_BUG_ON_PAGE(PageLRU(page), page);
2086                SetPageLRU(page);
2087                add_page_to_lru_list(page, lruvec, page_lru(page));
2088        }
2089        spin_unlock_irq(zone_lru_lock(zone));
2090}
2091
2092static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2093                          bool lrucare)
2094{
2095        int isolated;
2096
2097        VM_BUG_ON_PAGE(page->mem_cgroup, page);
2098
2099        /*
2100         * In some cases, e.g. SwapCache and FUSE (splice_buf->radixtree), the
2101         * page may already be on some other mem_cgroup's LRU.  Take care of that.
2102         */
2103        if (lrucare)
2104                lock_page_lru(page, &isolated);
2105
2106        /*
2107         * Nobody should be changing or seriously looking at
2108         * page->mem_cgroup at this point:
2109         *
2110         * - the page is uncharged
2111         *
2112         * - the page is off-LRU
2113         *
2114         * - an anonymous fault has exclusive page access, except for
2115         *   a locked page table
2116         *
2117         * - a page cache insertion, a swapin fault, or a migration
2118         *   have the page locked
2119         */
2120        page->mem_cgroup = memcg;
2121
2122        if (lrucare)
2123                unlock_page_lru(page, isolated);
2124}
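
    /*
     * commit_charge() is the second half of the charge protocol.  A minimal
     * sketch of the usual sequence (the public wrappers appear later in
     * this file):
     *
     *	ret = try_charge(memcg, gfp_mask, nr_pages);
     *	if (ret)
     *		return ret;
     *	... instantiate the page ...
     *	commit_charge(page, memcg, lrucare);	(or cancel_charge() on abort)
     */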
2125
2126#ifndef CONFIG_SLOB
2127static int memcg_alloc_cache_id(void)
2128{
2129        int id, size;
2130        int err;
2131
2132        id = ida_simple_get(&memcg_cache_ida,
2133                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2134        if (id < 0)
2135                return id;
2136
2137        if (id < memcg_nr_cache_ids)
2138                return id;
2139
2140        /*
2141         * There's no space for the new id in memcg_caches arrays,
2142         * so we have to grow them.
2143         */
2144        down_write(&memcg_cache_ids_sem);
2145
2146        size = 2 * (id + 1);
2147        if (size < MEMCG_CACHES_MIN_SIZE)
2148                size = MEMCG_CACHES_MIN_SIZE;
2149        else if (size > MEMCG_CACHES_MAX_SIZE)
2150                size = MEMCG_CACHES_MAX_SIZE;
2151
2152        err = memcg_update_all_caches(size);
2153        if (!err)
2154                err = memcg_update_all_list_lrus(size);
2155        if (!err)
2156                memcg_nr_cache_ids = size;
2157
2158        up_write(&memcg_cache_ids_sem);
2159
2160        if (err) {
2161                ida_simple_remove(&memcg_cache_ida, id);
2162                return err;
2163        }
2164        return id;
2165}
2166
2167static void memcg_free_cache_id(int id)
2168{
2169        ida_simple_remove(&memcg_cache_ida, id);
2170}
2171
2172struct memcg_kmem_cache_create_work {
2173        struct mem_cgroup *memcg;
2174        struct kmem_cache *cachep;
2175        struct work_struct work;
2176};
2177
2178static void memcg_kmem_cache_create_func(struct work_struct *w)
2179{
2180        struct memcg_kmem_cache_create_work *cw =
2181                container_of(w, struct memcg_kmem_cache_create_work, work);
2182        struct mem_cgroup *memcg = cw->memcg;
2183        struct kmem_cache *cachep = cw->cachep;
2184
2185        memcg_create_kmem_cache(memcg, cachep);
2186
2187        css_put(&memcg->css);
2188        kfree(cw);
2189}
2190
2191/*
2192 * Enqueue the creation of a per-memcg kmem_cache.
2193 */
2194static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2195                                               struct kmem_cache *cachep)
2196{
2197        struct memcg_kmem_cache_create_work *cw;
2198
2199        cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2200        if (!cw)
2201                return;
2202
2203        css_get(&memcg->css);
2204
2205        cw->memcg = memcg;
2206        cw->cachep = cachep;
2207        INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2208
2209        schedule_work(&cw->work);
2210}
2211
2212static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2213                                             struct kmem_cache *cachep)
2214{
2215        /*
2216         * We need to stop accounting when we kmalloc, because if the
2217         * corresponding kmalloc cache is not yet created, the first allocation
2218         * in __memcg_schedule_kmem_cache_create will recurse.
2219         *
2220         * However, it is better to enclose the whole function. Depending on
2221         * the debugging options enabled, INIT_WORK(), for instance, can
2222         * trigger an allocation. This too, will make us recurse. Because at
2223         * this point we can't allow ourselves back into memcg_kmem_get_cache,
2224         * the safest choice is to do it like this, wrapping the whole function.
2225         */
2226        current->memcg_kmem_skip_account = 1;
2227        __memcg_schedule_kmem_cache_create(memcg, cachep);
2228        current->memcg_kmem_skip_account = 0;
2229}
2230
2231static inline bool memcg_kmem_bypass(void)
2232{
2233        if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2234                return true;
2235        return false;
2236}
2237
2238/**
2239 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2240 * @cachep: the original global kmem cache
2241 *
2242 * Return the kmem_cache we're supposed to use for a slab allocation.
2243 * We try to use the current memcg's version of the cache.
2244 *
2245 * If the cache does not exist yet, if we are the first user of it, we
2246 * create it asynchronously in a workqueue and let the current allocation
2247 * go through with the original cache.
2248 *
2249 * This function takes a reference to the cache it returns to assure it
2250 * won't get destroyed while we are working with it. Once the caller is
2251 * done with it, memcg_kmem_put_cache() must be called to release the
2252 * reference.
2253 */
2254struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2255{
2256        struct mem_cgroup *memcg;
2257        struct kmem_cache *memcg_cachep;
2258        int kmemcg_id;
2259
2260        VM_BUG_ON(!is_root_cache(cachep));
2261
2262        if (memcg_kmem_bypass())
2263                return cachep;
2264
2265        if (current->memcg_kmem_skip_account)
2266                return cachep;
2267
2268        memcg = get_mem_cgroup_from_mm(current->mm);
2269        kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2270        if (kmemcg_id < 0)
2271                goto out;
2272
2273        memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2274        if (likely(memcg_cachep))
2275                return memcg_cachep;
2276
2277        /*
2278         * If we are in a safe context (can wait, and not in interrupt
2279         * context), we could be predictable and return right away.
2280         * This would guarantee that the allocation being performed
2281         * already belongs in the new cache.
2282         *
2283         * However, there are some clashes that can arise from locking.
2284         * For instance, because we acquire the slab_mutex while doing
2285         * memcg_create_kmem_cache, this means no further allocation
2286         * could happen with the slab_mutex held. So it's better to
2287         * defer everything.
2288         */
2289        memcg_schedule_kmem_cache_create(memcg, cachep);
2290out:
2291        css_put(&memcg->css);
2292        return cachep;
2293}
2294
2295/**
2296 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2297 * @cachep: the cache returned by memcg_kmem_get_cache
2298 */
2299void memcg_kmem_put_cache(struct kmem_cache *cachep)
2300{
2301        if (!is_root_cache(cachep))
2302                css_put(&cachep->memcg_params.memcg->css);
2303}
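
    /*
     * Intended pairing, as used from the slab allocation hooks (the exact
     * hook names vary by allocator and are assumed here):
     *
     *	s = memcg_kmem_get_cache(cachep);
     *	objp = ...allocate from s...;
     *	memcg_kmem_put_cache(s);
     */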
2304
2305/**
2306 * memcg_kmem_charge_memcg: charge a kmem page to the given memory cgroup
2307 * @page: page to charge
2308 * @gfp: reclaim mode
2309 * @order: allocation order
2310 * @memcg: memory cgroup to charge
2311 *
2312 * Returns 0 on success, an error code on failure.
2313 */
2314int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2315                            struct mem_cgroup *memcg)
2316{
2317        unsigned int nr_pages = 1 << order;
2318        struct page_counter *counter;
2319        int ret;
2320
2321        ret = try_charge(memcg, gfp, nr_pages);
2322        if (ret)
2323                return ret;
2324
2325        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2326            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2327                cancel_charge(memcg, nr_pages);
2328                return -ENOMEM;
2329        }
2330
2331        page->mem_cgroup = memcg;
2332
2333        return 0;
2334}
2335
2336/**
2337 * memcg_kmem_charge: charge a kmem page to the current memory cgroup
2338 * @page: page to charge
2339 * @gfp: reclaim mode
2340 * @order: allocation order
2341 *
2342 * Returns 0 on success, an error code on failure.
2343 */
2344int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2345{
2346        struct mem_cgroup *memcg;
2347        int ret = 0;
2348
2349        if (memcg_kmem_bypass())
2350                return 0;
2351
2352        memcg = get_mem_cgroup_from_mm(current->mm);
2353        if (!mem_cgroup_is_root(memcg)) {
2354                ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2355                if (!ret)
2356                        __SetPageKmemcg(page);
2357        }
2358        css_put(&memcg->css);
2359        return ret;
2360}
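
    /*
     * Rough sketch of the page allocator's use of the above (the exact spot
     * in the allocation slow path is assumed here):
     *
     *	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
     *	    memcg_kmem_charge(page, gfp_mask, order)) {
     *		__free_pages(page, order);
     *		page = NULL;
     *	}
     */
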
2361/**
2362 * memcg_kmem_uncharge: uncharge a kmem page
2363 * @page: page to uncharge
2364 * @order: allocation order
2365 */
2366void memcg_kmem_uncharge(struct page *page, int order)
2367{
2368        struct mem_cgroup *memcg = page->mem_cgroup;
2369        unsigned int nr_pages = 1 << order;
2370
2371        if (!memcg)
2372                return;
2373
2374        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2375
2376        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2377                page_counter_uncharge(&memcg->kmem, nr_pages);
2378
2379        page_counter_uncharge(&memcg->memory, nr_pages);
2380        if (do_memsw_account())
2381                page_counter_uncharge(&memcg->memsw, nr_pages);
2382
2383        page->mem_cgroup = NULL;
2384
2385        /* slab pages do not have PageKmemcg flag set */
2386        if (PageKmemcg(page))
2387                __ClearPageKmemcg(page);
2388
2389        css_put_many(&memcg->css, nr_pages);
2390}
2391#endif /* !CONFIG_SLOB */
2392
2393#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2394
2395/*
2396 * Tail pages are not charged individually, so point them at the head's memcg.
2397 * We're under zone_lru_lock and migration entries are set up in all page mappings.
2398 */
2399void mem_cgroup_split_huge_fixup(struct page *head)
2400{
2401        int i;
2402
2403        if (mem_cgroup_disabled())
2404                return;
2405
2406        for (i = 1; i < HPAGE_PMD_NR; i++)
2407                head[i].mem_cgroup = head->mem_cgroup;
2408
2409        __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
2410                       HPAGE_PMD_NR);
2411}
2412#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2413
2414#ifdef CONFIG_MEMCG_SWAP
2415static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2416                                         bool charge)
2417{
2418        int val = (charge) ? 1 : -1;
2419        this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
2420}
2421
2422/**
2423 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2424 * @entry: swap entry to be moved
2425 * @from:  mem_cgroup which the entry is moved from
2426 * @to:  mem_cgroup which the entry is moved to
2427 *
2428 * It succeeds only when the swap_cgroup's record for this entry is the same
2429 * as the mem_cgroup's id of @from.
2430 *
2431 * Returns 0 on success, -EINVAL on failure.
2432 *
2433 * The caller must already have charged @to, i.e. called page_counter_charge()
2434 * for both memory and memsw, and taken a css reference with css_get().
2435 */
2436static int mem_cgroup_move_swap_account(swp_entry_t entry,
2437                                struct mem_cgroup *from, struct mem_cgroup *to)
2438{
2439        unsigned short old_id, new_id;
2440
2441        old_id = mem_cgroup_id(from);
2442        new_id = mem_cgroup_id(to);
2443
2444        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2445                mem_cgroup_swap_statistics(from, false);
2446                mem_cgroup_swap_statistics(to, true);
2447                return 0;
2448        }
2449        return -EINVAL;
2450}
2451#else
2452static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2453                                struct mem_cgroup *from, struct mem_cgroup *to)
2454{
2455        return -EINVAL;
2456}
2457#endif
2458
2459static DEFINE_MUTEX(memcg_limit_mutex);
2460
2461static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2462                                   unsigned long limit)
2463{
2464        unsigned long curusage;
2465        unsigned long oldusage;
2466        bool enlarge = false;
2467        int retry_count;
2468        int ret;
2469
2470        /*
2471         * To keep hierarchical reclaim simple, how long we should retry
2472         * depends on the caller. Set the retry count as a function of the
2473         * number of children we may visit in this loop.
2474         */
2475        retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2476                      mem_cgroup_count_children(memcg);
2477
2478        oldusage = page_counter_read(&memcg->memory);
2479
2480        do {
2481                if (signal_pending(current)) {
2482                        ret = -EINTR;
2483                        break;
2484                }
2485
2486                mutex_lock(&memcg_limit_mutex);
2487                if (limit > memcg->memsw.limit) {
2488                        mutex_unlock(&memcg_limit_mutex);
2489                        ret = -EINVAL;
2490                        break;
2491                }
2492                if (limit > memcg->memory.limit)
2493                        enlarge = true;
2494                ret = page_counter_limit(&memcg->memory, limit);
2495                mutex_unlock(&memcg_limit_mutex);
2496
2497                if (!ret)
2498                        break;
2499
2500                try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
2501
2502                curusage = page_counter_read(&memcg->memory);
2503                /* Was usage reduced? */
2504                if (curusage >= oldusage)
2505                        retry_count--;
2506                else
2507                        oldusage = curusage;
2508        } while (retry_count);
2509
2510        if (!ret && enlarge)
2511                memcg_oom_recover(memcg);
2512
2513        return ret;
2514}
2515
2516static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2517                                         unsigned long limit)
2518{
2519        unsigned long curusage;
2520        unsigned long oldusage;
2521        bool enlarge = false;
2522        int retry_count;
2523        int ret;
2524
2525        /* see mem_cgroup_resize_limit() */
2526        retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2527                      mem_cgroup_count_children(memcg);
2528
2529        oldusage = page_counter_read(&memcg->memsw);
2530
2531        do {
2532                if (signal_pending(current)) {
2533                        ret = -EINTR;
2534                        break;
2535                }
2536
2537                mutex_lock(&memcg_limit_mutex);
2538                if (limit < memcg->memory.limit) {
2539                        mutex_unlock(&memcg_limit_mutex);
2540                        ret = -EINVAL;
2541                        break;
2542                }
2543                if (limit > memcg->memsw.limit)
2544                        enlarge = true;
2545                ret = page_counter_limit(&memcg->memsw, limit);
2546                mutex_unlock(&memcg_limit_mutex);
2547
2548                if (!ret)
2549                        break;
2550
2551                try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
2552
2553                curusage = page_counter_read(&memcg->memsw);
2554                /* Was usage reduced? */
2555                if (curusage >= oldusage)
2556                        retry_count--;
2557                else
2558                        oldusage = curusage;
2559        } while (retry_count);
2560
2561        if (!ret && enlarge)
2562                memcg_oom_recover(memcg);
2563
2564        return ret;
2565}
2566
2567unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2568                                            gfp_t gfp_mask,
2569                                            unsigned long *total_scanned)
2570{
2571        unsigned long nr_reclaimed = 0;
2572        struct mem_cgroup_per_node *mz, *next_mz = NULL;
2573        unsigned long reclaimed;
2574        int loop = 0;
2575        struct mem_cgroup_tree_per_node *mctz;
2576        unsigned long excess;
2577        unsigned long nr_scanned;
2578
2579        if (order > 0)
2580                return 0;
2581
2582        mctz = soft_limit_tree_node(pgdat->node_id);
2583
2584        /*
2585         * Do not even bother to check the largest node if the root
2586         * is empty. Do it lockless to prevent lock bouncing. Races
2587         * are acceptable as soft limit is best effort anyway.
2588         */
2589        if (RB_EMPTY_ROOT(&mctz->rb_root))
2590                return 0;
2591
2592        /*
2593         * This loop can run for a while, especially if mem_cgroups continuously
2594         * keep exceeding their soft limit and putting the system under
2595         * pressure.
2596         */
2597        do {
2598                if (next_mz)
2599                        mz = next_mz;
2600                else
2601                        mz = mem_cgroup_largest_soft_limit_node(mctz);
2602                if (!mz)
2603                        break;
2604
2605                nr_scanned = 0;
2606                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2607                                                    gfp_mask, &nr_scanned);
2608                nr_reclaimed += reclaimed;
2609                *total_scanned += nr_scanned;
2610                spin_lock_irq(&mctz->lock);
2611                __mem_cgroup_remove_exceeded(mz, mctz);
2612
2613                /*
2614                 * If we failed to reclaim anything from this memory cgroup
2615                 * it is time to move on to the next cgroup
2616                 */
2617                next_mz = NULL;
2618                if (!reclaimed)
2619                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2620
2621                excess = soft_limit_excess(mz->memcg);
2622                /*
2623                 * One school of thought says that we should not add
2624                 * back the node to the tree if reclaim returns 0.
2625                 * But our reclaim could return 0 simply because, due
2626                 * to priority, we are exposing a smaller subset of
2627                 * memory to reclaim from. Consider this as a longer
2628                 * term TODO.
2629                 */
2630                /* If excess == 0, no tree ops */
2631                __mem_cgroup_insert_exceeded(mz, mctz, excess);
2632                spin_unlock_irq(&mctz->lock);
2633                css_put(&mz->memcg->css);
2634                loop++;
2635                /*
2636                 * Could not reclaim anything and there are no more
2637                 * mem cgroups to try or we seem to be looping without
2638                 * reclaiming anything.
2639                 */
2640                if (!nr_reclaimed &&
2641                        (next_mz == NULL ||
2642                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2643                        break;
2644        } while (!nr_reclaimed);
2645        if (next_mz)
2646                css_put(&next_mz->memcg->css);
2647        return nr_reclaimed;
2648}
2649
2650/*
2651 * Test whether @memcg has children, dead or alive.  Note that this
2652 * function doesn't care whether @memcg has use_hierarchy enabled and
2653 * returns %true if there are child csses according to the cgroup
2654 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
2655 */
2656static inline bool memcg_has_children(struct mem_cgroup *memcg)
2657{
2658        bool ret;
2659
2660        rcu_read_lock();
2661        ret = css_next_child(NULL, &memcg->css);
2662        rcu_read_unlock();
2663        return ret;
2664}
2665
2666/*
2667 * Reclaims as many pages from the given memcg as possible.
2668 *
2669 * Caller is responsible for holding css reference for memcg.
2670 */
2671static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2672{
2673        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2674
2675        /* we call try_to_free_mem_cgroup_pages() to make this cgroup empty */
2676        lru_add_drain_all();
2677        /* try to free all pages in this cgroup */
2678        while (nr_retries && page_counter_read(&memcg->memory)) {
2679                int progress;
2680
2681                if (signal_pending(current))
2682                        return -EINTR;
2683
2684                progress = try_to_free_mem_cgroup_pages(memcg, 1,
2685                                                        GFP_KERNEL, true);
2686                if (!progress) {
2687                        nr_retries--;
2688                        /* maybe some writeback is necessary */
2689                        congestion_wait(BLK_RW_ASYNC, HZ/10);
2690                }
2691
2692        }
2693
2694        return 0;
2695}
2696
2697static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2698                                            char *buf, size_t nbytes,
2699                                            loff_t off)
2700{
2701        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2702
2703        if (mem_cgroup_is_root(memcg))
2704                return -EINVAL;
2705        return mem_cgroup_force_empty(memcg) ?: nbytes;
2706}
2707
2708static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2709                                     struct cftype *cft)
2710{
2711        return mem_cgroup_from_css(css)->use_hierarchy;
2712}
2713
2714static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2715                                      struct cftype *cft, u64 val)
2716{
2717        int retval = 0;
2718        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2719        struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2720
2721        if (memcg->use_hierarchy == val)
2722                return 0;
2723
2724        /*
2725         * If parent's use_hierarchy is set, we can't make any modifications
2726         * in the child subtrees. If it is unset, then the change can
2727         * occur, provided the current cgroup has no children.
2728         *
2729         * For the root cgroup, parent_memcg is NULL, so we allow the value
2730         * to be set if there are no children.
2731         */
2732        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2733                                (val == 1 || val == 0)) {
2734                if (!memcg_has_children(memcg))
2735                        memcg->use_hierarchy = val;
2736                else
2737                        retval = -EBUSY;
2738        } else
2739                retval = -EINVAL;
2740
2741        return retval;
2742}
2743
2744static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
2745{
2746        struct mem_cgroup *iter;
2747        int i;
2748
2749        memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
2750
2751        for_each_mem_cgroup_tree(iter, memcg) {
2752                for (i = 0; i < MEMCG_NR_STAT; i++)
2753                        stat[i] += mem_cgroup_read_stat(iter, i);
2754        }
2755}
2756
2757static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
2758{
2759        struct mem_cgroup *iter;
2760        int i;
2761
2762        memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
2763
2764        for_each_mem_cgroup_tree(iter, memcg) {
2765                for (i = 0; i < MEMCG_NR_EVENTS; i++)
2766                        events[i] += mem_cgroup_read_events(iter, i);
2767        }
2768}
2769
2770static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2771{
2772        unsigned long val = 0;
2773
2774        if (mem_cgroup_is_root(memcg)) {
2775                struct mem_cgroup *iter;
2776
2777                for_each_mem_cgroup_tree(iter, memcg) {
2778                        val += mem_cgroup_read_stat(iter,
2779                                        MEM_CGROUP_STAT_CACHE);
2780                        val += mem_cgroup_read_stat(iter,
2781                                        MEM_CGROUP_STAT_RSS);
2782                        if (swap)
2783                                val += mem_cgroup_read_stat(iter,
2784                                                MEM_CGROUP_STAT_SWAP);
2785                }
2786        } else {
2787                if (!swap)
2788                        val = page_counter_read(&memcg->memory);
2789                else
2790                        val = page_counter_read(&memcg->memsw);
2791        }
2792        return val;
2793}
2794
2795enum {
2796        RES_USAGE,
2797        RES_LIMIT,
2798        RES_MAX_USAGE,
2799        RES_FAILCNT,
2800        RES_SOFT_LIMIT,
2801};
2802
2803static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2804                               struct cftype *cft)
2805{
2806        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2807        struct page_counter *counter;
2808
2809        switch (MEMFILE_TYPE(cft->private)) {
2810        case _MEM:
2811                counter = &memcg->memory;
2812                break;
2813        case _MEMSWAP:
2814                counter = &memcg->memsw;
2815                break;
2816        case _KMEM:
2817                counter = &memcg->kmem;
2818                break;
2819        case _TCP:
2820                counter = &memcg->tcpmem;
2821                break;
2822        default:
2823                BUG();
2824        }
2825
2826        switch (MEMFILE_ATTR(cft->private)) {
2827        case RES_USAGE:
2828                if (counter == &memcg->memory)
2829                        return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2830                if (counter == &memcg->memsw)
2831                        return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2832                return (u64)page_counter_read(counter) * PAGE_SIZE;
2833        case RES_LIMIT:
2834                return (u64)counter->limit * PAGE_SIZE;
2835        case RES_MAX_USAGE:
2836                return (u64)counter->watermark * PAGE_SIZE;
2837        case RES_FAILCNT:
2838                return counter->failcnt;
2839        case RES_SOFT_LIMIT:
2840                return (u64)memcg->soft_limit * PAGE_SIZE;
2841        default:
2842                BUG();
2843        }
2844}
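
    /*
     * Illustrative decoding of cft->private (the encodings are set up in the
     * cftype tables later in this file): memory.usage_in_bytes carries
     * MEMFILE_PRIVATE(_MEM, RES_USAGE), memory.kmem.limit_in_bytes carries
     * MEMFILE_PRIVATE(_KMEM, RES_LIMIT), and so on; the two switches above
     * simply decode that pair.
     */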
2845
2846#ifndef CONFIG_SLOB
2847static int memcg_online_kmem(struct mem_cgroup *memcg)
2848{
2849        int memcg_id;
2850
2851        if (cgroup_memory_nokmem)
2852                return 0;
2853
2854        BUG_ON(memcg->kmemcg_id >= 0);
2855        BUG_ON(memcg->kmem_state);
2856
2857        memcg_id = memcg_alloc_cache_id();
2858        if (memcg_id < 0)
2859                return memcg_id;
2860
2861        static_branch_inc(&memcg_kmem_enabled_key);
2862        /*
2863         * A memory cgroup is considered kmem-online as soon as it gets
2864         * kmemcg_id. Setting the id after enabling static branching will
2865         * guarantee no one starts accounting before all call sites are
2866         * patched.
2867         */
2868        memcg->kmemcg_id = memcg_id;
2869        memcg->kmem_state = KMEM_ONLINE;
2870
2871        return 0;
2872}
2873
2874static void memcg_offline_kmem(struct mem_cgroup *memcg)
2875{
2876        struct cgroup_subsys_state *css;
2877        struct mem_cgroup *parent, *child;
2878        int kmemcg_id;
2879
2880        if (memcg->kmem_state != KMEM_ONLINE)
2881                return;
2882        /*
2883         * Clear the online state before clearing memcg_caches array
2884         * entries. The slab_mutex in memcg_deactivate_kmem_caches()
2885         * guarantees that no cache will be created for this cgroup
2886         * after we are done (see memcg_create_kmem_cache()).
2887         */
2888        memcg->kmem_state = KMEM_ALLOCATED;
2889
2890        memcg_deactivate_kmem_caches(memcg);
2891
2892        kmemcg_id = memcg->kmemcg_id;
2893        BUG_ON(kmemcg_id < 0);
2894
2895        parent = parent_mem_cgroup(memcg);
2896        if (!parent)
2897                parent = root_mem_cgroup;
2898
2899        /*
2900         * Change kmemcg_id of this cgroup and all its descendants to the
2901         * parent's id, and then move all entries from this cgroup's list_lrus
2902         * to ones of the parent. After we have finished, all list_lrus
2903         * corresponding to this cgroup are guaranteed to remain empty. The
2904         * ordering is imposed by list_lru_node->lock taken by
2905         * memcg_drain_all_list_lrus().
2906         */
2907        rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
2908        css_for_each_descendant_pre(css, &memcg->css) {
2909                child = mem_cgroup_from_css(css);
2910                BUG_ON(child->kmemcg_id != kmemcg_id);
2911                child->kmemcg_id = parent->kmemcg_id;
2912                if (!memcg->use_hierarchy)
2913                        break;
2914        }
2915        rcu_read_unlock();
2916
2917        memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
2918
2919        memcg_free_cache_id(kmemcg_id);
2920}
2921
2922static void memcg_free_kmem(struct mem_cgroup *memcg)
2923{
2924        /* css_alloc() failed, offlining didn't happen */
2925        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
2926                memcg_offline_kmem(memcg);
2927
2928        if (memcg->kmem_state == KMEM_ALLOCATED) {
2929                memcg_destroy_kmem_caches(memcg);
2930                static_branch_dec(&memcg_kmem_enabled_key);
2931                WARN_ON(page_counter_read(&memcg->kmem));
2932        }
2933}
2934#else
2935static int memcg_online_kmem(struct mem_cgroup *memcg)
2936{
2937        return 0;
2938}
2939static void memcg_offline_kmem(struct mem_cgroup *memcg)
2940{
2941}
2942static void memcg_free_kmem(struct mem_cgroup *memcg)
2943{
2944}
2945#endif /* !CONFIG_SLOB */
2946
2947static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
2948                                   unsigned long limit)
2949{
2950        int ret;
2951
2952        mutex_lock(&memcg_limit_mutex);
2953        ret = page_counter_limit(&memcg->kmem, limit);
2954        mutex_unlock(&memcg_limit_mutex);
2955        return ret;
2956}
2957
2958static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
2959{
2960        int ret;
2961
2962        mutex_lock(&memcg_limit_mutex);
2963
2964        ret = page_counter_limit(&memcg->tcpmem, limit);
2965        if (ret)
2966                goto out;
2967
2968        if (!memcg->tcpmem_active) {
2969                /*
2970                 * The active flag needs to be written after the static_key
2971                 * update. This is what guarantees that the socket activation
2972                 * function is the last one to run. See sock_update_memcg() for
2973                 * details, and note that we don't mark any socket as belonging
2974                 * to this memcg until that flag is up.
2975                 *
2976                 * We need to do this, because static_keys will span multiple
2977                 * sites, but we can't control their order. If we mark a socket
2978                 * as accounted, but the accounting functions are not patched in
2979                 * yet, we'll lose accounting.
2980                 *
2981                 * We never race with the readers in sock_update_memcg(),
2982                 * because when this value changes, the code to process it is not
2983                 * patched in yet.
2984                 */
2985                static_branch_inc(&memcg_sockets_enabled_key);
2986                memcg->tcpmem_active = true;
2987        }
2988out:
2989        mutex_unlock(&memcg_limit_mutex);
2990        return ret;
2991}
2992
2993/*
2994 * The users of this function are the RES_LIMIT and RES_SOFT_LIMIT
2995 * control file writes.
2996 */
2997static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2998                                char *buf, size_t nbytes, loff_t off)
2999{
3000        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3001        unsigned long nr_pages;
3002        int ret;
3003
3004        buf = strstrip(buf);
3005        ret = page_counter_memparse(buf, "-1", &nr_pages);
3006        if (ret)
3007                return ret;
3008
3009        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3010        case RES_LIMIT:
3011                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3012                        ret = -EINVAL;
3013                        break;
3014                }
3015                switch (MEMFILE_TYPE(of_cft(of)->private)) {
3016                case _MEM:
3017                        ret = mem_cgroup_resize_limit(memcg, nr_pages);
3018                        break;
3019                case _MEMSWAP:
3020                        ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
3021                        break;
3022                case _KMEM:
3023                        ret = memcg_update_kmem_limit(memcg, nr_pages);
3024                        break;
3025                case _TCP:
3026                        ret = memcg_update_tcp_limit(memcg, nr_pages);
3027                        break;
3028                }
3029                break;
3030        case RES_SOFT_LIMIT:
3031                memcg->soft_limit = nr_pages;
3032                ret = 0;
3033                break;
3034        }
3035        return ret ?: nbytes;
3036}
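
    /*
     * Example of how this handler is reached (illustrative; assumes the v1
     * memory hierarchy is mounted at /sys/fs/cgroup/memory):
     *
     *	echo 512M > /sys/fs/cgroup/memory/<group>/memory.limit_in_bytes
     *
     * page_counter_memparse() turns "512M" into nr_pages and the _MEM /
     * RES_LIMIT case above then calls mem_cgroup_resize_limit().
     */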
3037
3038static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3039                                size_t nbytes, loff_t off)
3040{
3041        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3042        struct page_counter *counter;
3043
3044        switch (MEMFILE_TYPE(of_cft(of)->private)) {
3045        case _MEM:
3046                counter = &memcg->memory;
3047                break;
3048        case _MEMSWAP:
3049                counter = &memcg->memsw;
3050                break;
3051        case _KMEM:
3052                counter = &memcg->kmem;
3053                break;
3054        case _TCP:
3055                counter = &memcg->tcpmem;
3056                break;
3057        default:
3058                BUG();
3059        }
3060
3061        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3062        case RES_MAX_USAGE:
3063                page_counter_reset_watermark(counter);
3064                break;
3065        case RES_FAILCNT:
3066                counter->failcnt = 0;
3067                break;
3068        default:
3069                BUG();
3070        }
3071
3072        return nbytes;
3073}
3074
3075static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3076                                        struct cftype *cft)
3077{
3078        return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3079}
3080
3081#ifdef CONFIG_MMU
3082static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3083                                        struct cftype *cft, u64 val)
3084{
3085        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3086
3087        if (val & ~MOVE_MASK)
3088                return -EINVAL;
3089
3090        /*
3091         * No kind of locking is needed in here, because ->can_attach() will
3092         * check this value once at the beginning of the process, and then carry
3093         * on with stale data. This means that changes to this value will only
3094         * affect task migrations starting after the change.
3095         */
3096        memcg->move_charge_at_immigrate = val;
3097        return 0;
3098}
3099#else
3100static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3101                                        struct cftype *cft, u64 val)
3102{
3103        return -ENOSYS;
3104}
3105#endif
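
/*
 * For illustration: bit 0 of the written value selects anonymous pages
 * (MOVE_ANON) and bit 1 selects file pages (MOVE_FILE), so
 *
 *	echo 3 > memory.move_charge_at_immigrate
 *
 * moves both kinds of charges along with a migrating task; values outside
 * MOVE_MASK are rejected with -EINVAL above.
 */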
3106
3107#ifdef CONFIG_NUMA
3108static int memcg_numa_stat_show(struct seq_file *m, void *v)
3109{
3110        struct numa_stat {
3111                const char *name;
3112                unsigned int lru_mask;
3113        };
3114
3115        static const struct numa_stat stats[] = {
3116                { "total", LRU_ALL },
3117                { "file", LRU_ALL_FILE },
3118                { "anon", LRU_ALL_ANON },
3119                { "unevictable", BIT(LRU_UNEVICTABLE) },
3120        };
3121        const struct numa_stat *stat;
3122        int nid;
3123        unsigned long nr;
3124        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3125
3126        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3127                nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3128                seq_printf(m, "%s=%lu", stat->name, nr);
3129                for_each_node_state(nid, N_MEMORY) {
3130                        nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3131                                                          stat->lru_mask);
3132                        seq_printf(m, " N%d=%lu", nid, nr);
3133                }
3134                seq_putc(m, '\n');
3135        }
3136
3137        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3138                struct mem_cgroup *iter;
3139
3140                nr = 0;
3141                for_each_mem_cgroup_tree(iter, memcg)
3142                        nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3143                seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3144                for_each_node_state(nid, N_MEMORY) {
3145                        nr = 0;
3146                        for_each_mem_cgroup_tree(iter, memcg)
3147                                nr += mem_cgroup_node_nr_lru_pages(
3148                                        iter, nid, stat->lru_mask);
3149                        seq_printf(m, " N%d=%lu", nid, nr);
3150                }
3151                seq_putc(m, '\n');
3152        }
3153
3154        return 0;
3155}
3156#endif /* CONFIG_NUMA */
3157
3158static int memcg_stat_show(struct seq_file *m, void *v)
3159{
3160        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3161        unsigned long memory, memsw;
3162        struct mem_cgroup *mi;
3163        unsigned int i;
3164
3165        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3166                     MEM_CGROUP_STAT_NSTATS);
3167        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3168                     MEM_CGROUP_EVENTS_NSTATS);
3169        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3170
3171        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3172                if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3173                        continue;
3174                seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
3175                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
3176        }
3177
3178        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
3179                seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
3180                           mem_cgroup_read_events(memcg, i));
3181
3182        for (i = 0; i < NR_LRU_LISTS; i++)
3183                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3184                           mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
3185
3186        /* Hierarchical information */
3187        memory = memsw = PAGE_COUNTER_MAX;
3188        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3189                memory = min(memory, mi->memory.limit);
3190                memsw = min(memsw, mi->memsw.limit);
3191        }
3192        seq_printf(m, "hierarchical_memory_limit %llu\n",
3193                   (u64)memory * PAGE_SIZE);
3194        if (do_memsw_account())
3195                seq_printf(m, "hierarchical_memsw_limit %llu\n",
3196                           (u64)memsw * PAGE_SIZE);
3197
3198        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3199                unsigned long long val = 0;
3200
3201                if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3202                        continue;
3203                for_each_mem_cgroup_tree(mi, memcg)
3204                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
3205                seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
3206        }
3207
3208        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
3209                unsigned long long val = 0;
3210
3211                for_each_mem_cgroup_tree(mi, memcg)
3212                        val += mem_cgroup_read_events(mi, i);
3213                seq_printf(m, "total_%s %llu\n",
3214                           mem_cgroup_events_names[i], val);
3215        }
3216
3217        for (i = 0; i < NR_LRU_LISTS; i++) {
3218                unsigned long long val = 0;
3219
3220                for_each_mem_cgroup_tree(mi, memcg)
3221                        val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3222                seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3223        }
3224
3225#ifdef CONFIG_DEBUG_VM
3226        {
3227                pg_data_t *pgdat;
3228                struct mem_cgroup_per_node *mz;
3229                struct zone_reclaim_stat *rstat;
3230                unsigned long recent_rotated[2] = {0, 0};
3231                unsigned long recent_scanned[2] = {0, 0};
3232
3233                for_each_online_pgdat(pgdat) {
3234                        mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3235                        rstat = &mz->lruvec.reclaim_stat;
3236
3237                        recent_rotated[0] += rstat->recent_rotated[0];
3238                        recent_rotated[1] += rstat->recent_rotated[1];
3239                        recent_scanned[0] += rstat->recent_scanned[0];
3240                        recent_scanned[1] += rstat->recent_scanned[1];
3241                }
3242                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3243                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3244                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3245                seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3246        }
3247#endif
3248
3249        return 0;
3250}
3251
3252static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3253                                      struct cftype *cft)
3254{
3255        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3256
3257        return mem_cgroup_swappiness(memcg);
3258}
3259
3260static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3261                                       struct cftype *cft, u64 val)
3262{
3263        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3264
3265        if (val > 100)
3266                return -EINVAL;
3267
3268        if (css->parent)
3269                memcg->swappiness = val;
3270        else
3271                vm_swappiness = val;
3272
3273        return 0;
3274}
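
/*
 * For illustration:
 *
 *	echo 10 > memory.swappiness
 *
 * Values above 100 are rejected; a write to the root cgroup's file falls
 * through to the global vm_swappiness knob instead.
 */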
3275
3276static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3277{
3278        struct mem_cgroup_threshold_ary *t;
3279        unsigned long usage;
3280        int i;
3281
3282        rcu_read_lock();
3283        if (!swap)
3284                t = rcu_dereference(memcg->thresholds.primary);
3285        else
3286                t = rcu_dereference(memcg->memsw_thresholds.primary);
3287
3288        if (!t)
3289                goto unlock;
3290
3291        usage = mem_cgroup_usage(memcg, swap);
3292
3293        /*
3294         * current_threshold points to the threshold just below or equal to
3295         * usage.  If that is not the case, a threshold was crossed after the
3296         * last call of __mem_cgroup_threshold().
3297         */
3298        i = t->current_threshold;
3299
3300        /*
3301         * Iterate backward over the array of thresholds starting from
3302         * current_threshold and check if a threshold is crossed.
3303         * If none of the thresholds below usage is crossed, we read
3304         * only one element of the array here.
3305         */
3306        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3307                eventfd_signal(t->entries[i].eventfd, 1);
3308
3309        /* i = current_threshold + 1 */
3310        i++;
3311
3312        /*
3313         * Iterate forward over the array of thresholds starting from
3314         * current_threshold+1 and check if a threshold is crossed.
3315         * If none of the thresholds above usage is crossed, we read
3316         * only one element of the array here.
3317         */
3318        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3319                eventfd_signal(t->entries[i].eventfd, 1);
3320
3321        /* Update current_threshold */
3322        t->current_threshold = i - 1;
3323unlock:
3324        rcu_read_unlock();
3325}
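
/*
 * A worked example with hypothetical values: given sorted thresholds of
 * 4M, 8M and 16M and a previous usage of 9M, current_threshold points at
 * the 8M entry.  If usage then drops to 5M, the backward scan above
 * signals the 8M eventfd (that threshold was crossed downwards), the
 * forward scan stops immediately, and current_threshold is left pointing
 * at the 4M entry.
 */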
3326
3327static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3328{
3329        while (memcg) {
3330                __mem_cgroup_threshold(memcg, false);
3331                if (do_memsw_account())
3332                        __mem_cgroup_threshold(memcg, true);
3333
3334                memcg = parent_mem_cgroup(memcg);
3335        }
3336}
3337
3338static int compare_thresholds(const void *a, const void *b)
3339{
3340        const struct mem_cgroup_threshold *_a = a;
3341        const struct mem_cgroup_threshold *_b = b;
3342
3343        if (_a->threshold > _b->threshold)
3344                return 1;
3345
3346        if (_a->threshold < _b->threshold)
3347                return -1;
3348
3349        return 0;
3350}
3351
3352static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3353{
3354        struct mem_cgroup_eventfd_list *ev;
3355
3356        spin_lock(&memcg_oom_lock);
3357
3358        list_for_each_entry(ev, &memcg->oom_notify, list)
3359                eventfd_signal(ev->eventfd, 1);
3360
3361        spin_unlock(&memcg_oom_lock);
3362        return 0;
3363}
3364
3365static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3366{
3367        struct mem_cgroup *iter;
3368
3369        for_each_mem_cgroup_tree(iter, memcg)
3370                mem_cgroup_oom_notify_cb(iter);
3371}
3372
3373static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3374        struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3375{
3376        struct mem_cgroup_thresholds *thresholds;
3377        struct mem_cgroup_threshold_ary *new;
3378        unsigned long threshold;
3379        unsigned long usage;
3380        int i, size, ret;
3381
3382        ret = page_counter_memparse(args, "-1", &threshold);
3383        if (ret)
3384                return ret;
3385
3386        mutex_lock(&memcg->thresholds_lock);
3387
3388        if (type == _MEM) {
3389                thresholds = &memcg->thresholds;
3390                usage = mem_cgroup_usage(memcg, false);
3391        } else if (type == _MEMSWAP) {
3392                thresholds = &memcg->memsw_thresholds;
3393                usage = mem_cgroup_usage(memcg, true);
3394        } else
3395                BUG();
3396
3397        /* Check if a threshold was crossed before adding a new one */
3398        if (thresholds->primary)
3399                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3400
3401        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3402
3403        /* Allocate memory for new array of thresholds */
3404        new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3405                        GFP_KERNEL);
3406        if (!new) {
3407                ret = -ENOMEM;
3408                goto unlock;
3409        }
3410        new->size = size;
3411
3412        /* Copy thresholds (if any) to new array */
3413        if (thresholds->primary) {
3414                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3415                                sizeof(struct mem_cgroup_threshold));
3416        }
3417
3418        /* Add new threshold */
3419        new->entries[size - 1].eventfd = eventfd;
3420        new->entries[size - 1].threshold = threshold;
3421
3422        /* Sort thresholds. Registering a new threshold isn't time-critical */
3423        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3424                        compare_thresholds, NULL);
3425
3426        /* Find current threshold */
3427        new->current_threshold = -1;
3428        for (i = 0; i < size; i++) {
3429                if (new->entries[i].threshold <= usage) {
3430                        /*
3431                         * new->current_threshold will not be used until
3432                         * rcu_assign_pointer(), so it's safe to increment
3433                         * it here.
3434                         */
3435                        ++new->current_threshold;
3436                } else
3437                        break;
3438        }
3439
3440        /* Free old spare buffer and save old primary buffer as spare */
3441        kfree(thresholds->spare);
3442        thresholds->spare = thresholds->primary;
3443
3444        rcu_assign_pointer(thresholds->primary, new);
3445
3446        /* Make sure that nobody still uses the old thresholds array */
3447        synchronize_rcu();
3448
3449unlock:
3450        mutex_unlock(&memcg->thresholds_lock);
3451
3452        return ret;
3453}
3454
3455static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3456        struct eventfd_ctx *eventfd, const char *args)
3457{
3458        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3459}
3460
3461static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3462        struct eventfd_ctx *eventfd, const char *args)
3463{
3464        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3465}
3466
3467static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3468        struct eventfd_ctx *eventfd, enum res_type type)
3469{
3470        struct mem_cgroup_thresholds *thresholds;
3471        struct mem_cgroup_threshold_ary *new;
3472        unsigned long usage;
3473        int i, j, size;
3474
3475        mutex_lock(&memcg->thresholds_lock);
3476
3477        if (type == _MEM) {
3478                thresholds = &memcg->thresholds;
3479                usage = mem_cgroup_usage(memcg, false);
3480        } else if (type == _MEMSWAP) {
3481                thresholds = &memcg->memsw_thresholds;
3482                usage = mem_cgroup_usage(memcg, true);
3483        } else
3484                BUG();
3485
3486        if (!thresholds->primary)
3487                goto unlock;
3488
3489        /* Check if a threshold was crossed before removing */
3490        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3491
3492        /* Calculate the new number of thresholds */
3493        size = 0;
3494        for (i = 0; i < thresholds->primary->size; i++) {
3495                if (thresholds->primary->entries[i].eventfd != eventfd)
3496                        size++;
3497        }
3498
3499        new = thresholds->spare;
3500
3501        /* Set thresholds array to NULL if we don't have thresholds */
3502        if (!size) {
3503                kfree(new);
3504                new = NULL;
3505                goto swap_buffers;
3506        }
3507
3508        new->size = size;
3509
3510        /* Copy thresholds and find current threshold */
3511        new->current_threshold = -1;
3512        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3513                if (thresholds->primary->entries[i].eventfd == eventfd)
3514                        continue;
3515
3516                new->entries[j] = thresholds->primary->entries[i];
3517                if (new->entries[j].threshold <= usage) {
3518                        /*
3519                         * new->current_threshold will not be used
3520                         * until rcu_assign_pointer(), so it's safe to increment
3521                         * it here.
3522                         */
3523                        ++new->current_threshold;
3524                }
3525                j++;
3526        }
3527
3528swap_buffers:
3529        /* Swap primary and spare array */
3530        thresholds->spare = thresholds->primary;
3531
3532        rcu_assign_pointer(thresholds->primary, new);
3533
3534        /* Make sure that nobody still uses the old thresholds array */
3535        synchronize_rcu();
3536
3537        /* If all events are unregistered, free the spare array */
3538        if (!new) {
3539                kfree(thresholds->spare);
3540                thresholds->spare = NULL;
3541        }
3542unlock:
3543        mutex_unlock(&memcg->thresholds_lock);
3544}
3545
3546static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3547        struct eventfd_ctx *eventfd)
3548{
3549        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3550}
3551
3552static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3553        struct eventfd_ctx *eventfd)
3554{
3555        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3556}
3557
3558static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3559        struct eventfd_ctx *eventfd, const char *args)
3560{
3561        struct mem_cgroup_eventfd_list *event;
3562
3563        event = kmalloc(sizeof(*event), GFP_KERNEL);
3564        if (!event)
3565                return -ENOMEM;
3566
3567        spin_lock(&memcg_oom_lock);
3568
3569        event->eventfd = eventfd;
3570        list_add(&event->list, &memcg->oom_notify);
3571
3572        /* already in OOM ? */
3573        if (memcg->under_oom)
3574                eventfd_signal(eventfd, 1);
3575        spin_unlock(&memcg_oom_lock);
3576
3577        return 0;
3578}
3579
3580static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3581        struct eventfd_ctx *eventfd)
3582{
3583        struct mem_cgroup_eventfd_list *ev, *tmp;
3584
3585        spin_lock(&memcg_oom_lock);
3586
3587        list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3588                if (ev->eventfd == eventfd) {
3589                        list_del(&ev->list);
3590                        kfree(ev);
3591                }
3592        }
3593
3594        spin_unlock(&memcg_oom_lock);
3595}
3596
3597static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3598{
3599        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3600
3601        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3602        seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3603        return 0;
3604}
3605
3606static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3607        struct cftype *cft, u64 val)
3608{
3609        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3610
3611        /* cannot be set on the root cgroup, and only 0 and 1 are allowed */
3612        if (!css->parent || !((val == 0) || (val == 1)))
3613                return -EINVAL;
3614
3615        memcg->oom_kill_disable = val;
3616        if (!val)
3617                memcg_oom_recover(memcg);
3618
3619        return 0;
3620}
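
/*
 * For illustration:
 *
 *	echo 1 > memory.oom_control	(disable the OOM killer for this memcg)
 *	cat memory.oom_control		(reports oom_kill_disable and under_oom)
 *
 * Writing 0 re-enables the OOM killer and wakes any waiters via
 * memcg_oom_recover() above.
 */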
3621
3622#ifdef CONFIG_CGROUP_WRITEBACK
3623
3624struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
3625{
3626        return &memcg->cgwb_list;
3627}
3628
3629static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3630{
3631        return wb_domain_init(&memcg->cgwb_domain, gfp);
3632}
3633
3634static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3635{
3636        wb_domain_exit(&memcg->cgwb_domain);
3637}
3638
3639static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3640{
3641        wb_domain_size_changed(&memcg->cgwb_domain);
3642}
3643
3644struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3645{
3646        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3647
3648        if (!memcg->css.parent)
3649                return NULL;
3650
3651        return &memcg->cgwb_domain;
3652}
3653
3654/**
3655 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3656 * @wb: bdi_writeback in question
3657 * @pfilepages: out parameter for number of file pages
3658 * @pheadroom: out parameter for number of allocatable pages according to memcg
3659 * @pdirty: out parameter for number of dirty pages
3660 * @pwriteback: out parameter for number of pages under writeback
3661 *
3662 * Determine the numbers of file, headroom, dirty, and writeback pages in
3663 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3664 * is a bit more involved.
3665 *
3666 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3667 * headroom is calculated as the lowest headroom of itself and the
3668 * ancestors.  Note that this doesn't consider the actual amount of
3669 * available memory in the system.  The caller should further cap
3670 * *@pheadroom accordingly.
3671 */
3672void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3673                         unsigned long *pheadroom, unsigned long *pdirty,
3674                         unsigned long *pwriteback)
3675{
3676        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3677        struct mem_cgroup *parent;
3678
3679        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
3680
3681        /* this should eventually include NR_UNSTABLE_NFS */
3682        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
3683        *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3684                                                     (1 << LRU_ACTIVE_FILE));
3685        *pheadroom = PAGE_COUNTER_MAX;
3686
3687        while ((parent = parent_mem_cgroup(memcg))) {
3688                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
3689                unsigned long used = page_counter_read(&memcg->memory);
3690
3691                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3692                memcg = parent;
3693        }
3694}
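
/*
 * A worked example with hypothetical numbers: a memcg with a 1G limit,
 * a 512M high boundary and 300M of usage contributes min(1G, 512M) - 300M
 * = 212M of headroom; if its parent has a 2G limit, no high boundary and
 * 1.5G of usage, the parent contributes 500M, so *pheadroom ends up as
 * 212M worth of pages.
 */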
3695
3696#else   /* CONFIG_CGROUP_WRITEBACK */
3697
3698static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3699{
3700        return 0;
3701}
3702
3703static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3704{
3705}
3706
3707static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3708{
3709}
3710
3711#endif  /* CONFIG_CGROUP_WRITEBACK */
3712
3713/*
3714 * DO NOT USE IN NEW FILES.
3715 *
3716 * "cgroup.event_control" implementation.
3717 *
3718 * This is way over-engineered.  It tries to support fully configurable
3719 * events for each user.  Such a level of flexibility is completely
3720 * unnecessary, especially in light of the planned unified hierarchy.
3721 *
3722 * Please deprecate this and replace with something simpler if at all
3723 * possible.
3724 */
3725
3726/*
3727 * Unregister event and free resources.
3728 *
3729 * Gets called from workqueue.
3730 */
3731static void memcg_event_remove(struct work_struct *work)
3732{
3733        struct mem_cgroup_event *event =
3734                container_of(work, struct mem_cgroup_event, remove);
3735        struct mem_cgroup *memcg = event->memcg;
3736
3737        remove_wait_queue(event->wqh, &event->wait);
3738
3739        event->unregister_event(memcg, event->eventfd);
3740
3741        /* Notify userspace the event is going away. */
3742        eventfd_signal(event->eventfd, 1);
3743
3744        eventfd_ctx_put(event->eventfd);
3745        kfree(event);
3746        css_put(&memcg->css);
3747}
3748
3749/*
3750 * Gets called on POLLHUP on the eventfd when the user closes it.
3751 *
3752 * Called with wqh->lock held and interrupts disabled.
3753 */
3754static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
3755                            int sync, void *key)
3756{
3757        struct mem_cgroup_event *event =
3758                container_of(wait, struct mem_cgroup_event, wait);
3759        struct mem_cgroup *memcg = event->memcg;
3760        unsigned long flags = (unsigned long)key;
3761
3762        if (flags & POLLHUP) {
3763                /*
3764                 * If the event has been detached at cgroup removal, we
3765                 * can simply return knowing the other side will cleanup
3766                 * for us.
3767                 *
3768                 * We can't race against event freeing since the other
3769                 * side will require wqh->lock via remove_wait_queue(),
3770                 * which we hold.
3771                 */
3772                spin_lock(&memcg->event_list_lock);
3773                if (!list_empty(&event->list)) {
3774                        list_del_init(&event->list);
3775                        /*
3776                         * We are in atomic context, but memcg_event_remove()
3777                         * may sleep, so we have to call it from a workqueue.
3778                         */
3779                        schedule_work(&event->remove);
3780                }
3781                spin_unlock(&memcg->event_list_lock);
3782        }
3783
3784        return 0;
3785}
3786
3787static void memcg_event_ptable_queue_proc(struct file *file,
3788                wait_queue_head_t *wqh, poll_table *pt)
3789{
3790        struct mem_cgroup_event *event =
3791                container_of(pt, struct mem_cgroup_event, pt);
3792
3793        event->wqh = wqh;
3794        add_wait_queue(wqh, &event->wait);
3795}
3796
3797/*
3798 * DO NOT USE IN NEW FILES.
3799 *
3800 * Parse input and register new cgroup event handler.
3801 *
3802 * Input must be in format '<event_fd> <control_fd> <args>'.
3803 * Interpretation of args is defined by control file implementation.
3804 */
3805static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
3806                                         char *buf, size_t nbytes, loff_t off)
3807{
3808        struct cgroup_subsys_state *css = of_css(of);
3809        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3810        struct mem_cgroup_event *event;
3811        struct cgroup_subsys_state *cfile_css;
3812        unsigned int efd, cfd;
3813        struct fd efile;
3814        struct fd cfile;
3815        const char *name;
3816        char *endp;
3817        int ret;
3818
3819        buf = strstrip(buf);
3820
3821        efd = simple_strtoul(buf, &endp, 10);
3822        if (*endp != ' ')
3823                return -EINVAL;
3824        buf = endp + 1;
3825
3826        cfd = simple_strtoul(buf, &endp, 10);
3827        if ((*endp != ' ') && (*endp != '\0'))
3828                return -EINVAL;
3829        buf = endp + 1;
3830
3831        event = kzalloc(sizeof(*event), GFP_KERNEL);
3832        if (!event)
3833                return -ENOMEM;
3834
3835        event->memcg = memcg;
3836        INIT_LIST_HEAD(&event->list);
3837        init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
3838        init_waitqueue_func_entry(&event->wait, memcg_event_wake);
3839        INIT_WORK(&event->remove, memcg_event_remove);
3840
3841        efile = fdget(efd);
3842        if (!efile.file) {
3843                ret = -EBADF;
3844                goto out_kfree;
3845        }
3846
3847        event->eventfd = eventfd_ctx_fileget(efile.file);
3848        if (IS_ERR(event->eventfd)) {
3849                ret = PTR_ERR(event->eventfd);
3850                goto out_put_efile;
3851        }
3852
3853        cfile = fdget(cfd);
3854        if (!cfile.file) {
3855                ret = -EBADF;
3856                goto out_put_eventfd;
3857        }
3858
3859        /* the process needs read permission on the control file */
3860        /* AV: shouldn't we check that it's been opened for read instead? */
3861        ret = inode_permission(file_inode(cfile.file), MAY_READ);
3862        if (ret < 0)
3863                goto out_put_cfile;
3864
3865        /*
3866         * Determine the event callbacks and set them in @event.  This used
3867         * to be done via struct cftype but cgroup core no longer knows
3868         * about these events.  The following is crude but the whole thing
3869         * is for compatibility anyway.
3870         *
3871         * DO NOT ADD NEW FILES.
3872         */
3873        name = cfile.file->f_path.dentry->d_name.name;
3874
3875        if (!strcmp(name, "memory.usage_in_bytes")) {
3876                event->register_event = mem_cgroup_usage_register_event;
3877                event->unregister_event = mem_cgroup_usage_unregister_event;
3878        } else if (!strcmp(name, "memory.oom_control")) {
3879                event->register_event = mem_cgroup_oom_register_event;
3880                event->unregister_event = mem_cgroup_oom_unregister_event;
3881        } else if (!strcmp(name, "memory.pressure_level")) {
3882                event->register_event = vmpressure_register_event;
3883                event->unregister_event = vmpressure_unregister_event;
3884        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
3885                event->register_event = memsw_cgroup_usage_register_event;
3886                event->unregister_event = memsw_cgroup_usage_unregister_event;
3887        } else {
3888                ret = -EINVAL;
3889                goto out_put_cfile;
3890        }
3891
3892        /*
3893         * Verify that @cfile belongs to @css.  Also, remaining events are
3894         * automatically removed on cgroup destruction but the removal is
3895         * asynchronous, so take an extra ref on @css.
3896         */
3897        cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
3898                                               &memory_cgrp_subsys);
3899        ret = -EINVAL;
3900        if (IS_ERR(cfile_css))
3901                goto out_put_cfile;
3902        if (cfile_css != css) {
3903                css_put(cfile_css);
3904                goto out_put_cfile;
3905        }
3906
3907        ret = event->register_event(memcg, event->eventfd, buf);
3908        if (ret)
3909                goto out_put_css;
3910
3911        efile.file->f_op->poll(efile.file, &event->pt);
3912
3913        spin_lock(&memcg->event_list_lock);
3914        list_add(&event->list, &memcg->event_list);
3915        spin_unlock(&memcg->event_list_lock);
3916
3917        fdput(cfile);
3918        fdput(efile);
3919
3920        return nbytes;
3921
3922out_put_css:
3923        css_put(css);
3924out_put_cfile:
3925        fdput(cfile);
3926out_put_eventfd:
3927        eventfd_ctx_put(event->eventfd);
3928out_put_efile:
3929        fdput(efile);
3930out_kfree:
3931        kfree(event);
3932
3933        return ret;
3934}
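
/*
 * A minimal userspace sketch of the "<event_fd> <control_fd> <args>"
 * protocol parsed above, registering a 64M usage threshold (the cgroup
 * path, group name and threshold value are assumptions; needs
 * <sys/eventfd.h>, <fcntl.h>, <stdio.h>, <string.h>, <stdint.h> and
 * <unistd.h>):
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ecfd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
 *			O_WRONLY);
 *	char line[64];
 *	uint64_t ticks;
 *
 *	snprintf(line, sizeof(line), "%d %d %llu", efd, cfd, 64ULL << 20);
 *	write(ecfd, line, strlen(line));
 *	read(efd, &ticks, sizeof(ticks));
 *
 * The final read() blocks until mem_cgroup_threshold() crosses the
 * registered threshold and eventfd_signal() posts a count.
 */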
3935
3936static struct cftype mem_cgroup_legacy_files[] = {
3937        {
3938                .name = "usage_in_bytes",
3939                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3940                .read_u64 = mem_cgroup_read_u64,
3941        },
3942        {
3943                .name = "max_usage_in_bytes",
3944                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3945                .write = mem_cgroup_reset,
3946                .read_u64 = mem_cgroup_read_u64,
3947        },
3948        {
3949                .name = "limit_in_bytes",
3950                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3951                .write = mem_cgroup_write,
3952                .read_u64 = mem_cgroup_read_u64,
3953        },
3954        {
3955                .name = "soft_limit_in_bytes",
3956                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3957                .write = mem_cgroup_write,
3958                .read_u64 = mem_cgroup_read_u64,
3959        },
3960        {
3961                .name = "failcnt",
3962                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3963                .write = mem_cgroup_reset,
3964                .read_u64 = mem_cgroup_read_u64,
3965        },
3966        {
3967                .name = "stat",
3968                .seq_show = memcg_stat_show,
3969        },
3970        {
3971                .name = "force_empty",
3972                .write = mem_cgroup_force_empty_write,
3973        },
3974        {
3975                .name = "use_hierarchy",
3976                .write_u64 = mem_cgroup_hierarchy_write,
3977                .read_u64 = mem_cgroup_hierarchy_read,
3978        },
3979        {
3980                .name = "cgroup.event_control",         /* XXX: for compat */
3981                .write = memcg_write_event_control,
3982                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
3983        },
3984        {
3985                .name = "swappiness",
3986                .read_u64 = mem_cgroup_swappiness_read,
3987                .write_u64 = mem_cgroup_swappiness_write,
3988        },
3989        {
3990                .name = "move_charge_at_immigrate",
3991                .read_u64 = mem_cgroup_move_charge_read,
3992                .write_u64 = mem_cgroup_move_charge_write,
3993        },
3994        {
3995                .name = "oom_control",
3996                .seq_show = mem_cgroup_oom_control_read,
3997                .write_u64 = mem_cgroup_oom_control_write,
3998                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3999        },
4000        {
4001                .name = "pressure_level",
4002        },
4003#ifdef CONFIG_NUMA
4004        {
4005                .name = "numa_stat",
4006                .seq_show = memcg_numa_stat_show,
4007        },
4008#endif
4009        {
4010                .name = "kmem.limit_in_bytes",
4011                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4012                .write = mem_cgroup_write,
4013                .read_u64 = mem_cgroup_read_u64,
4014        },
4015        {
4016                .name = "kmem.usage_in_bytes",
4017                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4018                .read_u64 = mem_cgroup_read_u64,
4019        },
4020        {
4021                .name = "kmem.failcnt",
4022                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4023                .write = mem_cgroup_reset,
4024                .read_u64 = mem_cgroup_read_u64,
4025        },
4026        {
4027                .name = "kmem.max_usage_in_bytes",
4028                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4029                .write = mem_cgroup_reset,
4030                .read_u64 = mem_cgroup_read_u64,
4031        },
4032#ifdef CONFIG_SLABINFO
4033        {
4034                .name = "kmem.slabinfo",
4035                .seq_start = slab_start,
4036                .seq_next = slab_next,
4037                .seq_stop = slab_stop,
4038                .seq_show = memcg_slab_show,
4039        },
4040#endif
4041        {
4042                .name = "kmem.tcp.limit_in_bytes",
4043                .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4044                .write = mem_cgroup_write,
4045                .read_u64 = mem_cgroup_read_u64,
4046        },
4047        {
4048                .name = "kmem.tcp.usage_in_bytes",
4049                .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4050                .read_u64 = mem_cgroup_read_u64,
4051        },
4052        {
4053                .name = "kmem.tcp.failcnt",
4054                .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4055                .write = mem_cgroup_reset,
4056                .read_u64 = mem_cgroup_read_u64,
4057        },
4058        {
4059                .name = "kmem.tcp.max_usage_in_bytes",
4060                .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4061                .write = mem_cgroup_reset,
4062                .read_u64 = mem_cgroup_read_u64,
4063        },
4064        { },    /* terminate */
4065};
4066
4067/*
4068 * Private memory cgroup IDR
4069 *
4070 * Swap-out records and page cache shadow entries need to store memcg
4071 * references in constrained space, so we maintain an ID space that is
4072 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4073 * memory-controlled cgroups to 64k.
4074 *
4075 * However, there usually are many references to the offline CSS after
4076 * the cgroup has been destroyed, such as page cache or reclaimable
4077 * slab objects, that don't need to hang on to the ID. We want to keep
4078 * those dead CSS from occupying IDs, or we might quickly exhaust the
4079 * relatively small ID space and prevent the creation of new cgroups
4080 * even when there are much fewer than 64k cgroups - possibly none.
4081 *
4082 * Maintain a private 16-bit ID space for memcg, and allow the ID to
4083 * be freed and recycled when it's no longer needed, which is usually
4084 * when the CSS is offlined.
4085 *
4086 * The only exceptions to that are records of swapped-out tmpfs/shmem
4087 * pages that need to be attributed to live ancestors on swapin. But
4088 * those references are manageable from userspace.
4089 */
4090
4091static DEFINE_IDR(mem_cgroup_idr);
4092
4093static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4094{
4095        atomic_add(n, &memcg->id.ref);
4096}
4097
4098static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4099{
4100        if (atomic_sub_and_test(n, &memcg->id.ref)) {
4101                idr_remove(&mem_cgroup_idr, memcg->id.id);
4102                memcg->id.id = 0;
4103
4104                /* Memcg ID pins CSS */
4105                css_put(&memcg->css);
4106        }
4107}
4108
4109static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4110{
4111        mem_cgroup_id_get_many(memcg, 1);
4112}
4113
4114static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4115{
4116        mem_cgroup_id_put_many(memcg, 1);
4117}
4118
4119/**
4120 * mem_cgroup_from_id - look up a memcg from a memcg id
4121 * @id: the memcg id to look up
4122 *
4123 * Caller must hold rcu_read_lock().
4124 */
4125struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4126{
4127        WARN_ON_ONCE(!rcu_read_lock_held());
4128        return idr_find(&mem_cgroup_idr, id);
4129}
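
/*
 * A lookup sketch: a caller that needs the memcg beyond the RCU read
 * section has to take its own reference, e.g.
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();
 */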
4130
4131static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4132{
4133        struct mem_cgroup_per_node *pn;
4134        int tmp = node;
4135        /*
4136         * This routine is called for each possible node,
4137         * but it is a BUG to call kmalloc() against an offline node.
4138         *
4139         * TODO: this routine can waste a lot of memory for nodes which will
4140         *       never be onlined.  It would be better to use a memory
4141         *       hotplug callback function.
4142         */
4143        if (!node_state(node, N_NORMAL_MEMORY))
4144                tmp = -1;
4145        pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4146        if (!pn)
4147                return 1;
4148
4149        lruvec_init(&pn->lruvec);
4150        pn->usage_in_excess = 0;
4151        pn->on_tree = false;
4152        pn->memcg = memcg;
4153
4154        memcg->nodeinfo[node] = pn;
4155        return 0;
4156}
4157
4158static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4159{
4160        kfree(memcg->nodeinfo[node]);
4161}
4162
4163static void mem_cgroup_free(struct mem_cgroup *memcg)
4164{
4165        int node;
4166
4167        memcg_wb_domain_exit(memcg);
4168        for_each_node(node)
4169                free_mem_cgroup_per_node_info(memcg, node);
4170        free_percpu(memcg->stat);
4171        kfree(memcg);
4172}
4173
4174static struct mem_cgroup *mem_cgroup_alloc(void)
4175{
4176        struct mem_cgroup *memcg;
4177        size_t size;
4178        int node;
4179
4180        size = sizeof(struct mem_cgroup);
4181        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4182
4183        memcg = kzalloc(size, GFP_KERNEL);
4184        if (!memcg)
4185                return NULL;
4186
4187        memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4188                                 1, MEM_CGROUP_ID_MAX,
4189                                 GFP_KERNEL);
4190        if (memcg->id.id < 0)
4191                goto fail;
4192
4193        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4194        if (!memcg->stat)
4195                goto fail;
4196
4197        for_each_node(node)
4198                if (alloc_mem_cgroup_per_node_info(memcg, node))
4199                        goto fail;
4200
4201        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4202                goto fail;
4203
4204        INIT_WORK(&memcg->high_work, high_work_func);
4205        memcg->last_scanned_node = MAX_NUMNODES;
4206        INIT_LIST_HEAD(&memcg->oom_notify);
4207        mutex_init(&memcg->thresholds_lock);
4208        spin_lock_init(&memcg->move_lock);
4209        vmpressure_init(&memcg->vmpressure);
4210        INIT_LIST_HEAD(&memcg->event_list);
4211        spin_lock_init(&memcg->event_list_lock);
4212        memcg->socket_pressure = jiffies;
4213#ifndef CONFIG_SLOB
4214        memcg->kmemcg_id = -1;
4215#endif
4216#ifdef CONFIG_CGROUP_WRITEBACK
4217        INIT_LIST_HEAD(&memcg->cgwb_list);
4218#endif
4219        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4220        return memcg;
4221fail:
4222        if (memcg->id.id > 0)
4223                idr_remove(&mem_cgroup_idr, memcg->id.id);
4224        mem_cgroup_free(memcg);
4225        return NULL;
4226}
4227
4228static struct cgroup_subsys_state * __ref
4229mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4230{
4231        struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4232        struct mem_cgroup *memcg;
4233        long error = -ENOMEM;
4234
4235        memcg = mem_cgroup_alloc();
4236        if (!memcg)
4237                return ERR_PTR(error);
4238
4239        memcg->high = PAGE_COUNTER_MAX;
4240        memcg->soft_limit = PAGE_COUNTER_MAX;
4241        if (parent) {
4242                memcg->swappiness = mem_cgroup_swappiness(parent);
4243                memcg->oom_kill_disable = parent->oom_kill_disable;
4244        }
4245        if (parent && parent->use_hierarchy) {
4246                memcg->use_hierarchy = true;
4247                page_counter_init(&memcg->memory, &parent->memory);
4248                page_counter_init(&memcg->swap, &parent->swap);
4249                page_counter_init(&memcg->memsw, &parent->memsw);
4250                page_counter_init(&memcg->kmem, &parent->kmem);
4251                page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4252        } else {
4253                page_counter_init(&memcg->memory, NULL);
4254                page_counter_init(&memcg->swap, NULL);
4255                page_counter_init(&memcg->memsw, NULL);
4256                page_counter_init(&memcg->kmem, NULL);
4257                page_counter_init(&memcg->tcpmem, NULL);
4258                /*
4259                 * A deeper hierarchy with use_hierarchy == false doesn't make
4260                 * much sense, so let the cgroup subsystem know about this
4261                 * unfortunate state in our controller.
4262                 */
4263                if (parent != root_mem_cgroup)
4264                        memory_cgrp_subsys.broken_hierarchy = true;
4265        }
4266
4267        /* The following stuff does not apply to the root */
4268        if (!parent) {
4269                root_mem_cgroup = memcg;
4270                return &memcg->css;
4271        }
4272
4273        error = memcg_online_kmem(memcg);
4274        if (error)
4275                goto fail;
4276
4277        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4278                static_branch_inc(&memcg_sockets_enabled_key);
4279
4280        return &memcg->css;
4281fail:
4282        mem_cgroup_free(memcg);
4283        return ERR_PTR(-ENOMEM);
4284}
4285
4286static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4287{
4288        /* Online state pins memcg ID, memcg ID pins CSS */
4289        mem_cgroup_id_get(mem_cgroup_from_css(css));
4290        css_get(css);
4291        return 0;
4292}
4293
4294static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4295{
4296        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4297        struct mem_cgroup_event *event, *tmp;
4298
4299        /*
4300         * Unregister events and notify userspace.
4301         * Notify userspace about cgroup removing only after rmdir of cgroup
4302         * directory to avoid race between userspace and kernelspace.
4303         */
4304        spin_lock(&memcg->event_list_lock);
4305        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4306                list_del_init(&event->list);
4307                schedule_work(&event->remove);
4308        }
4309        spin_unlock(&memcg->event_list_lock);
4310
4311        memcg_offline_kmem(memcg);
4312        wb_memcg_offline(memcg);
4313
4314        mem_cgroup_id_put(memcg);
4315}
4316
4317static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4318{
4319        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4320
4321        invalidate_reclaim_iterators(memcg);
4322}
4323
4324static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4325{
4326        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4327
4328        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4329                static_branch_dec(&memcg_sockets_enabled_key);
4330
4331        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4332                static_branch_dec(&memcg_sockets_enabled_key);
4333
4334        vmpressure_cleanup(&memcg->vmpressure);
4335        cancel_work_sync(&memcg->high_work);
4336        mem_cgroup_remove_from_trees(memcg);
4337        memcg_free_kmem(memcg);
4338        mem_cgroup_free(memcg);
4339}
4340
4341/**
4342 * mem_cgroup_css_reset - reset the states of a mem_cgroup
4343 * @css: the target css
4344 *
4345 * Reset the states of the mem_cgroup associated with @css.  This is
4346 * invoked when the userland requests disabling on the default hierarchy
4347 * but the memcg is pinned through dependency.  The memcg should stop
4348 * applying policies and should revert to the vanilla state as it may be
4349 * made visible again.
4350 *
4351 * The current implementation only resets the essential configurations.
4352 * This needs to be expanded to cover all the visible parts.
4353 */
4354static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4355{
4356        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4357
4358        page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
4359        page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
4360        page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
4361        page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
4362        page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
4363        memcg->low = 0;
4364        memcg->high = PAGE_COUNTER_MAX;
4365        memcg->soft_limit = PAGE_COUNTER_MAX;
4366        memcg_wb_domain_size_changed(memcg);
4367}
4368
4369#ifdef CONFIG_MMU
4370/* Handlers for move charge at task migration. */
4371static int mem_cgroup_do_precharge(unsigned long count)
4372{
4373        int ret;
4374
4375        /* Try a single bulk charge without reclaim first, kswapd may wake */
4376        ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4377        if (!ret) {
4378                mc.precharge += count;
4379                return ret;
4380        }
4381
4382        /* Try charges one by one with reclaim */
4383        while (count--) {
4384                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
4385                if (ret)
4386                        return ret;
4387                mc.precharge++;
4388                cond_resched();
4389        }
4390        return 0;
4391}
4392
4393union mc_target {
4394        struct page     *page;
4395        swp_entry_t     ent;
4396};
4397
4398enum mc_target_type {
4399        MC_TARGET_NONE = 0,
4400        MC_TARGET_PAGE,
4401        MC_TARGET_SWAP,
4402};
4403
4404static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4405                                                unsigned long addr, pte_t ptent)
4406{
4407        struct page *page = vm_normal_page(vma, addr, ptent);
4408
4409        if (!page || !page_mapped(page))
4410                return NULL;
4411        if (PageAnon(page)) {
4412                if (!(mc.flags & MOVE_ANON))
4413                        return NULL;
4414        } else {
4415                if (!(mc.flags & MOVE_FILE))
4416                        return NULL;
4417        }
4418        if (!get_page_unless_zero(page))
4419                return NULL;
4420
4421        return page;
4422}
4423
4424#ifdef CONFIG_SWAP
4425static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4426                        pte_t ptent, swp_entry_t *entry)
4427{
4428        struct page *page = NULL;
4429        swp_entry_t ent = pte_to_swp_entry(ptent);
4430
4431        if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4432                return NULL;
4433        /*
4434         * Because lookup_swap_cache() updates some statistics counters,
4435         * we call find_get_page() with swapper_space directly.
4436         */
4437        page = find_get_page(swap_address_space(ent), ent.val);
4438        if (do_memsw_account())
4439                entry->val = ent.val;
4440
4441        return page;
4442}
4443#else
4444static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4445                        pte_t ptent, swp_entry_t *entry)
4446{
4447        return NULL;
4448}
4449#endif
4450
4451static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4452                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
4453{
4454        struct page *page = NULL;
4455        struct address_space *mapping;
4456        pgoff_t pgoff;
4457
4458        if (!vma->vm_file) /* anonymous vma */
4459                return NULL;
4460        if (!(mc.flags & MOVE_FILE))
4461                return NULL;
4462
4463        mapping = vma->vm_file->f_mapping;
4464        pgoff = linear_page_index(vma, addr);
4465
4466        /* The page is moved even if it's not RSS of this task (page-faulted). */
4467#ifdef CONFIG_SWAP
4468        /* shmem/tmpfs may report page out on swap: account for that too. */
4469        if (shmem_mapping(mapping)) {
4470                page = find_get_entry(mapping, pgoff);
4471                if (radix_tree_exceptional_entry(page)) {
4472                        swp_entry_t swp = radix_to_swp_entry(page);
4473                        if (do_memsw_account())
4474                                *entry = swp;
4475                        page = find_get_page(swap_address_space(swp), swp.val);
4476                }
4477        } else
4478                page = find_get_page(mapping, pgoff);
4479#else
4480        page = find_get_page(mapping, pgoff);
4481#endif
4482        return page;
4483}
4484
4485/**
4486 * mem_cgroup_move_account - move account of the page
4487 * @page: the page
4488 * @compound: charge the page as compound or small page
4489 * @from: mem_cgroup which the page is moved from.
4490 * @to: mem_cgroup which the page is moved to. @from != @to.
4491 *
4492 * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful.)
4493 *
4494 * This function doesn't do "charge" to the new cgroup and doesn't do
4495 * "uncharge" from the old cgroup.
4496 */
4497static int mem_cgroup_move_account(struct page *page,
4498                                   bool compound,
4499                                   struct mem_cgroup *from,
4500                                   struct mem_cgroup *to)
4501{
4502        unsigned long flags;
4503        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4504        int ret;
4505        bool anon;
4506
4507        VM_BUG_ON(from == to);
4508        VM_BUG_ON_PAGE(PageLRU(page), page);
4509        VM_BUG_ON(compound && !PageTransHuge(page));
4510
4511        /*
4512         * Prevent mem_cgroup_migrate() from looking at
4513         * page->mem_cgroup of its source page while we change it.
4514         */
4515        ret = -EBUSY;
4516        if (!trylock_page(page))
4517                goto out;
4518
4519        ret = -EINVAL;
4520        if (page->mem_cgroup != from)
4521                goto out_unlock;
4522
4523        anon = PageAnon(page);
4524
4525        spin_lock_irqsave(&from->move_lock, flags);
4526
4527        if (!anon && page_mapped(page)) {
4528                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4529                               nr_pages);
4530                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4531                               nr_pages);
4532        }
4533
4534        /*
4535         * move_lock grabbed above and caller set from->moving_account, so
4536         * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
4537         * So mapping should be stable for dirty pages.
4538         */
4539        if (!anon && PageDirty(page)) {
4540                struct address_space *mapping = page_mapping(page);
4541
4542                if (mapping_cap_account_dirty(mapping)) {
4543                        __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
4544                                       nr_pages);
4545                        __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
4546                                       nr_pages);
4547                }
4548        }
4549
4550        if (PageWriteback(page)) {
4551                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4552                               nr_pages);
4553                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4554                               nr_pages);
4555        }
4556
4557        /*
4558         * It is safe to change page->mem_cgroup here because the page
4559         * is referenced, charged, and isolated - we can't race with
4560         * uncharging, charging, migration, or LRU putback.
4561         */
4562
4563        /* caller should have done css_get */
4564        page->mem_cgroup = to;
4565        spin_unlock_irqrestore(&from->move_lock, flags);
4566
4567        ret = 0;
4568
4569        local_irq_disable();
4570        mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4571        memcg_check_events(to, page);
4572        mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4573        memcg_check_events(from, page);
4574        local_irq_enable();
4575out_unlock:
4576        unlock_page(page);
4577out:
4578        return ret;
4579}
4580
4581/**
4582 * get_mctgt_type - get target type of moving charge
4583 * @vma: the vma the pte to be checked belongs
4584 * @addr: the address corresponding to the pte to be checked
4585 * @ptent: the pte to be checked
4586 * @target: the pointer the target page or swap ent will be stored(can be NULL)
4587 *
4588 * Returns
4589 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4590 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4591 *     move charge.  If @target is not NULL, the page is stored in
4592 *     target->page with an extra refcount taken (callers should handle it).
4593 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4594 *     target for charge migration.  If @target is not NULL, the entry is
4595 *     stored in target->ent.
4596 *
4597 * Called with pte lock held.
4598 */
4599
4600static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4601                unsigned long addr, pte_t ptent, union mc_target *target)
4602{
4603        struct page *page = NULL;
4604        enum mc_target_type ret = MC_TARGET_NONE;
4605        swp_entry_t ent = { .val = 0 };
4606
4607        if (pte_present(ptent))
4608                page = mc_handle_present_pte(vma, addr, ptent);
4609        else if (is_swap_pte(ptent))
4610                page = mc_handle_swap_pte(vma, ptent, &ent);
4611        else if (pte_none(ptent))
4612                page = mc_handle_file_pte(vma, addr, ptent, &ent);
4613
4614        if (!page && !ent.val)
4615                return ret;
4616        if (page) {
4617                /*
4618                 * Do only a loose check without serialization.
4619                 * mem_cgroup_move_account() checks whether the page is
4620                 * valid under LRU exclusion.
4621                 */
4622                if (page->mem_cgroup == mc.from) {
4623                        ret = MC_TARGET_PAGE;
4624                        if (target)
4625                                target->page = page;
4626                }
4627                if (!ret || !target)
4628                        put_page(page);
4629        }
4630        /* There is a swap entry and the page doesn't exist or isn't charged */
4631        if (ent.val && !ret &&
4632            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4633                ret = MC_TARGET_SWAP;
4634                if (target)
4635                        target->ent = ent;
4636        }
4637        return ret;
4638}
4639
4640#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4641/*
4642 * We don't consider swapped-out or file-mapped pages because THP does not
4643 * support them for now.
4644 * Caller should make sure that pmd_trans_huge(pmd) is true.
4645 */
4646static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4647                unsigned long addr, pmd_t pmd, union mc_target *target)
4648{
4649        struct page *page = NULL;
4650        enum mc_target_type ret = MC_TARGET_NONE;
4651
4652        page = pmd_page(pmd);
4653        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4654        if (!(mc.flags & MOVE_ANON))
4655                return ret;
4656        if (page->mem_cgroup == mc.from) {
4657                ret = MC_TARGET_PAGE;
4658                if (target) {
4659                        get_page(page);
4660                        target->page = page;
4661                }
4662        }
4663        return ret;
4664}
4665#else
4666static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4667                unsigned long addr, pmd_t pmd, union mc_target *target)
4668{
4669        return MC_TARGET_NONE;
4670}
4671#endif
4672
4673static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4674                                        unsigned long addr, unsigned long end,
4675                                        struct mm_walk *walk)
4676{
4677        struct vm_area_struct *vma = walk->vma;
4678        pte_t *pte;
4679        spinlock_t *ptl;
4680
4681        ptl = pmd_trans_huge_lock(pmd, vma);
4682        if (ptl) {
4683                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4684                        mc.precharge += HPAGE_PMD_NR;
4685                spin_unlock(ptl);
4686                return 0;
4687        }
4688
4689        if (pmd_trans_unstable(pmd))
4690                return 0;
4691        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4692        for (; addr != end; pte++, addr += PAGE_SIZE)
4693                if (get_mctgt_type(vma, addr, *pte, NULL))
4694                        mc.precharge++; /* increment precharge temporarily */
4695        pte_unmap_unlock(pte - 1, ptl);
4696        cond_resched();
4697
4698        return 0;
4699}
4700
4701static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4702{
4703        unsigned long precharge;
4704
4705        struct mm_walk mem_cgroup_count_precharge_walk = {
4706                .pmd_entry = mem_cgroup_count_precharge_pte_range,
4707                .mm = mm,
4708        };
4709        down_read(&mm->mmap_sem);
4710        walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
4711        up_read(&mm->mmap_sem);
4712
4713        precharge = mc.precharge;
4714        mc.precharge = 0;
4715
4716        return precharge;
4717}
4718
4719static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4720{
4721        unsigned long precharge = mem_cgroup_count_precharge(mm);
4722
4723        VM_BUG_ON(mc.moving_task);
4724        mc.moving_task = current;
4725        return mem_cgroup_do_precharge(precharge);
4726}
4727
4728/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4729static void __mem_cgroup_clear_mc(void)
4730{
4731        struct mem_cgroup *from = mc.from;
4732        struct mem_cgroup *to = mc.to;
4733
4734        /* we must uncharge all the leftover precharges from mc.to */
4735        if (mc.precharge) {
4736                cancel_charge(mc.to, mc.precharge);
4737                mc.precharge = 0;
4738        }
4739        /*
4740         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4741         * we must uncharge here.
4742         */
4743        if (mc.moved_charge) {
4744                cancel_charge(mc.from, mc.moved_charge);
4745                mc.moved_charge = 0;
4746        }
4747        /* we must fixup refcnts and charges */
4748        if (mc.moved_swap) {
4749                /* uncharge swap account from the old cgroup */
4750                if (!mem_cgroup_is_root(mc.from))
4751                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
4752
4753                mem_cgroup_id_put_many(mc.from, mc.moved_swap);
4754
4755                /*
4756                 * we charged both to->memory and to->memsw, so we
4757                 * should uncharge to->memory.
4758                 */
4759                if (!mem_cgroup_is_root(mc.to))
4760                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
4761
4762                mem_cgroup_id_get_many(mc.to, mc.moved_swap);
4763                css_put_many(&mc.to->css, mc.moved_swap);
4764
4765                mc.moved_swap = 0;
4766        }
4767        memcg_oom_recover(from);
4768        memcg_oom_recover(to);
4769        wake_up_all(&mc.waitq);
4770}
4771
4772static void mem_cgroup_clear_mc(void)
4773{
4774        struct mm_struct *mm = mc.mm;
4775
4776        /*
4777         * we must clear moving_task before waking up waiters at the end of
4778         * task migration.
4779         */
4780        mc.moving_task = NULL;
4781        __mem_cgroup_clear_mc();
4782        spin_lock(&mc.lock);
4783        mc.from = NULL;
4784        mc.to = NULL;
4785        mc.mm = NULL;
4786        spin_unlock(&mc.lock);
4787
4788        mmput(mm);
4789}
4790
4791static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4792{
4793        struct cgroup_subsys_state *css;
4794        struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
4795        struct mem_cgroup *from;
4796        struct task_struct *leader, *p;
4797        struct mm_struct *mm;
4798        unsigned long move_flags;
4799        int ret = 0;
4800
4801        /* charge immigration isn't supported on the default hierarchy */
4802        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
4803                return 0;
4804
4805        /*
4806         * Multi-process migrations only happen on the default hierarchy
4807         * where charge immigration is not used.  Perform charge
4808         * immigration if @tset contains a leader and whine if there are
4809         * multiple.
4810         */
4811        p = NULL;
4812        cgroup_taskset_for_each_leader(leader, css, tset) {
4813                WARN_ON_ONCE(p);
4814                p = leader;
4815                memcg = mem_cgroup_from_css(css);
4816        }
4817        if (!p)
4818                return 0;
4819
4820        /*
4821         * We are now committed to this value, whatever it is. Changes to this
4822         * tunable will only affect upcoming migrations, not the current one.
4823         * So we need to save it and keep using it.
4824         */
4825        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
4826        if (!move_flags)
4827                return 0;
4828
4829        from = mem_cgroup_from_task(p);
4830
4831        VM_BUG_ON(from == memcg);
4832
4833        mm = get_task_mm(p);
4834        if (!mm)
4835                return 0;
4836        /* We move charges only when we move an owner of the mm */
4837        if (mm->owner == p) {
4838                VM_BUG_ON(mc.from);
4839                VM_BUG_ON(mc.to);
4840                VM_BUG_ON(mc.precharge);
4841                VM_BUG_ON(mc.moved_charge);
4842                VM_BUG_ON(mc.moved_swap);
4843
4844                spin_lock(&mc.lock);
4845                mc.mm = mm;
4846                mc.from = from;
4847                mc.to = memcg;
4848                mc.flags = move_flags;
4849                spin_unlock(&mc.lock);
4850                /* We set mc.moving_task later */
4851
4852                ret = mem_cgroup_precharge_mc(mm);
4853                if (ret)
4854                        mem_cgroup_clear_mc();
4855        } else {
4856                mmput(mm);
4857        }
4858        return ret;
4859}
4860
4861static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
4862{
4863        if (mc.to)
4864                mem_cgroup_clear_mc();
4865}
4866
4867static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4868                                unsigned long addr, unsigned long end,
4869                                struct mm_walk *walk)
4870{
4871        int ret = 0;
4872        struct vm_area_struct *vma = walk->vma;
4873        pte_t *pte;
4874        spinlock_t *ptl;
4875        enum mc_target_type target_type;
4876        union mc_target target;
4877        struct page *page;
4878
4879        ptl = pmd_trans_huge_lock(pmd, vma);
4880        if (ptl) {
4881                if (mc.precharge < HPAGE_PMD_NR) {
4882                        spin_unlock(ptl);
4883                        return 0;
4884                }
4885                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
4886                if (target_type == MC_TARGET_PAGE) {
4887                        page = target.page;
4888                        if (!isolate_lru_page(page)) {
4889                                if (!mem_cgroup_move_account(page, true,
4890                                                             mc.from, mc.to)) {
4891                                        mc.precharge -= HPAGE_PMD_NR;
4892                                        mc.moved_charge += HPAGE_PMD_NR;
4893                                }
4894                                putback_lru_page(page);
4895                        }
4896                        put_page(page);
4897                }
4898                spin_unlock(ptl);
4899                return 0;
4900        }
4901
4902        if (pmd_trans_unstable(pmd))
4903                return 0;
4904retry:
4905        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4906        for (; addr != end; addr += PAGE_SIZE) {
4907                pte_t ptent = *(pte++);
4908                swp_entry_t ent;
4909
4910                if (!mc.precharge)
4911                        break;
4912
4913                switch (get_mctgt_type(vma, addr, ptent, &target)) {
4914                case MC_TARGET_PAGE:
4915                        page = target.page;
4916                        /*
4917                         * We can have a part of the split pmd here. Moving it
4918                         * could be done, but it would be too convoluted, so simply
4919                         * ignore such a partial THP and keep it in the original
4920                         * memcg. There should be somebody mapping the head page.
4921                         */
4922                        if (PageTransCompound(page))
4923                                goto put;
4924                        if (isolate_lru_page(page))
4925                                goto put;
4926                        if (!mem_cgroup_move_account(page, false,
4927                                                mc.from, mc.to)) {
4928                                mc.precharge--;
4929                                /* we uncharge from mc.from later. */
4930                                mc.moved_charge++;
4931                        }
4932                        putback_lru_page(page);
4933put:                    /* get_mctgt_type() gets the page */
4934                        put_page(page);
4935                        break;
4936                case MC_TARGET_SWAP:
4937                        ent = target.ent;
4938                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
4939                                mc.precharge--;
4940                                /* we fixup refcnts and charges later. */
4941                                mc.moved_swap++;
4942                        }
4943                        break;
4944                default:
4945                        break;
4946                }
4947        }
4948        pte_unmap_unlock(pte - 1, ptl);
4949        cond_resched();
4950
4951        if (addr != end) {
4952                /*
4953                 * We have consumed all the precharges we got in can_attach().
4954                 * We try to charge one page at a time, but don't do any
4955                 * additional charges to mc.to if we have already failed to
4956                 * charge once during the attach() phase.
4957                 */
4958                ret = mem_cgroup_do_precharge(1);
4959                if (!ret)
4960                        goto retry;
4961        }
4962
4963        return ret;
4964}
4965
4966static void mem_cgroup_move_charge(void)
4967{
4968        struct mm_walk mem_cgroup_move_charge_walk = {
4969                .pmd_entry = mem_cgroup_move_charge_pte_range,
4970                .mm = mc.mm,
4971        };
4972
4973        lru_add_drain_all();
4974        /*
4975         * Signal lock_page_memcg() to take the memcg's move_lock
4976         * while we're moving its pages to another memcg. Then wait
4977         * for already started RCU-only updates to finish.
4978         */
4979        atomic_inc(&mc.from->moving_account);
4980        synchronize_rcu();
4981retry:
4982        if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
4983                /*
4984                 * Someone who is holding the mmap_sem might be waiting on the
4985                 * waitq. So we cancel all extra charges, wake up all waiters,
4986                 * and retry. Because we cancel the precharges, we might not be
4987                 * able to move enough charge, but moving charge is a best-effort
4988                 * feature anyway, so it wouldn't be a big problem.
4989                 */
4990                __mem_cgroup_clear_mc();
4991                cond_resched();
4992                goto retry;
4993        }
4994        /*
4995         * When we have consumed all precharges and failed in doing
4996         * additional charge, the page walk just aborts.
4997         */
4998        walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
4999        up_read(&mc.mm->mmap_sem);
5000        atomic_dec(&mc.from->moving_account);
5001}
5002
5003static void mem_cgroup_move_task(void)
5004{
5005        if (mc.to) {
5006                mem_cgroup_move_charge();
5007                mem_cgroup_clear_mc();
5008        }
5009}
5010#else   /* !CONFIG_MMU */
5011static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5012{
5013        return 0;
5014}
5015static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5016{
5017}
5018static void mem_cgroup_move_task(void)
5019{
5020}
5021#endif
5022
5023/*
5024 * Cgroup retains root cgroups across [un]mount cycles making it necessary
5025 * to verify whether we're attached to the default hierarchy on each mount
5026 * attempt.
5027 */
5028static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5029{
5030        /*
5031         * use_hierarchy is forced on the default hierarchy.  cgroup core
5032         * guarantees that @root doesn't have any children, so turning it
5033         * on for the root memcg is enough.
5034         */
5035        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5036                root_mem_cgroup->use_hierarchy = true;
5037        else
5038                root_mem_cgroup->use_hierarchy = false;
5039}
5040
5041static u64 memory_current_read(struct cgroup_subsys_state *css,
5042                               struct cftype *cft)
5043{
5044        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5045
5046        return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5047}
5048
5049static int memory_low_show(struct seq_file *m, void *v)
5050{
5051        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5052        unsigned long low = READ_ONCE(memcg->low);
5053
5054        if (low == PAGE_COUNTER_MAX)
5055                seq_puts(m, "max\n");
5056        else
5057                seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5058
5059        return 0;
5060}
5061
5062static ssize_t memory_low_write(struct kernfs_open_file *of,
5063                                char *buf, size_t nbytes, loff_t off)
5064{
5065        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5066        unsigned long low;
5067        int err;
5068
5069        buf = strstrip(buf);
5070        err = page_counter_memparse(buf, "max", &low);
5071        if (err)
5072                return err;
5073
5074        memcg->low = low;
5075
5076        return nbytes;
5077}
5078
5079static int memory_high_show(struct seq_file *m, void *v)
5080{
5081        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5082        unsigned long high = READ_ONCE(memcg->high);
5083
5084        if (high == PAGE_COUNTER_MAX)
5085                seq_puts(m, "max\n");
5086        else
5087                seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5088
5089        return 0;
5090}
5091
5092static ssize_t memory_high_write(struct kernfs_open_file *of,
5093                                 char *buf, size_t nbytes, loff_t off)
5094{
5095        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5096        unsigned long nr_pages;
5097        unsigned long high;
5098        int err;
5099
5100        buf = strstrip(buf);
5101        err = page_counter_memparse(buf, "max", &high);
5102        if (err)
5103                return err;
5104
5105        memcg->high = high;
5106
5107        nr_pages = page_counter_read(&memcg->memory);
5108        if (nr_pages > high)
5109                try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5110                                             GFP_KERNEL, true);
5111
5112        memcg_wb_domain_size_changed(memcg);
5113        return nbytes;
5114}
5115
5116static int memory_max_show(struct seq_file *m, void *v)
5117{
5118        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5119        unsigned long max = READ_ONCE(memcg->memory.limit);
5120
5121        if (max == PAGE_COUNTER_MAX)
5122                seq_puts(m, "max\n");
5123        else
5124                seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5125
5126        return 0;
5127}
5128
5129static ssize_t memory_max_write(struct kernfs_open_file *of,
5130                                char *buf, size_t nbytes, loff_t off)
5131{
5132        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5133        unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5134        bool drained = false;
5135        unsigned long max;
5136        int err;
5137
5138        buf = strstrip(buf);
5139        err = page_counter_memparse(buf, "max", &max);
5140        if (err)
5141                return err;
5142
5143        xchg(&memcg->memory.limit, max);
5144
5145        for (;;) {
5146                unsigned long nr_pages = page_counter_read(&memcg->memory);
5147
5148                if (nr_pages <= max)
5149                        break;
5150
5151                if (signal_pending(current)) {
5152                        err = -EINTR;
5153                        break;
5154                }
5155
5156                if (!drained) {
5157                        drain_all_stock(memcg);
5158                        drained = true;
5159                        continue;
5160                }
5161
5162                if (nr_reclaims) {
5163                        if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5164                                                          GFP_KERNEL, true))
5165                                nr_reclaims--;
5166                        continue;
5167                }
5168
5169                mem_cgroup_events(memcg, MEMCG_OOM, 1);
5170                if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5171                        break;
5172        }
5173
5174        memcg_wb_domain_size_changed(memcg);
5175        return nbytes;
5176}
5177
5178static int memory_events_show(struct seq_file *m, void *v)
5179{
5180        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5181
5182        seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5183        seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5184        seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5185        seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5186
5187        return 0;
5188}
5189
5190static int memory_stat_show(struct seq_file *m, void *v)
5191{
5192        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5193        unsigned long stat[MEMCG_NR_STAT];
5194        unsigned long events[MEMCG_NR_EVENTS];
5195        int i;
5196
5197        /*
5198         * Provide statistics on the state of the memory subsystem as
5199         * well as cumulative event counters that show past behavior.
5200         *
5201         * This list is ordered following a combination of these gradients:
5202         * 1) generic big picture -> specifics and details
5203         * 2) reflecting userspace activity -> reflecting kernel heuristics
5204         *
5205         * Current memory state:
5206         */
5207
5208        tree_stat(memcg, stat);
5209        tree_events(memcg, events);
5210
5211        seq_printf(m, "anon %llu\n",
5212                   (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
5213        seq_printf(m, "file %llu\n",
5214                   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
5215        seq_printf(m, "kernel_stack %llu\n",
5216                   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
5217        seq_printf(m, "slab %llu\n",
5218                   (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
5219                         stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5220        seq_printf(m, "sock %llu\n",
5221                   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
5222
5223        seq_printf(m, "file_mapped %llu\n",
5224                   (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
5225        seq_printf(m, "file_dirty %llu\n",
5226                   (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
5227        seq_printf(m, "file_writeback %llu\n",
5228                   (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
5229
5230        for (i = 0; i < NR_LRU_LISTS; i++) {
5231                struct mem_cgroup *mi;
5232                unsigned long val = 0;
5233
5234                for_each_mem_cgroup_tree(mi, memcg)
5235                        val += mem_cgroup_nr_lru_pages(mi, BIT(i));
5236                seq_printf(m, "%s %llu\n",
5237                           mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
5238        }
5239
5240        seq_printf(m, "slab_reclaimable %llu\n",
5241                   (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
5242        seq_printf(m, "slab_unreclaimable %llu\n",
5243                   (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5244
5245        /* Accumulated memory events */
5246
5247        seq_printf(m, "pgfault %lu\n",
5248                   events[MEM_CGROUP_EVENTS_PGFAULT]);
5249        seq_printf(m, "pgmajfault %lu\n",
5250                   events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
5251
5252        return 0;
5253}
5254
5255static struct cftype memory_files[] = {
5256        {
5257                .name = "current",
5258                .flags = CFTYPE_NOT_ON_ROOT,
5259                .read_u64 = memory_current_read,
5260        },
5261        {
5262                .name = "low",
5263                .flags = CFTYPE_NOT_ON_ROOT,
5264                .seq_show = memory_low_show,
5265                .write = memory_low_write,
5266        },
5267        {
5268                .name = "high",
5269                .flags = CFTYPE_NOT_ON_ROOT,
5270                .seq_show = memory_high_show,
5271                .write = memory_high_write,
5272        },
5273        {
5274                .name = "max",
5275                .flags = CFTYPE_NOT_ON_ROOT,
5276                .seq_show = memory_max_show,
5277                .write = memory_max_write,
5278        },
5279        {
5280                .name = "events",
5281                .flags = CFTYPE_NOT_ON_ROOT,
5282                .file_offset = offsetof(struct mem_cgroup, events_file),
5283                .seq_show = memory_events_show,
5284        },
5285        {
5286                .name = "stat",
5287                .flags = CFTYPE_NOT_ON_ROOT,
5288                .seq_show = memory_stat_show,
5289        },
5290        { }     /* terminate */
5291};
5292
5293struct cgroup_subsys memory_cgrp_subsys = {
5294        .css_alloc = mem_cgroup_css_alloc,
5295        .css_online = mem_cgroup_css_online,
5296        .css_offline = mem_cgroup_css_offline,
5297        .css_released = mem_cgroup_css_released,
5298        .css_free = mem_cgroup_css_free,
5299        .css_reset = mem_cgroup_css_reset,
5300        .can_attach = mem_cgroup_can_attach,
5301        .cancel_attach = mem_cgroup_cancel_attach,
5302        .post_attach = mem_cgroup_move_task,
5303        .bind = mem_cgroup_bind,
5304        .dfl_cftypes = memory_files,
5305        .legacy_cftypes = mem_cgroup_legacy_files,
5306        .early_init = 0,
5307};
5308
5309/**
5310 * mem_cgroup_low - check if memory consumption is below the normal range
5311 * @root: the highest ancestor to consider
5312 * @memcg: the memory cgroup to check
5313 *
5314 * Returns %true if memory consumption of @memcg, and that of all
5315 * configurable ancestors up to @root, is below the normal range.
5316 */
5317bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5318{
5319        if (mem_cgroup_disabled())
5320                return false;
5321
5322        /*
5323         * The toplevel group doesn't have a configurable range, so
5324         * it's never low when looked at directly, and it is not
5325         * considered an ancestor when assessing the hierarchy.
5326         */
5327
5328        if (memcg == root_mem_cgroup)
5329                return false;
5330
5331        if (page_counter_read(&memcg->memory) >= memcg->low)
5332                return false;
5333
5334        while (memcg != root) {
5335                memcg = parent_mem_cgroup(memcg);
5336
5337                if (memcg == root_mem_cgroup)
5338                        break;
5339
5340                if (page_counter_read(&memcg->memory) >= memcg->low)
5341                        return false;
5342        }
5343        return true;
5344}
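
    /*
     * Illustrative sketch, not a callsite in this file ("root", "memcg"
     * and the surrounding hierarchy walk are assumed to be provided by
     * the reclaim code): a walker can skip groups that are still below
     * their protected range and record the event:
     *
     *        if (mem_cgroup_low(root, memcg)) {
     *                mem_cgroup_events(memcg, MEMCG_LOW, 1);
     *                continue;
     *        }
     */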
5345
5346/**
5347 * mem_cgroup_try_charge - try charging a page
5348 * @page: page to charge
5349 * @mm: mm context of the victim
5350 * @gfp_mask: reclaim mode
5351 * @memcgp: charged memcg return
5352 * @compound: charge the page as compound or small page
5353 *
5354 * Try to charge @page to the memcg that @mm belongs to, reclaiming
5355 * pages according to @gfp_mask if necessary.
5356 *
5357 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5358 * Otherwise, an error code is returned.
5359 *
5360 * After page->mapping has been set up, the caller must finalize the
5361 * charge with mem_cgroup_commit_charge().  Or abort the transaction
5362 * with mem_cgroup_cancel_charge() in case page instantiation fails.
5363 */
5364int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5365                          gfp_t gfp_mask, struct mem_cgroup **memcgp,
5366                          bool compound)
5367{
5368        struct mem_cgroup *memcg = NULL;
5369        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5370        int ret = 0;
5371
5372        if (mem_cgroup_disabled())
5373                goto out;
5374
5375        if (PageSwapCache(page)) {
5376                /*
5377                 * Every swap fault against a single page tries to charge the
5378                 * page, bail as early as possible.  shmem_unuse() encounters
5379                 * already charged pages, too.  The USED bit is protected by
5380                 * the page lock, which serializes swap cache removal, which
5381                 * in turn serializes uncharging.
5382                 */
5383                VM_BUG_ON_PAGE(!PageLocked(page), page);
5384                if (page->mem_cgroup)
5385                        goto out;
5386
5387                if (do_swap_account) {
5388                        swp_entry_t ent = { .val = page_private(page), };
5389                        unsigned short id = lookup_swap_cgroup_id(ent);
5390
5391                        rcu_read_lock();
5392                        memcg = mem_cgroup_from_id(id);
5393                        if (memcg && !css_tryget_online(&memcg->css))
5394                                memcg = NULL;
5395                        rcu_read_unlock();
5396                }
5397        }
5398
5399        if (!memcg)
5400                memcg = get_mem_cgroup_from_mm(mm);
5401
5402        ret = try_charge(memcg, gfp_mask, nr_pages);
5403
5404        css_put(&memcg->css);
5405out:
5406        *memcgp = memcg;
5407        return ret;
5408}
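
    /*
     * Illustrative sketch of the charge transaction ("page", "mm" and
     * the rmap step are assumed caller context, not code from this
     * file): charge first, commit once the page is set up, or cancel
     * if instantiation fails:
     *
     *        struct mem_cgroup *memcg;
     *
     *        if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
     *                goto oom;
     *        ...install the page, set up page->mapping / rmap...
     *        mem_cgroup_commit_charge(page, memcg, false, false);
     *
     * and on failure after a successful try_charge:
     *
     *        mem_cgroup_cancel_charge(page, memcg, false);
     */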
5409
5410/**
5411 * mem_cgroup_commit_charge - commit a page charge
5412 * @page: page to charge
5413 * @memcg: memcg to charge the page to
5414 * @lrucare: page might be on LRU already
5415 * @compound: charge the page as compound or small page
5416 *
5417 * Finalize a charge transaction started by mem_cgroup_try_charge(),
5418 * after page->mapping has been set up.  This must happen atomically
5419 * as part of the page instantiation, i.e. under the page table lock
5420 * for anonymous pages, under the page lock for page and swap cache.
5421 *
5422 * In addition, the page must not be on the LRU during the commit, to
5423 * prevent racing with task migration.  If it might be, use @lrucare.
5424 *
5425 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5426 */
5427void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5428                              bool lrucare, bool compound)
5429{
5430        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5431
5432        VM_BUG_ON_PAGE(!page->mapping, page);
5433        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5434
5435        if (mem_cgroup_disabled())
5436                return;
5437        /*
5438         * Swap faults will attempt to charge the same page multiple
5439         * times.  But reuse_swap_page() might have removed the page
5440         * from swapcache already, so we can't check PageSwapCache().
5441         */
5442        if (!memcg)
5443                return;
5444
5445        commit_charge(page, memcg, lrucare);
5446
5447        local_irq_disable();
5448        mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5449        memcg_check_events(memcg, page);
5450        local_irq_enable();
5451
5452        if (do_memsw_account() && PageSwapCache(page)) {
5453                swp_entry_t entry = { .val = page_private(page) };
5454                /*
5455                 * The swap entry might not get freed for a long time,
5456                 * let's not wait for it.  The page already received a
5457                 * memory+swap charge, drop the swap entry duplicate.
5458                 */
5459                mem_cgroup_uncharge_swap(entry);
5460        }
5461}
5462
5463/**
5464 * mem_cgroup_cancel_charge - cancel a page charge
5465 * @page: page to charge
5466 * @memcg: memcg to charge the page to
5467 * @compound: charge the page as compound or small page
5468 *
5469 * Cancel a charge transaction started by mem_cgroup_try_charge().
5470 */
5471void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5472                bool compound)
5473{
5474        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5475
5476        if (mem_cgroup_disabled())
5477                return;
5478        /*
5479         * Swap faults will attempt to charge the same page multiple
5480         * times.  But reuse_swap_page() might have removed the page
5481         * from swapcache already, so we can't check PageSwapCache().
5482         */
5483        if (!memcg)
5484                return;
5485
5486        cancel_charge(memcg, nr_pages);
5487}
5488
5489static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5490                           unsigned long nr_anon, unsigned long nr_file,
5491                           unsigned long nr_huge, unsigned long nr_kmem,
5492                           struct page *dummy_page)
5493{
5494        unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
5495        unsigned long flags;
5496
5497        if (!mem_cgroup_is_root(memcg)) {
5498                page_counter_uncharge(&memcg->memory, nr_pages);
5499                if (do_memsw_account())
5500                        page_counter_uncharge(&memcg->memsw, nr_pages);
5501                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
5502                        page_counter_uncharge(&memcg->kmem, nr_kmem);
5503                memcg_oom_recover(memcg);
5504        }
5505
5506        local_irq_save(flags);
5507        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
5508        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
5509        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
5510        __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
5511        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
5512        memcg_check_events(memcg, dummy_page);
5513        local_irq_restore(flags);
5514
5515        if (!mem_cgroup_is_root(memcg))
5516                css_put_many(&memcg->css, nr_pages);
5517}
5518
5519static void uncharge_list(struct list_head *page_list)
5520{
5521        struct mem_cgroup *memcg = NULL;
5522        unsigned long nr_anon = 0;
5523        unsigned long nr_file = 0;
5524        unsigned long nr_huge = 0;
5525        unsigned long nr_kmem = 0;
5526        unsigned long pgpgout = 0;
5527        struct list_head *next;
5528        struct page *page;
5529
5530        /*
5531         * Note that the list can be a single page->lru; hence the
5532         * do-while loop instead of a simple list_for_each_entry().
5533         */
5534        next = page_list->next;
5535        do {
5536                page = list_entry(next, struct page, lru);
5537                next = page->lru.next;
5538
5539                VM_BUG_ON_PAGE(PageLRU(page), page);
5540                VM_BUG_ON_PAGE(page_count(page), page);
5541
5542                if (!page->mem_cgroup)
5543                        continue;
5544
5545                /*
5546                 * Nobody should be changing or seriously looking at
5547                 * page->mem_cgroup at this point; we have fully
5548                 * exclusive access to the page.
5549                 */
5550
5551                if (memcg != page->mem_cgroup) {
5552                        if (memcg) {
5553                                uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5554                                               nr_huge, nr_kmem, page);
5555                                pgpgout = nr_anon = nr_file =
5556                                        nr_huge = nr_kmem = 0;
5557                        }
5558                        memcg = page->mem_cgroup;
5559                }
5560
5561                if (!PageKmemcg(page)) {
5562                        unsigned int nr_pages = 1;
5563
5564                        if (PageTransHuge(page)) {
5565                                nr_pages <<= compound_order(page);
5566                                nr_huge += nr_pages;
5567                        }
5568                        if (PageAnon(page))
5569                                nr_anon += nr_pages;
5570                        else
5571                                nr_file += nr_pages;
5572                        pgpgout++;
5573                } else {
5574                        nr_kmem += 1 << compound_order(page);
5575                        __ClearPageKmemcg(page);
5576                }
5577
5578                page->mem_cgroup = NULL;
5579        } while (next != page_list);
5580
5581        if (memcg)
5582                uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5583                               nr_huge, nr_kmem, page);
5584}
5585
5586/**
5587 * mem_cgroup_uncharge - uncharge a page
5588 * @page: page to uncharge
5589 *
5590 * Uncharge a page previously charged with mem_cgroup_try_charge() and
5591 * mem_cgroup_commit_charge().
5592 */
5593void mem_cgroup_uncharge(struct page *page)
5594{
5595        if (mem_cgroup_disabled())
5596                return;
5597
5598        /* Don't touch page->lru of any random page, pre-check: */
5599        if (!page->mem_cgroup)
5600                return;
5601
5602        INIT_LIST_HEAD(&page->lru);
5603        uncharge_list(&page->lru);
5604}
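
    /*
     * Illustrative sketch (assumed caller shape; free_hot_cold_page()
     * lives outside this file): the final put of a page uncharges it
     * right before it is handed back to the page allocator:
     *
     *        mem_cgroup_uncharge(page);
     *        free_hot_cold_page(page, false);
     */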
5605
5606/**
5607 * mem_cgroup_uncharge_list - uncharge a list of page
5608 * @page_list: list of pages to uncharge
5609 *
5610 * Uncharge a list of pages previously charged with
5611 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
5612 */
5613void mem_cgroup_uncharge_list(struct list_head *page_list)
5614{
5615        if (mem_cgroup_disabled())
5616                return;
5617
5618        if (!list_empty(page_list))
5619                uncharge_list(page_list);
5620}
5621
5622/**
5623 * mem_cgroup_migrate - charge a page's replacement
5624 * @oldpage: currently circulating page
5625 * @newpage: replacement page
5626 *
5627 * Charge @newpage as a replacement page for @oldpage. @oldpage will
5628 * be uncharged upon free.
5629 *
5630 * Both pages must be locked, @newpage->mapping must be set up.
5631 */
5632void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
5633{
5634        struct mem_cgroup *memcg;
5635        unsigned int nr_pages;
5636        bool compound;
5637        unsigned long flags;
5638
5639        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5640        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5641        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5642        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5643                       newpage);
5644
5645        if (mem_cgroup_disabled())
5646                return;
5647
5648        /* Page cache replacement: new page already charged? */
5649        if (newpage->mem_cgroup)
5650                return;
5651
5652        /* Swapcache readahead pages can get replaced before being charged */
5653        memcg = oldpage->mem_cgroup;
5654        if (!memcg)
5655                return;
5656
5657        /* Force-charge the new page. The old one will be freed soon */
5658        compound = PageTransHuge(newpage);
5659        nr_pages = compound ? hpage_nr_pages(newpage) : 1;
5660
5661        page_counter_charge(&memcg->memory, nr_pages);
5662        if (do_memsw_account())
5663                page_counter_charge(&memcg->memsw, nr_pages);
5664        css_get_many(&memcg->css, nr_pages);
5665
5666        commit_charge(newpage, memcg, false);
5667
5668        local_irq_save(flags);
5669        mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
5670        memcg_check_events(memcg, newpage);
5671        local_irq_restore(flags);
5672}
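
    /*
     * Illustrative sketch (assumed caller shape): once the replacement
     * page is locked and its ->mapping is set up, a migration-style
     * caller only needs
     *
     *        mem_cgroup_migrate(oldpage, newpage);
     *
     * and lets the final put of the old page take care of its uncharge.
     */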
5673
5674DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
5675EXPORT_SYMBOL(memcg_sockets_enabled_key);
5676
5677void sock_update_memcg(struct sock *sk)
5678{
5679        struct mem_cgroup *memcg;
5680
5681        /* Socket cloning can throw us here with sk->sk_memcg already
5682         * filled. It won't, however, necessarily happen from process
5683         * context, so testing whether the current task's memcg is the
5684         * root memcg won't help us in this case.
5685         *
5686         * Respecting the original socket's memcg is a better
5687         * decision in this case.
5688         */
5689        if (sk->sk_memcg) {
5690                BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
5691                css_get(&sk->sk_memcg->css);
5692                return;
5693        }
5694
5695        rcu_read_lock();
5696        memcg = mem_cgroup_from_task(current);
5697        if (memcg == root_mem_cgroup)
5698                goto out;
5699        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
5700                goto out;
5701        if (css_tryget_online(&memcg->css))
5702                sk->sk_memcg = memcg;
5703out:
5704        rcu_read_unlock();
5705}
5706EXPORT_SYMBOL(sock_update_memcg);
5707
5708void sock_release_memcg(struct sock *sk)
5709{
5710        WARN_ON(!sk->sk_memcg);
5711        css_put(&sk->sk_memcg->css);
5712}
5713
5714/**
5715 * mem_cgroup_charge_skmem - charge socket memory
5716 * @memcg: memcg to charge
5717 * @nr_pages: number of pages to charge
5718 *
5719 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
5720 * @memcg's configured limit, %false if the charge had to be forced.
5721 */
5722bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5723{
5724        gfp_t gfp_mask = GFP_KERNEL;
5725
5726        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5727                struct page_counter *fail;
5728
5729                if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
5730                        memcg->tcpmem_pressure = 0;
5731                        return true;
5732                }
5733                page_counter_charge(&memcg->tcpmem, nr_pages);
5734                memcg->tcpmem_pressure = 1;
5735                return false;
5736        }
5737
5738        /* Don't block in the packet receive path */
5739        if (in_softirq())
5740                gfp_mask = GFP_NOWAIT;
5741
5742        this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
5743
5744        if (try_charge(memcg, gfp_mask, nr_pages) == 0)
5745                return true;
5746
5747        try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
5748        return false;
5749}
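
    /*
     * Illustrative sketch (assumed networking caller shape; the
     * mem_cgroup_sockets_enabled check lives in the header, outside
     * this file): a false return means the charge had to be forced and
     * the caller should back off:
     *
     *        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
     *            !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages))
     *                ...suppress the allocation / signal pressure...
     */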
5750
5751/**
5752 * mem_cgroup_uncharge_skmem - uncharge socket memory
5753 * @memcg: memcg to uncharge
5754 * @nr_pages: number of pages to uncharge
5755 */
5756void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5757{
5758        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5759                page_counter_uncharge(&memcg->tcpmem, nr_pages);
5760                return;
5761        }
5762
5763        this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
5764
5765        page_counter_uncharge(&memcg->memory, nr_pages);
5766        css_put_many(&memcg->css, nr_pages);
5767}
5768
5769static int __init cgroup_memory(char *s)
5770{
5771        char *token;
5772
5773        while ((token = strsep(&s, ",")) != NULL) {
5774                if (!*token)
5775                        continue;
5776                if (!strcmp(token, "nosocket"))
5777                        cgroup_memory_nosocket = true;
5778                if (!strcmp(token, "nokmem"))
5779                        cgroup_memory_nokmem = true;
5780        }
5781        return 0;
5782}
5783__setup("cgroup.memory=", cgroup_memory);
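
    /*
     * Example: booting with "cgroup.memory=nosocket,nokmem" sets both
     * cgroup_memory_nosocket and cgroup_memory_nokmem.
     */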
5784
5785/*
5786 * subsys_initcall() for memory controller.
5787 *
5788 * Some parts like hotcpu_notifier() have to be initialized from this context
5789 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
5790 * everything that doesn't depend on a specific mem_cgroup structure should
5791 * be initialized from here.
5792 */
5793static int __init mem_cgroup_init(void)
5794{
5795        int cpu, node;
5796
5797        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5798
5799        for_each_possible_cpu(cpu)
5800                INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5801                          drain_local_stock);
5802
5803        for_each_node(node) {
5804                struct mem_cgroup_tree_per_node *rtpn;
5805
5806                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5807                                    node_online(node) ? node : NUMA_NO_NODE);
5808
5809                rtpn->rb_root = RB_ROOT;
5810                spin_lock_init(&rtpn->lock);
5811                soft_limit_tree.rb_tree_per_node[node] = rtpn;
5812        }
5813
5814        return 0;
5815}
5816subsys_initcall(mem_cgroup_init);
5817
5818#ifdef CONFIG_MEMCG_SWAP
5819static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
5820{
5821        while (!atomic_inc_not_zero(&memcg->id.ref)) {
5822                /*
5823                 * The root cgroup cannot be destroyed, so its refcount must
5824                 * always be >= 1.
5825                 */
5826                if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
5827                        VM_BUG_ON(1);
5828                        break;
5829                }
5830                memcg = parent_mem_cgroup(memcg);
5831                if (!memcg)
5832                        memcg = root_mem_cgroup;
5833        }
5834        return memcg;
5835}
5836
5837/**
5838 * mem_cgroup_swapout - transfer a memsw charge to swap
5839 * @page: page whose memsw charge to transfer
5840 * @entry: swap entry to move the charge to
5841 *
5842 * Transfer the memsw charge of @page to @entry.
5843 */
5844void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5845{
5846        struct mem_cgroup *memcg, *swap_memcg;
5847        unsigned short oldid;
5848
5849        VM_BUG_ON_PAGE(PageLRU(page), page);
5850        VM_BUG_ON_PAGE(page_count(page), page);
5851
5852        if (!do_memsw_account())
5853                return;
5854
5855        memcg = page->mem_cgroup;
5856
5857        /* Readahead page, never charged */
5858        if (!memcg)
5859                return;
5860
5861        /*
5862         * In case the memcg owning these pages has been offlined and doesn't
5863         * have an ID allocated to it anymore, charge the closest online
5864         * ancestor for the swap instead and transfer the memory+swap charge.
5865         */
5866        swap_memcg = mem_cgroup_id_get_online(memcg);
5867        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
5868        VM_BUG_ON_PAGE(oldid, page);
5869        mem_cgroup_swap_statistics(swap_memcg, true);
5870
5871        page->mem_cgroup = NULL;
5872
5873        if (!mem_cgroup_is_root(memcg))
5874                page_counter_uncharge(&memcg->memory, 1);
5875
5876        if (memcg != swap_memcg) {
5877                if (!mem_cgroup_is_root(swap_memcg))
5878                        page_counter_charge(&swap_memcg->memsw, 1);
5879                page_counter_uncharge(&memcg->memsw, 1);
5880        }
5881
5882        /*
5883         * Interrupts should be disabled here because the caller holds
5884         * mapping->tree_lock, which is taken with interrupts off. It is
5885         * important to have interrupts disabled because it is the
5886         * only synchronisation we have for updating the per-CPU variables.
5887         */
5888        VM_BUG_ON(!irqs_disabled());
5889        mem_cgroup_charge_statistics(memcg, page, false, -1);
5890        memcg_check_events(memcg, page);
5891
5892        if (!mem_cgroup_is_root(memcg))
5893                css_put(&memcg->css);
5894}
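
    /*
     * Illustrative sketch (assumed reclaim-side caller shape;
     * __delete_from_swap_cache() lives outside this file): the hand-off
     * happens under mapping->tree_lock, right before the page leaves
     * the swap cache:
     *
     *        swp_entry_t swap = { .val = page_private(page) };
     *
     *        mem_cgroup_swapout(page, swap);
     *        __delete_from_swap_cache(page);
     */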
5895
5896/**
5897 * mem_cgroup_try_charge_swap - try charging a swap entry
5898 * @page: page being added to swap
5899 * @entry: swap entry to charge
5900 *
5901 * Try to charge @entry to the memcg that @page belongs to.
5902 *
5903 * Returns 0 on success, -ENOMEM on failure.
5904 */
5905int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
5906{
5907        struct mem_cgroup *memcg;
5908        struct page_counter *counter;
5909        unsigned short oldid;
5910
5911        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
5912                return 0;
5913
5914        memcg = page->mem_cgroup;
5915
5916        /* Readahead page, never charged */
5917        if (!memcg)
5918                return 0;
5919
5920        memcg = mem_cgroup_id_get_online(memcg);
5921
5922        if (!mem_cgroup_is_root(memcg) &&
5923            !page_counter_try_charge(&memcg->swap, 1, &counter)) {
5924                mem_cgroup_id_put(memcg);
5925                return -ENOMEM;
5926        }
5927
5928        oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5929        VM_BUG_ON_PAGE(oldid, page);
5930        mem_cgroup_swap_statistics(memcg, true);
5931
5932        return 0;
5933}
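
    /*
     * Illustrative sketch (assumed caller shape; swapcache_free() lives
     * outside this file): when a page is added to swap, the entry is
     * charged first and given back if this cgroup is out of swap:
     *
     *        if (mem_cgroup_try_charge_swap(page, entry)) {
     *                swapcache_free(entry);
     *                goto fail;
     *        }
     */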
5934
5935/**
5936 * mem_cgroup_uncharge_swap - uncharge a swap entry
5937 * @entry: swap entry to uncharge
5938 *
5939 * Drop the swap charge associated with @entry.
5940 */
5941void mem_cgroup_uncharge_swap(swp_entry_t entry)
5942{
5943        struct mem_cgroup *memcg;
5944        unsigned short id;
5945
5946        if (!do_swap_account)
5947                return;
5948
5949        id = swap_cgroup_record(entry, 0);
5950        rcu_read_lock();
5951        memcg = mem_cgroup_from_id(id);
5952        if (memcg) {
5953                if (!mem_cgroup_is_root(memcg)) {
5954                        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5955                                page_counter_uncharge(&memcg->swap, 1);
5956                        else
5957                                page_counter_uncharge(&memcg->memsw, 1);
5958                }
5959                mem_cgroup_swap_statistics(memcg, false);
5960                mem_cgroup_id_put(memcg);
5961        }
5962        rcu_read_unlock();
5963}
5964
5965long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5966{
5967        long nr_swap_pages = get_nr_swap_pages();
5968
5969        if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5970                return nr_swap_pages;
5971        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5972                nr_swap_pages = min_t(long, nr_swap_pages,
5973                                      READ_ONCE(memcg->swap.limit) -
5974                                      page_counter_read(&memcg->swap));
5975        return nr_swap_pages;
5976}
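
    /*
     * Illustrative sketch (assumed reclaim-side use): before scanning
     * the anonymous LRUs, reclaim can check whether this hierarchy is
     * still allowed to swap at all:
     *
     *        if (mem_cgroup_get_nr_swap_pages(memcg) <= 0)
     *                ...scan file pages only...
     */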
5977
5978bool mem_cgroup_swap_full(struct page *page)
5979{
5980        struct mem_cgroup *memcg;
5981
5982        VM_BUG_ON_PAGE(!PageLocked(page), page);
5983
5984        if (vm_swap_full())
5985                return true;
5986        if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5987                return false;
5988
5989        memcg = page->mem_cgroup;
5990        if (!memcg)
5991                return false;
5992
5993        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5994                if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
5995                        return true;
5996
5997        return false;
5998}
5999
6000/* for remembering the boot option */
6001#ifdef CONFIG_MEMCG_SWAP_ENABLED
6002static int really_do_swap_account __initdata = 1;
6003#else
6004static int really_do_swap_account __initdata;
6005#endif
6006
6007static int __init enable_swap_account(char *s)
6008{
6009        if (!strcmp(s, "1"))
6010                really_do_swap_account = 1;
6011        else if (!strcmp(s, "0"))
6012                really_do_swap_account = 0;
6013        return 1;
6014}
6015__setup("swapaccount=", enable_swap_account);
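
    /*
     * Example: "swapaccount=0" on the kernel command line keeps swap
     * accounting disabled even with CONFIG_MEMCG_SWAP_ENABLED set, and
     * "swapaccount=1" enables it on kernels built without that default.
     */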
6016
6017static u64 swap_current_read(struct cgroup_subsys_state *css,
6018                             struct cftype *cft)
6019{
6020        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6021
6022        return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
6023}
6024
6025static int swap_max_show(struct seq_file *m, void *v)
6026{
6027        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6028        unsigned long max = READ_ONCE(memcg->swap.limit);
6029
6030        if (max == PAGE_COUNTER_MAX)
6031                seq_puts(m, "max\n");
6032        else
6033                seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6034
6035        return 0;
6036}
6037
6038static ssize_t swap_max_write(struct kernfs_open_file *of,
6039                              char *buf, size_t nbytes, loff_t off)
6040{
6041        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6042        unsigned long max;
6043        int err;
6044
6045        buf = strstrip(buf);
6046        err = page_counter_memparse(buf, "max", &max);
6047        if (err)
6048                return err;
6049
6050        mutex_lock(&memcg_limit_mutex);
6051        err = page_counter_limit(&memcg->swap, max);
6052        mutex_unlock(&memcg_limit_mutex);
6053        if (err)
6054                return err;
6055
6056        return nbytes;
6057}
6058
6059static struct cftype swap_files[] = {
6060        {
6061                .name = "swap.current",
6062                .flags = CFTYPE_NOT_ON_ROOT,
6063                .read_u64 = swap_current_read,
6064        },
6065        {
6066                .name = "swap.max",
6067                .flags = CFTYPE_NOT_ON_ROOT,
6068                .seq_show = swap_max_show,
6069                .write = swap_max_write,
6070        },
6071        { }     /* terminate */
6072};
6073
6074static struct cftype memsw_cgroup_files[] = {
6075        {
6076                .name = "memsw.usage_in_bytes",
6077                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6078                .read_u64 = mem_cgroup_read_u64,
6079        },
6080        {
6081                .name = "memsw.max_usage_in_bytes",
6082                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6083                .write = mem_cgroup_reset,
6084                .read_u64 = mem_cgroup_read_u64,
6085        },
6086        {
6087                .name = "memsw.limit_in_bytes",
6088                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6089                .write = mem_cgroup_write,
6090                .read_u64 = mem_cgroup_read_u64,
6091        },
6092        {
6093                .name = "memsw.failcnt",
6094                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6095                .write = mem_cgroup_reset,
6096                .read_u64 = mem_cgroup_read_u64,
6097        },
6098        { },    /* terminate */
6099};
6100
6101static int __init mem_cgroup_swap_init(void)
6102{
6103        if (!mem_cgroup_disabled() && really_do_swap_account) {
6104                do_swap_account = 1;
6105                WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6106                                               swap_files));
6107                WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6108                                                  memsw_cgroup_files));
6109        }
6110        return 0;
6111}
6112subsys_initcall(mem_cgroup_swap_init);
6113
6114#endif /* CONFIG_MEMCG_SWAP */
6115