linux/mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * Memory thresholds
  10 * Copyright (C) 2009 Nokia Corporation
  11 * Author: Kirill A. Shutemov
  12 *
  13 * Kernel Memory Controller
  14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15 * Authors: Glauber Costa and Suleiman Souhlal
  16 *
  17 * Native page reclaim
  18 * Charge lifetime sanitation
  19 * Lockless page tracking & accounting
  20 * Unified hierarchy configuration model
  21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  22 *
  23 * This program is free software; you can redistribute it and/or modify
  24 * it under the terms of the GNU General Public License as published by
  25 * the Free Software Foundation; either version 2 of the License, or
  26 * (at your option) any later version.
  27 *
  28 * This program is distributed in the hope that it will be useful,
  29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  31 * GNU General Public License for more details.
  32 */
  33
  34#include <linux/page_counter.h>
  35#include <linux/memcontrol.h>
  36#include <linux/cgroup.h>
  37#include <linux/mm.h>
  38#include <linux/sched/mm.h>
  39#include <linux/shmem_fs.h>
  40#include <linux/hugetlb.h>
  41#include <linux/pagemap.h>
  42#include <linux/smp.h>
  43#include <linux/page-flags.h>
  44#include <linux/backing-dev.h>
  45#include <linux/bit_spinlock.h>
  46#include <linux/rcupdate.h>
  47#include <linux/limits.h>
  48#include <linux/export.h>
  49#include <linux/mutex.h>
  50#include <linux/rbtree.h>
  51#include <linux/slab.h>
  52#include <linux/swap.h>
  53#include <linux/swapops.h>
  54#include <linux/spinlock.h>
  55#include <linux/eventfd.h>
  56#include <linux/poll.h>
  57#include <linux/sort.h>
  58#include <linux/fs.h>
  59#include <linux/seq_file.h>
  60#include <linux/vmpressure.h>
  61#include <linux/mm_inline.h>
  62#include <linux/swap_cgroup.h>
  63#include <linux/cpu.h>
  64#include <linux/oom.h>
  65#include <linux/lockdep.h>
  66#include <linux/file.h>
  67#include <linux/tracehook.h>
  68#include "internal.h"
  69#include <net/sock.h>
  70#include <net/ip.h>
  71#include "slab.h"
  72
  73#include <linux/uaccess.h>
  74
  75#include <trace/events/vmscan.h>
  76
  77struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  78EXPORT_SYMBOL(memory_cgrp_subsys);
  79
  80struct mem_cgroup *root_mem_cgroup __read_mostly;
  81
  82#define MEM_CGROUP_RECLAIM_RETRIES      5
  83
  84/* Socket memory accounting disabled? */
  85static bool cgroup_memory_nosocket;
  86
  87/* Kernel memory accounting disabled? */
  88static bool cgroup_memory_nokmem;
  89
  90/* Whether the swap controller is active */
  91#ifdef CONFIG_MEMCG_SWAP
  92int do_swap_account __read_mostly;
  93#else
  94#define do_swap_account         0
  95#endif
  96
  97/* Whether legacy memory+swap accounting is active */
  98static bool do_memsw_account(void)
  99{
 100        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
 101}
 102
 103static const char *const mem_cgroup_lru_names[] = {
 104        "inactive_anon",
 105        "active_anon",
 106        "inactive_file",
 107        "active_file",
 108        "unevictable",
 109};
 110
 111#define THRESHOLDS_EVENTS_TARGET 128
 112#define SOFTLIMIT_EVENTS_TARGET 1024
 113#define NUMAINFO_EVENTS_TARGET  1024
 114
 115/*
 116 * Cgroups above their limits are maintained in a RB-Tree, independent of
 117 * their hierarchy representation
 118 */
 119
 120struct mem_cgroup_tree_per_node {
 121        struct rb_root rb_root;
 122        struct rb_node *rb_rightmost;
 123        spinlock_t lock;
 124};
 125
 126struct mem_cgroup_tree {
 127        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 128};
 129
 130static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 131
 132/* for OOM */
 133struct mem_cgroup_eventfd_list {
 134        struct list_head list;
 135        struct eventfd_ctx *eventfd;
 136};
 137
 138/*
  139 * cgroup_event represents events which userspace wants to receive.
 140 */
 141struct mem_cgroup_event {
 142        /*
 143         * memcg which the event belongs to.
 144         */
 145        struct mem_cgroup *memcg;
 146        /*
 147         * eventfd to signal userspace about the event.
 148         */
 149        struct eventfd_ctx *eventfd;
 150        /*
  151         * Each of these is stored in a list by the cgroup.
 152         */
 153        struct list_head list;
 154        /*
  155         * register_event() callback will be used to add a new userspace
  156         * waiter for changes related to this event.  Use eventfd_signal()
  157         * on eventfd to send a notification to userspace.
 158         */
 159        int (*register_event)(struct mem_cgroup *memcg,
 160                              struct eventfd_ctx *eventfd, const char *args);
 161        /*
 162         * unregister_event() callback will be called when userspace closes
  163         * the eventfd or when the cgroup is removed.  This callback must be
  164         * set if you want to provide notification functionality.
 165         */
 166        void (*unregister_event)(struct mem_cgroup *memcg,
 167                                 struct eventfd_ctx *eventfd);
 168        /*
  169         * All fields below are needed to unregister the event when
  170         * userspace closes the eventfd.
 171         */
 172        poll_table pt;
 173        wait_queue_head_t *wqh;
 174        wait_queue_entry_t wait;
 175        struct work_struct remove;
 176};
 177
 178static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 179static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 180
  181/* Stuff for move charges at task migration. */
 182/*
 183 * Types of charges to be moved.
 184 */
 185#define MOVE_ANON       0x1U
 186#define MOVE_FILE       0x2U
 187#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 188
 189/* "mc" and its members are protected by cgroup_mutex */
 190static struct move_charge_struct {
 191        spinlock_t        lock; /* for from, to */
 192        struct mm_struct  *mm;
 193        struct mem_cgroup *from;
 194        struct mem_cgroup *to;
 195        unsigned long flags;
 196        unsigned long precharge;
 197        unsigned long moved_charge;
 198        unsigned long moved_swap;
 199        struct task_struct *moving_task;        /* a task moving charges */
 200        wait_queue_head_t waitq;                /* a waitq for other context */
 201} mc = {
 202        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 203        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 204};
 205
 206/*
  207 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 208 * limit reclaim to prevent infinite loops, if they ever occur.
 209 */
 210#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 211#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 212
 213enum charge_type {
 214        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 215        MEM_CGROUP_CHARGE_TYPE_ANON,
 216        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 217        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 218        NR_CHARGE_TYPE,
 219};
 220
 221/* for encoding cft->private value on file */
 222enum res_type {
 223        _MEM,
 224        _MEMSWAP,
 225        _OOM_TYPE,
 226        _KMEM,
 227        _TCP,
 228};
 229
 230#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 231#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 232#define MEMFILE_ATTR(val)       ((val) & 0xffff)
  233/* Used for OOM notifier */
 234#define OOM_CONTROL             (0)
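/*
 * Illustrative sketch, not part of the kernel source: how a cft->private
 * value round-trips through the MEMFILE_* helpers above.  The function name
 * below is made up for the example.
 */
#if 0
static void memfile_encoding_example(void)
{
	/* Pack resource type _TCP with attribute value 2 into one word. */
	unsigned long priv = MEMFILE_PRIVATE(_TCP, 2);

	/* MEMFILE_TYPE() recovers the high 16 bits, MEMFILE_ATTR() the low 16. */
	WARN_ON(MEMFILE_TYPE(priv) != _TCP);
	WARN_ON(MEMFILE_ATTR(priv) != 2);
}
#endif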
 235
 236/* Some nice accessors for the vmpressure. */
 237struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 238{
 239        if (!memcg)
 240                memcg = root_mem_cgroup;
 241        return &memcg->vmpressure;
 242}
 243
 244struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 245{
 246        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 247}
 248
 249static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 250{
 251        return (memcg == root_mem_cgroup);
 252}
 253
 254#ifndef CONFIG_SLOB
 255/*
 256 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 257 * The main reason for not using cgroup id for this:
 258 *  this works better in sparse environments, where we have a lot of memcgs,
  259 *  but only a few kmem-limited. For example, with 200 memcgs of which
  260 *  only the 200th is kmem-limited, using the cgroup id would force us to
  261 *  keep a 200-entry array around.
 262 *
 263 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 264 * will double each time we have to increase it.
 265 */
 266static DEFINE_IDA(memcg_cache_ida);
 267int memcg_nr_cache_ids;
 268
 269/* Protects memcg_nr_cache_ids */
 270static DECLARE_RWSEM(memcg_cache_ids_sem);
 271
 272void memcg_get_cache_ids(void)
 273{
 274        down_read(&memcg_cache_ids_sem);
 275}
 276
 277void memcg_put_cache_ids(void)
 278{
 279        up_read(&memcg_cache_ids_sem);
 280}
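/*
 * Usage sketch (an assumption for illustration, not a call site in this
 * file): readers bracket their access to memcg_nr_cache_ids with the
 * helpers above so the value cannot grow underneath them.
 */
#if 0
static void cache_ids_reader_example(void)
{
	int nr;

	memcg_get_cache_ids();
	nr = memcg_nr_cache_ids;	/* stable while the read side is held */
	/* ... size a per-memcg cache array to nr entries ... */
	memcg_put_cache_ids();
}
#endif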
 281
 282/*
  283 * MIN_SIZE is different from 1, because we would like to avoid going through
 284 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 285 * cgroups is a reasonable guess. In the future, it could be a parameter or
  286 * tunable, but that is not strictly necessary.
 287 *
 288 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 289 * this constant directly from cgroup, but it is understandable that this is
 290 * better kept as an internal representation in cgroup.c. In any case, the
 291 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 292 * increase ours as well if it increases.
 293 */
 294#define MEMCG_CACHES_MIN_SIZE 4
 295#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 296
 297/*
 298 * A lot of the calls to the cache allocation functions are expected to be
 299 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  300 * conditional on this static branch, we have to allow modules that do
  301 * kmem_cache_alloc and the like to see this symbol as well.
 302 */
 303DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 304EXPORT_SYMBOL(memcg_kmem_enabled_key);
 305
 306struct workqueue_struct *memcg_kmem_cache_wq;
 307
 308#endif /* !CONFIG_SLOB */
 309
 310/**
 311 * mem_cgroup_css_from_page - css of the memcg associated with a page
 312 * @page: page of interest
 313 *
 314 * If memcg is bound to the default hierarchy, css of the memcg associated
 315 * with @page is returned.  The returned css remains associated with @page
 316 * until it is released.
 317 *
 318 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 319 * is returned.
 320 */
 321struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 322{
 323        struct mem_cgroup *memcg;
 324
 325        memcg = page->mem_cgroup;
 326
 327        if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 328                memcg = root_mem_cgroup;
 329
 330        return &memcg->css;
 331}
 332
 333/**
 334 * page_cgroup_ino - return inode number of the memcg a page is charged to
 335 * @page: the page
 336 *
 337 * Look up the closest online ancestor of the memory cgroup @page is charged to
 338 * and return its inode number or 0 if @page is not charged to any cgroup. It
 339 * is safe to call this function without holding a reference to @page.
 340 *
 341 * Note, this function is inherently racy, because there is nothing to prevent
 342 * the cgroup inode from getting torn down and potentially reallocated a moment
 343 * after page_cgroup_ino() returns, so it only should be used by callers that
 344 * do not care (such as procfs interfaces).
 345 */
 346ino_t page_cgroup_ino(struct page *page)
 347{
 348        struct mem_cgroup *memcg;
 349        unsigned long ino = 0;
 350
 351        rcu_read_lock();
 352        memcg = READ_ONCE(page->mem_cgroup);
 353        while (memcg && !(memcg->css.flags & CSS_ONLINE))
 354                memcg = parent_mem_cgroup(memcg);
 355        if (memcg)
 356                ino = cgroup_ino(memcg->css.cgroup);
 357        rcu_read_unlock();
 358        return ino;
 359}
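/*
 * Usage sketch in the spirit of the procfs interfaces mentioned above
 * (e.g. /proc/kpagecgroup); the helper name is hypothetical.
 */
#if 0
static u64 example_pfn_to_memcg_ino(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);

	/* Racy by design; 0 means the page is not charged to any cgroup. */
	return page_cgroup_ino(page);
}
#endif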
 360
 361static struct mem_cgroup_per_node *
 362mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 363{
 364        int nid = page_to_nid(page);
 365
 366        return memcg->nodeinfo[nid];
 367}
 368
 369static struct mem_cgroup_tree_per_node *
 370soft_limit_tree_node(int nid)
 371{
 372        return soft_limit_tree.rb_tree_per_node[nid];
 373}
 374
 375static struct mem_cgroup_tree_per_node *
 376soft_limit_tree_from_page(struct page *page)
 377{
 378        int nid = page_to_nid(page);
 379
 380        return soft_limit_tree.rb_tree_per_node[nid];
 381}
 382
 383static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 384                                         struct mem_cgroup_tree_per_node *mctz,
 385                                         unsigned long new_usage_in_excess)
 386{
 387        struct rb_node **p = &mctz->rb_root.rb_node;
 388        struct rb_node *parent = NULL;
 389        struct mem_cgroup_per_node *mz_node;
 390        bool rightmost = true;
 391
 392        if (mz->on_tree)
 393                return;
 394
 395        mz->usage_in_excess = new_usage_in_excess;
 396        if (!mz->usage_in_excess)
 397                return;
 398        while (*p) {
 399                parent = *p;
 400                mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 401                                        tree_node);
 402                if (mz->usage_in_excess < mz_node->usage_in_excess) {
 403                        p = &(*p)->rb_left;
 404                        rightmost = false;
 405                }
 406
 407                /*
 408                 * We can't avoid mem cgroups that are over their soft
 409                 * limit by the same amount
 410                 */
 411                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 412                        p = &(*p)->rb_right;
 413        }
 414
 415        if (rightmost)
 416                mctz->rb_rightmost = &mz->tree_node;
 417
 418        rb_link_node(&mz->tree_node, parent, p);
 419        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 420        mz->on_tree = true;
 421}
 422
 423static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 424                                         struct mem_cgroup_tree_per_node *mctz)
 425{
 426        if (!mz->on_tree)
 427                return;
 428
 429        if (&mz->tree_node == mctz->rb_rightmost)
 430                mctz->rb_rightmost = rb_prev(&mz->tree_node);
 431
 432        rb_erase(&mz->tree_node, &mctz->rb_root);
 433        mz->on_tree = false;
 434}
 435
 436static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 437                                       struct mem_cgroup_tree_per_node *mctz)
 438{
 439        unsigned long flags;
 440
 441        spin_lock_irqsave(&mctz->lock, flags);
 442        __mem_cgroup_remove_exceeded(mz, mctz);
 443        spin_unlock_irqrestore(&mctz->lock, flags);
 444}
 445
 446static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 447{
 448        unsigned long nr_pages = page_counter_read(&memcg->memory);
 449        unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 450        unsigned long excess = 0;
 451
 452        if (nr_pages > soft_limit)
 453                excess = nr_pages - soft_limit;
 454
 455        return excess;
 456}
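/*
 * Worked example (illustrative numbers): with page_counter_read() returning
 * 1536 pages and memcg->soft_limit set to 1024 pages, soft_limit_excess()
 * returns 1536 - 1024 = 512; at or below the soft limit it returns 0.
 */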
 457
 458static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 459{
 460        unsigned long excess;
 461        struct mem_cgroup_per_node *mz;
 462        struct mem_cgroup_tree_per_node *mctz;
 463
 464        mctz = soft_limit_tree_from_page(page);
 465        if (!mctz)
 466                return;
 467        /*
  468         * Necessary to update all ancestors when hierarchy is used,
  469         * because their event counters are not touched.
 470         */
 471        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 472                mz = mem_cgroup_page_nodeinfo(memcg, page);
 473                excess = soft_limit_excess(memcg);
 474                /*
 475                 * We have to update the tree if mz is on RB-tree or
 476                 * mem is over its softlimit.
 477                 */
 478                if (excess || mz->on_tree) {
 479                        unsigned long flags;
 480
 481                        spin_lock_irqsave(&mctz->lock, flags);
 482                        /* if on-tree, remove it */
 483                        if (mz->on_tree)
 484                                __mem_cgroup_remove_exceeded(mz, mctz);
 485                        /*
 486                         * Insert again. mz->usage_in_excess will be updated.
 487                         * If excess is 0, no tree ops.
 488                         */
 489                        __mem_cgroup_insert_exceeded(mz, mctz, excess);
 490                        spin_unlock_irqrestore(&mctz->lock, flags);
 491                }
 492        }
 493}
 494
 495static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 496{
 497        struct mem_cgroup_tree_per_node *mctz;
 498        struct mem_cgroup_per_node *mz;
 499        int nid;
 500
 501        for_each_node(nid) {
 502                mz = mem_cgroup_nodeinfo(memcg, nid);
 503                mctz = soft_limit_tree_node(nid);
 504                if (mctz)
 505                        mem_cgroup_remove_exceeded(mz, mctz);
 506        }
 507}
 508
 509static struct mem_cgroup_per_node *
 510__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 511{
 512        struct mem_cgroup_per_node *mz;
 513
 514retry:
 515        mz = NULL;
 516        if (!mctz->rb_rightmost)
 517                goto done;              /* Nothing to reclaim from */
 518
 519        mz = rb_entry(mctz->rb_rightmost,
 520                      struct mem_cgroup_per_node, tree_node);
 521        /*
  522         * Remove the node now but someone else can add it back;
  523         * we will add it back at the end of reclaim to its correct
  524         * position in the tree.
 525         */
 526        __mem_cgroup_remove_exceeded(mz, mctz);
 527        if (!soft_limit_excess(mz->memcg) ||
 528            !css_tryget_online(&mz->memcg->css))
 529                goto retry;
 530done:
 531        return mz;
 532}
 533
 534static struct mem_cgroup_per_node *
 535mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 536{
 537        struct mem_cgroup_per_node *mz;
 538
 539        spin_lock_irq(&mctz->lock);
 540        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 541        spin_unlock_irq(&mctz->lock);
 542        return mz;
 543}
 544
 545static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 546                                      int event)
 547{
 548        return atomic_long_read(&memcg->events[event]);
 549}
 550
 551static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 552                                         struct page *page,
 553                                         bool compound, int nr_pages)
 554{
 555        /*
 556         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 557         * counted as CACHE even if it's on ANON LRU.
 558         */
 559        if (PageAnon(page))
 560                __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
 561        else {
 562                __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
 563                if (PageSwapBacked(page))
 564                        __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
 565        }
 566
 567        if (compound) {
 568                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 569                __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
 570        }
 571
  572        /* pagein of a big page is one event, so ignore the page size */
 573        if (nr_pages > 0)
 574                __count_memcg_events(memcg, PGPGIN, 1);
 575        else {
 576                __count_memcg_events(memcg, PGPGOUT, 1);
 577                nr_pages = -nr_pages; /* for event */
 578        }
 579
 580        __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 581}
 582
 583unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 584                                           int nid, unsigned int lru_mask)
 585{
 586        struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
 587        unsigned long nr = 0;
 588        enum lru_list lru;
 589
 590        VM_BUG_ON((unsigned)nid >= nr_node_ids);
 591
 592        for_each_lru(lru) {
 593                if (!(BIT(lru) & lru_mask))
 594                        continue;
 595                nr += mem_cgroup_get_lru_size(lruvec, lru);
 596        }
 597        return nr;
 598}
 599
 600static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 601                        unsigned int lru_mask)
 602{
 603        unsigned long nr = 0;
 604        int nid;
 605
 606        for_each_node_state(nid, N_MEMORY)
 607                nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 608        return nr;
 609}
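/*
 * Usage sketch (hypothetical helper): counting only the file-backed LRU
 * pages of a memcg across all nodes with memory via the lru_mask argument.
 */
#if 0
static unsigned long example_count_file_lru_pages(struct mem_cgroup *memcg)
{
	return mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
}
#endif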
 610
 611static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 612                                       enum mem_cgroup_events_target target)
 613{
 614        unsigned long val, next;
 615
 616        val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
 617        next = __this_cpu_read(memcg->stat_cpu->targets[target]);
 618        /* from time_after() in jiffies.h */
 619        if ((long)(next - val) < 0) {
 620                switch (target) {
 621                case MEM_CGROUP_TARGET_THRESH:
 622                        next = val + THRESHOLDS_EVENTS_TARGET;
 623                        break;
 624                case MEM_CGROUP_TARGET_SOFTLIMIT:
 625                        next = val + SOFTLIMIT_EVENTS_TARGET;
 626                        break;
 627                case MEM_CGROUP_TARGET_NUMAINFO:
 628                        next = val + NUMAINFO_EVENTS_TARGET;
 629                        break;
 630                default:
 631                        break;
 632                }
 633                __this_cpu_write(memcg->stat_cpu->targets[target], next);
 634                return true;
 635        }
 636        return false;
 637}
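/*
 * Worked example (illustrative numbers): with THRESHOLDS_EVENTS_TARGET == 128,
 * suppose nr_page_events has reached 130 while targets[MEM_CGROUP_TARGET_THRESH]
 * still holds 128.  Then (long)(128 - 130) < 0, so the target is advanced to
 * 130 + 128 = 258 and the function returns true; it keeps returning false
 * until nr_page_events passes 258, which is what rate-limits the checks.
 */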
 638
 639/*
 640 * Check events in order.
 641 *
 642 */
 643static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 644{
 645        /* threshold event is triggered in finer grain than soft limit */
 646        if (unlikely(mem_cgroup_event_ratelimit(memcg,
 647                                                MEM_CGROUP_TARGET_THRESH))) {
 648                bool do_softlimit;
 649                bool do_numainfo __maybe_unused;
 650
 651                do_softlimit = mem_cgroup_event_ratelimit(memcg,
 652                                                MEM_CGROUP_TARGET_SOFTLIMIT);
 653#if MAX_NUMNODES > 1
 654                do_numainfo = mem_cgroup_event_ratelimit(memcg,
 655                                                MEM_CGROUP_TARGET_NUMAINFO);
 656#endif
 657                mem_cgroup_threshold(memcg);
 658                if (unlikely(do_softlimit))
 659                        mem_cgroup_update_tree(memcg, page);
 660#if MAX_NUMNODES > 1
 661                if (unlikely(do_numainfo))
 662                        atomic_inc(&memcg->numainfo_events);
 663#endif
 664        }
 665}
 666
 667struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 668{
 669        /*
 670         * mm_update_next_owner() may clear mm->owner to NULL
 671         * if it races with swapoff, page migration, etc.
 672         * So this can be called with p == NULL.
 673         */
 674        if (unlikely(!p))
 675                return NULL;
 676
 677        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 678}
 679EXPORT_SYMBOL(mem_cgroup_from_task);
 680
 681static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 682{
 683        struct mem_cgroup *memcg = NULL;
 684
 685        rcu_read_lock();
 686        do {
 687                /*
  688                 * Page cache insertions can happen without an
 689                 * actual mm context, e.g. during disk probing
 690                 * on boot, loopback IO, acct() writes etc.
 691                 */
 692                if (unlikely(!mm))
 693                        memcg = root_mem_cgroup;
 694                else {
 695                        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 696                        if (unlikely(!memcg))
 697                                memcg = root_mem_cgroup;
 698                }
 699        } while (!css_tryget_online(&memcg->css));
 700        rcu_read_unlock();
 701        return memcg;
 702}
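/*
 * Usage sketch (hypothetical caller): the returned memcg carries a css
 * reference that the caller must drop with css_put() when done.
 */
#if 0
static void get_mm_memcg_example(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);

	/* ... charge against or inspect memcg here ... */

	css_put(&memcg->css);
}
#endif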
 703
 704/**
 705 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 706 * @root: hierarchy root
 707 * @prev: previously returned memcg, NULL on first invocation
 708 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 709 *
 710 * Returns references to children of the hierarchy below @root, or
 711 * @root itself, or %NULL after a full round-trip.
 712 *
 713 * Caller must pass the return value in @prev on subsequent
 714 * invocations for reference counting, or use mem_cgroup_iter_break()
 715 * to cancel a hierarchy walk before the round-trip is complete.
 716 *
 717 * Reclaimers can specify a node and a priority level in @reclaim to
 718 * divide up the memcgs in the hierarchy among all concurrent
 719 * reclaimers operating on the same node and priority.
 720 */
 721struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 722                                   struct mem_cgroup *prev,
 723                                   struct mem_cgroup_reclaim_cookie *reclaim)
 724{
 725        struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 726        struct cgroup_subsys_state *css = NULL;
 727        struct mem_cgroup *memcg = NULL;
 728        struct mem_cgroup *pos = NULL;
 729
 730        if (mem_cgroup_disabled())
 731                return NULL;
 732
 733        if (!root)
 734                root = root_mem_cgroup;
 735
 736        if (prev && !reclaim)
 737                pos = prev;
 738
 739        if (!root->use_hierarchy && root != root_mem_cgroup) {
 740                if (prev)
 741                        goto out;
 742                return root;
 743        }
 744
 745        rcu_read_lock();
 746
 747        if (reclaim) {
 748                struct mem_cgroup_per_node *mz;
 749
 750                mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
 751                iter = &mz->iter[reclaim->priority];
 752
 753                if (prev && reclaim->generation != iter->generation)
 754                        goto out_unlock;
 755
 756                while (1) {
 757                        pos = READ_ONCE(iter->position);
 758                        if (!pos || css_tryget(&pos->css))
 759                                break;
 760                        /*
 761                         * css reference reached zero, so iter->position will
 762                         * be cleared by ->css_released. However, we should not
 763                         * rely on this happening soon, because ->css_released
 764                         * is called from a work queue, and by busy-waiting we
 765                         * might block it. So we clear iter->position right
 766                         * away.
 767                         */
 768                        (void)cmpxchg(&iter->position, pos, NULL);
 769                }
 770        }
 771
 772        if (pos)
 773                css = &pos->css;
 774
 775        for (;;) {
 776                css = css_next_descendant_pre(css, &root->css);
 777                if (!css) {
 778                        /*
 779                         * Reclaimers share the hierarchy walk, and a
 780                         * new one might jump in right at the end of
 781                         * the hierarchy - make sure they see at least
 782                         * one group and restart from the beginning.
 783                         */
 784                        if (!prev)
 785                                continue;
 786                        break;
 787                }
 788
 789                /*
 790                 * Verify the css and acquire a reference.  The root
 791                 * is provided by the caller, so we know it's alive
 792                 * and kicking, and don't take an extra reference.
 793                 */
 794                memcg = mem_cgroup_from_css(css);
 795
 796                if (css == &root->css)
 797                        break;
 798
 799                if (css_tryget(css))
 800                        break;
 801
 802                memcg = NULL;
 803        }
 804
 805        if (reclaim) {
 806                /*
 807                 * The position could have already been updated by a competing
 808                 * thread, so check that the value hasn't changed since we read
 809                 * it to avoid reclaiming from the same cgroup twice.
 810                 */
 811                (void)cmpxchg(&iter->position, pos, memcg);
 812
 813                if (pos)
 814                        css_put(&pos->css);
 815
 816                if (!memcg)
 817                        iter->generation++;
 818                else if (!prev)
 819                        reclaim->generation = iter->generation;
 820        }
 821
 822out_unlock:
 823        rcu_read_unlock();
 824out:
 825        if (prev && prev != root)
 826                css_put(&prev->css);
 827
 828        return memcg;
 829}
 830
 831/**
 832 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 833 * @root: hierarchy root
 834 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 835 */
 836void mem_cgroup_iter_break(struct mem_cgroup *root,
 837                           struct mem_cgroup *prev)
 838{
 839        if (!root)
 840                root = root_mem_cgroup;
 841        if (prev && prev != root)
 842                css_put(&prev->css);
 843}
 844
 845static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 846{
 847        struct mem_cgroup *memcg = dead_memcg;
 848        struct mem_cgroup_reclaim_iter *iter;
 849        struct mem_cgroup_per_node *mz;
 850        int nid;
 851        int i;
 852
 853        while ((memcg = parent_mem_cgroup(memcg))) {
 854                for_each_node(nid) {
 855                        mz = mem_cgroup_nodeinfo(memcg, nid);
 856                        for (i = 0; i <= DEF_PRIORITY; i++) {
 857                                iter = &mz->iter[i];
 858                                cmpxchg(&iter->position,
 859                                        dead_memcg, NULL);
 860                        }
 861                }
 862        }
 863}
 864
 865/*
 866 * Iteration constructs for visiting all cgroups (under a tree).  If
 867 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 868 * be used for reference counting.
 869 */
 870#define for_each_mem_cgroup_tree(iter, root)            \
 871        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 872             iter != NULL;                              \
 873             iter = mem_cgroup_iter(root, iter, NULL))
 874
 875#define for_each_mem_cgroup(iter)                       \
 876        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 877             iter != NULL;                              \
 878             iter = mem_cgroup_iter(NULL, iter, NULL))
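/*
 * Usage sketch (hypothetical helper): a full walk needs no explicit cleanup;
 * only an early break out of the loop would require mem_cgroup_iter_break(),
 * as the comment above explains.
 */
#if 0
static int example_count_under_oom(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;
	int nr = 0;

	for_each_mem_cgroup_tree(iter, root)
		if (iter->under_oom)
			nr++;
	return nr;
}
#endif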
 879
 880/**
 881 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 882 * @memcg: hierarchy root
 883 * @fn: function to call for each task
 884 * @arg: argument passed to @fn
 885 *
 886 * This function iterates over tasks attached to @memcg or to any of its
 887 * descendants and calls @fn for each task. If @fn returns a non-zero
 888 * value, the function breaks the iteration loop and returns the value.
 889 * Otherwise, it will iterate over all tasks and return 0.
 890 *
 891 * This function must not be called for the root memory cgroup.
 892 */
 893int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 894                          int (*fn)(struct task_struct *, void *), void *arg)
 895{
 896        struct mem_cgroup *iter;
 897        int ret = 0;
 898
 899        BUG_ON(memcg == root_mem_cgroup);
 900
 901        for_each_mem_cgroup_tree(iter, memcg) {
 902                struct css_task_iter it;
 903                struct task_struct *task;
 904
 905                css_task_iter_start(&iter->css, 0, &it);
 906                while (!ret && (task = css_task_iter_next(&it)))
 907                        ret = fn(task, arg);
 908                css_task_iter_end(&it);
 909                if (ret) {
 910                        mem_cgroup_iter_break(memcg, iter);
 911                        break;
 912                }
 913        }
 914        return ret;
 915}
 916
 917/**
 918 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 919 * @page: the page
 920 * @pgdat: pgdat of the page
 921 *
 922 * This function is only safe when following the LRU page isolation
 923 * and putback protocol: the LRU lock must be held, and the page must
 924 * either be PageLRU() or the caller must have isolated/allocated it.
 925 */
 926struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 927{
 928        struct mem_cgroup_per_node *mz;
 929        struct mem_cgroup *memcg;
 930        struct lruvec *lruvec;
 931
 932        if (mem_cgroup_disabled()) {
 933                lruvec = &pgdat->lruvec;
 934                goto out;
 935        }
 936
 937        memcg = page->mem_cgroup;
 938        /*
 939         * Swapcache readahead pages are added to the LRU - and
 940         * possibly migrated - before they are charged.
 941         */
 942        if (!memcg)
 943                memcg = root_mem_cgroup;
 944
 945        mz = mem_cgroup_page_nodeinfo(memcg, page);
 946        lruvec = &mz->lruvec;
 947out:
 948        /*
 949         * Since a node can be onlined after the mem_cgroup was created,
  950         * we have to be prepared to initialize lruvec->pgdat here;
 951         * and if offlined then reonlined, we need to reinitialize it.
 952         */
 953        if (unlikely(lruvec->pgdat != pgdat))
 954                lruvec->pgdat = pgdat;
 955        return lruvec;
 956}
 957
 958/**
 959 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 960 * @lruvec: mem_cgroup per zone lru vector
 961 * @lru: index of lru list the page is sitting on
 962 * @zid: zone id of the accounted pages
 963 * @nr_pages: positive when adding or negative when removing
 964 *
 965 * This function must be called under lru_lock, just before a page is added
 966 * to or just after a page is removed from an lru list (that ordering being
 967 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 968 */
 969void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 970                                int zid, int nr_pages)
 971{
 972        struct mem_cgroup_per_node *mz;
 973        unsigned long *lru_size;
 974        long size;
 975
 976        if (mem_cgroup_disabled())
 977                return;
 978
 979        mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 980        lru_size = &mz->lru_zone_size[zid][lru];
 981
 982        if (nr_pages < 0)
 983                *lru_size += nr_pages;
 984
 985        size = *lru_size;
 986        if (WARN_ONCE(size < 0,
 987                "%s(%p, %d, %d): lru_size %ld\n",
 988                __func__, lruvec, lru, nr_pages, size)) {
 989                VM_BUG_ON(1);
 990                *lru_size = 0;
 991        }
 992
 993        if (nr_pages > 0)
 994                *lru_size += nr_pages;
 995}
 996
 997bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 998{
 999        struct mem_cgroup *task_memcg;
1000        struct task_struct *p;
1001        bool ret;
1002
1003        p = find_lock_task_mm(task);
1004        if (p) {
1005                task_memcg = get_mem_cgroup_from_mm(p->mm);
1006                task_unlock(p);
1007        } else {
1008                /*
1009                 * All threads may have already detached their mm's, but the oom
1010                 * killer still needs to detect if they have already been oom
1011                 * killed to prevent needlessly killing additional tasks.
1012                 */
1013                rcu_read_lock();
1014                task_memcg = mem_cgroup_from_task(task);
1015                css_get(&task_memcg->css);
1016                rcu_read_unlock();
1017        }
1018        ret = mem_cgroup_is_descendant(task_memcg, memcg);
1019        css_put(&task_memcg->css);
1020        return ret;
1021}
1022
1023/**
1024 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1025 * @memcg: the memory cgroup
1026 *
1027 * Returns the maximum amount of memory @mem can be charged with, in
1028 * pages.
1029 */
1030static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1031{
1032        unsigned long margin = 0;
1033        unsigned long count;
1034        unsigned long limit;
1035
1036        count = page_counter_read(&memcg->memory);
1037        limit = READ_ONCE(memcg->memory.limit);
1038        if (count < limit)
1039                margin = limit - count;
1040
1041        if (do_memsw_account()) {
1042                count = page_counter_read(&memcg->memsw);
1043                limit = READ_ONCE(memcg->memsw.limit);
1044                if (count <= limit)
1045                        margin = min(margin, limit - count);
1046                else
1047                        margin = 0;
1048        }
1049
1050        return margin;
1051}
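/*
 * Worked example (illustrative numbers): with usage at 900 pages and a
 * memory limit of 1000 pages the margin is 100 pages; if memsw accounting
 * is active with usage 990 and limit 1000, the margin shrinks to
 * min(100, 10) = 10 pages.
 */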
1052
1053/*
 1054 * A routine for checking whether "memcg" is under move_account() or not.
 1055 *
 1056 * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of a
 1057 * moving cgroup. This is used for waiting at high memory pressure
 1058 * caused by "move".
1059 */
1060static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1061{
1062        struct mem_cgroup *from;
1063        struct mem_cgroup *to;
1064        bool ret = false;
1065        /*
 1066         * Unlike the task_move routines, we access mc.to and mc.from without
 1067         * the mutual exclusion of cgroup_mutex; we take the spinlock instead.
1068         */
1069        spin_lock(&mc.lock);
1070        from = mc.from;
1071        to = mc.to;
1072        if (!from)
1073                goto unlock;
1074
1075        ret = mem_cgroup_is_descendant(from, memcg) ||
1076                mem_cgroup_is_descendant(to, memcg);
1077unlock:
1078        spin_unlock(&mc.lock);
1079        return ret;
1080}
1081
1082static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1083{
1084        if (mc.moving_task && current != mc.moving_task) {
1085                if (mem_cgroup_under_move(memcg)) {
1086                        DEFINE_WAIT(wait);
1087                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1088                        /* moving charge context might have finished. */
1089                        if (mc.moving_task)
1090                                schedule();
1091                        finish_wait(&mc.waitq, &wait);
1092                        return true;
1093                }
1094        }
1095        return false;
1096}
1097
1098static const unsigned int memcg1_stats[] = {
1099        MEMCG_CACHE,
1100        MEMCG_RSS,
1101        MEMCG_RSS_HUGE,
1102        NR_SHMEM,
1103        NR_FILE_MAPPED,
1104        NR_FILE_DIRTY,
1105        NR_WRITEBACK,
1106        MEMCG_SWAP,
1107};
1108
1109static const char *const memcg1_stat_names[] = {
1110        "cache",
1111        "rss",
1112        "rss_huge",
1113        "shmem",
1114        "mapped_file",
1115        "dirty",
1116        "writeback",
1117        "swap",
1118};
1119
1120#define K(x) ((x) << (PAGE_SHIFT-10))
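/*
 * Worked example: assuming a 4kB PAGE_SIZE (PAGE_SHIFT == 12), K(x) is
 * x << 2, so a counter of 25 pages is printed as 100kB.
 */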
1121/**
1122 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1123 * @memcg: The memory cgroup that went over limit
1124 * @p: Task that is going to be killed
1125 *
1126 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1127 * enabled
1128 */
1129void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1130{
1131        struct mem_cgroup *iter;
1132        unsigned int i;
1133
1134        rcu_read_lock();
1135
1136        if (p) {
1137                pr_info("Task in ");
1138                pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1139                pr_cont(" killed as a result of limit of ");
1140        } else {
1141                pr_info("Memory limit reached of cgroup ");
1142        }
1143
1144        pr_cont_cgroup_path(memcg->css.cgroup);
1145        pr_cont("\n");
1146
1147        rcu_read_unlock();
1148
1149        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1150                K((u64)page_counter_read(&memcg->memory)),
1151                K((u64)memcg->memory.limit), memcg->memory.failcnt);
1152        pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1153                K((u64)page_counter_read(&memcg->memsw)),
1154                K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1155        pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1156                K((u64)page_counter_read(&memcg->kmem)),
1157                K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1158
1159        for_each_mem_cgroup_tree(iter, memcg) {
1160                pr_info("Memory cgroup stats for ");
1161                pr_cont_cgroup_path(iter->css.cgroup);
1162                pr_cont(":");
1163
1164                for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1165                        if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1166                                continue;
1167                        pr_cont(" %s:%luKB", memcg1_stat_names[i],
1168                                K(memcg_page_state(iter, memcg1_stats[i])));
1169                }
1170
1171                for (i = 0; i < NR_LRU_LISTS; i++)
1172                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1173                                K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1174
1175                pr_cont("\n");
1176        }
1177}
1178
1179/*
1180 * Return the memory (and swap, if configured) limit for a memcg.
1181 */
1182unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1183{
1184        unsigned long limit;
1185
1186        limit = memcg->memory.limit;
1187        if (mem_cgroup_swappiness(memcg)) {
1188                unsigned long memsw_limit;
1189                unsigned long swap_limit;
1190
1191                memsw_limit = memcg->memsw.limit;
1192                swap_limit = memcg->swap.limit;
1193                swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
1194                limit = min(limit + swap_limit, memsw_limit);
1195        }
1196        return limit;
1197}
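/*
 * Worked example (illustrative numbers): memory.limit = 1000 pages,
 * swap.limit = 600, total_swap_pages = 400, memsw.limit = 1200, and a
 * non-zero swappiness.  swap_limit becomes min(600, 400) = 400 and the
 * returned limit is min(1000 + 400, 1200) = 1200 pages.
 */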
1198
1199static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1200                                     int order)
1201{
1202        struct oom_control oc = {
1203                .zonelist = NULL,
1204                .nodemask = NULL,
1205                .memcg = memcg,
1206                .gfp_mask = gfp_mask,
1207                .order = order,
1208        };
1209        bool ret;
1210
1211        mutex_lock(&oom_lock);
1212        ret = out_of_memory(&oc);
1213        mutex_unlock(&oom_lock);
1214        return ret;
1215}
1216
1217#if MAX_NUMNODES > 1
1218
1219/**
 1220 * test_mem_cgroup_node_reclaimable - check whether a memcg has reclaimable pages on a node
 1221 * @memcg: the target memcg
 1222 * @nid: the node ID to be checked.
 1223 * @noswap: specify true here if the user wants file-only information.
1224 *
1225 * This function returns whether the specified memcg contains any
1226 * reclaimable pages on a node. Returns true if there are any reclaimable
1227 * pages in the node.
1228 */
1229static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1230                int nid, bool noswap)
1231{
1232        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1233                return true;
1234        if (noswap || !total_swap_pages)
1235                return false;
1236        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1237                return true;
1238        return false;
1239
1240}
1241
1242/*
1243 * Always updating the nodemask is not very good - even if we have an empty
1244 * list or the wrong list here, we can start from some node and traverse all
1245 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1246 *
1247 */
1248static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1249{
1250        int nid;
1251        /*
 1252         * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
1253         * pagein/pageout changes since the last update.
1254         */
1255        if (!atomic_read(&memcg->numainfo_events))
1256                return;
1257        if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1258                return;
1259
1260        /* make a nodemask where this memcg uses memory from */
1261        memcg->scan_nodes = node_states[N_MEMORY];
1262
1263        for_each_node_mask(nid, node_states[N_MEMORY]) {
1264
1265                if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1266                        node_clear(nid, memcg->scan_nodes);
1267        }
1268
1269        atomic_set(&memcg->numainfo_events, 0);
1270        atomic_set(&memcg->numainfo_updating, 0);
1271}
1272
1273/*
 1274 * Select a node to start reclaim from. Because all we need is to reduce the
 1275 * usage counter, starting from anywhere is OK. Reclaiming from the current
 1276 * node has both pros and cons.
 1277 *
 1278 * Freeing memory from the current node means freeing memory from a node which
 1279 * we'll use or have used, so it may degrade that node's LRU. And if several
 1280 * threads hit their limits, they will contend on a node. But freeing from a
 1281 * remote node costs more for memory reclaim because of memory latency.
 1282 *
 1283 * For now, we use round-robin. A better algorithm is welcome.
1284 */
1285int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1286{
1287        int node;
1288
1289        mem_cgroup_may_update_nodemask(memcg);
1290        node = memcg->last_scanned_node;
1291
1292        node = next_node_in(node, memcg->scan_nodes);
1293        /*
 1294         * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
 1295         * the last time it really checked all the LRUs due to rate limiting.
 1296         * Fall back to the current node in that case for simplicity.
1297         */
1298        if (unlikely(node == MAX_NUMNODES))
1299                node = numa_node_id();
1300
1301        memcg->last_scanned_node = node;
1302        return node;
1303}
1304#else
1305int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1306{
1307        return 0;
1308}
1309#endif
1310
1311static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1312                                   pg_data_t *pgdat,
1313                                   gfp_t gfp_mask,
1314                                   unsigned long *total_scanned)
1315{
1316        struct mem_cgroup *victim = NULL;
1317        int total = 0;
1318        int loop = 0;
1319        unsigned long excess;
1320        unsigned long nr_scanned;
1321        struct mem_cgroup_reclaim_cookie reclaim = {
1322                .pgdat = pgdat,
1323                .priority = 0,
1324        };
1325
1326        excess = soft_limit_excess(root_memcg);
1327
1328        while (1) {
1329                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1330                if (!victim) {
1331                        loop++;
1332                        if (loop >= 2) {
1333                                /*
1334                                 * If we have not been able to reclaim
 1335                                 * anything, it might be because there are
 1336                                 * no reclaimable pages under this hierarchy.
1337                                 */
1338                                if (!total)
1339                                        break;
1340                                /*
1341                                 * We want to do more targeted reclaim.
 1342                                 * excess >> 2 is not too aggressive, so we do not
 1343                                 * reclaim too much, nor too little, which would keep
 1344                                 * us coming back to reclaim from this cgroup.
1345                                 */
1346                                if (total >= (excess >> 2) ||
1347                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1348                                        break;
1349                        }
1350                        continue;
1351                }
1352                total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1353                                        pgdat, &nr_scanned);
1354                *total_scanned += nr_scanned;
1355                if (!soft_limit_excess(root_memcg))
1356                        break;
1357        }
1358        mem_cgroup_iter_break(root_memcg, victim);
1359        return total;
1360}
1361
1362#ifdef CONFIG_LOCKDEP
1363static struct lockdep_map memcg_oom_lock_dep_map = {
1364        .name = "memcg_oom_lock",
1365};
1366#endif
1367
1368static DEFINE_SPINLOCK(memcg_oom_lock);
1369
1370/*
1371 * Check OOM-Killer is already running under our hierarchy.
1372 * If someone is running, return false.
1373 */
1374static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1375{
1376        struct mem_cgroup *iter, *failed = NULL;
1377
1378        spin_lock(&memcg_oom_lock);
1379
1380        for_each_mem_cgroup_tree(iter, memcg) {
1381                if (iter->oom_lock) {
1382                        /*
 1383                         * This subtree of our hierarchy is already locked,
 1384                         * so we cannot acquire the lock.
1385                         */
1386                        failed = iter;
1387                        mem_cgroup_iter_break(memcg, iter);
1388                        break;
1389                } else
1390                        iter->oom_lock = true;
1391        }
1392
1393        if (failed) {
1394                /*
 1395                 * OK, we failed to lock the whole subtree, so we have to
 1396                 * clean up what we already set up, up to the failing cgroup.
1397                 */
1398                for_each_mem_cgroup_tree(iter, memcg) {
1399                        if (iter == failed) {
1400                                mem_cgroup_iter_break(memcg, iter);
1401                                break;
1402                        }
1403                        iter->oom_lock = false;
1404                }
1405        } else
1406                mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1407
1408        spin_unlock(&memcg_oom_lock);
1409
1410        return !failed;
1411}
1412
1413static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1414{
1415        struct mem_cgroup *iter;
1416
1417        spin_lock(&memcg_oom_lock);
1418        mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1419        for_each_mem_cgroup_tree(iter, memcg)
1420                iter->oom_lock = false;
1421        spin_unlock(&memcg_oom_lock);
1422}
1423
1424static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1425{
1426        struct mem_cgroup *iter;
1427
1428        spin_lock(&memcg_oom_lock);
1429        for_each_mem_cgroup_tree(iter, memcg)
1430                iter->under_oom++;
1431        spin_unlock(&memcg_oom_lock);
1432}
1433
1434static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1435{
1436        struct mem_cgroup *iter;
1437
1438        /*
1439         * When a new child is created while the hierarchy is under oom,
1440         * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1441         */
1442        spin_lock(&memcg_oom_lock);
1443        for_each_mem_cgroup_tree(iter, memcg)
1444                if (iter->under_oom > 0)
1445                        iter->under_oom--;
1446        spin_unlock(&memcg_oom_lock);
1447}
1448
1449static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1450
1451struct oom_wait_info {
1452        struct mem_cgroup *memcg;
1453        wait_queue_entry_t      wait;
1454};
1455
1456static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1457        unsigned mode, int sync, void *arg)
1458{
1459        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1460        struct mem_cgroup *oom_wait_memcg;
1461        struct oom_wait_info *oom_wait_info;
1462
1463        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1464        oom_wait_memcg = oom_wait_info->memcg;
1465
1466        if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1467            !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1468                return 0;
1469        return autoremove_wake_function(wait, mode, sync, arg);
1470}
1471
1472static void memcg_oom_recover(struct mem_cgroup *memcg)
1473{
1474        /*
1475         * For the following lockless ->under_oom test, the only required
1476         * guarantee is that it must see the state asserted by an OOM when
1477         * this function is called as a result of userland actions
1478         * triggered by the notification of the OOM.  This is trivially
1479         * achieved by invoking mem_cgroup_mark_under_oom() before
1480         * triggering notification.
1481         */
1482        if (memcg && memcg->under_oom)
1483                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1484}
1485
1486static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1487{
1488        if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
1489                return;
1490        /*
1491         * We are in the middle of the charge context here, so we
1492         * don't want to block when potentially sitting on a callstack
1493         * that holds all kinds of filesystem and mm locks.
1494         *
1495         * Also, the caller may handle a failed allocation gracefully
1496         * (like optional page cache readahead) and so an OOM killer
1497         * invocation might not even be necessary.
1498         *
1499         * That's why we don't do anything here except remember the
1500         * OOM context and then deal with it at the end of the page
1501         * fault when the stack is unwound, the locks are released,
1502         * and when we know whether the fault was overall successful.
1503         */
1504        css_get(&memcg->css);
1505        current->memcg_in_oom = memcg;
1506        current->memcg_oom_gfp_mask = mask;
1507        current->memcg_oom_order = order;
1508}
1509
1510/**
1511 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1512 * @handle: actually kill/wait or just clean up the OOM state
1513 *
1514 * This has to be called at the end of a page fault if the memcg OOM
1515 * handler was enabled.
1516 *
1517 * Memcg supports userspace OOM handling where failed allocations must
1518 * sleep on a waitqueue until the userspace task resolves the
1519 * situation.  Sleeping directly in the charge context with all kinds
1520 * of locks held is not a good idea, instead we remember an OOM state
1521 * in the task and mem_cgroup_oom_synchronize() has to be called at
1522 * the end of the page fault to complete the OOM handling.
1523 *
1524 * Returns %true if an ongoing memcg OOM situation was detected and
1525 * completed, %false otherwise.
1526 */
1527bool mem_cgroup_oom_synchronize(bool handle)
1528{
1529        struct mem_cgroup *memcg = current->memcg_in_oom;
1530        struct oom_wait_info owait;
1531        bool locked;
1532
1533        /* OOM is global, do not handle */
1534        if (!memcg)
1535                return false;
1536
1537        if (!handle)
1538                goto cleanup;
1539
1540        owait.memcg = memcg;
1541        owait.wait.flags = 0;
1542        owait.wait.func = memcg_oom_wake_function;
1543        owait.wait.private = current;
1544        INIT_LIST_HEAD(&owait.wait.entry);
1545
1546        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1547        mem_cgroup_mark_under_oom(memcg);
1548
1549        locked = mem_cgroup_oom_trylock(memcg);
1550
1551        if (locked)
1552                mem_cgroup_oom_notify(memcg);
1553
1554        if (locked && !memcg->oom_kill_disable) {
1555                mem_cgroup_unmark_under_oom(memcg);
1556                finish_wait(&memcg_oom_waitq, &owait.wait);
1557                mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1558                                         current->memcg_oom_order);
1559        } else {
1560                schedule();
1561                mem_cgroup_unmark_under_oom(memcg);
1562                finish_wait(&memcg_oom_waitq, &owait.wait);
1563        }
1564
1565        if (locked) {
1566                mem_cgroup_oom_unlock(memcg);
1567                /*
1568                 * There is no guarantee that an OOM-lock contender
1569                 * sees the wakeups triggered by the OOM kill
1570                 * uncharges.  Wake any sleepers explicitly.
1571                 */
1572                memcg_oom_recover(memcg);
1573        }
1574cleanup:
1575        current->memcg_in_oom = NULL;
1576        css_put(&memcg->css);
1577        return true;
1578}
1579
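/*
 * Example (editor's sketch, not part of this file): the pairing between
 * mem_cgroup_oom() above and mem_cgroup_oom_synchronize() is driven from
 * the page fault exit paths.  Conceptually, the fault code does:
 *
 *	ret = handle_mm_fault(vma, address, flags);
 *	...
 *	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
 *		mem_cgroup_oom_synchronize(false);	// only clean up state
 *
 * while pagefault_out_of_memory() calls mem_cgroup_oom_synchronize(true)
 * to actually wait for, or perform, the memcg OOM kill.  The real call
 * sites live in mm/memory.c and mm/oom_kill.c and may differ in detail.
 */
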
1580/**
1581 * lock_page_memcg - lock a page->mem_cgroup binding
1582 * @page: the page
1583 *
1584 * This function protects unlocked LRU pages from being moved to
1585 * another cgroup.
1586 *
1587 * It ensures lifetime of the returned memcg. Caller is responsible
1588 * for the lifetime of the page; __unlock_page_memcg() is available
1589 * when @page might get freed inside the locked section.
1590 */
1591struct mem_cgroup *lock_page_memcg(struct page *page)
1592{
1593        struct mem_cgroup *memcg;
1594        unsigned long flags;
1595
1596        /*
1597         * The RCU lock is held throughout the transaction.  The fast
1598         * path can get away without acquiring the memcg->move_lock
1599         * because page moving starts with an RCU grace period.
1600         *
1601         * The RCU lock also protects the memcg from being freed when
1602         * the page state that is going to change is the only thing
1603         * preventing the page itself from being freed. E.g. writeback
1604         * doesn't hold a page reference and relies on PG_writeback to
1605         * keep off truncation, migration and so forth.
1606         */
1607        rcu_read_lock();
1608
1609        if (mem_cgroup_disabled())
1610                return NULL;
1611again:
1612        memcg = page->mem_cgroup;
1613        if (unlikely(!memcg))
1614                return NULL;
1615
1616        if (atomic_read(&memcg->moving_account) <= 0)
1617                return memcg;
1618
1619        spin_lock_irqsave(&memcg->move_lock, flags);
1620        if (memcg != page->mem_cgroup) {
1621                spin_unlock_irqrestore(&memcg->move_lock, flags);
1622                goto again;
1623        }
1624
1625        /*
1626         * When charge migration first begins, we can have locked and
1627         * unlocked page stat updates happening concurrently.  Track
1628         * the task who has the lock for unlock_page_memcg().
1629         */
1630        memcg->move_lock_task = current;
1631        memcg->move_lock_flags = flags;
1632
1633        return memcg;
1634}
1635EXPORT_SYMBOL(lock_page_memcg);
1636
1637/**
1638 * __unlock_page_memcg - unlock and unpin a memcg
1639 * @memcg: the memcg
1640 *
1641 * Unlock and unpin a memcg returned by lock_page_memcg().
1642 */
1643void __unlock_page_memcg(struct mem_cgroup *memcg)
1644{
1645        if (memcg && memcg->move_lock_task == current) {
1646                unsigned long flags = memcg->move_lock_flags;
1647
1648                memcg->move_lock_task = NULL;
1649                memcg->move_lock_flags = 0;
1650
1651                spin_unlock_irqrestore(&memcg->move_lock, flags);
1652        }
1653
1654        rcu_read_unlock();
1655}
1656
1657/**
1658 * unlock_page_memcg - unlock a page->mem_cgroup binding
1659 * @page: the page
1660 */
1661void unlock_page_memcg(struct page *page)
1662{
1663        __unlock_page_memcg(page->mem_cgroup);
1664}
1665EXPORT_SYMBOL(unlock_page_memcg);
1666
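/*
 * Example (editor's sketch): a typical caller takes the lock to keep
 * page->mem_cgroup stable while updating memcg-keyed page state, e.g.
 * in the dirty/writeback accounting paths:
 *
 *	struct mem_cgroup *memcg;
 *
 *	memcg = lock_page_memcg(page);
 *	// ... update statistics tied to page->mem_cgroup ...
 *	__unlock_page_memcg(memcg);	// safe even if @page was freed above
 *
 * Callers that know the page stays alive can simply pair
 * lock_page_memcg(page) with unlock_page_memcg(page).
 */
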
1667struct memcg_stock_pcp {
1668        struct mem_cgroup *cached; /* this is never the root cgroup */
1669        unsigned int nr_pages;
1670        struct work_struct work;
1671        unsigned long flags;
1672#define FLUSHING_CACHED_CHARGE  0
1673};
1674static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1675static DEFINE_MUTEX(percpu_charge_mutex);
1676
1677/**
1678 * consume_stock: Try to consume stocked charge on this cpu.
1679 * @memcg: memcg to consume from.
1680 * @nr_pages: how many pages to charge.
1681 *
1682 * The charges will only happen if @memcg matches the current cpu's memcg
1683 * stock, and at least @nr_pages are available in that stock.  Failure to
1684 * service an allocation will refill the stock.
1685 *
1686 * returns true if successful, false otherwise.
1687 */
1688static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1689{
1690        struct memcg_stock_pcp *stock;
1691        unsigned long flags;
1692        bool ret = false;
1693
1694        if (nr_pages > MEMCG_CHARGE_BATCH)
1695                return ret;
1696
1697        local_irq_save(flags);
1698
1699        stock = this_cpu_ptr(&memcg_stock);
1700        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1701                stock->nr_pages -= nr_pages;
1702                ret = true;
1703        }
1704
1705        local_irq_restore(flags);
1706
1707        return ret;
1708}
1709
1710/*
1711 * Uncharge the pages cached in the stock and reset the cached memcg.
1712 */
1713static void drain_stock(struct memcg_stock_pcp *stock)
1714{
1715        struct mem_cgroup *old = stock->cached;
1716
1717        if (stock->nr_pages) {
1718                page_counter_uncharge(&old->memory, stock->nr_pages);
1719                if (do_memsw_account())
1720                        page_counter_uncharge(&old->memsw, stock->nr_pages);
1721                css_put_many(&old->css, stock->nr_pages);
1722                stock->nr_pages = 0;
1723        }
1724        stock->cached = NULL;
1725}
1726
1727static void drain_local_stock(struct work_struct *dummy)
1728{
1729        struct memcg_stock_pcp *stock;
1730        unsigned long flags;
1731
1732        /*
1733         * The only protection from memory hotplug vs. drain_stock races is
1734         * that we always operate on local CPU stock here with IRQ disabled
1735         */
1736        local_irq_save(flags);
1737
1738        stock = this_cpu_ptr(&memcg_stock);
1739        drain_stock(stock);
1740        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1741
1742        local_irq_restore(flags);
1743}
1744
1745/*
1746 * Cache charges (@nr_pages) in the local per-CPU area.
1747 * They will be consumed later by consume_stock().
1748 */
1749static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1750{
1751        struct memcg_stock_pcp *stock;
1752        unsigned long flags;
1753
1754        local_irq_save(flags);
1755
1756        stock = this_cpu_ptr(&memcg_stock);
1757        if (stock->cached != memcg) { /* reset if necessary */
1758                drain_stock(stock);
1759                stock->cached = memcg;
1760        }
1761        stock->nr_pages += nr_pages;
1762
1763        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
1764                drain_stock(stock);
1765
1766        local_irq_restore(flags);
1767}
1768
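/*
 * Example (editor's sketch): consume_stock() and refill_stock() form a
 * per-CPU fast path in front of the page counters.  A charger first tries
 * the cached stock, falls back to the counters with a larger batch, and
 * parks the surplus for the next charge on this CPU, roughly:
 *
 *	if (consume_stock(memcg, nr_pages))
 *		return 0;			// served from the per-CPU cache
 *	if (page_counter_try_charge(&memcg->memory, batch, &counter))
 *		refill_stock(memcg, batch - nr_pages);
 *
 * try_charge() below is the real user of this pattern.
 */
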
1769/*
1770 * Drain all per-CPU charge caches that belong to @root_memcg or to any
1771 * memcg in the hierarchy subtree under it.
1772 */
1773static void drain_all_stock(struct mem_cgroup *root_memcg)
1774{
1775        int cpu, curcpu;
1776
1777        /* If someone's already draining, avoid running more workers. */
1778        if (!mutex_trylock(&percpu_charge_mutex))
1779                return;
1780        /*
1781         * Notify other cpus that a system-wide "drain" is running.
1782         * We do not care about races with the cpu hotplug because cpu down
1783         * as well as workers from this path always operate on the local
1784         * per-cpu data. CPU up doesn't touch memcg_stock at all.
1785         */
1786        curcpu = get_cpu();
1787        for_each_online_cpu(cpu) {
1788                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1789                struct mem_cgroup *memcg;
1790
1791                memcg = stock->cached;
1792                if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
1793                        continue;
1794                if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
1795                        css_put(&memcg->css);
1796                        continue;
1797                }
1798                if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1799                        if (cpu == curcpu)
1800                                drain_local_stock(&stock->work);
1801                        else
1802                                schedule_work_on(cpu, &stock->work);
1803                }
1804                css_put(&memcg->css);
1805        }
1806        put_cpu();
1807        mutex_unlock(&percpu_charge_mutex);
1808}
1809
1810static int memcg_hotplug_cpu_dead(unsigned int cpu)
1811{
1812        struct memcg_stock_pcp *stock;
1813        struct mem_cgroup *memcg;
1814
1815        stock = &per_cpu(memcg_stock, cpu);
1816        drain_stock(stock);
1817
1818        for_each_mem_cgroup(memcg) {
1819                int i;
1820
1821                for (i = 0; i < MEMCG_NR_STAT; i++) {
1822                        int nid;
1823                        long x;
1824
1825                        x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
1826                        if (x)
1827                                atomic_long_add(x, &memcg->stat[i]);
1828
1829                        if (i >= NR_VM_NODE_STAT_ITEMS)
1830                                continue;
1831
1832                        for_each_node(nid) {
1833                                struct mem_cgroup_per_node *pn;
1834
1835                                pn = mem_cgroup_nodeinfo(memcg, nid);
1836                                x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
1837                                if (x)
1838                                        atomic_long_add(x, &pn->lruvec_stat[i]);
1839                        }
1840                }
1841
1842                for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
1843                        long x;
1844
1845                        x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
1846                        if (x)
1847                                atomic_long_add(x, &memcg->events[i]);
1848                }
1849        }
1850
1851        return 0;
1852}
1853
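/*
 * Editor's note: memcg_hotplug_cpu_dead() is registered as a CPU hotplug
 * "dead" callback from mem_cgroup_init() later in this file, roughly:
 *
 *	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead",
 *				  NULL, memcg_hotplug_cpu_dead);
 *
 * so a CPU going offline first drains its charge stock and then folds its
 * leftover per-CPU statistics into the atomic counters.
 */
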
1854static void reclaim_high(struct mem_cgroup *memcg,
1855                         unsigned int nr_pages,
1856                         gfp_t gfp_mask)
1857{
1858        do {
1859                if (page_counter_read(&memcg->memory) <= memcg->high)
1860                        continue;
1861                memcg_memory_event(memcg, MEMCG_HIGH);
1862                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
1863        } while ((memcg = parent_mem_cgroup(memcg)));
1864}
1865
1866static void high_work_func(struct work_struct *work)
1867{
1868        struct mem_cgroup *memcg;
1869
1870        memcg = container_of(work, struct mem_cgroup, high_work);
1871        reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
1872}
1873
1874/*
1875 * Scheduled by try_charge() to be executed from the userland return path
1876 * and reclaims memory over the high limit.
1877 */
1878void mem_cgroup_handle_over_high(void)
1879{
1880        unsigned int nr_pages = current->memcg_nr_pages_over_high;
1881        struct mem_cgroup *memcg;
1882
1883        if (likely(!nr_pages))
1884                return;
1885
1886        memcg = get_mem_cgroup_from_mm(current->mm);
1887        reclaim_high(memcg, nr_pages, GFP_KERNEL);
1888        css_put(&memcg->css);
1889        current->memcg_nr_pages_over_high = 0;
1890}
1891
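/*
 * Example (editor's sketch): the hook that runs this on the way back to
 * userspace is tracehook_notify_resume(), reached because try_charge()
 * called set_notify_resume() on the task.  Conceptually:
 *
 *	// arch exit-to-user path, TIF_NOTIFY_RESUME is set
 *	tracehook_notify_resume(regs)
 *		-> mem_cgroup_handle_over_high()
 *
 * so the high-limit reclaim runs with a full stack and no locks held.
 */
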
1892static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
1893                      unsigned int nr_pages)
1894{
1895        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
1896        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1897        struct mem_cgroup *mem_over_limit;
1898        struct page_counter *counter;
1899        unsigned long nr_reclaimed;
1900        bool may_swap = true;
1901        bool drained = false;
1902
1903        if (mem_cgroup_is_root(memcg))
1904                return 0;
1905retry:
1906        if (consume_stock(memcg, nr_pages))
1907                return 0;
1908
1909        if (!do_memsw_account() ||
1910            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
1911                if (page_counter_try_charge(&memcg->memory, batch, &counter))
1912                        goto done_restock;
1913                if (do_memsw_account())
1914                        page_counter_uncharge(&memcg->memsw, batch);
1915                mem_over_limit = mem_cgroup_from_counter(counter, memory);
1916        } else {
1917                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
1918                may_swap = false;
1919        }
1920
1921        if (batch > nr_pages) {
1922                batch = nr_pages;
1923                goto retry;
1924        }
1925
1926        /*
1927         * Unlike in global OOM situations, memcg is not in a physical
1928         * memory shortage.  Allow dying and OOM-killed tasks to
1929         * bypass the last charges so that they can exit quickly and
1930         * free their memory.
1931         */
1932        if (unlikely(tsk_is_oom_victim(current) ||
1933                     fatal_signal_pending(current) ||
1934                     current->flags & PF_EXITING))
1935                goto force;
1936
1937        /*
1938         * Prevent unbounded recursion when reclaim operations need to
1939         * allocate memory. This might exceed the limits temporarily,
1940         * but we prefer facilitating memory reclaim and getting back
1941         * under the limit over triggering OOM kills in these cases.
1942         */
1943        if (unlikely(current->flags & PF_MEMALLOC))
1944                goto force;
1945
1946        if (unlikely(task_in_memcg_oom(current)))
1947                goto nomem;
1948
1949        if (!gfpflags_allow_blocking(gfp_mask))
1950                goto nomem;
1951
1952        memcg_memory_event(mem_over_limit, MEMCG_MAX);
1953
1954        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
1955                                                    gfp_mask, may_swap);
1956
1957        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1958                goto retry;
1959
1960        if (!drained) {
1961                drain_all_stock(mem_over_limit);
1962                drained = true;
1963                goto retry;
1964        }
1965
1966        if (gfp_mask & __GFP_NORETRY)
1967                goto nomem;
1968        /*
1969         * Even though the limit is exceeded at this point, reclaim
1970         * may have been able to free some pages.  Retry the charge
1971         * before killing the task.
1972         *
1973         * Only for regular pages, though: huge pages are rather
1974         * unlikely to succeed so close to the limit, and we fall back
1975         * to regular pages anyway in case of failure.
1976         */
1977        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
1978                goto retry;
1979        /*
1980         * At task move, charge accounts can be doubly counted. So, it's
1981         * better to wait until the end of task_move if something is going on.
1982         */
1983        if (mem_cgroup_wait_acct_move(mem_over_limit))
1984                goto retry;
1985
1986        if (nr_retries--)
1987                goto retry;
1988
1989        if (gfp_mask & __GFP_NOFAIL)
1990                goto force;
1991
1992        if (fatal_signal_pending(current))
1993                goto force;
1994
1995        memcg_memory_event(mem_over_limit, MEMCG_OOM);
1996
1997        mem_cgroup_oom(mem_over_limit, gfp_mask,
1998                       get_order(nr_pages * PAGE_SIZE));
1999nomem:
2000        if (!(gfp_mask & __GFP_NOFAIL))
2001                return -ENOMEM;
2002force:
2003        /*
2004         * The allocation either can't fail or will lead to more memory
2005         * being freed very soon.  Allow memory usage to go over the limit
2006         * temporarily by force charging it.
2007         */
2008        page_counter_charge(&memcg->memory, nr_pages);
2009        if (do_memsw_account())
2010                page_counter_charge(&memcg->memsw, nr_pages);
2011        css_get_many(&memcg->css, nr_pages);
2012
2013        return 0;
2014
2015done_restock:
2016        css_get_many(&memcg->css, batch);
2017        if (batch > nr_pages)
2018                refill_stock(memcg, batch - nr_pages);
2019
2020        /*
2021         * If the hierarchy is above the normal consumption range, schedule
2022         * reclaim on returning to userland.  We can perform reclaim here
2023         * if __GFP_RECLAIM but let's always punt for simplicity and so that
2024         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2025         * not recorded as it most likely matches current's and won't
2026         * change in the meantime.  As high limit is checked again before
2027         * reclaim, the cost of mismatch is negligible.
2028         */
2029        do {
2030                if (page_counter_read(&memcg->memory) > memcg->high) {
2031                        /* Don't bother a random interrupted task */
2032                        if (in_interrupt()) {
2033                                schedule_work(&memcg->high_work);
2034                                break;
2035                        }
2036                        current->memcg_nr_pages_over_high += batch;
2037                        set_notify_resume(current);
2038                        break;
2039                }
2040        } while ((memcg = parent_mem_cgroup(memcg)));
2041
2042        return 0;
2043}
2044
2045static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2046{
2047        if (mem_cgroup_is_root(memcg))
2048                return;
2049
2050        page_counter_uncharge(&memcg->memory, nr_pages);
2051        if (do_memsw_account())
2052                page_counter_uncharge(&memcg->memsw, nr_pages);
2053
2054        css_put_many(&memcg->css, nr_pages);
2055}
2056
2057static void lock_page_lru(struct page *page, int *isolated)
2058{
2059        struct zone *zone = page_zone(page);
2060
2061        spin_lock_irq(zone_lru_lock(zone));
2062        if (PageLRU(page)) {
2063                struct lruvec *lruvec;
2064
2065                lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2066                ClearPageLRU(page);
2067                del_page_from_lru_list(page, lruvec, page_lru(page));
2068                *isolated = 1;
2069        } else
2070                *isolated = 0;
2071}
2072
2073static void unlock_page_lru(struct page *page, int isolated)
2074{
2075        struct zone *zone = page_zone(page);
2076
2077        if (isolated) {
2078                struct lruvec *lruvec;
2079
2080                lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2081                VM_BUG_ON_PAGE(PageLRU(page), page);
2082                SetPageLRU(page);
2083                add_page_to_lru_list(page, lruvec, page_lru(page));
2084        }
2085        spin_unlock_irq(zone_lru_lock(zone));
2086}
2087
2088static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2089                          bool lrucare)
2090{
2091        int isolated;
2092
2093        VM_BUG_ON_PAGE(page->mem_cgroup, page);
2094
2095        /*
2096         * In some cases, e.g. SwapCache and FUSE (splice_buf->radixtree), the page
2097         * may already be on some other mem_cgroup's LRU.  Take care of it.
2098         */
2099        if (lrucare)
2100                lock_page_lru(page, &isolated);
2101
2102        /*
2103         * Nobody should be changing or seriously looking at
2104         * page->mem_cgroup at this point:
2105         *
2106         * - the page is uncharged
2107         *
2108         * - the page is off-LRU
2109         *
2110         * - an anonymous fault has exclusive page access, except for
2111         *   a locked page table
2112         *
2113         * - a page cache insertion, a swapin fault, or a migration
2114         *   have the page locked
2115         */
2116        page->mem_cgroup = memcg;
2117
2118        if (lrucare)
2119                unlock_page_lru(page, isolated);
2120}
2121
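/*
 * Example (editor's sketch): commit_charge() is the second half of the
 * public charging API defined later in this file.  A page fault path
 * uses it roughly like this (signatures as in mem_cgroup_try_charge(),
 * mem_cgroup_commit_charge() and mem_cgroup_cancel_charge() below):
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
 *		return VM_FAULT_OOM;
 *	// ... map the page; on failure instead call:
 *	//	mem_cgroup_cancel_charge(page, memcg, false);
 *	mem_cgroup_commit_charge(page, memcg, false, false);
 */
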
2122#ifndef CONFIG_SLOB
2123static int memcg_alloc_cache_id(void)
2124{
2125        int id, size;
2126        int err;
2127
2128        id = ida_simple_get(&memcg_cache_ida,
2129                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2130        if (id < 0)
2131                return id;
2132
2133        if (id < memcg_nr_cache_ids)
2134                return id;
2135
2136        /*
2137         * There's no space for the new id in memcg_caches arrays,
2138         * so we have to grow them.
2139         */
2140        down_write(&memcg_cache_ids_sem);
2141
2142        size = 2 * (id + 1);
2143        if (size < MEMCG_CACHES_MIN_SIZE)
2144                size = MEMCG_CACHES_MIN_SIZE;
2145        else if (size > MEMCG_CACHES_MAX_SIZE)
2146                size = MEMCG_CACHES_MAX_SIZE;
2147
2148        err = memcg_update_all_caches(size);
2149        if (!err)
2150                err = memcg_update_all_list_lrus(size);
2151        if (!err)
2152                memcg_nr_cache_ids = size;
2153
2154        up_write(&memcg_cache_ids_sem);
2155
2156        if (err) {
2157                ida_simple_remove(&memcg_cache_ida, id);
2158                return err;
2159        }
2160        return id;
2161}
2162
2163static void memcg_free_cache_id(int id)
2164{
2165        ida_simple_remove(&memcg_cache_ida, id);
2166}
2167
2168struct memcg_kmem_cache_create_work {
2169        struct mem_cgroup *memcg;
2170        struct kmem_cache *cachep;
2171        struct work_struct work;
2172};
2173
2174static void memcg_kmem_cache_create_func(struct work_struct *w)
2175{
2176        struct memcg_kmem_cache_create_work *cw =
2177                container_of(w, struct memcg_kmem_cache_create_work, work);
2178        struct mem_cgroup *memcg = cw->memcg;
2179        struct kmem_cache *cachep = cw->cachep;
2180
2181        memcg_create_kmem_cache(memcg, cachep);
2182
2183        css_put(&memcg->css);
2184        kfree(cw);
2185}
2186
2187/*
2188 * Enqueue the creation of a per-memcg kmem_cache.
2189 */
2190static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2191                                               struct kmem_cache *cachep)
2192{
2193        struct memcg_kmem_cache_create_work *cw;
2194
2195        cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2196        if (!cw)
2197                return;
2198
2199        css_get(&memcg->css);
2200
2201        cw->memcg = memcg;
2202        cw->cachep = cachep;
2203        INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2204
2205        queue_work(memcg_kmem_cache_wq, &cw->work);
2206}
2207
2208static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2209                                             struct kmem_cache *cachep)
2210{
2211        /*
2212         * We need to stop accounting when we kmalloc, because if the
2213         * corresponding kmalloc cache is not yet created, the first allocation
2214         * in __memcg_schedule_kmem_cache_create will recurse.
2215         *
2216         * However, it is better to enclose the whole function. Depending on
2217         * the debugging options enabled, INIT_WORK(), for instance, can
2218         * trigger an allocation. This too, will make us recurse. Because at
2219         * this point we can't allow ourselves back into memcg_kmem_get_cache,
2220         * the safest choice is to do it like this, wrapping the whole function.
2221         */
2222        current->memcg_kmem_skip_account = 1;
2223        __memcg_schedule_kmem_cache_create(memcg, cachep);
2224        current->memcg_kmem_skip_account = 0;
2225}
2226
2227static inline bool memcg_kmem_bypass(void)
2228{
2229        if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2230                return true;
2231        return false;
2232}
2233
2234/**
2235 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2236 * @cachep: the original global kmem cache
2237 *
2238 * Return the kmem_cache we're supposed to use for a slab allocation.
2239 * We try to use the current memcg's version of the cache.
2240 *
2241 * If the cache does not exist yet, if we are the first user of it, we
2242 * create it asynchronously in a workqueue and let the current allocation
2243 * go through with the original cache.
2244 *
2245 * This function takes a reference to the cache it returns to assure it
2246 * won't get destroyed while we are working with it. Once the caller is
2247 * done with it, memcg_kmem_put_cache() must be called to release the
2248 * reference.
2249 */
2250struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2251{
2252        struct mem_cgroup *memcg;
2253        struct kmem_cache *memcg_cachep;
2254        int kmemcg_id;
2255
2256        VM_BUG_ON(!is_root_cache(cachep));
2257
2258        if (memcg_kmem_bypass())
2259                return cachep;
2260
2261        if (current->memcg_kmem_skip_account)
2262                return cachep;
2263
2264        memcg = get_mem_cgroup_from_mm(current->mm);
2265        kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2266        if (kmemcg_id < 0)
2267                goto out;
2268
2269        memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2270        if (likely(memcg_cachep))
2271                return memcg_cachep;
2272
2273        /*
2274         * If we are in a safe context (can wait, and not in interrupt
2275         * context), we could be predictable and return right away.
2276         * This would guarantee that the allocation being performed
2277         * already belongs in the new cache.
2278         *
2279         * However, there are some clashes that can arise from locking.
2280         * For instance, because we acquire the slab_mutex while doing
2281         * memcg_create_kmem_cache, this means no further allocation
2282         * could happen with the slab_mutex held. So it's better to
2283         * defer everything.
2284         */
2285        memcg_schedule_kmem_cache_create(memcg, cachep);
2286out:
2287        css_put(&memcg->css);
2288        return cachep;
2289}
2290
2291/**
2292 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2293 * @cachep: the cache returned by memcg_kmem_get_cache
2294 */
2295void memcg_kmem_put_cache(struct kmem_cache *cachep)
2296{
2297        if (!is_root_cache(cachep))
2298                css_put(&cachep->memcg_params.memcg->css);
2299}
2300
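/*
 * Example (editor's sketch): the slab allocation hot path brackets every
 * accounted allocation with this pair, roughly:
 *
 *	s = memcg_kmem_get_cache(cachep);	// per-memcg clone or original
 *	obj = ...allocate from s...;
 *	memcg_kmem_put_cache(s);		// drop the css reference
 *
 * The real calls live in the slab_pre_alloc_hook()/slab_post_alloc_hook()
 * helpers in mm/slab.h.
 */
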
2301/**
2302 * memcg_kmem_charge_memcg: charge a kmem page
2303 * @page: page to charge
2304 * @gfp: reclaim mode
2305 * @order: allocation order
2306 * @memcg: memory cgroup to charge
2307 *
2308 * Returns 0 on success, an error code on failure.
2309 */
2310int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2311                            struct mem_cgroup *memcg)
2312{
2313        unsigned int nr_pages = 1 << order;
2314        struct page_counter *counter;
2315        int ret;
2316
2317        ret = try_charge(memcg, gfp, nr_pages);
2318        if (ret)
2319                return ret;
2320
2321        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2322            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2323                cancel_charge(memcg, nr_pages);
2324                return -ENOMEM;
2325        }
2326
2327        page->mem_cgroup = memcg;
2328
2329        return 0;
2330}
2331
2332/**
2333 * memcg_kmem_charge: charge a kmem page to the current memory cgroup
2334 * @page: page to charge
2335 * @gfp: reclaim mode
2336 * @order: allocation order
2337 *
2338 * Returns 0 on success, an error code on failure.
2339 */
2340int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2341{
2342        struct mem_cgroup *memcg;
2343        int ret = 0;
2344
2345        if (memcg_kmem_bypass())
2346                return 0;
2347
2348        memcg = get_mem_cgroup_from_mm(current->mm);
2349        if (!mem_cgroup_is_root(memcg)) {
2350                ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2351                if (!ret)
2352                        __SetPageKmemcg(page);
2353        }
2354        css_put(&memcg->css);
2355        return ret;
2356}
2357/**
2358 * memcg_kmem_uncharge: uncharge a kmem page
2359 * @page: page to uncharge
2360 * @order: allocation order
2361 */
2362void memcg_kmem_uncharge(struct page *page, int order)
2363{
2364        struct mem_cgroup *memcg = page->mem_cgroup;
2365        unsigned int nr_pages = 1 << order;
2366
2367        if (!memcg)
2368                return;
2369
2370        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2371
2372        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2373                page_counter_uncharge(&memcg->kmem, nr_pages);
2374
2375        page_counter_uncharge(&memcg->memory, nr_pages);
2376        if (do_memsw_account())
2377                page_counter_uncharge(&memcg->memsw, nr_pages);
2378
2379        page->mem_cgroup = NULL;
2380
2381        /* slab pages do not have PageKmemcg flag set */
2382        if (PageKmemcg(page))
2383                __ClearPageKmemcg(page);
2384
2385        css_put_many(&memcg->css, nr_pages);
2386}
2387#endif /* !CONFIG_SLOB */
2388
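/*
 * Example (editor's sketch): for page-sized kernel allocations the charge
 * and uncharge above are driven from the page allocator, roughly:
 *
 *	// __alloc_pages_nodemask(), for __GFP_ACCOUNT requests
 *	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
 *	    unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
 *		__free_pages(page, order);
 *		page = NULL;
 *	}
 *
 * with the matching memcg_kmem_uncharge() issued from the page freeing
 * path for pages that still carry PageKmemcg().
 */
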
2389#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2390
2391/*
2392 * Because tail pages are not marked as "used", set them. We're under
2393 * zone_lru_lock and migration entries are set up in all page mappings.
2394 */
2395void mem_cgroup_split_huge_fixup(struct page *head)
2396{
2397        int i;
2398
2399        if (mem_cgroup_disabled())
2400                return;
2401
2402        for (i = 1; i < HPAGE_PMD_NR; i++)
2403                head[i].mem_cgroup = head->mem_cgroup;
2404
2405        __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2406}
2407#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2408
2409#ifdef CONFIG_MEMCG_SWAP
2410/**
2411 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2412 * @entry: swap entry to be moved
2413 * @from:  mem_cgroup which the entry is moved from
2414 * @to:  mem_cgroup which the entry is moved to
2415 *
2416 * It succeeds only when the swap_cgroup's record for this entry is the same
2417 * as the mem_cgroup's id of @from.
2418 *
2419 * Returns 0 on success, -EINVAL on failure.
2420 *
2421 * The caller must have charged to @to, IOW, called page_counter_charge() for
2422 * both res and memsw, and called css_get().
2423 */
2424static int mem_cgroup_move_swap_account(swp_entry_t entry,
2425                                struct mem_cgroup *from, struct mem_cgroup *to)
2426{
2427        unsigned short old_id, new_id;
2428
2429        old_id = mem_cgroup_id(from);
2430        new_id = mem_cgroup_id(to);
2431
2432        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2433                mod_memcg_state(from, MEMCG_SWAP, -1);
2434                mod_memcg_state(to, MEMCG_SWAP, 1);
2435                return 0;
2436        }
2437        return -EINVAL;
2438}
2439#else
2440static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2441                                struct mem_cgroup *from, struct mem_cgroup *to)
2442{
2443        return -EINVAL;
2444}
2445#endif
2446
2447static DEFINE_MUTEX(memcg_limit_mutex);
2448
2449static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2450                                   unsigned long limit, bool memsw)
2451{
2452        bool enlarge = false;
2453        int ret;
2454        bool limits_invariant;
2455        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2456
2457        do {
2458                if (signal_pending(current)) {
2459                        ret = -EINTR;
2460                        break;
2461                }
2462
2463                mutex_lock(&memcg_limit_mutex);
2464                /*
2465                 * Make sure that the new limit (memsw or memory limit) doesn't
2466                 * break our basic invariant rule memory.limit <= memsw.limit.
2467                 */
2468                limits_invariant = memsw ? limit >= memcg->memory.limit :
2469                                           limit <= memcg->memsw.limit;
2470                if (!limits_invariant) {
2471                        mutex_unlock(&memcg_limit_mutex);
2472                        ret = -EINVAL;
2473                        break;
2474                }
2475                if (limit > counter->limit)
2476                        enlarge = true;
2477                ret = page_counter_limit(counter, limit);
2478                mutex_unlock(&memcg_limit_mutex);
2479
2480                if (!ret)
2481                        break;
2482
2483                if (!try_to_free_mem_cgroup_pages(memcg, 1,
2484                                        GFP_KERNEL, !memsw)) {
2485                        ret = -EBUSY;
2486                        break;
2487                }
2488        } while (true);
2489
2490        if (!ret && enlarge)
2491                memcg_oom_recover(memcg);
2492
2493        return ret;
2494}
2495
2496unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2497                                            gfp_t gfp_mask,
2498                                            unsigned long *total_scanned)
2499{
2500        unsigned long nr_reclaimed = 0;
2501        struct mem_cgroup_per_node *mz, *next_mz = NULL;
2502        unsigned long reclaimed;
2503        int loop = 0;
2504        struct mem_cgroup_tree_per_node *mctz;
2505        unsigned long excess;
2506        unsigned long nr_scanned;
2507
2508        if (order > 0)
2509                return 0;
2510
2511        mctz = soft_limit_tree_node(pgdat->node_id);
2512
2513        /*
2514         * Do not even bother to check the largest node if the root
2515         * is empty. Do it lockless to prevent lock bouncing. Races
2516         * are acceptable as soft limit is best effort anyway.
2517         */
2518        if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
2519                return 0;
2520
2521        /*
2522         * This loop can run for a while, especially if mem_cgroups continuously
2523         * keep exceeding their soft limit and putting the system under
2524         * pressure.
2525         */
2526        do {
2527                if (next_mz)
2528                        mz = next_mz;
2529                else
2530                        mz = mem_cgroup_largest_soft_limit_node(mctz);
2531                if (!mz)
2532                        break;
2533
2534                nr_scanned = 0;
2535                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2536                                                    gfp_mask, &nr_scanned);
2537                nr_reclaimed += reclaimed;
2538                *total_scanned += nr_scanned;
2539                spin_lock_irq(&mctz->lock);
2540                __mem_cgroup_remove_exceeded(mz, mctz);
2541
2542                /*
2543                 * If we failed to reclaim anything from this memory cgroup
2544                 * it is time to move on to the next cgroup
2545                 */
2546                next_mz = NULL;
2547                if (!reclaimed)
2548                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2549
2550                excess = soft_limit_excess(mz->memcg);
2551                /*
2552                 * One school of thought says that we should not add
2553                 * back the node to the tree if reclaim returns 0.
2554                 * But our reclaim could return 0 simply because, due
2555                 * to priority, we are exposing a smaller subset of
2556                 * memory to reclaim from. Consider this as a longer
2557                 * term TODO.
2558                 */
2559                /* If excess == 0, no tree ops */
2560                __mem_cgroup_insert_exceeded(mz, mctz, excess);
2561                spin_unlock_irq(&mctz->lock);
2562                css_put(&mz->memcg->css);
2563                loop++;
2564                /*
2565                 * Could not reclaim anything and there are no more
2566                 * mem cgroups to try or we seem to be looping without
2567                 * reclaiming anything.
2568                 */
2569                if (!nr_reclaimed &&
2570                        (next_mz == NULL ||
2571                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2572                        break;
2573        } while (!nr_reclaimed);
2574        if (next_mz)
2575                css_put(&next_mz->memcg->css);
2576        return nr_reclaimed;
2577}
2578
2579/*
2580 * Test whether @memcg has children, dead or alive.  Note that this
2581 * function doesn't care whether @memcg has use_hierarchy enabled and
2582 * returns %true if there are child csses according to the cgroup
2583 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
2584 */
2585static inline bool memcg_has_children(struct mem_cgroup *memcg)
2586{
2587        bool ret;
2588
2589        rcu_read_lock();
2590        ret = css_next_child(NULL, &memcg->css);
2591        rcu_read_unlock();
2592        return ret;
2593}
2594
2595/*
2596 * Reclaims as many pages from the given memcg as possible.
2597 *
2598 * Caller is responsible for holding css reference for memcg.
2599 */
2600static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2601{
2602        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2603
2604        /* we call try-to-free pages to make this cgroup empty */
2605        lru_add_drain_all();
2606        /* try to free all pages in this cgroup */
2607        while (nr_retries && page_counter_read(&memcg->memory)) {
2608                int progress;
2609
2610                if (signal_pending(current))
2611                        return -EINTR;
2612
2613                progress = try_to_free_mem_cgroup_pages(memcg, 1,
2614                                                        GFP_KERNEL, true);
2615                if (!progress) {
2616                        nr_retries--;
2617                        /* maybe some writeback is necessary */
2618                        congestion_wait(BLK_RW_ASYNC, HZ/10);
2619                }
2620
2621        }
2622
2623        return 0;
2624}
2625
2626static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2627                                            char *buf, size_t nbytes,
2628                                            loff_t off)
2629{
2630        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2631
2632        if (mem_cgroup_is_root(memcg))
2633                return -EINVAL;
2634        return mem_cgroup_force_empty(memcg) ?: nbytes;
2635}
2636
2637static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2638                                     struct cftype *cft)
2639{
2640        return mem_cgroup_from_css(css)->use_hierarchy;
2641}
2642
2643static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2644                                      struct cftype *cft, u64 val)
2645{
2646        int retval = 0;
2647        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2648        struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2649
2650        if (memcg->use_hierarchy == val)
2651                return 0;
2652
2653        /*
2654         * If parent's use_hierarchy is set, we can't make any modifications
2655         * in the child subtrees. If it is unset, then the change can
2656         * occur, provided the current cgroup has no children.
2657         *
2658         * For the root cgroup, parent_memcg is NULL; we allow the value to
2659         * be set if there are no children.
2660         */
2661        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2662                                (val == 1 || val == 0)) {
2663                if (!memcg_has_children(memcg))
2664                        memcg->use_hierarchy = val;
2665                else
2666                        retval = -EBUSY;
2667        } else
2668                retval = -EINVAL;
2669
2670        return retval;
2671}
2672
2673static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
2674{
2675        struct mem_cgroup *iter;
2676        int i;
2677
2678        memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
2679
2680        for_each_mem_cgroup_tree(iter, memcg) {
2681                for (i = 0; i < MEMCG_NR_STAT; i++)
2682                        stat[i] += memcg_page_state(iter, i);
2683        }
2684}
2685
2686static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
2687{
2688        struct mem_cgroup *iter;
2689        int i;
2690
2691        memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
2692
2693        for_each_mem_cgroup_tree(iter, memcg) {
2694                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
2695                        events[i] += memcg_sum_events(iter, i);
2696        }
2697}
2698
2699static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2700{
2701        unsigned long val = 0;
2702
2703        if (mem_cgroup_is_root(memcg)) {
2704                struct mem_cgroup *iter;
2705
2706                for_each_mem_cgroup_tree(iter, memcg) {
2707                        val += memcg_page_state(iter, MEMCG_CACHE);
2708                        val += memcg_page_state(iter, MEMCG_RSS);
2709                        if (swap)
2710                                val += memcg_page_state(iter, MEMCG_SWAP);
2711                }
2712        } else {
2713                if (!swap)
2714                        val = page_counter_read(&memcg->memory);
2715                else
2716                        val = page_counter_read(&memcg->memsw);
2717        }
2718        return val;
2719}
2720
2721enum {
2722        RES_USAGE,
2723        RES_LIMIT,
2724        RES_MAX_USAGE,
2725        RES_FAILCNT,
2726        RES_SOFT_LIMIT,
2727};
2728
2729static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2730                               struct cftype *cft)
2731{
2732        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2733        struct page_counter *counter;
2734
2735        switch (MEMFILE_TYPE(cft->private)) {
2736        case _MEM:
2737                counter = &memcg->memory;
2738                break;
2739        case _MEMSWAP:
2740                counter = &memcg->memsw;
2741                break;
2742        case _KMEM:
2743                counter = &memcg->kmem;
2744                break;
2745        case _TCP:
2746                counter = &memcg->tcpmem;
2747                break;
2748        default:
2749                BUG();
2750        }
2751
2752        switch (MEMFILE_ATTR(cft->private)) {
2753        case RES_USAGE:
2754                if (counter == &memcg->memory)
2755                        return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2756                if (counter == &memcg->memsw)
2757                        return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2758                return (u64)page_counter_read(counter) * PAGE_SIZE;
2759        case RES_LIMIT:
2760                return (u64)counter->limit * PAGE_SIZE;
2761        case RES_MAX_USAGE:
2762                return (u64)counter->watermark * PAGE_SIZE;
2763        case RES_FAILCNT:
2764                return counter->failcnt;
2765        case RES_SOFT_LIMIT:
2766                return (u64)memcg->soft_limit * PAGE_SIZE;
2767        default:
2768                BUG();
2769        }
2770}
2771
2772#ifndef CONFIG_SLOB
2773static int memcg_online_kmem(struct mem_cgroup *memcg)
2774{
2775        int memcg_id;
2776
2777        if (cgroup_memory_nokmem)
2778                return 0;
2779
2780        BUG_ON(memcg->kmemcg_id >= 0);
2781        BUG_ON(memcg->kmem_state);
2782
2783        memcg_id = memcg_alloc_cache_id();
2784        if (memcg_id < 0)
2785                return memcg_id;
2786
2787        static_branch_inc(&memcg_kmem_enabled_key);
2788        /*
2789         * A memory cgroup is considered kmem-online as soon as it gets
2790         * kmemcg_id. Setting the id after enabling static branching will
2791         * guarantee no one starts accounting before all call sites are
2792         * patched.
2793         */
2794        memcg->kmemcg_id = memcg_id;
2795        memcg->kmem_state = KMEM_ONLINE;
2796        INIT_LIST_HEAD(&memcg->kmem_caches);
2797
2798        return 0;
2799}
2800
2801static void memcg_offline_kmem(struct mem_cgroup *memcg)
2802{
2803        struct cgroup_subsys_state *css;
2804        struct mem_cgroup *parent, *child;
2805        int kmemcg_id;
2806
2807        if (memcg->kmem_state != KMEM_ONLINE)
2808                return;
2809        /*
2810         * Clear the online state before clearing memcg_caches array
2811         * entries. The slab_mutex in memcg_deactivate_kmem_caches()
2812         * guarantees that no cache will be created for this cgroup
2813         * after we are done (see memcg_create_kmem_cache()).
2814         */
2815        memcg->kmem_state = KMEM_ALLOCATED;
2816
2817        memcg_deactivate_kmem_caches(memcg);
2818
2819        kmemcg_id = memcg->kmemcg_id;
2820        BUG_ON(kmemcg_id < 0);
2821
2822        parent = parent_mem_cgroup(memcg);
2823        if (!parent)
2824                parent = root_mem_cgroup;
2825
2826        /*
2827         * Change kmemcg_id of this cgroup and all its descendants to the
2828         * parent's id, and then move all entries from this cgroup's list_lrus
2829         * to ones of the parent. After we have finished, all list_lrus
2830         * corresponding to this cgroup are guaranteed to remain empty. The
2831         * ordering is imposed by list_lru_node->lock taken by
2832         * memcg_drain_all_list_lrus().
2833         */
2834        rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
2835        css_for_each_descendant_pre(css, &memcg->css) {
2836                child = mem_cgroup_from_css(css);
2837                BUG_ON(child->kmemcg_id != kmemcg_id);
2838                child->kmemcg_id = parent->kmemcg_id;
2839                if (!memcg->use_hierarchy)
2840                        break;
2841        }
2842        rcu_read_unlock();
2843
2844        memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
2845
2846        memcg_free_cache_id(kmemcg_id);
2847}
2848
2849static void memcg_free_kmem(struct mem_cgroup *memcg)
2850{
2851        /* css_alloc() failed, offlining didn't happen */
2852        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
2853                memcg_offline_kmem(memcg);
2854
2855        if (memcg->kmem_state == KMEM_ALLOCATED) {
2856                memcg_destroy_kmem_caches(memcg);
2857                static_branch_dec(&memcg_kmem_enabled_key);
2858                WARN_ON(page_counter_read(&memcg->kmem));
2859        }
2860}
2861#else
2862static int memcg_online_kmem(struct mem_cgroup *memcg)
2863{
2864        return 0;
2865}
2866static void memcg_offline_kmem(struct mem_cgroup *memcg)
2867{
2868}
2869static void memcg_free_kmem(struct mem_cgroup *memcg)
2870{
2871}
2872#endif /* !CONFIG_SLOB */
2873
2874static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
2875                                   unsigned long limit)
2876{
2877        int ret;
2878
2879        mutex_lock(&memcg_limit_mutex);
2880        ret = page_counter_limit(&memcg->kmem, limit);
2881        mutex_unlock(&memcg_limit_mutex);
2882        return ret;
2883}
2884
2885static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
2886{
2887        int ret;
2888
2889        mutex_lock(&memcg_limit_mutex);
2890
2891        ret = page_counter_limit(&memcg->tcpmem, limit);
2892        if (ret)
2893                goto out;
2894
2895        if (!memcg->tcpmem_active) {
2896                /*
2897                 * The active flag needs to be written after the static_key
2898                 * update. This is what guarantees that the socket activation
2899                 * function is the last one to run. See mem_cgroup_sk_alloc()
2900                 * for details, and note that we don't mark any socket as
2901                 * belonging to this memcg until that flag is up.
2902                 *
2903                 * We need to do this, because static_keys will span multiple
2904                 * sites, but we can't control their order. If we mark a socket
2905                 * as accounted, but the accounting functions are not patched in
2906                 * yet, we'll lose accounting.
2907                 *
2908                 * We never race with the readers in mem_cgroup_sk_alloc(),
2909         * because when this value changes, the code to process it is not
2910                 * patched in yet.
2911                 */
2912                static_branch_inc(&memcg_sockets_enabled_key);
2913                memcg->tcpmem_active = true;
2914        }
2915out:
2916        mutex_unlock(&memcg_limit_mutex);
2917        return ret;
2918}
2919
2920/*
2921 * The user of this function is...
2922 * RES_LIMIT.
2923 */
2924static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2925                                char *buf, size_t nbytes, loff_t off)
2926{
2927        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2928        unsigned long nr_pages;
2929        int ret;
2930
2931        buf = strstrip(buf);
2932        ret = page_counter_memparse(buf, "-1", &nr_pages);
2933        if (ret)
2934                return ret;
2935
2936        switch (MEMFILE_ATTR(of_cft(of)->private)) {
2937        case RES_LIMIT:
2938                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2939                        ret = -EINVAL;
2940                        break;
2941                }
2942                switch (MEMFILE_TYPE(of_cft(of)->private)) {
2943                case _MEM:
2944                        ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
2945                        break;
2946                case _MEMSWAP:
2947                        ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
2948                        break;
2949                case _KMEM:
2950                        ret = memcg_update_kmem_limit(memcg, nr_pages);
2951                        break;
2952                case _TCP:
2953                        ret = memcg_update_tcp_limit(memcg, nr_pages);
2954                        break;
2955                }
2956                break;
2957        case RES_SOFT_LIMIT:
2958                memcg->soft_limit = nr_pages;
2959                ret = 0;
2960                break;
2961        }
2962        return ret ?: nbytes;
2963}
2964
2965static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
2966                                size_t nbytes, loff_t off)
2967{
2968        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2969        struct page_counter *counter;
2970
2971        switch (MEMFILE_TYPE(of_cft(of)->private)) {
2972        case _MEM:
2973                counter = &memcg->memory;
2974                break;
2975        case _MEMSWAP:
2976                counter = &memcg->memsw;
2977                break;
2978        case _KMEM:
2979                counter = &memcg->kmem;
2980                break;
2981        case _TCP:
2982                counter = &memcg->tcpmem;
2983                break;
2984        default:
2985                BUG();
2986        }
2987
2988        switch (MEMFILE_ATTR(of_cft(of)->private)) {
2989        case RES_MAX_USAGE:
2990                page_counter_reset_watermark(counter);
2991                break;
2992        case RES_FAILCNT:
2993                counter->failcnt = 0;
2994                break;
2995        default:
2996                BUG();
2997        }
2998
2999        return nbytes;
3000}
3001
3002static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3003                                        struct cftype *cft)
3004{
3005        return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3006}
3007
3008#ifdef CONFIG_MMU
3009static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3010                                        struct cftype *cft, u64 val)
3011{
3012        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3013
3014        if (val & ~MOVE_MASK)
3015                return -EINVAL;
3016
3017        /*
3018         * No kind of locking is needed in here, because ->can_attach() will
3019         * check this value once in the beginning of the process, and then carry
3020         * on with stale data. This means that changes to this value will only
3021         * affect task migrations starting after the change.
3022         */
3023        memcg->move_charge_at_immigrate = val;
3024        return 0;
3025}
3026#else
3027static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3028                                        struct cftype *cft, u64 val)
3029{
3030        return -ENOSYS;
3031}
3032#endif
3033
3034#ifdef CONFIG_NUMA
3035static int memcg_numa_stat_show(struct seq_file *m, void *v)
3036{
3037        struct numa_stat {
3038                const char *name;
3039                unsigned int lru_mask;
3040        };
3041
3042        static const struct numa_stat stats[] = {
3043                { "total", LRU_ALL },
3044                { "file", LRU_ALL_FILE },
3045                { "anon", LRU_ALL_ANON },
3046                { "unevictable", BIT(LRU_UNEVICTABLE) },
3047        };
3048        const struct numa_stat *stat;
3049        int nid;
3050        unsigned long nr;
3051        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3052
3053        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3054                nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3055                seq_printf(m, "%s=%lu", stat->name, nr);
3056                for_each_node_state(nid, N_MEMORY) {
3057                        nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3058                                                          stat->lru_mask);
3059                        seq_printf(m, " N%d=%lu", nid, nr);
3060                }
3061                seq_putc(m, '\n');
3062        }
3063
3064        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3065                struct mem_cgroup *iter;
3066
3067                nr = 0;
3068                for_each_mem_cgroup_tree(iter, memcg)
3069                        nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3070                seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3071                for_each_node_state(nid, N_MEMORY) {
3072                        nr = 0;
3073                        for_each_mem_cgroup_tree(iter, memcg)
3074                                nr += mem_cgroup_node_nr_lru_pages(
3075                                        iter, nid, stat->lru_mask);
3076                        seq_printf(m, " N%d=%lu", nid, nr);
3077                }
3078                seq_putc(m, '\n');
3079        }
3080
3081        return 0;
3082}
3083#endif /* CONFIG_NUMA */
3084
3085/* Universal VM events cgroup1 shows, original sort order */
3086unsigned int memcg1_events[] = {
3087        PGPGIN,
3088        PGPGOUT,
3089        PGFAULT,
3090        PGMAJFAULT,
3091};
3092
3093static const char *const memcg1_event_names[] = {
3094        "pgpgin",
3095        "pgpgout",
3096        "pgfault",
3097        "pgmajfault",
3098};
3099
3100static int memcg_stat_show(struct seq_file *m, void *v)
3101{
3102        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3103        unsigned long memory, memsw;
3104        struct mem_cgroup *mi;
3105        unsigned int i;
3106
3107        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3108        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3109
3110        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3111                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3112                        continue;
3113                seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3114                           memcg_page_state(memcg, memcg1_stats[i]) *
3115                           PAGE_SIZE);
3116        }
3117
3118        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3119                seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3120                           memcg_sum_events(memcg, memcg1_events[i]));
3121
3122        for (i = 0; i < NR_LRU_LISTS; i++)
3123                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3124                           mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
3125
3126        /* Hierarchical information */
3127        memory = memsw = PAGE_COUNTER_MAX;
3128        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3129                memory = min(memory, mi->memory.limit);
3130                memsw = min(memsw, mi->memsw.limit);
3131        }
3132        seq_printf(m, "hierarchical_memory_limit %llu\n",
3133                   (u64)memory * PAGE_SIZE);
3134        if (do_memsw_account())
3135                seq_printf(m, "hierarchical_memsw_limit %llu\n",
3136                           (u64)memsw * PAGE_SIZE);
3137
3138        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3139                unsigned long long val = 0;
3140
3141                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3142                        continue;
3143                for_each_mem_cgroup_tree(mi, memcg)
3144                        val += memcg_page_state(mi, memcg1_stats[i]) *
3145                        PAGE_SIZE;
3146                seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
3147        }
3148
3149        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
3150                unsigned long long val = 0;
3151
3152                for_each_mem_cgroup_tree(mi, memcg)
3153                        val += memcg_sum_events(mi, memcg1_events[i]);
3154                seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
3155        }
3156
3157        for (i = 0; i < NR_LRU_LISTS; i++) {
3158                unsigned long long val = 0;
3159
3160                for_each_mem_cgroup_tree(mi, memcg)
3161                        val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3162                seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3163        }
3164
3165#ifdef CONFIG_DEBUG_VM
3166        {
3167                pg_data_t *pgdat;
3168                struct mem_cgroup_per_node *mz;
3169                struct zone_reclaim_stat *rstat;
3170                unsigned long recent_rotated[2] = {0, 0};
3171                unsigned long recent_scanned[2] = {0, 0};
3172
3173                for_each_online_pgdat(pgdat) {
3174                        mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3175                        rstat = &mz->lruvec.reclaim_stat;
3176
3177                        recent_rotated[0] += rstat->recent_rotated[0];
3178                        recent_rotated[1] += rstat->recent_rotated[1];
3179                        recent_scanned[0] += rstat->recent_scanned[0];
3180                        recent_scanned[1] += rstat->recent_scanned[1];
3181                }
3182                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3183                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3184                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3185                seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3186        }
3187#endif
3188
3189        return 0;
3190}
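
/*
 * Illustrative userspace sketch (not part of this file): the legacy
 * memory.stat file produced by memcg_stat_show() above is a sequence of
 * "name value" lines, so it can be parsed with a simple scanf() loop.
 * The cgroup v1 mount point and group name below are assumptions; adjust
 * them for the system at hand.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* assumed path of a cgroup v1 memory cgroup */
        FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.stat", "r");
        char name[64];
        unsigned long long value;

        if (!f)
                return 1;

        /* each line is "<name> <value>", e.g. "hierarchical_memory_limit <bytes>" */
        while (fscanf(f, "%63s %llu", name, &value) == 2) {
                if (!strcmp(name, "hierarchical_memory_limit"))
                        printf("hierarchical limit: %llu bytes\n", value);
        }

        fclose(f);
        return 0;
}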
3191
3192static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3193                                      struct cftype *cft)
3194{
3195        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3196
3197        return mem_cgroup_swappiness(memcg);
3198}
3199
3200static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3201                                       struct cftype *cft, u64 val)
3202{
3203        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3204
3205        if (val > 100)
3206                return -EINVAL;
3207
3208        if (css->parent)
3209                memcg->swappiness = val;
3210        else
3211                vm_swappiness = val;
3212
3213        return 0;
3214}
3215
3216static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3217{
3218        struct mem_cgroup_threshold_ary *t;
3219        unsigned long usage;
3220        int i;
3221
3222        rcu_read_lock();
3223        if (!swap)
3224                t = rcu_dereference(memcg->thresholds.primary);
3225        else
3226                t = rcu_dereference(memcg->memsw_thresholds.primary);
3227
3228        if (!t)
3229                goto unlock;
3230
3231        usage = mem_cgroup_usage(memcg, swap);
3232
3233        /*
3234         * current_threshold points to the threshold just below or equal to
3235         * usage.  If that no longer holds, a threshold was crossed after the
3236         * last call of __mem_cgroup_threshold().
3237         */
3238        i = t->current_threshold;
3239
3240        /*
3241         * Iterate backward over the array of thresholds starting from
3242         * current_threshold and check if a threshold is crossed.
3243         * If none of the thresholds below usage is crossed, we read
3244         * only one element of the array here.
3245         */
3246        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3247                eventfd_signal(t->entries[i].eventfd, 1);
3248
3249        /* i = current_threshold + 1 */
3250        i++;
3251
3252        /*
3253         * Iterate forward over the array of thresholds starting from
3254         * current_threshold+1 and check if a threshold is crossed.
3255         * If none of the thresholds above usage is crossed, we read
3256         * only one element of the array here.
3257         */
3258        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3259                eventfd_signal(t->entries[i].eventfd, 1);
3260
3261        /* Update current_threshold */
3262        t->current_threshold = i - 1;
3263unlock:
3264        rcu_read_unlock();
3265}
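
/*
 * Illustrative sketch (not kernel code): a standalone model of the
 * current_threshold bookkeeping in __mem_cgroup_threshold() above.  The
 * thresholds[] values, the usage numbers and the notify() stub are made
 * up for the example; only the two scan loops mirror the real function.
 */
#include <stdio.h>

static const unsigned long thresholds[] = { 10, 20, 30, 40 };
static const int nr_thresholds = sizeof(thresholds) / sizeof(thresholds[0]);
static int current_threshold = -1;      /* index of last threshold <= usage */

static void notify(int idx)
{
        printf("threshold %lu crossed\n", thresholds[idx]);
}

static void model_threshold(unsigned long usage)
{
        int i = current_threshold;

        /* usage fell: signal thresholds that are now above usage, scanning backward */
        for (; i >= 0 && thresholds[i] > usage; i--)
                notify(i);
        i++;
        /* usage grew: signal thresholds now at or below usage, scanning forward */
        for (; i < nr_thresholds && thresholds[i] <= usage; i++)
                notify(i);
        current_threshold = i - 1;
}

int main(void)
{
        model_threshold(25);    /* signals 10 and 20 */
        model_threshold(15);    /* signals 20 again, crossed downward */
        return 0;
}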
3266
3267static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3268{
3269        while (memcg) {
3270                __mem_cgroup_threshold(memcg, false);
3271                if (do_memsw_account())
3272                        __mem_cgroup_threshold(memcg, true);
3273
3274                memcg = parent_mem_cgroup(memcg);
3275        }
3276}
3277
3278static int compare_thresholds(const void *a, const void *b)
3279{
3280        const struct mem_cgroup_threshold *_a = a;
3281        const struct mem_cgroup_threshold *_b = b;
3282
3283        if (_a->threshold > _b->threshold)
3284                return 1;
3285
3286        if (_a->threshold < _b->threshold)
3287                return -1;
3288
3289        return 0;
3290}
3291
3292static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3293{
3294        struct mem_cgroup_eventfd_list *ev;
3295
3296        spin_lock(&memcg_oom_lock);
3297
3298        list_for_each_entry(ev, &memcg->oom_notify, list)
3299                eventfd_signal(ev->eventfd, 1);
3300
3301        spin_unlock(&memcg_oom_lock);
3302        return 0;
3303}
3304
3305static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3306{
3307        struct mem_cgroup *iter;
3308
3309        for_each_mem_cgroup_tree(iter, memcg)
3310                mem_cgroup_oom_notify_cb(iter);
3311}
3312
3313static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3314        struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3315{
3316        struct mem_cgroup_thresholds *thresholds;
3317        struct mem_cgroup_threshold_ary *new;
3318        unsigned long threshold;
3319        unsigned long usage;
3320        int i, size, ret;
3321
3322        ret = page_counter_memparse(args, "-1", &threshold);
3323        if (ret)
3324                return ret;
3325
3326        mutex_lock(&memcg->thresholds_lock);
3327
3328        if (type == _MEM) {
3329                thresholds = &memcg->thresholds;
3330                usage = mem_cgroup_usage(memcg, false);
3331        } else if (type == _MEMSWAP) {
3332                thresholds = &memcg->memsw_thresholds;
3333                usage = mem_cgroup_usage(memcg, true);
3334        } else
3335                BUG();
3336
3337        /* Check if a threshold was crossed before adding a new one */
3338        if (thresholds->primary)
3339                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3340
3341        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3342
3343        /* Allocate memory for new array of thresholds */
3344        new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3345                        GFP_KERNEL);
3346        if (!new) {
3347                ret = -ENOMEM;
3348                goto unlock;
3349        }
3350        new->size = size;
3351
3352        /* Copy thresholds (if any) to new array */
3353        if (thresholds->primary) {
3354                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3355                                sizeof(struct mem_cgroup_threshold));
3356        }
3357
3358        /* Add new threshold */
3359        new->entries[size - 1].eventfd = eventfd;
3360        new->entries[size - 1].threshold = threshold;
3361
3362        /* Sort thresholds. Registering a new threshold isn't time-critical */
3363        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3364                        compare_thresholds, NULL);
3365
3366        /* Find current threshold */
3367        new->current_threshold = -1;
3368        for (i = 0; i < size; i++) {
3369                if (new->entries[i].threshold <= usage) {
3370                        /*
3371                         * new->current_threshold will not be used until
3372                         * rcu_assign_pointer(), so it's safe to increment
3373                         * it here.
3374                         */
3375                        ++new->current_threshold;
3376                } else
3377                        break;
3378        }
3379
3380        /* Free old spare buffer and save old primary buffer as spare */
3381        kfree(thresholds->spare);
3382        thresholds->spare = thresholds->primary;
3383
3384        rcu_assign_pointer(thresholds->primary, new);
3385
3386        /* To be sure that nobody uses thresholds */
3387        synchronize_rcu();
3388
3389unlock:
3390        mutex_unlock(&memcg->thresholds_lock);
3391
3392        return ret;
3393}
3394
3395static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3396        struct eventfd_ctx *eventfd, const char *args)
3397{
3398        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3399}
3400
3401static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3402        struct eventfd_ctx *eventfd, const char *args)
3403{
3404        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3405}
3406
3407static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3408        struct eventfd_ctx *eventfd, enum res_type type)
3409{
3410        struct mem_cgroup_thresholds *thresholds;
3411        struct mem_cgroup_threshold_ary *new;
3412        unsigned long usage;
3413        int i, j, size;
3414
3415        mutex_lock(&memcg->thresholds_lock);
3416
3417        if (type == _MEM) {
3418                thresholds = &memcg->thresholds;
3419                usage = mem_cgroup_usage(memcg, false);
3420        } else if (type == _MEMSWAP) {
3421                thresholds = &memcg->memsw_thresholds;
3422                usage = mem_cgroup_usage(memcg, true);
3423        } else
3424                BUG();
3425
3426        if (!thresholds->primary)
3427                goto unlock;
3428
3429        /* Check if a threshold was crossed before removing */
3430        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3431
3432        /* Calculate the new number of thresholds */
3433        size = 0;
3434        for (i = 0; i < thresholds->primary->size; i++) {
3435                if (thresholds->primary->entries[i].eventfd != eventfd)
3436                        size++;
3437        }
3438
3439        new = thresholds->spare;
3440
3441        /* Set thresholds array to NULL if we don't have thresholds */
3442        if (!size) {
3443                kfree(new);
3444                new = NULL;
3445                goto swap_buffers;
3446        }
3447
3448        new->size = size;
3449
3450        /* Copy thresholds and find current threshold */
3451        new->current_threshold = -1;
3452        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3453                if (thresholds->primary->entries[i].eventfd == eventfd)
3454                        continue;
3455
3456                new->entries[j] = thresholds->primary->entries[i];
3457                if (new->entries[j].threshold <= usage) {
3458                        /*
3459                         * new->current_threshold will not be used
3460                         * until rcu_assign_pointer(), so it's safe to increment
3461                         * it here.
3462                         */
3463                        ++new->current_threshold;
3464                }
3465                j++;
3466        }
3467
3468swap_buffers:
3469        /* Swap primary and spare array */
3470        thresholds->spare = thresholds->primary;
3471
3472        rcu_assign_pointer(thresholds->primary, new);
3473
3474        /* To be sure that nobody uses thresholds */
3475        synchronize_rcu();
3476
3477        /* If all events are unregistered, free the spare array */
3478        if (!new) {
3479                kfree(thresholds->spare);
3480                thresholds->spare = NULL;
3481        }
3482unlock:
3483        mutex_unlock(&memcg->thresholds_lock);
3484}
3485
3486static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3487        struct eventfd_ctx *eventfd)
3488{
3489        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3490}
3491
3492static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3493        struct eventfd_ctx *eventfd)
3494{
3495        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3496}
3497
3498static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3499        struct eventfd_ctx *eventfd, const char *args)
3500{
3501        struct mem_cgroup_eventfd_list *event;
3502
3503        event = kmalloc(sizeof(*event), GFP_KERNEL);
3504        if (!event)
3505                return -ENOMEM;
3506
3507        spin_lock(&memcg_oom_lock);
3508
3509        event->eventfd = eventfd;
3510        list_add(&event->list, &memcg->oom_notify);
3511
3512        /* already in OOM ? */
3513        if (memcg->under_oom)
3514                eventfd_signal(eventfd, 1);
3515        spin_unlock(&memcg_oom_lock);
3516
3517        return 0;
3518}
3519
3520static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3521        struct eventfd_ctx *eventfd)
3522{
3523        struct mem_cgroup_eventfd_list *ev, *tmp;
3524
3525        spin_lock(&memcg_oom_lock);
3526
3527        list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3528                if (ev->eventfd == eventfd) {
3529                        list_del(&ev->list);
3530                        kfree(ev);
3531                }
3532        }
3533
3534        spin_unlock(&memcg_oom_lock);
3535}
3536
3537static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3538{
3539        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3540
3541        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3542        seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3543        seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
3544        return 0;
3545}
3546
3547static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3548        struct cftype *cft, u64 val)
3549{
3550        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3551
3552        /* cannot be set on the root cgroup, and only 0 and 1 are allowed */
3553        if (!css->parent || !((val == 0) || (val == 1)))
3554                return -EINVAL;
3555
3556        memcg->oom_kill_disable = val;
3557        if (!val)
3558                memcg_oom_recover(memcg);
3559
3560        return 0;
3561}
3562
3563#ifdef CONFIG_CGROUP_WRITEBACK
3564
3565struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
3566{
3567        return &memcg->cgwb_list;
3568}
3569
3570static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3571{
3572        return wb_domain_init(&memcg->cgwb_domain, gfp);
3573}
3574
3575static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3576{
3577        wb_domain_exit(&memcg->cgwb_domain);
3578}
3579
3580static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3581{
3582        wb_domain_size_changed(&memcg->cgwb_domain);
3583}
3584
3585struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3586{
3587        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3588
3589        if (!memcg->css.parent)
3590                return NULL;
3591
3592        return &memcg->cgwb_domain;
3593}
3594
3595/**
3596 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3597 * @wb: bdi_writeback in question
3598 * @pfilepages: out parameter for number of file pages
3599 * @pheadroom: out parameter for number of allocatable pages according to memcg
3600 * @pdirty: out parameter for number of dirty pages
3601 * @pwriteback: out parameter for number of pages under writeback
3602 *
3603 * Determine the numbers of file, headroom, dirty, and writeback pages in
3604 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3605 * is a bit more involved.
3606 *
3607 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3608 * headroom is calculated as the lowest headroom of itself and the
3609 * ancestors.  Note that this doesn't consider the actual amount of
3610 * available memory in the system.  The caller should further cap
3611 * *@pheadroom accordingly.
3612 */
3613void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3614                         unsigned long *pheadroom, unsigned long *pdirty,
3615                         unsigned long *pwriteback)
3616{
3617        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3618        struct mem_cgroup *parent;
3619
3620        *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3621
3622        /* this should eventually include NR_UNSTABLE_NFS */
3623        *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3624        *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3625                                                     (1 << LRU_ACTIVE_FILE));
3626        *pheadroom = PAGE_COUNTER_MAX;
3627
3628        while ((parent = parent_mem_cgroup(memcg))) {
3629                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
3630                unsigned long used = page_counter_read(&memcg->memory);
3631
3632                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3633                memcg = parent;
3634        }
3635}
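
/*
 * Illustrative sketch (not kernel code): the hierarchical headroom rule
 * described in the comment above mem_cgroup_wb_stats(), i.e. the smaller
 * of limit and high minus the current usage, minimized over the memcg
 * and its ancestors, modelled on a plain array ordered from the memcg up
 * towards the root.  The struct and the numbers are made up for the
 * example.
 */
#include <stdio.h>

struct level {
        unsigned long limit;    /* hard limit, in pages */
        unsigned long high;     /* high watermark, in pages */
        unsigned long used;     /* current usage, in pages */
};

static unsigned long headroom(const struct level *lv, int depth)
{
        unsigned long room = ~0UL;
        int i;

        for (i = 0; i < depth; i++) {
                unsigned long ceiling = lv[i].limit < lv[i].high ?
                                        lv[i].limit : lv[i].high;
                unsigned long mine = ceiling > lv[i].used ?
                                        ceiling - lv[i].used : 0;

                if (mine < room)
                        room = mine;
        }
        return room;
}

int main(void)
{
        /* child: limit 256k pages, high 128k, used 100k; parent: no high limit */
        struct level path[] = {
                { 262144, 131072, 102400 },
                { 524288, ~0UL,   300000 },
        };

        printf("headroom: %lu pages\n", headroom(path, 2));
        return 0;
}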
3636
3637#else   /* CONFIG_CGROUP_WRITEBACK */
3638
3639static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3640{
3641        return 0;
3642}
3643
3644static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3645{
3646}
3647
3648static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3649{
3650}
3651
3652#endif  /* CONFIG_CGROUP_WRITEBACK */
3653
3654/*
3655 * DO NOT USE IN NEW FILES.
3656 *
3657 * "cgroup.event_control" implementation.
3658 *
3659 * This is way over-engineered.  It tries to support fully configurable
3660 * events for each user.  Such level of flexibility is completely
3661 * unnecessary especially in the light of the planned unified hierarchy.
3662 *
3663 * Please deprecate this and replace with something simpler if at all
3664 * possible.
3665 */
3666
3667/*
3668 * Unregister event and free resources.
3669 *
3670 * Gets called from workqueue.
3671 */
3672static void memcg_event_remove(struct work_struct *work)
3673{
3674        struct mem_cgroup_event *event =
3675                container_of(work, struct mem_cgroup_event, remove);
3676        struct mem_cgroup *memcg = event->memcg;
3677
3678        remove_wait_queue(event->wqh, &event->wait);
3679
3680        event->unregister_event(memcg, event->eventfd);
3681
3682        /* Notify userspace the event is going away. */
3683        eventfd_signal(event->eventfd, 1);
3684
3685        eventfd_ctx_put(event->eventfd);
3686        kfree(event);
3687        css_put(&memcg->css);
3688}
3689
3690/*
3691 * Gets called on EPOLLHUP on eventfd when user closes it.
3692 *
3693 * Called with wqh->lock held and interrupts disabled.
3694 */
3695static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
3696                            int sync, void *key)
3697{
3698        struct mem_cgroup_event *event =
3699                container_of(wait, struct mem_cgroup_event, wait);
3700        struct mem_cgroup *memcg = event->memcg;
3701        __poll_t flags = key_to_poll(key);
3702
3703        if (flags & EPOLLHUP) {
3704                /*
3705                 * If the event has been detached at cgroup removal, we
3706                 * can simply return knowing the other side will cleanup
3707                 * for us.
3708                 *
3709                 * We can't race against event freeing since the other
3710                 * side will require wqh->lock via remove_wait_queue(),
3711                 * which we hold.
3712                 */
3713                spin_lock(&memcg->event_list_lock);
3714                if (!list_empty(&event->list)) {
3715                        list_del_init(&event->list);
3716                        /*
3717                         * We are in atomic context, but memcg_event_remove()
3718                         * may sleep, so we have to call it from a workqueue.
3719                         */
3720                        schedule_work(&event->remove);
3721                }
3722                spin_unlock(&memcg->event_list_lock);
3723        }
3724
3725        return 0;
3726}
3727
3728static void memcg_event_ptable_queue_proc(struct file *file,
3729                wait_queue_head_t *wqh, poll_table *pt)
3730{
3731        struct mem_cgroup_event *event =
3732                container_of(pt, struct mem_cgroup_event, pt);
3733
3734        event->wqh = wqh;
3735        add_wait_queue(wqh, &event->wait);
3736}
3737
3738/*
3739 * DO NOT USE IN NEW FILES.
3740 *
3741 * Parse input and register new cgroup event handler.
3742 *
3743 * Input must be in format '<event_fd> <control_fd> <args>'.
3744 * Interpretation of args is defined by control file implementation.
3745 */
3746static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
3747                                         char *buf, size_t nbytes, loff_t off)
3748{
3749        struct cgroup_subsys_state *css = of_css(of);
3750        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3751        struct mem_cgroup_event *event;
3752        struct cgroup_subsys_state *cfile_css;
3753        unsigned int efd, cfd;
3754        struct fd efile;
3755        struct fd cfile;
3756        const char *name;
3757        char *endp;
3758        int ret;
3759
3760        buf = strstrip(buf);
3761
3762        efd = simple_strtoul(buf, &endp, 10);
3763        if (*endp != ' ')
3764                return -EINVAL;
3765        buf = endp + 1;
3766
3767        cfd = simple_strtoul(buf, &endp, 10);
3768        if ((*endp != ' ') && (*endp != '\0'))
3769                return -EINVAL;
3770        buf = endp + 1;
3771
3772        event = kzalloc(sizeof(*event), GFP_KERNEL);
3773        if (!event)
3774                return -ENOMEM;
3775
3776        event->memcg = memcg;
3777        INIT_LIST_HEAD(&event->list);
3778        init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
3779        init_waitqueue_func_entry(&event->wait, memcg_event_wake);
3780        INIT_WORK(&event->remove, memcg_event_remove);
3781
3782        efile = fdget(efd);
3783        if (!efile.file) {
3784                ret = -EBADF;
3785                goto out_kfree;
3786        }
3787
3788        event->eventfd = eventfd_ctx_fileget(efile.file);
3789        if (IS_ERR(event->eventfd)) {
3790                ret = PTR_ERR(event->eventfd);
3791                goto out_put_efile;
3792        }
3793
3794        cfile = fdget(cfd);
3795        if (!cfile.file) {
3796                ret = -EBADF;
3797                goto out_put_eventfd;
3798        }
3799
3800        /* the process needs read permission on the control file */
3801        /* AV: shouldn't we check that it's been opened for read instead? */
3802        ret = inode_permission(file_inode(cfile.file), MAY_READ);
3803        if (ret < 0)
3804                goto out_put_cfile;
3805
3806        /*
3807         * Determine the event callbacks and set them in @event.  This used
3808         * to be done via struct cftype but cgroup core no longer knows
3809         * about these events.  The following is crude but the whole thing
3810         * is for compatibility anyway.
3811         *
3812         * DO NOT ADD NEW FILES.
3813         */
3814        name = cfile.file->f_path.dentry->d_name.name;
3815
3816        if (!strcmp(name, "memory.usage_in_bytes")) {
3817                event->register_event = mem_cgroup_usage_register_event;
3818                event->unregister_event = mem_cgroup_usage_unregister_event;
3819        } else if (!strcmp(name, "memory.oom_control")) {
3820                event->register_event = mem_cgroup_oom_register_event;
3821                event->unregister_event = mem_cgroup_oom_unregister_event;
3822        } else if (!strcmp(name, "memory.pressure_level")) {
3823                event->register_event = vmpressure_register_event;
3824                event->unregister_event = vmpressure_unregister_event;
3825        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
3826                event->register_event = memsw_cgroup_usage_register_event;
3827                event->unregister_event = memsw_cgroup_usage_unregister_event;
3828        } else {
3829                ret = -EINVAL;
3830                goto out_put_cfile;
3831        }
3832
3833        /*
3834         * Verify that @cfile belongs to @css.  Also, remaining events are
3835         * automatically removed on cgroup destruction but the removal is
3836         * asynchronous, so take an extra ref on @css.
3837         */
3838        cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
3839                                               &memory_cgrp_subsys);
3840        ret = -EINVAL;
3841        if (IS_ERR(cfile_css))
3842                goto out_put_cfile;
3843        if (cfile_css != css) {
3844                css_put(cfile_css);
3845                goto out_put_cfile;
3846        }
3847
3848        ret = event->register_event(memcg, event->eventfd, buf);
3849        if (ret)
3850                goto out_put_css;
3851
3852        efile.file->f_op->poll(efile.file, &event->pt);
3853
3854        spin_lock(&memcg->event_list_lock);
3855        list_add(&event->list, &memcg->event_list);
3856        spin_unlock(&memcg->event_list_lock);
3857
3858        fdput(cfile);
3859        fdput(efile);
3860
3861        return nbytes;
3862
3863out_put_css:
3864        css_put(css);
3865out_put_cfile:
3866        fdput(cfile);
3867out_put_eventfd:
3868        eventfd_ctx_put(event->eventfd);
3869out_put_efile:
3870        fdput(efile);
3871out_kfree:
3872        kfree(event);
3873
3874        return ret;
3875}
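
/*
 * Illustrative userspace sketch (not part of this file): registering a
 * usage threshold through the legacy cgroup.event_control interface
 * parsed above.  The written string follows the documented format
 * "<event_fd> <control_fd> <args>", where the control file is
 * memory.usage_in_bytes and <args> is a threshold in bytes.  The cgroup
 * v1 mount point and group name are assumptions; error handling is kept
 * minimal.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        const char *cg = "/sys/fs/cgroup/memory/mygroup";      /* assumed path */
        char path[256], cmd[64];
        int efd, cfd, ecfd;
        uint64_t ticks;

        efd = eventfd(0, 0);                            /* <event_fd> */

        snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
        cfd = open(path, O_RDONLY);                     /* <control_fd> */

        snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
        ecfd = open(path, O_WRONLY);
        if (efd < 0 || cfd < 0 || ecfd < 0)
                return 1;

        /* register a 100M usage threshold on this cgroup */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 100ULL << 20);
        if (write(ecfd, cmd, strlen(cmd)) < 0)
                return 1;

        /* blocks until the threshold is crossed in either direction */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed %llu time(s)\n",
                       (unsigned long long)ticks);
        return 0;
}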
3876
3877static struct cftype mem_cgroup_legacy_files[] = {
3878        {
3879                .name = "usage_in_bytes",
3880                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3881                .read_u64 = mem_cgroup_read_u64,
3882        },
3883        {
3884                .name = "max_usage_in_bytes",
3885                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3886                .write = mem_cgroup_reset,
3887                .read_u64 = mem_cgroup_read_u64,
3888        },
3889        {
3890                .name = "limit_in_bytes",
3891                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3892                .write = mem_cgroup_write,
3893                .read_u64 = mem_cgroup_read_u64,
3894        },
3895        {
3896                .name = "soft_limit_in_bytes",
3897                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3898                .write = mem_cgroup_write,
3899                .read_u64 = mem_cgroup_read_u64,
3900        },
3901        {
3902                .name = "failcnt",
3903                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3904                .write = mem_cgroup_reset,
3905                .read_u64 = mem_cgroup_read_u64,
3906        },
3907        {
3908                .name = "stat",
3909                .seq_show = memcg_stat_show,
3910        },
3911        {
3912                .name = "force_empty",
3913                .write = mem_cgroup_force_empty_write,
3914        },
3915        {
3916                .name = "use_hierarchy",
3917                .write_u64 = mem_cgroup_hierarchy_write,
3918                .read_u64 = mem_cgroup_hierarchy_read,
3919        },
3920        {
3921                .name = "cgroup.event_control",         /* XXX: for compat */
3922                .write = memcg_write_event_control,
3923                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
3924        },
3925        {
3926                .name = "swappiness",
3927                .read_u64 = mem_cgroup_swappiness_read,
3928                .write_u64 = mem_cgroup_swappiness_write,
3929        },
3930        {
3931                .name = "move_charge_at_immigrate",
3932                .read_u64 = mem_cgroup_move_charge_read,
3933                .write_u64 = mem_cgroup_move_charge_write,
3934        },
3935        {
3936                .name = "oom_control",
3937                .seq_show = mem_cgroup_oom_control_read,
3938                .write_u64 = mem_cgroup_oom_control_write,
3939                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3940        },
3941        {
3942                .name = "pressure_level",
3943        },
3944#ifdef CONFIG_NUMA
3945        {
3946                .name = "numa_stat",
3947                .seq_show = memcg_numa_stat_show,
3948        },
3949#endif
3950        {
3951                .name = "kmem.limit_in_bytes",
3952                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
3953                .write = mem_cgroup_write,
3954                .read_u64 = mem_cgroup_read_u64,
3955        },
3956        {
3957                .name = "kmem.usage_in_bytes",
3958                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
3959                .read_u64 = mem_cgroup_read_u64,
3960        },
3961        {
3962                .name = "kmem.failcnt",
3963                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
3964                .write = mem_cgroup_reset,
3965                .read_u64 = mem_cgroup_read_u64,
3966        },
3967        {
3968                .name = "kmem.max_usage_in_bytes",
3969                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
3970                .write = mem_cgroup_reset,
3971                .read_u64 = mem_cgroup_read_u64,
3972        },
3973#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
3974        {
3975                .name = "kmem.slabinfo",
3976                .seq_start = memcg_slab_start,
3977                .seq_next = memcg_slab_next,
3978                .seq_stop = memcg_slab_stop,
3979                .seq_show = memcg_slab_show,
3980        },
3981#endif
3982        {
3983                .name = "kmem.tcp.limit_in_bytes",
3984                .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
3985                .write = mem_cgroup_write,
3986                .read_u64 = mem_cgroup_read_u64,
3987        },
3988        {
3989                .name = "kmem.tcp.usage_in_bytes",
3990                .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
3991                .read_u64 = mem_cgroup_read_u64,
3992        },
3993        {
3994                .name = "kmem.tcp.failcnt",
3995                .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
3996                .write = mem_cgroup_reset,
3997                .read_u64 = mem_cgroup_read_u64,
3998        },
3999        {
4000                .name = "kmem.tcp.max_usage_in_bytes",
4001                .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4002                .write = mem_cgroup_reset,
4003                .read_u64 = mem_cgroup_read_u64,
4004        },
4005        { },    /* terminate */
4006};
4007
4008/*
4009 * Private memory cgroup IDR
4010 *
4011 * Swap-out records and page cache shadow entries need to store memcg
4012 * references in constrained space, so we maintain an ID space that is
4013 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4014 * memory-controlled cgroups to 64k.
4015 *
4016 * However, there usually are many references to the offline CSS after
4017 * the cgroup has been destroyed, such as page cache or reclaimable
4018 * slab objects, that don't need to hang on to the ID. We want to keep
4019 * those dead CSS from occupying IDs, or we might quickly exhaust the
4020 * relatively small ID space and prevent the creation of new cgroups
4021 * even when there are far fewer than 64k cgroups - possibly none.
4022 *
4023 * Maintain a private 16-bit ID space for memcg, and allow the ID to
4024 * be freed and recycled when it's no longer needed, which is usually
4025 * when the CSS is offlined.
4026 *
4027 * The only exception to that are records of swapped out tmpfs/shmem
4028 * pages that need to be attributed to live ancestors on swapin. But
4029 * those references are manageable from userspace.
4030 */
4031
4032static DEFINE_IDR(mem_cgroup_idr);
4033
4034static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4035{
4036        VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4037        atomic_add(n, &memcg->id.ref);
4038}
4039
4040static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4041{
4042        VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4043        if (atomic_sub_and_test(n, &memcg->id.ref)) {
4044                idr_remove(&mem_cgroup_idr, memcg->id.id);
4045                memcg->id.id = 0;
4046
4047                /* Memcg ID pins CSS */
4048                css_put(&memcg->css);
4049        }
4050}
4051
4052static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4053{
4054        mem_cgroup_id_get_many(memcg, 1);
4055}
4056
4057static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4058{
4059        mem_cgroup_id_put_many(memcg, 1);
4060}
4061
4062/**
4063 * mem_cgroup_from_id - look up a memcg from a memcg id
4064 * @id: the memcg id to look up
4065 *
4066 * Caller must hold rcu_read_lock().
4067 */
4068struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4069{
4070        WARN_ON_ONCE(!rcu_read_lock_held());
4071        return idr_find(&mem_cgroup_idr, id);
4072}
4073
4074static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4075{
4076        struct mem_cgroup_per_node *pn;
4077        int tmp = node;
4078        /*
4079         * This routine is called against possible nodes.
4080         * But it's a BUG to call kmalloc() against an offline node.
4081         *
4082         * TODO: this routine can waste a lot of memory for nodes which will
4083         *       never be onlined. It's better to use a memory hotplug callback
4084         *       function.
4085         */
4086        if (!node_state(node, N_NORMAL_MEMORY))
4087                tmp = -1;
4088        pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4089        if (!pn)
4090                return 1;
4091
4092        pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4093        if (!pn->lruvec_stat_cpu) {
4094                kfree(pn);
4095                return 1;
4096        }
4097
4098        lruvec_init(&pn->lruvec);
4099        pn->usage_in_excess = 0;
4100        pn->on_tree = false;
4101        pn->memcg = memcg;
4102
4103        memcg->nodeinfo[node] = pn;
4104        return 0;
4105}
4106
4107static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4108{
4109        struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4110
4111        if (!pn)
4112                return;
4113
4114        free_percpu(pn->lruvec_stat_cpu);
4115        kfree(pn);
4116}
4117
4118static void __mem_cgroup_free(struct mem_cgroup *memcg)
4119{
4120        int node;
4121
4122        for_each_node(node)
4123                free_mem_cgroup_per_node_info(memcg, node);
4124        free_percpu(memcg->stat_cpu);
4125        kfree(memcg);
4126}
4127
4128static void mem_cgroup_free(struct mem_cgroup *memcg)
4129{
4130        memcg_wb_domain_exit(memcg);
4131        __mem_cgroup_free(memcg);
4132}
4133
4134static struct mem_cgroup *mem_cgroup_alloc(void)
4135{
4136        struct mem_cgroup *memcg;
4137        size_t size;
4138        int node;
4139
4140        size = sizeof(struct mem_cgroup);
4141        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4142
4143        memcg = kzalloc(size, GFP_KERNEL);
4144        if (!memcg)
4145                return NULL;
4146
4147        memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4148                                 1, MEM_CGROUP_ID_MAX,
4149                                 GFP_KERNEL);
4150        if (memcg->id.id < 0)
4151                goto fail;
4152
4153        memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4154        if (!memcg->stat_cpu)
4155                goto fail;
4156
4157        for_each_node(node)
4158                if (alloc_mem_cgroup_per_node_info(memcg, node))
4159                        goto fail;
4160
4161        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4162                goto fail;
4163
4164        INIT_WORK(&memcg->high_work, high_work_func);
4165        memcg->last_scanned_node = MAX_NUMNODES;
4166        INIT_LIST_HEAD(&memcg->oom_notify);
4167        mutex_init(&memcg->thresholds_lock);
4168        spin_lock_init(&memcg->move_lock);
4169        vmpressure_init(&memcg->vmpressure);
4170        INIT_LIST_HEAD(&memcg->event_list);
4171        spin_lock_init(&memcg->event_list_lock);
4172        memcg->socket_pressure = jiffies;
4173#ifndef CONFIG_SLOB
4174        memcg->kmemcg_id = -1;
4175#endif
4176#ifdef CONFIG_CGROUP_WRITEBACK
4177        INIT_LIST_HEAD(&memcg->cgwb_list);
4178#endif
4179        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4180        return memcg;
4181fail:
4182        if (memcg->id.id > 0)
4183                idr_remove(&mem_cgroup_idr, memcg->id.id);
4184        __mem_cgroup_free(memcg);
4185        return NULL;
4186}
4187
4188static struct cgroup_subsys_state * __ref
4189mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4190{
4191        struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4192        struct mem_cgroup *memcg;
4193        long error = -ENOMEM;
4194
4195        memcg = mem_cgroup_alloc();
4196        if (!memcg)
4197                return ERR_PTR(error);
4198
4199        memcg->high = PAGE_COUNTER_MAX;
4200        memcg->soft_limit = PAGE_COUNTER_MAX;
4201        if (parent) {
4202                memcg->swappiness = mem_cgroup_swappiness(parent);
4203                memcg->oom_kill_disable = parent->oom_kill_disable;
4204        }
4205        if (parent && parent->use_hierarchy) {
4206                memcg->use_hierarchy = true;
4207                page_counter_init(&memcg->memory, &parent->memory);
4208                page_counter_init(&memcg->swap, &parent->swap);
4209                page_counter_init(&memcg->memsw, &parent->memsw);
4210                page_counter_init(&memcg->kmem, &parent->kmem);
4211                page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4212        } else {
4213                page_counter_init(&memcg->memory, NULL);
4214                page_counter_init(&memcg->swap, NULL);
4215                page_counter_init(&memcg->memsw, NULL);
4216                page_counter_init(&memcg->kmem, NULL);
4217                page_counter_init(&memcg->tcpmem, NULL);
4218                /*
4219                 * Deeper hierarchy with use_hierarchy == false doesn't make
4220                 * much sense so let cgroup subsystem know about this
4221                 * unfortunate state in our controller.
4222                 */
4223                if (parent != root_mem_cgroup)
4224                        memory_cgrp_subsys.broken_hierarchy = true;
4225        }
4226
4227        /* The following stuff does not apply to the root */
4228        if (!parent) {
4229                root_mem_cgroup = memcg;
4230                return &memcg->css;
4231        }
4232
4233        error = memcg_online_kmem(memcg);
4234        if (error)
4235                goto fail;
4236
4237        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4238                static_branch_inc(&memcg_sockets_enabled_key);
4239
4240        return &memcg->css;
4241fail:
4242        mem_cgroup_free(memcg);
4243        return ERR_PTR(-ENOMEM);
4244}
4245
4246static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4247{
4248        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4249
4250        /* Online state pins memcg ID, memcg ID pins CSS */
4251        atomic_set(&memcg->id.ref, 1);
4252        css_get(css);
4253        return 0;
4254}
4255
4256static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4257{
4258        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4259        struct mem_cgroup_event *event, *tmp;
4260
4261        /*
4262         * Unregister events and notify userspace.
4263         * Notify userspace about cgroup removal only after rmdir of the
4264         * cgroup directory, to avoid a race between userspace and kernelspace.
4265         */
4266        spin_lock(&memcg->event_list_lock);
4267        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4268                list_del_init(&event->list);
4269                schedule_work(&event->remove);
4270        }
4271        spin_unlock(&memcg->event_list_lock);
4272
4273        memcg->low = 0;
4274
4275        memcg_offline_kmem(memcg);
4276        wb_memcg_offline(memcg);
4277
4278        mem_cgroup_id_put(memcg);
4279}
4280
4281static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4282{
4283        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4284
4285        invalidate_reclaim_iterators(memcg);
4286}
4287
4288static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4289{
4290        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4291
4292        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4293                static_branch_dec(&memcg_sockets_enabled_key);
4294
4295        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4296                static_branch_dec(&memcg_sockets_enabled_key);
4297
4298        vmpressure_cleanup(&memcg->vmpressure);
4299        cancel_work_sync(&memcg->high_work);
4300        mem_cgroup_remove_from_trees(memcg);
4301        memcg_free_kmem(memcg);
4302        mem_cgroup_free(memcg);
4303}
4304
4305/**
4306 * mem_cgroup_css_reset - reset the states of a mem_cgroup
4307 * @css: the target css
4308 *
4309 * Reset the states of the mem_cgroup associated with @css.  This is
4310 * invoked when the userland requests disabling on the default hierarchy
4311 * but the memcg is pinned through dependency.  The memcg should stop
4312 * applying policies and should revert to the vanilla state as it may be
4313 * made visible again.
4314 *
4315 * The current implementation only resets the essential configurations.
4316 * This needs to be expanded to cover all the visible parts.
4317 */
4318static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4319{
4320        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4321
4322        page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
4323        page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
4324        page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
4325        page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
4326        page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
4327        memcg->low = 0;
4328        memcg->high = PAGE_COUNTER_MAX;
4329        memcg->soft_limit = PAGE_COUNTER_MAX;
4330        memcg_wb_domain_size_changed(memcg);
4331}
4332
4333#ifdef CONFIG_MMU
4334/* Handlers for move charge at task migration. */
4335static int mem_cgroup_do_precharge(unsigned long count)
4336{
4337        int ret;
4338
4339        /* Try a single bulk charge without reclaim first, kswapd may wake */
4340        ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4341        if (!ret) {
4342                mc.precharge += count;
4343                return ret;
4344        }
4345
4346        /* Try charges one by one with reclaim, but do not retry */
4347        while (count--) {
4348                ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
4349                if (ret)
4350                        return ret;
4351                mc.precharge++;
4352                cond_resched();
4353        }
4354        return 0;
4355}
4356
4357union mc_target {
4358        struct page     *page;
4359        swp_entry_t     ent;
4360};
4361
4362enum mc_target_type {
4363        MC_TARGET_NONE = 0,
4364        MC_TARGET_PAGE,
4365        MC_TARGET_SWAP,
4366        MC_TARGET_DEVICE,
4367};
4368
4369static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4370                                                unsigned long addr, pte_t ptent)
4371{
4372        struct page *page = _vm_normal_page(vma, addr, ptent, true);
4373
4374        if (!page || !page_mapped(page))
4375                return NULL;
4376        if (PageAnon(page)) {
4377                if (!(mc.flags & MOVE_ANON))
4378                        return NULL;
4379        } else {
4380                if (!(mc.flags & MOVE_FILE))
4381                        return NULL;
4382        }
4383        if (!get_page_unless_zero(page))
4384                return NULL;
4385
4386        return page;
4387}
4388
4389#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
4390static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4391                        pte_t ptent, swp_entry_t *entry)
4392{
4393        struct page *page = NULL;
4394        swp_entry_t ent = pte_to_swp_entry(ptent);
4395
4396        if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4397                return NULL;
4398
4399        /*
4400         * Handle MEMORY_DEVICE_PRIVATE entries, which are ZONE_DEVICE pages
4401         * belonging to a device; because they are not accessible by the CPU,
4402         * they are stored as special swap entries in the CPU page table.
4403         */
4404        if (is_device_private_entry(ent)) {
4405                page = device_private_entry_to_page(ent);
4406                /*
4407                 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
4408                 * a refcount of 1 when free (unlike a normal page)
4409                 */
4410                if (!page_ref_add_unless(page, 1, 1))
4411                        return NULL;
4412                return page;
4413        }
4414
4415        /*
4416         * Because lookup_swap_cache() updates some statistics counter,
4417         * we call find_get_page() with swapper_space directly.
4418         */
4419        page = find_get_page(swap_address_space(ent), swp_offset(ent));
4420        if (do_memsw_account())
4421                entry->val = ent.val;
4422
4423        return page;
4424}
4425#else
4426static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4427                        pte_t ptent, swp_entry_t *entry)
4428{
4429        return NULL;
4430}
4431#endif
4432
4433static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4434                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
4435{
4436        struct page *page = NULL;
4437        struct address_space *mapping;
4438        pgoff_t pgoff;
4439
4440        if (!vma->vm_file) /* anonymous vma */
4441                return NULL;
4442        if (!(mc.flags & MOVE_FILE))
4443                return NULL;
4444
4445        mapping = vma->vm_file->f_mapping;
4446        pgoff = linear_page_index(vma, addr);
4447
4448        /* the page is moved even if it's not RSS of this task (page-faulted). */
4449#ifdef CONFIG_SWAP
4450        /* shmem/tmpfs may report page out on swap: account for that too. */
4451        if (shmem_mapping(mapping)) {
4452                page = find_get_entry(mapping, pgoff);
4453                if (radix_tree_exceptional_entry(page)) {
4454                        swp_entry_t swp = radix_to_swp_entry(page);
4455                        if (do_memsw_account())
4456                                *entry = swp;
4457                        page = find_get_page(swap_address_space(swp),
4458                                             swp_offset(swp));
4459                }
4460        } else
4461                page = find_get_page(mapping, pgoff);
4462#else
4463        page = find_get_page(mapping, pgoff);
4464#endif
4465        return page;
4466}
4467
4468/**
4469 * mem_cgroup_move_account - move account of the page
4470 * @page: the page
4471 * @compound: charge the page as compound or small page
4472 * @from: mem_cgroup which the page is moved from.
4473 * @to: mem_cgroup which the page is moved to. @from != @to.
4474 *
4475 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
4476 *
4477 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4478 * from old cgroup.
4479 */
4480static int mem_cgroup_move_account(struct page *page,
4481                                   bool compound,
4482                                   struct mem_cgroup *from,
4483                                   struct mem_cgroup *to)
4484{
4485        unsigned long flags;
4486        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4487        int ret;
4488        bool anon;
4489
4490        VM_BUG_ON(from == to);
4491        VM_BUG_ON_PAGE(PageLRU(page), page);
4492        VM_BUG_ON(compound && !PageTransHuge(page));
4493
4494        /*
4495         * Prevent mem_cgroup_migrate() from looking at
4496         * page->mem_cgroup of its source page while we change it.
4497         */
4498        ret = -EBUSY;
4499        if (!trylock_page(page))
4500                goto out;
4501
4502        ret = -EINVAL;
4503        if (page->mem_cgroup != from)
4504                goto out_unlock;
4505
4506        anon = PageAnon(page);
4507
4508        spin_lock_irqsave(&from->move_lock, flags);
4509
4510        if (!anon && page_mapped(page)) {
4511                __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4512                __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4513        }
4514
4515        /*
4516         * move_lock is grabbed above and the caller set from->moving_account,
4517         * so mod_memcg_page_state() will serialize updates to PageDirty.
4518         * The mapping should therefore be stable for dirty pages.
4519         */
4520        if (!anon && PageDirty(page)) {
4521                struct address_space *mapping = page_mapping(page);
4522
4523                if (mapping_cap_account_dirty(mapping)) {
4524                        __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4525                        __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
4526                }
4527        }
4528
4529        if (PageWriteback(page)) {
4530                __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4531                __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
4532        }
4533
4534        /*
4535         * It is safe to change page->mem_cgroup here because the page
4536         * is referenced, charged, and isolated - we can't race with
4537         * uncharging, charging, migration, or LRU putback.
4538         */
4539
4540        /* caller should have done css_get */
4541        page->mem_cgroup = to;
4542        spin_unlock_irqrestore(&from->move_lock, flags);
4543
4544        ret = 0;
4545
4546        local_irq_disable();
4547        mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4548        memcg_check_events(to, page);
4549        mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4550        memcg_check_events(from, page);
4551        local_irq_enable();
4552out_unlock:
4553        unlock_page(page);
4554out:
4555        return ret;
4556}
4557
4558/**
4559 * get_mctgt_type - get target type of moving charge
4560 * @vma: the vma the pte to be checked belongs to
4561 * @addr: the address corresponding to the pte to be checked
4562 * @ptent: the pte to be checked
4563 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4564 *
4565 * Returns
4566 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4567 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4568 *     move charge. If @target is not NULL, the page is stored in target->page
4569 *     with an extra refcount taken (callers should handle it).
4570 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4571 *     target for charge migration. If @target is not NULL, the entry is stored
4572 *     in target->ent.
4573 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PUBLIC
4574 *     or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and thus not on the LRU).
4575 *     For now such a page is charged like a regular page would be, as for all
4576 *     intents and purposes it is just special memory taking the place of a
4577 *     regular page.
4578 *
4579 *     See Documentation/vm/hmm.txt and include/linux/hmm.h
4580 *
4581 * Called with pte lock held.
4582 */
4583
4584static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4585                unsigned long addr, pte_t ptent, union mc_target *target)
4586{
4587        struct page *page = NULL;
4588        enum mc_target_type ret = MC_TARGET_NONE;
4589        swp_entry_t ent = { .val = 0 };
4590
4591        if (pte_present(ptent))
4592                page = mc_handle_present_pte(vma, addr, ptent);
4593        else if (is_swap_pte(ptent))
4594                page = mc_handle_swap_pte(vma, ptent, &ent);
4595        else if (pte_none(ptent))
4596                page = mc_handle_file_pte(vma, addr, ptent, &ent);
4597
4598        if (!page && !ent.val)
4599                return ret;
4600        if (page) {
4601                /*
4602                 * Do only a loose check without serialization.
4603                 * mem_cgroup_move_account() checks whether the page is valid
4604                 * under LRU exclusion.
4605                 */
4606                if (page->mem_cgroup == mc.from) {
4607                        ret = MC_TARGET_PAGE;
4608                        if (is_device_private_page(page) ||
4609                            is_device_public_page(page))
4610                                ret = MC_TARGET_DEVICE;
4611                        if (target)
4612                                target->page = page;
4613                }
4614                if (!ret || !target)
4615                        put_page(page);
4616        }
4617        /*
4618         * There is a swap entry and a page doesn't exist or isn't charged.
4619         * But we cannot move a tail-page in a THP.
4620         */
4621        if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
4622            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4623                ret = MC_TARGET_SWAP;
4624                if (target)
4625                        target->ent = ent;
4626        }
4627        return ret;
4628}
4629
4630#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4631/*
4632 * We don't consider PMD mapped swapping or file mapped pages because THP does
4633 * not support them for now.
4634 * Caller should make sure that pmd_trans_huge(pmd) is true.
4635 */
4636static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4637                unsigned long addr, pmd_t pmd, union mc_target *target)
4638{
4639        struct page *page = NULL;
4640        enum mc_target_type ret = MC_TARGET_NONE;
4641
4642        if (unlikely(is_swap_pmd(pmd))) {
4643                VM_BUG_ON(thp_migration_supported() &&
4644                                  !is_pmd_migration_entry(pmd));
4645                return ret;
4646        }
4647        page = pmd_page(pmd);
4648        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4649        if (!(mc.flags & MOVE_ANON))
4650                return ret;
4651        if (page->mem_cgroup == mc.from) {
4652                ret = MC_TARGET_PAGE;
4653                if (target) {
4654                        get_page(page);
4655                        target->page = page;
4656                }
4657        }
4658        return ret;
4659}
4660#else
4661static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4662                unsigned long addr, pmd_t pmd, union mc_target *target)
4663{
4664        return MC_TARGET_NONE;
4665}
4666#endif
4667
4668static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4669                                        unsigned long addr, unsigned long end,
4670                                        struct mm_walk *walk)
4671{
4672        struct vm_area_struct *vma = walk->vma;
4673        pte_t *pte;
4674        spinlock_t *ptl;
4675
4676        ptl = pmd_trans_huge_lock(pmd, vma);
4677        if (ptl) {
4678                /*
4679                 * Note there cannot be MC_TARGET_DEVICE for now as we do not
4680                 * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
4681                 * MEMORY_DEVICE_PRIVATE but this might change.
4682                 */
4683                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4684                        mc.precharge += HPAGE_PMD_NR;
4685                spin_unlock(ptl);
4686                return 0;
4687        }
4688
4689        if (pmd_trans_unstable(pmd))
4690                return 0;
4691        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4692        for (; addr != end; pte++, addr += PAGE_SIZE)
4693                if (get_mctgt_type(vma, addr, *pte, NULL))
4694                        mc.precharge++; /* increment precharge temporarily */
4695        pte_unmap_unlock(pte - 1, ptl);
4696        cond_resched();
4697
4698        return 0;
4699}
4700
4701static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4702{
4703        unsigned long precharge;
4704
4705        struct mm_walk mem_cgroup_count_precharge_walk = {
4706                .pmd_entry = mem_cgroup_count_precharge_pte_range,
4707                .mm = mm,
4708        };
4709        down_read(&mm->mmap_sem);
4710        walk_page_range(0, mm->highest_vm_end,
4711                        &mem_cgroup_count_precharge_walk);
4712        up_read(&mm->mmap_sem);
4713
4714        precharge = mc.precharge;
4715        mc.precharge = 0;
4716
4717        return precharge;
4718}
4719
4720static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4721{
4722        unsigned long precharge = mem_cgroup_count_precharge(mm);
4723
4724        VM_BUG_ON(mc.moving_task);
4725        mc.moving_task = current;
4726        return mem_cgroup_do_precharge(precharge);
4727}
4728
4729/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4730static void __mem_cgroup_clear_mc(void)
4731{
4732        struct mem_cgroup *from = mc.from;
4733        struct mem_cgroup *to = mc.to;
4734
4735        /* we must uncharge all the leftover precharges from mc.to */
4736        if (mc.precharge) {
4737                cancel_charge(mc.to, mc.precharge);
4738                mc.precharge = 0;
4739        }
4740        /*
4741         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4742         * we must uncharge here.
4743         */
4744        if (mc.moved_charge) {
4745                cancel_charge(mc.from, mc.moved_charge);
4746                mc.moved_charge = 0;
4747        }
4748        /* we must fixup refcnts and charges */
4749        if (mc.moved_swap) {
4750                /* uncharge swap account from the old cgroup */
4751                if (!mem_cgroup_is_root(mc.from))
4752                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
4753
4754                mem_cgroup_id_put_many(mc.from, mc.moved_swap);
4755
4756                /*
4757                 * we charged both to->memory and to->memsw, so we
4758                 * should uncharge to->memory.
4759                 */
4760                if (!mem_cgroup_is_root(mc.to))
4761                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
4762
4763                mem_cgroup_id_get_many(mc.to, mc.moved_swap);
4764                css_put_many(&mc.to->css, mc.moved_swap);
4765
4766                mc.moved_swap = 0;
4767        }
4768        memcg_oom_recover(from);
4769        memcg_oom_recover(to);
4770        wake_up_all(&mc.waitq);
4771}
4772
4773static void mem_cgroup_clear_mc(void)
4774{
4775        struct mm_struct *mm = mc.mm;
4776
4777        /*
4778         * we must clear moving_task before waking up waiters at the end of
4779         * task migration.
4780         */
4781        mc.moving_task = NULL;
4782        __mem_cgroup_clear_mc();
4783        spin_lock(&mc.lock);
4784        mc.from = NULL;
4785        mc.to = NULL;
4786        mc.mm = NULL;
4787        spin_unlock(&mc.lock);
4788
4789        mmput(mm);
4790}
4791
4792static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4793{
4794        struct cgroup_subsys_state *css;
4795        struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
4796        struct mem_cgroup *from;
4797        struct task_struct *leader, *p;
4798        struct mm_struct *mm;
4799        unsigned long move_flags;
4800        int ret = 0;
4801
4802        /* charge immigration isn't supported on the default hierarchy */
4803        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
4804                return 0;
4805
4806        /*
4807         * Multi-process migrations only happen on the default hierarchy
4808         * where charge immigration is not used.  Perform charge
4809         * immigration if @tset contains a leader and whine if there are
4810         * multiple.
4811         */
4812        p = NULL;
4813        cgroup_taskset_for_each_leader(leader, css, tset) {
4814                WARN_ON_ONCE(p);
4815                p = leader;
4816                memcg = mem_cgroup_from_css(css);
4817        }
4818        if (!p)
4819                return 0;
4820
4821        /*
4822         * We are now committed to this value, whatever it is. Changes in this
4823         * tunable will only affect upcoming migrations, not the current one.
4824         * So we need to save it and keep using it.
4825         */
4826        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
4827        if (!move_flags)
4828                return 0;
4829
4830        from = mem_cgroup_from_task(p);
4831
4832        VM_BUG_ON(from == memcg);
4833
4834        mm = get_task_mm(p);
4835        if (!mm)
4836                return 0;
4837        /* We move charges only when we move the owner of the mm */
4838        if (mm->owner == p) {
4839                VM_BUG_ON(mc.from);
4840                VM_BUG_ON(mc.to);
4841                VM_BUG_ON(mc.precharge);
4842                VM_BUG_ON(mc.moved_charge);
4843                VM_BUG_ON(mc.moved_swap);
4844
4845                spin_lock(&mc.lock);
4846                mc.mm = mm;
4847                mc.from = from;
4848                mc.to = memcg;
4849                mc.flags = move_flags;
4850                spin_unlock(&mc.lock);
4851                /* We set mc.moving_task later */
4852
4853                ret = mem_cgroup_precharge_mc(mm);
4854                if (ret)
4855                        mem_cgroup_clear_mc();
4856        } else {
4857                mmput(mm);
4858        }
4859        return ret;
4860}
4861
4862static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
4863{
4864        if (mc.to)
4865                mem_cgroup_clear_mc();
4866}
4867
4868static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4869                                unsigned long addr, unsigned long end,
4870                                struct mm_walk *walk)
4871{
4872        int ret = 0;
4873        struct vm_area_struct *vma = walk->vma;
4874        pte_t *pte;
4875        spinlock_t *ptl;
4876        enum mc_target_type target_type;
4877        union mc_target target;
4878        struct page *page;
4879
4880        ptl = pmd_trans_huge_lock(pmd, vma);
4881        if (ptl) {
4882                if (mc.precharge < HPAGE_PMD_NR) {
4883                        spin_unlock(ptl);
4884                        return 0;
4885                }
4886                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
4887                if (target_type == MC_TARGET_PAGE) {
4888                        page = target.page;
4889                        if (!isolate_lru_page(page)) {
4890                                if (!mem_cgroup_move_account(page, true,
4891                                                             mc.from, mc.to)) {
4892                                        mc.precharge -= HPAGE_PMD_NR;
4893                                        mc.moved_charge += HPAGE_PMD_NR;
4894                                }
4895                                putback_lru_page(page);
4896                        }
4897                        put_page(page);
4898                } else if (target_type == MC_TARGET_DEVICE) {
4899                        page = target.page;
4900                        if (!mem_cgroup_move_account(page, true,
4901                                                     mc.from, mc.to)) {
4902                                mc.precharge -= HPAGE_PMD_NR;
4903                                mc.moved_charge += HPAGE_PMD_NR;
4904                        }
4905                        put_page(page);
4906                }
4907                spin_unlock(ptl);
4908                return 0;
4909        }
4910
4911        if (pmd_trans_unstable(pmd))
4912                return 0;
4913retry:
4914        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4915        for (; addr != end; addr += PAGE_SIZE) {
4916                pte_t ptent = *(pte++);
4917                bool device = false;
4918                swp_entry_t ent;
4919
4920                if (!mc.precharge)
4921                        break;
4922
4923                switch (get_mctgt_type(vma, addr, ptent, &target)) {
4924                case MC_TARGET_DEVICE:
4925                        device = true;
4926                        /* fall through */
4927                case MC_TARGET_PAGE:
4928                        page = target.page;
4929                        /*
4930                         * We can have a part of the split pmd here. Moving it
4931                         * can be done but it would be too convoluted, so simply
4932                         * ignore such a partial THP and keep it in the original
4933                         * memcg. There should be somebody mapping the head.
4934                         */
4935                        if (PageTransCompound(page))
4936                                goto put;
4937                        if (!device && isolate_lru_page(page))
4938                                goto put;
4939                        if (!mem_cgroup_move_account(page, false,
4940                                                mc.from, mc.to)) {
4941                                mc.precharge--;
4942                                /* we uncharge from mc.from later. */
4943                                mc.moved_charge++;
4944                        }
4945                        if (!device)
4946                                putback_lru_page(page);
4947put:                    /* get_mctgt_type() gets the page */
4948                        put_page(page);
4949                        break;
4950                case MC_TARGET_SWAP:
4951                        ent = target.ent;
4952                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
4953                                mc.precharge--;
4954                                /* we fixup refcnts and charges later. */
4955                                mc.moved_swap++;
4956                        }
4957                        break;
4958                default:
4959                        break;
4960                }
4961        }
4962        pte_unmap_unlock(pte - 1, ptl);
4963        cond_resched();
4964
4965        if (addr != end) {
4966                /*
4967                 * We have consumed all precharges we got in can_attach().
4968                 * We try to charge one by one, but don't do any additional
4969                 * charges to mc.to if we have failed to charge once in the
4970                 * attach() phase.
4971                 */
4972                ret = mem_cgroup_do_precharge(1);
4973                if (!ret)
4974                        goto retry;
4975        }
4976
4977        return ret;
4978}
4979
4980static void mem_cgroup_move_charge(void)
4981{
4982        struct mm_walk mem_cgroup_move_charge_walk = {
4983                .pmd_entry = mem_cgroup_move_charge_pte_range,
4984                .mm = mc.mm,
4985        };
4986
4987        lru_add_drain_all();
4988        /*
4989         * Signal lock_page_memcg() to take the memcg's move_lock
4990         * while we're moving its pages to another memcg. Then wait
4991         * for already started RCU-only updates to finish.
4992         */
4993        atomic_inc(&mc.from->moving_account);
4994        synchronize_rcu();
4995retry:
4996        if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
4997                /*
4998                 * Someone who is holding the mmap_sem might be waiting on the
4999                 * waitq. So we cancel all extra charges, wake up all waiters,
5000                 * and retry. Because we cancel precharges, we might not be able
5001                 * to move enough charges, but moving charge is a best-effort
5002                 * feature anyway, so it wouldn't be a big problem.
5003                 */
5004                __mem_cgroup_clear_mc();
5005                cond_resched();
5006                goto retry;
5007        }
5008        /*
5009         * When we have consumed all precharges and failed to do an
5010         * additional charge, the page walk just aborts.
5011         */
5012        walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
5013
5014        up_read(&mc.mm->mmap_sem);
5015        atomic_dec(&mc.from->moving_account);
5016}
5017
5018static void mem_cgroup_move_task(void)
5019{
5020        if (mc.to) {
5021                mem_cgroup_move_charge();
5022                mem_cgroup_clear_mc();
5023        }
5024}
5025#else   /* !CONFIG_MMU */
5026static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5027{
5028        return 0;
5029}
5030static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5031{
5032}
5033static void mem_cgroup_move_task(void)
5034{
5035}
5036#endif
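
/*
 * Note (illustrative, not taken from this file): the charge-moving machinery
 * above is only armed on the legacy hierarchy, where userspace opts in via
 * the memory.move_charge_at_immigrate bitmask (bit 0 = anonymous pages,
 * bit 1 = file pages, matching MOVE_ANON/MOVE_FILE), e.g.:
 *
 *	echo 3 > memory.move_charge_at_immigrate
 *
 * See the "Move charges at task migration" section of
 * Documentation/cgroup-v1/memory.txt.
 */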
5037
5038/*
5039 * Cgroup retains root cgroups across [un]mount cycles making it necessary
5040 * to verify whether we're attached to the default hierarchy on each mount
5041 * attempt.
5042 */
5043static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5044{
5045        /*
5046         * use_hierarchy is forced on the default hierarchy.  cgroup core
5047         * guarantees that @root doesn't have any children, so turning it
5048         * on for the root memcg is enough.
5049         */
5050        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5051                root_mem_cgroup->use_hierarchy = true;
5052        else
5053                root_mem_cgroup->use_hierarchy = false;
5054}
5055
5056static u64 memory_current_read(struct cgroup_subsys_state *css,
5057                               struct cftype *cft)
5058{
5059        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5060
5061        return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5062}
5063
5064static int memory_low_show(struct seq_file *m, void *v)
5065{
5066        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5067        unsigned long low = READ_ONCE(memcg->low);
5068
5069        if (low == PAGE_COUNTER_MAX)
5070                seq_puts(m, "max\n");
5071        else
5072                seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5073
5074        return 0;
5075}
5076
5077static ssize_t memory_low_write(struct kernfs_open_file *of,
5078                                char *buf, size_t nbytes, loff_t off)
5079{
5080        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5081        unsigned long low;
5082        int err;
5083
5084        buf = strstrip(buf);
5085        err = page_counter_memparse(buf, "max", &low);
5086        if (err)
5087                return err;
5088
5089        memcg->low = low;
5090
5091        return nbytes;
5092}
5093
5094static int memory_high_show(struct seq_file *m, void *v)
5095{
5096        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5097        unsigned long high = READ_ONCE(memcg->high);
5098
5099        if (high == PAGE_COUNTER_MAX)
5100                seq_puts(m, "max\n");
5101        else
5102                seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5103
5104        return 0;
5105}
5106
5107static ssize_t memory_high_write(struct kernfs_open_file *of,
5108                                 char *buf, size_t nbytes, loff_t off)
5109{
5110        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5111        unsigned long nr_pages;
5112        unsigned long high;
5113        int err;
5114
5115        buf = strstrip(buf);
5116        err = page_counter_memparse(buf, "max", &high);
5117        if (err)
5118                return err;
5119
5120        memcg->high = high;
5121
5122        nr_pages = page_counter_read(&memcg->memory);
5123        if (nr_pages > high)
5124                try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5125                                             GFP_KERNEL, true);
5126
5127        memcg_wb_domain_size_changed(memcg);
5128        return nbytes;
5129}
5130
5131static int memory_max_show(struct seq_file *m, void *v)
5132{
5133        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5134        unsigned long max = READ_ONCE(memcg->memory.limit);
5135
5136        if (max == PAGE_COUNTER_MAX)
5137                seq_puts(m, "max\n");
5138        else
5139                seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5140
5141        return 0;
5142}
5143
5144static ssize_t memory_max_write(struct kernfs_open_file *of,
5145                                char *buf, size_t nbytes, loff_t off)
5146{
5147        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5148        unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5149        bool drained = false;
5150        unsigned long max;
5151        int err;
5152
5153        buf = strstrip(buf);
5154        err = page_counter_memparse(buf, "max", &max);
5155        if (err)
5156                return err;
5157
5158        xchg(&memcg->memory.limit, max);
5159
5160        for (;;) {
5161                unsigned long nr_pages = page_counter_read(&memcg->memory);
5162
5163                if (nr_pages <= max)
5164                        break;
5165
5166                if (signal_pending(current)) {
5167                        err = -EINTR;
5168                        break;
5169                }
5170
5171                if (!drained) {
5172                        drain_all_stock(memcg);
5173                        drained = true;
5174                        continue;
5175                }
5176
5177                if (nr_reclaims) {
5178                        if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5179                                                          GFP_KERNEL, true))
5180                                nr_reclaims--;
5181                        continue;
5182                }
5183
5184                memcg_memory_event(memcg, MEMCG_OOM);
5185                if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5186                        break;
5187        }
5188
5189        memcg_wb_domain_size_changed(memcg);
5190        return nbytes;
5191}
5192
5193static int memory_events_show(struct seq_file *m, void *v)
5194{
5195        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5196
5197        seq_printf(m, "low %lu\n",
5198                   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5199        seq_printf(m, "high %lu\n",
5200                   atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5201        seq_printf(m, "max %lu\n",
5202                   atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5203        seq_printf(m, "oom %lu\n",
5204                   atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5205        seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
5206
5207        return 0;
5208}
5209
5210static int memory_stat_show(struct seq_file *m, void *v)
5211{
5212        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5213        unsigned long stat[MEMCG_NR_STAT];
5214        unsigned long events[NR_VM_EVENT_ITEMS];
5215        int i;
5216
5217        /*
5218         * Provide statistics on the state of the memory subsystem as
5219         * well as cumulative event counters that show past behavior.
5220         *
5221         * This list is ordered following a combination of these gradients:
5222         * 1) generic big picture -> specifics and details
5223         * 2) reflecting userspace activity -> reflecting kernel heuristics
5224         *
5225         * Current memory state:
5226         */
5227
5228        tree_stat(memcg, stat);
5229        tree_events(memcg, events);
5230
5231        seq_printf(m, "anon %llu\n",
5232                   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
5233        seq_printf(m, "file %llu\n",
5234                   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
5235        seq_printf(m, "kernel_stack %llu\n",
5236                   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
5237        seq_printf(m, "slab %llu\n",
5238                   (u64)(stat[NR_SLAB_RECLAIMABLE] +
5239                         stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5240        seq_printf(m, "sock %llu\n",
5241                   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
5242
5243        seq_printf(m, "shmem %llu\n",
5244                   (u64)stat[NR_SHMEM] * PAGE_SIZE);
5245        seq_printf(m, "file_mapped %llu\n",
5246                   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
5247        seq_printf(m, "file_dirty %llu\n",
5248                   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
5249        seq_printf(m, "file_writeback %llu\n",
5250                   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
5251
5252        for (i = 0; i < NR_LRU_LISTS; i++) {
5253                struct mem_cgroup *mi;
5254                unsigned long val = 0;
5255
5256                for_each_mem_cgroup_tree(mi, memcg)
5257                        val += mem_cgroup_nr_lru_pages(mi, BIT(i));
5258                seq_printf(m, "%s %llu\n",
5259                           mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
5260        }
5261
5262        seq_printf(m, "slab_reclaimable %llu\n",
5263                   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5264        seq_printf(m, "slab_unreclaimable %llu\n",
5265                   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5266
5267        /* Accumulated memory events */
5268
5269        seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
5270        seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
5271
5272        seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
5273        seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
5274                   events[PGSCAN_DIRECT]);
5275        seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
5276                   events[PGSTEAL_DIRECT]);
5277        seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
5278        seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
5279        seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
5280        seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
5281
5282        seq_printf(m, "workingset_refault %lu\n",
5283                   stat[WORKINGSET_REFAULT]);
5284        seq_printf(m, "workingset_activate %lu\n",
5285                   stat[WORKINGSET_ACTIVATE]);
5286        seq_printf(m, "workingset_nodereclaim %lu\n",
5287                   stat[WORKINGSET_NODERECLAIM]);
5288
5289        return 0;
5290}
5291
5292static struct cftype memory_files[] = {
5293        {
5294                .name = "current",
5295                .flags = CFTYPE_NOT_ON_ROOT,
5296                .read_u64 = memory_current_read,
5297        },
5298        {
5299                .name = "low",
5300                .flags = CFTYPE_NOT_ON_ROOT,
5301                .seq_show = memory_low_show,
5302                .write = memory_low_write,
5303        },
5304        {
5305                .name = "high",
5306                .flags = CFTYPE_NOT_ON_ROOT,
5307                .seq_show = memory_high_show,
5308                .write = memory_high_write,
5309        },
5310        {
5311                .name = "max",
5312                .flags = CFTYPE_NOT_ON_ROOT,
5313                .seq_show = memory_max_show,
5314                .write = memory_max_write,
5315        },
5316        {
5317                .name = "events",
5318                .flags = CFTYPE_NOT_ON_ROOT,
5319                .file_offset = offsetof(struct mem_cgroup, events_file),
5320                .seq_show = memory_events_show,
5321        },
5322        {
5323                .name = "stat",
5324                .flags = CFTYPE_NOT_ON_ROOT,
5325                .seq_show = memory_stat_show,
5326        },
5327        { }     /* terminate */
5328};
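
/*
 * Illustrative usage of the cgroup2 interface files declared above, assuming
 * the unified hierarchy is mounted at /sys/fs/cgroup and "foo" is a child
 * cgroup (both are assumptions for the example):
 *
 *	echo 512M > /sys/fs/cgroup/foo/memory.high
 *	echo 1G   > /sys/fs/cgroup/foo/memory.max
 *	echo max  > /sys/fs/cgroup/foo/memory.low
 *	cat /sys/fs/cgroup/foo/memory.current
 *
 * The write handlers accept "max" as well as memparse() suffixes (K/M/G)
 * via page_counter_memparse().
 */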
5329
5330struct cgroup_subsys memory_cgrp_subsys = {
5331        .css_alloc = mem_cgroup_css_alloc,
5332        .css_online = mem_cgroup_css_online,
5333        .css_offline = mem_cgroup_css_offline,
5334        .css_released = mem_cgroup_css_released,
5335        .css_free = mem_cgroup_css_free,
5336        .css_reset = mem_cgroup_css_reset,
5337        .can_attach = mem_cgroup_can_attach,
5338        .cancel_attach = mem_cgroup_cancel_attach,
5339        .post_attach = mem_cgroup_move_task,
5340        .bind = mem_cgroup_bind,
5341        .dfl_cftypes = memory_files,
5342        .legacy_cftypes = mem_cgroup_legacy_files,
5343        .early_init = 0,
5344};
5345
5346/**
5347 * mem_cgroup_low - check if memory consumption is below the normal range
5348 * @root: the top ancestor of the sub-tree being checked
5349 * @memcg: the memory cgroup to check
5350 *
5351 * Returns %true if memory consumption of @memcg, and that of all
5352 * ancestors up to (but not including) @root, is below the normal range.
5353 *
5354 * @root is exclusive; it is never low when looked at directly and isn't
5355 * checked when traversing the hierarchy.
5356 *
5357 * Excluding @root enables using memory.low to prioritize memory usage
5358 * between cgroups within a subtree of the hierarchy that is limited by
5359 * memory.high or memory.max.
5360 *
5361 * For example, given cgroup A with children B and C:
5362 *
5363 *    A
5364 *   / \
5365 *  B   C
5366 *
5367 * and
5368 *
5369 *  1. A/memory.current > A/memory.high
5370 *  2. A/B/memory.current < A/B/memory.low
5371 *  3. A/C/memory.current >= A/C/memory.low
5372 *
5373 * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
5374 * should reclaim from 'C' until 'A' is no longer high or until we can
5375 * no longer reclaim from 'C'.  If 'A', i.e. @root, isn't excluded by
5376 * mem_cgroup_low when reclaiming from 'A', then 'B' won't be considered
5377 * low and we will reclaim indiscriminately from both 'B' and 'C'.
5378 */
5379bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5380{
5381        if (mem_cgroup_disabled())
5382                return false;
5383
5384        if (!root)
5385                root = root_mem_cgroup;
5386        if (memcg == root)
5387                return false;
5388
5389        for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
5390                if (page_counter_read(&memcg->memory) >= memcg->low)
5391                        return false;
5392        }
5393
5394        return true;
5395}
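
/*
 * Illustrative sketch of the caller side (modelled loosely on the memcg
 * iteration in mm/vmscan.c; the scan_control fields named here are
 * assumptions for the example, not definitions from this file):
 *
 *	memcg = mem_cgroup_iter(root, NULL, NULL);
 *	do {
 *		if (mem_cgroup_low(root, memcg)) {
 *			if (!sc->memcg_low_reclaim) {
 *				sc->memcg_low_skipped = 1;
 *				continue;
 *			}
 *			memcg_memory_event(memcg, MEMCG_LOW);
 *		}
 *		// ... shrink this memcg's LRU lists ...
 *	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
 */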
5396
5397/**
5398 * mem_cgroup_try_charge - try charging a page
5399 * @page: page to charge
5400 * @mm: mm context of the victim
5401 * @gfp_mask: reclaim mode
5402 * @memcgp: charged memcg return
5403 * @compound: charge the page as compound or small page
5404 *
5405 * Try to charge @page to the memcg that @mm belongs to, reclaiming
5406 * pages according to @gfp_mask if necessary.
5407 *
5408 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5409 * Otherwise, an error code is returned.
5410 *
5411 * After page->mapping has been set up, the caller must finalize the
5412 * charge with mem_cgroup_commit_charge(), or abort the transaction
5413 * with mem_cgroup_cancel_charge() in case page instantiation fails.
5414 */
5415int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5416                          gfp_t gfp_mask, struct mem_cgroup **memcgp,
5417                          bool compound)
5418{
5419        struct mem_cgroup *memcg = NULL;
5420        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5421        int ret = 0;
5422
5423        if (mem_cgroup_disabled())
5424                goto out;
5425
5426        if (PageSwapCache(page)) {
5427                /*
5428                 * Every swap fault against a single page tries to charge the
5429                 * page, so bail as early as possible.  shmem_unuse() encounters
5430                 * already charged pages, too.  The USED bit is protected by
5431                 * the page lock, which serializes swap cache removal, which
5432                 * in turn serializes uncharging.
5433                 */
5434                VM_BUG_ON_PAGE(!PageLocked(page), page);
5435                if (compound_head(page)->mem_cgroup)
5436                        goto out;
5437
5438                if (do_swap_account) {
5439                        swp_entry_t ent = { .val = page_private(page), };
5440                        unsigned short id = lookup_swap_cgroup_id(ent);
5441
5442                        rcu_read_lock();
5443                        memcg = mem_cgroup_from_id(id);
5444                        if (memcg && !css_tryget_online(&memcg->css))
5445                                memcg = NULL;
5446                        rcu_read_unlock();
5447                }
5448        }
5449
5450        if (!memcg)
5451                memcg = get_mem_cgroup_from_mm(mm);
5452
5453        ret = try_charge(memcg, gfp_mask, nr_pages);
5454
5455        css_put(&memcg->css);
5456out:
5457        *memcgp = memcg;
5458        return ret;
5459}
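
/*
 * Illustrative sketch (a hypothetical caller, not kernel code): the charge
 * is a two-step transaction, with cancel as the error path.  The helper
 * example_install_page() is made up here and stands in for whatever sets up
 * page->mapping.
 *
 *	static int example_charge(struct page *page, struct mm_struct *mm)
 *	{
 *		struct mem_cgroup *memcg;
 *		int ret;
 *
 *		ret = mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false);
 *		if (ret)
 *			return ret;
 *
 *		if (example_install_page(page)) {
 *			mem_cgroup_cancel_charge(page, memcg, false);
 *			return -EBUSY;
 *		}
 *
 *		mem_cgroup_commit_charge(page, memcg, false, false);
 *		return 0;
 *	}
 */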
5460
5461/**
5462 * mem_cgroup_commit_charge - commit a page charge
5463 * @page: page to charge
5464 * @memcg: memcg to charge the page to
5465 * @lrucare: page might be on LRU already
5466 * @compound: charge the page as compound or small page
5467 *
5468 * Finalize a charge transaction started by mem_cgroup_try_charge(),
5469 * after page->mapping has been set up.  This must happen atomically
5470 * as part of the page instantiation, i.e. under the page table lock
5471 * for anonymous pages, under the page lock for page and swap cache.
5472 *
5473 * In addition, the page must not be on the LRU during the commit, to
5474 * prevent racing with task migration.  If it might be, use @lrucare.
5475 *
5476 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5477 */
5478void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5479                              bool lrucare, bool compound)
5480{
5481        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5482
5483        VM_BUG_ON_PAGE(!page->mapping, page);
5484        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5485
5486        if (mem_cgroup_disabled())
5487                return;
5488        /*
5489         * Swap faults will attempt to charge the same page multiple
5490         * times.  But reuse_swap_page() might have removed the page
5491         * from swapcache already, so we can't check PageSwapCache().
5492         */
5493        if (!memcg)
5494                return;
5495
5496        commit_charge(page, memcg, lrucare);
5497
5498        local_irq_disable();
5499        mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5500        memcg_check_events(memcg, page);
5501        local_irq_enable();
5502
5503        if (do_memsw_account() && PageSwapCache(page)) {
5504                swp_entry_t entry = { .val = page_private(page) };
5505                /*
5506                 * The swap entry might not get freed for a long time,
5507                 * let's not wait for it.  The page already received a
5508                 * memory+swap charge, drop the swap entry duplicate.
5509                 */
5510                mem_cgroup_uncharge_swap(entry, nr_pages);
5511        }
5512}
5513
5514/**
5515 * mem_cgroup_cancel_charge - cancel a page charge
5516 * @page: page to charge
5517 * @memcg: memcg to charge the page to
5518 * @compound: charge the page as compound or small page
5519 *
5520 * Cancel a charge transaction started by mem_cgroup_try_charge().
5521 */
5522void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5523                bool compound)
5524{
5525        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5526
5527        if (mem_cgroup_disabled())
5528                return;
5529        /*
5530         * Swap faults will attempt to charge the same page multiple
5531         * times.  But reuse_swap_page() might have removed the page
5532         * from swapcache already, so we can't check PageSwapCache().
5533         */
5534        if (!memcg)
5535                return;
5536
5537        cancel_charge(memcg, nr_pages);
5538}
5539
5540struct uncharge_gather {
5541        struct mem_cgroup *memcg;
5542        unsigned long pgpgout;
5543        unsigned long nr_anon;
5544        unsigned long nr_file;
5545        unsigned long nr_kmem;
5546        unsigned long nr_huge;
5547        unsigned long nr_shmem;
5548        struct page *dummy_page;
5549};
5550
5551static inline void uncharge_gather_clear(struct uncharge_gather *ug)
5552{
5553        memset(ug, 0, sizeof(*ug));
5554}
5555
5556static void uncharge_batch(const struct uncharge_gather *ug)
5557{
5558        unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
5559        unsigned long flags;
5560
5561        if (!mem_cgroup_is_root(ug->memcg)) {
5562                page_counter_uncharge(&ug->memcg->memory, nr_pages);
5563                if (do_memsw_account())
5564                        page_counter_uncharge(&ug->memcg->memsw, nr_pages);
5565                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
5566                        page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
5567                memcg_oom_recover(ug->memcg);
5568        }
5569
5570        local_irq_save(flags);
5571        __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
5572        __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
5573        __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
5574        __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
5575        __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
5576        __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
5577        memcg_check_events(ug->memcg, ug->dummy_page);
5578        local_irq_restore(flags);
5579
5580        if (!mem_cgroup_is_root(ug->memcg))
5581                css_put_many(&ug->memcg->css, nr_pages);
5582}
5583
5584static void uncharge_page(struct page *page, struct uncharge_gather *ug)
5585{
5586        VM_BUG_ON_PAGE(PageLRU(page), page);
5587        VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
5588                        !PageHWPoison(page), page);
5589
5590        if (!page->mem_cgroup)
5591                return;
5592
5593        /*
5594         * Nobody should be changing or seriously looking at
5595         * page->mem_cgroup at this point; we have fully
5596         * exclusive access to the page.
5597         */
5598
5599        if (ug->memcg != page->mem_cgroup) {
5600                if (ug->memcg) {
5601                        uncharge_batch(ug);
5602                        uncharge_gather_clear(ug);
5603                }
5604                ug->memcg = page->mem_cgroup;
5605        }
5606
5607        if (!PageKmemcg(page)) {
5608                unsigned int nr_pages = 1;
5609
5610                if (PageTransHuge(page)) {
5611                        nr_pages <<= compound_order(page);
5612                        ug->nr_huge += nr_pages;
5613                }
5614                if (PageAnon(page))
5615                        ug->nr_anon += nr_pages;
5616                else {
5617                        ug->nr_file += nr_pages;
5618                        if (PageSwapBacked(page))
5619                                ug->nr_shmem += nr_pages;
5620                }
5621                ug->pgpgout++;
5622        } else {
5623                ug->nr_kmem += 1 << compound_order(page);
5624                __ClearPageKmemcg(page);
5625        }
5626
5627        ug->dummy_page = page;
5628        page->mem_cgroup = NULL;
5629}
5630
5631static void uncharge_list(struct list_head *page_list)
5632{
5633        struct uncharge_gather ug;
5634        struct list_head *next;
5635
5636        uncharge_gather_clear(&ug);
5637
5638        /*
5639         * Note that the list can be a single page->lru; hence the
5640         * do-while loop instead of a simple list_for_each_entry().
5641         */
5642        next = page_list->next;
5643        do {
5644                struct page *page;
5645
5646                page = list_entry(next, struct page, lru);
5647                next = page->lru.next;
5648
5649                uncharge_page(page, &ug);
5650        } while (next != page_list);
5651
5652        if (ug.memcg)
5653                uncharge_batch(&ug);
5654}
5655
5656/**
5657 * mem_cgroup_uncharge - uncharge a page
5658 * @page: page to uncharge
5659 *
5660 * Uncharge a page previously charged with mem_cgroup_try_charge() and
5661 * mem_cgroup_commit_charge().
5662 */
5663void mem_cgroup_uncharge(struct page *page)
5664{
5665        struct uncharge_gather ug;
5666
5667        if (mem_cgroup_disabled())
5668                return;
5669
5670        /* Don't touch page->lru of any random page, pre-check: */
5671        if (!page->mem_cgroup)
5672                return;
5673
5674        uncharge_gather_clear(&ug);
5675        uncharge_page(page, &ug);
5676        uncharge_batch(&ug);
5677}
5678
5679/**
5680 * mem_cgroup_uncharge_list - uncharge a list of pages
5681 * @page_list: list of pages to uncharge
5682 *
5683 * Uncharge a list of pages previously charged with
5684 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
5685 */
5686void mem_cgroup_uncharge_list(struct list_head *page_list)
5687{
5688        if (mem_cgroup_disabled())
5689                return;
5690
5691        if (!list_empty(page_list))
5692                uncharge_list(page_list);
5693}
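
/*
 * Illustrative sketch (modelled loosely on release_pages() in mm/swap.c,
 * which is an assumption for the example): callers that free many pages
 * gather them on a private list so uncharge_batch() can coalesce the
 * page_counter and statistics updates per memcg.
 *
 *	LIST_HEAD(pages_to_free);
 *
 *	// ... move pages to be freed onto pages_to_free via page->lru ...
 *
 *	mem_cgroup_uncharge_list(&pages_to_free);
 *	free_unref_page_list(&pages_to_free);
 */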
5694
5695/**
5696 * mem_cgroup_migrate - charge a page's replacement
5697 * @oldpage: currently circulating page
5698 * @newpage: replacement page
5699 *
5700 * Charge @newpage as a replacement page for @oldpage. @oldpage will
5701 * be uncharged upon free.
5702 *
5703 * Both pages must be locked, @newpage->mapping must be set up.
5704 */
5705void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
5706{
5707        struct mem_cgroup *memcg;
5708        unsigned int nr_pages;
5709        bool compound;
5710        unsigned long flags;
5711
5712        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5713        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5714        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5715        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5716                       newpage);
5717
5718        if (mem_cgroup_disabled())
5719                return;
5720
5721        /* Page cache replacement: new page already charged? */
5722        if (newpage->mem_cgroup)
5723                return;
5724
5725        /* Swapcache readahead pages can get replaced before being charged */
5726        memcg = oldpage->mem_cgroup;
5727        if (!memcg)
5728                return;
5729
5730        /* Force-charge the new page. The old one will be freed soon */
5731        compound = PageTransHuge(newpage);
5732        nr_pages = compound ? hpage_nr_pages(newpage) : 1;
5733
5734        page_counter_charge(&memcg->memory, nr_pages);
5735        if (do_memsw_account())
5736                page_counter_charge(&memcg->memsw, nr_pages);
5737        css_get_many(&memcg->css, nr_pages);
5738
5739        commit_charge(newpage, memcg, false);
5740
5741        local_irq_save(flags);
5742        mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
5743        memcg_check_events(memcg, newpage);
5744        local_irq_restore(flags);
5745}
5746
5747DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
5748EXPORT_SYMBOL(memcg_sockets_enabled_key);
5749
5750void mem_cgroup_sk_alloc(struct sock *sk)
5751{
5752        struct mem_cgroup *memcg;
5753
5754        if (!mem_cgroup_sockets_enabled)
5755                return;
5756
5757        /*
5758         * Socket cloning can throw us here with sk_memcg already
5759         * filled. It won't, however, necessarily happen from
5760         * process context, so testing the current task's memcg
5761         * against the root memcg won't help us in this case.
5762         *
5763         * Respecting the original socket's memcg is a better
5764         * decision in this case.
5765         */
5766        if (sk->sk_memcg) {
5767                css_get(&sk->sk_memcg->css);
5768                return;
5769        }
5770
5771        rcu_read_lock();
5772        memcg = mem_cgroup_from_task(current);
5773        if (memcg == root_mem_cgroup)
5774                goto out;
5775        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
5776                goto out;
5777        if (css_tryget_online(&memcg->css))
5778                sk->sk_memcg = memcg;
5779out:
5780        rcu_read_unlock();
5781}
5782
5783void mem_cgroup_sk_free(struct sock *sk)
5784{
5785        if (sk->sk_memcg)
5786                css_put(&sk->sk_memcg->css);
5787}
5788
5789/**
5790 * mem_cgroup_charge_skmem - charge socket memory
5791 * @memcg: memcg to charge
5792 * @nr_pages: number of pages to charge
5793 *
5794 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
5795 * @memcg's configured limit, %false if the charge had to be forced.
5796 */
5797bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5798{
5799        gfp_t gfp_mask = GFP_KERNEL;
5800
5801        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5802                struct page_counter *fail;
5803
5804                if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
5805                        memcg->tcpmem_pressure = 0;
5806                        return true;
5807                }
5808                page_counter_charge(&memcg->tcpmem, nr_pages);
5809                memcg->tcpmem_pressure = 1;
5810                return false;
5811        }
5812
5813        /* Don't block in the packet receive path */
5814        if (in_softirq())
5815                gfp_mask = GFP_NOWAIT;
5816
5817        mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
5818
5819        if (try_charge(memcg, gfp_mask, nr_pages) == 0)
5820                return true;
5821
5822        try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
5823        return false;
5824}
5825
5826/**
5827 * mem_cgroup_uncharge_skmem - uncharge socket memory
5828 * @memcg: memcg to uncharge
5829 * @nr_pages: number of pages to uncharge
5830 */
5831void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5832{
5833        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5834                page_counter_uncharge(&memcg->tcpmem, nr_pages);
5835                return;
5836        }
5837
5838        mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
5839
5840        refill_stock(memcg, nr_pages);
5841}
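
/*
 * Illustrative sketch (loosely following how net/core/sock.c consumes this
 * pair; the surrounding error handling is an assumption for the example):
 *
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages))
 *		goto suppress_allocation;	// memcg refused the charge
 *
 *	// ... and on the release side:
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
 *		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
 */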
5842
5843static int __init cgroup_memory(char *s)
5844{
5845        char *token;
5846
5847        while ((token = strsep(&s, ",")) != NULL) {
5848                if (!*token)
5849                        continue;
5850                if (!strcmp(token, "nosocket"))
5851                        cgroup_memory_nosocket = true;
5852                if (!strcmp(token, "nokmem"))
5853                        cgroup_memory_nokmem = true;
5854        }
5855        return 0;
5856}
5857__setup("cgroup.memory=", cgroup_memory);
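
/*
 * Example kernel command line usage of the option parsed above:
 *
 *	cgroup.memory=nosocket		disable socket memory accounting
 *	cgroup.memory=nokmem		disable kernel memory accounting
 *	cgroup.memory=nosocket,nokmem	disable both
 */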
5858
5859/*
5860 * subsys_initcall() for memory controller.
5861 *
5862 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
5863 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
5864 * basically everything that doesn't depend on a specific mem_cgroup structure
5865 * should be initialized from here.
5866 */
5867static int __init mem_cgroup_init(void)
5868{
5869        int cpu, node;
5870
5871#ifndef CONFIG_SLOB
5872        /*
5873         * Kmem cache creation is mostly done with the slab_mutex held,
5874         * so use a workqueue with limited concurrency to avoid stalling
5875         * all worker threads in case lots of cgroups are created and
5876         * destroyed simultaneously.
5877         */
5878        memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
5879        BUG_ON(!memcg_kmem_cache_wq);
5880#endif
5881
5882        cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
5883                                  memcg_hotplug_cpu_dead);
5884
5885        for_each_possible_cpu(cpu)
5886                INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5887                          drain_local_stock);
5888
5889        for_each_node(node) {
5890                struct mem_cgroup_tree_per_node *rtpn;
5891
5892                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5893                                    node_online(node) ? node : NUMA_NO_NODE);
5894
5895                rtpn->rb_root = RB_ROOT;
5896                rtpn->rb_rightmost = NULL;
5897                spin_lock_init(&rtpn->lock);
5898                soft_limit_tree.rb_tree_per_node[node] = rtpn;
5899        }
5900
5901        return 0;
5902}
5903subsys_initcall(mem_cgroup_init);
5904
5905#ifdef CONFIG_MEMCG_SWAP
5906static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
5907{
5908        while (!atomic_inc_not_zero(&memcg->id.ref)) {
5909                /*
5910                 * The root cgroup cannot be destroyed, so its refcount must
5911                 * always be >= 1.
5912                 */
5913                if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
5914                        VM_BUG_ON(1);
5915                        break;
5916                }
5917                memcg = parent_mem_cgroup(memcg);
5918                if (!memcg)
5919                        memcg = root_mem_cgroup;
5920        }
5921        return memcg;
5922}
5923
5924/**
5925 * mem_cgroup_swapout - transfer a memsw charge to swap
5926 * @page: page whose memsw charge to transfer
5927 * @entry: swap entry to move the charge to
5928 *
5929 * Transfer the memsw charge of @page to @entry.
5930 */
5931void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5932{
5933        struct mem_cgroup *memcg, *swap_memcg;
5934        unsigned int nr_entries;
5935        unsigned short oldid;
5936
5937        VM_BUG_ON_PAGE(PageLRU(page), page);
5938        VM_BUG_ON_PAGE(page_count(page), page);
5939
5940        if (!do_memsw_account())
5941                return;
5942
5943        memcg = page->mem_cgroup;
5944
5945        /* Readahead page, never charged */
5946        if (!memcg)
5947                return;
5948
5949        /*
5950         * In case the memcg owning these pages has been offlined and doesn't
5951         * have an ID allocated to it anymore, charge the closest online
5952         * ancestor for the swap instead and transfer the memory+swap charge.
5953         */
5954        swap_memcg = mem_cgroup_id_get_online(memcg);
5955        nr_entries = hpage_nr_pages(page);
5956        /* Get references for the tail pages, too */
5957        if (nr_entries > 1)
5958                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
5959        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
5960                                   nr_entries);
5961        VM_BUG_ON_PAGE(oldid, page);
5962        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
5963
5964        page->mem_cgroup = NULL;
5965
5966        if (!mem_cgroup_is_root(memcg))
5967                page_counter_uncharge(&memcg->memory, nr_entries);
5968
5969        if (memcg != swap_memcg) {
5970                if (!mem_cgroup_is_root(swap_memcg))
5971                        page_counter_charge(&swap_memcg->memsw, nr_entries);
5972                page_counter_uncharge(&memcg->memsw, nr_entries);
5973        }
5974
5975        /*
5976         * Interrupts should be disabled here because the caller holds the
5977         * i_pages lock which is taken with interrupts-off. It is
5978         * important here to have the interrupts disabled because it is the
5979         * only synchronisation we have for updating the per-CPU variables.
5980         */
5981        VM_BUG_ON(!irqs_disabled());
5982        mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
5983                                     -nr_entries);
5984        memcg_check_events(memcg, page);
5985
5986        if (!mem_cgroup_is_root(memcg))
5987                css_put_many(&memcg->css, nr_entries);
5988}
5989
5990/**
5991 * mem_cgroup_try_charge_swap - try charging swap space for a page
5992 * @page: page being added to swap
5993 * @entry: swap entry to charge
5994 *
5995 * Try to charge @page's memcg for the swap space at @entry.
5996 *
5997 * Returns 0 on success, -ENOMEM on failure.
5998 */
5999int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
6000{
6001        unsigned int nr_pages = hpage_nr_pages(page);
6002        struct page_counter *counter;
6003        struct mem_cgroup *memcg;
6004        unsigned short oldid;
6005
6006        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
6007                return 0;
6008
6009        memcg = page->mem_cgroup;
6010
6011        /* Readahead page, never charged */
6012        if (!memcg)
6013                return 0;
6014
6015        memcg = mem_cgroup_id_get_online(memcg);
6016
6017        if (!mem_cgroup_is_root(memcg) &&
6018            !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
6019                mem_cgroup_id_put(memcg);
6020                return -ENOMEM;
6021        }
6022
6023        /* Get references for the tail pages, too */
6024        if (nr_pages > 1)
6025                mem_cgroup_id_get_many(memcg, nr_pages - 1);
6026        oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
6027        VM_BUG_ON_PAGE(oldid, page);
6028        mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
6029
6030        return 0;
6031}
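
/*
 * Illustrative sketch (modelled loosely on the swap allocation path in
 * mm/swap_slots.c, which is an assumption for the example; the slot
 * allocation helper is hypothetical): the swap slot is given back if the
 * memcg's swap limit rejects the charge.
 *
 *	entry = example_alloc_swap_slot(page);		// hypothetical
 *	if (!entry.val)
 *		return entry;
 *	if (mem_cgroup_try_charge_swap(page, entry)) {
 *		put_swap_page(page, entry);
 *		entry.val = 0;
 *	}
 *	return entry;
 */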
6032
6033/**
6034 * mem_cgroup_uncharge_swap - uncharge swap space
6035 * @entry: swap entry to uncharge
6036 * @nr_pages: the amount of swap space to uncharge
6037 */
6038void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
6039{
6040        struct mem_cgroup *memcg;
6041        unsigned short id;
6042
6043        if (!do_swap_account)
6044                return;
6045
6046        id = swap_cgroup_record(entry, 0, nr_pages);
6047        rcu_read_lock();
6048        memcg = mem_cgroup_from_id(id);
6049        if (memcg) {
6050                if (!mem_cgroup_is_root(memcg)) {
6051                        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6052                                page_counter_uncharge(&memcg->swap, nr_pages);
6053                        else
6054                                page_counter_uncharge(&memcg->memsw, nr_pages);
6055                }
6056                mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
6057                mem_cgroup_id_put_many(memcg, nr_pages);
6058        }
6059        rcu_read_unlock();
6060}
6061
6062long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
6063{
6064        long nr_swap_pages = get_nr_swap_pages();
6065
6066        if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
6067                return nr_swap_pages;
6068        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6069                nr_swap_pages = min_t(long, nr_swap_pages,
6070                                      READ_ONCE(memcg->swap.limit) -
6071                                      page_counter_read(&memcg->swap));
6072        return nr_swap_pages;
6073}
6074
6075bool mem_cgroup_swap_full(struct page *page)
6076{
6077        struct mem_cgroup *memcg;
6078
6079        VM_BUG_ON_PAGE(!PageLocked(page), page);
6080
6081        if (vm_swap_full())
6082                return true;
6083        if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
6084                return false;
6085
6086        memcg = page->mem_cgroup;
6087        if (!memcg)
6088                return false;
6089
6090        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6091                if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
6092                        return true;
6093
6094        return false;
6095}
6096
6097/* for remembering the boot option */
6098#ifdef CONFIG_MEMCG_SWAP_ENABLED
6099static int really_do_swap_account __initdata = 1;
6100#else
6101static int really_do_swap_account __initdata;
6102#endif
6103
6104static int __init enable_swap_account(char *s)
6105{
6106        if (!strcmp(s, "1"))
6107                really_do_swap_account = 1;
6108        else if (!strcmp(s, "0"))
6109                really_do_swap_account = 0;
6110        return 1;
6111}
6112__setup("swapaccount=", enable_swap_account);
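
/*
 * Example kernel command line usage: "swapaccount=0" disables and
 * "swapaccount=1" enables swap accounting at boot, overriding the
 * CONFIG_MEMCG_SWAP_ENABLED default chosen above.
 */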
6113
6114static u64 swap_current_read(struct cgroup_subsys_state *css,
6115                             struct cftype *cft)
6116{
6117        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6118
6119        return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
6120}
6121
6122static int swap_max_show(struct seq_file *m, void *v)
6123{
6124        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6125        unsigned long max = READ_ONCE(memcg->swap.limit);
6126
6127        if (max == PAGE_COUNTER_MAX)
6128                seq_puts(m, "max\n");
6129        else
6130                seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6131
6132        return 0;
6133}
6134
6135static ssize_t swap_max_write(struct kernfs_open_file *of,
6136                              char *buf, size_t nbytes, loff_t off)
6137{
6138        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6139        unsigned long max;
6140        int err;
6141
6142        buf = strstrip(buf);
6143        err = page_counter_memparse(buf, "max", &max);
6144        if (err)
6145                return err;
6146
6147        mutex_lock(&memcg_limit_mutex);
6148        err = page_counter_limit(&memcg->swap, max);
6149        mutex_unlock(&memcg_limit_mutex);
6150        if (err)
6151                return err;
6152
6153        return nbytes;
6154}
6155
6156static struct cftype swap_files[] = {
6157        {
6158                .name = "swap.current",
6159                .flags = CFTYPE_NOT_ON_ROOT,
6160                .read_u64 = swap_current_read,
6161        },
6162        {
6163                .name = "swap.max",
6164                .flags = CFTYPE_NOT_ON_ROOT,
6165                .seq_show = swap_max_show,
6166                .write = swap_max_write,
6167        },
6168        { }     /* terminate */
6169};
6170
6171static struct cftype memsw_cgroup_files[] = {
6172        {
6173                .name = "memsw.usage_in_bytes",
6174                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6175                .read_u64 = mem_cgroup_read_u64,
6176        },
6177        {
6178                .name = "memsw.max_usage_in_bytes",
6179                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6180                .write = mem_cgroup_reset,
6181                .read_u64 = mem_cgroup_read_u64,
6182        },
6183        {
6184                .name = "memsw.limit_in_bytes",
6185                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6186                .write = mem_cgroup_write,
6187                .read_u64 = mem_cgroup_read_u64,
6188        },
6189        {
6190                .name = "memsw.failcnt",
6191                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6192                .write = mem_cgroup_reset,
6193                .read_u64 = mem_cgroup_read_u64,
6194        },
6195        { },    /* terminate */
6196};
6197
6198static int __init mem_cgroup_swap_init(void)
6199{
6200        if (!mem_cgroup_disabled() && really_do_swap_account) {
6201                do_swap_account = 1;
6202                WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6203                                               swap_files));
6204                WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6205                                                  memsw_cgroup_files));
6206        }
6207        return 0;
6208}
6209subsys_initcall(mem_cgroup_swap_init);
6210
6211#endif /* CONFIG_MEMCG_SWAP */
6212