linux/mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * Memory thresholds
  10 * Copyright (C) 2009 Nokia Corporation
  11 * Author: Kirill A. Shutemov
  12 *
  13 * Kernel Memory Controller
  14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15 * Authors: Glauber Costa and Suleiman Souhlal
  16 *
  17 * This program is free software; you can redistribute it and/or modify
  18 * it under the terms of the GNU General Public License as published by
  19 * the Free Software Foundation; either version 2 of the License, or
  20 * (at your option) any later version.
  21 *
  22 * This program is distributed in the hope that it will be useful,
  23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 * GNU General Public License for more details.
  26 */
  27
  28#include <linux/page_counter.h>
  29#include <linux/memcontrol.h>
  30#include <linux/cgroup.h>
  31#include <linux/mm.h>
  32#include <linux/hugetlb.h>
  33#include <linux/pagemap.h>
  34#include <linux/smp.h>
  35#include <linux/page-flags.h>
  36#include <linux/backing-dev.h>
  37#include <linux/bit_spinlock.h>
  38#include <linux/rcupdate.h>
  39#include <linux/limits.h>
  40#include <linux/export.h>
  41#include <linux/mutex.h>
  42#include <linux/rbtree.h>
  43#include <linux/slab.h>
  44#include <linux/swap.h>
  45#include <linux/swapops.h>
  46#include <linux/spinlock.h>
  47#include <linux/eventfd.h>
  48#include <linux/sort.h>
  49#include <linux/fs.h>
  50#include <linux/seq_file.h>
  51#include <linux/vmalloc.h>
  52#include <linux/vmpressure.h>
  53#include <linux/mm_inline.h>
  54#include <linux/page_cgroup.h>
  55#include <linux/cpu.h>
  56#include <linux/oom.h>
  57#include "internal.h"
  58#include <net/sock.h>
  59#include <net/ip.h>
  60#include <net/tcp_memcontrol.h>
  61#include "slab.h"
  62
  63#include <asm/uaccess.h>
  64
  65#include <trace/events/vmscan.h>
  66
  67struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  68EXPORT_SYMBOL(mem_cgroup_subsys);
  69
  70#define MEM_CGROUP_RECLAIM_RETRIES      5
  71static struct mem_cgroup *root_mem_cgroup __read_mostly;
  72
  73#ifdef CONFIG_MEMCG_SWAP
  74/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
  75int do_swap_account __read_mostly;
  76
   77/* For remembering the boot option */
  78#ifdef CONFIG_MEMCG_SWAP_ENABLED
  79static int really_do_swap_account __initdata = 1;
  80#else
  81static int really_do_swap_account __initdata = 0;
  82#endif
  83
  84#else
  85#define do_swap_account         0
  86#endif
  87
  88
  89/*
  90 * Statistics for memory cgroup.
  91 */
  92enum mem_cgroup_stat_index {
  93        /*
  94         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  95         */
  96        MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
  97        MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
  98        MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
  99        MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
 100        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
 101        MEM_CGROUP_STAT_NSTATS,
 102};
 103
 104static const char * const mem_cgroup_stat_names[] = {
 105        "cache",
 106        "rss",
 107        "rss_huge",
 108        "mapped_file",
 109        "swap",
 110};
 111
 112enum mem_cgroup_events_index {
 113        MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
 114        MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
 115        MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
 116        MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
 117        MEM_CGROUP_EVENTS_NSTATS,
 118};
 119
 120static const char * const mem_cgroup_events_names[] = {
 121        "pgpgin",
 122        "pgpgout",
 123        "pgfault",
 124        "pgmajfault",
 125};
 126
 127static const char * const mem_cgroup_lru_names[] = {
 128        "inactive_anon",
 129        "active_anon",
 130        "inactive_file",
 131        "active_file",
 132        "unevictable",
 133};
 134
 135/*
 136 * Per memcg event counter is incremented at every pagein/pageout. With THP,
  137 * it will be incremented by the number of pages. This counter is used
  138 * to trigger some periodic events. This is straightforward and better
 139 * than using jiffies etc. to handle periodic memcg event.
 140 */
 141enum mem_cgroup_events_target {
 142        MEM_CGROUP_TARGET_THRESH,
 143        MEM_CGROUP_TARGET_SOFTLIMIT,
 144        MEM_CGROUP_TARGET_NUMAINFO,
 145        MEM_CGROUP_NTARGETS,
 146};
 147#define THRESHOLDS_EVENTS_TARGET 128
 148#define SOFTLIMIT_EVENTS_TARGET 1024
 149#define NUMAINFO_EVENTS_TARGET  1024
 150
 151#define MEM_CGROUP_ID_MAX       USHRT_MAX
 152
 153static void mem_cgroup_id_put(struct mem_cgroup *memcg);
 154static unsigned short mem_cgroup_id(struct mem_cgroup *memcg);
 155
 156struct mem_cgroup_stat_cpu {
 157        long count[MEM_CGROUP_STAT_NSTATS];
 158        unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
 159        unsigned long nr_page_events;
 160        unsigned long targets[MEM_CGROUP_NTARGETS];
 161};
 162
 163struct mem_cgroup_reclaim_iter {
 164        /*
 165         * last scanned hierarchy member. Valid only if last_dead_count
 166         * matches memcg->dead_count of the hierarchy root group.
 167         */
 168        struct mem_cgroup *last_visited;
 169        unsigned long last_dead_count;
 170
 171        /* scan generation, increased every round-trip */
 172        unsigned int generation;
 173};
 174
 175/*
 176 * per-zone information in memory controller.
 177 */
 178struct mem_cgroup_per_zone {
 179        struct lruvec           lruvec;
 180        unsigned long           lru_size[NR_LRU_LISTS];
 181
 182        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 183
 184        struct rb_node          tree_node;      /* RB tree node */
 185        unsigned long           usage_in_excess;/* Set to the value by which */
 186                                                /* the soft limit is exceeded*/
 187        bool                    on_tree;
 188        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
 189                                                /* use container_of        */
 190};
 191
 192struct mem_cgroup_per_node {
 193        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 194};
 195
 196struct mem_cgroup_lru_info {
 197        struct mem_cgroup_per_node *nodeinfo[0];
 198};
 199
 200/*
  201 * Cgroups above their limits are maintained in an RB-tree, independent of
  202 * their hierarchy representation.
 203 */
 204
 205struct mem_cgroup_tree_per_zone {
 206        struct rb_root rb_root;
 207        spinlock_t lock;
 208};
 209
 210struct mem_cgroup_tree_per_node {
 211        struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
 212};
 213
 214struct mem_cgroup_tree {
 215        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 216};
 217
 218static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 219
 220struct mem_cgroup_threshold {
 221        struct eventfd_ctx *eventfd;
 222        unsigned long threshold;
 223};
 224
 225/* For threshold */
 226struct mem_cgroup_threshold_ary {
 227        /* An array index points to threshold just below or equal to usage. */
 228        int current_threshold;
 229        /* Size of entries[] */
 230        unsigned int size;
 231        /* Array of thresholds */
 232        struct mem_cgroup_threshold entries[0];
 233};
 234
 235struct mem_cgroup_thresholds {
 236        /* Primary thresholds array */
 237        struct mem_cgroup_threshold_ary *primary;
 238        /*
 239         * Spare threshold array.
 240         * This is needed to make mem_cgroup_unregister_event() "never fail".
 241         * It must be able to store at least primary->size - 1 entries.
 242         */
 243        struct mem_cgroup_threshold_ary *spare;
 244};
 245
 246/* for OOM */
 247struct mem_cgroup_eventfd_list {
 248        struct list_head list;
 249        struct eventfd_ctx *eventfd;
 250};
 251
 252static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 253static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 254
 255/*
 256 * The memory controller data structure. The memory controller controls both
 257 * page cache and RSS per cgroup. We would eventually like to provide
 258 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 259 * to help the administrator determine what knobs to tune.
 260 *
 261 * TODO: Add a water mark for the memory controller. Reclaim will begin when
  262 * we hit the water mark. Maybe even add a low water mark, such that
  263 * no reclaim occurs from a cgroup at its low water mark; this is
  264 * a feature that will be implemented much later in the future.
 265 */
 266struct mem_cgroup {
 267        struct cgroup_subsys_state css;
 268
 269        /* Private memcg ID. Used to ID objects that outlive the cgroup */
 270        unsigned short id;
 271
 272        /*
 273         * the counter to account for memory usage
 274         */
 275        struct page_counter memory;
 276
 277        unsigned long soft_limit;
 278
 279        /* vmpressure notifications */
 280        struct vmpressure vmpressure;
 281
 282        union {
 283                /*
 284                 * the counter to account for mem+swap usage.
 285                 */
 286                struct page_counter memsw;
 287                /*
 288                 * rcu_freeing is used only when freeing struct mem_cgroup,
 289                 * so put it into a union to avoid wasting more memory.
 290                 * It must be disjoint from the css field.  It could be
 291                 * in a union with the res field, but res plays a much
 292                 * larger part in mem_cgroup life than memsw, and might
 293                 * be of interest, even at time of free, when debugging.
 294                 * So share rcu_head with the less interesting memsw.
 295                 */
 296                struct rcu_head rcu_freeing;
 297                /*
 298                 * We also need some space for a worker in deferred freeing.
 299                 * By the time we call it, rcu_freeing is no longer in use.
 300                 */
 301                struct work_struct work_freeing;
 302        };
 303        /*
 304         * the counter to account for kernel memory usage.
 305         */
 306        struct page_counter kmem;
 307        /*
 308         * Should the accounting and control be hierarchical, per subtree?
 309         */
 310        bool use_hierarchy;
 311        unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 312
 313        bool            oom_lock;
 314        atomic_t        under_oom;
 315        atomic_t        oom_wakeups;
 316
 317        atomic_t        refcnt;
 318
 319        int     swappiness;
 320        /* OOM-Killer disable */
 321        int             oom_kill_disable;
 322
 323        /* set when res.limit == memsw.limit */
 324        bool            memsw_is_minimum;
 325
 326        /* protect arrays of thresholds */
 327        struct mutex thresholds_lock;
 328
 329        /* thresholds for memory usage. RCU-protected */
 330        struct mem_cgroup_thresholds thresholds;
 331
 332        /* thresholds for mem+swap usage. RCU-protected */
 333        struct mem_cgroup_thresholds memsw_thresholds;
 334
 335        /* For oom notifier event fd */
 336        struct list_head oom_notify;
 337
 338        /*
 339         * Should we move charges of a task when a task is moved into this
 340         * mem_cgroup ? And what type of charges should we move ?
 341         */
 342        unsigned long   move_charge_at_immigrate;
 343        /*
  344         * set > 0 if pages under this cgroup are moving to another cgroup.
 345         */
 346        atomic_t        moving_account;
 347        /* taken only while moving_account > 0 */
 348        spinlock_t      move_lock;
 349        /*
 350         * percpu counter.
 351         */
 352        struct mem_cgroup_stat_cpu __percpu *stat;
 353        /*
 354         * used when a cpu is offlined or other synchronizations
 355         * See mem_cgroup_read_stat().
 356         */
 357        struct mem_cgroup_stat_cpu nocpu_base;
 358        spinlock_t pcp_counter_lock;
 359
 360        atomic_t        dead_count;
 361#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 362        struct tcp_memcontrol tcp_mem;
 363#endif
 364#if defined(CONFIG_MEMCG_KMEM)
 365        /* analogous to slab_common's slab_caches list. per-memcg */
 366        struct list_head memcg_slab_caches;
 367        /* Not a spinlock, we can take a lot of time walking the list */
 368        struct mutex slab_caches_mutex;
 369        /* Index in the kmem_cache->memcg_params->memcg_caches array */
 370        int kmemcg_id;
 371#endif
 372
 373        int last_scanned_node;
 374#if MAX_NUMNODES > 1
 375        nodemask_t      scan_nodes;
 376        atomic_t        numainfo_events;
 377        atomic_t        numainfo_updating;
 378#endif
 379
 380        /*
 381         * Per cgroup active and inactive list, similar to the
 382         * per zone LRU lists.
 383         *
 384         * WARNING: This has to be the last element of the struct. Don't
 385         * add new fields after this point.
 386         */
 387        struct mem_cgroup_lru_info info;
 388};
 389
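     /*
      * Size of struct mem_cgroup including the per-node pointer array that
      * backs the info.nodeinfo[] flexible array member at the end.
      */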
 390static size_t memcg_size(void)
 391{
 392        return sizeof(struct mem_cgroup) +
 393                nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 394}
 395
 396/* internal only representation about the status of kmem accounting. */
 397enum {
 398        KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
 399        KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
 400        KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
 401};
 402
 403/* We account when limit is on, but only after call sites are patched */
 404#define KMEM_ACCOUNTED_MASK \
 405                ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
 406
 407#ifdef CONFIG_MEMCG_KMEM
 408static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
 409{
 410        set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 411}
 412
 413static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 414{
 415        return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 416}
 417
 418static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 419{
 420        set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 421}
 422
 423static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 424{
 425        clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 426}
 427
 428static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 429{
 430        if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
 431                set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 432}
 433
 434static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
 435{
 436        return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
 437                                  &memcg->kmem_account_flags);
 438}
 439#endif
 440
  441/* Stuff for moving charges at task migration. */
 442/*
  443 * Types of charges to be moved. "move_charge_at_immigrate" and
 444 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 445 */
 446enum move_type {
 447        MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
 448        MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
 449        NR_MOVE_TYPE,
 450};
 451
 452/* "mc" and its members are protected by cgroup_mutex */
 453static struct move_charge_struct {
 454        spinlock_t        lock; /* for from, to */
 455        struct mem_cgroup *from;
 456        struct mem_cgroup *to;
 457        unsigned long immigrate_flags;
 458        unsigned long precharge;
 459        unsigned long moved_charge;
 460        unsigned long moved_swap;
 461        struct task_struct *moving_task;        /* a task moving charges */
 462        wait_queue_head_t waitq;                /* a waitq for other context */
 463} mc = {
 464        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 465        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 466};
 467
 468static bool move_anon(void)
 469{
 470        return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
 471}
 472
 473static bool move_file(void)
 474{
 475        return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
 476}
 477
 478/*
 479 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 480 * limit reclaim to prevent infinite loops, if they ever occur.
 481 */
 482#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 483#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 484
 485enum charge_type {
 486        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 487        MEM_CGROUP_CHARGE_TYPE_ANON,
 488        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 489        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 490        NR_CHARGE_TYPE,
 491};
 492
 493/* for encoding cft->private value on file */
 494enum res_type {
 495        _MEM,
 496        _MEMSWAP,
 497        _OOM_TYPE,
 498        _KMEM,
 499};
 500
 501#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 502#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 503#define MEMFILE_ATTR(val)       ((val) & 0xffff)
  504/* Used for OOM notifier */
 505#define OOM_CONTROL             (0)
 506
 507/*
 508 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 509 */
 510#define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
 511#define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 512#define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
 513#define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 514
 515/*
 516 * The memcg_create_mutex will be held whenever a new cgroup is created.
 517 * As a consequence, any change that needs to protect against new child cgroups
 518 * appearing has to hold it as well.
 519 */
 520static DEFINE_MUTEX(memcg_create_mutex);
 521
 522static void mem_cgroup_get(struct mem_cgroup *memcg);
 523static void mem_cgroup_put(struct mem_cgroup *memcg);
 524
 525static inline
 526struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 527{
 528        return container_of(s, struct mem_cgroup, css);
 529}
 530
 531/* Some nice accessors for the vmpressure. */
 532struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 533{
 534        if (!memcg)
 535                memcg = root_mem_cgroup;
 536        return &memcg->vmpressure;
 537}
 538
 539struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 540{
 541        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 542}
 543
 544struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
 545{
 546        return &mem_cgroup_from_css(css)->vmpressure;
 547}
 548
 549static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 550{
 551        return (memcg == root_mem_cgroup);
 552}
 553
 554/* Writing them here to avoid exposing memcg's inner layout */
 555#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 556
 557void sock_update_memcg(struct sock *sk)
 558{
 559        if (mem_cgroup_sockets_enabled) {
 560                struct mem_cgroup *memcg;
 561                struct cg_proto *cg_proto;
 562
 563                BUG_ON(!sk->sk_prot->proto_cgroup);
 564
 565                /* Socket cloning can throw us here with sk_cgrp already
  566                 * filled. It won't, however, necessarily happen from
 567                 * process context. So the test for root memcg given
 568                 * the current task's memcg won't help us in this case.
 569                 *
 570                 * Respecting the original socket's memcg is a better
 571                 * decision in this case.
 572                 */
 573                if (sk->sk_cgrp) {
 574                        BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
 575                        mem_cgroup_get(sk->sk_cgrp->memcg);
 576                        return;
 577                }
 578
 579                rcu_read_lock();
 580                memcg = mem_cgroup_from_task(current);
 581                cg_proto = sk->sk_prot->proto_cgroup(memcg);
 582                if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
 583                        mem_cgroup_get(memcg);
 584                        sk->sk_cgrp = cg_proto;
 585                }
 586                rcu_read_unlock();
 587        }
 588}
 589EXPORT_SYMBOL(sock_update_memcg);
 590
 591void sock_release_memcg(struct sock *sk)
 592{
 593        if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
 594                struct mem_cgroup *memcg;
 595                WARN_ON(!sk->sk_cgrp->memcg);
 596                memcg = sk->sk_cgrp->memcg;
 597                mem_cgroup_put(memcg);
 598        }
 599}
 600
 601struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 602{
 603        if (!memcg || mem_cgroup_is_root(memcg))
 604                return NULL;
 605
 606        return &memcg->tcp_mem.cg_proto;
 607}
 608EXPORT_SYMBOL(tcp_proto_cgroup);
 609
 610static void disarm_sock_keys(struct mem_cgroup *memcg)
 611{
 612        if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
 613                return;
 614        static_key_slow_dec(&memcg_socket_limit_enabled);
 615}
 616#else
 617static void disarm_sock_keys(struct mem_cgroup *memcg)
 618{
 619}
 620#endif
 621
 622#ifdef CONFIG_MEMCG_KMEM
 623/*
 624 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
 625 * There are two main reasons for not using the css_id for this:
 626 *  1) this works better in sparse environments, where we have a lot of memcgs,
  627 *     but only a few of them kmem-limited. For instance, if we had 200
  628 *     memcgs and only the 200th were kmem-limited, we would still need a
  629 *     200-entry array for it.
 630 *
 631 *  2) In order not to violate the cgroup API, we would like to do all memory
 632 *     allocation in ->create(). At that point, we haven't yet allocated the
 633 *     css_id. Having a separate index prevents us from messing with the cgroup
  634 *     core for this.
 635 *
 636 * The current size of the caches array is stored in
 637 * memcg_limited_groups_array_size.  It will double each time we have to
 638 * increase it.
 639 */
 640static DEFINE_IDA(kmem_limited_groups);
 641int memcg_limited_groups_array_size;
 642
 643/*
  644 * MIN_SIZE is not 1 because we would like to avoid going through
 645 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 646 * cgroups is a reasonable guess. In the future, it could be a parameter or
 647 * tunable, but that is strictly not necessary.
 648 *
 649 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
 650 * this constant directly from cgroup, but it is understandable that this is
 651 * better kept as an internal representation in cgroup.c. In any case, the
  652 * css_id space is not getting any smaller, and we don't necessarily have to
  653 * increase ours as well if it increases.
 654 */
 655#define MEMCG_CACHES_MIN_SIZE 4
 656#define MEMCG_CACHES_MAX_SIZE 65535
 657
 658/*
 659 * A lot of the calls to the cache allocation functions are expected to be
 660 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  661 * conditional on this static branch, we have to allow modules that do
  662 * kmem_cache_alloc and the like to see this symbol as well.
 663 */
 664struct static_key memcg_kmem_enabled_key;
 665EXPORT_SYMBOL(memcg_kmem_enabled_key);
 666
 667static void disarm_kmem_keys(struct mem_cgroup *memcg)
 668{
 669        if (memcg_kmem_is_active(memcg)) {
 670                static_key_slow_dec(&memcg_kmem_enabled_key);
 671                ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
 672        }
 673        /*
  674         * This check can't live in the kmem destruction function,
  675         * since the charges will outlive the cgroup.
 676         */
 677        WARN_ON(page_counter_read(&memcg->kmem));
 678}
 679#else
 680static void disarm_kmem_keys(struct mem_cgroup *memcg)
 681{
 682}
 683#endif /* CONFIG_MEMCG_KMEM */
 684
 685static void disarm_static_keys(struct mem_cgroup *memcg)
 686{
 687        disarm_sock_keys(memcg);
 688        disarm_kmem_keys(memcg);
 689}
 690
 691static void drain_all_stock_async(struct mem_cgroup *memcg);
 692
 693static struct mem_cgroup_per_zone *
 694mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 695{
 696        VM_BUG_ON((unsigned)nid >= nr_node_ids);
 697        return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 698}
 699
 700struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 701{
 702        return &memcg->css;
 703}
 704
 705static struct mem_cgroup_per_zone *
 706page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 707{
 708        int nid = page_to_nid(page);
 709        int zid = page_zonenum(page);
 710
 711        return mem_cgroup_zoneinfo(memcg, nid, zid);
 712}
 713
 714static struct mem_cgroup_tree_per_zone *
 715soft_limit_tree_node_zone(int nid, int zid)
 716{
 717        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 718}
 719
 720static struct mem_cgroup_tree_per_zone *
 721soft_limit_tree_from_page(struct page *page)
 722{
 723        int nid = page_to_nid(page);
 724        int zid = page_zonenum(page);
 725
 726        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 727}
 728
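     /*
      * Link @mz into @mctz's RB-tree, ordered by soft-limit excess. No-op if
      * @mz is already on the tree or if @new_usage_in_excess is zero.
      */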
 729static void
 730__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
 731                                struct mem_cgroup_per_zone *mz,
 732                                struct mem_cgroup_tree_per_zone *mctz,
 733                                unsigned long new_usage_in_excess)
 734{
 735        struct rb_node **p = &mctz->rb_root.rb_node;
 736        struct rb_node *parent = NULL;
 737        struct mem_cgroup_per_zone *mz_node;
 738
 739        if (mz->on_tree)
 740                return;
 741
 742        mz->usage_in_excess = new_usage_in_excess;
 743        if (!mz->usage_in_excess)
 744                return;
 745        while (*p) {
 746                parent = *p;
 747                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
 748                                        tree_node);
 749                if (mz->usage_in_excess < mz_node->usage_in_excess)
 750                        p = &(*p)->rb_left;
 751                /*
 752                 * We can't avoid mem cgroups that are over their soft
 753                 * limit by the same amount
 754                 */
 755                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 756                        p = &(*p)->rb_right;
 757        }
 758        rb_link_node(&mz->tree_node, parent, p);
 759        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 760        mz->on_tree = true;
 761}
 762
 763static void
 764__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 765                                struct mem_cgroup_per_zone *mz,
 766                                struct mem_cgroup_tree_per_zone *mctz)
 767{
 768        if (!mz->on_tree)
 769                return;
 770        rb_erase(&mz->tree_node, &mctz->rb_root);
 771        mz->on_tree = false;
 772}
 773
 774static void
 775mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 776                                struct mem_cgroup_per_zone *mz,
 777                                struct mem_cgroup_tree_per_zone *mctz)
 778{
 779        spin_lock(&mctz->lock);
 780        __mem_cgroup_remove_exceeded(memcg, mz, mctz);
 781        spin_unlock(&mctz->lock);
 782}
 783
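     /* Number of pages by which usage exceeds the soft limit, or 0 if below it. */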
 784static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 785{
 786        unsigned long nr_pages = page_counter_read(&memcg->memory);
 787        unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
 788        unsigned long excess = 0;
 789
 790        if (nr_pages > soft_limit)
 791                excess = nr_pages - soft_limit;
 792
 793        return excess;
 794}
 795
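     /*
      * Update the soft-limit tree position of @memcg, and of all its ancestors,
      * for the zone of @page according to their current excess.
      */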
 796static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 797{
 798        unsigned long excess;
 799        struct mem_cgroup_per_zone *mz;
 800        struct mem_cgroup_tree_per_zone *mctz;
 801        int nid = page_to_nid(page);
 802        int zid = page_zonenum(page);
 803        mctz = soft_limit_tree_from_page(page);
 804
 805        /*
  806         * Necessary to update all ancestors when hierarchy is used,
  807         * because their event counters are not touched.
 808         */
 809        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 810                mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 811                excess = soft_limit_excess(memcg);
 812                /*
 813                 * We have to update the tree if mz is on RB-tree or
 814                 * mem is over its softlimit.
 815                 */
 816                if (excess || mz->on_tree) {
 817                        spin_lock(&mctz->lock);
 818                        /* if on-tree, remove it */
 819                        if (mz->on_tree)
 820                                __mem_cgroup_remove_exceeded(memcg, mz, mctz);
 821                        /*
 822                         * Insert again. mz->usage_in_excess will be updated.
 823                         * If excess is 0, no tree ops.
 824                         */
 825                        __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
 826                        spin_unlock(&mctz->lock);
 827                }
 828        }
 829}
 830
 831static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 832{
 833        int node, zone;
 834        struct mem_cgroup_per_zone *mz;
 835        struct mem_cgroup_tree_per_zone *mctz;
 836
 837        for_each_node(node) {
 838                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 839                        mz = mem_cgroup_zoneinfo(memcg, node, zone);
 840                        mctz = soft_limit_tree_node_zone(node, zone);
 841                        mem_cgroup_remove_exceeded(memcg, mz, mctz);
 842                }
 843        }
 844}
 845
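     /*
      * Pick the memcg with the largest soft-limit excess (the rightmost node),
      * remove it from the tree and take a css reference. Retries if the chosen
      * memcg is no longer in excess or its css can't be referenced; returns
      * NULL when the tree is empty. Called with @mctz->lock held.
      */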
 846static struct mem_cgroup_per_zone *
 847__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 848{
 849        struct rb_node *rightmost = NULL;
 850        struct mem_cgroup_per_zone *mz;
 851
 852retry:
 853        mz = NULL;
 854        rightmost = rb_last(&mctz->rb_root);
 855        if (!rightmost)
 856                goto done;              /* Nothing to reclaim from */
 857
 858        mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
 859        /*
  860         * Remove the node now, but someone else can add it back;
  861         * we will add it back at the end of reclaim to its correct
  862         * position in the tree.
 863         */
 864        __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
 865        if (!soft_limit_excess(mz->memcg) ||
 866                !css_tryget(&mz->memcg->css))
 867                goto retry;
 868done:
 869        return mz;
 870}
 871
 872static struct mem_cgroup_per_zone *
 873mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 874{
 875        struct mem_cgroup_per_zone *mz;
 876
 877        spin_lock(&mctz->lock);
 878        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 879        spin_unlock(&mctz->lock);
 880        return mz;
 881}
 882
 883/*
 884 * Implementation Note: reading percpu statistics for memcg.
 885 *
  886 * Both vmstat[] and percpu_counter use thresholds and periodic
  887 * synchronization to implement "quick" reads. There is a trade-off between
  888 * the cost of reading and the precision of the value, so we might likewise
  889 * implement periodic synchronization of the counters in memcg.
  890 *
  891 * But this _read() function is currently used for the user interface. Users
  892 * account memory usage by memory cgroup and always require an exact value,
  893 * because they account memory. Even if we provided a quick-and-fuzzy read,
  894 * we would still have to visit all online cpus and compute the sum. So, for
  895 * now, no extra synchronization is implemented (it is only done for cpu
  896 * hotplug).
  897 *
  898 * If kernel-internal users could make do with an inexact value, and reading
  899 * every cpu's value became a performance bottleneck in some common workload,
  900 * thresholds and synchronization as in vmstat[] should be implemented.
 901 */
 902static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 903                                 enum mem_cgroup_stat_index idx)
 904{
 905        long val = 0;
 906        int cpu;
 907
 908        get_online_cpus();
 909        for_each_online_cpu(cpu)
 910                val += per_cpu(memcg->stat->count[idx], cpu);
 911#ifdef CONFIG_HOTPLUG_CPU
 912        spin_lock(&memcg->pcp_counter_lock);
 913        val += memcg->nocpu_base.count[idx];
 914        spin_unlock(&memcg->pcp_counter_lock);
 915#endif
 916        put_online_cpus();
 917        return val;
 918}
 919
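     /* Adjust the swapped-out page count by +1 (charge) or -1 (uncharge). */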
 920static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 921                                         bool charge)
 922{
 923        int val = (charge) ? 1 : -1;
 924        this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
 925}
 926
 927static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 928                                            enum mem_cgroup_events_index idx)
 929{
 930        unsigned long val = 0;
 931        int cpu;
 932
 933        for_each_online_cpu(cpu)
 934                val += per_cpu(memcg->stat->events[idx], cpu);
 935#ifdef CONFIG_HOTPLUG_CPU
 936        spin_lock(&memcg->pcp_counter_lock);
 937        val += memcg->nocpu_base.events[idx];
 938        spin_unlock(&memcg->pcp_counter_lock);
 939#endif
 940        return val;
 941}
 942
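     /*
      * Update per-cpu statistics for charging (positive @nr_pages) or
      * uncharging (negative @nr_pages) a page: account it as RSS or CACHE,
      * bump the huge-RSS counter for THP, and record a pgpgin/pgpgout event.
      */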
 943static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 944                                         struct page *page,
 945                                         bool anon, int nr_pages)
 946{
 947        preempt_disable();
 948
 949        /*
 950         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 951         * counted as CACHE even if it's on ANON LRU.
 952         */
 953        if (anon)
 954                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 955                                nr_pages);
 956        else
 957                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 958                                nr_pages);
 959
 960        if (PageTransHuge(page))
 961                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 962                                nr_pages);
 963
 964        /* pagein of a big page is an event. So, ignore page size */
 965        if (nr_pages > 0)
 966                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 967        else {
 968                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 969                nr_pages = -nr_pages; /* for event */
 970        }
 971
 972        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 973
 974        preempt_enable();
 975}
 976
 977unsigned long
 978mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 979{
 980        struct mem_cgroup_per_zone *mz;
 981
 982        mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 983        return mz->lru_size[lru];
 984}
 985
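     /* Sum of lru_size over the LRU lists selected by @lru_mask in one zone. */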
 986static unsigned long
 987mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 988                        unsigned int lru_mask)
 989{
 990        struct mem_cgroup_per_zone *mz;
 991        enum lru_list lru;
 992        unsigned long ret = 0;
 993
 994        mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 995
 996        for_each_lru(lru) {
 997                if (BIT(lru) & lru_mask)
 998                        ret += mz->lru_size[lru];
 999        }
1000        return ret;
1001}
1002
1003static unsigned long
1004mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1005                        int nid, unsigned int lru_mask)
1006{
1007        u64 total = 0;
1008        int zid;
1009
1010        for (zid = 0; zid < MAX_NR_ZONES; zid++)
1011                total += mem_cgroup_zone_nr_lru_pages(memcg,
1012                                                nid, zid, lru_mask);
1013
1014        return total;
1015}
1016
1017static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
1018                        unsigned int lru_mask)
1019{
1020        int nid;
1021        u64 total = 0;
1022
1023        for_each_node_state(nid, N_MEMORY)
1024                total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
1025        return total;
1026}
1027
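     /*
      * Returns true, and arms the next target, once enough page events have
      * accumulated since @target last fired.
      */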
1028static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1029                                       enum mem_cgroup_events_target target)
1030{
1031        unsigned long val, next;
1032
1033        val = __this_cpu_read(memcg->stat->nr_page_events);
1034        next = __this_cpu_read(memcg->stat->targets[target]);
1035        /* from time_after() in jiffies.h */
1036        if ((long)next - (long)val < 0) {
1037                switch (target) {
1038                case MEM_CGROUP_TARGET_THRESH:
1039                        next = val + THRESHOLDS_EVENTS_TARGET;
1040                        break;
1041                case MEM_CGROUP_TARGET_SOFTLIMIT:
1042                        next = val + SOFTLIMIT_EVENTS_TARGET;
1043                        break;
1044                case MEM_CGROUP_TARGET_NUMAINFO:
1045                        next = val + NUMAINFO_EVENTS_TARGET;
1046                        break;
1047                default:
1048                        break;
1049                }
1050                __this_cpu_write(memcg->stat->targets[target], next);
1051                return true;
1052        }
1053        return false;
1054}
1055
1056/*
1057 * Check events in order.
1058 *
1059 */
1060static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1061{
1062        preempt_disable();
1063        /* threshold event is triggered in finer grain than soft limit */
1064        if (unlikely(mem_cgroup_event_ratelimit(memcg,
1065                                                MEM_CGROUP_TARGET_THRESH))) {
1066                bool do_softlimit;
1067                bool do_numainfo __maybe_unused;
1068
1069                do_softlimit = mem_cgroup_event_ratelimit(memcg,
1070                                                MEM_CGROUP_TARGET_SOFTLIMIT);
1071#if MAX_NUMNODES > 1
1072                do_numainfo = mem_cgroup_event_ratelimit(memcg,
1073                                                MEM_CGROUP_TARGET_NUMAINFO);
1074#endif
1075                preempt_enable();
1076
1077                mem_cgroup_threshold(memcg);
1078                if (unlikely(do_softlimit))
1079                        mem_cgroup_update_tree(memcg, page);
1080#if MAX_NUMNODES > 1
1081                if (unlikely(do_numainfo))
1082                        atomic_inc(&memcg->numainfo_events);
1083#endif
1084        } else
1085                preempt_enable();
1086}
1087
1088struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
1089{
1090        return mem_cgroup_from_css(
1091                cgroup_subsys_state(cont, mem_cgroup_subsys_id));
1092}
1093
1094struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1095{
1096        /*
1097         * mm_update_next_owner() may clear mm->owner to NULL
1098         * if it races with swapoff, page migration, etc.
1099         * So this can be called with p == NULL.
1100         */
1101        if (unlikely(!p))
1102                return NULL;
1103
1104        return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
1105}
1106
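     /*
      * Return the memcg that @mm's owner belongs to, with a css reference
      * held, or NULL if @mm is NULL or the owner has no memcg.
      */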
1107struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1108{
1109        struct mem_cgroup *memcg = NULL;
1110
1111        if (!mm)
1112                return NULL;
1113        /*
 1114         * Because we hold no locks, mm->owner's memcg may be being moved to
 1115         * another cgroup. We use css_tryget() here even if this looks
 1116         * pessimistic (rather than adding locks here).
1117         */
1118        rcu_read_lock();
1119        do {
1120                memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1121                if (unlikely(!memcg))
1122                        break;
1123        } while (!css_tryget(&memcg->css));
1124        rcu_read_unlock();
1125        return memcg;
1126}
1127
1128/*
 1129 * Returns the next (in a pre-order walk) alive memcg (with an elevated css
1130 * ref. count) or NULL if the whole root's subtree has been visited.
1131 *
1132 * helper function to be used by mem_cgroup_iter
1133 */
1134static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1135                struct mem_cgroup *last_visited)
1136{
1137        struct cgroup *prev_cgroup, *next_cgroup;
1138
1139        /*
1140         * Root is not visited by cgroup iterators so it needs an
1141         * explicit visit.
1142         */
1143        if (!last_visited)
1144                return root;
1145
1146        prev_cgroup = (last_visited == root) ? NULL
1147                : last_visited->css.cgroup;
1148skip_node:
1149        next_cgroup = cgroup_next_descendant_pre(
1150                        prev_cgroup, root->css.cgroup);
1151
1152        /*
1153         * Even if we found a group we have to make sure it is
1154         * alive. css && !memcg means that the groups should be
1155         * skipped and we should continue the tree walk.
1156         * last_visited css is safe to use because it is
1157         * protected by css_get and the tree walk is rcu safe.
1158         */
1159        if (next_cgroup) {
1160                struct mem_cgroup *mem = mem_cgroup_from_cont(
1161                                next_cgroup);
1162                if (css_tryget(&mem->css))
1163                        return mem;
1164                else {
1165                        prev_cgroup = next_cgroup;
1166                        goto skip_node;
1167                }
1168        }
1169
1170        return NULL;
1171}
1172
1173/**
1174 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1175 * @root: hierarchy root
1176 * @prev: previously returned memcg, NULL on first invocation
1177 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1178 *
1179 * Returns references to children of the hierarchy below @root, or
1180 * @root itself, or %NULL after a full round-trip.
1181 *
1182 * Caller must pass the return value in @prev on subsequent
1183 * invocations for reference counting, or use mem_cgroup_iter_break()
1184 * to cancel a hierarchy walk before the round-trip is complete.
1185 *
1186 * Reclaimers can specify a zone and a priority level in @reclaim to
1187 * divide up the memcgs in the hierarchy among all concurrent
1188 * reclaimers operating on the same zone and priority.
1189 */
1190struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1191                                   struct mem_cgroup *prev,
1192                                   struct mem_cgroup_reclaim_cookie *reclaim)
1193{
1194        struct mem_cgroup *memcg = NULL;
1195        struct mem_cgroup *last_visited = NULL;
1196        unsigned long uninitialized_var(dead_count);
1197
1198        if (mem_cgroup_disabled())
1199                return NULL;
1200
1201        if (!root)
1202                root = root_mem_cgroup;
1203
1204        if (prev && !reclaim)
1205                last_visited = prev;
1206
1207        if (!root->use_hierarchy && root != root_mem_cgroup) {
1208                if (prev)
1209                        goto out_css_put;
1210                return root;
1211        }
1212
1213        rcu_read_lock();
1214        while (!memcg) {
1215                struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1216
1217                if (reclaim) {
1218                        int nid = zone_to_nid(reclaim->zone);
1219                        int zid = zone_idx(reclaim->zone);
1220                        struct mem_cgroup_per_zone *mz;
1221
1222                        mz = mem_cgroup_zoneinfo(root, nid, zid);
1223                        iter = &mz->reclaim_iter[reclaim->priority];
1224                        if (prev && reclaim->generation != iter->generation) {
1225                                iter->last_visited = NULL;
1226                                goto out_unlock;
1227                        }
1228
1229                        /*
1230                         * If the dead_count mismatches, a destruction
1231                         * has happened or is happening concurrently.
1232                         * If the dead_count matches, a destruction
1233                         * might still happen concurrently, but since
1234                         * we checked under RCU, that destruction
1235                         * won't free the object until we release the
1236                         * RCU reader lock.  Thus, the dead_count
1237                         * check verifies the pointer is still valid,
1238                         * css_tryget() verifies the cgroup pointed to
1239                         * is alive.
1240                         */
1241                        dead_count = atomic_read(&root->dead_count);
1242                        if (dead_count == iter->last_dead_count) {
1243                                smp_rmb();
1244                                last_visited = iter->last_visited;
1245                                if (last_visited && last_visited != root &&
1246                                    !css_tryget(&last_visited->css))
1247                                        last_visited = NULL;
1248                        }
1249                }
1250
1251                memcg = __mem_cgroup_iter_next(root, last_visited);
1252
1253                if (reclaim) {
1254                        if (last_visited && last_visited != root)
1255                                css_put(&last_visited->css);
1256
1257                        iter->last_visited = memcg;
1258                        smp_wmb();
1259                        iter->last_dead_count = dead_count;
1260
1261                        if (!memcg)
1262                                iter->generation++;
1263                        else if (!prev && memcg)
1264                                reclaim->generation = iter->generation;
1265                }
1266
1267                if (prev && !memcg)
1268                        goto out_unlock;
1269        }
1270out_unlock:
1271        rcu_read_unlock();
1272out_css_put:
1273        if (prev && prev != root)
1274                css_put(&prev->css);
1275
1276        return memcg;
1277}
1278
1279/**
1280 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1281 * @root: hierarchy root
1282 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1283 */
1284void mem_cgroup_iter_break(struct mem_cgroup *root,
1285                           struct mem_cgroup *prev)
1286{
1287        if (!root)
1288                root = root_mem_cgroup;
1289        if (prev && prev != root)
1290                css_put(&prev->css);
1291}
1292
1293/*
1294 * Iteration constructs for visiting all cgroups (under a tree).  If
1295 * loops are exited prematurely (break), mem_cgroup_iter_break() must
1296 * be used for reference counting.
1297 */
1298#define for_each_mem_cgroup_tree(iter, root)            \
1299        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
1300             iter != NULL;                              \
1301             iter = mem_cgroup_iter(root, iter, NULL))
1302
1303#define for_each_mem_cgroup(iter)                       \
1304        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
1305             iter != NULL;                              \
1306             iter = mem_cgroup_iter(NULL, iter, NULL))
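     /*
      * Illustrative usage sketch for the iterators above (should_stop() is a
      * hypothetical early-exit condition, not a real helper): a walk that is
      * abandoned early must use mem_cgroup_iter_break() to drop the reference
      * held on the last returned memcg, e.g.:
      *
      *      struct mem_cgroup *iter;
      *
      *      for_each_mem_cgroup_tree(iter, root) {
      *              if (should_stop(iter)) {
      *                      mem_cgroup_iter_break(root, iter);
      *                      break;
      *              }
      *      }
      */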
1307
1308void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1309{
1310        struct mem_cgroup *memcg;
1311
1312        rcu_read_lock();
1313        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1314        if (unlikely(!memcg))
1315                goto out;
1316
1317        switch (idx) {
1318        case PGFAULT:
1319                this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1320                break;
1321        case PGMAJFAULT:
1322                this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1323                break;
1324        default:
1325                BUG();
1326        }
1327out:
1328        rcu_read_unlock();
1329}
1330EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1331
1332/**
1333 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1334 * @zone: zone of the wanted lruvec
1335 * @memcg: memcg of the wanted lruvec
1336 *
1337 * Returns the lru list vector holding pages for the given @zone and
 1338 * @memcg.  This can be the global zone lruvec, if the memory controller
1339 * is disabled.
1340 */
1341struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1342                                      struct mem_cgroup *memcg)
1343{
1344        struct mem_cgroup_per_zone *mz;
1345        struct lruvec *lruvec;
1346
1347        if (mem_cgroup_disabled()) {
1348                lruvec = &zone->lruvec;
1349                goto out;
1350        }
1351
1352        mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1353        lruvec = &mz->lruvec;
1354out:
1355        /*
1356         * Since a node can be onlined after the mem_cgroup was created,
1357         * we have to be prepared to initialize lruvec->zone here;
1358         * and if offlined then reonlined, we need to reinitialize it.
1359         */
1360        if (unlikely(lruvec->zone != zone))
1361                lruvec->zone = zone;
1362        return lruvec;
1363}
1364
1365/*
 1366 * The following LRU functions are allowed to be used without PCG_LOCK.
 1367 * Operations are called by the global LRU routines independently of memcg.
 1368 * What we have to take care of here is the validity of pc->mem_cgroup.
1369 *
1370 * Changes to pc->mem_cgroup happens when
1371 * 1. charge
1372 * 2. moving account
 1373 * In the typical case, "charge" is done before add-to-lru. The exception is
 1374 * SwapCache, which is added to the LRU before being charged.
1375 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1376 * When moving account, the page is not on LRU. It's isolated.
1377 */
1378
1379/**
1380 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1381 * @page: the page
1382 * @zone: zone of the page
1383 */
1384struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1385{
1386        struct mem_cgroup_per_zone *mz;
1387        struct mem_cgroup *memcg;
1388        struct page_cgroup *pc;
1389        struct lruvec *lruvec;
1390
1391        if (mem_cgroup_disabled()) {
1392                lruvec = &zone->lruvec;
1393                goto out;
1394        }
1395
1396        pc = lookup_page_cgroup(page);
1397        memcg = pc->mem_cgroup;
1398
1399        /*
1400         * Surreptitiously switch any uncharged offlist page to root:
1401         * an uncharged page off lru does nothing to secure
1402         * its former mem_cgroup from sudden removal.
1403         *
1404         * Our caller holds lru_lock, and PageCgroupUsed is updated
1405         * under page_cgroup lock: between them, they make all uses
1406         * of pc->mem_cgroup safe.
1407         */
1408        if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1409                pc->mem_cgroup = memcg = root_mem_cgroup;
1410
1411        mz = page_cgroup_zoneinfo(memcg, page);
1412        lruvec = &mz->lruvec;
1413out:
1414        /*
1415         * Since a node can be onlined after the mem_cgroup was created,
1416         * we have to be prepared to initialize lruvec->zone here;
1417         * and if offlined then reonlined, we need to reinitialize it.
1418         */
1419        if (unlikely(lruvec->zone != zone))
1420                lruvec->zone = zone;
1421        return lruvec;
1422}
1423
1424/**
1425 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1426 * @lruvec: mem_cgroup per zone lru vector
1427 * @lru: index of lru list the page is sitting on
1428 * @nr_pages: positive when adding or negative when removing
1429 *
1430 * This function must be called when a page is added to or removed from an
1431 * lru list.
1432 */
1433void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1434                                int nr_pages)
1435{
1436        struct mem_cgroup_per_zone *mz;
1437        unsigned long *lru_size;
1438
1439        if (mem_cgroup_disabled())
1440                return;
1441
1442        mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1443        lru_size = mz->lru_size + lru;
1444        *lru_size += nr_pages;
1445        VM_BUG_ON((long)(*lru_size) < 0);
1446}
1447
1448/*
 1449 * Checks whether the given memcg is the same as root_memcg or lies within
 1450 * root_memcg's hierarchy subtree.
1451 */
1452bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1453                                  struct mem_cgroup *memcg)
1454{
1455        if (root_memcg == memcg)
1456                return true;
1457        if (!root_memcg->use_hierarchy || !memcg)
1458                return false;
1459        return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1460}
1461
1462static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1463                                       struct mem_cgroup *memcg)
1464{
1465        bool ret;
1466
1467        rcu_read_lock();
1468        ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1469        rcu_read_unlock();
1470        return ret;
1471}
1472
1473int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1474{
1475        int ret;
1476        struct mem_cgroup *curr = NULL;
1477        struct task_struct *p;
1478
1479        p = find_lock_task_mm(task);
1480        if (p) {
1481                curr = try_get_mem_cgroup_from_mm(p->mm);
1482                task_unlock(p);
1483        } else {
1484                /*
1485                 * All threads may have already detached their mm's, but the oom
1486                 * killer still needs to detect if they have already been oom
1487                 * killed to prevent needlessly killing additional tasks.
1488                 */
1489                task_lock(task);
1490                curr = mem_cgroup_from_task(task);
1491                if (curr)
1492                        css_get(&curr->css);
1493                task_unlock(task);
1494        }
1495        if (!curr)
1496                return 0;
1497        /*
 1498         * We should check use_hierarchy of "memcg", not "curr": checking
 1499         * use_hierarchy of "curr" here would make this function return true if
 1500         * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
 1501         * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1502         */
1503        ret = mem_cgroup_same_or_subtree(memcg, curr);
1504        css_put(&curr->css);
1505        return ret;
1506}
1507
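     /*
      * Returns non-zero when the inactive anon LRU is small relative to the
      * active list; the target inactive:active ratio scales as
      * sqrt(10 * memcg-size-in-GB), with a minimum of 1.
      */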
1508int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1509{
1510        unsigned long inactive_ratio;
1511        unsigned long inactive;
1512        unsigned long active;
1513        unsigned long gb;
1514
1515        inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1516        active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1517
1518        gb = (inactive + active) >> (30 - PAGE_SHIFT);
1519        if (gb)
1520                inactive_ratio = int_sqrt(10 * gb);
1521        else
1522                inactive_ratio = 1;
1523
1524        return inactive * inactive_ratio < active;
1525}
1526
1527#define mem_cgroup_from_counter(counter, member)        \
1528        container_of(counter, struct mem_cgroup, member)
1529
1530/**
1531 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1532 * @memcg: the memory cgroup
1533 *
1534 * Returns the maximum amount of memory @memcg can be charged with, in
1535 * pages.
1536 */
1537static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1538{
1539        unsigned long margin = 0;
1540        unsigned long count;
1541        unsigned long limit;
1542
1543        count = page_counter_read(&memcg->memory);
1544        limit = ACCESS_ONCE(memcg->memory.limit);
1545        if (count < limit)
1546                margin = limit - count;
1547
1548        if (do_swap_account) {
1549                count = page_counter_read(&memcg->memsw);
1550                limit = ACCESS_ONCE(memcg->memsw.limit);
1551                if (count <= limit)
1552                        margin = min(margin, limit - count);
1553        }
1554
1555        return margin;
1556}
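/*
 * Worked example (illustrative numbers only): with memory.limit = 1000 pages
 * and 940 pages charged, the memory margin is 60 pages; if memsw.limit = 1200
 * pages and 1160 pages are charged there, the memsw margin is 40 pages, so
 * mem_cgroup_margin() returns min(60, 40) = 40.
 */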
1557
1558int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1559{
1560        struct cgroup *cgrp = memcg->css.cgroup;
1561
1562        /* root ? */
1563        if (cgrp->parent == NULL)
1564                return vm_swappiness;
1565
1566        return memcg->swappiness;
1567}
1568
1569/*
1570 * memcg->moving_account is used to check the possibility that some thread is
1571 * calling move_account(). When a thread on CPU-A starts moving pages under
1572 * a memcg, other threads should check memcg->moving_account under
1573 * rcu_read_lock(), like this:
1574 *
1575 *         CPU-A                                    CPU-B
1576 *                                              rcu_read_lock()
1577 *         memcg->moving_account+1              if (memcg->moving_account)
1578 *                                                   take heavy locks.
1579 *         synchronize_rcu()                    update something.
1580 *                                              rcu_read_unlock()
1581 *         start move here.
1582 */
1583
1584/* for quick checking without looking up memcg */
1585atomic_t memcg_moving __read_mostly;
1586
1587static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1588{
1589        atomic_inc(&memcg_moving);
1590        atomic_inc(&memcg->moving_account);
1591        synchronize_rcu();
1592}
1593
1594static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1595{
1596        /*
1597         * Now, mem_cgroup_clear_mc() may call this function with NULL.
1598         * We check NULL in callee rather than caller.
1599         */
1600        if (memcg) {
1601                atomic_dec(&memcg_moving);
1602                atomic_dec(&memcg->moving_account);
1603        }
1604}
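/*
 * Illustrative pairing (a sketch, not copied from any single call site; the
 * page statistics helpers further below follow essentially this pattern):
 *
 *	mover:
 *		mem_cgroup_start_move(from);
 *		... move charges and re-account pages ...
 *		mem_cgroup_end_move(from);
 *
 *	statistics updater:
 *		rcu_read_lock();
 *		if (mem_cgroup_stolen(memcg))
 *			move_lock_mem_cgroup(memcg, &flags);
 *		... update per-memcg page statistics ...
 *		if (the move lock was taken)
 *			move_unlock_mem_cgroup(memcg, &flags);
 *		rcu_read_unlock();
 */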
1605
1606/*
1607 * Two routines for checking whether "memcg" is under move_account() or not.
1608 *
1609 * mem_cgroup_stolen() -  checks whether a cgroup is mc.from. This is used
1610 *                        for avoiding races in accounting.  If true,
1611 *                        pc->mem_cgroup may be overwritten.
1612 *
1613 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to, or
1614 *                        under the hierarchy of a moving cgroup. This is for
1615 *                        waiting at high memory pressure caused by "move".
1616 */
1617
1618static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1619{
1620        VM_BUG_ON(!rcu_read_lock_held());
1621        return atomic_read(&memcg->moving_account) > 0;
1622}
1623
1624static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1625{
1626        struct mem_cgroup *from;
1627        struct mem_cgroup *to;
1628        bool ret = false;
1629        /*
1630         * Unlike task_move routines, we access mc.to, mc.from not under
1631         * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1632         */
1633        spin_lock(&mc.lock);
1634        from = mc.from;
1635        to = mc.to;
1636        if (!from)
1637                goto unlock;
1638
1639        ret = mem_cgroup_same_or_subtree(memcg, from)
1640                || mem_cgroup_same_or_subtree(memcg, to);
1641unlock:
1642        spin_unlock(&mc.lock);
1643        return ret;
1644}
1645
1646static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1647{
1648        if (mc.moving_task && current != mc.moving_task) {
1649                if (mem_cgroup_under_move(memcg)) {
1650                        DEFINE_WAIT(wait);
1651                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1652                        /* moving charge context might have finished. */
1653                        if (mc.moving_task)
1654                                schedule();
1655                        finish_wait(&mc.waitq, &wait);
1656                        return true;
1657                }
1658        }
1659        return false;
1660}
1661
1662/*
1663 * Take this lock when
1664 * - some code tries to modify a page's memcg while it's USED.
1665 * - some code tries to modify page state accounting in a memcg.
1666 * See mem_cgroup_stolen(), too.
1667 */
1668static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1669                                  unsigned long *flags)
1670{
1671        spin_lock_irqsave(&memcg->move_lock, *flags);
1672}
1673
1674static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1675                                unsigned long *flags)
1676{
1677        spin_unlock_irqrestore(&memcg->move_lock, *flags);
1678}
1679
1680#define K(x) ((x) << (PAGE_SHIFT-10))
1681/**
1682 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1683 * @memcg: The memory cgroup that went over limit
1684 * @p: Task that is going to be killed
1685 *
1686 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1687 * enabled
1688 */
1689void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1690{
1691        struct cgroup *task_cgrp;
1692        struct cgroup *mem_cgrp;
1693        /*
1694         * Need a buffer in BSS, can't rely on allocations. The code relies
1695         * on the assumption that OOM is serialized for the memory controller.
1696         * If this assumption is broken, revisit this code.
1697         */
1698        static char memcg_name[PATH_MAX];
1699        int ret;
1700        struct mem_cgroup *iter;
1701        unsigned int i;
1702
1703        if (!p)
1704                return;
1705
1706        rcu_read_lock();
1707
1708        mem_cgrp = memcg->css.cgroup;
1709        task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1710
1711        ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1712        if (ret < 0) {
1713                /*
1714                 * Unfortunately, we are unable to convert to a useful name,
1715                 * but we'll still print out the usage information.
1716                 */
1717                rcu_read_unlock();
1718                goto done;
1719        }
1720        rcu_read_unlock();
1721
1722        pr_info("Task in %s killed", memcg_name);
1723
1724        rcu_read_lock();
1725        ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1726        if (ret < 0) {
1727                rcu_read_unlock();
1728                goto done;
1729        }
1730        rcu_read_unlock();
1731
1732        /*
1733         * Continues from above, so we don't need a KERN_ level
1734         */
1735        pr_cont(" as a result of limit of %s\n", memcg_name);
1736done:
1737
1738        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1739                K((u64)page_counter_read(&memcg->memory)),
1740                K((u64)memcg->memory.limit), memcg->memory.failcnt);
1741        pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1742                K((u64)page_counter_read(&memcg->memsw)),
1743                K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1744        pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1745                K((u64)page_counter_read(&memcg->kmem)),
1746                K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1747
1748        for_each_mem_cgroup_tree(iter, memcg) {
1749                pr_info("Memory cgroup stats");
1750
1751                rcu_read_lock();
1752                ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1753                if (!ret)
1754                        pr_cont(" for %s", memcg_name);
1755                rcu_read_unlock();
1756                pr_cont(":");
1757
1758                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1759                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1760                                continue;
1761                        pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1762                                K(mem_cgroup_read_stat(iter, i)));
1763                }
1764
1765                for (i = 0; i < NR_LRU_LISTS; i++)
1766                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1767                                K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1768
1769                pr_cont("\n");
1770        }
1771}
1772
1773/*
1774 * This function returns the number of memcgs under the hierarchy tree.
1775 * Returns 1 (self count) if there are no children.
1776 */
1777static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1778{
1779        int num = 0;
1780        struct mem_cgroup *iter;
1781
1782        for_each_mem_cgroup_tree(iter, memcg)
1783                num++;
1784        return num;
1785}
1786
1787/*
1788 * Return the memory (and swap, if configured) limit for a memcg.
1789 */
1790static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1791{
1792        unsigned long limit;
1793
1794        limit = memcg->memory.limit;
1795        if (mem_cgroup_swappiness(memcg)) {
1796                unsigned long memsw_limit;
1797
1798                memsw_limit = memcg->memsw.limit;
1799                limit = min(limit + total_swap_pages, memsw_limit);
1800        }
1801        return limit;
1802}
1803
1804static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1805                                     int order)
1806{
1807        struct mem_cgroup *iter;
1808        unsigned long chosen_points = 0;
1809        unsigned long totalpages;
1810        unsigned int points = 0;
1811        struct task_struct *chosen = NULL;
1812
1813        /*
1814         * If current has a pending SIGKILL or is exiting, then automatically
1815         * select it.  The goal is to allow it to allocate so that it may
1816         * quickly exit and free its memory.
1817         */
1818        if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1819                set_thread_flag(TIF_MEMDIE);
1820                return;
1821        }
1822
1823        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1824        totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1825        for_each_mem_cgroup_tree(iter, memcg) {
1826                struct cgroup *cgroup = iter->css.cgroup;
1827                struct cgroup_iter it;
1828                struct task_struct *task;
1829
1830                cgroup_iter_start(cgroup, &it);
1831                while ((task = cgroup_iter_next(cgroup, &it))) {
1832                        switch (oom_scan_process_thread(task, totalpages, NULL,
1833                                                        false)) {
1834                        case OOM_SCAN_SELECT:
1835                                if (chosen)
1836                                        put_task_struct(chosen);
1837                                chosen = task;
1838                                chosen_points = ULONG_MAX;
1839                                get_task_struct(chosen);
1840                                /* fall through */
1841                        case OOM_SCAN_CONTINUE:
1842                                continue;
1843                        case OOM_SCAN_ABORT:
1844                                cgroup_iter_end(cgroup, &it);
1845                                mem_cgroup_iter_break(memcg, iter);
1846                                if (chosen)
1847                                        put_task_struct(chosen);
1848                                return;
1849                        case OOM_SCAN_OK:
1850                                break;
1851                        };
1852                        points = oom_badness(task, memcg, NULL, totalpages);
1853                        if (points > chosen_points) {
1854                                if (chosen)
1855                                        put_task_struct(chosen);
1856                                chosen = task;
1857                                chosen_points = points;
1858                                get_task_struct(chosen);
1859                        }
1860                }
1861                cgroup_iter_end(cgroup, &it);
1862        }
1863
1864        if (!chosen)
1865                return;
1866        points = chosen_points * 1000 / totalpages;
1867        oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1868                         NULL, "Memory cgroup out of memory");
1869}
1870
1871static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1872                                        gfp_t gfp_mask,
1873                                        unsigned long flags)
1874{
1875        unsigned long total = 0;
1876        bool noswap = false;
1877        int loop;
1878
1879        if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1880                noswap = true;
1881        if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1882                noswap = true;
1883
1884        for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1885                if (loop)
1886                        drain_all_stock_async(memcg);
1887                total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1888                /*
1889                 * Allow limit shrinkers, which are triggered directly
1890                 * by userspace, to catch signals and stop reclaim
1891                 * after minimal progress, regardless of the margin.
1892                 */
1893                if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1894                        break;
1895                if (mem_cgroup_margin(memcg))
1896                        break;
1897                /*
1898                 * If nothing was reclaimed after two attempts, there
1899                 * may be no reclaimable pages in this hierarchy.
1900                 */
1901                if (loop && !total)
1902                        break;
1903        }
1904        return total;
1905}
1906
1907/**
1908 * test_mem_cgroup_node_reclaimable
1909 * @memcg: the target memcg
1910 * @nid: the node ID to be checked.
1911 * @noswap: specify true here if the user wants file-only information.
1912 *
1913 * This function returns whether the specified memcg contains any
1914 * reclaimable pages on a node. Returns true if there are any reclaimable
1915 * pages in the node.
1916 */
1917static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1918                int nid, bool noswap)
1919{
1920        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1921                return true;
1922        if (noswap || !total_swap_pages)
1923                return false;
1924        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1925                return true;
1926        return false;
1927
1928}
1929#if MAX_NUMNODES > 1
1930
1931/*
1932 * Always updating the nodemask is not very good - even if we have an empty
1933 * list or the wrong list here, we can start from some node and traverse all
1934 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1935 *
1936 */
1937static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1938{
1939        int nid;
1940        /*
1941         * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
1942         * pagein/pageout changes since the last update.
1943         */
1944        if (!atomic_read(&memcg->numainfo_events))
1945                return;
1946        if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1947                return;
1948
1949        /* make a nodemask where this memcg uses memory from */
1950        memcg->scan_nodes = node_states[N_MEMORY];
1951
1952        for_each_node_mask(nid, node_states[N_MEMORY]) {
1953
1954                if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1955                        node_clear(nid, memcg->scan_nodes);
1956        }
1957
1958        atomic_set(&memcg->numainfo_events, 0);
1959        atomic_set(&memcg->numainfo_updating, 0);
1960}
1961
1962/*
1963 * Select a node to start reclaim from. Because all we need is to reduce the
1964 * usage counter, starting from anywhere is OK. Reclaiming memory from the
1965 * current node has both pros and cons.
1966 *
1967 * Freeing memory from the current node means freeing memory from a node which
1968 * we'll use or have used, so it may hurt that node's LRU. And if several
1969 * threads hit their limits, they will contend on one node. But freeing from a
1970 * remote node means more reclaim cost because of memory latency.
1971 *
1972 * For now, we use round-robin. A better algorithm is welcome.
1973 */
1974int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1975{
1976        int node;
1977
1978        mem_cgroup_may_update_nodemask(memcg);
1979        node = memcg->last_scanned_node;
1980
1981        node = next_node(node, memcg->scan_nodes);
1982        if (node == MAX_NUMNODES)
1983                node = first_node(memcg->scan_nodes);
1984        /*
1985         * We call this when we hit the limit, not when pages are added to the
1986         * LRU. No LRU may hold pages, because all pages are UNEVICTABLE or the
1987         * memcg is too small and no pages are on the LRU yet. In that case,
1988         * we use the current node.
1989         */
1990        if (unlikely(node == MAX_NUMNODES))
1991                node = numa_node_id();
1992
1993        memcg->last_scanned_node = node;
1994        return node;
1995}
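/*
 * Example of the round-robin above (illustrative only): with
 * scan_nodes = {0, 2} and last_scanned_node = 0, the first call picks node 2;
 * the next call runs next_node() off the end of the mask, wraps back to node 0
 * via first_node(), and so on.
 */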
1996
1997/*
1998 * Check all nodes for whether they contain reclaimable pages or not.
1999 * For a quick scan, we make use of scan_nodes. This allows us to skip
2000 * unused nodes. But scan_nodes is lazily updated and may not contain
2001 * enough new information, so we need to double check.
2002 */
2003static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2004{
2005        int nid;
2006
2007        /*
2008         * Quick check, making use of scan_nodes:
2009         * we can skip unused nodes.
2010         */
2011        if (!nodes_empty(memcg->scan_nodes)) {
2012                for (nid = first_node(memcg->scan_nodes);
2013                     nid < MAX_NUMNODES;
2014                     nid = next_node(nid, memcg->scan_nodes)) {
2015
2016                        if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2017                                return true;
2018                }
2019        }
2020        /*
2021         * Check rest of nodes.
2022         */
2023        for_each_node_state(nid, N_MEMORY) {
2024                if (node_isset(nid, memcg->scan_nodes))
2025                        continue;
2026                if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2027                        return true;
2028        }
2029        return false;
2030}
2031
2032#else
2033int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2034{
2035        return 0;
2036}
2037
2038static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2039{
2040        return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2041}
2042#endif
2043
2044static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2045                                   struct zone *zone,
2046                                   gfp_t gfp_mask,
2047                                   unsigned long *total_scanned)
2048{
2049        struct mem_cgroup *victim = NULL;
2050        int total = 0;
2051        int loop = 0;
2052        unsigned long excess;
2053        unsigned long nr_scanned;
2054        struct mem_cgroup_reclaim_cookie reclaim = {
2055                .zone = zone,
2056                .priority = 0,
2057        };
2058
2059        excess = soft_limit_excess(root_memcg);
2060
2061        while (1) {
2062                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2063                if (!victim) {
2064                        loop++;
2065                        if (loop >= 2) {
2066                                /*
2067                                 * If we have not been able to reclaim
2068                                 * anything, it might be because there are
2069                                 * no reclaimable pages under this hierarchy.
2070                                 */
2071                                if (!total)
2072                                        break;
2073                                /*
2074                                 * We want to do more targeted reclaim.
2075                                 * excess >> 2 is not too excessive, so we do
2076                                 * not reclaim too much, nor so little that we
2077                                 * keep coming back to reclaim from this cgroup.
2078                                 */
2079                                if (total >= (excess >> 2) ||
2080                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2081                                        break;
2082                        }
2083                        continue;
2084                }
2085                if (!mem_cgroup_reclaimable(victim, false))
2086                        continue;
2087                total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2088                                                     zone, &nr_scanned);
2089                *total_scanned += nr_scanned;
2090                if (!soft_limit_excess(root_memcg))
2091                        break;
2092        }
2093        mem_cgroup_iter_break(root_memcg, victim);
2094        return total;
2095}
2096
2097static DEFINE_SPINLOCK(memcg_oom_lock);
2098
2099/*
2100 * Check whether the OOM killer is already running under our hierarchy.
2101 * If someone else is running it, return false.
2102 */
2103static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2104{
2105        struct mem_cgroup *iter, *failed = NULL;
2106
2107        spin_lock(&memcg_oom_lock);
2108
2109        for_each_mem_cgroup_tree(iter, memcg) {
2110                if (iter->oom_lock) {
2111                        /*
2112                         * This subtree of our hierarchy is already locked,
2113                         * so we cannot take the lock.
2114                         */
2115                        failed = iter;
2116                        mem_cgroup_iter_break(memcg, iter);
2117                        break;
2118                } else
2119                        iter->oom_lock = true;
2120        }
2121
2122        if (failed) {
2123                /*
2124                 * OK, we failed to lock the whole subtree, so we have
2125                 * to clean up what we set up before reaching the failing subtree.
2126                 */
2127                for_each_mem_cgroup_tree(iter, memcg) {
2128                        if (iter == failed) {
2129                                mem_cgroup_iter_break(memcg, iter);
2130                                break;
2131                        }
2132                        iter->oom_lock = false;
2133                }
2134        }
2135
2136        spin_unlock(&memcg_oom_lock);
2137
2138        return !failed;
2139}
2140
2141static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2142{
2143        struct mem_cgroup *iter;
2144
2145        spin_lock(&memcg_oom_lock);
2146        for_each_mem_cgroup_tree(iter, memcg)
2147                iter->oom_lock = false;
2148        spin_unlock(&memcg_oom_lock);
2149}
2150
2151static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2152{
2153        struct mem_cgroup *iter;
2154
2155        for_each_mem_cgroup_tree(iter, memcg)
2156                atomic_inc(&iter->under_oom);
2157}
2158
2159static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2160{
2161        struct mem_cgroup *iter;
2162
2163        /*
2164         * When a new child is created while the hierarchy is under oom,
2165         * mem_cgroup_oom_lock() may not be called. We have to use
2166         * atomic_add_unless() here.
2167         */
2168        for_each_mem_cgroup_tree(iter, memcg)
2169                atomic_add_unless(&iter->under_oom, -1, 0);
2170}
2171
2172static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2173
2174struct oom_wait_info {
2175        struct mem_cgroup *memcg;
2176        wait_queue_t    wait;
2177};
2178
2179static int memcg_oom_wake_function(wait_queue_t *wait,
2180        unsigned mode, int sync, void *arg)
2181{
2182        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2183        struct mem_cgroup *oom_wait_memcg;
2184        struct oom_wait_info *oom_wait_info;
2185
2186        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2187        oom_wait_memcg = oom_wait_info->memcg;
2188
2189        /*
2190         * Both oom_wait_info->memcg and wake_memcg are stable under us,
2191         * so we can use css_is_ancestor() without worrying about RCU.
2192         */
2193        if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2194                && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2195                return 0;
2196        return autoremove_wake_function(wait, mode, sync, arg);
2197}
2198
2199static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2200{
2201        atomic_inc(&memcg->oom_wakeups);
2202        /* for filtering, pass "memcg" as argument. */
2203        __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2204}
2205
2206static void memcg_oom_recover(struct mem_cgroup *memcg)
2207{
2208        if (memcg && atomic_read(&memcg->under_oom))
2209                memcg_wakeup_oom(memcg);
2210}
2211
2212static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2213{
2214        if (!current->memcg_oom.may_oom)
2215                return;
2216        /*
2217         * We are in the middle of the charge context here, so we
2218         * don't want to block when potentially sitting on a callstack
2219         * that holds all kinds of filesystem and mm locks.
2220         *
2221         * Also, the caller may handle a failed allocation gracefully
2222         * (like optional page cache readahead) and so an OOM killer
2223         * invocation might not even be necessary.
2224         *
2225         * That's why we don't do anything here except remember the
2226         * OOM context and then deal with it at the end of the page
2227         * fault when the stack is unwound, the locks are released,
2228         * and when we know whether the fault was overall successful.
2229         */
2230        css_get(&memcg->css);
2231        current->memcg_oom.memcg = memcg;
2232        current->memcg_oom.gfp_mask = mask;
2233        current->memcg_oom.order = order;
2234}
2235
2236/**
2237 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2238 * @handle: actually kill/wait or just clean up the OOM state
2239 *
2240 * This has to be called at the end of a page fault if the memcg OOM
2241 * handler was enabled.
2242 *
2243 * Memcg supports userspace OOM handling where failed allocations must
2244 * sleep on a waitqueue until the userspace task resolves the
2245 * situation.  Sleeping directly in the charge context with all kinds
2246 * of locks held is not a good idea, instead we remember an OOM state
2247 * in the task and mem_cgroup_oom_synchronize() has to be called at
2248 * the end of the page fault to complete the OOM handling.
2249 *
2250 * Returns %true if an ongoing memcg OOM situation was detected and
2251 * completed, %false otherwise.
2252 */
2253bool mem_cgroup_oom_synchronize(bool handle)
2254{
2255        struct mem_cgroup *memcg = current->memcg_oom.memcg;
2256        struct oom_wait_info owait;
2257        bool locked;
2258
2259        /* OOM is global, do not handle */
2260        if (!memcg)
2261                return false;
2262
2263        if (!handle)
2264                goto cleanup;
2265
2266        owait.memcg = memcg;
2267        owait.wait.flags = 0;
2268        owait.wait.func = memcg_oom_wake_function;
2269        owait.wait.private = current;
2270        INIT_LIST_HEAD(&owait.wait.task_list);
2271
2272        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2273        mem_cgroup_mark_under_oom(memcg);
2274
2275        locked = mem_cgroup_oom_trylock(memcg);
2276
2277        if (locked)
2278                mem_cgroup_oom_notify(memcg);
2279
2280        if (locked && !memcg->oom_kill_disable) {
2281                mem_cgroup_unmark_under_oom(memcg);
2282                finish_wait(&memcg_oom_waitq, &owait.wait);
2283                mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2284                                         current->memcg_oom.order);
2285        } else {
2286                schedule();
2287                mem_cgroup_unmark_under_oom(memcg);
2288                finish_wait(&memcg_oom_waitq, &owait.wait);
2289        }
2290
2291        if (locked) {
2292                mem_cgroup_oom_unlock(memcg);
2293                /*
2294                 * There is no guarantee that an OOM-lock contender
2295                 * sees the wakeups triggered by the OOM kill
2296                 * uncharges.  Wake any sleepers explicitly.
2297                 */
2298                memcg_oom_recover(memcg);
2299        }
2300cleanup:
2301        current->memcg_oom.memcg = NULL;
2302        css_put(&memcg->css);
2303        return true;
2304}
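/*
 * A rough sketch of the intended calling sequence (the fault-path helpers
 * live in mm/memory.c and mm/oom_kill.c and may differ in detail):
 *
 *	charge path:	mem_cgroup_oom(memcg, gfp_mask, order);
 *			fail the charge with -ENOMEM
 *
 *	end of fault:	if the fault failed with VM_FAULT_OOM
 *				mem_cgroup_oom_synchronize(true);   kill or wait
 *			else if an OOM context was recorded
 *				mem_cgroup_oom_synchronize(false);  clean up only
 */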
2305
2306/*
2307 * Currently used to update mapped file statistics, but the routine can be
2308 * generalized to update other statistics as well.
2309 *
2310 * Notes: Race condition
2311 *
2312 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
2313 * it tends to be costly. Under some conditions, however, we don't need
2314 * to do so _always_.
2315 *
2316 * Considering "charge", lock_page_cgroup() is not required because all
2317 * file-stat operations happen after a page is attached to the radix tree, so
2318 * there is no race with "charge".
2319 *
2320 * Considering "uncharge", we know that memcg intentionally doesn't clear
2321 * pc->mem_cgroup at "uncharge". So, we always see a valid pc->mem_cgroup even
2322 * if there is a race with "uncharge". The statistics themselves are properly
2323 * handled by flags.
2324 *
2325 * Considering "move", this is the only case where we see a race. To keep the
2326 * race window small, we check memcg->moving_account to detect the possibility
2327 * of a race, and take the lock if there is one.
2328 */
2329
2330void __mem_cgroup_begin_update_page_stat(struct page *page,
2331                                bool *locked, unsigned long *flags)
2332{
2333        struct mem_cgroup *memcg;
2334        struct page_cgroup *pc;
2335
2336        pc = lookup_page_cgroup(page);
2337again:
2338        memcg = pc->mem_cgroup;
2339        if (unlikely(!memcg || !PageCgroupUsed(pc)))
2340                return;
2341        /*
2342         * If this memory cgroup is not under account moving, we don't
2343         * need to take move_lock_mem_cgroup(): because we already hold
2344         * rcu_read_lock(), any thread starting move_account() will be delayed
2345         * in synchronize_rcu() until our rcu_read_unlock().
2346         */
2347        if (!mem_cgroup_stolen(memcg))
2348                return;
2349
2350        move_lock_mem_cgroup(memcg, flags);
2351        if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2352                move_unlock_mem_cgroup(memcg, flags);
2353                goto again;
2354        }
2355        *locked = true;
2356}
2357
2358void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2359{
2360        struct page_cgroup *pc = lookup_page_cgroup(page);
2361
2362        /*
2363         * It's guaranteed that pc->mem_cgroup never changes while the
2364         * lock is held, because any routine that modifies pc->mem_cgroup
2365         * must take move_lock_mem_cgroup().
2366         */
2367        move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2368}
2369
2370void mem_cgroup_update_page_stat(struct page *page,
2371                                 enum mem_cgroup_page_stat_item idx, int val)
2372{
2373        struct mem_cgroup *memcg;
2374        struct page_cgroup *pc = lookup_page_cgroup(page);
2375        unsigned long uninitialized_var(flags);
2376
2377        if (mem_cgroup_disabled())
2378                return;
2379
2380        memcg = pc->mem_cgroup;
2381        if (unlikely(!memcg || !PageCgroupUsed(pc)))
2382                return;
2383
2384        switch (idx) {
2385        case MEMCG_NR_FILE_MAPPED:
2386                idx = MEM_CGROUP_STAT_FILE_MAPPED;
2387                break;
2388        default:
2389                BUG();
2390        }
2391
2392        this_cpu_add(memcg->stat->count[idx], val);
2393}
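/*
 * Illustrative caller pattern (a sketch only; real call sites such as the
 * rmap code go through inline wrappers in memcontrol.h that end up in the
 * three functions above):
 *
 *	bool locked = false;
 *	unsigned long flags;
 *
 *	rcu_read_lock();
 *	__mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
 *	if (locked)
 *		__mem_cgroup_end_update_page_stat(page, &flags);
 *	rcu_read_unlock();
 */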
2394
2395/*
2396 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
2397 * TODO: it may be necessary to use bigger numbers on big iron.
2398 */
2399#define CHARGE_BATCH    32U
2400struct memcg_stock_pcp {
2401        struct mem_cgroup *cached; /* this is never the root cgroup */
2402        unsigned int nr_pages;
2403        struct work_struct work;
2404        unsigned long flags;
2405#define FLUSHING_CACHED_CHARGE  0
2406};
2407static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2408static DEFINE_MUTEX(percpu_charge_mutex);
2409
2410/**
2411 * consume_stock: Try to consume stocked charge on this cpu.
2412 * @memcg: memcg to consume from.
2413 * @nr_pages: how many pages to charge.
2414 *
2415 * The charges will only happen if @memcg matches the current cpu's memcg
2416 * stock, and at least @nr_pages are available in that stock.  Failure to
2417 * service an allocation will refill the stock.
2418 *
2419 * returns true if successful, false otherwise.
2420 */
2421static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2422{
2423        struct memcg_stock_pcp *stock;
2424        bool ret = false;
2425
2426        if (nr_pages > CHARGE_BATCH)
2427                return ret;
2428
2429        stock = &get_cpu_var(memcg_stock);
2430        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2431                stock->nr_pages -= nr_pages;
2432                ret = true;
2433        }
2434        put_cpu_var(memcg_stock);
2435        return ret;
2436}
2437
2438/*
2439 * Return the charges cached in the percpu stock and reset the cached information.
2440 */
2441static void drain_stock(struct memcg_stock_pcp *stock)
2442{
2443        struct mem_cgroup *old = stock->cached;
2444
2445        if (stock->nr_pages) {
2446                page_counter_uncharge(&old->memory, stock->nr_pages);
2447                if (do_swap_account)
2448                        page_counter_uncharge(&old->memsw, stock->nr_pages);
2449                stock->nr_pages = 0;
2450        }
2451        stock->cached = NULL;
2452}
2453
2454/*
2455 * This must be called with preemption disabled, or by a thread
2456 * which is pinned to the local cpu.
2457 */
2458static void drain_local_stock(struct work_struct *dummy)
2459{
2460        struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2461        drain_stock(stock);
2462        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2463}
2464
2465static void __init memcg_stock_init(void)
2466{
2467        int cpu;
2468
2469        for_each_possible_cpu(cpu) {
2470                struct memcg_stock_pcp *stock =
2471                                        &per_cpu(memcg_stock, cpu);
2472                INIT_WORK(&stock->work, drain_local_stock);
2473        }
2474}
2475
2476/*
2477 * Cache charges (nr_pages) in the local per-cpu area.
2478 * They will be consumed by consume_stock() later.
2479 */
2480static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2481{
2482        struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2483
2484        if (stock->cached != memcg) { /* reset if necessary */
2485                drain_stock(stock);
2486                stock->cached = memcg;
2487        }
2488        stock->nr_pages += nr_pages;
2489        put_cpu_var(memcg_stock);
2490}
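/*
 * How the stock is typically used (a simplified sketch of the fast path in
 * __mem_cgroup_try_charge() further below):
 *
 *	if (consume_stock(memcg, nr_pages))
 *		return 0;	(served from the per-cpu cache)
 *
 *	otherwise charge a whole CHARGE_BATCH against the page counters
 *	instead of just nr_pages, and park the surplus locally for later:
 *
 *	refill_stock(memcg, CHARGE_BATCH - nr_pages);
 */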
2491
2492/*
2493 * Drains all per-CPU charge caches for the given root_memcg and the
2494 * hierarchy subtree under it. The sync flag says whether we should block
2495 * until the work is done.
2496 */
2497static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2498{
2499        int cpu, curcpu;
2500
2501        /* Notify other cpus that system-wide "drain" is running */
2502        get_online_cpus();
2503        curcpu = get_cpu();
2504        for_each_online_cpu(cpu) {
2505                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2506                struct mem_cgroup *memcg;
2507
2508                memcg = stock->cached;
2509                if (!memcg || !stock->nr_pages)
2510                        continue;
2511                if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2512                        continue;
2513                if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2514                        if (cpu == curcpu)
2515                                drain_local_stock(&stock->work);
2516                        else
2517                                schedule_work_on(cpu, &stock->work);
2518                }
2519        }
2520        put_cpu();
2521
2522        if (!sync)
2523                goto out;
2524
2525        for_each_online_cpu(cpu) {
2526                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2527                if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2528                        flush_work(&stock->work);
2529        }
2530out:
2531        put_online_cpus();
2532}
2533
2534/*
2535 * Tries to drain stocked charges on other cpus. This function is asynchronous
2536 * and just schedules a work item per cpu to drain locally on each cpu. The
2537 * caller can expect that some charges will come back later but cannot wait.
2538 */
2539static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2540{
2541        /*
2542         * If someone is already draining, avoid adding more kworker runs.
2543         */
2544        if (!mutex_trylock(&percpu_charge_mutex))
2545                return;
2546        drain_all_stock(root_memcg, false);
2547        mutex_unlock(&percpu_charge_mutex);
2548}
2549
2550/* This is a synchronous drain interface. */
2551static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2552{
2553        /* called when force_empty is called */
2554        mutex_lock(&percpu_charge_mutex);
2555        drain_all_stock(root_memcg, true);
2556        mutex_unlock(&percpu_charge_mutex);
2557}
2558
2559/*
2560 * This function drains the percpu counter values from a DEAD cpu and
2561 * moves them to the local cpu. Note that this function can be preempted.
2562 */
2563static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2564{
2565        int i;
2566
2567        spin_lock(&memcg->pcp_counter_lock);
2568        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2569                long x = per_cpu(memcg->stat->count[i], cpu);
2570
2571                per_cpu(memcg->stat->count[i], cpu) = 0;
2572                memcg->nocpu_base.count[i] += x;
2573        }
2574        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2575                unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2576
2577                per_cpu(memcg->stat->events[i], cpu) = 0;
2578                memcg->nocpu_base.events[i] += x;
2579        }
2580        spin_unlock(&memcg->pcp_counter_lock);
2581}
2582
2583static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2584                                        unsigned long action,
2585                                        void *hcpu)
2586{
2587        int cpu = (unsigned long)hcpu;
2588        struct memcg_stock_pcp *stock;
2589        struct mem_cgroup *iter;
2590
2591        if (action == CPU_ONLINE)
2592                return NOTIFY_OK;
2593
2594        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2595                return NOTIFY_OK;
2596
2597        for_each_mem_cgroup(iter)
2598                mem_cgroup_drain_pcp_counter(iter, cpu);
2599
2600        stock = &per_cpu(memcg_stock, cpu);
2601        drain_stock(stock);
2602        return NOTIFY_OK;
2603}
2604
2605
2606/* See __mem_cgroup_try_charge() for details */
2607enum {
2608        CHARGE_OK,              /* success */
2609        CHARGE_RETRY,           /* need to retry but retry is not bad */
2610        CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
2611        CHARGE_WOULDBLOCK,      /* __GFP_WAIT wasn't set and not enough resources */
2612};
2613
2614static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2615                                unsigned int nr_pages, unsigned int min_pages,
2616                                bool invoke_oom)
2617{
2618        struct mem_cgroup *mem_over_limit;
2619        struct page_counter *counter;
2620        unsigned long flags = 0;
2621        int ret;
2622
2623        ret = page_counter_try_charge(&memcg->memory, nr_pages, &counter);
2624
2625        if (likely(!ret)) {
2626                if (!do_swap_account)
2627                        return CHARGE_OK;
2628                ret = page_counter_try_charge(&memcg->memsw, nr_pages, &counter);
2629                if (likely(!ret))
2630                        return CHARGE_OK;
2631
2632                page_counter_uncharge(&memcg->memory, nr_pages);
2633                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2634                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2635        } else
2636                mem_over_limit = mem_cgroup_from_counter(counter, memory);
2637        /*
2638         * Never reclaim on behalf of optional batching, retry with a
2639         * single page instead.
2640         */
2641        if (nr_pages > min_pages)
2642                return CHARGE_RETRY;
2643
2644        if (!(gfp_mask & __GFP_WAIT))
2645                return CHARGE_WOULDBLOCK;
2646
2647        if (gfp_mask & __GFP_NORETRY)
2648                return CHARGE_NOMEM;
2649
2650        ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2651        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2652                return CHARGE_RETRY;
2653        /*
2654         * Even though the limit is exceeded at this point, reclaim
2655         * may have been able to free some pages.  Retry the charge
2656         * before killing the task.
2657         *
2658         * Only for regular pages, though: huge pages are rather
2659         * unlikely to succeed so close to the limit, and we fall back
2660         * to regular pages anyway in case of failure.
2661         */
2662        if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2663                return CHARGE_RETRY;
2664
2665        /*
2666         * At task move, charge accounts can be doubly counted. So, it's
2667         * better to wait until the end of task_move if something is going on.
2668         */
2669        if (mem_cgroup_wait_acct_move(mem_over_limit))
2670                return CHARGE_RETRY;
2671
2672        if (invoke_oom)
2673                mem_cgroup_oom(mem_over_limit, gfp_mask,
2674                               get_order(nr_pages * PAGE_SIZE));
2675
2676        return CHARGE_NOMEM;
2677}
2678
2679/*
2680 * __mem_cgroup_try_charge() does
2681 * 1. detect memcg to be charged against from passed *mm and *ptr,
2682 * 2. update page_counter
2683 * 3. call memory reclaim if necessary.
2684 *
2685 * In some special cases, if the task is dying (fatal_signal_pending()) or
2686 * has TIF_MEMDIE set, this function returns -EINTR while writing root_mem_cgroup
2687 * to *ptr. There are two reasons for this. 1: dying threads should quit as soon
2688 * as possible without any hazards. 2: all pages should have a valid
2689 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2690 * pointer, that is treated as a charge to root_mem_cgroup.
2691 *
2692 * So __mem_cgroup_try_charge() will return
2693 *  0       ...  on success, filling *ptr with a valid memcg pointer.
2694 *  -ENOMEM ...  charge failure because of resource limits.
2695 *  -EINTR  ...  if the thread is dying. *ptr is filled with root_mem_cgroup.
2696 *
2697 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
2698 * the oom-killer can be invoked.
2699 */
2700static int __mem_cgroup_try_charge(struct mm_struct *mm,
2701                                   gfp_t gfp_mask,
2702                                   unsigned int nr_pages,
2703                                   struct mem_cgroup **ptr,
2704                                   bool oom)
2705{
2706        unsigned int batch = max(CHARGE_BATCH, nr_pages);
2707        int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2708        struct mem_cgroup *memcg = NULL;
2709        int ret;
2710
2711        /*
2712         * Unlike the global VM's OOM kill, we're not under a system-level
2713         * memory shortage here. So, allow dying processes to proceed, in
2714         * addition to MEMDIE processes.
2715         */
2716        if (unlikely(test_thread_flag(TIF_MEMDIE)
2717                     || fatal_signal_pending(current)))
2718                goto bypass;
2719
2720        /*
2721         * Prevent unbounded recursion when reclaim operations need to
2722         * allocate memory. This might exceed the limits temporarily,
2723         * but we prefer facilitating memory reclaim and getting back
2724         * under the limit over triggering OOM kills in these cases.
2725         */
2726        if (unlikely(current->flags & PF_MEMALLOC))
2727                goto bypass;
2728
2729        if (unlikely(task_in_memcg_oom(current)))
2730                goto nomem;
2731
2732        if (gfp_mask & __GFP_NOFAIL)
2733                oom = false;
2734
2735        /*
2736         * We always charge the cgroup the mm_struct belongs to.
2737         * The mm_struct's mem_cgroup changes on task migration if the
2738         * thread group leader migrates. It's possible that mm is not
2739         * set, if so charge the root memcg (happens for pagecache usage).
2740         */
2741        if (!*ptr && !mm)
2742                *ptr = root_mem_cgroup;
2743again:
2744        if (*ptr) { /* css should be a valid one */
2745                memcg = *ptr;
2746                if (mem_cgroup_is_root(memcg))
2747                        goto done;
2748                if (consume_stock(memcg, nr_pages))
2749                        goto done;
2750                css_get(&memcg->css);
2751        } else {
2752                struct task_struct *p;
2753
2754                rcu_read_lock();
2755                p = rcu_dereference(mm->owner);
2756                /*
2757                 * Because we don't have task_lock(), "p" can exit.
2758                 * In that case, "memcg" can point to root, or p can be NULL due
2759                 * to a race with swapoff. Then we have a small risk of
2760                 * mis-accounting. But that kind of mis-accounting by race can
2761                 * always happen because we don't hold cgroup_mutex(). Avoiding it
2762                 * would be overkill, so we allow that small race here.
2763                 * (*) swapoff et al. will charge against the mm_struct, not the
2764                 * task_struct. So mm->owner can be NULL.
2765                 */
2766                memcg = mem_cgroup_from_task(p);
2767                if (!memcg)
2768                        memcg = root_mem_cgroup;
2769                if (mem_cgroup_is_root(memcg)) {
2770                        rcu_read_unlock();
2771                        goto done;
2772                }
2773                if (consume_stock(memcg, nr_pages)) {
2774                        /*
2775                         * It seems dangerous to access memcg without css_get().
2776                         * But considering how consume_stock works, it's not
2777                         * necessary. If consume_stock succeeds, some charges
2778                         * from this memcg are cached on this cpu. So, we
2779                         * don't need to call css_get()/css_tryget() before
2780                         * calling consume_stock().
2781                         */
2782                        rcu_read_unlock();
2783                        goto done;
2784                }
2785                /* after this point we may block, so we need to take a refcount */
2786                if (!css_tryget(&memcg->css)) {
2787                        rcu_read_unlock();
2788                        goto again;
2789                }
2790                rcu_read_unlock();
2791        }
2792
2793        do {
2794                bool invoke_oom = oom && !nr_oom_retries;
2795
2796                /* If killed, bypass charge */
2797                if (fatal_signal_pending(current)) {
2798                        css_put(&memcg->css);
2799                        goto bypass;
2800                }
2801
2802                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2803                                           nr_pages, invoke_oom);
2804                switch (ret) {
2805                case CHARGE_OK:
2806                        break;
2807                case CHARGE_RETRY: /* not in OOM situation but retry */
2808                        batch = nr_pages;
2809                        css_put(&memcg->css);
2810                        memcg = NULL;
2811                        goto again;
2812                case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2813                        css_put(&memcg->css);
2814                        goto nomem;
2815                case CHARGE_NOMEM: /* OOM routine works */
2816                        if (!oom || invoke_oom) {
2817                                css_put(&memcg->css);
2818                                goto nomem;
2819                        }
2820                        nr_oom_retries--;
2821                        break;
2822                }
2823        } while (ret != CHARGE_OK);
2824
2825        if (batch > nr_pages)
2826                refill_stock(memcg, batch - nr_pages);
2827        css_put(&memcg->css);
2828done:
2829        *ptr = memcg;
2830        return 0;
2831nomem:
2832        if (!(gfp_mask & __GFP_NOFAIL)) {
2833                *ptr = NULL;
2834                return -ENOMEM;
2835        }
2836bypass:
2837        *ptr = root_mem_cgroup;
2838        return -EINTR;
2839}
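/*
 * A minimal sketch of the typical caller pattern (the charging entry points
 * elsewhere in this file follow roughly this shape):
 *
 *	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
 *	if (ret == -ENOMEM)
 *		return ret;
 *	on 0 or -EINTR, *memcg is valid (root_mem_cgroup on bypass), and the
 *	charge is committed against it:
 *	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
 */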
2840
2841/*
2842 * Sometimes we have to undo a charge we got by try_charge().
2843 * This function is for that: it uncharges the page counters for a charge
2844 * obtained by try_charge().
2845 */
2846static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2847                                       unsigned int nr_pages)
2848{
2849        if (!mem_cgroup_is_root(memcg)) {
2850                page_counter_uncharge(&memcg->memory, nr_pages);
2851                if (do_swap_account)
2852                        page_counter_uncharge(&memcg->memsw, nr_pages);
2853        }
2854}
2855
2856struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
2857/*
2858 * A helper function to get a mem_cgroup from an ID. Must be called under
2859 * rcu_read_lock().  The caller is responsible for calling css_tryget() if
2860 * the mem_cgroup is used for charging. (Dropping a refcnt from swap can be
2861 * done against a removed memcg.)
2862 */
2863static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2864{
2865        /* ID 0 is unused ID */
2866        if (!id)
2867                return NULL;
2868        return mem_cgroup_from_id(id);
2869}
2870
2871struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2872{
2873        struct mem_cgroup *memcg = NULL;
2874        struct page_cgroup *pc;
2875        unsigned short id;
2876        swp_entry_t ent;
2877
2878        VM_BUG_ON_PAGE(!PageLocked(page), page);
2879
2880        pc = lookup_page_cgroup(page);
2881        lock_page_cgroup(pc);
2882        if (PageCgroupUsed(pc)) {
2883                memcg = pc->mem_cgroup;
2884                if (memcg && !css_tryget(&memcg->css))
2885                        memcg = NULL;
2886        } else if (PageSwapCache(page)) {
2887                ent.val = page_private(page);
2888                id = lookup_swap_cgroup_id(ent);
2889                rcu_read_lock();
2890                memcg = mem_cgroup_lookup(id);
2891                if (memcg && !css_tryget(&memcg->css))
2892                        memcg = NULL;
2893                rcu_read_unlock();
2894        }
2895        unlock_page_cgroup(pc);
2896        return memcg;
2897}
2898
2899static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2900                                       struct page *page,
2901                                       unsigned int nr_pages,
2902                                       enum charge_type ctype,
2903                                       bool lrucare)
2904{
2905        struct page_cgroup *pc = lookup_page_cgroup(page);
2906        struct zone *uninitialized_var(zone);
2907        struct lruvec *lruvec;
2908        bool was_on_lru = false;
2909        bool anon;
2910
2911        lock_page_cgroup(pc);
2912        VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2913        /*
2914         * We don't need page_cgroup_lock for tail pages, because they are not
2915         * accessed by any other context at this point.
2916         */
2917
2918        /*
2919         * In some cases (SwapCache and FUSE splice_buf->radixtree), the page
2920         * may already be on some other mem_cgroup's LRU.  Take care of that here.
2921         */
2922        if (lrucare) {
2923                zone = page_zone(page);
2924                spin_lock_irq(&zone->lru_lock);
2925                if (PageLRU(page)) {
2926                        lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2927                        ClearPageLRU(page);
2928                        del_page_from_lru_list(page, lruvec, page_lru(page));
2929                        was_on_lru = true;
2930                }
2931        }
2932
2933        pc->mem_cgroup = memcg;
2934        /*
2935         * We access a page_cgroup asynchronously without lock_page_cgroup().
2936         * In particular, when a page_cgroup is taken from a page, pc->mem_cgroup
2937         * is accessed after testing the USED bit. To make pc->mem_cgroup visible
2938         * before the USED bit, we need a memory barrier here.
2939         * See mem_cgroup_add_lru_list(), etc.
2940         */
2941        smp_wmb();
2942        SetPageCgroupUsed(pc);
2943
2944        if (lrucare) {
2945                if (was_on_lru) {
2946                        lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2947                        VM_BUG_ON_PAGE(PageLRU(page), page);
2948                        SetPageLRU(page);
2949                        add_page_to_lru_list(page, lruvec, page_lru(page));
2950                }
2951                spin_unlock_irq(&zone->lru_lock);
2952        }
2953
2954        if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2955                anon = true;
2956        else
2957                anon = false;
2958
2959        mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2960        unlock_page_cgroup(pc);
2961
2962        /*
2963         * "charge_statistics" updated the event counter. Now check it, and
2964         * insert the ancestor (and the ancestor's ancestors) into the
2965         * soft-limit RB-tree if they exceed their soft limit.
2966         */
2967        memcg_check_events(memcg, page);
2968}
2969
2970#ifdef CONFIG_MEMCG_KMEM
2971static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2972{
2973        return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2974                (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2975}
2976
2977/*
2978 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2979 * in the memcg_cache_params struct.
2980 */
2981static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2982{
2983        struct kmem_cache *cachep;
2984
2985        VM_BUG_ON(p->is_root_cache);
2986        cachep = p->root_cache;
2987        return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2988}
2989
2990#ifdef CONFIG_SLABINFO
2991static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2992                                        struct seq_file *m)
2993{
2994        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2995        struct memcg_cache_params *params;
2996
2997        if (!memcg_can_account_kmem(memcg))
2998                return -EIO;
2999
3000        print_slabinfo_header(m);
3001
3002        mutex_lock(&memcg->slab_caches_mutex);
3003        list_for_each_entry(params, &memcg->memcg_slab_caches, list)
3004                cache_show(memcg_params_to_cache(params), m);
3005        mutex_unlock(&memcg->slab_caches_mutex);
3006
3007        return 0;
3008}
3009#endif
3010
3011static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
3012                             unsigned long nr_pages)
3013{
3014        struct page_counter *counter;
3015        struct mem_cgroup *_memcg;
3016        int ret = 0;
3017        bool may_oom;
3018
3019        ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
3020        if (ret < 0)
3021                return ret;
3022
3023        /*
3024         * Conditions under which we can wait for the oom_killer. Those are
3025         * the same conditions tested by the core page allocator
3026         */
3027        may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
3028
3029        _memcg = memcg;
3030        ret = __mem_cgroup_try_charge(NULL, gfp, nr_pages, &_memcg, may_oom);
3031
3032        if (ret == -EINTR)  {
3033                /*
3034                 * __mem_cgroup_try_charge() chose to bypass to root due to
3035                 * OOM kill or fatal signal.  Since our only options are to
3036                 * either fail the allocation or charge it to this cgroup, do
3037                 * it as a temporary condition. But we can't fail. From a
3038                 * kmem/slab perspective, the cache has already been selected,
3039                 * by mem_cgroup_kmem_get_cache(), so it is too late to change
3040                 * our minds.
3041                 *
3042                 * This condition will only trigger if the task entered
3043                 * memcg_charge_kmem in a sane state, but was OOM-killed during
3044                 * __mem_cgroup_try_charge() above. Tasks that were already
3045                 * dying when the allocation triggers should have been already
3046                 * directed to the root cgroup in memcontrol.h
3047                 */
3048                page_counter_charge(&memcg->memory, nr_pages);
3049                if (do_swap_account)
3050                        page_counter_charge(&memcg->memsw, nr_pages);
3051                ret = 0;
3052        } else if (ret)
3053                page_counter_uncharge(&memcg->kmem, nr_pages);
3054
3055        return ret;
3056}
3057
3058static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
3059                                unsigned long nr_pages)
3060{
3061        page_counter_uncharge(&memcg->memory, nr_pages);
3062        if (do_swap_account)
3063                page_counter_uncharge(&memcg->memsw, nr_pages);
3064
3065        /* Not down to 0 */
3066        if (page_counter_uncharge(&memcg->kmem, nr_pages))
3067                return;
3068
3069        if (memcg_kmem_test_and_clear_dead(memcg))
3070                mem_cgroup_put(memcg);
3071}
3072
3073/*
3074 * Helper for accessing a memcg's index. It will be used as an index in the
3075 * child cache array in kmem_cache, and also to derive its name. This function
3076 * will return -1 when this is not a kmem-limited memcg.
3077 */
3078int memcg_cache_id(struct mem_cgroup *memcg)
3079{
3080        return memcg ? memcg->kmemcg_id : -1;
3081}
3082
3083/*
3084 * This ends up being protected by the set_limit mutex, during normal
3085 * operation, because that is its main call site.
3086 *
3087 * But when we create a new cache, we can call this as well if its parent
3088 * is kmem-limited. That will have to hold set_limit_mutex as well.
3089 */
3090int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3091{
3092        int num, ret;
3093
3094        num = ida_simple_get(&kmem_limited_groups,
3095                                0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3096        if (num < 0)
3097                return num;
3098        /*
3099         * After this point, kmem_accounted (which we test atomically at
3100         * the beginning of this conditional) is no longer 0. This
3101         * guarantees only one process will set the following boolean
3102         * to true. We don't need test_and_set because we're protected
3103         * by the set_limit_mutex anyway.
3104         */
3105        memcg_kmem_set_activated(memcg);
3106
3107        ret = memcg_update_all_caches(num+1);
3108        if (ret) {
3109                ida_simple_remove(&kmem_limited_groups, num);
3110                memcg_kmem_clear_activated(memcg);
3111                return ret;
3112        }
3113
3114        memcg->kmemcg_id = num;
3115        INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3116        mutex_init(&memcg->slab_caches_mutex);
3117        return 0;
3118}
3119
3120static size_t memcg_caches_array_size(int num_groups)
3121{
3122        ssize_t size;
3123        if (num_groups <= 0)
3124                return 0;
3125
3126        size = 2 * num_groups;
3127        if (size < MEMCG_CACHES_MIN_SIZE)
3128                size = MEMCG_CACHES_MIN_SIZE;
3129        else if (size > MEMCG_CACHES_MAX_SIZE)
3130                size = MEMCG_CACHES_MAX_SIZE;
3131
3132        return size;
3133}
3134
3135/*
3136 * We should update the current array size iff all cache updates succeed. This
3137 * can only be done from the slab side. The slab mutex needs to be held when
3138 * calling this.
3139 */
3140void memcg_update_array_size(int num)
3141{
3142        if (num > memcg_limited_groups_array_size)
3143                memcg_limited_groups_array_size = memcg_caches_array_size(num);
3144}
3145
3146static void kmem_cache_destroy_work_func(struct work_struct *w);
3147
3148int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3149{
3150        struct memcg_cache_params *cur_params = s->memcg_params;
3151
3152        VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
3153        /*
3154         * We need to do this if we are increasing the size or if there are
3155         * kmem_caches with a NULL memcg_params; otherwise we would dereference
3156         * a NULL pointer in __memcg_kmem_get_cache().
3157         */
3158        if (num_groups > memcg_limited_groups_array_size || !cur_params) {
3159                int i;
3160                ssize_t size = memcg_caches_array_size(num_groups);
3161
3162                size *= sizeof(void *);
3163                size += sizeof(struct memcg_cache_params);
3164
3165                s->memcg_params = kzalloc(size, GFP_KERNEL);
3166                if (!s->memcg_params) {
3167                        s->memcg_params = cur_params;
3168                        return -ENOMEM;
3169                }
3170
3171                s->memcg_params->is_root_cache = true;
3172
3173                /* if there was no kmem_cache->memcg_params, this is the first time, so we are done */
3174                if (!cur_params)
3175                        return 0;
3176
3177                /*
3178                 * There is a chance it will be bigger than
3179                 * memcg_limited_groups_array_size, if we failed an allocation
3180                 * in a cache, in which case all caches updated before it will
3181                 * have a bigger array.
3182                 *
3183                 * But if that is the case, the data after
3184                 * memcg_limited_groups_array_size is certainly unused.
3185                 */
3186                for (i = 0; i < memcg_limited_groups_array_size; i++) {
3187                        if (!cur_params->memcg_caches[i])
3188                                continue;
3189                        s->memcg_params->memcg_caches[i] =
3190                                                cur_params->memcg_caches[i];
3191                }
3192
3193                /*
3194                 * Ideally, we would wait until all caches succeed, and only
3195                 * then free the old one. But this is not worth the extra
3196                 * pointer per-cache we'd have to have for this.
3197                 *
3198                 * It is not a big deal if some caches are left with a size
3199                 * bigger than the others. And all updates will reset this
3200                 * anyway.
3201                 */
3202                kfree(cur_params);
3203        }
3204        return 0;
3205}
3206
3207int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3208                             struct kmem_cache *root_cache)
3209{
3210        size_t size = sizeof(struct memcg_cache_params);
3211
3212        if (!memcg_kmem_enabled())
3213                return 0;
3214
3215        if (!memcg)
3216                size += memcg_limited_groups_array_size * sizeof(void *);
3217
3218        s->memcg_params = kzalloc(size, GFP_KERNEL);
3219        if (!s->memcg_params)
3220                return -ENOMEM;
3221
3222        if (memcg) {
3223                s->memcg_params->memcg = memcg;
3224                s->memcg_params->root_cache = root_cache;
3225                INIT_WORK(&s->memcg_params->destroy,
3226                                kmem_cache_destroy_work_func);
3227        } else
3228                s->memcg_params->is_root_cache = true;
3229
3230        return 0;
3231}
3232
3233void memcg_free_cache_params(struct kmem_cache *s)
3234{
3235        kfree(s->memcg_params);
3236}
3237
3238void memcg_register_cache(struct kmem_cache *s)
3239{
3240        struct kmem_cache *root;
3241        struct mem_cgroup *memcg;
3242        int id;
3243
3244        if (is_root_cache(s))
3245                return;
3246
3247        /*
3248         * Holding the slab_mutex assures nobody will touch the memcg_caches
3249         * array while we are modifying it.
3250         */
3251        lockdep_assert_held(&slab_mutex);
3252
3253        root = s->memcg_params->root_cache;
3254        memcg = s->memcg_params->memcg;
3255        id = memcg_cache_id(memcg);
3256
3257        mutex_lock(&memcg->slab_caches_mutex);
3258        list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3259        mutex_unlock(&memcg->slab_caches_mutex);
3260
3261        VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3262        root->memcg_params->memcg_caches[id] = s;
3263        /*
3264         * The readers won't take a lock, so make sure everybody sees the
3265         * updated value and won't queue cache creation again for no reason.
3266         */
3267        wmb();
3268}
3269
3270void memcg_unregister_cache(struct kmem_cache *s)
3271{
3272        struct kmem_cache *root;
3273        struct mem_cgroup *memcg;
3274        int id;
3275
3276        /*
3277         * This happens, for instance, when a root cache goes away before we
3278         * add any memcg.
3279         */
3280        if (!s->memcg_params)
3281                return;
3282
3283        if (s->memcg_params->is_root_cache)
3284                return;
3285
3286        /*
3287         * Holding the slab_mutex assures nobody will touch the memcg_caches
3288         * array while we are modifying it.
3289         */
3290        lockdep_assert_held(&slab_mutex);
3291
3292        memcg = s->memcg_params->memcg;
3293        id  = memcg_cache_id(memcg);
3294
3295        root = s->memcg_params->root_cache;
3296        VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3297        root->memcg_params->memcg_caches[id] = NULL;
3298
3299        mutex_lock(&memcg->slab_caches_mutex);
3300        list_del(&s->memcg_params->list);
3301        mutex_unlock(&memcg->slab_caches_mutex);
3302
3303        mem_cgroup_put(memcg);
3304}
3305
3306/*
3307 * During the creation of a new cache, we need to disable our accounting
3308 * mechanism altogether. This is true even if we are not creating, but rather
3309 * just enqueueing new caches to be created.
3310 *
3311 * This is because that process will trigger allocations; some visible, like
3312 * explicit kmallocs to auxiliary data structures, name strings and internal
3313 * cache structures; some well concealed, like INIT_WORK() that can allocate
3314 * objects during debug.
3315 *
3316 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3317 * to it. This may not be a bounded recursion: since the first cache creation
3318 * failed to complete (waiting on the allocation), we'll just try to create the
3319 * cache again, failing at the same point.
3320 *
3321 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3322 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3323 * inside the following two functions.
3324 */
3325static inline void memcg_stop_kmem_account(void)
3326{
3327        VM_BUG_ON(!current->mm);
3328        current->memcg_kmem_skip_account++;
3329}
3330
3331static inline void memcg_resume_kmem_account(void)
3332{
3333        VM_BUG_ON(!current->mm);
3334        current->memcg_kmem_skip_account--;
3335}
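
/*
 * Illustrative sketch (not part of the original source): how a code path that
 * must not recurse into kmem accounting would bracket an allocation with the
 * helpers above.  The function name is hypothetical; only
 * memcg_stop_kmem_account()/memcg_resume_kmem_account() come from this file.
 * Assumes process context with a valid current->mm, as the VM_BUG_ON()s above
 * require.
 */
static inline void *example_unaccounted_kmalloc(size_t size, gfp_t gfp)
{
        void *ptr;

        memcg_stop_kmem_account();      /* allocations below skip memcg kmem accounting */
        ptr = kmalloc(size, gfp);
        memcg_resume_kmem_account();    /* restore normal accounting */

        return ptr;
}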
3336
3337static void kmem_cache_destroy_work_func(struct work_struct *w)
3338{
3339        struct kmem_cache *cachep;
3340        struct memcg_cache_params *p;
3341
3342        p = container_of(w, struct memcg_cache_params, destroy);
3343
3344        cachep = memcg_params_to_cache(p);
3345
3346        /*
3347         * If we get down to 0 after shrink, we could delete right away.
3348         * However, memcg_release_pages() already puts us back in the workqueue
3349         * in that case. If we proceed deleting, we'll get a dangling
3350         * reference, and removing the object from the workqueue in that case
3351         * is unnecessary complication. We are not a fast path.
3352         *
3353         * Note that this case is fundamentally different from racing with
3354         * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3355         * kmem_cache_shrink, not only would we be reinserting a dead cache
3356         * into the queue, but doing so from inside the worker racing to
3357         * destroy it.
3358         *
3359         * So if we aren't down to zero, we'll just schedule a worker and try
3360         * again.
3361         */
3362        if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3363                kmem_cache_shrink(cachep);
3364                if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3365                        return;
3366        } else
3367                kmem_cache_destroy(cachep);
3368}
3369
3370void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3371{
3372        if (!cachep->memcg_params->dead)
3373                return;
3374
3375        /*
3376         * There are many ways in which we can get here.
3377         *
3378         * We can get to a memory-pressure situation while the delayed work is
3379         * still pending to run. The vmscan shrinkers can then release all
3380         * cache memory and get us to destruction. If this is the case, we'll
3381         * be executed twice, which is a bug (the second time will execute over
3382         * bogus data). In this case, cancelling the work should be fine.
3383         *
3384         * But we can also get here from the worker itself, if
3385         * kmem_cache_shrink is enough to shake all the remaining objects and
3386         * get the page count to 0. In this case, we'll deadlock if we try to
3387         * cancel the work (the worker runs with an internal lock held, which
3388         * is the same lock we would hold for cancel_work_sync().)
3389         *
3390         * Since we can't possibly know who got us here, just refrain from
3391         * running if there is already work pending.
3392         */
3393        if (work_pending(&cachep->memcg_params->destroy))
3394                return;
3395        /*
3396         * We have to defer the actual destroying to a workqueue, because
3397         * we might currently be in a context that cannot sleep.
3398         */
3399        schedule_work(&cachep->memcg_params->destroy);
3400}
3401
3402/*
3403 * This lock protects updaters, not readers. We want readers to be as fast as
3404 * they can, and they will either see NULL or a valid cache value. Our model
3405 * allows them to see NULL, in which case the root memcg will be selected.
3406 *
3407 * We need this lock because multiple allocations to the same cache may be in
3408 * flight and span more than one worker. Only one of them can create the cache.
3409 */
3410static DEFINE_MUTEX(memcg_cache_mutex);
3411
3412/*
3413 * Called with memcg_cache_mutex held
3414 */
3415static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3416                                         struct kmem_cache *s)
3417{
3418        struct kmem_cache *new;
3419        static char *tmp_name = NULL;
3420
3421        lockdep_assert_held(&memcg_cache_mutex);
3422
3423        /*
3424         * kmem_cache_create_memcg duplicates the given name, and
3425         * cgroup_name(), which is needed for that name, requires RCU context.
3426         * This static temporary buffer is used to avoid a pointless
3427         * short-lived allocation.
3428         */
3429        if (!tmp_name) {
3430                tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3431                if (!tmp_name)
3432                        return NULL;
3433        }
3434
3435        rcu_read_lock();
3436        snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3437                         memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3438        rcu_read_unlock();
3439
3440        new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3441                                      (s->flags & ~SLAB_PANIC), s->ctor, s);
3442
3443        if (new)
3444                new->allocflags |= __GFP_KMEMCG;
3445
3446        return new;
3447}
3448
3449static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3450                                                  struct kmem_cache *cachep)
3451{
3452        struct kmem_cache *new_cachep;
3453
3454        BUG_ON(!memcg_can_account_kmem(memcg));
3455
3456        mutex_lock(&memcg_cache_mutex);
3457
3458        new_cachep = kmem_cache_dup(memcg, cachep);
3459        if (new_cachep == NULL) {
3460                new_cachep = cachep;
3461                goto out;
3462        }
3463
3464        mem_cgroup_get(memcg);
3465out:
3466        mutex_unlock(&memcg_cache_mutex);
3467        return new_cachep;
3468}
3469
3470static DEFINE_MUTEX(memcg_limit_mutex);
3471
3472int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3473{
3474        struct kmem_cache *c;
3475        int i, failed = 0;
3476
3477        /*
3478         * If the cache is being destroyed, we trust that there is no one else
3479         * requesting objects from it. Even if there are, the sanity checks in
3480         * kmem_cache_destroy should catch this ill case.
3481         *
3482         * Still, we don't want anyone else freeing memcg_caches under our
3483         * noses, which can happen if a new memcg comes to life. As usual,
3484         * we'll take the memcg_limit_mutex to protect ourselves against this.
3485         */
3486        mutex_lock(&memcg_limit_mutex);
3487        for (i = 0; i < memcg_limited_groups_array_size; i++) {
3488                c = s->memcg_params->memcg_caches[i];
3489                if (!c)
3490                        continue;
3491
3492                /*
3493                 * We will now manually delete the caches, so to avoid races
3494                 * we need to cancel all pending destruction workers and
3495                 * proceed with destruction ourselves.
3496                 *
3497                 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3498                 * and that could spawn the workers again: it is likely that
3499                 * the cache still has active pages until this very moment.
3500                 * This would lead us back to mem_cgroup_destroy_cache.
3501                 *
3502                 * But that will not execute at all if the "dead" flag is not
3503                 * set, so flip it down to guarantee we are in control.
3504                 */
3505                c->memcg_params->dead = false;
3506                cancel_work_sync(&c->memcg_params->destroy);
3507                kmem_cache_destroy(c);
3508
3509                if (cache_from_memcg(s, i))
3510                        failed++;
3511        }
3512        mutex_unlock(&memcg_limit_mutex);
3513        return failed;
3514}
3515
3516struct create_work {
3517        struct mem_cgroup *memcg;
3518        struct kmem_cache *cachep;
3519        struct work_struct work;
3520};
3521
3522static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3523{
3524        struct kmem_cache *cachep;
3525        struct memcg_cache_params *params;
3526
3527        if (!memcg_kmem_is_active(memcg))
3528                return;
3529
3530        mutex_lock(&memcg->slab_caches_mutex);
3531        list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3532                cachep = memcg_params_to_cache(params);
3533                cachep->memcg_params->dead = true;
3534                schedule_work(&cachep->memcg_params->destroy);
3535        }
3536        mutex_unlock(&memcg->slab_caches_mutex);
3537}
3538
3539static void memcg_create_cache_work_func(struct work_struct *w)
3540{
3541        struct create_work *cw;
3542
3543        cw = container_of(w, struct create_work, work);
3544        memcg_create_kmem_cache(cw->memcg, cw->cachep);
3545        /* Drop the reference taken when we enqueued. */
3546        css_put(&cw->memcg->css);
3547        kfree(cw);
3548}
3549
3550/*
3551 * Enqueue the creation of a per-memcg kmem_cache.
3552 */
3553static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3554                                         struct kmem_cache *cachep)
3555{
3556        struct create_work *cw;
3557
3558        cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3559        if (cw == NULL) {
3560                css_put(&memcg->css);
3561                return;
3562        }
3563
3564        cw->memcg = memcg;
3565        cw->cachep = cachep;
3566
3567        INIT_WORK(&cw->work, memcg_create_cache_work_func);
3568        schedule_work(&cw->work);
3569}
3570
3571static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3572                                       struct kmem_cache *cachep)
3573{
3574        /*
3575         * We need to stop accounting when we kmalloc, because if the
3576         * corresponding kmalloc cache is not yet created, the first allocation
3577         * in __memcg_create_cache_enqueue will recurse.
3578         *
3579         * However, it is better to enclose the whole function. Depending on
3580         * the debugging options enabled, INIT_WORK(), for instance, can
3581         * trigger an allocation. This too, will make us recurse. Because at
3582         * this point we can't allow ourselves back into memcg_kmem_get_cache,
3583         * the safest choice is to do it like this, wrapping the whole function.
3584         */
3585        memcg_stop_kmem_account();
3586        __memcg_create_cache_enqueue(memcg, cachep);
3587        memcg_resume_kmem_account();
3588}
3589/*
3590 * Return the kmem_cache we're supposed to use for a slab allocation.
3591 * We try to use the current memcg's version of the cache.
3592 *
3593 * If the cache does not exist yet and we are the first user of it,
3594 * we either create it immediately, if possible, or create it asynchronously
3595 * in a workqueue.
3596 * In the latter case, we will let the current allocation go through with
3597 * the original cache.
3598 *
3599 * Can't be called in interrupt context or from kernel threads.
3600 * This function needs to be called with rcu_read_lock() held.
3601 */
3602struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3603                                          gfp_t gfp)
3604{
3605        struct mem_cgroup *memcg;
3606        int idx;
3607
3608        VM_BUG_ON(!cachep->memcg_params);
3609        VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3610
3611        if (!current->mm || current->memcg_kmem_skip_account)
3612                return cachep;
3613
3614        rcu_read_lock();
3615        memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3616
3617        if (!memcg_can_account_kmem(memcg))
3618                goto out;
3619
3620        idx = memcg_cache_id(memcg);
3621
3622        /*
3623         * barrier to make sure we're always seeing the up-to-date value.  The
3624         * code updating memcg_caches will issue a write barrier to match this.
3625         */
3626        read_barrier_depends();
3627        if (likely(cachep->memcg_params->memcg_caches[idx])) {
3628                cachep = cachep->memcg_params->memcg_caches[idx];
3629                goto out;
3630        }
3631
3632        /* The corresponding put will be done in the workqueue. */
3633        if (!css_tryget(&memcg->css))
3634                goto out;
3635        rcu_read_unlock();
3636
3637        /*
3638         * If we are in a safe context (can wait, and not in interrupt
3639         * context), we could be predictable and return right away.
3640         * This would guarantee that the allocation being performed
3641         * already belongs in the new cache.
3642         *
3643         * However, there are some clashes that can arrive from locking.
3644         * For instance, because we acquire the slab_mutex while doing
3645         * kmem_cache_dup, this means no further allocation could happen
3646         * with the slab_mutex held.
3647         *
3648         * Also, because cache creation issues get_online_cpus(), this
3649         * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3650         * which ends up reversed during cpu hotplug. (cpuset allocates
3651         * a bunch of GFP_KERNEL memory during cpuup). Due to all that, it is
3652         * better to defer everything.
3653         */
3654        memcg_create_cache_enqueue(memcg, cachep);
3655        return cachep;
3656out:
3657        rcu_read_unlock();
3658        return cachep;
3659}
3660EXPORT_SYMBOL(__memcg_kmem_get_cache);
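
/*
 * Illustrative sketch (not part of the original source): how an allocation
 * path is expected to use the lookup above.  The real kernel reaches
 * __memcg_kmem_get_cache() through a wrapper in the slab allocators; this
 * simplified, hypothetical caller assumes process context and a root cache
 * whose memcg_params have already been set up.
 */
static inline void *example_accounted_cache_alloc(struct kmem_cache *cachep,
                                                  gfp_t gfp)
{
        /* may return the original root cache or the current memcg's clone */
        cachep = __memcg_kmem_get_cache(cachep, gfp);

        return kmem_cache_alloc(cachep, gfp);
}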
3661
3662/*
3663 * We need to verify if the allocation against current->mm->owner's memcg is
3664 * possible for the given order. But the page is not allocated yet, so we'll
3665 * need a further commit step to do the final arrangements.
3666 *
3667 * It is possible for the task to switch cgroups in the meantime, so at
3668 * commit time, we can't rely on task conversion any longer.  We'll then use
3669 * the handle argument to return to the caller which cgroup we should commit
3670 * against. We could also return the memcg directly and avoid the pointer
3671 * passing, but a boolean return value gives better semantics considering
3672 * the compiled-out case as well.
3673 *
3674 * Returning true means the allocation is possible.
3675 */
3676bool
3677__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3678{
3679        struct mem_cgroup *memcg;
3680        int ret;
3681
3682        *_memcg = NULL;
3683        memcg = try_get_mem_cgroup_from_mm(current->mm);
3684
3685        /*
3686         * Very rare case described in mem_cgroup_from_task. Unfortunately there
3687         * isn't much we can do without complicating this too much, and it would
3688         * be gfp-dependent anyway. Just let it go.
3689         */
3690        if (unlikely(!memcg))
3691                return true;
3692
3693        if (!memcg_can_account_kmem(memcg)) {
3694                css_put(&memcg->css);
3695                return true;
3696        }
3697
3698        ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3699        if (!ret)
3700                *_memcg = memcg;
3701
3702        css_put(&memcg->css);
3703        return (ret == 0);
3704}
3705
3706void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3707                              int order)
3708{
3709        struct page_cgroup *pc;
3710
3711        VM_BUG_ON(mem_cgroup_is_root(memcg));
3712
3713        /* The page allocation failed. Revert */
3714        if (!page) {
3715                memcg_uncharge_kmem(memcg, 1 << order);
3716                return;
3717        }
3718
3719        pc = lookup_page_cgroup(page);
3720        lock_page_cgroup(pc);
3721        pc->mem_cgroup = memcg;
3722        SetPageCgroupUsed(pc);
3723        unlock_page_cgroup(pc);
3724}
3725
3726void __memcg_kmem_uncharge_pages(struct page *page, int order)
3727{
3728        struct mem_cgroup *memcg = NULL;
3729        struct page_cgroup *pc;
3730
3731
3732        pc = lookup_page_cgroup(page);
3733        /*
3734         * Fast unlocked return. Theoretically might have changed, have to
3735         * check again after locking.
3736         */
3737        if (!PageCgroupUsed(pc))
3738                return;
3739
3740        lock_page_cgroup(pc);
3741        if (PageCgroupUsed(pc)) {
3742                memcg = pc->mem_cgroup;
3743                ClearPageCgroupUsed(pc);
3744        }
3745        unlock_page_cgroup(pc);
3746
3747        /*
3748         * We trust that the allocation is valid only if there is a memcg
3749         * associated with the page.
3750         */
3751        if (!memcg)
3752                return;
3753
3754        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3755        memcg_uncharge_kmem(memcg, 1 << order);
3756}
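
/*
 * Illustrative sketch (not part of the original source): the three-call charge
 * protocol above as a page allocator would use it for kmem-accounted
 * allocations: reserve the charge before allocating, commit it to the chosen
 * memcg afterwards, and undo it on the free path.  The example_* function
 * names are hypothetical; only the __memcg_kmem_* calls come from this file.
 */
static inline struct page *example_kmem_alloc_pages(gfp_t gfp, int order)
{
        struct mem_cgroup *memcg;
        struct page *page;

        /* reserve the charge against the current task's memcg (or bypass) */
        if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
                return NULL;

        page = alloc_pages(gfp, order);
        /* bind the page to the memcg, or revert the charge if allocation failed */
        if (memcg)
                __memcg_kmem_commit_charge(page, memcg, order);

        return page;
}

static inline void example_kmem_free_pages(struct page *page, int order)
{
        __memcg_kmem_uncharge_pages(page, order);       /* no-op if the page was never charged */
        __free_pages(page, order);
}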
3757#else
3758static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3759{
3760}
3761#endif /* CONFIG_MEMCG_KMEM */
3762
3763#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3764
3765#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3766/*
3767 * Because tail pages are not marked as "used", set the flag here. We're under
3768 * zone->lru_lock, 'splitting on pmd' and compound_lock.
3769 * Charge/uncharge will never happen, and move_account() is done under
3770 * compound_lock(), so we don't have to take care of races.
3771 */
3772void mem_cgroup_split_huge_fixup(struct page *head)
3773{
3774        struct page_cgroup *head_pc = lookup_page_cgroup(head);
3775        struct page_cgroup *pc;
3776        struct mem_cgroup *memcg;
3777        int i;
3778
3779        if (mem_cgroup_disabled())
3780                return;
3781
3782        memcg = head_pc->mem_cgroup;
3783        for (i = 1; i < HPAGE_PMD_NR; i++) {
3784                pc = head_pc + i;
3785                pc->mem_cgroup = memcg;
3786                smp_wmb();/* see __commit_charge() */
3787                pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3788        }
3789        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3790                       HPAGE_PMD_NR);
3791}
3792#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3793
3794/**
3795 * mem_cgroup_move_account - move account of the page
3796 * @page: the page
3797 * @nr_pages: number of regular pages (>1 for huge pages)
3798 * @pc: page_cgroup of the page.
3799 * @from: mem_cgroup which the page is moved from.
3800 * @to: mem_cgroup which the page is moved to. @from != @to.
3801 *
3802 * The caller must confirm the following.
3803 * - page is not on LRU (isolate_page() is useful.)
3804 * - compound_lock is held when nr_pages > 1
3805 *
3806 * This function doesn't "charge" the new cgroup and doesn't "uncharge"
3807 * the old cgroup.
3808 */
3809static int mem_cgroup_move_account(struct page *page,
3810                                   unsigned int nr_pages,
3811                                   struct page_cgroup *pc,
3812                                   struct mem_cgroup *from,
3813                                   struct mem_cgroup *to)
3814{
3815        unsigned long flags;
3816        int ret;
3817        bool anon = PageAnon(page);
3818
3819        VM_BUG_ON(from == to);
3820        VM_BUG_ON_PAGE(PageLRU(page), page);
3821        /*
3822         * The page is isolated from LRU. So, collapse function
3823         * will not handle this page. But page splitting can happen.
3824         * Do this check under compound_lock(). The caller should
3825         * hold it.
3826         */
3827        ret = -EBUSY;
3828        if (nr_pages > 1 && !PageTransHuge(page))
3829                goto out;
3830
3831        lock_page_cgroup(pc);
3832
3833        ret = -EINVAL;
3834        if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3835                goto unlock;
3836
3837        move_lock_mem_cgroup(from, &flags);
3838
3839        if (!anon && page_mapped(page)) {
3840                /* Update mapped_file data for mem_cgroup */
3841                preempt_disable();
3842                __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3843                __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3844                preempt_enable();
3845        }
3846        mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3847
3848        /* caller should have done css_get */
3849        pc->mem_cgroup = to;
3850        mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3851        move_unlock_mem_cgroup(from, &flags);
3852        ret = 0;
3853unlock:
3854        unlock_page_cgroup(pc);
3855        /*
3856         * check events
3857         */
3858        memcg_check_events(to, page);
3859        memcg_check_events(from, page);
3860out:
3861        return ret;
3862}
3863
3864/**
3865 * mem_cgroup_move_parent - moves page to the parent group
3866 * @page: the page to move
3867 * @pc: page_cgroup of the page
3868 * @child: page's cgroup
3869 *
3870 * move charges to its parent or the root cgroup if the group has no
3871 * parent (aka use_hierarchy==0).
3872 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3873 * mem_cgroup_move_account fails) the failure is always temporary and
3874 * it signals a race with a page removal/uncharge or migration. In the
3875 * first case the page is on the way out and it will vanish from the LRU
3876 * on the next attempt and the call should be retried later.
3877 * Isolation from the LRU fails only if page has been isolated from
3878 * the LRU since we looked at it and that usually means either global
3879 * reclaim or migration going on. The page will either get back to the
3880 * LRU or vanish.
3881 * Finally, mem_cgroup_move_account fails only if the page got uncharged
3882 * (!PageCgroupUsed) or moved to a different group. The page will
3883 * disappear in the next attempt.
3884 */
3885static int mem_cgroup_move_parent(struct page *page,
3886                                  struct page_cgroup *pc,
3887                                  struct mem_cgroup *child)
3888{
3889        struct mem_cgroup *parent;
3890        unsigned int nr_pages;
3891        unsigned long uninitialized_var(flags);
3892        int ret;
3893
3894        VM_BUG_ON(mem_cgroup_is_root(child));
3895
3896        ret = -EBUSY;
3897        if (!get_page_unless_zero(page))
3898                goto out;
3899        if (isolate_lru_page(page))
3900                goto put;
3901
3902        nr_pages = hpage_nr_pages(page);
3903
3904        parent = parent_mem_cgroup(child);
3905        /*
3906         * If no parent, move charges to root cgroup.
3907         */
3908        if (!parent)
3909                parent = root_mem_cgroup;
3910
3911        if (nr_pages > 1) {
3912                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3913                flags = compound_lock_irqsave(page);
3914        }
3915
3916        ret = mem_cgroup_move_account(page, nr_pages,
3917                                pc, child, parent);
3918        if (!ret) {
3919                /* Take charge off the local counters */
3920                page_counter_cancel(&child->memory, nr_pages);
3921                if (do_swap_account)
3922                        page_counter_cancel(&child->memsw, nr_pages);
3923        }
3924
3925        if (nr_pages > 1)
3926                compound_unlock_irqrestore(page, flags);
3927        putback_lru_page(page);
3928put:
3929        put_page(page);
3930out:
3931        return ret;
3932}
3933
3934/*
3935 * Charge the memory controller for page usage.
3936 * Return
3937 * 0 if the charge was successful
3938 * < 0 if the cgroup is over its limit
3939 */
3940static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3941                                gfp_t gfp_mask, enum charge_type ctype)
3942{
3943        struct mem_cgroup *memcg = NULL;
3944        unsigned int nr_pages = 1;
3945        bool oom = true;
3946        int ret;
3947
3948        if (PageTransHuge(page)) {
3949                nr_pages <<= compound_order(page);
3950                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3951                /*
3952                 * Never OOM-kill a process for a huge page.  The
3953                 * fault handler will fall back to regular pages.
3954                 */
3955                oom = false;
3956        }
3957
3958        ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3959        if (ret == -ENOMEM)
3960                return ret;
3961        __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3962        return 0;
3963}
3964
3965int mem_cgroup_newpage_charge(struct page *page,
3966                              struct mm_struct *mm, gfp_t gfp_mask)
3967{
3968        if (mem_cgroup_disabled())
3969                return 0;
3970        VM_BUG_ON_PAGE(page_mapped(page), page);
3971        VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3972        VM_BUG_ON(!mm);
3973        return mem_cgroup_charge_common(page, mm, gfp_mask,
3974                                        MEM_CGROUP_CHARGE_TYPE_ANON);
3975}
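
/*
 * Illustrative sketch (not part of the original source): the usual pairing for
 * this API on the anonymous-fault path -- charge a new, not yet mapped page
 * first and give the charge back if installing the mapping fails.  The
 * function and the "mapping_succeeded" parameter are hypothetical.
 */
static inline int example_charge_new_anon_page(struct page *page,
                                               struct mm_struct *mm,
                                               bool mapping_succeeded)
{
        if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
                return -ENOMEM;

        /* ... the caller would now map the page into the page tables ... */

        if (!mapping_succeeded) {
                /* the page never became visible; drop the charge again */
                mem_cgroup_uncharge_page(page);
                return -EFAULT;
        }

        return 0;
}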
3976
3977/*
3978 * While swap-in, try_charge -> commit or cancel, the page is locked.
3979 * And when try_charge() successfully returns, one refcnt to memcg without
3980 * struct page_cgroup is acquired. This refcnt will be consumed by
3981 * "commit()" or removed by "cancel()"
3982 */
3983static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3984                                          struct page *page,
3985                                          gfp_t mask,
3986                                          struct mem_cgroup **memcgp)
3987{
3988        struct mem_cgroup *memcg;
3989        struct page_cgroup *pc;
3990        int ret;
3991
3992        pc = lookup_page_cgroup(page);
3993        /*
3994         * Every swap fault against a single page tries to charge the
3995         * page, bail as early as possible.  shmem_unuse() encounters
3996         * already charged pages, too.  The USED bit is protected by
3997         * the page lock, which serializes swap cache removal, which
3998         * in turn serializes uncharging.
3999         */
4000        if (PageCgroupUsed(pc))
4001                return 0;
4002        if (!do_swap_account)
4003                goto charge_cur_mm;
4004        memcg = try_get_mem_cgroup_from_page(page);
4005        if (!memcg)
4006                goto charge_cur_mm;
4007        *memcgp = memcg;
4008        ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
4009        css_put(&memcg->css);
4010        if (ret == -EINTR)
4011                ret = 0;
4012        return ret;
4013charge_cur_mm:
4014        ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
4015        if (ret == -EINTR)
4016                ret = 0;
4017        return ret;
4018}
4019
4020int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
4021                                 gfp_t gfp_mask, struct mem_cgroup **memcgp)
4022{
4023        *memcgp = NULL;
4024        if (mem_cgroup_disabled())
4025                return 0;
4026        /*
4027         * A racing thread's fault, or swapoff, may have already
4028         * updated the pte, and even removed page from swap cache: in
4029         * those cases unuse_pte()'s pte_same() test will fail; but
4030         * there's also a KSM case which does need to charge the page.
4031         */
4032        if (!PageSwapCache(page)) {
4033                int ret;
4034
4035                ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
4036                if (ret == -EINTR)
4037                        ret = 0;
4038                return ret;
4039        }
4040        return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
4041}
4042
4043void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
4044{
4045        if (mem_cgroup_disabled())
4046                return;
4047        if (!memcg)
4048                return;
4049        __mem_cgroup_cancel_charge(memcg, 1);
4050}
4051
4052static void
4053__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
4054                                        enum charge_type ctype)
4055{
4056        if (mem_cgroup_disabled())
4057                return;
4058        if (!memcg)
4059                return;
4060
4061        __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
4062        /*
4063         * Now the swapped-in content is in memory. This means this page may be
4064         * counted both as mem and swap -- a double count.
4065         * Fix it by uncharging from memsw. Basically, this SwapCache is stable
4066         * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
4067         * may call delete_from_swap_cache() before we reach here.
4068         */
4069        if (do_swap_account && PageSwapCache(page)) {
4070                swp_entry_t ent = {.val = page_private(page)};
4071                mem_cgroup_uncharge_swap(ent);
4072        }
4073}
4074
4075void mem_cgroup_commit_charge_swapin(struct page *page,
4076                                     struct mem_cgroup *memcg)
4077{
4078        __mem_cgroup_commit_charge_swapin(page, memcg,
4079                                          MEM_CGROUP_CHARGE_TYPE_ANON);
4080}
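
/*
 * Illustrative sketch (not part of the original source): the three-step
 * swap-in protocol described above, as a fault handler would use it -- try,
 * then either commit once the page is mapped or cancel if the fault turned
 * out to be stale.  The function and the "still_valid" revalidation flag are
 * hypothetical stand-ins for the pte_same() style checks a real caller does.
 */
static inline int example_swapin_charge(struct mm_struct *mm, struct page *page,
                                        bool still_valid)
{
        struct mem_cgroup *memcg;

        /* 1) reserve the charge; the page is locked by the caller */
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg))
                return -ENOMEM;

        if (!still_valid) {
                /* 2a) the fault raced with something else; give the charge back */
                mem_cgroup_cancel_charge_swapin(memcg);
                return -EAGAIN;
        }

        /* ... the caller would now map the page into the page tables ... */

        /* 2b) commit the charge to the now-mapped page */
        mem_cgroup_commit_charge_swapin(page, memcg);
        return 0;
}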
4081
4082int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4083                                gfp_t gfp_mask)
4084{
4085        struct mem_cgroup *memcg = NULL;
4086        enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4087        int ret;
4088
4089        if (mem_cgroup_disabled())
4090                return 0;
4091        if (PageCompound(page))
4092                return 0;
4093
4094        if (!PageSwapCache(page))
4095                ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4096        else { /* page is swapcache/shmem */
4097                ret = __mem_cgroup_try_charge_swapin(mm, page,
4098                                                     gfp_mask, &memcg);
4099                if (!ret)
4100                        __mem_cgroup_commit_charge_swapin(page, memcg, type);
4101        }
4102        return ret;
4103}
4104
4105static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4106                                   unsigned int nr_pages,
4107                                   const enum charge_type ctype)
4108{
4109        struct memcg_batch_info *batch = NULL;
4110        bool uncharge_memsw = true;
4111
4112        /* If swapout, usage of swap doesn't decrease */
4113        if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4114                uncharge_memsw = false;
4115
4116        batch = &current->memcg_batch;
4117        /*
4118         * Usually, we do css_get() when we remember a memcg pointer.
4119         * But in this case, we keep the charged usage until the end of a
4120         * series of uncharges, so it's ok to ignore the memcg's refcnt.
4121         */
4122        if (!batch->memcg)
4123                batch->memcg = memcg;
4124        /*
4125         * do_batch > 0 when unmapping pages or doing inode invalidate/truncate.
4126         * In those cases, all pages freed continuously can be expected to be in
4127         * the same cgroup and we have a chance to coalesce uncharges.
4128         * But we uncharge one by one if the task is being OOM-killed (TIF_MEMDIE),
4129         * because we want to do the uncharge as soon as possible.
4130         */
4131
4132        if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4133                goto direct_uncharge;
4134
4135        if (nr_pages > 1)
4136                goto direct_uncharge;
4137
4138        /*
4139         * In the typical case, batch->memcg == memcg. This means we can
4140         * merge a series of uncharges into one page_counter uncharge.
4141         * If not, we uncharge the page_counter one by one.
4142         */
4143        if (batch->memcg != memcg)
4144                goto direct_uncharge;
4145        /* remember freed charge and uncharge it later */
4146        batch->nr_pages++;
4147        if (uncharge_memsw)
4148                batch->memsw_nr_pages++;
4149        return;
4150direct_uncharge:
4151        page_counter_uncharge(&memcg->memory, nr_pages);
4152        if (uncharge_memsw)
4153                page_counter_uncharge(&memcg->memsw, nr_pages);
4154        if (unlikely(batch->memcg != memcg))
4155                memcg_oom_recover(memcg);
4156}
4157
4158/*
4159 * uncharge if !page_mapped(page)
4160 */
4161static struct mem_cgroup *
4162__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4163                             bool end_migration)
4164{
4165        struct mem_cgroup *memcg = NULL;
4166        unsigned int nr_pages = 1;
4167        struct page_cgroup *pc;
4168        bool anon;
4169
4170        if (mem_cgroup_disabled())
4171                return NULL;
4172
4173        if (PageTransHuge(page)) {
4174                nr_pages <<= compound_order(page);
4175                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4176        }
4177        /*
4178         * Check if our page_cgroup is valid
4179         */
4180        pc = lookup_page_cgroup(page);
4181        if (unlikely(!PageCgroupUsed(pc)))
4182                return NULL;
4183
4184        lock_page_cgroup(pc);
4185
4186        memcg = pc->mem_cgroup;
4187
4188        if (!PageCgroupUsed(pc))
4189                goto unlock_out;
4190
4191        anon = PageAnon(page);
4192
4193        switch (ctype) {
4194        case MEM_CGROUP_CHARGE_TYPE_ANON:
4195                /*
4196                 * Generally PageAnon tells if it's the anon statistics to be
4197                 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
4198                 * used before the page reached the stage of being marked PageAnon.
4199                 */
4200                anon = true;
4201                /* fallthrough */
4202        case MEM_CGROUP_CHARGE_TYPE_DROP:
4203                /* See mem_cgroup_prepare_migration() */
4204                if (page_mapped(page))
4205                        goto unlock_out;
4206                /*
4207                 * Pages under migration may not be uncharged.  But
4208                 * end_migration() /must/ be the one uncharging the
4209                 * unused post-migration page and so it has to call
4210                 * here with the migration bit still set.  See the
4211                 * page_counter handling below.
4212                 */
4213                if (!end_migration && PageCgroupMigration(pc))
4214                        goto unlock_out;
4215                break;
4216        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4217                if (!PageAnon(page)) {  /* Shared memory */
4218                        if (page->mapping && !page_is_file_cache(page))
4219                                goto unlock_out;
4220                } else if (page_mapped(page)) /* Anon */
4221                                goto unlock_out;
4222                break;
4223        default:
4224                break;
4225        }
4226
4227        mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4228
4229        ClearPageCgroupUsed(pc);
4230        /*
4231         * pc->mem_cgroup is not cleared here. It will be accessed when the page
4232         * is freed from the LRU. This is safe because an uncharged page is not
4233         * expected to be reused (it is freed soon). The exception is SwapCache,
4234         * which is handled by special functions.
4235         */
4236
4237        unlock_page_cgroup(pc);
4238        /*
4239         * Even after unlocking, we still hold usage on memcg->memory here, so
4240         * this memcg will never be freed.
4241         */
4242        memcg_check_events(memcg, page);
4243        if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4244                mem_cgroup_swap_statistics(memcg, true);
4245                mem_cgroup_get(memcg);
4246        }
4247        /*
4248         * Migration does not charge the page_counter for the
4249         * replacement page, so leave it alone when phasing out the
4250         * page that is unused after the migration.
4251         */
4252        if (!end_migration && !mem_cgroup_is_root(memcg))
4253                mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4254
4255        return memcg;
4256
4257unlock_out:
4258        unlock_page_cgroup(pc);
4259        return NULL;
4260}
4261
4262void mem_cgroup_uncharge_page(struct page *page)
4263{
4264        /* early check. */
4265        if (page_mapped(page))
4266                return;
4267        VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4268        /*
4269         * If the page is in swap cache, uncharge should be deferred
4270         * to the swap path, which also properly accounts swap usage
4271         * and handles memcg lifetime.
4272         *
4273         * Note that this check is not stable and reclaim may add the
4274         * page to swap cache at any time after this.  However, if the
4275         * page is not in swap cache by the time page->mapcount hits
4276         * 0, there won't be any page table references to the swap
4277         * slot, and reclaim will free it and not actually write the
4278         * page to disk.
4279         */
4280        if (PageSwapCache(page))
4281                return;
4282        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4283}
4284
4285void mem_cgroup_uncharge_cache_page(struct page *page)
4286{
4287        VM_BUG_ON_PAGE(page_mapped(page), page);
4288        VM_BUG_ON_PAGE(page->mapping, page);
4289        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4290}
4291
4292/*
4293 * Batch_start/batch_end are called from unmap_page_range/invalidate/truncate.
4294 * In those cases, pages are freed continuously and we can expect them to be
4295 * in the same memcg. Those callers themselves limit the number of pages
4296 * freed at once, so uncharge_start/end() are called properly.
4297 * This may be called several (i.e. nested) times in one context.
4298 */
4299
4300void mem_cgroup_uncharge_start(void)
4301{
4302        current->memcg_batch.do_batch++;
4303        /* We can do nest. */
4304        if (current->memcg_batch.do_batch == 1) {
4305                current->memcg_batch.memcg = NULL;
4306                current->memcg_batch.nr_pages = 0;
4307                current->memcg_batch.memsw_nr_pages = 0;
4308        }
4309}
4310
4311void mem_cgroup_uncharge_end(void)
4312{
4313        struct memcg_batch_info *batch = &current->memcg_batch;
4314
4315        if (!batch->do_batch)
4316                return;
4317
4318        batch->do_batch--;
4319        if (batch->do_batch) /* If stacked, do nothing. */
4320                return;
4321
4322        if (!batch->memcg)
4323                return;
4324        /*
4325         * This "batch->memcg" is valid without any css_get/put etc...
4326         * because we hide charges behind us.
4327         */
4328        if (batch->nr_pages)
4329                page_counter_uncharge(&batch->memcg->memory, batch->nr_pages);
4330        if (batch->memsw_nr_pages)
4331                page_counter_uncharge(&batch->memcg->memsw, batch->memsw_nr_pages);
4332        memcg_oom_recover(batch->memcg);
4333        /* forget this pointer (for sanity check) */
4334        batch->memcg = NULL;
4335}
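
/*
 * Illustrative sketch (not part of the original source): how a bulk free path
 * brackets its per-page uncharges so that the page_counter is touched only
 * once at the end.  The function is hypothetical and assumes the pages are
 * already unmapped; only the mem_cgroup_uncharge_* calls come from this file.
 */
static inline void example_uncharge_many(struct page **pages, int nr)
{
        int i;

        mem_cgroup_uncharge_start();
        for (i = 0; i < nr; i++)
                mem_cgroup_uncharge_page(pages[i]);     /* merged into the batch */
        mem_cgroup_uncharge_end();      /* flush the batched page_counter uncharges */
}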
4336
4337#ifdef CONFIG_SWAP
4338/*
4339 * Called after __delete_from_swap_cache() to drop the "page" account.
4340 * The memcg information is recorded in the swap_cgroup of "ent".
4341 */
4342void
4343mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4344{
4345        struct mem_cgroup *memcg;
4346        int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4347
4348        if (!swapout) /* this was a swap cache but the swap is unused ! */
4349                ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4350
4351        memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4352
4353        /*
4354         * Record memcg information. If swapout && memcg != NULL,
4355         * mem_cgroup_get() was called in uncharge().
4356         */
4357        if (do_swap_account && swapout && memcg)
4358                swap_cgroup_record(ent, mem_cgroup_id(memcg));
4359}
4360#endif
4361
4362#ifdef CONFIG_MEMCG_SWAP
4363/*
4364 * Called from swap_entry_free(). Removes the record in swap_cgroup and
4365 * uncharges the "memsw" account.
4366 */
4367void mem_cgroup_uncharge_swap(swp_entry_t ent)
4368{
4369        struct mem_cgroup *memcg;
4370        unsigned short id;
4371
4372        if (!do_swap_account)
4373                return;
4374
4375        id = swap_cgroup_record(ent, 0);
4376        rcu_read_lock();
4377        memcg = mem_cgroup_lookup(id);
4378        if (memcg) {
4379                /*
4380                 * We uncharge this because the swap entry is freed.
4381                 * This memcg can be an obsolete one. We avoid calling css_tryget().
4382                 */
4383                if (!mem_cgroup_is_root(memcg))
4384                        page_counter_uncharge(&memcg->memsw, 1);
4385                mem_cgroup_swap_statistics(memcg, false);
4386                mem_cgroup_put(memcg);
4387        }
4388        rcu_read_unlock();
4389}
4390
4391/**
4392 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
4393 * @entry: swap entry to be moved
4394 * @from:  mem_cgroup which the entry is moved from
4395 * @to:  mem_cgroup which the entry is moved to
4396 *
4397 * It succeeds only when the swap_cgroup's record for this entry is the same
4398 * as the mem_cgroup's id of @from.
4399 *
4400 * Returns 0 on success, -EINVAL on failure.
4401 *
4402 * The caller must have charged to @to, IOW, called page_counter_charge() for
4403 * both memory and memsw, and called css_get().
4404 */
4405static int mem_cgroup_move_swap_account(swp_entry_t entry,
4406                                struct mem_cgroup *from, struct mem_cgroup *to)
4407{
4408        unsigned short old_id, new_id;
4409
4410        old_id = mem_cgroup_id(from);
4411        new_id = mem_cgroup_id(to);
4412
4413        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4414                mem_cgroup_swap_statistics(from, false);
4415                mem_cgroup_swap_statistics(to, true);
4416                /*
4417                 * This function is only called from task migration context now.
4418                 * It postpones page_counter and refcount handling till the end
4419                 * of task migration(mem_cgroup_clear_mc()) for performance
4420                 * of task migration (mem_cgroup_clear_mc()) for performance
4421                 * because if the process that has been moved to @to does
4422                 * swap-in, the refcount of @to might be decreased to 0.
4423                 */
4424                mem_cgroup_get(to);
4425                return 0;
4426        }
4427        return -EINVAL;
4428}
4429#else
4430static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4431                                struct mem_cgroup *from, struct mem_cgroup *to)
4432{
4433        return -EINVAL;
4434}
4435#endif
4436
4437/*
4438 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
4439 * page belongs to.
4440 */
4441void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4442                                  struct mem_cgroup **memcgp)
4443{
4444        struct mem_cgroup *memcg = NULL;
4445        unsigned int nr_pages = 1;
4446        struct page_cgroup *pc;
4447        enum charge_type ctype;
4448
4449        *memcgp = NULL;
4450
4451        if (mem_cgroup_disabled())
4452                return;
4453
4454        if (PageTransHuge(page))
4455                nr_pages <<= compound_order(page);
4456
4457        pc = lookup_page_cgroup(page);
4458        lock_page_cgroup(pc);
4459        if (PageCgroupUsed(pc)) {
4460                memcg = pc->mem_cgroup;
4461                css_get(&memcg->css);
4462                /*
4463                 * When migrating an anonymous page, its mapcount goes down
4464                 * to 0 and uncharge() will be called. But, even if it's fully
4465                 * unmapped, migration may fail and this page has to be
4466                 * charged again. We set the MIGRATION flag here and delay the
4467                 * uncharge until end_migration() is called.
4468                 *
4469                 * Corner cases:
4470                 * A)
4471                 * The old page was mapped as Anon and was unmapped and freed
4472                 * while migration was ongoing.
4473                 * If unmap finds the old page, its uncharge() will be delayed
4474                 * until end_migration(). If unmap finds the new page, it is
4475                 * uncharged when unmap drops its mapcount from 1 to 0. If the
4476                 * unmap code finds a swap migration entry, the new page will not
4477                 * be mapped and end_migration() will find it (mapcount == 0).
4478                 *
4479                 * B)
4480                 * The old page was mapped but migration failed, so the kernel
4481                 * remaps it. The charge for it is kept by the MIGRATION flag even
4482                 * if its mapcount goes down to 0, so the remap succeeds
4483                 * without charging it again.
4484                 *
4485                 * C)
4486                 * The "old" page is under lock_page() until the end of
4487                 * migration, so the old page itself will not be swapped out.
4488                 * If the new page is swapped out before end_migration(), our
4489                 * hook into the usual swap-out path will catch the event.
4490                 */
4491                if (PageAnon(page))
4492                        SetPageCgroupMigration(pc);
4493        }
4494        unlock_page_cgroup(pc);
4495        /*
4496         * If the page is not charged at this point,
4497         * we return here.
4498         */
4499        if (!memcg)
4500                return;
4501
4502        *memcgp = memcg;
4503        /*
4504         * We charge the new page before it's used/mapped. So even if unlock_page()
4505         * is called before end_migration(), we can catch all events on this new
4506         * page. If the new page is migrated but not remapped, its mapcount
4507         * will end up 0 and we uncharge it in end_migration().
4508         */
4509        if (PageAnon(page))
4510                ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4511        else
4512                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4513        /*
4514         * The page is committed to the memcg, but it's not actually
4515         * charged to the page_counter since we plan on replacing the
4516         * old one and only one page is going to be left afterwards.
4517         */
4518        __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4519}
4520
4521/* remove redundant charge if migration failed */
4522void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4523        struct page *oldpage, struct page *newpage, bool migration_ok)
4524{
4525        struct page *used, *unused;
4526        struct page_cgroup *pc;
4527        bool anon;
4528
4529        if (!memcg)
4530                return;
4531
4532        if (!migration_ok) {
4533                used = oldpage;
4534                unused = newpage;
4535        } else {
4536                used = newpage;
4537                unused = oldpage;
4538        }
4539        anon = PageAnon(used);
4540        __mem_cgroup_uncharge_common(unused,
4541                                     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4542                                     : MEM_CGROUP_CHARGE_TYPE_CACHE,
4543                                     true);
4544        css_put(&memcg->css);
4545        /*
4546         * We disallowed uncharging pages under migration because the mapcount
4547         * of the page temporarily goes down to zero.
4548         * Clear the flag and check whether the page should still be charged.
4549         */
4550        pc = lookup_page_cgroup(oldpage);
4551        lock_page_cgroup(pc);
4552        ClearPageCgroupMigration(pc);
4553        unlock_page_cgroup(pc);
4554
4555        /*
4556         * If the page is file cache, the radix-tree replacement is atomic
4557         * and we can skip this check. If it was an Anon page, its mapcount
4558         * has gone down to 0, but because we added the MIGRATION flag it is
4559         * not uncharged yet. There are several cases, but the page->mapcount
4560         * check and the USED bit check in mem_cgroup_uncharge_page() are
4561         * sufficient. (see prepare_charge() also)
4562         */
4563        if (anon)
4564                mem_cgroup_uncharge_page(used);
4565}
4566
4567/*
4568 * When replacing page cache, the newpage is not under any memcg but it is on
4569 * the LRU. So this function doesn't touch the page_counter but handles the LRU
4570 * correctly. Both pages are locked, so we cannot race with uncharge.
4571 */
4572void mem_cgroup_replace_page_cache(struct page *oldpage,
4573                                  struct page *newpage)
4574{
4575        struct mem_cgroup *memcg = NULL;
4576        struct page_cgroup *pc;
4577        enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4578
4579        if (mem_cgroup_disabled())
4580                return;
4581
4582        pc = lookup_page_cgroup(oldpage);
4583        /* fix accounting on old pages */
4584        lock_page_cgroup(pc);
4585        if (PageCgroupUsed(pc)) {
4586                memcg = pc->mem_cgroup;
4587                mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4588                ClearPageCgroupUsed(pc);
4589        }
4590        unlock_page_cgroup(pc);
4591
4592        /*
4593         * When called from shmem_replace_page(), in some cases the
4594         * oldpage has already been charged, and in some cases not.
4595         */
4596        if (!memcg)
4597                return;
4598        /*
4599         * Even if newpage->mapping was NULL before starting the replacement,
4600         * the newpage may already be on the LRU (or on a pagevec headed for
4601         * the LRU). We lock the LRU while we overwrite pc->mem_cgroup.
4602         */
4603        __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4604}
4605
4606#ifdef CONFIG_DEBUG_VM
4607static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4608{
4609        struct page_cgroup *pc;
4610
4611        pc = lookup_page_cgroup(page);
4612        /*
4613         * Can be NULL while feeding pages into the page allocator for
4614         * the first time, i.e. during boot or memory hotplug;
4615         * or when mem_cgroup_disabled().
4616         */
4617        if (likely(pc) && PageCgroupUsed(pc))
4618                return pc;
4619        return NULL;
4620}
4621
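/*
 * CONFIG_DEBUG_VM helpers: report whether a page still carries a used
 * page_cgroup (typically a sign that it was freed while still charged to
 * a memcg) and dump that page_cgroup's state for debugging.
 */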
4622bool mem_cgroup_bad_page_check(struct page *page)
4623{
4624        if (mem_cgroup_disabled())
4625                return false;
4626
4627        return lookup_page_cgroup_used(page) != NULL;
4628}
4629
4630void mem_cgroup_print_bad_page(struct page *page)
4631{
4632        struct page_cgroup *pc;
4633
4634        pc = lookup_page_cgroup_used(page);
4635        if (pc) {
4636                pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4637                         pc, pc->flags, pc->mem_cgroup);
4638        }
4639}
4640#endif
4641
4642static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4643                                   unsigned long limit)
4644{
4645        unsigned long curusage;
4646        unsigned long oldusage;
4647        unsigned long memswlimit;
4648        bool enlarge = false;
4649        int retry_count;
4650        int ret;
4651
4652        /*
4653         * To keep hierarchical reclaim simple, how long we should retry
4654         * depends on the caller. We set our retry count to be a function
4655         * of the number of children we would visit in this loop.
4656         */
4657        retry_count = MEM_CGROUP_RECLAIM_RETRIES *
4658                      mem_cgroup_count_children(memcg);
4659
4660        oldusage = page_counter_read(&memcg->memory);
4661
4662        do {
4663                if (signal_pending(current)) {
4664                        ret = -EINTR;
4665                        break;
4666                }
4667                mutex_lock(&memcg_limit_mutex);
4668                memswlimit = memcg->memsw.limit;
4669                if (limit > memswlimit) {
4670                        mutex_unlock(&memcg_limit_mutex);
4671                        ret = -EINVAL;
4672                        break;
4673                }
4674
4675                if (limit > memcg->memory.limit)
4676                        enlarge = true;
4677
4678                ret = page_counter_limit(&memcg->memory, limit);
4679                if (!ret) {
4680                        if (memswlimit == limit)
4681                                memcg->memsw_is_minimum = true;
4682                        else
4683                                memcg->memsw_is_minimum = false;
4684                }
4685                mutex_unlock(&memcg_limit_mutex);
4686
4687                if (!ret)
4688                        break;
4689
4690                mem_cgroup_reclaim(memcg, GFP_KERNEL,
4691                                   MEM_CGROUP_RECLAIM_SHRINK);
4692                curusage = page_counter_read(&memcg->memory);
4693                /* Was usage reduced? */
4694                if (curusage >= oldusage)
4695                        retry_count--;
4696                else
4697                        oldusage = curusage;
4698        } while (retry_count);
4699
4700        if (!ret && enlarge)
4701                memcg_oom_recover(memcg);
4702
4703        return ret;
4704}
4705
4706static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4707                                         unsigned long limit)
4708{
4709        unsigned long curusage;
4710        unsigned long oldusage;
4711        unsigned long memlimit, memswlimit;
4712        bool enlarge = false;
4713        int retry_count;
4714        int ret;
4715
4716        /* see mem_cgroup_resize_limit */
4717        retry_count = MEM_CGROUP_RECLAIM_RETRIES *
4718                      mem_cgroup_count_children(memcg);
4719
4720        oldusage = page_counter_read(&memcg->memsw);
4721
4722        do {
4723                if (signal_pending(current)) {
4724                        ret = -EINTR;
4725                        break;
4726                }
4727                mutex_lock(&memcg_limit_mutex);
4728                memlimit = memcg->memory.limit;
4729                if (limit < memlimit) {
4730                        mutex_unlock(&memcg_limit_mutex);
4731                        ret = -EINVAL;
4732                        break;
4733                }
4734                memswlimit = memcg->memsw.limit;
4735                if (limit > memswlimit)
4736                        enlarge = true;
4737                ret = page_counter_limit(&memcg->memsw, limit);
4738                if (!ret) {
4739                        if (memlimit == limit)
4740                                memcg->memsw_is_minimum = true;
4741                        else
4742                                memcg->memsw_is_minimum = false;
4743                }
4744                mutex_unlock(&memcg_limit_mutex);
4745
4746                if (!ret)
4747                        break;
4748
4749                mem_cgroup_reclaim(memcg, GFP_KERNEL,
4750                                   MEM_CGROUP_RECLAIM_NOSWAP |
4751                                   MEM_CGROUP_RECLAIM_SHRINK);
4752                curusage = page_counter_read(&memcg->memsw);
4753                /* Was usage reduced? */
4754                if (curusage >= oldusage)
4755                        retry_count--;
4756                else
4757                        oldusage = curusage;
4758        } while (retry_count);
4759
4760        if (!ret && enlarge)
4761                memcg_oom_recover(memcg);
4762        return ret;
4763}
4764
4765unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4766                                            gfp_t gfp_mask,
4767                                            unsigned long *total_scanned)
4768{
4769        unsigned long nr_reclaimed = 0;
4770        struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4771        unsigned long reclaimed;
4772        int loop = 0;
4773        struct mem_cgroup_tree_per_zone *mctz;
4774        unsigned long excess;
4775        unsigned long nr_scanned;
4776
4777        if (order > 0)
4778                return 0;
4779
4780        mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4781        /*
4782         * This loop can run for a while, especially if mem_cgroups continuously
4783         * keep exceeding their soft limit and putting the system under
4784         * pressure.
4785         */
4786        do {
4787                if (next_mz)
4788                        mz = next_mz;
4789                else
4790                        mz = mem_cgroup_largest_soft_limit_node(mctz);
4791                if (!mz)
4792                        break;
4793
4794                nr_scanned = 0;
4795                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4796                                                    gfp_mask, &nr_scanned);
4797                nr_reclaimed += reclaimed;
4798                *total_scanned += nr_scanned;
4799                spin_lock(&mctz->lock);
4800
4801                /*
4802                 * If we failed to reclaim anything from this memory cgroup
4803                 * it is time to move on to the next cgroup
4804                 */
4805                next_mz = NULL;
4806                if (!reclaimed) {
4807                        do {
4808                                /*
4809                                 * Loop until we find yet another one.
4810                                 *
4811                                 * By the time we get the soft_limit lock
4812                                 * again, someone might have added the
4813                                 * group back on the RB tree. Iterate to
4814                                 * make sure we get a different memcg.
4815                                 * mem_cgroup_largest_soft_limit_node returns
4816                                 * NULL if no other cgroup is present on
4817                                 * the tree.
4818                                 */
4819                                next_mz =
4820                                __mem_cgroup_largest_soft_limit_node(mctz);
4821                                if (next_mz == mz)
4822                                        css_put(&next_mz->memcg->css);
4823                                else /* next_mz == NULL or other memcg */
4824                                        break;
4825                        } while (1);
4826                }
4827                __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4828                excess = soft_limit_excess(mz->memcg);
4829                /*
4830                 * One school of thought says that we should not add
4831                 * back the node to the tree if reclaim returns 0.
4832                 * But our reclaim could return 0 simply because, due to
4833                 * the reclaim priority, we are exposing a smaller subset
4834                 * of memory to reclaim from. Consider this a longer-term
4835                 * TODO.
4836                 */
4837                /* If excess == 0, no tree ops */
4838                __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4839                spin_unlock(&mctz->lock);
4840                css_put(&mz->memcg->css);
4841                loop++;
4842                /*
4843                 * Could not reclaim anything and there are no more
4844                 * mem cgroups to try or we seem to be looping without
4845                 * reclaiming anything.
4846                 */
4847                if (!nr_reclaimed &&
4848                        (next_mz == NULL ||
4849                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4850                        break;
4851        } while (!nr_reclaimed);
4852        if (next_mz)
4853                css_put(&next_mz->memcg->css);
4854        return nr_reclaimed;
4855}
4856
4857/**
4858 * mem_cgroup_force_empty_list - clears LRU of a group
4859 * @memcg: group to clear
4860 * @node: NUMA node
4861 * @zid: zone id
4862 * @lru: lru to clear
4863 *
4864 * Traverse the specified page_cgroup list and try to drop them all.  This doesn't
4865 * reclaim the pages themselves - pages are moved to the parent (or root)
4866 * group.
4867 */
4868static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4869                                int node, int zid, enum lru_list lru)
4870{
4871        struct lruvec *lruvec;
4872        unsigned long flags;
4873        struct list_head *list;
4874        struct page *busy;
4875        struct zone *zone;
4876
4877        zone = &NODE_DATA(node)->node_zones[zid];
4878        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4879        list = &lruvec->lists[lru];
4880
4881        busy = NULL;
4882        do {
4883                struct page_cgroup *pc;
4884                struct page *page;
4885
4886                spin_lock_irqsave(&zone->lru_lock, flags);
4887                if (list_empty(list)) {
4888                        spin_unlock_irqrestore(&zone->lru_lock, flags);
4889                        break;
4890                }
4891                page = list_entry(list->prev, struct page, lru);
4892                if (busy == page) {
4893                        list_move(&page->lru, list);
4894                        busy = NULL;
4895                        spin_unlock_irqrestore(&zone->lru_lock, flags);
4896                        continue;
4897                }
4898                spin_unlock_irqrestore(&zone->lru_lock, flags);
4899
4900                pc = lookup_page_cgroup(page);
4901
4902                if (mem_cgroup_move_parent(page, pc, memcg)) {
4903                        /* found lock contention or "pc" is obsolete. */
4904                        busy = page;
4905                        cond_resched();
4906                } else
4907                        busy = NULL;
4908        } while (!list_empty(list));
4909}
4910
4911/*
4912 * Make the mem_cgroup's charge 0 if there are no tasks, by moving
4913 * all the charges and pages to the parent.
4914 * This makes it possible to delete this mem_cgroup.
4915 *
4916 * Caller is responsible for holding css reference on the memcg.
4917 */
4918static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4919{
4920        int node, zid;
4921
4922        do {
4923                /* This is for making sure all *used* pages are on an LRU. */
4924                lru_add_drain_all();
4925                drain_all_stock_sync(memcg);
4926                mem_cgroup_start_move(memcg);
4927                for_each_node_state(node, N_MEMORY) {
4928                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4929                                enum lru_list lru;
4930                                for_each_lru(lru) {
4931                                        mem_cgroup_force_empty_list(memcg,
4932                                                        node, zid, lru);
4933                                }
4934                        }
4935                }
4936                mem_cgroup_end_move(memcg);
4937                memcg_oom_recover(memcg);
4938                cond_resched();
4939
4940                /*
4941                 * Kernel memory may not necessarily be attributable to a specific
4942                 * process, so such pages are not migrated and we can't
4943                 * expect their charge to drop to 0 here.
4944                 * Having the memory counter filled with kmem only is enough.
4945                 *
4946                 * This is a safety check because mem_cgroup_force_empty_list
4947                 * could have raced with mem_cgroup_replace_page_cache callers,
4948                 * so the lru seemed empty but a page could have been added
4949                 * right after the check. The usage check should be safe as we
4950                 * always charge before adding to the LRU.
4951                 */
4952        } while (page_counter_read(&memcg->memory) -
4953                 page_counter_read(&memcg->kmem) > 0);
4954}
4955
4956/*
4957 * This mainly exists for tests during the setting of use_hierarchy.
4958 * Since this is the very setting we are changing, the current hierarchy value
4959 * is meaningless.
4960 */
4961static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4962{
4963        struct cgroup *pos;
4964
4965        /* bail out at the first child found */
4966        cgroup_for_each_child(pos, memcg->css.cgroup)
4967                return true;
4968        return false;
4969}
4970
4971/*
4972 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4973 * to be already dead (as in mem_cgroup_force_empty, for instance).  This is
4974 * different from mem_cgroup_count_children(), in the sense that we don't really
4975 * care how many children we have; we only need to know if we have any.  It also
4976 * counts any memcg without hierarchy as infertile.
4977 */
4978static inline bool memcg_has_children(struct mem_cgroup *memcg)
4979{
4980        return memcg->use_hierarchy && __memcg_has_children(memcg);
4981}
4982
4983/*
4984 * Reclaims as many pages from the given memcg as possible and moves
4985 * the rest to the parent.
4986 *
4987 * Caller is responsible for holding css reference for memcg.
4988 */
4989static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4990{
4991        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4992        struct cgroup *cgrp = memcg->css.cgroup;
4993
4994        /* returns EBUSY if there is a task or if we come here twice. */
4995        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4996                return -EBUSY;
4997
4998        /* we try to free pages to make this cgroup empty */
4999        lru_add_drain_all();
5000        /* try to free all pages in this cgroup */
5001        while (nr_retries && page_counter_read(&memcg->memory)) {
5002                int progress;
5003
5004                if (signal_pending(current))
5005                        return -EINTR;
5006
5007                progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
5008                                                false);
5009                if (!progress) {
5010                        nr_retries--;
5011                        /* maybe some writeback is necessary */
5012                        congestion_wait(BLK_RW_ASYNC, HZ/10);
5013                }
5014
5015        }
5016        lru_add_drain();
5017        mem_cgroup_reparent_charges(memcg);
5018
5019        return 0;
5020}
5021
5022static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
5023{
5024        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5025        int ret;
5026
5027        if (mem_cgroup_is_root(memcg))
5028                return -EINVAL;
5029        css_get(&memcg->css);
5030        ret = mem_cgroup_force_empty(memcg);
5031        css_put(&memcg->css);
5032
5033        return ret;
5034}
5035
5036
5037static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
5038{
5039        return mem_cgroup_from_cont(cont)->use_hierarchy;
5040}
5041
5042static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
5043                                        u64 val)
5044{
5045        int retval = 0;
5046        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047        struct cgroup *parent = cont->parent;
5048        struct mem_cgroup *parent_memcg = NULL;
5049
5050        if (parent)
5051                parent_memcg = mem_cgroup_from_cont(parent);
5052
5053        mutex_lock(&memcg_create_mutex);
5054
5055        if (memcg->use_hierarchy == val)
5056                goto out;
5057
5058        /*
5059         * If parent's use_hierarchy is set, we can't make any modifications
5060         * in the child subtrees. If it is unset, then the change can
5061         * occur, provided the current cgroup has no children.
5062         *
5063         * For the root cgroup, parent_memcg is NULL; we allow the value to be
5064         * set if there are no children.
5065         */
5066        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
5067                                (val == 1 || val == 0)) {
5068                if (!__memcg_has_children(memcg))
5069                        memcg->use_hierarchy = val;
5070                else
5071                        retval = -EBUSY;
5072        } else
5073                retval = -EINVAL;
5074
5075out:
5076        mutex_unlock(&memcg_create_mutex);
5077
5078        return retval;
5079}
5080
5081
5082static unsigned long tree_stat(struct mem_cgroup *memcg,
5083                               enum mem_cgroup_stat_index idx)
5084{
5085        struct mem_cgroup *iter;
5086        long val = 0;
5087
5088        /* Per-cpu values can be negative, use a signed accumulator */
5089        for_each_mem_cgroup_tree(iter, memcg)
5090                val += mem_cgroup_read_stat(iter, idx);
5091
5092        if (val < 0) /* race ? */
5093                val = 0;
5094        return val;
5095}
5096
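/*
 * Return the memory (or memory+swap) usage of @memcg, in pages. For the
 * root cgroup the usage is derived from the hierarchical cache/rss (and
 * optionally swap) statistics rather than from its page_counter; every
 * other group simply reads the relevant page_counter.
 */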
5097static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5098{
5099        unsigned long val;
5100
5101        if (mem_cgroup_is_root(memcg)) {
5102                val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
5103                val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
5104                if (swap)
5105                        val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
5106        } else {
5107                if (!swap)
5108                        val = page_counter_read(&memcg->memory);
5109                else
5110                        val = page_counter_read(&memcg->memsw);
5111        }
5112        return val;
5113}
5114
5115enum {
5116        RES_USAGE,
5117        RES_LIMIT,
5118        RES_MAX_USAGE,
5119        RES_FAILCNT,
5120        RES_SOFT_LIMIT,
5121};
5122
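/*
 * Common read handler for the memory, memsw and kmem control files.
 * cft->private packs both the counter type (_MEM, _MEMSWAP, _KMEM) and
 * the attribute (RES_USAGE, RES_LIMIT, ...); MEMFILE_TYPE() and
 * MEMFILE_ATTR() unpack them. All values are reported in bytes.
 */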
5123static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
5124                               struct file *file, char __user *buf,
5125                               size_t nbytes, loff_t *ppos)
5126{
5127        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5128        char str[64];
5129        u64 val;
5130        int len;
5131        struct page_counter *counter;
5132
5133        switch (MEMFILE_TYPE(cft->private)) {
5134        case _MEM:
5135                counter = &memcg->memory;
5136                break;
5137        case _MEMSWAP:
5138                counter = &memcg->memsw;
5139                break;
5140        case _KMEM:
5141                counter = &memcg->kmem;
5142                break;
5143        default:
5144                BUG();
5145        }
5146
5147        switch (MEMFILE_ATTR(cft->private)) {
5148        case RES_USAGE:
5149                if (counter == &memcg->memory)
5150                        val = (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
5151                else if (counter == &memcg->memsw)
5152                        val = (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
5153                else
5154                        val = (u64)page_counter_read(counter) * PAGE_SIZE;
5155                break;
5156        case RES_LIMIT:
5157                val = (u64)counter->limit * PAGE_SIZE;
5158                break;
5159        case RES_MAX_USAGE:
5160                val = (u64)counter->watermark * PAGE_SIZE;
5161                break;
5162        case RES_FAILCNT:
5163                val = (u64)counter->failcnt;
5164                break;
5165        case RES_SOFT_LIMIT:
5166                val = (u64)memcg->soft_limit * PAGE_SIZE;
5167                break;
5168        default:
5169                BUG();
5170        }
5171
5172        len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
5173        return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5174}
5175
5176static int memcg_update_kmem_limit(struct cgroup *cont, unsigned long limit)
5177{
5178        int ret = -EINVAL;
5179#ifdef CONFIG_MEMCG_KMEM
5180        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5181        /*
5182         * For simplicity, we won't allow this to be disabled.  It also can't
5183         * be changed if the cgroup has children already, or if tasks had
5184         * already joined.
5185         *
5186         * If tasks join before we set the limit, a person looking at
5187         * kmem.usage_in_bytes will have no way to determine when it took
5188         * place, which makes the value quite meaningless.
5189         *
5190         * After it first became limited, changes in the value of the limit are
5191         * of course permitted.
5192         */
5193        mutex_lock(&memcg_create_mutex);
5194        mutex_lock(&memcg_limit_mutex);
5195        if (!memcg->kmem_account_flags && limit != PAGE_COUNTER_MAX) {
5196                if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
5197                        ret = -EBUSY;
5198                        goto out;
5199                }
5200                ret = page_counter_limit(&memcg->kmem, limit);
5201                VM_BUG_ON(ret);
5202
5203                ret = memcg_update_cache_sizes(memcg);
5204                if (ret) {
5205                        page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
5206                        goto out;
5207                }
5208                static_key_slow_inc(&memcg_kmem_enabled_key);
5209                /*
5210                 * setting the active bit after the inc will guarantee no one
5211                 * starts accounting before all call sites are patched
5212                 */
5213                memcg_kmem_set_active(memcg);
5214
5215                /*
5216                 * kmem charges can outlive the cgroup. In the case of slab
5217                 * pages, for instance, a page may contain objects from various
5218                 * processes, so it is infeasible to migrate them away. We
5219                 * need to reference-count the memcg because of that.
5220                 */
5221                mem_cgroup_get(memcg);
5222        } else
5223                ret = page_counter_limit(&memcg->kmem, limit);
5224out:
5225        mutex_unlock(&memcg_limit_mutex);
5226        mutex_unlock(&memcg_create_mutex);
5227#endif
5228        return ret;
5229}
5230
5231#ifdef CONFIG_MEMCG_KMEM
5232static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5233{
5234        int ret = 0;
5235        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5236        if (!parent)
5237                goto out;
5238
5239        memcg->kmem_account_flags = parent->kmem_account_flags;
5240        /*
5241         * When that happens, we need to disable the static branch only on those
5242         * memcgs that enabled it. To achieve this, we would be forced to
5243         * complicate the code by keeping track of which memcgs were the ones
5244         * that actually enabled limits, and which ones got it from their
5245         * parents.
5246         *
5247         * It is a lot simpler just to do static_key_slow_inc() on every child
5248         * that is accounted.
5249         */
5250        if (!memcg_kmem_is_active(memcg))
5251                goto out;
5252
5253        /*
5254         * destroy(), called if we fail, will issue static_key_slow_inc() and
5255         * mem_cgroup_put() if kmem is enabled. We have to either call them
5256         * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
5257         * this more consistent, since it always leads to the same destroy path
5258         */
5259        mem_cgroup_get(memcg);
5260        static_key_slow_inc(&memcg_kmem_enabled_key);
5261
5262        mutex_lock(&memcg_limit_mutex);
5263        ret = memcg_update_cache_sizes(memcg);
5264        mutex_unlock(&memcg_limit_mutex);
5265out:
5266        return ret;
5267}
5268#endif /* CONFIG_MEMCG_KMEM */
5269
5270/*
5271 * The user of this function is...
5272 * RES_LIMIT.
5273 */
5274static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5275                            const char *buffer)
5276{
5277        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5278        unsigned long nr_pages;
5279        int ret;
5280
5281        ret = page_counter_memparse(buffer, &nr_pages);
5282        if (ret)
5283                return ret;
5284
5285        switch (MEMFILE_ATTR(cft->private)) {
5286        case RES_LIMIT:
5287                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
5288                        ret = -EINVAL;
5289                        break;
5290                }
5291                switch (MEMFILE_TYPE(cft->private)) {
5292                case _MEM:
5293                        ret = mem_cgroup_resize_limit(memcg, nr_pages);
5294                        break;
5295                case _MEMSWAP:
5296                        ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
5297                        break;
5298                case _KMEM:
5299                        ret = memcg_update_kmem_limit(cont, nr_pages);
5300                        break;
5301                }
5302                break;
5303        case RES_SOFT_LIMIT:
5304                memcg->soft_limit = nr_pages;
5305                ret = 0;
5306                break;
5307        }
5308        return ret;
5309}
5310
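/*
 * Trigger handler for the *.max_usage_in_bytes and *.failcnt files:
 * writing to them resets the corresponding counter's watermark or
 * failure count.
 */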
5311static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5312{
5313        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5314        struct page_counter *counter;
5315
5316        switch (MEMFILE_TYPE(event)) {
5317        case _MEM:
5318                counter = &memcg->memory;
5319                break;
5320        case _MEMSWAP:
5321                counter = &memcg->memsw;
5322                break;
5323        case _KMEM:
5324                counter = &memcg->kmem;
5325                break;
5326        default:
5327                BUG();
5328        }
5329
5330        switch (MEMFILE_ATTR(event)) {
5331        case RES_MAX_USAGE:
5332                page_counter_reset_watermark(counter);
5333                break;
5334        case RES_FAILCNT:
5335                counter->failcnt = 0;
5336                break;
5337        default:
5338                BUG();
5339        }
5340
5341        return 0;
5342}
5343
5344static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
5345                                        struct cftype *cft)
5346{
5347        return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
5348}
5349
5350#ifdef CONFIG_MMU
5351static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5352                                        struct cftype *cft, u64 val)
5353{
5354        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5355
5356        if (val >= (1 << NR_MOVE_TYPE))
5357                return -EINVAL;
5358
5359        /*
5360         * No kind of locking is needed in here, because ->can_attach() will
5361         * check this value once at the beginning of the process, and then carry
5362         * on with stale data. This means that changes to this value will only
5363         * affect task migrations starting after the change.
5364         */
5365        memcg->move_charge_at_immigrate = val;
5366        return 0;
5367}
5368#else
5369static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5370                                        struct cftype *cft, u64 val)
5371{
5372        return -ENOSYS;
5373}
5374#endif
5375
5376#ifdef CONFIG_NUMA
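/*
 * memory.numa_stat: for each of the total/file/anon/unevictable LRU
 * counts, print the memcg-wide value followed by a per-node breakdown,
 * e.g. "total=<pages> N0=<pages> N1=<pages> ...".
 */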
5377static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5378                                      struct seq_file *m)
5379{
5380        int nid;
5381        unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
5382        unsigned long node_nr;
5383        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5384
5385        total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
5386        seq_printf(m, "total=%lu", total_nr);
5387        for_each_node_state(nid, N_MEMORY) {
5388                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
5389                seq_printf(m, " N%d=%lu", nid, node_nr);
5390        }
5391        seq_putc(m, '\n');
5392
5393        file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
5394        seq_printf(m, "file=%lu", file_nr);
5395        for_each_node_state(nid, N_MEMORY) {
5396                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5397                                LRU_ALL_FILE);
5398                seq_printf(m, " N%d=%lu", nid, node_nr);
5399        }
5400        seq_putc(m, '\n');
5401
5402        anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
5403        seq_printf(m, "anon=%lu", anon_nr);
5404        for_each_node_state(nid, N_MEMORY) {
5405                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5406                                LRU_ALL_ANON);
5407                seq_printf(m, " N%d=%lu", nid, node_nr);
5408        }
5409        seq_putc(m, '\n');
5410
5411        unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
5412        seq_printf(m, "unevictable=%lu", unevictable_nr);
5413        for_each_node_state(nid, N_MEMORY) {
5414                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5415                                BIT(LRU_UNEVICTABLE));
5416                seq_printf(m, " N%d=%lu", nid, node_nr);
5417        }
5418        seq_putc(m, '\n');
5419        return 0;
5420}
5421#endif /* CONFIG_NUMA */
5422
5423static inline void mem_cgroup_lru_names_not_uptodate(void)
5424{
5425        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5426}
5427
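/*
 * memory.stat: local statistics, events and per-LRU page counts first,
 * then the effective hierarchical limits and total_* aggregates over the
 * whole subtree, plus reclaim statistics under CONFIG_DEBUG_VM.
 */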
5428static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
5429                                 struct seq_file *m)
5430{
5431        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5432        unsigned long memory, memsw;
5433        struct mem_cgroup *mi;
5434        unsigned int i;
5435
5436        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5437                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5438                        continue;
5439                seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5440                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5441        }
5442
5443        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5444                seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5445                           mem_cgroup_read_events(memcg, i));
5446
5447        for (i = 0; i < NR_LRU_LISTS; i++)
5448                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5449                           mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5450
5451        /* Hierarchical information */
5452        memory = memsw = PAGE_COUNTER_MAX;
5453        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
5454                memory = min(memory, mi->memory.limit);
5455                memsw = min(memsw, mi->memsw.limit);
5456        }
5457        seq_printf(m, "hierarchical_memory_limit %llu\n",
5458                   (u64)memory * PAGE_SIZE);
5459        if (do_swap_account)
5460                seq_printf(m, "hierarchical_memsw_limit %llu\n",
5461                           (u64)memsw * PAGE_SIZE);
5462
5463        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5464                long long val = 0;
5465
5466                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5467                        continue;
5468                for_each_mem_cgroup_tree(mi, memcg)
5469                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5470                seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5471        }
5472
5473        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5474                unsigned long long val = 0;
5475
5476                for_each_mem_cgroup_tree(mi, memcg)
5477                        val += mem_cgroup_read_events(mi, i);
5478                seq_printf(m, "total_%s %llu\n",
5479                           mem_cgroup_events_names[i], val);
5480        }
5481
5482        for (i = 0; i < NR_LRU_LISTS; i++) {
5483                unsigned long long val = 0;
5484
5485                for_each_mem_cgroup_tree(mi, memcg)
5486                        val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5487                seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5488        }
5489
5490#ifdef CONFIG_DEBUG_VM
5491        {
5492                int nid, zid;
5493                struct mem_cgroup_per_zone *mz;
5494                struct zone_reclaim_stat *rstat;
5495                unsigned long recent_rotated[2] = {0, 0};
5496                unsigned long recent_scanned[2] = {0, 0};
5497
5498                for_each_online_node(nid)
5499                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5500                                mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5501                                rstat = &mz->lruvec.reclaim_stat;
5502
5503                                recent_rotated[0] += rstat->recent_rotated[0];
5504                                recent_rotated[1] += rstat->recent_rotated[1];
5505                                recent_scanned[0] += rstat->recent_scanned[0];
5506                                recent_scanned[1] += rstat->recent_scanned[1];
5507                        }
5508                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5509                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5510                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5511                seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5512        }
5513#endif
5514
5515        return 0;
5516}
5517
5518static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
5519{
5520        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5521
5522        return mem_cgroup_swappiness(memcg);
5523}
5524
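/*
 * memory.swappiness accepts values in [0, 100]. A write to the root
 * cgroup's file updates the global vm_swappiness instead of a per-memcg
 * value.
 */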
5525static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5526                                       u64 val)
5527{
5528        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5529
5530        if (val > 100)
5531                return -EINVAL;
5532
5533        if (cgrp->parent)
5534                memcg->swappiness = val;
5535        else
5536                vm_swappiness = val;
5537
5538        return 0;
5539}
5540
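/*
 * Worked example for the scans below (hypothetical thresholds, in pages):
 * entries = {4, 8, 16} and current_threshold = 1, i.e. usage was in
 * [8, 16). If usage grows past 16, the forward scan signals the eventfd
 * registered at 16 and current_threshold becomes 2. If usage instead
 * drops below 8, the backward scan signals the eventfd registered at 8
 * and current_threshold becomes 0.
 */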
5541static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5542{
5543        struct mem_cgroup_threshold_ary *t;
5544        unsigned long usage;
5545        int i;
5546
5547        rcu_read_lock();
5548        if (!swap)
5549                t = rcu_dereference(memcg->thresholds.primary);
5550        else
5551                t = rcu_dereference(memcg->memsw_thresholds.primary);
5552
5553        if (!t)
5554                goto unlock;
5555
5556        usage = mem_cgroup_usage(memcg, swap);
5557
5558        /*
5559         * current_threshold points to the threshold just below or equal to usage.
5560         * If that's not true, a threshold was crossed after the last
5561         * call of __mem_cgroup_threshold().
5562         */
5563        i = t->current_threshold;
5564
5565        /*
5566         * Iterate backward over array of thresholds starting from
5567         * current_threshold and check if a threshold is crossed.
5568         * If none of thresholds below usage is crossed, we read
5569         * only one element of the array here.
5570         */
5571        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5572                eventfd_signal(t->entries[i].eventfd, 1);
5573
5574        /* i = current_threshold + 1 */
5575        i++;
5576
5577        /*
5578         * Iterate forward over array of thresholds starting from
5579         * current_threshold+1 and check if a threshold is crossed.
5580         * If none of thresholds above usage is crossed, we read
5581         * only one element of the array here.
5582         */
5583        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5584                eventfd_signal(t->entries[i].eventfd, 1);
5585
5586        /* Update current_threshold */
5587        t->current_threshold = i - 1;
5588unlock:
5589        rcu_read_unlock();
5590}
5591
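/*
 * Re-check thresholds after a usage change, for @memcg and each of its
 * ancestors; with swap accounting enabled the memsw thresholds are
 * checked as well.
 */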
5592static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5593{
5594        while (memcg) {
5595                __mem_cgroup_threshold(memcg, false);
5596                if (do_swap_account)
5597                        __mem_cgroup_threshold(memcg, true);
5598
5599                memcg = parent_mem_cgroup(memcg);
5600        }
5601}
5602
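/* sort() comparator: order thresholds by ascending threshold value */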
5603static int compare_thresholds(const void *a, const void *b)
5604{
5605        const struct mem_cgroup_threshold *_a = a;
5606        const struct mem_cgroup_threshold *_b = b;
5607
5608        if (_a->threshold > _b->threshold)
5609                return 1;
5610
5611        if (_a->threshold < _b->threshold)
5612                return -1;
5613
5614        return 0;
5615}
5616
5617static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5618{
5619        struct mem_cgroup_eventfd_list *ev;
5620
5621        spin_lock(&memcg_oom_lock);
5622
5623        list_for_each_entry(ev, &memcg->oom_notify, list)
5624                eventfd_signal(ev->eventfd, 1);
5625
5626        spin_unlock(&memcg_oom_lock);
5627        return 0;
5628}
5629
5630static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5631{
5632        struct mem_cgroup *iter;
5633
5634        for_each_mem_cgroup_tree(iter, memcg)
5635                mem_cgroup_oom_notify_cb(iter);
5636}
5637
5638static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
5639        struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5640{
5641        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5642        struct mem_cgroup_thresholds *thresholds;
5643        struct mem_cgroup_threshold_ary *new;
5644        enum res_type type = MEMFILE_TYPE(cft->private);
5645        unsigned long threshold;
5646        unsigned long usage;
5647        int i, size, ret;
5648
5649        ret = page_counter_memparse(args, &threshold);
5650        if (ret)
5651                return ret;
5652
5653        mutex_lock(&memcg->thresholds_lock);
5654
5655        if (type == _MEM)
5656                thresholds = &memcg->thresholds;
5657        else if (type == _MEMSWAP)
5658                thresholds = &memcg->memsw_thresholds;
5659        else
5660                BUG();
5661
5662        usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5663
5664        /* Check if a threshold was crossed before adding a new one */
5665        if (thresholds->primary)
5666                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5667
5668        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5669
5670        /* Allocate memory for new array of thresholds */
5671        new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5672                        GFP_KERNEL);
5673        if (!new) {
5674                ret = -ENOMEM;
5675                goto unlock;
5676        }
5677        new->size = size;
5678
5679        /* Copy thresholds (if any) to new array */
5680        if (thresholds->primary) {
5681                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5682                                sizeof(struct mem_cgroup_threshold));
5683        }
5684
5685        /* Add new threshold */
5686        new->entries[size - 1].eventfd = eventfd;
5687        new->entries[size - 1].threshold = threshold;
5688
5689        /* Sort thresholds. Registering a new threshold isn't time-critical */
5690        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5691                        compare_thresholds, NULL);
5692
5693        /* Find current threshold */
5694        new->current_threshold = -1;
5695        for (i = 0; i < size; i++) {
5696                if (new->entries[i].threshold <= usage) {
5697                        /*
5698                         * new->current_threshold will not be used until
5699                         * rcu_assign_pointer(), so it's safe to increment
5700                         * it here.
5701                         */
5702                        ++new->current_threshold;
5703                } else
5704                        break;
5705        }
5706
5707        /* Free old spare buffer and save old primary buffer as spare */
5708        kfree(thresholds->spare);
5709        thresholds->spare = thresholds->primary;
5710
5711        rcu_assign_pointer(thresholds->primary, new);
5712
5713        /* To be sure that nobody uses thresholds */
5714        synchronize_rcu();
5715
5716unlock:
5717        mutex_unlock(&memcg->thresholds_lock);
5718
5719        return ret;
5720}
5721
5722static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
5723        struct cftype *cft, struct eventfd_ctx *eventfd)
5724{
5725        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5726        struct mem_cgroup_thresholds *thresholds;
5727        struct mem_cgroup_threshold_ary *new;
5728        enum res_type type = MEMFILE_TYPE(cft->private);
5729        unsigned long usage;
5730        int i, j, size;
5731
5732        mutex_lock(&memcg->thresholds_lock);
5733        if (type == _MEM)
5734                thresholds = &memcg->thresholds;
5735        else if (type == _MEMSWAP)
5736                thresholds = &memcg->memsw_thresholds;
5737        else
5738                BUG();
5739
5740        if (!thresholds->primary)
5741                goto unlock;
5742
5743        usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5744
5745        /* Check if a threshold was crossed before removing */
5746        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5747
5748        /* Calculate the new number of thresholds */
5749        size = 0;
5750        for (i = 0; i < thresholds->primary->size; i++) {
5751                if (thresholds->primary->entries[i].eventfd != eventfd)
5752                        size++;
5753        }
5754
5755        new = thresholds->spare;
5756
5757        /* Set thresholds array to NULL if we don't have thresholds */
5758        if (!size) {
5759                kfree(new);
5760                new = NULL;
5761                goto swap_buffers;
5762        }
5763
5764        new->size = size;
5765
5766        /* Copy thresholds and find current threshold */
5767        new->current_threshold = -1;
5768        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5769                if (thresholds->primary->entries[i].eventfd == eventfd)
5770                        continue;
5771
5772                new->entries[j] = thresholds->primary->entries[i];
5773                if (new->entries[j].threshold <= usage) {
5774                        /*
5775                         * new->current_threshold will not be used
5776                         * until rcu_assign_pointer(), so it's safe to increment
5777                         * it here.
5778                         */
5779                        ++new->current_threshold;
5780                }
5781                j++;
5782        }
5783
5784swap_buffers:
5785        /* Swap primary and spare array */
5786        thresholds->spare = thresholds->primary;
5787        /* If all events are unregistered, free the spare array */
5788        if (!new) {
5789                kfree(thresholds->spare);
5790                thresholds->spare = NULL;
5791        }
5792
5793        rcu_assign_pointer(thresholds->primary, new);
5794
5795        /* To be sure that nobody uses thresholds */
5796        synchronize_rcu();
5797unlock:
5798        mutex_unlock(&memcg->thresholds_lock);
5799}
5800
5801static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
5802        struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5803{
5804        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5805        struct mem_cgroup_eventfd_list *event;
5806        enum res_type type = MEMFILE_TYPE(cft->private);
5807
5808        BUG_ON(type != _OOM_TYPE);
5809        event = kmalloc(sizeof(*event), GFP_KERNEL);
5810        if (!event)
5811                return -ENOMEM;
5812
5813        spin_lock(&memcg_oom_lock);
5814
5815        event->eventfd = eventfd;
5816        list_add(&event->list, &memcg->oom_notify);
5817
5818        /* already in OOM ? */
5819        if (atomic_read(&memcg->under_oom))
5820                eventfd_signal(eventfd, 1);
5821        spin_unlock(&memcg_oom_lock);
5822
5823        return 0;
5824}
5825
5826static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
5827        struct cftype *cft, struct eventfd_ctx *eventfd)
5828{
5829        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5830        struct mem_cgroup_eventfd_list *ev, *tmp;
5831        enum res_type type = MEMFILE_TYPE(cft->private);
5832
5833        BUG_ON(type != _OOM_TYPE);
5834
5835        spin_lock(&memcg_oom_lock);
5836
5837        list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5838                if (ev->eventfd == eventfd) {
5839                        list_del(&ev->list);
5840                        kfree(ev);
5841                }
5842        }
5843
5844        spin_unlock(&memcg_oom_lock);
5845}
5846
5847static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
5848        struct cftype *cft,  struct cgroup_map_cb *cb)
5849{
5850        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5851
5852        cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5853
5854        if (atomic_read(&memcg->under_oom))
5855                cb->fill(cb, "under_oom", 1);
5856        else
5857                cb->fill(cb, "under_oom", 0);
5858        return 0;
5859}
5860
5861static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5862        struct cftype *cft, u64 val)
5863{
5864        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5865
5866        /* cannot be set on the root cgroup; only 0 and 1 are allowed */
5867        if (!cgrp->parent || !((val == 0) || (val == 1)))
5868                return -EINVAL;
5869
5870        memcg->oom_kill_disable = val;
5871        if (!val)
5872                memcg_oom_recover(memcg);
5873
5874        return 0;
5875}
5876
5877#ifdef CONFIG_MEMCG_KMEM
5878static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5879{
5880        int ret;
5881
5882        memcg->kmemcg_id = -1;
5883        ret = memcg_propagate_kmem(memcg);
5884        if (ret)
5885                return ret;
5886
5887        return mem_cgroup_sockets_init(memcg, ss);
5888}
5889
5890static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
5891{
5892        mem_cgroup_sockets_destroy(memcg);
5893
5894        memcg_kmem_mark_dead(memcg);
5895
5896        if (page_counter_read(&memcg->kmem))
5897                return;
5898
5899        /*
5900         * Charges already down to 0, undo mem_cgroup_get() done in the charge
5901         * path here, being careful not to race with memcg_uncharge_kmem: it is
5902         * possible that the charges went down to 0 between mark_dead and the
5903         * page_counter read, so in that case, we don't need the put
5904         */
5905        if (memcg_kmem_test_and_clear_dead(memcg))
5906                mem_cgroup_put(memcg);
5907}
5908#else
5909static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5910{
5911        return 0;
5912}
5913
5914static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
5915{
5916}
5917#endif
5918
5919static struct cftype mem_cgroup_files[] = {
5920        {
5921                .name = "usage_in_bytes",
5922                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5923                .read = mem_cgroup_read,
5924                .register_event = mem_cgroup_usage_register_event,
5925                .unregister_event = mem_cgroup_usage_unregister_event,
5926        },
5927        {
5928                .name = "max_usage_in_bytes",
5929                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5930                .trigger = mem_cgroup_reset,
5931                .read = mem_cgroup_read,
5932        },
5933        {
5934                .name = "limit_in_bytes",
5935                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5936                .write_string = mem_cgroup_write,
5937                .read = mem_cgroup_read,
5938        },
5939        {
5940                .name = "soft_limit_in_bytes",
5941                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5942                .write_string = mem_cgroup_write,
5943                .read = mem_cgroup_read,
5944        },
5945        {
5946                .name = "failcnt",
5947                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5948                .trigger = mem_cgroup_reset,
5949                .read = mem_cgroup_read,
5950        },
5951        {
5952                .name = "stat",
5953                .read_seq_string = memcg_stat_show,
5954        },
5955        {
5956                .name = "force_empty",
5957                .trigger = mem_cgroup_force_empty_write,
5958        },
5959        {
5960                .name = "use_hierarchy",
5961                .flags = CFTYPE_INSANE,
5962                .write_u64 = mem_cgroup_hierarchy_write,
5963                .read_u64 = mem_cgroup_hierarchy_read,
5964        },
5965        {
5966                .name = "swappiness",
5967                .read_u64 = mem_cgroup_swappiness_read,
5968                .write_u64 = mem_cgroup_swappiness_write,
5969        },
5970        {
5971                .name = "move_charge_at_immigrate",
5972                .read_u64 = mem_cgroup_move_charge_read,
5973                .write_u64 = mem_cgroup_move_charge_write,
5974        },
5975        {
5976                .name = "oom_control",
5977                .read_map = mem_cgroup_oom_control_read,
5978                .write_u64 = mem_cgroup_oom_control_write,
5979                .register_event = mem_cgroup_oom_register_event,
5980                .unregister_event = mem_cgroup_oom_unregister_event,
5981                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5982        },
5983        {
5984                .name = "pressure_level",
5985                .register_event = vmpressure_register_event,
5986                .unregister_event = vmpressure_unregister_event,
5987        },
5988#ifdef CONFIG_NUMA
5989        {
5990                .name = "numa_stat",
5991                .read_seq_string = memcg_numa_stat_show,
5992        },
5993#endif
5994#ifdef CONFIG_MEMCG_KMEM
5995        {
5996                .name = "kmem.limit_in_bytes",
5997                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5998                .write_string = mem_cgroup_write,
5999                .read = mem_cgroup_read,
6000        },
6001        {
6002                .name = "kmem.usage_in_bytes",
6003                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6004                .read = mem_cgroup_read,
6005        },
6006        {
6007                .name = "kmem.failcnt",
6008                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6009                .trigger = mem_cgroup_reset,
6010                .read = mem_cgroup_read,
6011        },
6012        {
6013                .name = "kmem.max_usage_in_bytes",
6014                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6015                .trigger = mem_cgroup_reset,
6016                .read = mem_cgroup_read,
6017        },
6018#ifdef CONFIG_SLABINFO
6019        {
6020                .name = "kmem.slabinfo",
6021                .read_seq_string = mem_cgroup_slabinfo_read,
6022        },
6023#endif
6024#endif
6025        { },    /* terminate */
6026};
6027
6028#ifdef CONFIG_MEMCG_SWAP
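    /*
     * memory+swap counterparts of the files above.  These are not part of
     * the base cftypes; memsw_file_init() adds them at boot, and only when
     * swap accounting is actually enabled (see enable_swap_cgroup() below).
     */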
6029static struct cftype memsw_cgroup_files[] = {
6030        {
6031                .name = "memsw.usage_in_bytes",
6032                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6033                .read = mem_cgroup_read,
6034                .register_event = mem_cgroup_usage_register_event,
6035                .unregister_event = mem_cgroup_usage_unregister_event,
6036        },
6037        {
6038                .name = "memsw.max_usage_in_bytes",
6039                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6040                .trigger = mem_cgroup_reset,
6041                .read = mem_cgroup_read,
6042        },
6043        {
6044                .name = "memsw.limit_in_bytes",
6045                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6046                .write_string = mem_cgroup_write,
6047                .read = mem_cgroup_read,
6048        },
6049        {
6050                .name = "memsw.failcnt",
6051                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6052                .trigger = mem_cgroup_reset,
6053                .read = mem_cgroup_read,
6054        },
6055        { },    /* terminate */
6056};
6057#endif
6058
6059/*
6060 * Private memory cgroup IDR
6061 *
6062 * Swap-out records and page cache shadow entries need to store memcg
6063 * references in constrained space, so we maintain an ID space that is
6064 * limited to 16 bits (MEM_CGROUP_ID_MAX), capping the total number of
6065 * memory-controlled cgroups to 64k.
6066 *
6067 * However, there usually are many references to the offline CSS after
6068 * the cgroup has been destroyed, such as page cache or reclaimable
6069 * slab objects, that don't need to hang on to the ID. We want to keep
6070 * those dead CSS from occupying IDs, or we might quickly exhaust the
6071 * relatively small ID space and prevent the creation of new cgroups
6072 * even when there are far fewer than 64k cgroups - possibly none.
6073 *
6074 * Maintain a private 16-bit ID space for memcg, and allow the ID to
6075 * be freed and recycled when it's no longer needed, which is usually
6076 * when the CSS is offlined.
6077 *
6078 * The only exceptions to that are records of swapped-out tmpfs/shmem
6079 * pages that need to be attributed to live ancestors on swapin. But
6080 * those references are manageable from userspace.
6081 */
6082
6083static DEFINE_IDR(mem_cgroup_idr);
6084
6085static unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
6086{
6087        return memcg->id;
6088}
6089
6090static void mem_cgroup_id_put(struct mem_cgroup *memcg)
6091{
6092        idr_remove(&mem_cgroup_idr, memcg->id);
6093        memcg->id = 0;
6094        synchronize_rcu();
6095}
6096
6097/**
6098 * mem_cgroup_from_id - look up a memcg from a memcg id
6099 * @id: the memcg id to look up
6100 *
6101 * Caller must hold rcu_read_lock().
6102 */
6103struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
6104{
6105        WARN_ON_ONCE(!rcu_read_lock_held());
6106        return idr_find(&mem_cgroup_idr, id);
6107}
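    /*
     * Illustrative sketch, not part of the kernel source: a caller that
     * stashed a 16-bit memcg id (for example in a swap-out record) would
     * resolve it back to a memcg along these lines, using the result only
     * while still under rcu_read_lock():
     *
     *         rcu_read_lock();
     *         memcg = mem_cgroup_from_id(stored_id);
     *         if (memcg)
     *                 ... inspect or take a reference on memcg ...
     *         rcu_read_unlock();
     */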
6108
6109static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6110{
6111        struct mem_cgroup_per_node *pn;
6112        struct mem_cgroup_per_zone *mz;
6113        int zone, tmp = node;
6114        /*
6115         * This routine is called for every possible node, but it is a
6116         * BUG to call kmalloc() against an offline node.
6117         *
6118         * TODO: this routine can waste a lot of memory for nodes which
6119         *       will never be onlined. It would be better to use a memory
6120         *       hotplug callback instead.
6121         */
6122        if (!node_state(node, N_NORMAL_MEMORY))
6123                tmp = -1;
6124        pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6125        if (!pn)
6126                return 1;
6127
6128        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6129                mz = &pn->zoneinfo[zone];
6130                lruvec_init(&mz->lruvec);
6131                mz->usage_in_excess = 0;
6132                mz->on_tree = false;
6133                mz->memcg = memcg;
6134        }
6135        memcg->info.nodeinfo[node] = pn;
6136        return 0;
6137}
6138
6139static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6140{
6141        kfree(memcg->info.nodeinfo[node]);
6142}
6143
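    /*
     * Allocate a mem_cgroup and reserve an ID for it.  The ID is allocated
     * with a NULL pointer first and the memcg is only published via
     * idr_replace() once its per-cpu statistics are set up, so a concurrent
     * mem_cgroup_from_id() sees either NULL or a usable memcg.
     */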
6144static struct mem_cgroup *mem_cgroup_alloc(void)
6145{
6146        struct mem_cgroup *memcg;
6147        size_t size = memcg_size();
6148        int id;
6149
6150        /* Can be very big if nr_node_ids is very big */
6151        if (size < PAGE_SIZE)
6152                memcg = kzalloc(size, GFP_KERNEL);
6153        else
6154                memcg = vzalloc(size);
6155
6156        if (!memcg)
6157                return NULL;
6158
6159        id = idr_alloc(&mem_cgroup_idr, NULL,
6160                       1, MEM_CGROUP_ID_MAX,
6161                       GFP_KERNEL);
6162        if (id < 0)
6163                goto fail;
6164
6165        memcg->id = id;
6166
6167        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6168        if (!memcg->stat)
6169                goto out_free;
6170        spin_lock_init(&memcg->pcp_counter_lock);
6171        idr_replace(&mem_cgroup_idr, memcg, memcg->id);
6172        synchronize_rcu();
6173        return memcg;
6174
6175out_free:
6176        if (memcg->id > 0) {
6177                idr_remove(&mem_cgroup_idr, memcg->id);
6178                synchronize_rcu();
6179        }
6180fail:
6181        if (size < PAGE_SIZE)
6182                kfree(memcg);
6183        else
6184                vfree(memcg);
6185        return NULL;
6186}
6187
6188/*
6189 * When a mem_cgroup is destroyed, references from swap_cgroup may still
6190 * remain (scanning them all at force_empty would be too costly).
6191 *
6192 * Instead of clearing all references at force_empty, we remember the
6193 * number of references from swap_cgroup and free the mem_cgroup only
6194 * when that count drops to 0.
6195 *
6196 * Removal of the cgroup itself succeeds regardless of refs from swap.
6197 */
6198
6199static void __mem_cgroup_free(struct mem_cgroup *memcg)
6200{
6201        int node;
6202        size_t size = memcg_size();
6203
6204        mem_cgroup_remove_from_trees(memcg);
6205
6206        mem_cgroup_id_put(memcg);
6207
6208        for_each_node(node)
6209                free_mem_cgroup_per_zone_info(memcg, node);
6210
6211        free_percpu(memcg->stat);
6212
6213        /*
6214         * We need to make sure that (at least for now) the jump label
6215         * destruction code runs outside of the cgroup lock. This is because
6216         * get_online_cpus(), which is called from the static_branch update,
6217         * can't be called inside the cgroup_lock. cpusets are the ones
6218         * enforcing this dependency, so if they ever change, we might as well.
6219         *
6220         * schedule_work() will guarantee this happens. Be careful if you need
6221         * to move this code around, and make sure it is outside
6222         * the cgroup_lock.
6223         */
6224        disarm_static_keys(memcg);
6225        if (size < PAGE_SIZE)
6226                kfree(memcg);
6227        else
6228                vfree(memcg);
6229}
6230
6231
6232/*
6233 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
6234 * but in process context.  The work_freeing structure is overlaid
6235 * on the rcu_freeing structure, which itself is overlaid on memsw.
6236 */
6237static void free_work(struct work_struct *work)
6238{
6239        struct mem_cgroup *memcg;
6240
6241        memcg = container_of(work, struct mem_cgroup, work_freeing);
6242        __mem_cgroup_free(memcg);
6243}
6244
6245static void free_rcu(struct rcu_head *rcu_head)
6246{
6247        struct mem_cgroup *memcg;
6248
6249        memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
6250        INIT_WORK(&memcg->work_freeing, free_work);
6251        schedule_work(&memcg->work_freeing);
6252}
6253
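    /*
     * mem_cgroup_get()/mem_cgroup_put() implement the swap-reference scheme
     * described above: the final put hands the memcg to free_rcu()/free_work()
     * above so that __mem_cgroup_free() runs in process context after an RCU
     * grace period, and also drops the reference taken on the parent in
     * mem_cgroup_css_online().
     */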
6254static void mem_cgroup_get(struct mem_cgroup *memcg)
6255{
6256        atomic_inc(&memcg->refcnt);
6257}
6258
6259static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
6260{
6261        if (atomic_sub_and_test(count, &memcg->refcnt)) {
6262                struct mem_cgroup *parent = parent_mem_cgroup(memcg);
6263                call_rcu(&memcg->rcu_freeing, free_rcu);
6264                if (parent)
6265                        mem_cgroup_put(parent);
6266        }
6267}
6268
6269static void mem_cgroup_put(struct mem_cgroup *memcg)
6270{
6271        __mem_cgroup_put(memcg, 1);
6272}
6273
6274/*
6275 * Returns the parent mem_cgroup in the memcg hierarchy when hierarchy is enabled.
6276 */
6277struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6278{
6279        if (!memcg->memory.parent)
6280                return NULL;
6281        return mem_cgroup_from_counter(memcg->memory.parent, memory);
6282}
6283EXPORT_SYMBOL(parent_mem_cgroup);
6284
6285static void __init mem_cgroup_soft_limit_tree_init(void)
6286{
6287        struct mem_cgroup_tree_per_node *rtpn;
6288        struct mem_cgroup_tree_per_zone *rtpz;
6289        int tmp, node, zone;
6290
6291        for_each_node(node) {
6292                tmp = node;
6293                if (!node_state(node, N_NORMAL_MEMORY))
6294                        tmp = -1;
6295                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6296                BUG_ON(!rtpn);
6297
6298                soft_limit_tree.rb_tree_per_node[node] = rtpn;
6299
6300                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6301                        rtpz = &rtpn->rb_tree_per_zone[zone];
6302                        rtpz->rb_root = RB_ROOT;
6303                        spin_lock_init(&rtpz->lock);
6304                }
6305        }
6306}
6307
6308static struct cgroup_subsys_state * __ref
6309mem_cgroup_css_alloc(struct cgroup *cont)
6310{
6311        struct mem_cgroup *memcg;
6312        long error = -ENOMEM;
6313        int node;
6314
6315        memcg = mem_cgroup_alloc();
6316        if (!memcg)
6317                return ERR_PTR(error);
6318
6319        for_each_node(node)
6320                if (alloc_mem_cgroup_per_zone_info(memcg, node))
6321                        goto free_out;
6322
6323        /* root ? */
6324        if (cont->parent == NULL) {
6325                root_mem_cgroup = memcg;
6326                page_counter_init(&memcg->memory, NULL);
6327                memcg->soft_limit = PAGE_COUNTER_MAX;
6328                page_counter_init(&memcg->memsw, NULL);
6329                page_counter_init(&memcg->kmem, NULL);
6330        }
6331
6332        memcg->last_scanned_node = MAX_NUMNODES;
6333        INIT_LIST_HEAD(&memcg->oom_notify);
6334        atomic_set(&memcg->refcnt, 1);
6335        memcg->move_charge_at_immigrate = 0;
6336        mutex_init(&memcg->thresholds_lock);
6337        spin_lock_init(&memcg->move_lock);
6338        vmpressure_init(&memcg->vmpressure);
6339
6340        return &memcg->css;
6341
6342free_out:
6343        __mem_cgroup_free(memcg);
6344        return ERR_PTR(error);
6345}
6346
6347static int
6348mem_cgroup_css_online(struct cgroup *cont)
6349{
6350        struct mem_cgroup *memcg, *parent;
6351        int error = 0;
6352
6353        if (!cont->parent)
6354                return 0;
6355
6356        mutex_lock(&memcg_create_mutex);
6357        memcg = mem_cgroup_from_cont(cont);
6358        parent = mem_cgroup_from_cont(cont->parent);
6359
6360        memcg->use_hierarchy = parent->use_hierarchy;
6361        memcg->oom_kill_disable = parent->oom_kill_disable;
6362        memcg->swappiness = mem_cgroup_swappiness(parent);
6363
6364        if (parent->use_hierarchy) {
6365                page_counter_init(&memcg->memory, &parent->memory);
6366                memcg->soft_limit = PAGE_COUNTER_MAX;
6367                page_counter_init(&memcg->memsw, &parent->memsw);
6368                page_counter_init(&memcg->kmem, &parent->kmem);
6369
6370                /*
6371                 * We increment the refcount of the parent to ensure that we
6372                 * can safely access it on page_counter_charge/uncharge. This
6373                 * refcount will be decremented when freeing this mem_cgroup
6374                 * (see mem_cgroup_put).
6375                 */
6376                mem_cgroup_get(parent);
6377        } else {
6378                page_counter_init(&memcg->memory, NULL);
6379                memcg->soft_limit = PAGE_COUNTER_MAX;
6380                page_counter_init(&memcg->memsw, NULL);
6381                page_counter_init(&memcg->kmem, NULL);
6382                /*
6383                 * A deeper hierarchy with use_hierarchy == false doesn't make
6384                 * much sense, so let the cgroup subsystem know about this
6385                 * unfortunate state in our controller.
6386                 */
6387                if (parent != root_mem_cgroup)
6388                        mem_cgroup_subsys.broken_hierarchy = true;
6389        }
6390
6391        error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6392        mutex_unlock(&memcg_create_mutex);
6393        return error;
6394}
6395
6396/*
6397 * Notify all ancestors that a group in their hierarchy is gone.
6398 */
6399static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6400{
6401        struct mem_cgroup *parent = memcg;
6402
6403        while ((parent = parent_mem_cgroup(parent)))
6404                atomic_inc(&parent->dead_count);
6405
6406        /*
6407         * If the root memcg is not hierarchical we have to check it
6408         * explicitly.
6409         */
6410        if (!root_mem_cgroup->use_hierarchy)
6411                atomic_inc(&root_mem_cgroup->dead_count);
6412}
6413
6414static void mem_cgroup_css_offline(struct cgroup *cont)
6415{
6416        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6417        struct cgroup *iter;
6418
6419        mem_cgroup_invalidate_reclaim_iterators(memcg);
6420
6421        /*
6422         * This requires that offlining is serialized.  Right now that is
6423         * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
6424         */
6425        rcu_read_lock();
6426        cgroup_for_each_descendant_post(iter, cont) {
6427                rcu_read_unlock();
6428                mem_cgroup_reparent_charges(mem_cgroup_from_cont(iter));
6429                rcu_read_lock();
6430        }
6431        rcu_read_unlock();
6432        mem_cgroup_reparent_charges(memcg);
6433
6434        mem_cgroup_destroy_all_caches(memcg);
6435}
6436
6437static void mem_cgroup_css_free(struct cgroup *cont)
6438{
6439        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6440
6441        kmem_cgroup_destroy(memcg);
6442
6443        mem_cgroup_put(memcg);
6444}
6445
6446#ifdef CONFIG_MMU
6447/* Handlers for move charge at task migration. */
6448#define PRECHARGE_COUNT_AT_ONCE 256
6449static int mem_cgroup_do_precharge(unsigned long count)
6450{
6451        int ret = 0;
6452        int batch_count = PRECHARGE_COUNT_AT_ONCE;
6453        struct mem_cgroup *memcg = mc.to;
6454
6455        if (mem_cgroup_is_root(memcg)) {
6456                mc.precharge += count;
6457                /* we don't need css_get for root */
6458                return ret;
6459        }
6460        /* try to charge at once */
6461        if (count > 1) {
6462                struct page_counter *dummy;
6463                /*
6464                 * "memcg" cannot be under rmdir() because we've already verified
6465                 * via cgroup_lock_live_cgroup() that it has not been removed and
6466                 * we are still under the same cgroup_mutex. So we can postpone
6467                 * css_get().
6468                 */
6469                if (page_counter_try_charge(&memcg->memory, count, &dummy))
6470                        goto one_by_one;
6471                if (do_swap_account &&
6472                    page_counter_try_charge(&memcg->memsw, count, &dummy)) {
6473                        page_counter_uncharge(&memcg->memory, count);
6474                        goto one_by_one;
6475                }
6476                mc.precharge += count;
6477                return ret;
6478        }
6479one_by_one:
6480        /* fall back to one by one charge */
6481        while (count--) {
6482                if (signal_pending(current)) {
6483                        ret = -EINTR;
6484                        break;
6485                }
6486                if (!batch_count--) {
6487                        batch_count = PRECHARGE_COUNT_AT_ONCE;
6488                        cond_resched();
6489                }
6490                ret = __mem_cgroup_try_charge(NULL,
6491                                        GFP_KERNEL, 1, &memcg, false);
6492                if (ret)
6493                        /* mem_cgroup_clear_mc() will do uncharge later */
6494                        return ret;
6495                mc.precharge++;
6496        }
6497        return ret;
6498}
6499
6500/**
6501 * get_mctgt_type - get target type of moving charge
6502 * @vma: the vma to which the pte being checked belongs
6503 * @addr: the address corresponding to the pte to be checked
6504 * @ptent: the pte to be checked
6505 * @target: pointer where the target page or swap entry is stored (can be NULL)
6506 *
6507 * Returns
6508 *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
6509 *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
6510 *     move charge. If @target is not NULL, the page is stored in target->page
6511 *     with an extra refcount taken (callers must handle it).
6512 *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
6513 *     target for charge migration. If @target is not NULL, the entry is stored
6514 *     in target->ent.
6515 *
6516 * Called with pte lock held.
6517 */
6518union mc_target {
6519        struct page     *page;
6520        swp_entry_t     ent;
6521};
6522
6523enum mc_target_type {
6524        MC_TARGET_NONE = 0,
6525        MC_TARGET_PAGE,
6526        MC_TARGET_SWAP,
6527};
6528
6529static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6530                                                unsigned long addr, pte_t ptent)
6531{
6532        struct page *page = vm_normal_page(vma, addr, ptent);
6533
6534        if (!page || !page_mapped(page))
6535                return NULL;
6536        if (PageAnon(page)) {
6537                /* we don't move shared anon */
6538                if (!move_anon())
6539                        return NULL;
6540        } else if (!move_file())
6541                /* we ignore mapcount for file pages */
6542                return NULL;
6543        if (!get_page_unless_zero(page))
6544                return NULL;
6545
6546        return page;
6547}
6548
6549#ifdef CONFIG_SWAP
6550static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6551                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
6552{
6553        struct page *page = NULL;
6554        swp_entry_t ent = pte_to_swp_entry(ptent);
6555
6556        if (!move_anon() || non_swap_entry(ent))
6557                return NULL;
6558        /*
6559         * Because lookup_swap_cache() updates some statistics counters,
6560         * we call find_get_page() on the swap address space directly.
6561         */
6562        page = find_get_page(swap_address_space(ent), ent.val);
6563        if (do_swap_account)
6564                entry->val = ent.val;
6565
6566        return page;
6567}
6568#else
6569static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6570                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
6571{
6572        return NULL;
6573}
6574#endif
6575
6576static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6577                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
6578{
6579        struct page *page = NULL;
6580        struct address_space *mapping;
6581        pgoff_t pgoff;
6582
6583        if (!vma->vm_file) /* anonymous vma */
6584                return NULL;
6585        if (!move_file())
6586                return NULL;
6587
6588        mapping = vma->vm_file->f_mapping;
6589        if (pte_none(ptent))
6590                pgoff = linear_page_index(vma, addr);
6591        else /* pte_file(ptent) is true */
6592                pgoff = pte_to_pgoff(ptent);
6593
6594        /* The page is moved even if it's not in this task's RSS (not faulted in). */
6595#ifdef CONFIG_SWAP
6596        /* shmem/tmpfs may report a page as swapped out: account for that too. */
6597        if (shmem_mapping(mapping)) {
6598                page = __find_get_page(mapping, pgoff);
6599                if (radix_tree_exceptional_entry(page)) {
6600                        swp_entry_t swp = radix_to_swp_entry(page);
6601                        if (do_swap_account)
6602                                *entry = swp;
6603                        page = find_get_page(swap_address_space(swp), swp.val);
6604                }
6605        } else
6606                page = find_get_page(mapping, pgoff);
6607#else
6608        page = find_get_page(mapping, pgoff);
6609#endif
6610        return page;
6611}
6612
6613static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6614                unsigned long addr, pte_t ptent, union mc_target *target)
6615{
6616        struct page *page = NULL;
6617        struct page_cgroup *pc;
6618        enum mc_target_type ret = MC_TARGET_NONE;
6619        swp_entry_t ent = { .val = 0 };
6620
6621        if (pte_present(ptent))
6622                page = mc_handle_present_pte(vma, addr, ptent);
6623        else if (is_swap_pte(ptent))
6624                page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6625        else if (pte_none(ptent) || pte_file(ptent))
6626                page = mc_handle_file_pte(vma, addr, ptent, &ent);
6627
6628        if (!page && !ent.val)
6629                return ret;
6630        if (page) {
6631                pc = lookup_page_cgroup(page);
6632                /*
6633                 * Do only a loose check without the page_cgroup lock;
6634                 * mem_cgroup_move_account() checks whether the pc is valid
6635                 * under the lock.
6636                 */
6637                if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6638                        ret = MC_TARGET_PAGE;
6639                        if (target)
6640                                target->page = page;
6641                }
6642                if (!ret || !target)
6643                        put_page(page);
6644        }
6645        /* There is a swap entry and the page either doesn't exist or isn't charged */
6646        if (ent.val && !ret &&
6647                        mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6648                ret = MC_TARGET_SWAP;
6649                if (target)
6650                        target->ent = ent;
6651        }
6652        return ret;
6653}
6654
6655#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6656/*
6657 * We don't consider swapped-out or file-mapped pages because THP does not
6658 * support them for now.
6659 * The caller should make sure that pmd_trans_huge(pmd) is true.
6660 */
6661static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6662                unsigned long addr, pmd_t pmd, union mc_target *target)
6663{
6664        struct page *page = NULL;
6665        struct page_cgroup *pc;
6666        enum mc_target_type ret = MC_TARGET_NONE;
6667
6668        page = pmd_page(pmd);
6669        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6670        if (!move_anon())
6671                return ret;
6672        pc = lookup_page_cgroup(page);
6673        if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6674                ret = MC_TARGET_PAGE;
6675                if (target) {
6676                        get_page(page);
6677                        target->page = page;
6678                }
6679        }
6680        return ret;
6681}
6682#else
6683static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6684                unsigned long addr, pmd_t pmd, union mc_target *target)
6685{
6686        return MC_TARGET_NONE;
6687}
6688#endif
6689
6690static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6691                                        unsigned long addr, unsigned long end,
6692                                        struct mm_walk *walk)
6693{
6694        struct vm_area_struct *vma = walk->private;
6695        pte_t *pte;
6696        spinlock_t *ptl;
6697
6698        if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6699                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6700                        mc.precharge += HPAGE_PMD_NR;
6701                spin_unlock(ptl);
6702                return 0;
6703        }
6704
6705        if (pmd_trans_unstable(pmd))
6706                return 0;
6707        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6708        for (; addr != end; pte++, addr += PAGE_SIZE)
6709                if (get_mctgt_type(vma, addr, *pte, NULL))
6710                        mc.precharge++; /* increment precharge temporarily */
6711        pte_unmap_unlock(pte - 1, ptl);
6712        cond_resched();
6713
6714        return 0;
6715}
6716
6717static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6718{
6719        unsigned long precharge;
6720        struct vm_area_struct *vma;
6721
6722        down_read(&mm->mmap_sem);
6723        for (vma = mm->mmap; vma; vma = vma->vm_next) {
6724                struct mm_walk mem_cgroup_count_precharge_walk = {
6725                        .pmd_entry = mem_cgroup_count_precharge_pte_range,
6726                        .mm = mm,
6727                        .private = vma,
6728                };
6729                if (is_vm_hugetlb_page(vma))
6730                        continue;
6731                walk_page_range(vma->vm_start, vma->vm_end,
6732                                        &mem_cgroup_count_precharge_walk);
6733        }
6734        up_read(&mm->mmap_sem);
6735
6736        precharge = mc.precharge;
6737        mc.precharge = 0;
6738
6739        return precharge;
6740}
6741
6742static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6743{
6744        unsigned long precharge = mem_cgroup_count_precharge(mm);
6745
6746        VM_BUG_ON(mc.moving_task);
6747        mc.moving_task = current;
6748        return mem_cgroup_do_precharge(precharge);
6749}
6750
6751/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6752static void __mem_cgroup_clear_mc(void)
6753{
6754        struct mem_cgroup *from = mc.from;
6755        struct mem_cgroup *to = mc.to;
6756
6757        /* we must uncharge all the leftover precharges from mc.to */
6758        if (mc.precharge) {
6759                __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6760                mc.precharge = 0;
6761        }
6762        /*
6763         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6764         * we must uncharge here.
6765         */
6766        if (mc.moved_charge) {
6767                __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6768                mc.moved_charge = 0;
6769        }
6770        /* we must fixup refcnts and charges */
6771        if (mc.moved_swap) {
6772                /* uncharge swap account from the old cgroup */
6773                if (!mem_cgroup_is_root(mc.from))
6774                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
6775
6776                if (!mem_cgroup_is_root(mc.to)) {
6777                        /*
6778                         * we charged both to->memory and to->memsw, so we
6779                         * should uncharge to->memory.
6780                         */
6781                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
6782                }
6783                __mem_cgroup_put(mc.from, mc.moved_swap);
6784
6785                /* we've already done mem_cgroup_get(mc.to) */
6786                mc.moved_swap = 0;
6787        }
6788        memcg_oom_recover(from);
6789        memcg_oom_recover(to);
6790        wake_up_all(&mc.waitq);
6791}
6792
6793static void mem_cgroup_clear_mc(void)
6794{
6795        struct mem_cgroup *from = mc.from;
6796
6797        /*
6798         * we must clear moving_task before waking up waiters at the end of
6799         * task migration.
6800         */
6801        mc.moving_task = NULL;
6802        __mem_cgroup_clear_mc();
6803        spin_lock(&mc.lock);
6804        mc.from = NULL;
6805        mc.to = NULL;
6806        spin_unlock(&mc.lock);
6807        mem_cgroup_end_move(from);
6808}
6809
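    /*
     * Charge moving at task migration.  Whether anything is moved is decided
     * by the destination group's "memory.move_charge_at_immigrate" bitmask
     * (see move_anon()/move_file()); as an illustration, and assuming the
     * conventional bit layout, "echo 1 > memory.move_charge_at_immigrate"
     * enables moving of anonymous pages before migrating a task into the
     * group.
     */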
6810static int mem_cgroup_can_attach(struct cgroup *cgroup,
6811                                 struct cgroup_taskset *tset)
6812{
6813        struct task_struct *p = cgroup_taskset_first(tset);
6814        int ret = 0;
6815        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
6816        unsigned long move_charge_at_immigrate;
6817
6818        /*
6819         * We are now committed to this value, whatever it is. Changes to
6820         * this tunable will only affect upcoming migrations, not the current
6821         * one, so we save it here and use the saved value throughout.
6822         */
6823        move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
6824        if (move_charge_at_immigrate) {
6825                struct mm_struct *mm;
6826                struct mem_cgroup *from = mem_cgroup_from_task(p);
6827
6828                VM_BUG_ON(from == memcg);
6829
6830                mm = get_task_mm(p);
6831                if (!mm)
6832                        return 0;
6833                /* We move charges only when we move the owner of the mm */
6834                if (mm->owner == p) {
6835                        VM_BUG_ON(mc.from);
6836                        VM_BUG_ON(mc.to);
6837                        VM_BUG_ON(mc.precharge);
6838                        VM_BUG_ON(mc.moved_charge);
6839                        VM_BUG_ON(mc.moved_swap);
6840                        mem_cgroup_start_move(from);
6841                        spin_lock(&mc.lock);
6842                        mc.from = from;
6843                        mc.to = memcg;
6844                        mc.immigrate_flags = move_charge_at_immigrate;
6845                        spin_unlock(&mc.lock);
6846                        /* We set mc.moving_task later */
6847
6848                        ret = mem_cgroup_precharge_mc(mm);
6849                        if (ret)
6850                                mem_cgroup_clear_mc();
6851                }
6852                mmput(mm);
6853        }
6854        return ret;
6855}
6856
6857static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
6858                                     struct cgroup_taskset *tset)
6859{
6860        mem_cgroup_clear_mc();
6861}
6862
6863static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6864                                unsigned long addr, unsigned long end,
6865                                struct mm_walk *walk)
6866{
6867        int ret = 0;
6868        struct vm_area_struct *vma = walk->private;
6869        pte_t *pte;
6870        spinlock_t *ptl;
6871        enum mc_target_type target_type;
6872        union mc_target target;
6873        struct page *page;
6874        struct page_cgroup *pc;
6875
6876        /*
6877         * We don't take compound_lock() here, but no race with thp splitting
6878         * can happen because:
6879         *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
6880         *    under splitting, which means there's no concurrent thp split,
6881         *  - if another thread runs into split_huge_page() just after we
6882         *    entered this if-block, the thread must wait for page table lock
6883         *    to be unlocked in __split_huge_page_splitting(), where the main
6884         *    part of thp split is not executed yet.
6885         */
6886        if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6887                if (mc.precharge < HPAGE_PMD_NR) {
6888                        spin_unlock(ptl);
6889                        return 0;
6890                }
6891                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6892                if (target_type == MC_TARGET_PAGE) {
6893                        page = target.page;
6894                        if (!isolate_lru_page(page)) {
6895                                pc = lookup_page_cgroup(page);
6896                                if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6897                                                        pc, mc.from, mc.to)) {
6898                                        mc.precharge -= HPAGE_PMD_NR;
6899                                        mc.moved_charge += HPAGE_PMD_NR;
6900                                }
6901                                putback_lru_page(page);
6902                        }
6903                        put_page(page);
6904                }
6905                spin_unlock(ptl);
6906                return 0;
6907        }
6908
6909        if (pmd_trans_unstable(pmd))
6910                return 0;
6911retry:
6912        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6913        for (; addr != end; addr += PAGE_SIZE) {
6914                pte_t ptent = *(pte++);
6915                swp_entry_t ent;
6916
6917                if (!mc.precharge)
6918                        break;
6919
6920                switch (get_mctgt_type(vma, addr, ptent, &target)) {
6921                case MC_TARGET_PAGE:
6922                        page = target.page;
6923                        if (isolate_lru_page(page))
6924                                goto put;
6925                        pc = lookup_page_cgroup(page);
6926                        if (!mem_cgroup_move_account(page, 1, pc,
6927                                                     mc.from, mc.to)) {
6928                                mc.precharge--;
6929                                /* we uncharge from mc.from later. */
6930                                mc.moved_charge++;
6931                        }
6932                        putback_lru_page(page);
6933put:                    /* get_mctgt_type() gets the page */
6934                        put_page(page);
6935                        break;
6936                case MC_TARGET_SWAP:
6937                        ent = target.ent;
6938                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6939                                mc.precharge--;
6940                                /* we fixup refcnts and charges later. */
6941                                mc.moved_swap++;
6942                        }
6943                        break;
6944                default:
6945                        break;
6946                }
6947        }
6948        pte_unmap_unlock(pte - 1, ptl);
6949        cond_resched();
6950
6951        if (addr != end) {
6952                /*
6953                 * We have consumed all the precharges we got in can_attach().
6954                 * We try to charge one by one, but don't do any additional
6955                 * charges to mc.to once a charge has failed during the attach()
6956                 * phase.
6957                 */
6958                ret = mem_cgroup_do_precharge(1);
6959                if (!ret)
6960                        goto retry;
6961        }
6962
6963        return ret;
6964}
6965
6966static void mem_cgroup_move_charge(struct mm_struct *mm)
6967{
6968        struct vm_area_struct *vma;
6969
6970        lru_add_drain_all();
6971retry:
6972        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6973                /*
6974                 * Someone who is holding the mmap_sem might be waiting on our
6975                 * waitq, so we cancel all extra charges, wake up all waiters,
6976                 * and retry. Because we cancel the precharges, we might not be
6977                 * able to move enough charges, but moving charge is a best-effort
6978                 * feature anyway, so it isn't a big problem.
6979                 */
6980                __mem_cgroup_clear_mc();
6981                cond_resched();
6982                goto retry;
6983        }
6984        for (vma = mm->mmap; vma; vma = vma->vm_next) {
6985                int ret;
6986                struct mm_walk mem_cgroup_move_charge_walk = {
6987                        .pmd_entry = mem_cgroup_move_charge_pte_range,
6988                        .mm = mm,
6989                        .private = vma,
6990                };
6991                if (is_vm_hugetlb_page(vma))
6992                        continue;
6993                ret = walk_page_range(vma->vm_start, vma->vm_end,
6994                                                &mem_cgroup_move_charge_walk);
6995                if (ret)
6996                        /*
6997                         * This means we have consumed all precharges and
6998                         * failed to do an additional charge. Just abandon here.
6999                         */
7000                        break;
7001        }
7002        up_read(&mm->mmap_sem);
7003}
7004
7005static void mem_cgroup_move_task(struct cgroup *cont,
7006                                 struct cgroup_taskset *tset)
7007{
7008        struct task_struct *p = cgroup_taskset_first(tset);
7009        struct mm_struct *mm = get_task_mm(p);
7010
7011        if (mm) {
7012                if (mc.to)
7013                        mem_cgroup_move_charge(mm);
7014                mmput(mm);
7015        }
7016        if (mc.to)
7017                mem_cgroup_clear_mc();
7018}
7019#else   /* !CONFIG_MMU */
7020static int mem_cgroup_can_attach(struct cgroup *cgroup,
7021                                 struct cgroup_taskset *tset)
7022{
7023        return 0;
7024}
7025static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
7026                                     struct cgroup_taskset *tset)
7027{
7028}
7029static void mem_cgroup_move_task(struct cgroup *cont,
7030                                 struct cgroup_taskset *tset)
7031{
7032}
7033#endif
7034
7035/*
7036 * The cgroup core retains root cgroups across [un]mount cycles, making it
7037 * necessary to verify the sane_behavior flag on each mount attempt.
7038 */
7039static void mem_cgroup_bind(struct cgroup *root)
7040{
7041        /*
7042         * use_hierarchy is forced with sane_behavior.  cgroup core
7043         * guarantees that @root doesn't have any children, so turning it
7044         * on for the root memcg is enough.
7045         */
7046        if (cgroup_sane_behavior(root))
7047                mem_cgroup_from_cont(root)->use_hierarchy = true;
7048}
7049
7050struct cgroup_subsys mem_cgroup_subsys = {
7051        .name = "memory",
7052        .subsys_id = mem_cgroup_subsys_id,
7053        .css_alloc = mem_cgroup_css_alloc,
7054        .css_online = mem_cgroup_css_online,
7055        .css_offline = mem_cgroup_css_offline,
7056        .css_free = mem_cgroup_css_free,
7057        .can_attach = mem_cgroup_can_attach,
7058        .cancel_attach = mem_cgroup_cancel_attach,
7059        .attach = mem_cgroup_move_task,
7060        .bind = mem_cgroup_bind,
7061        .base_cftypes = mem_cgroup_files,
7062        .early_init = 0,
7063};
7064
7065#ifdef CONFIG_MEMCG_SWAP
7066static int __init enable_swap_account(char *s)
7067{
7068        /* "1" enables swap accounting, "0" disables it */
7069        if (!strcmp(s, "1"))
7070                really_do_swap_account = 1;
7071        else if (!strcmp(s, "0"))
7072                really_do_swap_account = 0;
7073        return 1;
7074}
7075__setup("swapaccount=", enable_swap_account);
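    /*
     * Boot command line usage: "swapaccount=0" turns memsw accounting off and
     * "swapaccount=1" turns it on, overriding the CONFIG_MEMCG_SWAP_ENABLED
     * default; any other value leaves the compiled-in default untouched.
     */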
7076
7077static void __init memsw_file_init(void)
7078{
7079        WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
7080}
7081
7082static void __init enable_swap_cgroup(void)
7083{
7084        if (!mem_cgroup_disabled() && really_do_swap_account) {
7085                do_swap_account = 1;
7086                memsw_file_init();
7087        }
7088}
7089
7090#else
7091static void __init enable_swap_cgroup(void)
7092{
7093}
7094#endif
7095
7096/*
7097 * subsys_initcall() for memory controller.
7098 *
7099 * Some parts like hotcpu_notifier() have to be initialized from this context
7100 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
7101 * everything that doesn't depend on a specific mem_cgroup structure should
7102 * be initialized from here.
7103 */
7104static int __init mem_cgroup_init(void)
7105{
7106        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7107        enable_swap_cgroup();
7108        mem_cgroup_soft_limit_tree_init();
7109        memcg_stock_init();
7110        return 0;
7111}
7112subsys_initcall(mem_cgroup_init);
7113