/* memcontrol.c - Memory Controller: cgroup-based accounting and limiting of memory */
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Whether accounting of mem+swap usage is enabled at runtime */
int do_swap_account __read_mostly;

/* Remembers the Kconfig default until the boot option is parsed */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif

/* Human-readable names for the entries of enum mem_cgroup_stat_index */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};

/* Events reported per memcg, in addition to the statistics above */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,
	MEM_CGROUP_EVENTS_PGPGOUT,
	MEM_CGROUP_EVENTS_PGFAULT,
	MEM_CGROUP_EVENTS_PGMAJFAULT,
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * Per-memcg activity is sampled via the per-cpu nr_page_events counter.
 * When it crosses one of the thresholds below, the corresponding target
 * (threshold notification, soft-limit tree update, NUMA info refresh)
 * is re-armed and processed; see mem_cgroup_event_ratelimit().
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * Last scanned hierarchy member.  Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, shared with the per-zone reclaim cookies */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* amount over the soft limit */
	bool			on_tree;	/* queued on soft_limit_tree? */
	struct mem_cgroup	*memcg;		/* back pointer to the owning memcg */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their soft limits are maintained in a RB-tree, independent
 * of their hierarchy representation.
 */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare array kept preallocated so that thresholds can be added or
	 * removed by building the new array here and swapping it in for
	 * the primary one without blocking concurrent readers.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM notifier */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure.  The memory controller accounts
 * both page cache and RSS per cgroup, with optional mem+swap and kernel
 * memory counters; per-node/zone LRU and reclaim state hangs off the
 * nodeinfo[] array at the end.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;

	/* the counter to account for memory usage */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/* the counter to account for mem+swap usage */
	struct res_counter memsw;

	/* the counter to account for kernel memory usage */
	struct res_counter kmem;

	/* Should the accounting and control be hierarchical, per subtree? */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* see KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup?  And what type of charges should we move?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;

	/* percpu counters */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or for other synchronizations,
	 * see mem_cgroup_read_stat() and mem_cgroup_drain_pcp_counter().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	/*
	 * bumped whenever a descendant is destroyed; invalidates cached
	 * reclaim iterator positions, see mem_cgroup_iter_invalidate().
	 */
	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list, but per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* id allocated from kmem_limited_groups, identifies this
	 * memcg's kmem caches */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}

/* internal-only representation of the kmem accounting state */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when the limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))

355#ifdef CONFIG_MEMCG_KMEM
356static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
357{
358 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
359}
360
361static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
362{
363 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
364}
365
366static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
369}
370
371static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
372{
373 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
377{
378
379
380
381
382 smp_wmb();
383 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
384 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
385}
386
387static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
388{
389 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
390 &memcg->kmem_account_flags);
391}
392#endif
393
394
395
396
397
398
399enum move_type {
400 MOVE_CHARGE_TYPE_ANON,
401 MOVE_CHARGE_TYPE_FILE,
402 NR_MOVE_TYPE,
403};
404
405
406static struct move_charge_struct {
407 spinlock_t lock;
408 struct mem_cgroup *from;
409 struct mem_cgroup *to;
410 unsigned long immigrate_flags;
411 unsigned long precharge;
412 unsigned long moved_charge;
413 unsigned long moved_swap;
414 struct task_struct *moving_task;
415 wait_queue_head_t waitq;
416} mc = {
417 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
418 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
419};
420
421static bool move_anon(void)
422{
423 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
424}
425
426static bool move_file(void)
427{
428 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
429}
430
431
432
433
434
435#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
436#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
437
438enum charge_type {
439 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
440 MEM_CGROUP_CHARGE_TYPE_ANON,
441 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
442 MEM_CGROUP_CHARGE_TYPE_DROP,
443 NR_CHARGE_TYPE,
444};
445
446
447enum res_type {
448 _MEM,
449 _MEMSWAP,
450 _OOM_TYPE,
451 _KMEM,
452};
453
454#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
455#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
456#define MEMFILE_ATTR(val) ((val) & 0xffff)
457
458#define OOM_CONTROL (0)
459
460
461
462
463#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
464#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
465#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
466#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
467
468
469
470
471
472
473static DEFINE_MUTEX(memcg_create_mutex);
474
475struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
476{
477 return s ? container_of(s, struct mem_cgroup, css) : NULL;
478}
479
480
481struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
482{
483 if (!memcg)
484 memcg = root_mem_cgroup;
485 return &memcg->vmpressure;
486}
487
488struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
489{
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491}
492
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{
500 return (memcg == root_mem_cgroup);
501}
502
503
504
505
506
507#define MEM_CGROUP_ID_MAX USHRT_MAX
508
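/*
 * Pack a memcg into a non-zero 16-bit id (the cgroup id plus one, so
 * that 0 can mean "no memcg", e.g. in swap records) and back again.
 */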
509static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
510{
511
512
513
514
515 return memcg->css.cgroup->id + 1;
516}
517
518static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
519{
520 struct cgroup_subsys_state *css;
521
522 css = css_from_id(id - 1, &mem_cgroup_subsys);
523 return mem_cgroup_from_css(css);
524}
525
526
527#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
528
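/*
 * Called when a socket is set up: if TCP memory accounting is enabled
 * for the owning task's memcg, pin that memcg's cg_proto on the socket
 * so its buffers are charged to the right group.
 */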
529void sock_update_memcg(struct sock *sk)
530{
531 if (mem_cgroup_sockets_enabled) {
532 struct mem_cgroup *memcg;
533 struct cg_proto *cg_proto;
534
535 BUG_ON(!sk->sk_prot->proto_cgroup);
536
537
538
539
540
541
542
543
544
545 if (sk->sk_cgrp) {
546 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
547 css_get(&sk->sk_cgrp->memcg->css);
548 return;
549 }
550
551 rcu_read_lock();
552 memcg = mem_cgroup_from_task(current);
553 cg_proto = sk->sk_prot->proto_cgroup(memcg);
554 if (!mem_cgroup_is_root(memcg) &&
555 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
556 sk->sk_cgrp = cg_proto;
557 }
558 rcu_read_unlock();
559 }
560}
561EXPORT_SYMBOL(sock_update_memcg);
562
563void sock_release_memcg(struct sock *sk)
564{
565 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
566 struct mem_cgroup *memcg;
567 WARN_ON(!sk->sk_cgrp->memcg);
568 memcg = sk->sk_cgrp->memcg;
569 css_put(&sk->sk_cgrp->memcg->css);
570 }
571}
572
573struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
574{
575 if (!memcg || mem_cgroup_is_root(memcg))
576 return NULL;
577
578 return &memcg->tcp_mem;
579}
580EXPORT_SYMBOL(tcp_proto_cgroup);
581
582static void disarm_sock_keys(struct mem_cgroup *memcg)
583{
584 if (!memcg_proto_activated(&memcg->tcp_mem))
585 return;
586 static_key_slow_dec(&memcg_socket_limit_enabled);
587}
588#else
589static void disarm_sock_keys(struct mem_cgroup *memcg)
590{
591}
592#endif
593
594#ifdef CONFIG_MEMCG_KMEM
595
596
597
598
599
600
601
602
603
604
605
606
607static DEFINE_IDA(kmem_limited_groups);
608int memcg_limited_groups_array_size;
609
610
611
612
613
614
615
616
617
618
619
620
621
622#define MEMCG_CACHES_MIN_SIZE 4
623#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
624
625
626
627
628
629
630
631struct static_key memcg_kmem_enabled_key;
632EXPORT_SYMBOL(memcg_kmem_enabled_key);
633
634static void disarm_kmem_keys(struct mem_cgroup *memcg)
635{
636 if (memcg_kmem_is_active(memcg)) {
637 static_key_slow_dec(&memcg_kmem_enabled_key);
638 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
639 }
640
641
642
643
644 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
645}
646#else
647static void disarm_kmem_keys(struct mem_cgroup *memcg)
648{
649}
650#endif
651
652static void disarm_static_keys(struct mem_cgroup *memcg)
653{
654 disarm_sock_keys(memcg);
655 disarm_kmem_keys(memcg);
656}
657
658static void drain_all_stock_async(struct mem_cgroup *memcg);
659
660static struct mem_cgroup_per_zone *
661mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
662{
663 VM_BUG_ON((unsigned)nid >= nr_node_ids);
664 return &memcg->nodeinfo[nid]->zoneinfo[zid];
665}
666
667struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
668{
669 return &memcg->css;
670}
671
672static struct mem_cgroup_per_zone *
673page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
674{
675 int nid = page_to_nid(page);
676 int zid = page_zonenum(page);
677
678 return mem_cgroup_zoneinfo(memcg, nid, zid);
679}
680
681static struct mem_cgroup_tree_per_zone *
682soft_limit_tree_node_zone(int nid, int zid)
683{
684 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
685}
686
687static struct mem_cgroup_tree_per_zone *
688soft_limit_tree_from_page(struct page *page)
689{
690 int nid = page_to_nid(page);
691 int zid = page_zonenum(page);
692
693 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
694}
695
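/*
 * Insert @mz into the per-zone soft-limit RB-tree, ordered by how far
 * the memcg's usage exceeds its soft limit.  Caller holds mctz->lock.
 */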
696static void
697__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
698 struct mem_cgroup_per_zone *mz,
699 struct mem_cgroup_tree_per_zone *mctz,
700 unsigned long long new_usage_in_excess)
701{
702 struct rb_node **p = &mctz->rb_root.rb_node;
703 struct rb_node *parent = NULL;
704 struct mem_cgroup_per_zone *mz_node;
705
706 if (mz->on_tree)
707 return;
708
709 mz->usage_in_excess = new_usage_in_excess;
710 if (!mz->usage_in_excess)
711 return;
712 while (*p) {
713 parent = *p;
714 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
715 tree_node);
716 if (mz->usage_in_excess < mz_node->usage_in_excess)
717 p = &(*p)->rb_left;
718
719
720
721
722 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
723 p = &(*p)->rb_right;
724 }
725 rb_link_node(&mz->tree_node, parent, p);
726 rb_insert_color(&mz->tree_node, &mctz->rb_root);
727 mz->on_tree = true;
728}
729
730static void
731__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
732 struct mem_cgroup_per_zone *mz,
733 struct mem_cgroup_tree_per_zone *mctz)
734{
735 if (!mz->on_tree)
736 return;
737 rb_erase(&mz->tree_node, &mctz->rb_root);
738 mz->on_tree = false;
739}
740
741static void
742mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
743 struct mem_cgroup_per_zone *mz,
744 struct mem_cgroup_tree_per_zone *mctz)
745{
746 spin_lock(&mctz->lock);
747 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
748 spin_unlock(&mctz->lock);
749}
750
751
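/*
 * Re-position @memcg and each of its ancestors in the soft-limit tree
 * of @page's zone whenever their excess over the soft limit changes.
 * Called from memcg_check_events() when the softlimit target fires.
 */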
752static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
753{
754 unsigned long long excess;
755 struct mem_cgroup_per_zone *mz;
756 struct mem_cgroup_tree_per_zone *mctz;
757 int nid = page_to_nid(page);
758 int zid = page_zonenum(page);
759 mctz = soft_limit_tree_from_page(page);
760
761
762
763
764
765 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
766 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
767 excess = res_counter_soft_limit_excess(&memcg->res);
768
769
770
771
772 if (excess || mz->on_tree) {
773 spin_lock(&mctz->lock);
774
775 if (mz->on_tree)
776 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
777
778
779
780
781 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
782 spin_unlock(&mctz->lock);
783 }
784 }
785}
786
787static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
788{
789 int node, zone;
790 struct mem_cgroup_per_zone *mz;
791 struct mem_cgroup_tree_per_zone *mctz;
792
793 for_each_node(node) {
794 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
795 mz = mem_cgroup_zoneinfo(memcg, node, zone);
796 mctz = soft_limit_tree_node_zone(node, zone);
797 mem_cgroup_remove_exceeded(memcg, mz, mctz);
798 }
799 }
800}
801
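/*
 * Pick the memcg/zone pair that exceeds its soft limit by the largest
 * amount (the rightmost tree node), remove it from the tree and return
 * it with a css reference held; NULL if the tree is empty.
 */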
802static struct mem_cgroup_per_zone *
803__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
804{
805 struct rb_node *rightmost = NULL;
806 struct mem_cgroup_per_zone *mz;
807
808retry:
809 mz = NULL;
810 rightmost = rb_last(&mctz->rb_root);
811 if (!rightmost)
812 goto done;
813
814 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
815
816
817
818
819
820 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
821 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
822 !css_tryget(&mz->memcg->css))
823 goto retry;
824done:
825 return mz;
826}
827
828static struct mem_cgroup_per_zone *
829mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
830{
831 struct mem_cgroup_per_zone *mz;
832
833 spin_lock(&mctz->lock);
834 mz = __mem_cgroup_largest_soft_limit_node(mctz);
835 spin_unlock(&mctz->lock);
836 return mz;
837}

/*
 * Return the sum of the per-cpu counter @idx for @memcg.  Counts that
 * were folded into nocpu_base when a CPU went offline are included.
 * The result can be transiently negative because per-cpu updates are
 * not synchronized against this summation.
 */
858static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
859 enum mem_cgroup_stat_index idx)
860{
861 long val = 0;
862 int cpu;
863
864 get_online_cpus();
865 for_each_online_cpu(cpu)
866 val += per_cpu(memcg->stat->count[idx], cpu);
867#ifdef CONFIG_HOTPLUG_CPU
868 spin_lock(&memcg->pcp_counter_lock);
869 val += memcg->nocpu_base.count[idx];
870 spin_unlock(&memcg->pcp_counter_lock);
871#endif
872 put_online_cpus();
873 return val;
874}
875
876static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
877 bool charge)
878{
879 int val = (charge) ? 1 : -1;
880 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
881}
882
883static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
884 enum mem_cgroup_events_index idx)
885{
886 unsigned long val = 0;
887 int cpu;
888
889 get_online_cpus();
890 for_each_online_cpu(cpu)
891 val += per_cpu(memcg->stat->events[idx], cpu);
892#ifdef CONFIG_HOTPLUG_CPU
893 spin_lock(&memcg->pcp_counter_lock);
894 val += memcg->nocpu_base.events[idx];
895 spin_unlock(&memcg->pcp_counter_lock);
896#endif
897 put_online_cpus();
898 return val;
899}
900
901static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
902 struct page *page,
903 bool anon, int nr_pages)
904{
905 preempt_disable();
906
907
908
909
910
911 if (anon)
912 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
913 nr_pages);
914 else
915 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
916 nr_pages);
917
918 if (PageTransHuge(page))
919 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
920 nr_pages);
921
922
923 if (nr_pages > 0)
924 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
925 else {
926 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
927 nr_pages = -nr_pages;
928 }
929
930 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
931
932 preempt_enable();
933}
934
935unsigned long
936mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
937{
938 struct mem_cgroup_per_zone *mz;
939
940 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
941 return mz->lru_size[lru];
942}
943
944static unsigned long
945mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
946 unsigned int lru_mask)
947{
948 struct mem_cgroup_per_zone *mz;
949 enum lru_list lru;
950 unsigned long ret = 0;
951
952 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
953
954 for_each_lru(lru) {
955 if (BIT(lru) & lru_mask)
956 ret += mz->lru_size[lru];
957 }
958 return ret;
959}
960
961static unsigned long
962mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
963 int nid, unsigned int lru_mask)
964{
965 u64 total = 0;
966 int zid;
967
968 for (zid = 0; zid < MAX_NR_ZONES; zid++)
969 total += mem_cgroup_zone_nr_lru_pages(memcg,
970 nid, zid, lru_mask);
971
972 return total;
973}
974
975static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
976 unsigned int lru_mask)
977{
978 int nid;
979 u64 total = 0;
980
981 for_each_node_state(nid, N_MEMORY)
982 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
983 return total;
984}
985
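/*
 * Returns true when enough page events have accumulated on this cpu
 * since the last time @target fired, and re-arms the target.
 */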
986static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
987 enum mem_cgroup_events_target target)
988{
989 unsigned long val, next;
990
991 val = __this_cpu_read(memcg->stat->nr_page_events);
992 next = __this_cpu_read(memcg->stat->targets[target]);
993
994 if ((long)next - (long)val < 0) {
995 switch (target) {
996 case MEM_CGROUP_TARGET_THRESH:
997 next = val + THRESHOLDS_EVENTS_TARGET;
998 break;
999 case MEM_CGROUP_TARGET_SOFTLIMIT:
1000 next = val + SOFTLIMIT_EVENTS_TARGET;
1001 break;
1002 case MEM_CGROUP_TARGET_NUMAINFO:
1003 next = val + NUMAINFO_EVENTS_TARGET;
1004 break;
1005 default:
1006 break;
1007 }
1008 __this_cpu_write(memcg->stat->targets[target], next);
1009 return true;
1010 }
1011 return false;
1012}

/*
 * Check whether any of the rate-limited per-memcg events (threshold
 * notifications, soft-limit tree updates, NUMA info refresh) are due
 * and handle them.  Called from the charge/uncharge paths.
 */
1018static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1019{
1020 preempt_disable();
1021
1022 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1023 MEM_CGROUP_TARGET_THRESH))) {
1024 bool do_softlimit;
1025 bool do_numainfo __maybe_unused;
1026
1027 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1028 MEM_CGROUP_TARGET_SOFTLIMIT);
1029#if MAX_NUMNODES > 1
1030 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1031 MEM_CGROUP_TARGET_NUMAINFO);
1032#endif
1033 preempt_enable();
1034
1035 mem_cgroup_threshold(memcg);
1036 if (unlikely(do_softlimit))
1037 mem_cgroup_update_tree(memcg, page);
1038#if MAX_NUMNODES > 1
1039 if (unlikely(do_numainfo))
1040 atomic_inc(&memcg->numainfo_events);
1041#endif
1042 } else
1043 preempt_enable();
1044}
1045
1046struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1047{
1048
1049
1050
1051
1052
1053 if (unlikely(!p))
1054 return NULL;
1055
1056 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
1057}
1058
1059struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1060{
1061 struct mem_cgroup *memcg = NULL;
1062
1063 if (!mm)
1064 return NULL;
1065
1066
1067
1068
1069
1070 rcu_read_lock();
1071 do {
1072 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1073 if (unlikely(!memcg))
1074 break;
1075 } while (!css_tryget(&memcg->css));
1076 rcu_read_unlock();
1077 return memcg;
1078}

/*
 * Return the next memcg in a pre-order walk of @root's hierarchy after
 * @last_visited, taking a css reference on the result.  Returns NULL
 * at the end of the walk.
 */
1086static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1087 struct mem_cgroup *last_visited)
1088{
1089 struct cgroup_subsys_state *prev_css, *next_css;
1090
1091 prev_css = last_visited ? &last_visited->css : NULL;
1092skip_node:
1093 next_css = css_next_descendant_pre(prev_css, &root->css);
1094
1095
1096
1097
1098
1099
1100
1101
1102 if (next_css) {
1103 struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
1104
1105 if (css_tryget(&mem->css))
1106 return mem;
1107 else {
1108 prev_css = next_css;
1109 goto skip_node;
1110 }
1111 }
1112
1113 return NULL;
1114}
1115
1116static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1117{
1118
1119
1120
1121
1122
1123 atomic_inc(&root->dead_count);
1124}
1125
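/*
 * Load the cached iterator position for a shared reclaim walk.  The
 * cached memcg is only used if no memcg in the hierarchy has been
 * destroyed since it was cached (dead_count check) and a reference can
 * still be taken on it.
 */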
1126static struct mem_cgroup *
1127mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1128 struct mem_cgroup *root,
1129 int *sequence)
1130{
1131 struct mem_cgroup *position = NULL;
1132
1133
1134
1135
1136
1137
1138
1139
1140 *sequence = atomic_read(&root->dead_count);
1141 if (iter->last_dead_count == *sequence) {
1142 smp_rmb();
1143 position = iter->last_visited;
1144 if (position && !css_tryget(&position->css))
1145 position = NULL;
1146 }
1147 return position;
1148}
1149
1150static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1151 struct mem_cgroup *last_visited,
1152 struct mem_cgroup *new_position,
1153 int sequence)
1154{
1155 if (last_visited)
1156 css_put(&last_visited->css);
1157
1158
1159
1160
1161
1162
1163 iter->last_visited = new_position;
1164 smp_wmb();
1165 iter->last_dead_count = sequence;
1166}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent invocations
 * for reference counting, or use mem_cgroup_iter_break() to cancel a
 * hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
1185struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1186 struct mem_cgroup *prev,
1187 struct mem_cgroup_reclaim_cookie *reclaim)
1188{
1189 struct mem_cgroup *memcg = NULL;
1190 struct mem_cgroup *last_visited = NULL;
1191
1192 if (mem_cgroup_disabled())
1193 return NULL;
1194
1195 if (!root)
1196 root = root_mem_cgroup;
1197
1198 if (prev && !reclaim)
1199 last_visited = prev;
1200
1201 if (!root->use_hierarchy && root != root_mem_cgroup) {
1202 if (prev)
1203 goto out_css_put;
1204 return root;
1205 }
1206
1207 rcu_read_lock();
1208 while (!memcg) {
1209 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1210 int uninitialized_var(seq);
1211
1212 if (reclaim) {
1213 int nid = zone_to_nid(reclaim->zone);
1214 int zid = zone_idx(reclaim->zone);
1215 struct mem_cgroup_per_zone *mz;
1216
1217 mz = mem_cgroup_zoneinfo(root, nid, zid);
1218 iter = &mz->reclaim_iter[reclaim->priority];
1219 if (prev && reclaim->generation != iter->generation) {
1220 iter->last_visited = NULL;
1221 goto out_unlock;
1222 }
1223
1224 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1225 }
1226
1227 memcg = __mem_cgroup_iter_next(root, last_visited);
1228
1229 if (reclaim) {
1230 mem_cgroup_iter_update(iter, last_visited, memcg, seq);
1231
1232 if (!memcg)
1233 iter->generation++;
1234 else if (!prev && memcg)
1235 reclaim->generation = iter->generation;
1236 }
1237
1238 if (prev && !memcg)
1239 goto out_unlock;
1240 }
1241out_unlock:
1242 rcu_read_unlock();
1243out_css_put:
1244 if (prev && prev != root)
1245 css_put(&prev->css);
1246
1247 return memcg;
1248}
1249
1250
1251
1252
1253
1254
1255void mem_cgroup_iter_break(struct mem_cgroup *root,
1256 struct mem_cgroup *prev)
1257{
1258 if (!root)
1259 root = root_mem_cgroup;
1260 if (prev && prev != root)
1261 css_put(&prev->css);
1262}
1263
1264
1265
1266
1267
1268
1269#define for_each_mem_cgroup_tree(iter, root) \
1270 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1271 iter != NULL; \
1272 iter = mem_cgroup_iter(root, iter, NULL))
1273
1274#define for_each_mem_cgroup(iter) \
1275 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1276 iter != NULL; \
1277 iter = mem_cgroup_iter(NULL, iter, NULL))
1278
1279void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1280{
1281 struct mem_cgroup *memcg;
1282
1283 rcu_read_lock();
1284 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1285 if (unlikely(!memcg))
1286 goto out;
1287
1288 switch (idx) {
1289 case PGFAULT:
1290 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1291 break;
1292 case PGMAJFAULT:
1293 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1294 break;
1295 default:
1296 BUG();
1297 }
1298out:
1299 rcu_read_unlock();
1300}
1301EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1313 struct mem_cgroup *memcg)
1314{
1315 struct mem_cgroup_per_zone *mz;
1316 struct lruvec *lruvec;
1317
1318 if (mem_cgroup_disabled()) {
1319 lruvec = &zone->lruvec;
1320 goto out;
1321 }
1322
1323 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1324 lruvec = &mz->lruvec;
1325out:
1326
1327
1328
1329
1330
1331 if (unlikely(lruvec->zone != zone))
1332 lruvec->zone = zone;
1333 return lruvec;
1334}

/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 *
 * The returned lruvec is the one of the memcg the page is (or will be)
 * charged to; pages that are neither on an LRU list nor charged are
 * attributed to the root memcg.
 */
1355struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1356{
1357 struct mem_cgroup_per_zone *mz;
1358 struct mem_cgroup *memcg;
1359 struct page_cgroup *pc;
1360 struct lruvec *lruvec;
1361
1362 if (mem_cgroup_disabled()) {
1363 lruvec = &zone->lruvec;
1364 goto out;
1365 }
1366
1367 pc = lookup_page_cgroup(page);
1368 memcg = pc->mem_cgroup;
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1380 pc->mem_cgroup = memcg = root_mem_cgroup;
1381
1382 mz = page_cgroup_zoneinfo(memcg, page);
1383 lruvec = &mz->lruvec;
1384out:
1385
1386
1387
1388
1389
1390 if (unlikely(lruvec->zone != zone))
1391 lruvec->zone = zone;
1392 return lruvec;
1393}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 */
1404void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1405 int nr_pages)
1406{
1407 struct mem_cgroup_per_zone *mz;
1408 unsigned long *lru_size;
1409
1410 if (mem_cgroup_disabled())
1411 return;
1412
1413 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1414 lru_size = mz->lru_size + lru;
1415 *lru_size += nr_pages;
1416 VM_BUG_ON((long)(*lru_size) < 0);
1417}
1418
1419
1420
1421
1422
1423bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1424 struct mem_cgroup *memcg)
1425{
1426 if (root_memcg == memcg)
1427 return true;
1428 if (!root_memcg->use_hierarchy || !memcg)
1429 return false;
1430 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1431}
1432
1433static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1434 struct mem_cgroup *memcg)
1435{
1436 bool ret;
1437
1438 rcu_read_lock();
1439 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1440 rcu_read_unlock();
1441 return ret;
1442}
1443
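/*
 * Returns true if @task is accounted to @memcg or, with use_hierarchy,
 * to one of its descendants.
 */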
1444bool task_in_mem_cgroup(struct task_struct *task,
1445 const struct mem_cgroup *memcg)
1446{
1447 struct mem_cgroup *curr = NULL;
1448 struct task_struct *p;
1449 bool ret;
1450
1451 p = find_lock_task_mm(task);
1452 if (p) {
1453 curr = try_get_mem_cgroup_from_mm(p->mm);
1454 task_unlock(p);
1455 } else {
1456
1457
1458
1459
1460
1461 rcu_read_lock();
1462 curr = mem_cgroup_from_task(task);
1463 if (curr)
1464 css_get(&curr->css);
1465 rcu_read_unlock();
1466 }
1467 if (!curr)
1468 return false;
1469
1470
1471
1472
1473
1474
1475 ret = mem_cgroup_same_or_subtree(memcg, curr);
1476 css_put(&curr->css);
1477 return ret;
1478}
1479
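/*
 * Decide whether the anon LRU of @lruvec is short on inactive pages.
 * The target inactive:active ratio grows with the square root of the
 * anon size (1:1 below 1GB, about 1:3 at 1GB, 1:10 at 10GB), mirroring
 * the global reclaim heuristic.
 */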
1480int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1481{
1482 unsigned long inactive_ratio;
1483 unsigned long inactive;
1484 unsigned long active;
1485 unsigned long gb;
1486
1487 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1488 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1489
1490 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1491 if (gb)
1492 inactive_ratio = int_sqrt(10 * gb);
1493 else
1494 inactive_ratio = 1;
1495
1496 return inactive * inactive_ratio < active;
1497}
1498
1499#define mem_cgroup_from_res_counter(counter, member) \
1500 container_of(counter, struct mem_cgroup, member)
1501
1502
1503
1504
1505
1506
1507
1508
1509static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1510{
1511 unsigned long long margin;
1512
1513 margin = res_counter_margin(&memcg->res);
1514 if (do_swap_account)
1515 margin = min(margin, res_counter_margin(&memcg->memsw));
1516 return margin >> PAGE_SHIFT;
1517}
1518
1519int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1520{
1521
1522 if (!css_parent(&memcg->css))
1523 return vm_swappiness;
1524
1525 return memcg->swappiness;
1526}

/*
 * "Move charge" accounting can race with page-statistics updates.
 * memcg_moving / memcg->moving_account are bumped while a task's pages
 * are being migrated between cgroups; mem_cgroup_stolen() lets the
 * statistics code detect this window and fall back to taking
 * move_lock, see __mem_cgroup_begin_update_page_stat().
 */
1544atomic_t memcg_moving __read_mostly;
1545
1546static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1547{
1548 atomic_inc(&memcg_moving);
1549 atomic_inc(&memcg->moving_account);
1550 synchronize_rcu();
1551}
1552
1553static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1554{
1555
1556
1557
1558
1559 if (memcg) {
1560 atomic_dec(&memcg_moving);
1561 atomic_dec(&memcg->moving_account);
1562 }
1563}
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1578{
1579 VM_BUG_ON(!rcu_read_lock_held());
1580 return atomic_read(&memcg->moving_account) > 0;
1581}
1582
1583static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1584{
1585 struct mem_cgroup *from;
1586 struct mem_cgroup *to;
1587 bool ret = false;
1588
1589
1590
1591
1592 spin_lock(&mc.lock);
1593 from = mc.from;
1594 to = mc.to;
1595 if (!from)
1596 goto unlock;
1597
1598 ret = mem_cgroup_same_or_subtree(memcg, from)
1599 || mem_cgroup_same_or_subtree(memcg, to);
1600unlock:
1601 spin_unlock(&mc.lock);
1602 return ret;
1603}
1604
1605static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1606{
1607 if (mc.moving_task && current != mc.moving_task) {
1608 if (mem_cgroup_under_move(memcg)) {
1609 DEFINE_WAIT(wait);
1610 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1611
1612 if (mc.moving_task)
1613 schedule();
1614 finish_wait(&mc.waitq, &wait);
1615 return true;
1616 }
1617 }
1618 return false;
1619}
1620
1621
1622
1623
1624
1625
1626
1627static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1628 unsigned long *flags)
1629{
1630 spin_lock_irqsave(&memcg->move_lock, *flags);
1631}
1632
1633static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1634 unsigned long *flags)
1635{
1636 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1637}
1638
1639#define K(x) ((x) << (PAGE_SHIFT-10))

/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled.
 */
1648void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1649{
1650 struct cgroup *task_cgrp;
1651 struct cgroup *mem_cgrp;
1652
1653
1654
1655
1656
1657 static char memcg_name[PATH_MAX];
1658 int ret;
1659 struct mem_cgroup *iter;
1660 unsigned int i;
1661
1662 if (!p)
1663 return;
1664
1665 rcu_read_lock();
1666
1667 mem_cgrp = memcg->css.cgroup;
1668 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1669
1670 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1671 if (ret < 0) {
1672
1673
1674
1675
1676 rcu_read_unlock();
1677 goto done;
1678 }
1679 rcu_read_unlock();
1680
1681 pr_info("Task in %s killed", memcg_name);
1682
1683 rcu_read_lock();
1684 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1685 if (ret < 0) {
1686 rcu_read_unlock();
1687 goto done;
1688 }
1689 rcu_read_unlock();
1690
1691
1692
1693
1694 pr_cont(" as a result of limit of %s\n", memcg_name);
1695done:
1696
1697 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1698 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1699 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1700 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1701 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1702 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1703 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1704 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1705 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1706 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1707 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1708 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1709
1710 for_each_mem_cgroup_tree(iter, memcg) {
1711 pr_info("Memory cgroup stats");
1712
1713 rcu_read_lock();
1714 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1715 if (!ret)
1716 pr_cont(" for %s", memcg_name);
1717 rcu_read_unlock();
1718 pr_cont(":");
1719
1720 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1721 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1722 continue;
1723 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1724 K(mem_cgroup_read_stat(iter, i)));
1725 }
1726
1727 for (i = 0; i < NR_LRU_LISTS; i++)
1728 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1729 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1730
1731 pr_cont("\n");
1732 }
1733}
1734
1735
1736
1737
1738
1739static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1740{
1741 int num = 0;
1742 struct mem_cgroup *iter;
1743
1744 for_each_mem_cgroup_tree(iter, memcg)
1745 num++;
1746 return num;
1747}
1748
1749
1750
1751
1752static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1753{
1754 u64 limit;
1755
1756 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1757
1758
1759
1760
1761 if (mem_cgroup_swappiness(memcg)) {
1762 u64 memsw;
1763
1764 limit += total_swap_pages << PAGE_SHIFT;
1765 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1766
1767
1768
1769
1770
1771 limit = min(limit, memsw);
1772 }
1773
1774 return limit;
1775}
1776
1777static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1778 int order)
1779{
1780 struct mem_cgroup *iter;
1781 unsigned long chosen_points = 0;
1782 unsigned long totalpages;
1783 unsigned int points = 0;
1784 struct task_struct *chosen = NULL;
1785
1786
1787
1788
1789
1790
1791 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1792 set_thread_flag(TIF_MEMDIE);
1793 return;
1794 }
1795
1796 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1797 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1798 for_each_mem_cgroup_tree(iter, memcg) {
1799 struct css_task_iter it;
1800 struct task_struct *task;
1801
1802 css_task_iter_start(&iter->css, &it);
1803 while ((task = css_task_iter_next(&it))) {
1804 switch (oom_scan_process_thread(task, totalpages, NULL,
1805 false)) {
1806 case OOM_SCAN_SELECT:
1807 if (chosen)
1808 put_task_struct(chosen);
1809 chosen = task;
1810 chosen_points = ULONG_MAX;
1811 get_task_struct(chosen);
 /* fall through */
1813 case OOM_SCAN_CONTINUE:
1814 continue;
1815 case OOM_SCAN_ABORT:
1816 css_task_iter_end(&it);
1817 mem_cgroup_iter_break(memcg, iter);
1818 if (chosen)
1819 put_task_struct(chosen);
1820 return;
1821 case OOM_SCAN_OK:
1822 break;
1823 };
1824 points = oom_badness(task, memcg, NULL, totalpages);
1825 if (points > chosen_points) {
1826 if (chosen)
1827 put_task_struct(chosen);
1828 chosen = task;
1829 chosen_points = points;
1830 get_task_struct(chosen);
1831 }
1832 }
1833 css_task_iter_end(&it);
1834 }
1835
1836 if (!chosen)
1837 return;
1838 points = chosen_points * 1000 / totalpages;
1839 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1840 NULL, "Memory cgroup out of memory");
1841}
1842
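/*
 * Reclaim pages charged to @memcg (and its children) until either some
 * progress is made, the limit margin is restored, or the retry loop is
 * exhausted.  MEM_CGROUP_RECLAIM_NOSWAP/SHRINK in @flags control
 * whether swap may be used and whether this is a limit-shrinking call.
 */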
1843static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1844 gfp_t gfp_mask,
1845 unsigned long flags)
1846{
1847 unsigned long total = 0;
1848 bool noswap = false;
1849 int loop;
1850
1851 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1852 noswap = true;
1853 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1854 noswap = true;
1855
1856 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1857 if (loop)
1858 drain_all_stock_async(memcg);
1859 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1860
1861
1862
1863
1864
1865 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1866 break;
1867 if (mem_cgroup_margin(memcg))
1868 break;
1869
1870
1871
1872
1873 if (loop && !total)
1874 break;
1875 }
1876 return total;
1877}
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1890 int nid, bool noswap)
1891{
1892 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1893 return true;
1894 if (noswap || !total_swap_pages)
1895 return false;
1896 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1897 return true;
1898 return false;
1899
1900}
1901#if MAX_NUMNODES > 1
1902
1903
1904
1905
1906
1907
1908
1909static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1910{
1911 int nid;
1912
1913
1914
1915
1916 if (!atomic_read(&memcg->numainfo_events))
1917 return;
1918 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1919 return;
1920
1921
1922 memcg->scan_nodes = node_states[N_MEMORY];
1923
1924 for_each_node_mask(nid, node_states[N_MEMORY]) {
1925
1926 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1927 node_clear(nid, memcg->scan_nodes);
1928 }
1929
1930 atomic_set(&memcg->numainfo_events, 0);
1931 atomic_set(&memcg->numainfo_updating, 0);
1932}
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1947{
1948 int node;
1949
1950 mem_cgroup_may_update_nodemask(memcg);
1951 node = memcg->last_scanned_node;
1952
1953 node = next_node(node, memcg->scan_nodes);
1954 if (node == MAX_NUMNODES)
1955 node = first_node(memcg->scan_nodes);
1956
1957
1958
1959
1960
1961
1962 if (unlikely(node == MAX_NUMNODES))
1963 node = numa_node_id();
1964
1965 memcg->last_scanned_node = node;
1966 return node;
1967}
1968
1969
1970
1971
1972
1973
1974
1975static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1976{
1977 int nid;
1978
1979
1980
1981
1982
1983 if (!nodes_empty(memcg->scan_nodes)) {
1984 for (nid = first_node(memcg->scan_nodes);
1985 nid < MAX_NUMNODES;
1986 nid = next_node(nid, memcg->scan_nodes)) {
1987
1988 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1989 return true;
1990 }
1991 }
1992
1993
1994
1995 for_each_node_state(nid, N_MEMORY) {
1996 if (node_isset(nid, memcg->scan_nodes))
1997 continue;
1998 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1999 return true;
2000 }
2001 return false;
2002}
2003
2004#else
2005int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2006{
2007 return 0;
2008}
2009
2010static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2011{
2012 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2013}
2014#endif
2015
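/*
 * Reclaim from @root_memcg's hierarchy on behalf of soft-limit reclaim
 * of @zone.  Iterates the hierarchy with a shared reclaim cookie and
 * stops once the group is back under its soft limit or enough loops
 * have been made.
 */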
2016static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2017 struct zone *zone,
2018 gfp_t gfp_mask,
2019 unsigned long *total_scanned)
2020{
2021 struct mem_cgroup *victim = NULL;
2022 int total = 0;
2023 int loop = 0;
2024 unsigned long excess;
2025 unsigned long nr_scanned;
2026 struct mem_cgroup_reclaim_cookie reclaim = {
2027 .zone = zone,
2028 .priority = 0,
2029 };
2030
2031 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2032
2033 while (1) {
2034 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2035 if (!victim) {
2036 loop++;
2037 if (loop >= 2) {
2038
2039
2040
2041
2042
2043 if (!total)
2044 break;
2045
2046
2047
2048
2049
2050
2051 if (total >= (excess >> 2) ||
2052 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2053 break;
2054 }
2055 continue;
2056 }
2057 if (!mem_cgroup_reclaimable(victim, false))
2058 continue;
2059 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2060 zone, &nr_scanned);
2061 *total_scanned += nr_scanned;
2062 if (!res_counter_soft_limit_excess(&root_memcg->res))
2063 break;
2064 }
2065 mem_cgroup_iter_break(root_memcg, victim);
2066 return total;
2067}
2068
2069#ifdef CONFIG_LOCKDEP
2070static struct lockdep_map memcg_oom_lock_dep_map = {
2071 .name = "memcg_oom_lock",
2072};
2073#endif
2074
2075static DEFINE_SPINLOCK(memcg_oom_lock);
2076
2077
2078
2079
2080
2081static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2082{
2083 struct mem_cgroup *iter, *failed = NULL;
2084
2085 spin_lock(&memcg_oom_lock);
2086
2087 for_each_mem_cgroup_tree(iter, memcg) {
2088 if (iter->oom_lock) {
2089
2090
2091
2092
2093 failed = iter;
2094 mem_cgroup_iter_break(memcg, iter);
2095 break;
2096 } else
2097 iter->oom_lock = true;
2098 }
2099
2100 if (failed) {
2101
2102
2103
2104
2105 for_each_mem_cgroup_tree(iter, memcg) {
2106 if (iter == failed) {
2107 mem_cgroup_iter_break(memcg, iter);
2108 break;
2109 }
2110 iter->oom_lock = false;
2111 }
2112 } else
2113 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2114
2115 spin_unlock(&memcg_oom_lock);
2116
2117 return !failed;
2118}
2119
2120static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2121{
2122 struct mem_cgroup *iter;
2123
2124 spin_lock(&memcg_oom_lock);
2125 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
2126 for_each_mem_cgroup_tree(iter, memcg)
2127 iter->oom_lock = false;
2128 spin_unlock(&memcg_oom_lock);
2129}
2130
2131static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2132{
2133 struct mem_cgroup *iter;
2134
2135 for_each_mem_cgroup_tree(iter, memcg)
2136 atomic_inc(&iter->under_oom);
2137}
2138
2139static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2140{
2141 struct mem_cgroup *iter;
2142
2143
2144
2145
2146
2147
2148 for_each_mem_cgroup_tree(iter, memcg)
2149 atomic_add_unless(&iter->under_oom, -1, 0);
2150}
2151
2152static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2153
2154struct oom_wait_info {
2155 struct mem_cgroup *memcg;
2156 wait_queue_t wait;
2157};
2158
2159static int memcg_oom_wake_function(wait_queue_t *wait,
2160 unsigned mode, int sync, void *arg)
2161{
2162 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2163 struct mem_cgroup *oom_wait_memcg;
2164 struct oom_wait_info *oom_wait_info;
2165
2166 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2167 oom_wait_memcg = oom_wait_info->memcg;
2168
2169
2170
2171
2172
2173 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2174 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2175 return 0;
2176 return autoremove_wake_function(wait, mode, sync, arg);
2177}
2178
2179static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2180{
2181 atomic_inc(&memcg->oom_wakeups);
2182
2183 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2184}
2185
2186static void memcg_oom_recover(struct mem_cgroup *memcg)
2187{
2188 if (memcg && atomic_read(&memcg->under_oom))
2189 memcg_wakeup_oom(memcg);
2190}
2191
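/*
 * Record an OOM situation for the current task.  The actual OOM
 * handling (killing or waiting for userspace) is deferred to
 * mem_cgroup_oom_synchronize() at the end of the page fault, because
 * the charge context may hold locks that the OOM victim needs.
 */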
2192static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2193{
2194 if (!current->memcg_oom.may_oom)
2195 return;
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210 css_get(&memcg->css);
2211 current->memcg_oom.memcg = memcg;
2212 current->memcg_oom.gfp_mask = mask;
2213 current->memcg_oom.order = order;
2214}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled during the fault.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context is problematic
 * because of locks held during the fault, so the OOM state is only
 * recorded there and the wait/kill happens here.
 *
 * Returns %true if an OOM state was pending and has been handled,
 * %false otherwise.
 */
2233bool mem_cgroup_oom_synchronize(bool handle)
2234{
2235 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2236 struct oom_wait_info owait;
2237 bool locked;
2238
2239
2240 if (!memcg)
2241 return false;
2242
2243 if (!handle)
2244 goto cleanup;
2245
2246 owait.memcg = memcg;
2247 owait.wait.flags = 0;
2248 owait.wait.func = memcg_oom_wake_function;
2249 owait.wait.private = current;
2250 INIT_LIST_HEAD(&owait.wait.task_list);
2251
2252 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2253 mem_cgroup_mark_under_oom(memcg);
2254
2255 locked = mem_cgroup_oom_trylock(memcg);
2256
2257 if (locked)
2258 mem_cgroup_oom_notify(memcg);
2259
2260 if (locked && !memcg->oom_kill_disable) {
2261 mem_cgroup_unmark_under_oom(memcg);
2262 finish_wait(&memcg_oom_waitq, &owait.wait);
2263 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2264 current->memcg_oom.order);
2265 } else {
2266 schedule();
2267 mem_cgroup_unmark_under_oom(memcg);
2268 finish_wait(&memcg_oom_waitq, &owait.wait);
2269 }
2270
2271 if (locked) {
2272 mem_cgroup_oom_unlock(memcg);
2273
2274
2275
2276
2277
2278 memcg_oom_recover(memcg);
2279 }
2280cleanup:
2281 current->memcg_oom.memcg = NULL;
2282 css_put(&memcg->css);
2283 return true;
2284}

/*
 * __mem_cgroup_{begin,end}_update_page_stat() bracket updates to
 * per-memcg page statistics (mem_cgroup_update_page_stat()).  While a
 * charge is being moved between cgroups, pc->mem_cgroup can change, so
 * the updater must take the per-memcg move_lock; otherwise RCU alone
 * is sufficient.  *locked tells the caller whether the lock was taken
 * and must be released via __mem_cgroup_end_update_page_stat().
 */
2310void __mem_cgroup_begin_update_page_stat(struct page *page,
2311 bool *locked, unsigned long *flags)
2312{
2313 struct mem_cgroup *memcg;
2314 struct page_cgroup *pc;
2315
2316 pc = lookup_page_cgroup(page);
2317again:
2318 memcg = pc->mem_cgroup;
2319 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2320 return;
2321
2322
2323
2324
2325
2326
2327 if (!mem_cgroup_stolen(memcg))
2328 return;
2329
2330 move_lock_mem_cgroup(memcg, flags);
2331 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2332 move_unlock_mem_cgroup(memcg, flags);
2333 goto again;
2334 }
2335 *locked = true;
2336}
2337
2338void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2339{
2340 struct page_cgroup *pc = lookup_page_cgroup(page);
2341
2342
2343
2344
2345
2346
2347 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2348}
2349
2350void mem_cgroup_update_page_stat(struct page *page,
2351 enum mem_cgroup_stat_index idx, int val)
2352{
2353 struct mem_cgroup *memcg;
2354 struct page_cgroup *pc = lookup_page_cgroup(page);
2355 unsigned long uninitialized_var(flags);
2356
2357 if (mem_cgroup_disabled())
2358 return;
2359
2360 VM_BUG_ON(!rcu_read_lock_held());
2361 memcg = pc->mem_cgroup;
2362 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2363 return;
2364
2365 this_cpu_add(memcg->stat->count[idx], val);
2366}

/*
 * Per-cpu charge caches ("stock"): charges are taken from the
 * res_counter in batches of CHARGE_BATCH pages and handed out locally,
 * which avoids hammering the global counters for every page.
 */
2372#define CHARGE_BATCH 32U
2373struct memcg_stock_pcp {
2374 struct mem_cgroup *cached;
2375 unsigned int nr_pages;
2376 struct work_struct work;
2377 unsigned long flags;
2378#define FLUSHING_CACHED_CHARGE 0
2379};
2380static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2381static DEFINE_MUTEX(percpu_charge_mutex);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's
 * stock, and at least @nr_pages are available in that stock.
 *
 * Returns true if successful, false otherwise.
 */
2394static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2395{
2396 struct memcg_stock_pcp *stock;
2397 bool ret = true;
2398
2399 if (nr_pages > CHARGE_BATCH)
2400 return false;
2401
2402 stock = &get_cpu_var(memcg_stock);
2403 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2404 stock->nr_pages -= nr_pages;
2405 else
2406 ret = false;
2407 put_cpu_var(memcg_stock);
2408 return ret;
2409}
2410
2411
2412
2413
2414static void drain_stock(struct memcg_stock_pcp *stock)
2415{
2416 struct mem_cgroup *old = stock->cached;
2417
2418 if (stock->nr_pages) {
2419 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2420
2421 res_counter_uncharge(&old->res, bytes);
2422 if (do_swap_account)
2423 res_counter_uncharge(&old->memsw, bytes);
2424 stock->nr_pages = 0;
2425 }
2426 stock->cached = NULL;
2427}
2428
2429
2430
2431
2432
2433static void drain_local_stock(struct work_struct *dummy)
2434{
2435 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2436 drain_stock(stock);
2437 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2438}
2439
2440static void __init memcg_stock_init(void)
2441{
2442 int cpu;
2443
2444 for_each_possible_cpu(cpu) {
2445 struct memcg_stock_pcp *stock =
2446 &per_cpu(memcg_stock, cpu);
2447 INIT_WORK(&stock->work, drain_local_stock);
2448 }
2449}
2450
2451
2452
2453
2454
2455static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2456{
2457 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2458
2459 if (stock->cached != memcg) {
2460 drain_stock(stock);
2461 stock->cached = memcg;
2462 }
2463 stock->nr_pages += nr_pages;
2464 put_cpu_var(memcg_stock);
2465}

/*
 * Drain all per-cpu charge stocks that belong to @root_memcg or one of
 * its descendants, so the cached charges become visible to the
 * res_counters again.  With @sync, wait for the flush work to finish.
 */
2472static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2473{
2474 int cpu, curcpu;
2475
2476
2477 get_online_cpus();
2478 curcpu = get_cpu();
2479 for_each_online_cpu(cpu) {
2480 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2481 struct mem_cgroup *memcg;
2482
2483 memcg = stock->cached;
2484 if (!memcg || !stock->nr_pages)
2485 continue;
2486 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2487 continue;
2488 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2489 if (cpu == curcpu)
2490 drain_local_stock(&stock->work);
2491 else
2492 schedule_work_on(cpu, &stock->work);
2493 }
2494 }
2495 put_cpu();
2496
2497 if (!sync)
2498 goto out;
2499
2500 for_each_online_cpu(cpu) {
2501 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2502 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2503 flush_work(&stock->work);
2504 }
2505out:
2506 put_online_cpus();
2507}
2508
2509
2510
2511
2512
2513
2514
2515static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2516{
2517
2518
2519
2520 if (!mutex_trylock(&percpu_charge_mutex))
2521 return;
2522 drain_all_stock(root_memcg, false);
2523 mutex_unlock(&percpu_charge_mutex);
2524}
2525
2526
2527static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2528{
2529
2530 mutex_lock(&percpu_charge_mutex);
2531 drain_all_stock(root_memcg, true);
2532 mutex_unlock(&percpu_charge_mutex);
2533}
2534
2535
2536
2537
2538
2539static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2540{
2541 int i;
2542
2543 spin_lock(&memcg->pcp_counter_lock);
2544 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2545 long x = per_cpu(memcg->stat->count[i], cpu);
2546
2547 per_cpu(memcg->stat->count[i], cpu) = 0;
2548 memcg->nocpu_base.count[i] += x;
2549 }
2550 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2551 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2552
2553 per_cpu(memcg->stat->events[i], cpu) = 0;
2554 memcg->nocpu_base.events[i] += x;
2555 }
2556 spin_unlock(&memcg->pcp_counter_lock);
2557}
2558
2559static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2560 unsigned long action,
2561 void *hcpu)
2562{
2563 int cpu = (unsigned long)hcpu;
2564 struct memcg_stock_pcp *stock;
2565 struct mem_cgroup *iter;
2566
2567 if (action == CPU_ONLINE)
2568 return NOTIFY_OK;
2569
2570 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2571 return NOTIFY_OK;
2572
2573 for_each_mem_cgroup(iter)
2574 mem_cgroup_drain_pcp_counter(iter, cpu);
2575
2576 stock = &per_cpu(memcg_stock, cpu);
2577 drain_stock(stock);
2578 return NOTIFY_OK;
2579}
2580
2581
2582
2583enum {
2584 CHARGE_OK,
2585 CHARGE_RETRY,
2586 CHARGE_NOMEM,
2587 CHARGE_WOULDBLOCK,
2588};
2589
2590static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2591 unsigned int nr_pages, unsigned int min_pages,
2592 bool invoke_oom)
2593{
2594 unsigned long csize = nr_pages * PAGE_SIZE;
2595 struct mem_cgroup *mem_over_limit;
2596 struct res_counter *fail_res;
2597 unsigned long flags = 0;
2598 int ret;
2599
2600 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2601
2602 if (likely(!ret)) {
2603 if (!do_swap_account)
2604 return CHARGE_OK;
2605 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2606 if (likely(!ret))
2607 return CHARGE_OK;
2608
2609 res_counter_uncharge(&memcg->res, csize);
2610 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2611 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2612 } else
2613 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2614
2615
2616
2617
2618 if (nr_pages > min_pages)
2619 return CHARGE_RETRY;
2620
2621 if (!(gfp_mask & __GFP_WAIT))
2622 return CHARGE_WOULDBLOCK;
2623
2624 if (gfp_mask & __GFP_NORETRY)
2625 return CHARGE_NOMEM;
2626
2627 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2628 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2629 return CHARGE_RETRY;
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2640 return CHARGE_RETRY;
2641
2642
2643
2644
2645
2646 if (mem_cgroup_wait_acct_move(mem_over_limit))
2647 return CHARGE_RETRY;
2648
2649 if (invoke_oom)
2650 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2651
2652 return CHARGE_NOMEM;
2653}

/*
 * __mem_cgroup_try_charge - try to charge @nr_pages to the memcg that
 * @mm (or *@ptr, if set) belongs to.  Consumes the per-cpu stock when
 * possible, otherwise charges the res_counters, reclaiming and
 * retrying a bounded number of times and finally invoking the memcg
 * OOM handler if @oom is set.
 *
 * Returns 0 on success with *@ptr set to the charged memcg, -ENOMEM on
 * failure, or -EINTR when the charge was bypassed to root_mem_cgroup
 * (e.g. for tasks that are exiting or have fatal signals pending).
 */
2676static int __mem_cgroup_try_charge(struct mm_struct *mm,
2677 gfp_t gfp_mask,
2678 unsigned int nr_pages,
2679 struct mem_cgroup **ptr,
2680 bool oom)
2681{
2682 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2683 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2684 struct mem_cgroup *memcg = NULL;
2685 int ret;
2686
2687
2688
2689
2690
2691
2692 if (unlikely(test_thread_flag(TIF_MEMDIE)
2693 || fatal_signal_pending(current)))
2694 goto bypass;
2695
2696 if (unlikely(task_in_memcg_oom(current)))
2697 goto nomem;
2698
2699 if (gfp_mask & __GFP_NOFAIL)
2700 oom = false;
2701
2702
2703
2704
2705
2706
2707
2708 if (!*ptr && !mm)
2709 *ptr = root_mem_cgroup;
2710again:
2711 if (*ptr) {
2712 memcg = *ptr;
2713 if (mem_cgroup_is_root(memcg))
2714 goto done;
2715 if (consume_stock(memcg, nr_pages))
2716 goto done;
2717 css_get(&memcg->css);
2718 } else {
2719 struct task_struct *p;
2720
2721 rcu_read_lock();
2722 p = rcu_dereference(mm->owner);
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733 memcg = mem_cgroup_from_task(p);
2734 if (!memcg)
2735 memcg = root_mem_cgroup;
2736 if (mem_cgroup_is_root(memcg)) {
2737 rcu_read_unlock();
2738 goto done;
2739 }
2740 if (consume_stock(memcg, nr_pages)) {
2741
2742
2743
2744
2745
2746
2747
2748
2749 rcu_read_unlock();
2750 goto done;
2751 }
2752
2753 if (!css_tryget(&memcg->css)) {
2754 rcu_read_unlock();
2755 goto again;
2756 }
2757 rcu_read_unlock();
2758 }
2759
2760 do {
2761 bool invoke_oom = oom && !nr_oom_retries;
2762
2763
2764 if (fatal_signal_pending(current)) {
2765 css_put(&memcg->css);
2766 goto bypass;
2767 }
2768
2769 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2770 nr_pages, invoke_oom);
2771 switch (ret) {
2772 case CHARGE_OK:
2773 break;
2774 case CHARGE_RETRY:
2775 batch = nr_pages;
2776 css_put(&memcg->css);
2777 memcg = NULL;
2778 goto again;
2779 case CHARGE_WOULDBLOCK:
2780 css_put(&memcg->css);
2781 goto nomem;
2782 case CHARGE_NOMEM:
2783 if (!oom || invoke_oom) {
2784 css_put(&memcg->css);
2785 goto nomem;
2786 }
2787 nr_oom_retries--;
2788 break;
2789 }
2790 } while (ret != CHARGE_OK);
2791
2792 if (batch > nr_pages)
2793 refill_stock(memcg, batch - nr_pages);
2794 css_put(&memcg->css);
2795done:
2796 *ptr = memcg;
2797 return 0;
2798nomem:
2799 if (!(gfp_mask & __GFP_NOFAIL)) {
2800 *ptr = NULL;
2801 return -ENOMEM;
2802 }
2803bypass:
2804 *ptr = root_mem_cgroup;
2805 return -EINTR;
2806}
2807
2808/*
2809 * Undo a charge obtained with __mem_cgroup_try_charge(): uncharge the res
2810 * counter (and the memsw counter when swap accounting is enabled).  The
2811 * root cgroup is never charged, so there is nothing to undo for it.
2812 */
2813static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2814 unsigned int nr_pages)
2815{
2816 if (!mem_cgroup_is_root(memcg)) {
2817 unsigned long bytes = nr_pages * PAGE_SIZE;
2818
2819 res_counter_uncharge(&memcg->res, bytes);
2820 if (do_swap_account)
2821 res_counter_uncharge(&memcg->memsw, bytes);
2822 }
2823}
2824
2825/*
2826 * Cancel charges in this cgroup only, without propagating the uncharge to
2827 * the parent: useful when moving usage up to the parent cgroup.
2828 */
2829static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2830 unsigned int nr_pages)
2831{
2832 unsigned long bytes = nr_pages * PAGE_SIZE;
2833
2834 if (mem_cgroup_is_root(memcg))
2835 return;
2836
2837 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2838 if (do_swap_account)
2839 res_counter_uncharge_until(&memcg->memsw,
2840 memcg->memsw.parent, bytes);
2841}
2842
2843/*
2844 * A helper function to get a mem_cgroup from its ID.  Must be called under
2845 * rcu_read_lock().  The caller is responsible for css_tryget() if the
2846 * mem_cgroup is used for charging (dropping a swap reference may race with
2847 * removal of the memcg).
2848 */
2849static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2850{
2851 /* ID 0 means an unused entry */
2852 if (!id)
2853 return NULL;
2854 return mem_cgroup_from_id(id);
2855}
2856
2857struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2858{
2859 struct mem_cgroup *memcg = NULL;
2860 struct page_cgroup *pc;
2861 unsigned short id;
2862 swp_entry_t ent;
2863
2864 VM_BUG_ON(!PageLocked(page));
2865
2866 pc = lookup_page_cgroup(page);
2867 lock_page_cgroup(pc);
2868 if (PageCgroupUsed(pc)) {
2869 memcg = pc->mem_cgroup;
2870 if (memcg && !css_tryget(&memcg->css))
2871 memcg = NULL;
2872 } else if (PageSwapCache(page)) {
2873 ent.val = page_private(page);
2874 id = lookup_swap_cgroup_id(ent);
2875 rcu_read_lock();
2876 memcg = mem_cgroup_lookup(id);
2877 if (memcg && !css_tryget(&memcg->css))
2878 memcg = NULL;
2879 rcu_read_unlock();
2880 }
2881 unlock_page_cgroup(pc);
2882 return memcg;
2883}
2884
2885static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2886 struct page *page,
2887 unsigned int nr_pages,
2888 enum charge_type ctype,
2889 bool lrucare)
2890{
2891 struct page_cgroup *pc = lookup_page_cgroup(page);
2892 struct zone *uninitialized_var(zone);
2893 struct lruvec *lruvec;
2894 bool was_on_lru = false;
2895 bool anon;
2896
2897 lock_page_cgroup(pc);
2898 VM_BUG_ON(PageCgroupUsed(pc));
2899
2900 /*
2901 * With lrucare, the page may already sit on another memcg's LRU list
2902 * (this happens for swap cache, for instance).  In that case take it
2903 * off that list under zone->lru_lock before re-targeting
2904 * pc->mem_cgroup, and put it back on the new owner's lruvec in the
2905 * second lrucare block below once the Used bit has been set.
2906 */
2907
2908 if (lrucare) {
2909 zone = page_zone(page);
2910 spin_lock_irq(&zone->lru_lock);
2911 if (PageLRU(page)) {
2912 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2913 ClearPageLRU(page);
2914 del_page_from_lru_list(page, lruvec, page_lru(page));
2915 was_on_lru = true;
2916 }
2917 }
2918
2919 pc->mem_cgroup = memcg;
2920
2921 /*
2922 * pc->mem_cgroup is read without lock_page_cgroup() in some paths,
2923 * but only after the Used bit has been observed.  The write barrier
2924 * below makes sure the new pc->mem_cgroup is visible before
2925 * SetPageCgroupUsed() publishes the page_cgroup.
2926 */
2927 smp_wmb();
2928 SetPageCgroupUsed(pc);
2929
2930 if (lrucare) {
2931 if (was_on_lru) {
2932 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2933 VM_BUG_ON(PageLRU(page));
2934 SetPageLRU(page);
2935 add_page_to_lru_list(page, lruvec, page_lru(page));
2936 }
2937 spin_unlock_irq(&zone->lru_lock);
2938 }
2939
2940 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2941 anon = true;
2942 else
2943 anon = false;
2944
2945 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2946 unlock_page_cgroup(pc);
2947
2948 /*
2949 * charge_statistics() updated the per-cpu event counters above; check
2950 * them now so that thresholds and the soft-limit tree are updated for
2951 * this memcg if an event target was crossed.
2952 */
2953 memcg_check_events(memcg, page);
2954}
2955
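/*
 * set_limit_mutex serializes limit updates on the res, memsw and kmem
 * counters (mem_cgroup_resize_limit(), mem_cgroup_resize_memsw_limit(),
 * memcg_update_kmem_limit()) and, with kmem accounting, also protects
 * walking the per-memcg cache copies in kmem_cache_destroy_memcg_children().
 */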
2956static DEFINE_MUTEX(set_limit_mutex);
2957
2958#ifdef CONFIG_MEMCG_KMEM
2959static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2960{
2961 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2962 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2963}
2964
2965
2966
2967
2968
2969static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2970{
2971 struct kmem_cache *cachep;
2972
2973 VM_BUG_ON(p->is_root_cache);
2974 cachep = p->root_cache;
2975 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2976}
2977
2978#ifdef CONFIG_SLABINFO
2979static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
2980 struct cftype *cft, struct seq_file *m)
2981{
2982 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2983 struct memcg_cache_params *params;
2984
2985 if (!memcg_can_account_kmem(memcg))
2986 return -EIO;
2987
2988 print_slabinfo_header(m);
2989
2990 mutex_lock(&memcg->slab_caches_mutex);
2991 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2992 cache_show(memcg_params_to_cache(params), m);
2993 mutex_unlock(&memcg->slab_caches_mutex);
2994
2995 return 0;
2996}
2997#endif
2998
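/*
 * Charge @size bytes of kernel memory to @memcg: first against the kmem
 * counter, then against the regular memory (and memsw) counters via
 * __mem_cgroup_try_charge().  If the latter bypasses to the root memcg
 * (-EINTR), the charge is forced with res_counter_charge_nofail() so the
 * counters stay consistent with the kmem counter.
 */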
2999static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
3000{
3001 struct res_counter *fail_res;
3002 struct mem_cgroup *_memcg;
3003 int ret = 0;
3004
3005 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
3006 if (ret)
3007 return ret;
3008
3009 _memcg = memcg;
3010 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
3011 &_memcg, oom_gfp_allowed(gfp));
3012
3013 if (ret == -EINTR) {
3014 /*
3015 * __mem_cgroup_try_charge() chose to bypass to the root memcg due to
3016 * an OOM kill or a fatal signal.  Our only options here would be to
3017 * fail the allocation or to charge it to this cgroup; from the slab's
3018 * point of view the cache has already been selected, so it is too
3019 * late to change our mind.  Force the charge with
3020 * res_counter_charge_nofail() and treat the overage as a temporary
3021 * condition.
3022 */
3029 res_counter_charge_nofail(&memcg->res, size, &fail_res);
3030 if (do_swap_account)
3031 res_counter_charge_nofail(&memcg->memsw, size,
3032 &fail_res);
3033 ret = 0;
3034 } else if (ret)
3035 res_counter_uncharge(&memcg->kmem, size);
3036
3037 return ret;
3038}
3039
3040static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3041{
3042 res_counter_uncharge(&memcg->res, size);
3043 if (do_swap_account)
3044 res_counter_uncharge(&memcg->memsw, size);
3045
3046 /* not down to 0: other kmem charges remain */
3047 if (res_counter_uncharge(&memcg->kmem, size))
3048 return;
3049
3050 /*
3051 * This was the last kmem charge.  Release the reference that was
3052 * taken when the memcg was marked dead, in case this uncharge races
3053 * with the offlining code or outlives the memcg.
3054 *
3055 * The memory barrier implied by test-and-clear pairs with the
3056 * explicit one in memcg_kmem_mark_dead().
3057 */
3058 if (memcg_kmem_test_and_clear_dead(memcg))
3059 css_put(&memcg->css);
3060}
3061
3062void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3063{
3064 if (!memcg)
3065 return;
3066
3067 mutex_lock(&memcg->slab_caches_mutex);
3068 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3069 mutex_unlock(&memcg->slab_caches_mutex);
3070}
3071
3072
3073
3074
3075
3076
3077int memcg_cache_id(struct mem_cgroup *memcg)
3078{
3079 return memcg ? memcg->kmemcg_id : -1;
3080}
3081
3082
3083
3084
3085
3086
3087
3088
3089int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3090{
3091 int num, ret;
3092
3093 num = ida_simple_get(&kmem_limited_groups,
3094 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3095 if (num < 0)
3096 return num;
3097
3098
3099
3100
3101
3102
3103
3104 memcg_kmem_set_activated(memcg);
3105
3106 ret = memcg_update_all_caches(num+1);
3107 if (ret) {
3108 ida_simple_remove(&kmem_limited_groups, num);
3109 memcg_kmem_clear_activated(memcg);
3110 return ret;
3111 }
3112
3113 memcg->kmemcg_id = num;
3114 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3115 mutex_init(&memcg->slab_caches_mutex);
3116 return 0;
3117}
3118
3119static size_t memcg_caches_array_size(int num_groups)
3120{
3121 ssize_t size;
3122 if (num_groups <= 0)
3123 return 0;
3124
3125 size = 2 * num_groups;
3126 if (size < MEMCG_CACHES_MIN_SIZE)
3127 size = MEMCG_CACHES_MIN_SIZE;
3128 else if (size > MEMCG_CACHES_MAX_SIZE)
3129 size = MEMCG_CACHES_MAX_SIZE;
3130
3131 return size;
3132}
3133
3134
3135
3136
3137
3138
3139void memcg_update_array_size(int num)
3140{
3141 if (num > memcg_limited_groups_array_size)
3142 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3143}
3144
3145static void kmem_cache_destroy_work_func(struct work_struct *w);
3146
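/*
 * Grow the root cache's memcg_params array when the number of kmem-limited
 * cgroups exceeds the current array size: allocate a bigger array, copy the
 * existing per-memcg cache pointers over, and free the old block.
 */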
3147int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3148{
3149 struct memcg_cache_params *cur_params = s->memcg_params;
3150
3151 VM_BUG_ON(!is_root_cache(s));
3152
3153 if (num_groups > memcg_limited_groups_array_size) {
3154 int i;
3155 ssize_t size = memcg_caches_array_size(num_groups);
3156
3157 size *= sizeof(void *);
3158 size += offsetof(struct memcg_cache_params, memcg_caches);
3159
3160 s->memcg_params = kzalloc(size, GFP_KERNEL);
3161 if (!s->memcg_params) {
3162 s->memcg_params = cur_params;
3163 return -ENOMEM;
3164 }
3165
3166 s->memcg_params->is_root_cache = true;
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3178 if (!cur_params->memcg_caches[i])
3179 continue;
3180 s->memcg_params->memcg_caches[i] =
3181 cur_params->memcg_caches[i];
3182 }
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193 kfree(cur_params);
3194 }
3195 return 0;
3196}
3197
3198int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3199 struct kmem_cache *root_cache)
3200{
3201 size_t size;
3202
3203 if (!memcg_kmem_enabled())
3204 return 0;
3205
3206 if (!memcg) {
3207 size = offsetof(struct memcg_cache_params, memcg_caches);
3208 size += memcg_limited_groups_array_size * sizeof(void *);
3209 } else
3210 size = sizeof(struct memcg_cache_params);
3211
3212 s->memcg_params = kzalloc(size, GFP_KERNEL);
3213 if (!s->memcg_params)
3214 return -ENOMEM;
3215
3216 if (memcg) {
3217 s->memcg_params->memcg = memcg;
3218 s->memcg_params->root_cache = root_cache;
3219 INIT_WORK(&s->memcg_params->destroy,
3220 kmem_cache_destroy_work_func);
3221 } else
3222 s->memcg_params->is_root_cache = true;
3223
3224 return 0;
3225}
3226
3227void memcg_release_cache(struct kmem_cache *s)
3228{
3229 struct kmem_cache *root;
3230 struct mem_cgroup *memcg;
3231 int id;
3232
3233
3234
3235
3236
3237 if (!s->memcg_params)
3238 return;
3239
3240 if (s->memcg_params->is_root_cache)
3241 goto out;
3242
3243 memcg = s->memcg_params->memcg;
3244 id = memcg_cache_id(memcg);
3245
3246 root = s->memcg_params->root_cache;
3247 root->memcg_params->memcg_caches[id] = NULL;
3248
3249 mutex_lock(&memcg->slab_caches_mutex);
3250 list_del(&s->memcg_params->list);
3251 mutex_unlock(&memcg->slab_caches_mutex);
3252
3253 css_put(&memcg->css);
3254out:
3255 kfree(s->memcg_params);
3256}
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
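/*
 * memcg_stop_kmem_account()/memcg_resume_kmem_account() temporarily disable
 * kmem accounting for the current task (see the memcg_kmem_skip_account
 * checks in __memcg_kmem_get_cache() and __memcg_kmem_newpage_charge()
 * below), so that allocations made while creating a per-memcg cache copy
 * are not themselves routed back into memcg accounting and recurse into
 * cache creation.
 */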
3277static inline void memcg_stop_kmem_account(void)
3278{
3279 VM_BUG_ON(!current->mm);
3280 current->memcg_kmem_skip_account++;
3281}
3282
3283static inline void memcg_resume_kmem_account(void)
3284{
3285 VM_BUG_ON(!current->mm);
3286 current->memcg_kmem_skip_account--;
3287}
3288
3289static void kmem_cache_destroy_work_func(struct work_struct *w)
3290{
3291 struct kmem_cache *cachep;
3292 struct memcg_cache_params *p;
3293
3294 p = container_of(w, struct memcg_cache_params, destroy);
3295
3296 cachep = memcg_params_to_cache(p);
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3315 kmem_cache_shrink(cachep);
3316 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3317 return;
3318 } else
3319 kmem_cache_destroy(cachep);
3320}
3321
3322void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3323{
3324 if (!cachep->memcg_params->dead)
3325 return;
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345 if (work_pending(&cachep->memcg_params->destroy))
3346 return;
3347
3348
3349
3350
3351 schedule_work(&cachep->memcg_params->destroy);
3352}
3353
3354
3355
3356
3357
3358
3359
3360
3361
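/*
 * memcg_cache_mutex protects creation of per-memcg cache copies:
 * kmem_cache_dup() asserts it, and memcg_create_kmem_cache() takes it while
 * looking up and installing the copy for a given memcg cache index.
 */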
3362static DEFINE_MUTEX(memcg_cache_mutex);
3363
3364
3365
3366
3367static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3368 struct kmem_cache *s)
3369{
3370 struct kmem_cache *new;
3371 static char *tmp_name = NULL;
3372
3373 lockdep_assert_held(&memcg_cache_mutex);
3374
3375
3376
3377
3378
3379
3380
3381 if (!tmp_name) {
3382 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3383 if (!tmp_name)
3384 return NULL;
3385 }
3386
3387 rcu_read_lock();
3388 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3389 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3390 rcu_read_unlock();
3391
3392 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3393 (s->flags & ~SLAB_PANIC), s->ctor, s);
3394
3395 if (new)
3396 new->allocflags |= __GFP_KMEMCG;
3397
3398 return new;
3399}
3400
3401static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3402 struct kmem_cache *cachep)
3403{
3404 struct kmem_cache *new_cachep;
3405 int idx;
3406
3407 BUG_ON(!memcg_can_account_kmem(memcg));
3408
3409 idx = memcg_cache_id(memcg);
3410
3411 mutex_lock(&memcg_cache_mutex);
3412 new_cachep = cache_from_memcg_idx(cachep, idx);
3413 if (new_cachep) {
3414 css_put(&memcg->css);
3415 goto out;
3416 }
3417
3418 new_cachep = kmem_cache_dup(memcg, cachep);
3419 if (new_cachep == NULL) {
3420 new_cachep = cachep;
3421 css_put(&memcg->css);
3422 goto out;
3423 }
3424
3425 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3426
3427 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3428
3429
3430
3431
3432 wmb();
3433out:
3434 mutex_unlock(&memcg_cache_mutex);
3435 return new_cachep;
3436}
3437
3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3439{
3440 struct kmem_cache *c;
3441 int i;
3442
3443 if (!s->memcg_params)
3444 return;
3445 if (!s->memcg_params->is_root_cache)
3446 return;
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457 mutex_lock(&set_limit_mutex);
3458 for_each_memcg_cache_index(i) {
3459 c = cache_from_memcg_idx(s, i);
3460 if (!c)
3461 continue;
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476 c->memcg_params->dead = false;
3477 cancel_work_sync(&c->memcg_params->destroy);
3478 kmem_cache_destroy(c);
3479 }
3480 mutex_unlock(&set_limit_mutex);
3481}
3482
3483struct create_work {
3484 struct mem_cgroup *memcg;
3485 struct kmem_cache *cachep;
3486 struct work_struct work;
3487};
3488
3489static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3490{
3491 struct kmem_cache *cachep;
3492 struct memcg_cache_params *params;
3493
3494 if (!memcg_kmem_is_active(memcg))
3495 return;
3496
3497 mutex_lock(&memcg->slab_caches_mutex);
3498 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3499 cachep = memcg_params_to_cache(params);
3500 cachep->memcg_params->dead = true;
3501 schedule_work(&cachep->memcg_params->destroy);
3502 }
3503 mutex_unlock(&memcg->slab_caches_mutex);
3504}
3505
3506static void memcg_create_cache_work_func(struct work_struct *w)
3507{
3508 struct create_work *cw;
3509
3510 cw = container_of(w, struct create_work, work);
3511 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3512 kfree(cw);
3513}
3514
3515
3516
3517
3518static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3519 struct kmem_cache *cachep)
3520{
3521 struct create_work *cw;
3522
3523 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3524 if (cw == NULL) {
3525 css_put(&memcg->css);
3526 return;
3527 }
3528
3529 cw->memcg = memcg;
3530 cw->cachep = cachep;
3531
3532 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3533 schedule_work(&cw->work);
3534}
3535
3536static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3537 struct kmem_cache *cachep)
3538{
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550 memcg_stop_kmem_account();
3551 __memcg_create_cache_enqueue(memcg, cachep);
3552 memcg_resume_kmem_account();
3553}
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
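/*
 * Return the cache to allocate from for the current task: the memcg-specific
 * copy of @cachep if the task's memcg is kmem-accounted and the copy already
 * exists, otherwise the root cache.  A missing copy is created asynchronously
 * via memcg_create_cache_enqueue(), so the first allocations after a cgroup
 * becomes accounted still come from the root cache.
 */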
3567struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3568 gfp_t gfp)
3569{
3570 struct mem_cgroup *memcg;
3571 int idx;
3572
3573 VM_BUG_ON(!cachep->memcg_params);
3574 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3575
3576 if (!current->mm || current->memcg_kmem_skip_account)
3577 return cachep;
3578
3579 rcu_read_lock();
3580 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3581
3582 if (!memcg_can_account_kmem(memcg))
3583 goto out;
3584
3585 idx = memcg_cache_id(memcg);
3586
3587
3588
3589
3590
3591 read_barrier_depends();
3592 if (likely(cache_from_memcg_idx(cachep, idx))) {
3593 cachep = cache_from_memcg_idx(cachep, idx);
3594 goto out;
3595 }
3596
3597
3598 if (!css_tryget(&memcg->css))
3599 goto out;
3600 rcu_read_unlock();
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619 memcg_create_cache_enqueue(memcg, cachep);
3620 return cachep;
3621out:
3622 rcu_read_unlock();
3623 return cachep;
3624}
3625EXPORT_SYMBOL(__memcg_kmem_get_cache);
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
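/*
 * Charge a kmem page allocation of 2^@order pages to the current task's
 * memcg.  Returns true when the allocation may proceed; when a charge was
 * actually made, *_memcg is set and the caller is expected to commit it to
 * the page with __memcg_kmem_commit_charge().
 */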
3641bool
3642__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3643{
3644 struct mem_cgroup *memcg;
3645 int ret;
3646
3647 *_memcg = NULL;
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673 if (!current->mm || current->memcg_kmem_skip_account)
3674 return true;
3675
3676 memcg = try_get_mem_cgroup_from_mm(current->mm);
3677
3678
3679
3680
3681
3682
3683 if (unlikely(!memcg))
3684 return true;
3685
3686 if (!memcg_can_account_kmem(memcg)) {
3687 css_put(&memcg->css);
3688 return true;
3689 }
3690
3691 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3692 if (!ret)
3693 *_memcg = memcg;
3694
3695 css_put(&memcg->css);
3696 return (ret == 0);
3697}
3698
3699void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3700 int order)
3701{
3702 struct page_cgroup *pc;
3703
3704 VM_BUG_ON(mem_cgroup_is_root(memcg));
3705
3706
3707 if (!page) {
3708 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3709 return;
3710 }
3711
3712 pc = lookup_page_cgroup(page);
3713 lock_page_cgroup(pc);
3714 pc->mem_cgroup = memcg;
3715 SetPageCgroupUsed(pc);
3716 unlock_page_cgroup(pc);
3717}
3718
3719void __memcg_kmem_uncharge_pages(struct page *page, int order)
3720{
3721 struct mem_cgroup *memcg = NULL;
3722 struct page_cgroup *pc;
3723
3724
3725 pc = lookup_page_cgroup(page);
3726
3727
3728
3729
3730 if (!PageCgroupUsed(pc))
3731 return;
3732
3733 lock_page_cgroup(pc);
3734 if (PageCgroupUsed(pc)) {
3735 memcg = pc->mem_cgroup;
3736 ClearPageCgroupUsed(pc);
3737 }
3738 unlock_page_cgroup(pc);
3739
3740
3741
3742
3743
3744 if (!memcg)
3745 return;
3746
3747 VM_BUG_ON(mem_cgroup_is_root(memcg));
3748 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3749}
3750#else
3751static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3752{
3753}
3754#endif
3755
3756#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3757
3758#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3759
3760
3761
3762
3763
3764
3765void mem_cgroup_split_huge_fixup(struct page *head)
3766{
3767 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3768 struct page_cgroup *pc;
3769 struct mem_cgroup *memcg;
3770 int i;
3771
3772 if (mem_cgroup_disabled())
3773 return;
3774
3775 memcg = head_pc->mem_cgroup;
3776 for (i = 1; i < HPAGE_PMD_NR; i++) {
3777 pc = head_pc + i;
3778 pc->mem_cgroup = memcg;
3779 smp_wmb();
3780 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3781 }
3782 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3783 HPAGE_PMD_NR);
3784}
3785#endif
3786
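/*
 * Transfer a per-page statistics count (e.g. FILE_MAPPED, WRITEBACK) from
 * one memcg to another when pages change owner during move_account.
 */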
3787static inline
3788void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
3789 struct mem_cgroup *to,
3790 unsigned int nr_pages,
3791 enum mem_cgroup_stat_index idx)
3792{
3793
3794 preempt_disable();
3795 __this_cpu_sub(from->stat->count[idx], nr_pages);
3796 __this_cpu_add(to->stat->count[idx], nr_pages);
3797 preempt_enable();
3798}
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815static int mem_cgroup_move_account(struct page *page,
3816 unsigned int nr_pages,
3817 struct page_cgroup *pc,
3818 struct mem_cgroup *from,
3819 struct mem_cgroup *to)
3820{
3821 unsigned long flags;
3822 int ret;
3823 bool anon = PageAnon(page);
3824
3825 VM_BUG_ON(from == to);
3826 VM_BUG_ON(PageLRU(page));
3827
3828
3829
3830
3831
3832
3833 ret = -EBUSY;
3834 if (nr_pages > 1 && !PageTransHuge(page))
3835 goto out;
3836
3837 lock_page_cgroup(pc);
3838
3839 ret = -EINVAL;
3840 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3841 goto unlock;
3842
3843 move_lock_mem_cgroup(from, &flags);
3844
3845 if (!anon && page_mapped(page))
3846 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3847 MEM_CGROUP_STAT_FILE_MAPPED);
3848
3849 if (PageWriteback(page))
3850 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3851 MEM_CGROUP_STAT_WRITEBACK);
3852
3853 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3854
3855
3856 pc->mem_cgroup = to;
3857 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3858 move_unlock_mem_cgroup(from, &flags);
3859 ret = 0;
3860unlock:
3861 unlock_page_cgroup(pc);
3862
3863
3864
3865 memcg_check_events(to, page);
3866 memcg_check_events(from, page);
3867out:
3868 return ret;
3869}
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892static int mem_cgroup_move_parent(struct page *page,
3893 struct page_cgroup *pc,
3894 struct mem_cgroup *child)
3895{
3896 struct mem_cgroup *parent;
3897 unsigned int nr_pages;
3898 unsigned long uninitialized_var(flags);
3899 int ret;
3900
3901 VM_BUG_ON(mem_cgroup_is_root(child));
3902
3903 ret = -EBUSY;
3904 if (!get_page_unless_zero(page))
3905 goto out;
3906 if (isolate_lru_page(page))
3907 goto put;
3908
3909 nr_pages = hpage_nr_pages(page);
3910
3911 parent = parent_mem_cgroup(child);
3912 /*
3913 * If there is no parent, move the charge to the root cgroup.
3914 */
3915 if (!parent)
3916 parent = root_mem_cgroup;
3917
3918 if (nr_pages > 1) {
3919 VM_BUG_ON(!PageTransHuge(page));
3920 flags = compound_lock_irqsave(page);
3921 }
3922
3923 ret = mem_cgroup_move_account(page, nr_pages,
3924 pc, child, parent);
3925 if (!ret)
3926 __mem_cgroup_cancel_local_charge(child, nr_pages);
3927
3928 if (nr_pages > 1)
3929 compound_unlock_irqrestore(page, flags);
3930 putback_lru_page(page);
3931put:
3932 put_page(page);
3933out:
3934 return ret;
3935}
3936
3937
3938
3939
3940
3941
3942
3943static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3944 gfp_t gfp_mask, enum charge_type ctype)
3945{
3946 struct mem_cgroup *memcg = NULL;
3947 unsigned int nr_pages = 1;
3948 bool oom = true;
3949 int ret;
3950
3951 if (PageTransHuge(page)) {
3952 nr_pages <<= compound_order(page);
3953 VM_BUG_ON(!PageTransHuge(page));
3954
3955
3956
3957
3958 oom = false;
3959 }
3960
3961 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3962 if (ret == -ENOMEM)
3963 return ret;
3964 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3965 return 0;
3966}
3967
3968int mem_cgroup_newpage_charge(struct page *page,
3969 struct mm_struct *mm, gfp_t gfp_mask)
3970{
3971 if (mem_cgroup_disabled())
3972 return 0;
3973 VM_BUG_ON(page_mapped(page));
3974 VM_BUG_ON(page->mapping && !PageAnon(page));
3975 VM_BUG_ON(!mm);
3976 return mem_cgroup_charge_common(page, mm, gfp_mask,
3977 MEM_CGROUP_CHARGE_TYPE_ANON);
3978}
3979
3980
3981
3982
3983
3984
3985
3986static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3987 struct page *page,
3988 gfp_t mask,
3989 struct mem_cgroup **memcgp)
3990{
3991 struct mem_cgroup *memcg;
3992 struct page_cgroup *pc;
3993 int ret;
3994
3995 pc = lookup_page_cgroup(page);
3996
3997
3998
3999
4000
4001
4002
4003 if (PageCgroupUsed(pc))
4004 return 0;
4005 if (!do_swap_account)
4006 goto charge_cur_mm;
4007 memcg = try_get_mem_cgroup_from_page(page);
4008 if (!memcg)
4009 goto charge_cur_mm;
4010 *memcgp = memcg;
4011 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
4012 css_put(&memcg->css);
4013 if (ret == -EINTR)
4014 ret = 0;
4015 return ret;
4016charge_cur_mm:
4017 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
4018 if (ret == -EINTR)
4019 ret = 0;
4020 return ret;
4021}
4022
4023int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
4024 gfp_t gfp_mask, struct mem_cgroup **memcgp)
4025{
4026 *memcgp = NULL;
4027 if (mem_cgroup_disabled())
4028 return 0;
4029
4030
4031
4032
4033
4034
4035 if (!PageSwapCache(page)) {
4036 int ret;
4037
4038 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
4039 if (ret == -EINTR)
4040 ret = 0;
4041 return ret;
4042 }
4043 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
4044}
4045
4046void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
4047{
4048 if (mem_cgroup_disabled())
4049 return;
4050 if (!memcg)
4051 return;
4052 __mem_cgroup_cancel_charge(memcg, 1);
4053}
4054
4055static void
4056__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
4057 enum charge_type ctype)
4058{
4059 if (mem_cgroup_disabled())
4060 return;
4061 if (!memcg)
4062 return;
4063
4064 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
4065
4066
4067
4068
4069
4070
4071
4072 if (do_swap_account && PageSwapCache(page)) {
4073 swp_entry_t ent = {.val = page_private(page)};
4074 mem_cgroup_uncharge_swap(ent);
4075 }
4076}
4077
4078void mem_cgroup_commit_charge_swapin(struct page *page,
4079 struct mem_cgroup *memcg)
4080{
4081 __mem_cgroup_commit_charge_swapin(page, memcg,
4082 MEM_CGROUP_CHARGE_TYPE_ANON);
4083}
4084
4085int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4086 gfp_t gfp_mask)
4087{
4088 struct mem_cgroup *memcg = NULL;
4089 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4090 int ret;
4091
4092 if (mem_cgroup_disabled())
4093 return 0;
4094 if (PageCompound(page))
4095 return 0;
4096
4097 if (!PageSwapCache(page))
4098 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4099 else {
4100 ret = __mem_cgroup_try_charge_swapin(mm, page,
4101 gfp_mask, &memcg);
4102 if (!ret)
4103 __mem_cgroup_commit_charge_swapin(page, memcg, type);
4104 }
4105 return ret;
4106}
4107
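/*
 * Uncharge @nr_pages from @memcg.  When the caller runs between
 * mem_cgroup_uncharge_start() and mem_cgroup_uncharge_end(), single-page
 * uncharges of the same memcg are batched in current->memcg_batch and the
 * res_counters are only touched once at the end; huge pages and cross-memcg
 * uncharges fall back to a direct res_counter_uncharge().
 */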
4108static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4109 unsigned int nr_pages,
4110 const enum charge_type ctype)
4111{
4112 struct memcg_batch_info *batch = NULL;
4113 bool uncharge_memsw = true;
4114
4115
4116 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4117 uncharge_memsw = false;
4118
4119 batch = &current->memcg_batch;
4120
4121
4122
4123
4124
4125 if (!batch->memcg)
4126 batch->memcg = memcg;
4127
4128
4129
4130
4131
4132
4133
4134
4135 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4136 goto direct_uncharge;
4137
4138 if (nr_pages > 1)
4139 goto direct_uncharge;
4140
4141
4142
4143
4144
4145
4146 if (batch->memcg != memcg)
4147 goto direct_uncharge;
4148
4149 batch->nr_pages++;
4150 if (uncharge_memsw)
4151 batch->memsw_nr_pages++;
4152 return;
4153direct_uncharge:
4154 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4155 if (uncharge_memsw)
4156 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4157 if (unlikely(batch->memcg != memcg))
4158 memcg_oom_recover(memcg);
4159}
4160
4161
4162
4163
4164static struct mem_cgroup *
4165__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4166 bool end_migration)
4167{
4168 struct mem_cgroup *memcg = NULL;
4169 unsigned int nr_pages = 1;
4170 struct page_cgroup *pc;
4171 bool anon;
4172
4173 if (mem_cgroup_disabled())
4174 return NULL;
4175
4176 if (PageTransHuge(page)) {
4177 nr_pages <<= compound_order(page);
4178 VM_BUG_ON(!PageTransHuge(page));
4179 }
4180
4181
4182
4183 pc = lookup_page_cgroup(page);
4184 if (unlikely(!PageCgroupUsed(pc)))
4185 return NULL;
4186
4187 lock_page_cgroup(pc);
4188
4189 memcg = pc->mem_cgroup;
4190
4191 if (!PageCgroupUsed(pc))
4192 goto unlock_out;
4193
4194 anon = PageAnon(page);
4195
4196 switch (ctype) {
4197 case MEM_CGROUP_CHARGE_TYPE_ANON:
4198
4199
4200
4201
4202
4203 anon = true;
4204
4205 case MEM_CGROUP_CHARGE_TYPE_DROP:
4206
4207 if (page_mapped(page))
4208 goto unlock_out;
4209
4210
4211
4212
4213
4214
4215
4216 if (!end_migration && PageCgroupMigration(pc))
4217 goto unlock_out;
4218 break;
4219 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4220 if (!PageAnon(page)) {
4221 if (page->mapping && !page_is_file_cache(page))
4222 goto unlock_out;
4223 } else if (page_mapped(page))
4224 goto unlock_out;
4225 break;
4226 default:
4227 break;
4228 }
4229
4230 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4231
4232 ClearPageCgroupUsed(pc);
4233
4234
4235
4236
4237
4238
4239
4240 unlock_page_cgroup(pc);
4241
4242
4243
4244
4245 memcg_check_events(memcg, page);
4246 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4247 mem_cgroup_swap_statistics(memcg, true);
4248 css_get(&memcg->css);
4249 }
4250
4251
4252
4253
4254
4255 if (!end_migration && !mem_cgroup_is_root(memcg))
4256 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4257
4258 return memcg;
4259
4260unlock_out:
4261 unlock_page_cgroup(pc);
4262 return NULL;
4263}
4264
4265void mem_cgroup_uncharge_page(struct page *page)
4266{
4267
4268 if (page_mapped(page))
4269 return;
4270 VM_BUG_ON(page->mapping && !PageAnon(page));
4271 /*
4272 * If the page is on the swap cache, the uncharge is deferred to the
4273 * swap path (see mem_cgroup_uncharge_swapcache() below), which also
4274 * records swap usage and keeps the memcg reference for the
4275 * swapped-out entry.
4276 */
4283 if (PageSwapCache(page))
4284 return;
4285 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4286}
4287
4288void mem_cgroup_uncharge_cache_page(struct page *page)
4289{
4290 VM_BUG_ON(page_mapped(page));
4291 VM_BUG_ON(page->mapping);
4292 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4293}
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303void mem_cgroup_uncharge_start(void)
4304{
4305 current->memcg_batch.do_batch++;
4306
4307 if (current->memcg_batch.do_batch == 1) {
4308 current->memcg_batch.memcg = NULL;
4309 current->memcg_batch.nr_pages = 0;
4310 current->memcg_batch.memsw_nr_pages = 0;
4311 }
4312}
4313
4314void mem_cgroup_uncharge_end(void)
4315{
4316 struct memcg_batch_info *batch = &current->memcg_batch;
4317
4318 if (!batch->do_batch)
4319 return;
4320
4321 batch->do_batch--;
4322 if (batch->do_batch)
4323 return;
4324
4325 if (!batch->memcg)
4326 return;
4327
4328
4329
4330
4331 if (batch->nr_pages)
4332 res_counter_uncharge(&batch->memcg->res,
4333 batch->nr_pages * PAGE_SIZE);
4334 if (batch->memsw_nr_pages)
4335 res_counter_uncharge(&batch->memcg->memsw,
4336 batch->memsw_nr_pages * PAGE_SIZE);
4337 memcg_oom_recover(batch->memcg);
4338
4339 batch->memcg = NULL;
4340}
4341
4342#ifdef CONFIG_SWAP
4343
4344
4345
4346
4347void
4348mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4349{
4350 struct mem_cgroup *memcg;
4351 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4352
4353 if (!swapout)
4354 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4355
4356 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4357
4358
4359
4360
4361
4362 if (do_swap_account && swapout && memcg)
4363 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4364}
4365#endif
4366
4367#ifdef CONFIG_MEMCG_SWAP
4368
4369
4370
4371
4372void mem_cgroup_uncharge_swap(swp_entry_t ent)
4373{
4374 struct mem_cgroup *memcg;
4375 unsigned short id;
4376
4377 if (!do_swap_account)
4378 return;
4379
4380 id = swap_cgroup_record(ent, 0);
4381 rcu_read_lock();
4382 memcg = mem_cgroup_lookup(id);
4383 if (memcg) {
4384
4385
4386
4387
4388 if (!mem_cgroup_is_root(memcg))
4389 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4390 mem_cgroup_swap_statistics(memcg, false);
4391 css_put(&memcg->css);
4392 }
4393 rcu_read_unlock();
4394}
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410static int mem_cgroup_move_swap_account(swp_entry_t entry,
4411 struct mem_cgroup *from, struct mem_cgroup *to)
4412{
4413 unsigned short old_id, new_id;
4414
4415 old_id = mem_cgroup_id(from);
4416 new_id = mem_cgroup_id(to);
4417
4418 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4419 mem_cgroup_swap_statistics(from, false);
4420 mem_cgroup_swap_statistics(to, true);
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432 css_get(&to->css);
4433 return 0;
4434 }
4435 return -EINVAL;
4436}
4437#else
4438static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4439 struct mem_cgroup *from, struct mem_cgroup *to)
4440{
4441 return -EINVAL;
4442}
4443#endif
4444
4445
4446
4447
4448
4449void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4450 struct mem_cgroup **memcgp)
4451{
4452 struct mem_cgroup *memcg = NULL;
4453 unsigned int nr_pages = 1;
4454 struct page_cgroup *pc;
4455 enum charge_type ctype;
4456
4457 *memcgp = NULL;
4458
4459 if (mem_cgroup_disabled())
4460 return;
4461
4462 if (PageTransHuge(page))
4463 nr_pages <<= compound_order(page);
4464
4465 pc = lookup_page_cgroup(page);
4466 lock_page_cgroup(pc);
4467 if (PageCgroupUsed(pc)) {
4468 memcg = pc->mem_cgroup;
4469 css_get(&memcg->css);
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499 if (PageAnon(page))
4500 SetPageCgroupMigration(pc);
4501 }
4502 unlock_page_cgroup(pc);
4503
4504 /*
4505 * If the page is not charged at this point, there is nothing to do.
4506 */
4507 if (!memcg)
4508 return;
4509
4510 *memcgp = memcg;
4511
4512
4513
4514
4515
4516
4517 if (PageAnon(page))
4518 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4519 else
4520 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4521
4522
4523
4524
4525
4526 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4527}
4528
4529
4530void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4531 struct page *oldpage, struct page *newpage, bool migration_ok)
4532{
4533 struct page *used, *unused;
4534 struct page_cgroup *pc;
4535 bool anon;
4536
4537 if (!memcg)
4538 return;
4539
4540 if (!migration_ok) {
4541 used = oldpage;
4542 unused = newpage;
4543 } else {
4544 used = newpage;
4545 unused = oldpage;
4546 }
4547 anon = PageAnon(used);
4548 __mem_cgroup_uncharge_common(unused,
4549 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4550 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4551 true);
4552 css_put(&memcg->css);
4553
4554
4555
4556
4557
4558 pc = lookup_page_cgroup(oldpage);
4559 lock_page_cgroup(pc);
4560 ClearPageCgroupMigration(pc);
4561 unlock_page_cgroup(pc);
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571 if (anon)
4572 mem_cgroup_uncharge_page(used);
4573}
4574
4575
4576
4577
4578
4579
4580void mem_cgroup_replace_page_cache(struct page *oldpage,
4581 struct page *newpage)
4582{
4583 struct mem_cgroup *memcg = NULL;
4584 struct page_cgroup *pc;
4585 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4586
4587 if (mem_cgroup_disabled())
4588 return;
4589
4590 pc = lookup_page_cgroup(oldpage);
4591
4592 lock_page_cgroup(pc);
4593 if (PageCgroupUsed(pc)) {
4594 memcg = pc->mem_cgroup;
4595 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4596 ClearPageCgroupUsed(pc);
4597 }
4598 unlock_page_cgroup(pc);
4599
4600
4601
4602
4603
4604 if (!memcg)
4605 return;
4606
4607
4608
4609
4610
4611 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4612}
4613
4614#ifdef CONFIG_DEBUG_VM
4615static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4616{
4617 struct page_cgroup *pc;
4618
4619 pc = lookup_page_cgroup(page);
4620
4621
4622
4623
4624
4625 if (likely(pc) && PageCgroupUsed(pc))
4626 return pc;
4627 return NULL;
4628}
4629
4630bool mem_cgroup_bad_page_check(struct page *page)
4631{
4632 if (mem_cgroup_disabled())
4633 return false;
4634
4635 return lookup_page_cgroup_used(page) != NULL;
4636}
4637
4638void mem_cgroup_print_bad_page(struct page *page)
4639{
4640 struct page_cgroup *pc;
4641
4642 pc = lookup_page_cgroup_used(page);
4643 if (pc) {
4644 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4645 pc, pc->flags, pc->mem_cgroup);
4646 }
4647}
4648#endif
4649
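/*
 * Set a new memory limit.  The new value must not exceed the memsw limit;
 * when shrinking, reclaim is retried (up to MEM_CGROUP_RECLAIM_RETRIES per
 * child cgroup) until the counter fits under the new limit or the retries
 * are exhausted.
 */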
4650static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4651 unsigned long long val)
4652{
4653 int retry_count;
4654 u64 memswlimit, memlimit;
4655 int ret = 0;
4656 int children = mem_cgroup_count_children(memcg);
4657 u64 curusage, oldusage;
4658 int enlarge;
4659
4660
4661
4662
4663
4664
4665 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4666
4667 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4668
4669 enlarge = 0;
4670 while (retry_count) {
4671 if (signal_pending(current)) {
4672 ret = -EINTR;
4673 break;
4674 }
4675
4676
4677
4678
4679
4680 mutex_lock(&set_limit_mutex);
4681 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4682 if (memswlimit < val) {
4683 ret = -EINVAL;
4684 mutex_unlock(&set_limit_mutex);
4685 break;
4686 }
4687
4688 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4689 if (memlimit < val)
4690 enlarge = 1;
4691
4692 ret = res_counter_set_limit(&memcg->res, val);
4693 if (!ret) {
4694 if (memswlimit == val)
4695 memcg->memsw_is_minimum = true;
4696 else
4697 memcg->memsw_is_minimum = false;
4698 }
4699 mutex_unlock(&set_limit_mutex);
4700
4701 if (!ret)
4702 break;
4703
4704 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4705 MEM_CGROUP_RECLAIM_SHRINK);
4706 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4707
4708 if (curusage >= oldusage)
4709 retry_count--;
4710 else
4711 oldusage = curusage;
4712 }
4713 if (!ret && enlarge)
4714 memcg_oom_recover(memcg);
4715
4716 return ret;
4717}
4718
4719static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4720 unsigned long long val)
4721{
4722 int retry_count;
4723 u64 memlimit, memswlimit, oldusage, curusage;
4724 int children = mem_cgroup_count_children(memcg);
4725 int ret = -EBUSY;
4726 int enlarge = 0;
4727
4728
4729 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4730 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4731 while (retry_count) {
4732 if (signal_pending(current)) {
4733 ret = -EINTR;
4734 break;
4735 }
4736
4737
4738
4739
4740
4741 mutex_lock(&set_limit_mutex);
4742 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4743 if (memlimit > val) {
4744 ret = -EINVAL;
4745 mutex_unlock(&set_limit_mutex);
4746 break;
4747 }
4748 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4749 if (memswlimit < val)
4750 enlarge = 1;
4751 ret = res_counter_set_limit(&memcg->memsw, val);
4752 if (!ret) {
4753 if (memlimit == val)
4754 memcg->memsw_is_minimum = true;
4755 else
4756 memcg->memsw_is_minimum = false;
4757 }
4758 mutex_unlock(&set_limit_mutex);
4759
4760 if (!ret)
4761 break;
4762
4763 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4764 MEM_CGROUP_RECLAIM_NOSWAP |
4765 MEM_CGROUP_RECLAIM_SHRINK);
4766 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4767
4768 if (curusage >= oldusage)
4769 retry_count--;
4770 else
4771 oldusage = curusage;
4772 }
4773 if (!ret && enlarge)
4774 memcg_oom_recover(memcg);
4775 return ret;
4776}
4777
4778unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4779 gfp_t gfp_mask,
4780 unsigned long *total_scanned)
4781{
4782 unsigned long nr_reclaimed = 0;
4783 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4784 unsigned long reclaimed;
4785 int loop = 0;
4786 struct mem_cgroup_tree_per_zone *mctz;
4787 unsigned long long excess;
4788 unsigned long nr_scanned;
4789
4790 if (order > 0)
4791 return 0;
4792
4793 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4794
4795
4796
4797
4798
4799 do {
4800 if (next_mz)
4801 mz = next_mz;
4802 else
4803 mz = mem_cgroup_largest_soft_limit_node(mctz);
4804 if (!mz)
4805 break;
4806
4807 nr_scanned = 0;
4808 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4809 gfp_mask, &nr_scanned);
4810 nr_reclaimed += reclaimed;
4811 *total_scanned += nr_scanned;
4812 spin_lock(&mctz->lock);
4813
4814
4815
4816
4817
4818 next_mz = NULL;
4819 if (!reclaimed) {
4820 do {
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832 next_mz =
4833 __mem_cgroup_largest_soft_limit_node(mctz);
4834 if (next_mz == mz)
4835 css_put(&next_mz->memcg->css);
4836 else
4837 break;
4838 } while (1);
4839 }
4840 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4841 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4852 spin_unlock(&mctz->lock);
4853 css_put(&mz->memcg->css);
4854 loop++;
4855
4856
4857
4858
4859
4860 if (!nr_reclaimed &&
4861 (next_mz == NULL ||
4862 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4863 break;
4864 } while (!nr_reclaimed);
4865 if (next_mz)
4866 css_put(&next_mz->memcg->css);
4867 return nr_reclaimed;
4868}
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4882 int node, int zid, enum lru_list lru)
4883{
4884 struct lruvec *lruvec;
4885 unsigned long flags;
4886 struct list_head *list;
4887 struct page *busy;
4888 struct zone *zone;
4889
4890 zone = &NODE_DATA(node)->node_zones[zid];
4891 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4892 list = &lruvec->lists[lru];
4893
4894 busy = NULL;
4895 do {
4896 struct page_cgroup *pc;
4897 struct page *page;
4898
4899 spin_lock_irqsave(&zone->lru_lock, flags);
4900 if (list_empty(list)) {
4901 spin_unlock_irqrestore(&zone->lru_lock, flags);
4902 break;
4903 }
4904 page = list_entry(list->prev, struct page, lru);
4905 if (busy == page) {
4906 list_move(&page->lru, list);
4907 busy = NULL;
4908 spin_unlock_irqrestore(&zone->lru_lock, flags);
4909 continue;
4910 }
4911 spin_unlock_irqrestore(&zone->lru_lock, flags);
4912
4913 pc = lookup_page_cgroup(page);
4914
4915 if (mem_cgroup_move_parent(page, pc, memcg)) {
4916
4917 busy = page;
4918 cond_resched();
4919 } else
4920 busy = NULL;
4921 } while (!list_empty(list));
4922}
4923
4924
4925
4926
4927
4928
4929
4930
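/*
 * Move every charge of @memcg to its parent: drain per-cpu stocks, walk all
 * node/zone LRU lists and call mem_cgroup_move_parent() on each page, and
 * repeat until the remaining (non-kmem) usage drops to zero.
 */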
4931static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4932{
4933 int node, zid;
4934 u64 usage;
4935
4936 do {
4937
4938 lru_add_drain_all();
4939 drain_all_stock_sync(memcg);
4940 mem_cgroup_start_move(memcg);
4941 for_each_node_state(node, N_MEMORY) {
4942 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4943 enum lru_list lru;
4944 for_each_lru(lru) {
4945 mem_cgroup_force_empty_list(memcg,
4946 node, zid, lru);
4947 }
4948 }
4949 }
4950 mem_cgroup_end_move(memcg);
4951 memcg_oom_recover(memcg);
4952 cond_resched();
4953
4954 /*
4955 * Kernel memory charges are not reparented by the loop above, so
4956 * exclude the kmem counter from the usage we wait on; otherwise a
4957 * cgroup with lingering kmem charges would never appear empty.
4958 * Repeat the whole pass until no user pages remain charged to this
4959 * memcg.
4960 */
4966 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4967 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4968 } while (usage > 0);
4969}
4970
4971static inline bool memcg_has_children(struct mem_cgroup *memcg)
4972{
4973 lockdep_assert_held(&memcg_create_mutex);
4974
4975
4976
4977
4978
4979
4980
4981 return memcg->use_hierarchy &&
4982 !list_empty(&memcg->css.cgroup->children);
4983}
4984
4985
4986
4987
4988
4989
4990
4991static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4992{
4993 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4994 struct cgroup *cgrp = memcg->css.cgroup;
4995
4996
4997 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4998 return -EBUSY;
4999
5000
5001 lru_add_drain_all();
5002
5003 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
5004 int progress;
5005
5006 if (signal_pending(current))
5007 return -EINTR;
5008
5009 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
5010 false);
5011 if (!progress) {
5012 nr_retries--;
5013
5014 congestion_wait(BLK_RW_ASYNC, HZ/10);
5015 }
5016
5017 }
5018 lru_add_drain();
5019 mem_cgroup_reparent_charges(memcg);
5020
5021 return 0;
5022}
5023
5024static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
5025 unsigned int event)
5026{
5027 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5028
5029 if (mem_cgroup_is_root(memcg))
5030 return -EINVAL;
5031 return mem_cgroup_force_empty(memcg);
5032}
5033
5034static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
5035 struct cftype *cft)
5036{
5037 return mem_cgroup_from_css(css)->use_hierarchy;
5038}
5039
5040static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
5041 struct cftype *cft, u64 val)
5042{
5043 int retval = 0;
5044 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5045 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5046
5047 mutex_lock(&memcg_create_mutex);
5048
5049 if (memcg->use_hierarchy == val)
5050 goto out;
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
5061 (val == 1 || val == 0)) {
5062 if (list_empty(&memcg->css.cgroup->children))
5063 memcg->use_hierarchy = val;
5064 else
5065 retval = -EBUSY;
5066 } else
5067 retval = -EINVAL;
5068
5069out:
5070 mutex_unlock(&memcg_create_mutex);
5071
5072 return retval;
5073}
5074
5075
5076static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
5077 enum mem_cgroup_stat_index idx)
5078{
5079 struct mem_cgroup *iter;
5080 long val = 0;
5081
5082
5083 for_each_mem_cgroup_tree(iter, memcg)
5084 val += mem_cgroup_read_stat(iter, idx);
5085
5086 if (val < 0)
5087 val = 0;
5088 return val;
5089}
5090
5091static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5092{
5093 u64 val;
5094
5095 if (!mem_cgroup_is_root(memcg)) {
5096 if (!swap)
5097 return res_counter_read_u64(&memcg->res, RES_USAGE);
5098 else
5099 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
5100 }
5101
5102
5103
5104
5105
5106 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
5107 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
5108
5109 if (swap)
5110 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
5111
5112 return val << PAGE_SHIFT;
5113}
5114
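/*
 * Read handler for the memory.*, memory.memsw.* and memory.kmem.* files:
 * RES_USAGE of the root cgroup is computed from the recursive statistics
 * (mem_cgroup_usage()); everything else is read from the res_counters.
 */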
5115static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5116 struct cftype *cft, struct file *file,
5117 char __user *buf, size_t nbytes, loff_t *ppos)
5118{
5119 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5120 char str[64];
5121 u64 val;
5122 int name, len;
5123 enum res_type type;
5124
5125 type = MEMFILE_TYPE(cft->private);
5126 name = MEMFILE_ATTR(cft->private);
5127
5128 switch (type) {
5129 case _MEM:
5130 if (name == RES_USAGE)
5131 val = mem_cgroup_usage(memcg, false);
5132 else
5133 val = res_counter_read_u64(&memcg->res, name);
5134 break;
5135 case _MEMSWAP:
5136 if (name == RES_USAGE)
5137 val = mem_cgroup_usage(memcg, true);
5138 else
5139 val = res_counter_read_u64(&memcg->memsw, name);
5140 break;
5141 case _KMEM:
5142 val = res_counter_read_u64(&memcg->kmem, name);
5143 break;
5144 default:
5145 BUG();
5146 }
5147
5148 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
5149 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5150}
5151
5152static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5153{
5154 int ret = -EINVAL;
5155#ifdef CONFIG_MEMCG_KMEM
5156 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169 mutex_lock(&memcg_create_mutex);
5170 mutex_lock(&set_limit_mutex);
5171 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
5172 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
5173 ret = -EBUSY;
5174 goto out;
5175 }
5176 ret = res_counter_set_limit(&memcg->kmem, val);
5177 VM_BUG_ON(ret);
5178
5179 ret = memcg_update_cache_sizes(memcg);
5180 if (ret) {
5181 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
5182 goto out;
5183 }
5184 static_key_slow_inc(&memcg_kmem_enabled_key);
5185
5186
5187
5188
5189 memcg_kmem_set_active(memcg);
5190 } else
5191 ret = res_counter_set_limit(&memcg->kmem, val);
5192out:
5193 mutex_unlock(&set_limit_mutex);
5194 mutex_unlock(&memcg_create_mutex);
5195#endif
5196 return ret;
5197}
5198
5199#ifdef CONFIG_MEMCG_KMEM
5200static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5201{
5202 int ret = 0;
5203 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5204 if (!parent)
5205 goto out;
5206
5207 memcg->kmem_account_flags = parent->kmem_account_flags;
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218 if (!memcg_kmem_is_active(memcg))
5219 goto out;
5220
5221
5222
5223
5224
5225
5226 static_key_slow_inc(&memcg_kmem_enabled_key);
5227
5228 mutex_lock(&set_limit_mutex);
5229 memcg_stop_kmem_account();
5230 ret = memcg_update_cache_sizes(memcg);
5231 memcg_resume_kmem_account();
5232 mutex_unlock(&set_limit_mutex);
5233out:
5234 return ret;
5235}
5236#endif
5237
5238
5239
5240
5241
5242static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5243 const char *buffer)
5244{
5245 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5246 enum res_type type;
5247 int name;
5248 unsigned long long val;
5249 int ret;
5250
5251 type = MEMFILE_TYPE(cft->private);
5252 name = MEMFILE_ATTR(cft->private);
5253
5254 switch (name) {
5255 case RES_LIMIT:
5256 if (mem_cgroup_is_root(memcg)) {
5257 ret = -EINVAL;
5258 break;
5259 }
5260
5261 ret = res_counter_memparse_write_strategy(buffer, &val);
5262 if (ret)
5263 break;
5264 if (type == _MEM)
5265 ret = mem_cgroup_resize_limit(memcg, val);
5266 else if (type == _MEMSWAP)
5267 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5268 else if (type == _KMEM)
5269 ret = memcg_update_kmem_limit(css, val);
5270 else
5271 return -EINVAL;
5272 break;
5273 case RES_SOFT_LIMIT:
5274 ret = res_counter_memparse_write_strategy(buffer, &val);
5275 if (ret)
5276 break;
5277
5278
5279
5280
5281
5282 if (type == _MEM)
5283 ret = res_counter_set_soft_limit(&memcg->res, val);
5284 else
5285 ret = -EINVAL;
5286 break;
5287 default:
5288 ret = -EINVAL;
5289 break;
5290 }
5291 return ret;
5292}
5293
5294static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5295 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5296{
5297 unsigned long long min_limit, min_memsw_limit, tmp;
5298
5299 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5300 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5301 if (!memcg->use_hierarchy)
5302 goto out;
5303
5304 while (css_parent(&memcg->css)) {
5305 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5306 if (!memcg->use_hierarchy)
5307 break;
5308 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5309 min_limit = min(min_limit, tmp);
5310 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5311 min_memsw_limit = min(min_memsw_limit, tmp);
5312 }
5313out:
5314 *mem_limit = min_limit;
5315 *memsw_limit = min_memsw_limit;
5316}
5317
5318static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5319{
5320 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5321 int name;
5322 enum res_type type;
5323
5324 type = MEMFILE_TYPE(event);
5325 name = MEMFILE_ATTR(event);
5326
5327 switch (name) {
5328 case RES_MAX_USAGE:
5329 if (type == _MEM)
5330 res_counter_reset_max(&memcg->res);
5331 else if (type == _MEMSWAP)
5332 res_counter_reset_max(&memcg->memsw);
5333 else if (type == _KMEM)
5334 res_counter_reset_max(&memcg->kmem);
5335 else
5336 return -EINVAL;
5337 break;
5338 case RES_FAILCNT:
5339 if (type == _MEM)
5340 res_counter_reset_failcnt(&memcg->res);
5341 else if (type == _MEMSWAP)
5342 res_counter_reset_failcnt(&memcg->memsw);
5343 else if (type == _KMEM)
5344 res_counter_reset_failcnt(&memcg->kmem);
5345 else
5346 return -EINVAL;
5347 break;
5348 }
5349
5350 return 0;
5351}
5352
5353static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5354 struct cftype *cft)
5355{
5356 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5357}
5358
5359#ifdef CONFIG_MMU
5360static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5361 struct cftype *cft, u64 val)
5362{
5363 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5364
5365 if (val >= (1 << NR_MOVE_TYPE))
5366 return -EINVAL;
5367
5368
5369
5370
5371
5372
5373
5374 memcg->move_charge_at_immigrate = val;
5375 return 0;
5376}
5377#else
5378static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5379 struct cftype *cft, u64 val)
5380{
5381 return -ENOSYS;
5382}
5383#endif
5384
5385#ifdef CONFIG_NUMA
5386static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5387 struct cftype *cft, struct seq_file *m)
5388{
5389 struct numa_stat {
5390 const char *name;
5391 unsigned int lru_mask;
5392 };
5393
5394 static const struct numa_stat stats[] = {
5395 { "total", LRU_ALL },
5396 { "file", LRU_ALL_FILE },
5397 { "anon", LRU_ALL_ANON },
5398 { "unevictable", BIT(LRU_UNEVICTABLE) },
5399 };
5400 const struct numa_stat *stat;
5401 int nid;
5402 unsigned long nr;
5403 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5404
5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
5407 seq_printf(m, "%s=%lu", stat->name, nr);
5408 for_each_node_state(nid, N_MEMORY) {
5409 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5410 stat->lru_mask);
5411 seq_printf(m, " N%d=%lu", nid, nr);
5412 }
5413 seq_putc(m, '\n');
5414 }
5415
5416 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5417 struct mem_cgroup *iter;
5418
5419 nr = 0;
5420 for_each_mem_cgroup_tree(iter, memcg)
5421 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
5422 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
5423 for_each_node_state(nid, N_MEMORY) {
5424 nr = 0;
5425 for_each_mem_cgroup_tree(iter, memcg)
5426 nr += mem_cgroup_node_nr_lru_pages(
5427 iter, nid, stat->lru_mask);
5428 seq_printf(m, " N%d=%lu", nid, nr);
5429 }
5430 seq_putc(m, '\n');
5431 }
5432
5433 return 0;
5434}
5435#endif
5436
5437static inline void mem_cgroup_lru_names_not_uptodate(void)
5438{
5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5440}
5441
5442static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
5443 struct seq_file *m)
5444{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5446 struct mem_cgroup *mi;
5447 unsigned int i;
5448
5449 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5450 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5451 continue;
5452 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5453 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5454 }
5455
5456 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5457 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5458 mem_cgroup_read_events(memcg, i));
5459
5460 for (i = 0; i < NR_LRU_LISTS; i++)
5461 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5462 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5463
5464
5465 {
5466 unsigned long long limit, memsw_limit;
5467 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5468 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5469 if (do_swap_account)
5470 seq_printf(m, "hierarchical_memsw_limit %llu\n",
5471 memsw_limit);
5472 }
5473
5474 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5475 long long val = 0;
5476
5477 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5478 continue;
5479 for_each_mem_cgroup_tree(mi, memcg)
5480 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5481 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5482 }
5483
5484 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5485 unsigned long long val = 0;
5486
5487 for_each_mem_cgroup_tree(mi, memcg)
5488 val += mem_cgroup_read_events(mi, i);
5489 seq_printf(m, "total_%s %llu\n",
5490 mem_cgroup_events_names[i], val);
5491 }
5492
5493 for (i = 0; i < NR_LRU_LISTS; i++) {
5494 unsigned long long val = 0;
5495
5496 for_each_mem_cgroup_tree(mi, memcg)
5497 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5498 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5499 }
5500
5501#ifdef CONFIG_DEBUG_VM
5502 {
5503 int nid, zid;
5504 struct mem_cgroup_per_zone *mz;
5505 struct zone_reclaim_stat *rstat;
5506 unsigned long recent_rotated[2] = {0, 0};
5507 unsigned long recent_scanned[2] = {0, 0};
5508
5509 for_each_online_node(nid)
5510 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5511 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5512 rstat = &mz->lruvec.reclaim_stat;
5513
5514 recent_rotated[0] += rstat->recent_rotated[0];
5515 recent_rotated[1] += rstat->recent_rotated[1];
5516 recent_scanned[0] += rstat->recent_scanned[0];
5517 recent_scanned[1] += rstat->recent_scanned[1];
5518 }
5519 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5520 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5521 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5522 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5523 }
5524#endif
5525
5526 return 0;
5527}
5528
5529static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5530 struct cftype *cft)
5531{
5532 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5533
5534 return mem_cgroup_swappiness(memcg);
5535}
5536
5537static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5538 struct cftype *cft, u64 val)
5539{
5540 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5541 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5542
5543 if (val > 100 || !parent)
5544 return -EINVAL;
5545
5546 mutex_lock(&memcg_create_mutex);
5547
5548
5549 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5550 mutex_unlock(&memcg_create_mutex);
5551 return -EINVAL;
5552 }
5553
5554 memcg->swappiness = val;
5555
5556 mutex_unlock(&memcg_create_mutex);
5557
5558 return 0;
5559}
5560
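/*
 * Signal every usage threshold crossed since the last check: starting from
 * current_threshold, walk downwards over thresholds now above the usage and
 * upwards over thresholds now at or below it, firing each eventfd once, and
 * record the new current_threshold.
 */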
5561static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5562{
5563 struct mem_cgroup_threshold_ary *t;
5564 u64 usage;
5565 int i;
5566
5567 rcu_read_lock();
5568 if (!swap)
5569 t = rcu_dereference(memcg->thresholds.primary);
5570 else
5571 t = rcu_dereference(memcg->memsw_thresholds.primary);
5572
5573 if (!t)
5574 goto unlock;
5575
5576 usage = mem_cgroup_usage(memcg, swap);
5577
5578
5579
5580
5581
5582
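/*
 * current_threshold records the last threshold that was at or below usage
 * when we last checked.  Walk backwards to signal thresholds we have since
 * dropped below, then forwards to signal thresholds usage has since reached,
 * and finally remember the new position.
 */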
5583 i = t->current_threshold;
5584
5585
5586
5587
5588
5589
5590
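/* Signal every threshold crossed on the way down (threshold now above usage). */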
5591 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5592 eventfd_signal(t->entries[i].eventfd, 1);
5593
5594
5595 i++;
5596
5597
5598
5599
5600
5601
5602
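/* Signal every threshold crossed on the way up (threshold now at or below usage). */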
5603 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5604 eventfd_signal(t->entries[i].eventfd, 1);
5605
5606
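/* i now points one past the last threshold that is <= usage. */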
5607 t->current_threshold = i - 1;
5608unlock:
5609 rcu_read_unlock();
5610}
5611
5612static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5613{
5614 while (memcg) {
5615 __mem_cgroup_threshold(memcg, false);
5616 if (do_swap_account)
5617 __mem_cgroup_threshold(memcg, true);
5618
5619 memcg = parent_mem_cgroup(memcg);
5620 }
5621}
5622
5623static int compare_thresholds(const void *a, const void *b)
5624{
5625 const struct mem_cgroup_threshold *_a = a;
5626 const struct mem_cgroup_threshold *_b = b;
5627
5628 if (_a->threshold > _b->threshold)
5629 return 1;
5630
5631 if (_a->threshold < _b->threshold)
5632 return -1;
5633
5634 return 0;
5635}
5636
5637static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5638{
5639 struct mem_cgroup_eventfd_list *ev;
5640
5641 list_for_each_entry(ev, &memcg->oom_notify, list)
5642 eventfd_signal(ev->eventfd, 1);
5643 return 0;
5644}
5645
5646static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5647{
5648 struct mem_cgroup *iter;
5649
5650 for_each_mem_cgroup_tree(iter, memcg)
5651 mem_cgroup_oom_notify_cb(iter);
5652}
5653
5654static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
5655 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5656{
5657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5658 struct mem_cgroup_thresholds *thresholds;
5659 struct mem_cgroup_threshold_ary *new;
5660 enum res_type type = MEMFILE_TYPE(cft->private);
5661 u64 threshold, usage;
5662 int i, size, ret;
5663
5664 ret = res_counter_memparse_write_strategy(args, &threshold);
5665 if (ret)
5666 return ret;
5667
5668 mutex_lock(&memcg->thresholds_lock);
5669
5670 if (type == _MEM)
5671 thresholds = &memcg->thresholds;
5672 else if (type == _MEMSWAP)
5673 thresholds = &memcg->memsw_thresholds;
5674 else
5675 BUG();
5676
5677 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5678
5679
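/* Check for thresholds crossed before the new one is added. */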
5680 if (thresholds->primary)
5681 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5682
5683 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5684
5685
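/*
 * The array is replaced RCU-style: build a sorted copy containing the new
 * entry, publish it with rcu_assign_pointer() and keep the old array around
 * as a spare buffer.
 */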
5686 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5687 GFP_KERNEL);
5688 if (!new) {
5689 ret = -ENOMEM;
5690 goto unlock;
5691 }
5692 new->size = size;
5693
5694
5695 if (thresholds->primary) {
5696 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5697 sizeof(struct mem_cgroup_threshold));
5698 }
5699
5700
5701 new->entries[size - 1].eventfd = eventfd;
5702 new->entries[size - 1].threshold = threshold;
5703
5704
5705 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5706 compare_thresholds, NULL);
5707
5708
5709 new->current_threshold = -1;
5710 for (i = 0; i < size; i++) {
5711 if (new->entries[i].threshold <= usage) {
5712
5713
5714
5715
5716
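/*
 * new->current_threshold is not visible to readers until the
 * rcu_assign_pointer() below, so it is safe to adjust it here.
 */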
5717 ++new->current_threshold;
5718 } else
5719 break;
5720 }
5721
5722
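/* Free the old spare buffer and keep the old primary as the new spare. */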
5723 kfree(thresholds->spare);
5724 thresholds->spare = thresholds->primary;
5725
5726 rcu_assign_pointer(thresholds->primary, new);
5727
5728
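/* Make sure no reader still uses the old array before it can be reused as spare. */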
5729 synchronize_rcu();
5730
5731unlock:
5732 mutex_unlock(&memcg->thresholds_lock);
5733
5734 return ret;
5735}
5736
5737static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
5738 struct cftype *cft, struct eventfd_ctx *eventfd)
5739{
5740 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5741 struct mem_cgroup_thresholds *thresholds;
5742 struct mem_cgroup_threshold_ary *new;
5743 enum res_type type = MEMFILE_TYPE(cft->private);
5744 u64 usage;
5745 int i, j, size;
5746
5747 mutex_lock(&memcg->thresholds_lock);
5748 if (type == _MEM)
5749 thresholds = &memcg->thresholds;
5750 else if (type == _MEMSWAP)
5751 thresholds = &memcg->memsw_thresholds;
5752 else
5753 BUG();
5754
5755 if (!thresholds->primary)
5756 goto unlock;
5757
5758 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5759
5760
5761 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5762
5763
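/* Count how many entries remain once this eventfd is removed. */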
5764 size = 0;
5765 for (i = 0; i < thresholds->primary->size; i++) {
5766 if (thresholds->primary->entries[i].eventfd != eventfd)
5767 size++;
5768 }
5769
5770 new = thresholds->spare;
5771
5772
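/* If no thresholds are left, drop the array altogether. */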
5773 if (!size) {
5774 kfree(new);
5775 new = NULL;
5776 goto swap_buffers;
5777 }
5778
5779 new->size = size;
5780
5781
5782 new->current_threshold = -1;
5783 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5784 if (thresholds->primary->entries[i].eventfd == eventfd)
5785 continue;
5786
5787 new->entries[j] = thresholds->primary->entries[i];
5788 if (new->entries[j].threshold <= usage) {
5789
5790
5791
5792
5793
5794 ++new->current_threshold;
5795 }
5796 j++;
5797 }
5798
5799swap_buffers:
5800
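/* Swap primary and spare buffers and publish the new array. */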
5801 thresholds->spare = thresholds->primary;
5802
5803 if (!new) {
5804 kfree(thresholds->spare);
5805 thresholds->spare = NULL;
5806 }
5807
5808 rcu_assign_pointer(thresholds->primary, new);
5809
5810
5811 synchronize_rcu();
5812unlock:
5813 mutex_unlock(&memcg->thresholds_lock);
5814}
5815
5816static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5817 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5818{
5819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5820 struct mem_cgroup_eventfd_list *event;
5821 enum res_type type = MEMFILE_TYPE(cft->private);
5822
5823 BUG_ON(type != _OOM_TYPE);
5824 event = kmalloc(sizeof(*event), GFP_KERNEL);
5825 if (!event)
5826 return -ENOMEM;
5827
5828 spin_lock(&memcg_oom_lock);
5829
5830 event->eventfd = eventfd;
5831 list_add(&event->list, &memcg->oom_notify);
5832
5833
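/* If we are already under OOM, notify the new listener immediately. */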
5834 if (atomic_read(&memcg->under_oom))
5835 eventfd_signal(eventfd, 1);
5836 spin_unlock(&memcg_oom_lock);
5837
5838 return 0;
5839}
5840
5841static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5842 struct cftype *cft, struct eventfd_ctx *eventfd)
5843{
5844 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5845 struct mem_cgroup_eventfd_list *ev, *tmp;
5846 enum res_type type = MEMFILE_TYPE(cft->private);
5847
5848 BUG_ON(type != _OOM_TYPE);
5849
5850 spin_lock(&memcg_oom_lock);
5851
5852 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5853 if (ev->eventfd == eventfd) {
5854 list_del(&ev->list);
5855 kfree(ev);
5856 }
5857 }
5858
5859 spin_unlock(&memcg_oom_lock);
5860}
5861
5862static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
5863 struct cftype *cft, struct cgroup_map_cb *cb)
5864{
5865 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5866
5867 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5868
5869 if (atomic_read(&memcg->under_oom))
5870 cb->fill(cb, "under_oom", 1);
5871 else
5872 cb->fill(cb, "under_oom", 0);
5873 return 0;
5874}
5875
5876static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5877 struct cftype *cft, u64 val)
5878{
5879 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5880 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5881
5882
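/* The root cgroup cannot be configured, and only 0 or 1 are valid values. */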
5883 if (!parent || !((val == 0) || (val == 1)))
5884 return -EINVAL;
5885
5886 mutex_lock(&memcg_create_mutex);
5887
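/* As with swappiness, refuse once we are part of a hierarchy or have children. */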
5888 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5889 mutex_unlock(&memcg_create_mutex);
5890 return -EINVAL;
5891 }
5892 memcg->oom_kill_disable = val;
5893 if (!val)
5894 memcg_oom_recover(memcg);
5895 mutex_unlock(&memcg_create_mutex);
5896 return 0;
5897}
5898
5899#ifdef CONFIG_MEMCG_KMEM
5900static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5901{
5902 int ret;
5903
5904 memcg->kmemcg_id = -1;
5905 ret = memcg_propagate_kmem(memcg);
5906 if (ret)
5907 return ret;
5908
5909 return mem_cgroup_sockets_init(memcg, ss);
5910}
5911
5912static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5913{
5914 mem_cgroup_sockets_destroy(memcg);
5915}
5916
5917static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5918{
5919 if (!memcg_kmem_is_active(memcg))
5920 return;
5921
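/*
 * Kernel memory charges can outlive the cgroup, so take an extra css
 * reference and mark the memcg dead.  If kmem usage is already zero the
 * reference is dropped right here; otherwise it is expected to be released
 * once the remaining kmem charges go away (see the kmem uncharge path
 * elsewhere in this file).
 */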
5940 css_get(&memcg->css);
5941
5942 memcg_kmem_mark_dead(memcg);
5943
5944 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5945 return;
5946
5947 if (memcg_kmem_test_and_clear_dead(memcg))
5948 css_put(&memcg->css);
5949}
5950#else
5951static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5952{
5953 return 0;
5954}
5955
5956static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5957{
5958}
5959
5960static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5961{
5962}
5963#endif
5964
5965static struct cftype mem_cgroup_files[] = {
5966 {
5967 .name = "usage_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5969 .read = mem_cgroup_read,
5970 .register_event = mem_cgroup_usage_register_event,
5971 .unregister_event = mem_cgroup_usage_unregister_event,
5972 },
5973 {
5974 .name = "max_usage_in_bytes",
5975 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5976 .trigger = mem_cgroup_reset,
5977 .read = mem_cgroup_read,
5978 },
5979 {
5980 .name = "limit_in_bytes",
5981 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5982 .write_string = mem_cgroup_write,
5983 .read = mem_cgroup_read,
5984 },
5985 {
5986 .name = "soft_limit_in_bytes",
5987 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5988 .write_string = mem_cgroup_write,
5989 .read = mem_cgroup_read,
5990 },
5991 {
5992 .name = "failcnt",
5993 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5994 .trigger = mem_cgroup_reset,
5995 .read = mem_cgroup_read,
5996 },
5997 {
5998 .name = "stat",
5999 .read_seq_string = memcg_stat_show,
6000 },
6001 {
6002 .name = "force_empty",
6003 .trigger = mem_cgroup_force_empty_write,
6004 },
6005 {
6006 .name = "use_hierarchy",
6007 .flags = CFTYPE_INSANE,
6008 .write_u64 = mem_cgroup_hierarchy_write,
6009 .read_u64 = mem_cgroup_hierarchy_read,
6010 },
6011 {
6012 .name = "swappiness",
6013 .read_u64 = mem_cgroup_swappiness_read,
6014 .write_u64 = mem_cgroup_swappiness_write,
6015 },
6016 {
6017 .name = "move_charge_at_immigrate",
6018 .read_u64 = mem_cgroup_move_charge_read,
6019 .write_u64 = mem_cgroup_move_charge_write,
6020 },
6021 {
6022 .name = "oom_control",
6023 .read_map = mem_cgroup_oom_control_read,
6024 .write_u64 = mem_cgroup_oom_control_write,
6025 .register_event = mem_cgroup_oom_register_event,
6026 .unregister_event = mem_cgroup_oom_unregister_event,
6027 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6028 },
6029 {
6030 .name = "pressure_level",
6031 .register_event = vmpressure_register_event,
6032 .unregister_event = vmpressure_unregister_event,
6033 },
6034#ifdef CONFIG_NUMA
6035 {
6036 .name = "numa_stat",
6037 .read_seq_string = memcg_numa_stat_show,
6038 },
6039#endif
6040#ifdef CONFIG_MEMCG_KMEM
6041 {
6042 .name = "kmem.limit_in_bytes",
6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6044 .write_string = mem_cgroup_write,
6045 .read = mem_cgroup_read,
6046 },
6047 {
6048 .name = "kmem.usage_in_bytes",
6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6050 .read = mem_cgroup_read,
6051 },
6052 {
6053 .name = "kmem.failcnt",
6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6055 .trigger = mem_cgroup_reset,
6056 .read = mem_cgroup_read,
6057 },
6058 {
6059 .name = "kmem.max_usage_in_bytes",
6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6061 .trigger = mem_cgroup_reset,
6062 .read = mem_cgroup_read,
6063 },
6064#ifdef CONFIG_SLABINFO
6065 {
6066 .name = "kmem.slabinfo",
6067 .read_seq_string = mem_cgroup_slabinfo_read,
6068 },
6069#endif
6070#endif
6071 { },
6072};
6073
6074#ifdef CONFIG_MEMCG_SWAP
6075static struct cftype memsw_cgroup_files[] = {
6076 {
6077 .name = "memsw.usage_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6079 .read = mem_cgroup_read,
6080 .register_event = mem_cgroup_usage_register_event,
6081 .unregister_event = mem_cgroup_usage_unregister_event,
6082 },
6083 {
6084 .name = "memsw.max_usage_in_bytes",
6085 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6086 .trigger = mem_cgroup_reset,
6087 .read = mem_cgroup_read,
6088 },
6089 {
6090 .name = "memsw.limit_in_bytes",
6091 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6092 .write_string = mem_cgroup_write,
6093 .read = mem_cgroup_read,
6094 },
6095 {
6096 .name = "memsw.failcnt",
6097 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6098 .trigger = mem_cgroup_reset,
6099 .read = mem_cgroup_read,
6100 },
6101 { },
6102};
6103#endif
6104static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6105{
6106 struct mem_cgroup_per_node *pn;
6107 struct mem_cgroup_per_zone *mz;
6108 int zone, tmp = node;
6109
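/*
 * This is called for every possible node; a node without normal memory
 * cannot satisfy a node-local allocation, so fall back to any node (-1).
 */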
6117 if (!node_state(node, N_NORMAL_MEMORY))
6118 tmp = -1;
6119 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6120 if (!pn)
6121 return 1;
6122
6123 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6124 mz = &pn->zoneinfo[zone];
6125 lruvec_init(&mz->lruvec);
6126 mz->usage_in_excess = 0;
6127 mz->on_tree = false;
6128 mz->memcg = memcg;
6129 }
6130 memcg->nodeinfo[node] = pn;
6131 return 0;
6132}
6133
6134static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6135{
6136 kfree(memcg->nodeinfo[node]);
6137}
6138
6139static struct mem_cgroup *mem_cgroup_alloc(void)
6140{
6141 struct mem_cgroup *memcg;
6142 size_t size = memcg_size();
6143
6144
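/* struct mem_cgroup can exceed a page on big-NUMA configs; use vzalloc() then. */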
6145 if (size < PAGE_SIZE)
6146 memcg = kzalloc(size, GFP_KERNEL);
6147 else
6148 memcg = vzalloc(size);
6149
6150 if (!memcg)
6151 return NULL;
6152
6153 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6154 if (!memcg->stat)
6155 goto out_free;
6156 spin_lock_init(&memcg->pcp_counter_lock);
6157 return memcg;
6158
6159out_free:
6160 if (size < PAGE_SIZE)
6161 kfree(memcg);
6162 else
6163 vfree(memcg);
6164 return NULL;
6165}
6166
6167
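/*
 * Tear down a mem_cgroup: remove it from the soft-limit trees, free the
 * per-node/per-zone info and the per-cpu statistics, then free the structure
 * itself with kfree() or vfree() depending on how it was allocated.
 */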
6178static void __mem_cgroup_free(struct mem_cgroup *memcg)
6179{
6180 int node;
6181 size_t size = memcg_size();
6182
6183 mem_cgroup_remove_from_trees(memcg);
6184
6185 for_each_node(node)
6186 free_mem_cgroup_per_zone_info(memcg, node);
6187
6188 free_percpu(memcg->stat);
6189
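/*
 * Disarm any static keys (e.g. kmem/socket accounting) this memcg may have
 * armed; this is deliberately left until the end of the teardown.
 */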
6201 disarm_static_keys(memcg);
6202 if (size < PAGE_SIZE)
6203 kfree(memcg);
6204 else
6205 vfree(memcg);
6206}
6207
6208
6209
6210
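/* Return the parent memcg, or NULL for the root (whose res_counter has no parent). */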
6211struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6212{
6213 if (!memcg->res.parent)
6214 return NULL;
6215 return mem_cgroup_from_res_counter(memcg->res.parent, res);
6216}
6217EXPORT_SYMBOL(parent_mem_cgroup);
6218
6219static void __init mem_cgroup_soft_limit_tree_init(void)
6220{
6221 struct mem_cgroup_tree_per_node *rtpn;
6222 struct mem_cgroup_tree_per_zone *rtpz;
6223 int tmp, node, zone;
6224
6225 for_each_node(node) {
6226 tmp = node;
6227 if (!node_state(node, N_NORMAL_MEMORY))
6228 tmp = -1;
6229 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6230 BUG_ON(!rtpn);
6231
6232 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6233
6234 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6235 rtpz = &rtpn->rb_tree_per_zone[zone];
6236 rtpz->rb_root = RB_ROOT;
6237 spin_lock_init(&rtpz->lock);
6238 }
6239 }
6240}
6241
6242static struct cgroup_subsys_state * __ref
6243mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6244{
6245 struct mem_cgroup *memcg;
6246 long error = -ENOMEM;
6247 int node;
6248
6249 memcg = mem_cgroup_alloc();
6250 if (!memcg)
6251 return ERR_PTR(error);
6252
6253 for_each_node(node)
6254 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6255 goto free_out;
6256
6257
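/* The root cgroup: its res_counters have no parent. */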
6258 if (parent_css == NULL) {
6259 root_mem_cgroup = memcg;
6260 res_counter_init(&memcg->res, NULL);
6261 res_counter_init(&memcg->memsw, NULL);
6262 res_counter_init(&memcg->kmem, NULL);
6263 }
6264
6265 memcg->last_scanned_node = MAX_NUMNODES;
6266 INIT_LIST_HEAD(&memcg->oom_notify);
6267 memcg->move_charge_at_immigrate = 0;
6268 mutex_init(&memcg->thresholds_lock);
6269 spin_lock_init(&memcg->move_lock);
6270 vmpressure_init(&memcg->vmpressure);
6271
6272 return &memcg->css;
6273
6274free_out:
6275 __mem_cgroup_free(memcg);
6276 return ERR_PTR(error);
6277}
6278
6279static int
6280mem_cgroup_css_online(struct cgroup_subsys_state *css)
6281{
6282 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6283 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6284 int error = 0;
6285
6286 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6287 return -ENOSPC;
6288
6289 if (!parent)
6290 return 0;
6291
6292 mutex_lock(&memcg_create_mutex);
6293
6294 memcg->use_hierarchy = parent->use_hierarchy;
6295 memcg->oom_kill_disable = parent->oom_kill_disable;
6296 memcg->swappiness = mem_cgroup_swappiness(parent);
6297
6298 if (parent->use_hierarchy) {
6299 res_counter_init(&memcg->res, &parent->res);
6300 res_counter_init(&memcg->memsw, &parent->memsw);
6301 res_counter_init(&memcg->kmem, &parent->kmem);
6302
6303
6304
6305
6306
6307 } else {
6308 res_counter_init(&memcg->res, NULL);
6309 res_counter_init(&memcg->memsw, NULL);
6310 res_counter_init(&memcg->kmem, NULL);
6311
6312
6313
6314
6315
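/*
 * A deeper hierarchy with use_hierarchy == false makes little sense, so
 * let the cgroup core know about this unexpected state.
 */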
6316 if (parent != root_mem_cgroup)
6317 mem_cgroup_subsys.broken_hierarchy = true;
6318 }
6319
6320 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6321 mutex_unlock(&memcg_create_mutex);
6322 return error;
6323}
6324
6325
6326
6327
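/* Announce to all ancestors that a group from their hierarchy is going away. */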
6328static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6329{
6330 struct mem_cgroup *parent = memcg;
6331
6332 while ((parent = parent_mem_cgroup(parent)))
6333 mem_cgroup_iter_invalidate(parent);
6334
6335
6336
6337
6338
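/*
 * When the root is not hierarchical it is not reached by the parent walk
 * above, so its cached iterators must be invalidated explicitly.
 */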
6339 if (!root_mem_cgroup->use_hierarchy)
6340 mem_cgroup_iter_invalidate(root_mem_cgroup);
6341}
6342
6343static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6344{
6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6346
6347 kmem_cgroup_css_offline(memcg);
6348
6349 mem_cgroup_invalidate_reclaim_iterators(memcg);
6350 mem_cgroup_reparent_charges(memcg);
6351 mem_cgroup_destroy_all_caches(memcg);
6352 vmpressure_cleanup(&memcg->vmpressure);
6353}
6354
6355static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6356{
6357 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6358
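/*
 * css_offline() already reparented charges, but charging from the swapin
 * path looks the memcg up from the swapout record rather than from a task
 * and can therefore race with offlining, leaving pages charged to this css.
 * Reparent once more before freeing.
 */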
6393 mem_cgroup_reparent_charges(memcg);
6394
6395 memcg_destroy_kmem(memcg);
6396 __mem_cgroup_free(memcg);
6397}
6398
6399#ifdef CONFIG_MMU
6400
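/* Handlers for moving charges at task migration ("move_charge_at_immigrate"). */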
6401#define PRECHARGE_COUNT_AT_ONCE 256
6402static int mem_cgroup_do_precharge(unsigned long count)
6403{
6404 int ret = 0;
6405 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6406 struct mem_cgroup *memcg = mc.to;
6407
6408 if (mem_cgroup_is_root(memcg)) {
6409 mc.precharge += count;
6410
6411 return ret;
6412 }
6413
6414 if (count > 1) {
6415 struct res_counter *dummy;
6416
6417
6418
6419
6420
6421
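/* Try to charge the whole batch at once; on failure fall back to page-by-page below. */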
6422 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6423 goto one_by_one;
6424 if (do_swap_account && res_counter_charge(&memcg->memsw,
6425 PAGE_SIZE * count, &dummy)) {
6426 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6427 goto one_by_one;
6428 }
6429 mc.precharge += count;
6430 return ret;
6431 }
6432one_by_one:
6433
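/* Charge one page at a time, rescheduling every PRECHARGE_COUNT_AT_ONCE pages. */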
6434 while (count--) {
6435 if (signal_pending(current)) {
6436 ret = -EINTR;
6437 break;
6438 }
6439 if (!batch_count--) {
6440 batch_count = PRECHARGE_COUNT_AT_ONCE;
6441 cond_resched();
6442 }
6443 ret = __mem_cgroup_try_charge(NULL,
6444 GFP_KERNEL, 1, &memcg, false);
6445 if (ret)
6447 return ret;
6448 mc.precharge++;
6449 }
6450 return ret;
6451}
6452
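/* A charge-moving target found at a pte is either a page or a swap entry. */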
6471union mc_target {
6472 struct page *page;
6473 swp_entry_t ent;
6474};
6475
6476enum mc_target_type {
6477 MC_TARGET_NONE = 0,
6478 MC_TARGET_PAGE,
6479 MC_TARGET_SWAP,
6480};
6481
6482static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6483 unsigned long addr, pte_t ptent)
6484{
6485 struct page *page = vm_normal_page(vma, addr, ptent);
6486
6487 if (!page || !page_mapped(page))
6488 return NULL;
6489 if (PageAnon(page)) {
6490
6491 if (!move_anon())
6492 return NULL;
6493 } else if (!move_file())
6495 return NULL;
6496 if (!get_page_unless_zero(page))
6497 return NULL;
6498
6499 return page;
6500}
6501
6502#ifdef CONFIG_SWAP
6503static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6504 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6505{
6506 struct page *page = NULL;
6507 swp_entry_t ent = pte_to_swp_entry(ptent);
6508
6509 if (!move_anon() || non_swap_entry(ent))
6510 return NULL;
6511
6512
6513
6514
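/*
 * Use find_get_page() on the swap address space rather than
 * lookup_swap_cache(), which would also update swap-cache statistics.
 */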
6515 page = find_get_page(swap_address_space(ent), ent.val);
6516 if (do_swap_account)
6517 entry->val = ent.val;
6518
6519 return page;
6520}
6521#else
6522static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6523 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6524{
6525 return NULL;
6526}
6527#endif
6528
6529static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6530 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6531{
6532 struct page *page = NULL;
6533 struct address_space *mapping;
6534 pgoff_t pgoff;
6535
6536 if (!vma->vm_file)
6537 return NULL;
6538 if (!move_file())
6539 return NULL;
6540
6541 mapping = vma->vm_file->f_mapping;
6542 if (pte_none(ptent))
6543 pgoff = linear_page_index(vma, addr);
6544 else
6545 pgoff = pte_to_pgoff(ptent);
6546
6547
6548 page = find_get_page(mapping, pgoff);
6549
6550#ifdef CONFIG_SWAP
6551
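/* shmem/tmpfs may keep the page out on swap; the radix tree then holds a swap entry. */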
6552 if (radix_tree_exceptional_entry(page)) {
6553 swp_entry_t swap = radix_to_swp_entry(page);
6554 if (do_swap_account)
6555 *entry = swap;
6556 page = find_get_page(swap_address_space(swap), swap.val);
6557 }
6558#endif
6559 return page;
6560}
6561
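/*
 * get_mctgt_type - classify a pte as a move-charge target.
 *
 * Returns MC_TARGET_NONE if the pte is not a target; MC_TARGET_PAGE if it
 * refers to a page charged to mc.from (stored in target->page with a
 * reference held, when target is non-NULL); MC_TARGET_SWAP if it is a swap
 * entry accounted to mc.from (stored in target->ent).
 * Called with the pte lock held.
 */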
6562static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6563 unsigned long addr, pte_t ptent, union mc_target *target)
6564{
6565 struct page *page = NULL;
6566 struct page_cgroup *pc;
6567 enum mc_target_type ret = MC_TARGET_NONE;
6568 swp_entry_t ent = { .val = 0 };
6569
6570 if (pte_present(ptent))
6571 page = mc_handle_present_pte(vma, addr, ptent);
6572 else if (is_swap_pte(ptent))
6573 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6574 else if (pte_none(ptent) || pte_file(ptent))
6575 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6576
6577 if (!page && !ent.val)
6578 return ret;
6579 if (page) {
6580 pc = lookup_page_cgroup(page);
6581
6582
6583
6584
6585
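/*
 * Only a loose check without the page_cgroup lock here;
 * mem_cgroup_move_account() revalidates pc under the lock.
 */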
6586 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6587 ret = MC_TARGET_PAGE;
6588 if (target)
6589 target->page = page;
6590 }
6591 if (!ret || !target)
6592 put_page(page);
6593 }
6594
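/* A swap entry with no charged page present: check the swap cgroup record. */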
6595 if (ent.val && !ret &&
6596 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6597 ret = MC_TARGET_SWAP;
6598 if (target)
6599 target->ent = ent;
6600 }
6601 return ret;
6602}
6603
6604#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6605
6606
6607
6608
6609
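/*
 * THP variant: only anonymous pages are considered, since file and swap
 * pages are not transparent-huge here.  The caller must ensure the pmd is
 * a stable transparent huge pmd.
 */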
6610static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6611 unsigned long addr, pmd_t pmd, union mc_target *target)
6612{
6613 struct page *page = NULL;
6614 struct page_cgroup *pc;
6615 enum mc_target_type ret = MC_TARGET_NONE;
6616
6617 page = pmd_page(pmd);
6618 VM_BUG_ON(!page || !PageHead(page));
6619 if (!move_anon())
6620 return ret;
6621 pc = lookup_page_cgroup(page);
6622 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6623 ret = MC_TARGET_PAGE;
6624 if (target) {
6625 get_page(page);
6626 target->page = page;
6627 }
6628 }
6629 return ret;
6630}
6631#else
6632static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6633 unsigned long addr, pmd_t pmd, union mc_target *target)
6634{
6635 return MC_TARGET_NONE;
6636}
6637#endif
6638
6639static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6640 unsigned long addr, unsigned long end,
6641 struct mm_walk *walk)
6642{
6643 struct vm_area_struct *vma = walk->private;
6644 pte_t *pte;
6645 spinlock_t *ptl;
6646
6647 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6648 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6649 mc.precharge += HPAGE_PMD_NR;
6650 spin_unlock(ptl);
6651 return 0;
6652 }
6653
6654 if (pmd_trans_unstable(pmd))
6655 return 0;
6656 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6657 for (; addr != end; pte++, addr += PAGE_SIZE)
6658 if (get_mctgt_type(vma, addr, *pte, NULL))
6659 mc.precharge++;
6660 pte_unmap_unlock(pte - 1, ptl);
6661 cond_resched();
6662
6663 return 0;
6664}
6665
6666static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6667{
6668 unsigned long precharge;
6669 struct vm_area_struct *vma;
6670
6671 down_read(&mm->mmap_sem);
6672 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6673 struct mm_walk mem_cgroup_count_precharge_walk = {
6674 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6675 .mm = mm,
6676 .private = vma,
6677 };
6678 if (is_vm_hugetlb_page(vma))
6679 continue;
6680 walk_page_range(vma->vm_start, vma->vm_end,
6681 &mem_cgroup_count_precharge_walk);
6682 }
6683 up_read(&mm->mmap_sem);
6684
6685 precharge = mc.precharge;
6686 mc.precharge = 0;
6687
6688 return precharge;
6689}
6690
6691static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6692{
6693 unsigned long precharge = mem_cgroup_count_precharge(mm);
6694
6695 VM_BUG_ON(mc.moving_task);
6696 mc.moving_task = current;
6697 return mem_cgroup_do_precharge(precharge);
6698}
6699
6700
6701static void __mem_cgroup_clear_mc(void)
6702{
6703 struct mem_cgroup *from = mc.from;
6704 struct mem_cgroup *to = mc.to;
6705 int i;
6706
6707
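/* Drop any precharges against mc.to that were never consumed. */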
6708 if (mc.precharge) {
6709 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6710 mc.precharge = 0;
6711 }
6712
6713
6714
6715
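/*
 * mem_cgroup_move_account() does not uncharge mc.from, so uncharge the
 * pages that were actually moved here.
 */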
6716 if (mc.moved_charge) {
6717 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6718 mc.moved_charge = 0;
6719 }
6720
6721 if (mc.moved_swap) {
6722
6723 if (!mem_cgroup_is_root(mc.from))
6724 res_counter_uncharge(&mc.from->memsw,
6725 PAGE_SIZE * mc.moved_swap);
6726
6727 for (i = 0; i < mc.moved_swap; i++)
6728 css_put(&mc.from->css);
6729
6730 if (!mem_cgroup_is_root(mc.to)) {
6731
6732
6733
6734
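/* Both to->res and to->memsw were charged for the moved swap; give to->res back. */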
6735 res_counter_uncharge(&mc.to->res,
6736 PAGE_SIZE * mc.moved_swap);
6737 }
6738
6739 mc.moved_swap = 0;
6740 }
6741 memcg_oom_recover(from);
6742 memcg_oom_recover(to);
6743 wake_up_all(&mc.waitq);
6744}
6745
6746static void mem_cgroup_clear_mc(void)
6747{
6748 struct mem_cgroup *from = mc.from;
6749
6750
6751
6752
6753
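/* moving_task must be cleared before __mem_cgroup_clear_mc() wakes up the waiters. */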
6754 mc.moving_task = NULL;
6755 __mem_cgroup_clear_mc();
6756 spin_lock(&mc.lock);
6757 mc.from = NULL;
6758 mc.to = NULL;
6759 spin_unlock(&mc.lock);
6760 mem_cgroup_end_move(from);
6761}
6762
6763static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6764 struct cgroup_taskset *tset)
6765{
6766 struct task_struct *p = cgroup_taskset_first(tset);
6767 int ret = 0;
6768 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6769 unsigned long move_charge_at_immigrate;
6770
6771
6772
6773
6774
6775
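/* Read move_charge_at_immigrate once; we are committed to whatever it is now. */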
6776 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6777 if (move_charge_at_immigrate) {
6778 struct mm_struct *mm;
6779 struct mem_cgroup *from = mem_cgroup_from_task(p);
6780
6781 VM_BUG_ON(from == memcg);
6782
6783 mm = get_task_mm(p);
6784 if (!mm)
6785 return 0;
6786
6787 if (mm->owner == p) {
6788 VM_BUG_ON(mc.from);
6789 VM_BUG_ON(mc.to);
6790 VM_BUG_ON(mc.precharge);
6791 VM_BUG_ON(mc.moved_charge);
6792 VM_BUG_ON(mc.moved_swap);
6793 mem_cgroup_start_move(from);
6794 spin_lock(&mc.lock);
6795 mc.from = from;
6796 mc.to = memcg;
6797 mc.immigrate_flags = move_charge_at_immigrate;
6798 spin_unlock(&mc.lock);
6799
6800
6801 ret = mem_cgroup_precharge_mc(mm);
6802 if (ret)
6803 mem_cgroup_clear_mc();
6804 }
6805 mmput(mm);
6806 }
6807 return ret;
6808}
6809
6810static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6811 struct cgroup_taskset *tset)
6812{
6813 mem_cgroup_clear_mc();
6814}
6815
6816static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6817 unsigned long addr, unsigned long end,
6818 struct mm_walk *walk)
6819{
6820 int ret = 0;
6821 struct vm_area_struct *vma = walk->private;
6822 pte_t *pte;
6823 spinlock_t *ptl;
6824 enum mc_target_type target_type;
6825 union mc_target target;
6826 struct page *page;
6827 struct page_cgroup *pc;
6828
6829
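/*
 * No compound_lock() is taken here, yet there should be no race with THP
 * splitting: if pmd_trans_huge_lock() returns 1 the pmd is not under
 * splitting, and a split started afterwards has to wait for the page table
 * lock we hold.
 */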
6839 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6840 if (mc.precharge < HPAGE_PMD_NR) {
6841 spin_unlock(ptl);
6842 return 0;
6843 }
6844 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6845 if (target_type == MC_TARGET_PAGE) {
6846 page = target.page;
6847 if (!isolate_lru_page(page)) {
6848 pc = lookup_page_cgroup(page);
6849 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6850 pc, mc.from, mc.to)) {
6851 mc.precharge -= HPAGE_PMD_NR;
6852 mc.moved_charge += HPAGE_PMD_NR;
6853 }
6854 putback_lru_page(page);
6855 }
6856 put_page(page);
6857 }
6858 spin_unlock(ptl);
6859 return 0;
6860 }
6861
6862 if (pmd_trans_unstable(pmd))
6863 return 0;
6864retry:
6865 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6866 for (; addr != end; addr += PAGE_SIZE) {
6867 pte_t ptent = *(pte++);
6868 swp_entry_t ent;
6869
6870 if (!mc.precharge)
6871 break;
6872
6873 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6874 case MC_TARGET_PAGE:
6875 page = target.page;
6876 if (isolate_lru_page(page))
6877 goto put;
6878 pc = lookup_page_cgroup(page);
6879 if (!mem_cgroup_move_account(page, 1, pc,
6880 mc.from, mc.to)) {
6881 mc.precharge--;
6882
6883 mc.moved_charge++;
6884 }
6885 putback_lru_page(page);
6886put:
6887 put_page(page);
6888 break;
6889 case MC_TARGET_SWAP:
6890 ent = target.ent;
6891 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6892 mc.precharge--;
6893
6894 mc.moved_swap++;
6895 }
6896 break;
6897 default:
6898 break;
6899 }
6900 }
6901 pte_unmap_unlock(pte - 1, ptl);
6902 cond_resched();
6903
6904 if (addr != end) {
6905
6906
6907
6908
6909
6910
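/*
 * We ran out of precharges in the middle of the range; try to precharge a
 * single page and, if that succeeds, retry the rest of the range.
 */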
6911 ret = mem_cgroup_do_precharge(1);
6912 if (!ret)
6913 goto retry;
6914 }
6915
6916 return ret;
6917}
6918
6919static void mem_cgroup_move_charge(struct mm_struct *mm)
6920{
6921 struct vm_area_struct *vma;
6922
6923 lru_add_drain_all();
6924retry:
6925 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6926
6927
6928
6929
6930
6931
6932
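/*
 * Someone holding mmap_sem may be waiting on mc.waitq; cancel the remaining
 * precharges to wake them up, then retry.  Moving charge is best-effort, so
 * losing some precharges here is acceptable.
 */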
6933 __mem_cgroup_clear_mc();
6934 cond_resched();
6935 goto retry;
6936 }
6937 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6938 int ret;
6939 struct mm_walk mem_cgroup_move_charge_walk = {
6940 .pmd_entry = mem_cgroup_move_charge_pte_range,
6941 .mm = mm,
6942 .private = vma,
6943 };
6944 if (is_vm_hugetlb_page(vma))
6945 continue;
6946 ret = walk_page_range(vma->vm_start, vma->vm_end,
6947 &mem_cgroup_move_charge_walk);
6948 if (ret)
6949
6950
6951
6952
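/* A non-zero return means we could not precharge any further; abandon the walk. */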
6953 break;
6954 }
6955 up_read(&mm->mmap_sem);
6956}
6957
6958static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6959 struct cgroup_taskset *tset)
6960{
6961 struct task_struct *p = cgroup_taskset_first(tset);
6962 struct mm_struct *mm = get_task_mm(p);
6963
6964 if (mm) {
6965 if (mc.to)
6966 mem_cgroup_move_charge(mm);
6967 mmput(mm);
6968 }
6969 if (mc.to)
6970 mem_cgroup_clear_mc();
6971}
6972#else
6973static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6974 struct cgroup_taskset *tset)
6975{
6976 return 0;
6977}
6978static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6979 struct cgroup_taskset *tset)
6980{
6981}
6982static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6983 struct cgroup_taskset *tset)
6984{
6985}
6986#endif
6987
6988
6989
6990
6991
6992static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6993{
6994
6995
6996
6997
6998
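/*
 * With sane_behavior, use_hierarchy is forced on.  The root has no children
 * at bind time, so setting it on the root memcg is enough.
 */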
6999 if (cgroup_sane_behavior(root_css->cgroup))
7000 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7001}
7002
7003struct cgroup_subsys mem_cgroup_subsys = {
7004 .name = "memory",
7005 .subsys_id = mem_cgroup_subsys_id,
7006 .css_alloc = mem_cgroup_css_alloc,
7007 .css_online = mem_cgroup_css_online,
7008 .css_offline = mem_cgroup_css_offline,
7009 .css_free = mem_cgroup_css_free,
7010 .can_attach = mem_cgroup_can_attach,
7011 .cancel_attach = mem_cgroup_cancel_attach,
7012 .attach = mem_cgroup_move_task,
7013 .bind = mem_cgroup_bind,
7014 .base_cftypes = mem_cgroup_files,
7015 .early_init = 0,
7016};
7017
7018#ifdef CONFIG_MEMCG_SWAP
7019static int __init enable_swap_account(char *s)
7020{
7021 if (!strcmp(s, "1"))
7022 really_do_swap_account = 1;
7023 else if (!strcmp(s, "0"))
7024 really_do_swap_account = 0;
7025 return 1;
7026}
7027__setup("swapaccount=", enable_swap_account);
7028
7029static void __init memsw_file_init(void)
7030{
7031 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
7032}
7033
7034static void __init enable_swap_cgroup(void)
7035{
7036 if (!mem_cgroup_disabled() && really_do_swap_account) {
7037 do_swap_account = 1;
7038 memsw_file_init();
7039 }
7040}
7041
7042#else
7043static void __init enable_swap_cgroup(void)
7044{
7045}
7046#endif
7047
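/*
 * subsys_initcall() for the memory controller: register the CPU hotplug
 * notifier, enable swap accounting when configured, and initialize the
 * soft-limit trees and the per-cpu charge stock.  Everything that does not
 * depend on a particular mem_cgroup can be set up here.
 */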
7056static int __init mem_cgroup_init(void)
7057{
7058 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7059 enable_swap_cgroup();
7060 mem_cgroup_soft_limit_tree_init();
7061 memcg_stock_init();
7062 return 0;
7063}
7064subsys_initcall(mem_cgroup_init);
7065