/*
 * memcontrol.c - memory cgroup controller
 *
 * Per-cgroup accounting and limiting of page cache, anonymous memory,
 * swap and (optionally) kernel memory, plus soft-limit reclaim, memory
 * thresholds and OOM handling for the memory controller.
 */
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* Remembers the boot option ("swapaccount=") */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_RSS_HUGE,	/* # of anon transparent hugepages */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};
102
103static const char * const mem_cgroup_stat_names[] = {
104 "cache",
105 "rss",
106 "rss_huge",
107 "mapped_file",
108 "swap",
109};
110
111enum mem_cgroup_events_index {
112 MEM_CGROUP_EVENTS_PGPGIN,
113 MEM_CGROUP_EVENTS_PGPGOUT,
114 MEM_CGROUP_EVENTS_PGFAULT,
115 MEM_CGROUP_EVENTS_PGMAJFAULT,
116 MEM_CGROUP_EVENTS_NSTATS,
117};
118
119static const char * const mem_cgroup_events_names[] = {
120 "pgpgin",
121 "pgpgout",
122 "pgfault",
123 "pgmajfault",
124};
125
126static const char * const mem_cgroup_lru_names[] = {
127 "inactive_anon",
128 "active_anon",
129 "inactive_file",
130 "active_file",
131 "unevictable",
132};
133
/*
 * The per-memcg event counter (nr_page_events) is incremented by the number
 * of pages at every page-in/page-out.  It is used to trigger periodic work
 * (thresholds, soft-limit tree updates, NUMA info refresh) once it crosses
 * the per-target thresholds below.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024
149
struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member.  Only valid if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};
168
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
185
186struct mem_cgroup_per_node {
187 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
188};
189
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};
199
200struct mem_cgroup_tree_per_node {
201 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
202};
203
204struct mem_cgroup_tree {
205 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
206};
207
208static struct mem_cgroup_tree soft_limit_tree __read_mostly;
209
210struct mem_cgroup_threshold {
211 struct eventfd_ctx *eventfd;
212 u64 threshold;
213};
214
/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare array, kept preallocated so that removing a threshold never
	 * needs to allocate memory and therefore cannot fail.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
241
242static void mem_cgroup_threshold(struct mem_cgroup *memcg);
243static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
244
/*
 * The memory controller data structure.  One instance per memory cgroup;
 * it tracks charged pages via res_counters (memory, memory+swap, kernel
 * memory), per-cpu statistics, threshold/OOM notification state and
 * per-node/zone LRU information.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;

	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or in other synchronizations.
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};
348
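/*
 * struct mem_cgroup ends with a variable-size nodeinfo[] array, so the
 * allocation size depends on the number of possible nodes; memcg_size()
 * below computes it.
 */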
349static size_t memcg_size(void)
350{
351 return sizeof(struct mem_cgroup) +
352 nr_node_ids * sizeof(struct mem_cgroup_per_node);
353}

/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
365
366#ifdef CONFIG_MEMCG_KMEM
367static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
368{
369 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
370}
371
372static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
373{
374 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
375}
376
377static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
378{
379 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
380}
381
382static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
383{
384 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
385}
386
387static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
388{
389
390
391
392
393 smp_wmb();
394 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
395 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
396}
397
398static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
399{
400 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
401 &memcg->kmem_account_flags);
402}
403#endif

/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
417static struct move_charge_struct {
418 spinlock_t lock;
419 struct mem_cgroup *from;
420 struct mem_cgroup *to;
421 unsigned long immigrate_flags;
422 unsigned long precharge;
423 unsigned long moved_charge;
424 unsigned long moved_swap;
425 struct task_struct *moving_task;
426 wait_queue_head_t waitq;
427} mc = {
428 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
429 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
430};
431
432static bool move_anon(void)
433{
434 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
435}
436
437static bool move_file(void)
438{
439 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
440}
441
/*
 * Maximum loops in reclaim, used for soft limit reclaim to prevent
 * infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
448
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};
464
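/*
 * MEMFILE_PRIVATE() packs a res_type into the upper 16 bits and a resource
 * attribute into the lower 16 bits of a control-file private value, e.g.
 * MEMFILE_PRIVATE(_MEM, RES_LIMIT); MEMFILE_TYPE()/MEMFILE_ATTR() undo it.
 */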
465#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
466#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
467#define MEMFILE_ATTR(val) ((val) & 0xffff)
468
469#define OOM_CONTROL (0)

/*
 * Reclaim flags for mem_cgroup_reclaim().
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child
 * cgroups appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);
485
486static inline
487struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
488{
489 return container_of(s, struct mem_cgroup, css);
490}
491
492
493struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
494{
495 if (!memcg)
496 memcg = root_mem_cgroup;
497 return &memcg->vmpressure;
498}
499
500struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
501{
502 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
503}
504
505struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
506{
507 return &mem_cgroup_from_css(css)->vmpressure;
508}
509
510static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
511{
512 return (memcg == root_mem_cgroup);
513}
514
515
516#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
517
518void sock_update_memcg(struct sock *sk)
519{
520 if (mem_cgroup_sockets_enabled) {
521 struct mem_cgroup *memcg;
522 struct cg_proto *cg_proto;
523
524 BUG_ON(!sk->sk_prot->proto_cgroup);

		/*
		 * Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
534 if (sk->sk_cgrp) {
535 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
536 css_get(&sk->sk_cgrp->memcg->css);
537 return;
538 }
539
540 rcu_read_lock();
541 memcg = mem_cgroup_from_task(current);
542 cg_proto = sk->sk_prot->proto_cgroup(memcg);
543 if (!mem_cgroup_is_root(memcg) &&
544 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
545 sk->sk_cgrp = cg_proto;
546 }
547 rcu_read_unlock();
548 }
549}
550EXPORT_SYMBOL(sock_update_memcg);
551
552void sock_release_memcg(struct sock *sk)
553{
554 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
555 struct mem_cgroup *memcg;
556 WARN_ON(!sk->sk_cgrp->memcg);
557 memcg = sk->sk_cgrp->memcg;
558 css_put(&sk->sk_cgrp->memcg->css);
559 }
560}
561
562struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
563{
564 if (!memcg || mem_cgroup_is_root(memcg))
565 return NULL;
566
567 return &memcg->tcp_mem.cg_proto;
568}
569EXPORT_SYMBOL(tcp_proto_cgroup);
570
571static void disarm_sock_keys(struct mem_cgroup *memcg)
572{
573 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
574 return;
575 static_key_slow_dec(&memcg_socket_limit_enabled);
576}
577#else
578static void disarm_sock_keys(struct mem_cgroup *memcg)
579{
580}
581#endif
582
583#ifdef CONFIG_MEMCG_KMEM
/*
 * This IDA assigns each kmem-limited memcg an index into the per-cache
 * ->memcg_params->memcg_caches arrays.  A separate, dense id space (rather
 * than the cgroup id) keeps those arrays small when only a few of many
 * memcgs are kmem-limited.  The current array size is kept in
 * memcg_limited_groups_array_size and is grown on demand.
 */
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;
603
604
605
606
607
608
609
610
611
612
613
614
615
616#define MEMCG_CACHES_MIN_SIZE 4
617#define MEMCG_CACHES_MAX_SIZE 65535
618
/*
 * Callers of the kmem charging functions test this static key to decide
 * whether kmem accounting is enabled at all; it is exported so that modules
 * calling kmem_cache_alloc() and friends can see it too.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
627
628static void disarm_kmem_keys(struct mem_cgroup *memcg)
629{
630 if (memcg_kmem_is_active(memcg)) {
631 static_key_slow_dec(&memcg_kmem_enabled_key);
632 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
633 }
	/*
	 * This check can't live in kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
639}
640#else
641static void disarm_kmem_keys(struct mem_cgroup *memcg)
642{
643}
644#endif
645
646static void disarm_static_keys(struct mem_cgroup *memcg)
647{
648 disarm_sock_keys(memcg);
649 disarm_kmem_keys(memcg);
650}
651
652static void drain_all_stock_async(struct mem_cgroup *memcg);
653
654static struct mem_cgroup_per_zone *
655mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
656{
657 VM_BUG_ON((unsigned)nid >= nr_node_ids);
658 return &memcg->nodeinfo[nid]->zoneinfo[zid];
659}
660
661struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
662{
663 return &memcg->css;
664}
665
666static struct mem_cgroup_per_zone *
667page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
668{
669 int nid = page_to_nid(page);
670 int zid = page_zonenum(page);
671
672 return mem_cgroup_zoneinfo(memcg, nid, zid);
673}
674
675static struct mem_cgroup_tree_per_zone *
676soft_limit_tree_node_zone(int nid, int zid)
677{
678 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
679}
680
681static struct mem_cgroup_tree_per_zone *
682soft_limit_tree_from_page(struct page *page)
683{
684 int nid = page_to_nid(page);
685 int zid = page_zonenum(page);
686
687 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
688}
689
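/*
 * Link @mz into the per-zone soft-limit RB-tree, ordered by how far the
 * memcg's usage exceeds its soft limit; entries with an equal excess go to
 * the right.  Nothing is inserted when the excess is zero or @mz is already
 * on the tree.  Must be called with mctz->lock held.
 */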
690static void
691__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
692 struct mem_cgroup_per_zone *mz,
693 struct mem_cgroup_tree_per_zone *mctz,
694 unsigned long long new_usage_in_excess)
695{
696 struct rb_node **p = &mctz->rb_root.rb_node;
697 struct rb_node *parent = NULL;
698 struct mem_cgroup_per_zone *mz_node;
699
700 if (mz->on_tree)
701 return;
702
703 mz->usage_in_excess = new_usage_in_excess;
704 if (!mz->usage_in_excess)
705 return;
706 while (*p) {
707 parent = *p;
708 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
709 tree_node);
710 if (mz->usage_in_excess < mz_node->usage_in_excess)
711 p = &(*p)->rb_left;
712
713
714
715
716 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
717 p = &(*p)->rb_right;
718 }
719 rb_link_node(&mz->tree_node, parent, p);
720 rb_insert_color(&mz->tree_node, &mctz->rb_root);
721 mz->on_tree = true;
722}
723
724static void
725__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
726 struct mem_cgroup_per_zone *mz,
727 struct mem_cgroup_tree_per_zone *mctz)
728{
729 if (!mz->on_tree)
730 return;
731 rb_erase(&mz->tree_node, &mctz->rb_root);
732 mz->on_tree = false;
733}
734
735static void
736mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
737 struct mem_cgroup_per_zone *mz,
738 struct mem_cgroup_tree_per_zone *mctz)
739{
740 spin_lock(&mctz->lock);
741 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
742 spin_unlock(&mctz->lock);
743}
744
745
746static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
747{
748 unsigned long long excess;
749 struct mem_cgroup_per_zone *mz;
750 struct mem_cgroup_tree_per_zone *mctz;
751 int nid = page_to_nid(page);
752 int zid = page_zonenum(page);
753 mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
759 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
760 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
761 excess = res_counter_soft_limit_excess(&memcg->res);
762
763
764
765
766 if (excess || mz->on_tree) {
767 spin_lock(&mctz->lock);
768
769 if (mz->on_tree)
770 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
771
772
773
774
775 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
776 spin_unlock(&mctz->lock);
777 }
778 }
779}
780
781static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
782{
783 int node, zone;
784 struct mem_cgroup_per_zone *mz;
785 struct mem_cgroup_tree_per_zone *mctz;
786
787 for_each_node(node) {
788 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
789 mz = mem_cgroup_zoneinfo(memcg, node, zone);
790 mctz = soft_limit_tree_node_zone(node, zone);
791 mem_cgroup_remove_exceeded(memcg, mz, mctz);
792 }
793 }
794}
795
796static struct mem_cgroup_per_zone *
797__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
798{
799 struct rb_node *rightmost = NULL;
800 struct mem_cgroup_per_zone *mz;
801
802retry:
803 mz = NULL;
804 rightmost = rb_last(&mctz->rb_root);
805 if (!rightmost)
806 goto done;
807
808 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
809
810
811
812
813
814 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
815 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
816 !css_tryget(&mz->memcg->css))
817 goto retry;
818done:
819 return mz;
820}
821
822static struct mem_cgroup_per_zone *
823mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
824{
825 struct mem_cgroup_per_zone *mz;
826
827 spin_lock(&mctz->lock);
828 mz = __mem_cgroup_largest_soft_limit_node(mctz);
829 spin_unlock(&mctz->lock);
830 return mz;
831}
/*
 * Implementation note: reading percpu statistics for memcg.
 *
 * Per-cpu counters trade precision for cheap updates.  The read side here
 * feeds user-visible accounting, which wants an exact value, so we sum over
 * all online cpus (plus the nocpu_base remainder left behind by offlined
 * cpus) instead of keeping a fuzzy cached total.
 */
852static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
853 enum mem_cgroup_stat_index idx)
854{
855 long val = 0;
856 int cpu;
857
858 get_online_cpus();
859 for_each_online_cpu(cpu)
860 val += per_cpu(memcg->stat->count[idx], cpu);
861#ifdef CONFIG_HOTPLUG_CPU
862 spin_lock(&memcg->pcp_counter_lock);
863 val += memcg->nocpu_base.count[idx];
864 spin_unlock(&memcg->pcp_counter_lock);
865#endif
866 put_online_cpus();
867 return val;
868}
869
870static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
871 bool charge)
872{
873 int val = (charge) ? 1 : -1;
874 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
875}
876
877static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
878 enum mem_cgroup_events_index idx)
879{
880 unsigned long val = 0;
881 int cpu;
882
883 for_each_online_cpu(cpu)
884 val += per_cpu(memcg->stat->events[idx], cpu);
885#ifdef CONFIG_HOTPLUG_CPU
886 spin_lock(&memcg->pcp_counter_lock);
887 val += memcg->nocpu_base.events[idx];
888 spin_unlock(&memcg->pcp_counter_lock);
889#endif
890 return val;
891}
892
893static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
894 struct page *page,
895 bool anon, int nr_pages)
896{
897 preempt_disable();
898
899
900
901
902
903 if (anon)
904 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
905 nr_pages);
906 else
907 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
908 nr_pages);
909
910 if (PageTransHuge(page))
911 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
912 nr_pages);
913
914
915 if (nr_pages > 0)
916 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
917 else {
918 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
919 nr_pages = -nr_pages;
920 }
921
922 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
923
924 preempt_enable();
925}
926
927unsigned long
928mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
929{
930 struct mem_cgroup_per_zone *mz;
931
932 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
933 return mz->lru_size[lru];
934}
935
936static unsigned long
937mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
938 unsigned int lru_mask)
939{
940 struct mem_cgroup_per_zone *mz;
941 enum lru_list lru;
942 unsigned long ret = 0;
943
944 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
945
946 for_each_lru(lru) {
947 if (BIT(lru) & lru_mask)
948 ret += mz->lru_size[lru];
949 }
950 return ret;
951}
952
953static unsigned long
954mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
955 int nid, unsigned int lru_mask)
956{
957 u64 total = 0;
958 int zid;
959
960 for (zid = 0; zid < MAX_NR_ZONES; zid++)
961 total += mem_cgroup_zone_nr_lru_pages(memcg,
962 nid, zid, lru_mask);
963
964 return total;
965}
966
967static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
968 unsigned int lru_mask)
969{
970 int nid;
971 u64 total = 0;
972
973 for_each_node_state(nid, N_MEMORY)
974 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
975 return total;
976}
977
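/*
 * Returns true if enough page events have elapsed since the last time the
 * given @target fired on this cpu, and advances the target so the next check
 * fires after another THRESHOLDS/SOFTLIMIT/NUMAINFO_EVENTS_TARGET events.
 * Must be called with preemption disabled (uses __this_cpu accessors).
 */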
978static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
979 enum mem_cgroup_events_target target)
980{
981 unsigned long val, next;
982
983 val = __this_cpu_read(memcg->stat->nr_page_events);
984 next = __this_cpu_read(memcg->stat->targets[target]);
985
986 if ((long)next - (long)val < 0) {
987 switch (target) {
988 case MEM_CGROUP_TARGET_THRESH:
989 next = val + THRESHOLDS_EVENTS_TARGET;
990 break;
991 case MEM_CGROUP_TARGET_SOFTLIMIT:
992 next = val + SOFTLIMIT_EVENTS_TARGET;
993 break;
994 case MEM_CGROUP_TARGET_NUMAINFO:
995 next = val + NUMAINFO_EVENTS_TARGET;
996 break;
997 default:
998 break;
999 }
1000 __this_cpu_write(memcg->stat->targets[target], next);
1001 return true;
1002 }
1003 return false;
1004}
1005
1006
1007
1008
1009
1010static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1011{
1012 preempt_disable();
1013
1014 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1015 MEM_CGROUP_TARGET_THRESH))) {
1016 bool do_softlimit;
1017 bool do_numainfo __maybe_unused;
1018
1019 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1020 MEM_CGROUP_TARGET_SOFTLIMIT);
1021#if MAX_NUMNODES > 1
1022 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1023 MEM_CGROUP_TARGET_NUMAINFO);
1024#endif
1025 preempt_enable();
1026
1027 mem_cgroup_threshold(memcg);
1028 if (unlikely(do_softlimit))
1029 mem_cgroup_update_tree(memcg, page);
1030#if MAX_NUMNODES > 1
1031 if (unlikely(do_numainfo))
1032 atomic_inc(&memcg->numainfo_events);
1033#endif
1034 } else
1035 preempt_enable();
1036}
1037
1038struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
1039{
1040 return mem_cgroup_from_css(
1041 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
1042}
1043
1044struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1045{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
1051 if (unlikely(!p))
1052 return NULL;
1053
1054 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
1055}
1056
1057struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1058{
1059 struct mem_cgroup *memcg = NULL;
1060
1061 if (!mm)
1062 return NULL;
	/*
	 * Because we have no locks, mm->owner's may be being moved to other
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
1068 rcu_read_lock();
1069 do {
1070 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1071 if (unlikely(!memcg))
1072 break;
1073 } while (!css_tryget(&memcg->css));
1074 rcu_read_unlock();
1075 return memcg;
1076}
1077
1078
1079
1080
1081
1082
1083
1084static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1085 struct mem_cgroup *last_visited)
1086{
1087 struct cgroup *prev_cgroup, *next_cgroup;
1088
1089
1090
1091
1092
1093 if (!last_visited)
1094 return root;
1095
1096 prev_cgroup = (last_visited == root) ? NULL
1097 : last_visited->css.cgroup;
1098skip_node:
1099 next_cgroup = cgroup_next_descendant_pre(
1100 prev_cgroup, root->css.cgroup);
1101
1102
1103
1104
1105
1106
1107
1108
1109 if (next_cgroup) {
1110 struct mem_cgroup *mem = mem_cgroup_from_cont(
1111 next_cgroup);
1112 if (css_tryget(&mem->css))
1113 return mem;
1114 else {
1115 prev_cgroup = next_cgroup;
1116 goto skip_node;
1117 }
1118 }
1119
1120 return NULL;
1121}
1122
1123static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1124{
1125
1126
1127
1128
1129
1130 atomic_inc(&root->dead_count);
1131}
1132
1133static struct mem_cgroup *
1134mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1135 struct mem_cgroup *root,
1136 int *sequence)
1137{
1138 struct mem_cgroup *position = NULL;
1139
1140
1141
1142
1143
1144
1145
1146
1147 *sequence = atomic_read(&root->dead_count);
1148 if (iter->last_dead_count == *sequence) {
1149 smp_rmb();
1150 position = iter->last_visited;
1151 if (position && !css_tryget(&position->css))
1152 position = NULL;
1153 }
1154 return position;
1155}
1156
1157static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1158 struct mem_cgroup *last_visited,
1159 struct mem_cgroup *new_position,
1160 int sequence)
1161{
1162 if (last_visited)
1163 css_put(&last_visited->css);
1164
1165
1166
1167
1168
1169
1170 iter->last_visited = new_position;
1171 smp_wmb();
1172 iter->last_dead_count = sequence;
1173}
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
1192struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1193 struct mem_cgroup *prev,
1194 struct mem_cgroup_reclaim_cookie *reclaim)
1195{
1196 struct mem_cgroup *memcg = NULL;
1197 struct mem_cgroup *last_visited = NULL;
1198
1199 if (mem_cgroup_disabled())
1200 return NULL;
1201
1202 if (!root)
1203 root = root_mem_cgroup;
1204
1205 if (prev && !reclaim)
1206 last_visited = prev;
1207
1208 if (!root->use_hierarchy && root != root_mem_cgroup) {
1209 if (prev)
1210 goto out_css_put;
1211 return root;
1212 }
1213
1214 rcu_read_lock();
1215 while (!memcg) {
1216 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1217 int uninitialized_var(seq);
1218
1219 if (reclaim) {
1220 int nid = zone_to_nid(reclaim->zone);
1221 int zid = zone_idx(reclaim->zone);
1222 struct mem_cgroup_per_zone *mz;
1223
1224 mz = mem_cgroup_zoneinfo(root, nid, zid);
1225 iter = &mz->reclaim_iter[reclaim->priority];
1226 if (prev && reclaim->generation != iter->generation) {
1227 iter->last_visited = NULL;
1228 goto out_unlock;
1229 }
1230
1231 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1232 }
1233
1234 memcg = __mem_cgroup_iter_next(root, last_visited);
1235
1236 if (reclaim) {
1237 mem_cgroup_iter_update(iter, last_visited, memcg, seq);
1238
1239 if (!memcg)
1240 iter->generation++;
1241 else if (!prev && memcg)
1242 reclaim->generation = iter->generation;
1243 }
1244
1245 if (prev && !memcg)
1246 goto out_unlock;
1247 }
1248out_unlock:
1249 rcu_read_unlock();
1250out_css_put:
1251 if (prev && prev != root)
1252 css_put(&prev->css);
1253
1254 return memcg;
1255}
/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
1262void mem_cgroup_iter_break(struct mem_cgroup *root,
1263 struct mem_cgroup *prev)
1264{
1265 if (!root)
1266 root = root_mem_cgroup;
1267 if (prev && prev != root)
1268 css_put(&prev->css);
1269}
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for cleanup.
 */
1276#define for_each_mem_cgroup_tree(iter, root) \
1277 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1278 iter != NULL; \
1279 iter = mem_cgroup_iter(root, iter, NULL))
1280
1281#define for_each_mem_cgroup(iter) \
1282 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1283 iter != NULL; \
1284 iter = mem_cgroup_iter(NULL, iter, NULL))
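
/*
 * Typical (illustrative) use of the iterators above: walk every memcg in a
 * subtree and bail out early with mem_cgroup_iter_break() so the reference
 * held on the current position is dropped, e.g.
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (iter->oom_lock) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 */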
1285
1286void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1287{
1288 struct mem_cgroup *memcg;
1289
1290 rcu_read_lock();
1291 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1292 if (unlikely(!memcg))
1293 goto out;
1294
1295 switch (idx) {
1296 case PGFAULT:
1297 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1298 break;
1299 case PGMAJFAULT:
1300 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1301 break;
1302 default:
1303 BUG();
1304 }
1305out:
1306 rcu_read_unlock();
1307}
1308EXPORT_SYMBOL(__mem_cgroup_count_vm_event);

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the zone lruvec, if the memory controller is
 * disabled.
 */
1319struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1320 struct mem_cgroup *memcg)
1321{
1322 struct mem_cgroup_per_zone *mz;
1323 struct lruvec *lruvec;
1324
1325 if (mem_cgroup_disabled()) {
1326 lruvec = &zone->lruvec;
1327 goto out;
1328 }
1329
1330 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1331 lruvec = &mz->lruvec;
1332out:
1333
1334
1335
1336
1337
1338 if (unlikely(lruvec->zone != zone))
1339 lruvec->zone = zone;
1340 return lruvec;
1341}

/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 *
 * Returns the lruvec of the memcg the page is charged to; an uncharged,
 * off-LRU page is quietly reassigned to the root memcg (see below).
 */
1362struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1363{
1364 struct mem_cgroup_per_zone *mz;
1365 struct mem_cgroup *memcg;
1366 struct page_cgroup *pc;
1367 struct lruvec *lruvec;
1368
1369 if (mem_cgroup_disabled()) {
1370 lruvec = &zone->lruvec;
1371 goto out;
1372 }
1373
1374 pc = lookup_page_cgroup(page);
1375 memcg = pc->mem_cgroup;
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1387 pc->mem_cgroup = memcg = root_mem_cgroup;
1388
1389 mz = page_cgroup_zoneinfo(memcg, page);
1390 lruvec = &mz->lruvec;
1391out:
1392
1393
1394
1395
1396
1397 if (unlikely(lruvec->zone != zone))
1398 lruvec->zone = zone;
1399 return lruvec;
1400}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 */
1411void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1412 int nr_pages)
1413{
1414 struct mem_cgroup_per_zone *mz;
1415 unsigned long *lru_size;
1416
1417 if (mem_cgroup_disabled())
1418 return;
1419
1420 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1421 lru_size = mz->lru_size + lru;
1422 *lru_size += nr_pages;
1423 VM_BUG_ON((long)(*lru_size) < 0);
1424}
1425
1426
1427
1428
1429
1430bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1431 struct mem_cgroup *memcg)
1432{
1433 if (root_memcg == memcg)
1434 return true;
1435 if (!root_memcg->use_hierarchy || !memcg)
1436 return false;
1437 return css_is_ancestor(&memcg->css, &root_memcg->css);
1438}
1439
1440static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1441 struct mem_cgroup *memcg)
1442{
1443 bool ret;
1444
1445 rcu_read_lock();
1446 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1447 rcu_read_unlock();
1448 return ret;
1449}
1450
1451bool task_in_mem_cgroup(struct task_struct *task,
1452 const struct mem_cgroup *memcg)
1453{
1454 struct mem_cgroup *curr = NULL;
1455 struct task_struct *p;
1456 bool ret;
1457
1458 p = find_lock_task_mm(task);
1459 if (p) {
1460 curr = try_get_mem_cgroup_from_mm(p->mm);
1461 task_unlock(p);
1462 } else {
1463
1464
1465
1466
1467
1468 rcu_read_lock();
1469 curr = mem_cgroup_from_task(task);
1470 if (curr)
1471 css_get(&curr->css);
1472 rcu_read_unlock();
1473 }
1474 if (!curr)
1475 return false;
1476
1477
1478
1479
1480
1481
1482 ret = mem_cgroup_same_or_subtree(memcg, curr);
1483 css_put(&curr->css);
1484 return ret;
1485}
1486
1487int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1488{
1489 unsigned long inactive_ratio;
1490 unsigned long inactive;
1491 unsigned long active;
1492 unsigned long gb;
1493
1494 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1495 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1496
1497 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1498 if (gb)
1499 inactive_ratio = int_sqrt(10 * gb);
1500 else
1501 inactive_ratio = 1;
1502
1503 return inactive * inactive_ratio < active;
1504}
1505
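/*
 * Map a res_counter embedded in a mem_cgroup back to the mem_cgroup itself,
 * e.g. mem_cgroup_from_res_counter(fail_res, memsw) when a memsw charge
 * failed.
 */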
1506#define mem_cgroup_from_res_counter(counter, member) \
1507 container_of(counter, struct mem_cgroup, member)

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
1516static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1517{
1518 unsigned long long margin;
1519
1520 margin = res_counter_margin(&memcg->res);
1521 if (do_swap_account)
1522 margin = min(margin, res_counter_margin(&memcg->memsw));
1523 return margin >> PAGE_SHIFT;
1524}
1525
1526int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1527{
1528 struct cgroup *cgrp = memcg->css.cgroup;
1529
1530
1531 if (cgrp->parent == NULL)
1532 return vm_swappiness;
1533
1534 return memcg->swappiness;
1535}
/*
 * memcg->moving_account signals that some thread may be moving pages out of
 * this memcg.  A mover increments the counter and then waits for a grace
 * period (synchronize_rcu()) before actually moving anything, so any reader
 * that checks the counter under rcu_read_lock() either sees it raised and
 * takes the heavy move_lock, or finishes its update before the move starts.
 */

/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;
1554
1555static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1556{
1557 atomic_inc(&memcg_moving);
1558 atomic_inc(&memcg->moving_account);
1559 synchronize_rcu();
1560}
1561
1562static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1563{
1564
1565
1566
1567
1568 if (memcg) {
1569 atomic_dec(&memcg_moving);
1570 atomic_dec(&memcg->moving_account);
1571 }
1572}
1573
/*
 * Two helpers for checking whether "memcg" is under move_account():
 *
 * mem_cgroup_stolen()     - true if pages of this memcg may currently be
 *			     moved, i.e. pc->mem_cgroup may be overwritten.
 * mem_cgroup_under_move() - true if memcg is mc.from, mc.to, or in their
 *			     hierarchy; used for waiting out a move under
 *			     memory pressure.
 */
1586static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1587{
1588 VM_BUG_ON(!rcu_read_lock_held());
1589 return atomic_read(&memcg->moving_account) > 0;
1590}
1591
1592static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1593{
1594 struct mem_cgroup *from;
1595 struct mem_cgroup *to;
1596 bool ret = false;
1597
1598
1599
1600
1601 spin_lock(&mc.lock);
1602 from = mc.from;
1603 to = mc.to;
1604 if (!from)
1605 goto unlock;
1606
1607 ret = mem_cgroup_same_or_subtree(memcg, from)
1608 || mem_cgroup_same_or_subtree(memcg, to);
1609unlock:
1610 spin_unlock(&mc.lock);
1611 return ret;
1612}
1613
1614static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1615{
1616 if (mc.moving_task && current != mc.moving_task) {
1617 if (mem_cgroup_under_move(memcg)) {
1618 DEFINE_WAIT(wait);
1619 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1620
1621 if (mc.moving_task)
1622 schedule();
1623 finish_wait(&mc.waitq, &wait);
1624 return true;
1625 }
1626 }
1627 return false;
1628}
1629
1630
1631
1632
1633
1634
1635
1636static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1637 unsigned long *flags)
1638{
1639 spin_lock_irqsave(&memcg->move_lock, *flags);
1640}
1641
1642static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1643 unsigned long *flags)
1644{
1645 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1646}
1647
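/* K(x): convert a page count to kilobytes for the OOM report below. */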
1648#define K(x) ((x) << (PAGE_SHIFT-10))

/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
1657void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1658{
1659 struct cgroup *task_cgrp;
1660 struct cgroup *mem_cgrp;
1661
	/*
	 * Need a buffer in BSS, can't use allocations. The code relies
	 * on the assumption that OOM is serialized for memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
1667 int ret;
1668 struct mem_cgroup *iter;
1669 unsigned int i;
1670
1671 if (!p)
1672 return;
1673
1674 rcu_read_lock();
1675
1676 mem_cgrp = memcg->css.cgroup;
1677 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1678
1679 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1680 if (ret < 0) {
1681
1682
1683
1684
1685 rcu_read_unlock();
1686 goto done;
1687 }
1688 rcu_read_unlock();
1689
1690 pr_info("Task in %s killed", memcg_name);
1691
1692 rcu_read_lock();
1693 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1694 if (ret < 0) {
1695 rcu_read_unlock();
1696 goto done;
1697 }
1698 rcu_read_unlock();
1699
1700
1701
1702
1703 pr_cont(" as a result of limit of %s\n", memcg_name);
1704done:
1705
1706 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1707 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1708 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1709 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1710 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1711 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1712 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1713 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1714 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1715 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1716 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1717 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1718
1719 for_each_mem_cgroup_tree(iter, memcg) {
1720 pr_info("Memory cgroup stats");
1721
1722 rcu_read_lock();
1723 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1724 if (!ret)
1725 pr_cont(" for %s", memcg_name);
1726 rcu_read_unlock();
1727 pr_cont(":");
1728
1729 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1730 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1731 continue;
1732 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1733 K(mem_cgroup_read_stat(iter, i)));
1734 }
1735
1736 for (i = 0; i < NR_LRU_LISTS; i++)
1737 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1738 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1739
1740 pr_cont("\n");
1741 }
1742}
1743
/*
 * This function returns the number of memcg under hierarchy tree. Returns
 * 1(self count) if no children.
 */
1748static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1749{
1750 int num = 0;
1751 struct mem_cgroup *iter;
1752
1753 for_each_mem_cgroup_tree(iter, memcg)
1754 num++;
1755 return num;
1756}
1757
/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
1761static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1762{
1763 u64 limit;
1764
1765 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1766
1767
1768
1769
1770 if (mem_cgroup_swappiness(memcg)) {
1771 u64 memsw;
1772
1773 limit += total_swap_pages << PAGE_SHIFT;
1774 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1775
1776
1777
1778
1779
1780 limit = min(limit, memsw);
1781 }
1782
1783 return limit;
1784}
1785
1786static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1787 int order)
1788{
1789 struct mem_cgroup *iter;
1790 unsigned long chosen_points = 0;
1791 unsigned long totalpages;
1792 unsigned int points = 0;
1793 struct task_struct *chosen = NULL;

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
1800 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1801 set_thread_flag(TIF_MEMDIE);
1802 return;
1803 }
1804
1805 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1806 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1807 for_each_mem_cgroup_tree(iter, memcg) {
1808 struct cgroup *cgroup = iter->css.cgroup;
1809 struct cgroup_iter it;
1810 struct task_struct *task;
1811
1812 cgroup_iter_start(cgroup, &it);
1813 while ((task = cgroup_iter_next(cgroup, &it))) {
1814 switch (oom_scan_process_thread(task, totalpages, NULL,
1815 false)) {
1816 case OOM_SCAN_SELECT:
1817 if (chosen)
1818 put_task_struct(chosen);
1819 chosen = task;
1820 chosen_points = ULONG_MAX;
1821 get_task_struct(chosen);
			/* fall through */
1823 case OOM_SCAN_CONTINUE:
1824 continue;
1825 case OOM_SCAN_ABORT:
1826 cgroup_iter_end(cgroup, &it);
1827 mem_cgroup_iter_break(memcg, iter);
1828 if (chosen)
1829 put_task_struct(chosen);
1830 return;
1831 case OOM_SCAN_OK:
1832 break;
1833 };
1834 points = oom_badness(task, memcg, NULL, totalpages);
1835 if (points > chosen_points) {
1836 if (chosen)
1837 put_task_struct(chosen);
1838 chosen = task;
1839 chosen_points = points;
1840 get_task_struct(chosen);
1841 }
1842 }
1843 cgroup_iter_end(cgroup, &it);
1844 }
1845
1846 if (!chosen)
1847 return;
1848 points = chosen_points * 1000 / totalpages;
1849 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1850 NULL, "Memory cgroup out of memory");
1851}
1852
1853static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1854 gfp_t gfp_mask,
1855 unsigned long flags)
1856{
1857 unsigned long total = 0;
1858 bool noswap = false;
1859 int loop;
1860
1861 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1862 noswap = true;
1863 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1864 noswap = true;
1865
1866 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1867 if (loop)
1868 drain_all_stock_async(memcg);
1869 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1870
1871
1872
1873
1874
1875 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1876 break;
1877 if (mem_cgroup_margin(memcg))
1878 break;
1879
1880
1881
1882
1883 if (loop && !total)
1884 break;
1885 }
1886 return total;
1887}
1888
/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the caller wants file-only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
1899static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1900 int nid, bool noswap)
1901{
1902 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1903 return true;
1904 if (noswap || !total_swap_pages)
1905 return false;
1906 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1907 return true;
1908 return false;
1909
1910}
1911#if MAX_NUMNODES > 1
1912
1913
1914
1915
1916
1917
1918
1919static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1920{
1921 int nid;
1922
1923
1924
1925
1926 if (!atomic_read(&memcg->numainfo_events))
1927 return;
1928 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1929 return;
1930
1931
1932 memcg->scan_nodes = node_states[N_MEMORY];
1933
1934 for_each_node_mask(nid, node_states[N_MEMORY]) {
1935
1936 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1937 node_clear(nid, memcg->scan_nodes);
1938 }
1939
1940 atomic_set(&memcg->numainfo_events, 0);
1941 atomic_set(&memcg->numainfo_updating, 0);
1942}
1943
/*
 * Select a node to start reclaim from.  Reclaiming from the current node is
 * cheap (no remote latency) but can hurt its LRU and causes contention when
 * several threads hit the limit, so the victim node is simply chosen
 * round-robin among the nodes that still have reclaimable pages.
 */
1956int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1957{
1958 int node;
1959
1960 mem_cgroup_may_update_nodemask(memcg);
1961 node = memcg->last_scanned_node;
1962
1963 node = next_node(node, memcg->scan_nodes);
1964 if (node == MAX_NUMNODES)
1965 node = first_node(memcg->scan_nodes);
1966
1967
1968
1969
1970
1971
1972 if (unlikely(node == MAX_NUMNODES))
1973 node = numa_node_id();
1974
1975 memcg->last_scanned_node = node;
1976 return node;
1977}
1978
1979
1980
1981
1982
1983
1984
1985static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1986{
1987 int nid;
1988
1989
1990
1991
1992
1993 if (!nodes_empty(memcg->scan_nodes)) {
1994 for (nid = first_node(memcg->scan_nodes);
1995 nid < MAX_NUMNODES;
1996 nid = next_node(nid, memcg->scan_nodes)) {
1997
1998 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1999 return true;
2000 }
2001 }
2002
2003
2004
2005 for_each_node_state(nid, N_MEMORY) {
2006 if (node_isset(nid, memcg->scan_nodes))
2007 continue;
2008 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2009 return true;
2010 }
2011 return false;
2012}
2013
2014#else
2015int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2016{
2017 return 0;
2018}
2019
2020static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2021{
2022 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2023}
2024#endif
2025
2026static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2027 struct zone *zone,
2028 gfp_t gfp_mask,
2029 unsigned long *total_scanned)
2030{
2031 struct mem_cgroup *victim = NULL;
2032 int total = 0;
2033 int loop = 0;
2034 unsigned long excess;
2035 unsigned long nr_scanned;
2036 struct mem_cgroup_reclaim_cookie reclaim = {
2037 .zone = zone,
2038 .priority = 0,
2039 };
2040
2041 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2042
2043 while (1) {
2044 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2045 if (!victim) {
2046 loop++;
2047 if (loop >= 2) {
2048
2049
2050
2051
2052
2053 if (!total)
2054 break;
2055
2056
2057
2058
2059
2060
2061 if (total >= (excess >> 2) ||
2062 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2063 break;
2064 }
2065 continue;
2066 }
2067 if (!mem_cgroup_reclaimable(victim, false))
2068 continue;
2069 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2070 zone, &nr_scanned);
2071 *total_scanned += nr_scanned;
2072 if (!res_counter_soft_limit_excess(&root_memcg->res))
2073 break;
2074 }
2075 mem_cgroup_iter_break(root_memcg, victim);
2076 return total;
2077}
2078
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 * Has to be called with memcg_oom_lock held.
 */
2084static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
2085{
2086 struct mem_cgroup *iter, *failed = NULL;
2087
2088 for_each_mem_cgroup_tree(iter, memcg) {
2089 if (iter->oom_lock) {
2090
2091
2092
2093
2094 failed = iter;
2095 mem_cgroup_iter_break(memcg, iter);
2096 break;
2097 } else
2098 iter->oom_lock = true;
2099 }
2100
2101 if (!failed)
2102 return true;
2103
2104
2105
2106
2107
2108 for_each_mem_cgroup_tree(iter, memcg) {
2109 if (iter == failed) {
2110 mem_cgroup_iter_break(memcg, iter);
2111 break;
2112 }
2113 iter->oom_lock = false;
2114 }
2115 return false;
2116}
2117
2118
2119
2120
2121static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2122{
2123 struct mem_cgroup *iter;
2124
2125 for_each_mem_cgroup_tree(iter, memcg)
2126 iter->oom_lock = false;
2127 return 0;
2128}
2129
2130static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2131{
2132 struct mem_cgroup *iter;
2133
2134 for_each_mem_cgroup_tree(iter, memcg)
2135 atomic_inc(&iter->under_oom);
2136}
2137
2138static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2139{
2140 struct mem_cgroup *iter;
2141
	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
2147 for_each_mem_cgroup_tree(iter, memcg)
2148 atomic_add_unless(&iter->under_oom, -1, 0);
2149}
2150
2151static DEFINE_SPINLOCK(memcg_oom_lock);
2152static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2153
2154struct oom_wait_info {
2155 struct mem_cgroup *memcg;
2156 wait_queue_t wait;
2157};
2158
2159static int memcg_oom_wake_function(wait_queue_t *wait,
2160 unsigned mode, int sync, void *arg)
2161{
2162 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2163 struct mem_cgroup *oom_wait_memcg;
2164 struct oom_wait_info *oom_wait_info;
2165
2166 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2167 oom_wait_memcg = oom_wait_info->memcg;
2168
2169
2170
2171
2172
2173 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2174 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2175 return 0;
2176 return autoremove_wake_function(wait, mode, sync, arg);
2177}
2178
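/*
 * Wake up everybody waiting on the global memcg_oom_waitq; the wake function
 * above discards waiters whose memcg is unrelated (neither ancestor nor
 * descendant) to @memcg, so only the affected hierarchy is unthrottled.
 */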
2179static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2180{
2181
2182 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2183}
2184
2185static void memcg_oom_recover(struct mem_cgroup *memcg)
2186{
2187 if (memcg && atomic_read(&memcg->under_oom))
2188 memcg_wakeup_oom(memcg);
2189}
2190
/*
 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
 */
2194static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
2195 int order)
2196{
2197 struct oom_wait_info owait;
2198 bool locked, need_to_kill;
2199
2200 owait.memcg = memcg;
2201 owait.wait.flags = 0;
2202 owait.wait.func = memcg_oom_wake_function;
2203 owait.wait.private = current;
2204 INIT_LIST_HEAD(&owait.wait.task_list);
2205 need_to_kill = true;
2206 mem_cgroup_mark_under_oom(memcg);
2207
2208
2209 spin_lock(&memcg_oom_lock);
2210 locked = mem_cgroup_oom_lock(memcg);
2211
2212
2213
2214
2215
2216 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2217 if (!locked || memcg->oom_kill_disable)
2218 need_to_kill = false;
2219 if (locked)
2220 mem_cgroup_oom_notify(memcg);
2221 spin_unlock(&memcg_oom_lock);
2222
2223 if (need_to_kill) {
2224 finish_wait(&memcg_oom_waitq, &owait.wait);
2225 mem_cgroup_out_of_memory(memcg, mask, order);
2226 } else {
2227 schedule();
2228 finish_wait(&memcg_oom_waitq, &owait.wait);
2229 }
2230 spin_lock(&memcg_oom_lock);
2231 if (locked)
2232 mem_cgroup_oom_unlock(memcg);
2233 memcg_wakeup_oom(memcg);
2234 spin_unlock(&memcg_oom_lock);
2235
2236 mem_cgroup_unmark_under_oom(memcg);
2237
2238 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2239 return false;
2240
2241 schedule_timeout_uninterruptible(1);
2242 return true;
2243}
/*
 * Protocol for updating per-memcg page statistics (currently only
 * MEM_CGROUP_STAT_FILE_MAPPED):
 *
 *   __mem_cgroup_begin_update_page_stat() - takes memcg->move_lock only if
 *                                           charges may be moving out of the
 *                                           memcg (mem_cgroup_stolen())
 *   mem_cgroup_update_page_stat()         - adjust the counter
 *   __mem_cgroup_end_update_page_stat()   - drop the lock if it was taken
 *
 * On the fast path no lock is taken at all: the caller holds rcu_read_lock(),
 * and pc->mem_cgroup can only be rewritten by a mover, which waits for a
 * grace period after raising moving_account before touching any page.
 */
2269void __mem_cgroup_begin_update_page_stat(struct page *page,
2270 bool *locked, unsigned long *flags)
2271{
2272 struct mem_cgroup *memcg;
2273 struct page_cgroup *pc;
2274
2275 pc = lookup_page_cgroup(page);
2276again:
2277 memcg = pc->mem_cgroup;
2278 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2279 return;
2280
2281
2282
2283
2284
2285
2286 if (!mem_cgroup_stolen(memcg))
2287 return;
2288
2289 move_lock_mem_cgroup(memcg, flags);
2290 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2291 move_unlock_mem_cgroup(memcg, flags);
2292 goto again;
2293 }
2294 *locked = true;
2295}
2296
2297void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2298{
2299 struct page_cgroup *pc = lookup_page_cgroup(page);
2300
2301
2302
2303
2304
2305
2306 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2307}
2308
2309void mem_cgroup_update_page_stat(struct page *page,
2310 enum mem_cgroup_page_stat_item idx, int val)
2311{
2312 struct mem_cgroup *memcg;
2313 struct page_cgroup *pc = lookup_page_cgroup(page);
2314 unsigned long uninitialized_var(flags);
2315
2316 if (mem_cgroup_disabled())
2317 return;
2318
2319 memcg = pc->mem_cgroup;
2320 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2321 return;
2322
2323 switch (idx) {
2324 case MEMCG_NR_FILE_MAPPED:
2325 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2326 break;
2327 default:
2328 BUG();
2329 }
2330
2331 this_cpu_add(memcg->stat->count[idx], val);
2332}
2333
/*
 * size of first charge trial. "32" comes from vmscan.c's magic value
 * (SWAP_CLUSTER_MAX).
 */
#define CHARGE_BATCH	32U
2339struct memcg_stock_pcp {
2340 struct mem_cgroup *cached;
2341 unsigned int nr_pages;
2342 struct work_struct work;
2343 unsigned long flags;
2344#define FLUSHING_CACHED_CHARGE 0
2345};
2346static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2347static DEFINE_MUTEX(percpu_charge_mutex);
2348
/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.
 *
 * returns true if successful, false otherwise.
 */
2360static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2361{
2362 struct memcg_stock_pcp *stock;
2363 bool ret = true;
2364
2365 if (nr_pages > CHARGE_BATCH)
2366 return false;
2367
2368 stock = &get_cpu_var(memcg_stock);
2369 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2370 stock->nr_pages -= nr_pages;
2371 else
2372 ret = false;
2373 put_cpu_var(memcg_stock);
2374 return ret;
2375}
2376
2377
2378
2379
2380static void drain_stock(struct memcg_stock_pcp *stock)
2381{
2382 struct mem_cgroup *old = stock->cached;
2383
2384 if (stock->nr_pages) {
2385 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2386
2387 res_counter_uncharge(&old->res, bytes);
2388 if (do_swap_account)
2389 res_counter_uncharge(&old->memsw, bytes);
2390 stock->nr_pages = 0;
2391 }
2392 stock->cached = NULL;
2393}
2394
2395
2396
2397
2398
2399static void drain_local_stock(struct work_struct *dummy)
2400{
2401 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2402 drain_stock(stock);
2403 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2404}
2405
2406static void __init memcg_stock_init(void)
2407{
2408 int cpu;
2409
2410 for_each_possible_cpu(cpu) {
2411 struct memcg_stock_pcp *stock =
2412 &per_cpu(memcg_stock, cpu);
2413 INIT_WORK(&stock->work, drain_local_stock);
2414 }
2415}
2416
2417
2418
2419
2420
2421static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2422{
2423 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2424
2425 if (stock->cached != memcg) {
2426 drain_stock(stock);
2427 stock->cached = memcg;
2428 }
2429 stock->nr_pages += nr_pages;
2430 put_cpu_var(memcg_stock);
2431}
2432
/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it. sync flag says whether we should block
 * until the work is done.
 */
2438static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2439{
2440 int cpu, curcpu;
2441
2442
2443 get_online_cpus();
2444 curcpu = get_cpu();
2445 for_each_online_cpu(cpu) {
2446 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2447 struct mem_cgroup *memcg;
2448
2449 memcg = stock->cached;
2450 if (!memcg || !stock->nr_pages)
2451 continue;
2452 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2453 continue;
2454 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2455 if (cpu == curcpu)
2456 drain_local_stock(&stock->work);
2457 else
2458 schedule_work_on(cpu, &stock->work);
2459 }
2460 }
2461 put_cpu();
2462
2463 if (!sync)
2464 goto out;
2465
2466 for_each_online_cpu(cpu) {
2467 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2468 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2469 flush_work(&stock->work);
2470 }
2471out:
2472 put_online_cpus();
2473}
2474
2475
2476
2477
2478
2479
2480
2481static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2482{
2483
2484
2485
2486 if (!mutex_trylock(&percpu_charge_mutex))
2487 return;
2488 drain_all_stock(root_memcg, false);
2489 mutex_unlock(&percpu_charge_mutex);
2490}
2491
2492
2493static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2494{
2495
2496 mutex_lock(&percpu_charge_mutex);
2497 drain_all_stock(root_memcg, true);
2498 mutex_unlock(&percpu_charge_mutex);
2499}
2500
2501
2502
2503
2504
2505static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2506{
2507 int i;
2508
2509 spin_lock(&memcg->pcp_counter_lock);
2510 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2511 long x = per_cpu(memcg->stat->count[i], cpu);
2512
2513 per_cpu(memcg->stat->count[i], cpu) = 0;
2514 memcg->nocpu_base.count[i] += x;
2515 }
2516 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2517 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2518
2519 per_cpu(memcg->stat->events[i], cpu) = 0;
2520 memcg->nocpu_base.events[i] += x;
2521 }
2522 spin_unlock(&memcg->pcp_counter_lock);
2523}
2524
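/*
 * CPU hotplug callback: when a cpu dies, fold its per-cpu memcg counters
 * into nocpu_base (for every memcg) and return its cached charge stock to
 * the res_counters.
 */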
2525static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2526 unsigned long action,
2527 void *hcpu)
2528{
2529 int cpu = (unsigned long)hcpu;
2530 struct memcg_stock_pcp *stock;
2531 struct mem_cgroup *iter;
2532
2533 if (action == CPU_ONLINE)
2534 return NOTIFY_OK;
2535
2536 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2537 return NOTIFY_OK;
2538
2539 for_each_mem_cgroup(iter)
2540 mem_cgroup_drain_pcp_counter(iter, cpu);
2541
2542 stock = &per_cpu(memcg_stock, cpu);
2543 drain_stock(stock);
2544 return NOTIFY_OK;
2545}
2546
2547
2548
2549enum {
2550 CHARGE_OK,
2551 CHARGE_RETRY,
2552 CHARGE_NOMEM,
2553 CHARGE_WOULDBLOCK,
2554 CHARGE_OOM_DIE,
2555};
2556
2557static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2558 unsigned int nr_pages, unsigned int min_pages,
2559 bool oom_check)
2560{
2561 unsigned long csize = nr_pages * PAGE_SIZE;
2562 struct mem_cgroup *mem_over_limit;
2563 struct res_counter *fail_res;
2564 unsigned long flags = 0;
2565 int ret;
2566
2567 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2568
2569 if (likely(!ret)) {
2570 if (!do_swap_account)
2571 return CHARGE_OK;
2572 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2573 if (likely(!ret))
2574 return CHARGE_OK;
2575
2576 res_counter_uncharge(&memcg->res, csize);
2577 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2578 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2579 } else
2580 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2581
2582
2583
2584
2585 if (nr_pages > min_pages)
2586 return CHARGE_RETRY;
2587
2588 if (!(gfp_mask & __GFP_WAIT))
2589 return CHARGE_WOULDBLOCK;
2590
2591 if (gfp_mask & __GFP_NORETRY)
2592 return CHARGE_NOMEM;
2593
2594 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2595 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2596 return CHARGE_RETRY;
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2607 return CHARGE_RETRY;
2608
2609
2610
2611
2612
2613 if (mem_cgroup_wait_acct_move(mem_over_limit))
2614 return CHARGE_RETRY;
2615
2616
2617 if (!oom_check)
2618 return CHARGE_NOMEM;
2619
2620 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2621 return CHARGE_OOM_DIE;
2622
2623 return CHARGE_RETRY;
2624}
2625
/*
 * __mem_cgroup_try_charge() does
 * 1. detect memcg to be charged against from passed *mm and *ptr,
 * 2. update res_counter
 * 3. call memory reclaim if necessary.
 *
 * Normally returns 0 with *ptr set to the charged memcg.  Returns -ENOMEM
 * if the charge cannot be satisfied, and -EINTR with *ptr set to
 * root_mem_cgroup when the charge is bypassed because the task is being
 * killed or is already an OOM victim.
 */
2647static int __mem_cgroup_try_charge(struct mm_struct *mm,
2648 gfp_t gfp_mask,
2649 unsigned int nr_pages,
2650 struct mem_cgroup **ptr,
2651 bool oom)
2652{
2653 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2654 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2655 struct mem_cgroup *memcg = NULL;
2656 int ret;
2657
2658
2659
2660
2661
2662
2663 if (unlikely(test_thread_flag(TIF_MEMDIE)
2664 || fatal_signal_pending(current)))
2665 goto bypass;
2666
2667
2668
2669
2670
2671
2672
2673 if (!*ptr && !mm)
2674 *ptr = root_mem_cgroup;
2675again:
2676 if (*ptr) {
2677 memcg = *ptr;
2678 if (mem_cgroup_is_root(memcg))
2679 goto done;
2680 if (consume_stock(memcg, nr_pages))
2681 goto done;
2682 css_get(&memcg->css);
2683 } else {
2684 struct task_struct *p;
2685
2686 rcu_read_lock();
2687 p = rcu_dereference(mm->owner);
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698 memcg = mem_cgroup_from_task(p);
2699 if (!memcg)
2700 memcg = root_mem_cgroup;
2701 if (mem_cgroup_is_root(memcg)) {
2702 rcu_read_unlock();
2703 goto done;
2704 }
2705 if (consume_stock(memcg, nr_pages)) {
2706
2707
2708
2709
2710
2711
2712
2713
2714 rcu_read_unlock();
2715 goto done;
2716 }
2717
2718 if (!css_tryget(&memcg->css)) {
2719 rcu_read_unlock();
2720 goto again;
2721 }
2722 rcu_read_unlock();
2723 }
2724
2725 do {
2726 bool oom_check;
2727
2728
2729 if (fatal_signal_pending(current)) {
2730 css_put(&memcg->css);
2731 goto bypass;
2732 }
2733
2734 oom_check = false;
2735 if (oom && !nr_oom_retries) {
2736 oom_check = true;
2737 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2738 }
2739
2740 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2741 oom_check);
2742 switch (ret) {
2743 case CHARGE_OK:
2744 break;
2745 case CHARGE_RETRY:
2746 batch = nr_pages;
2747 css_put(&memcg->css);
2748 memcg = NULL;
2749 goto again;
2750 case CHARGE_WOULDBLOCK:
2751 css_put(&memcg->css);
2752 goto nomem;
2753 case CHARGE_NOMEM:
2754 if (!oom) {
2755 css_put(&memcg->css);
2756 goto nomem;
2757 }
2758
2759 nr_oom_retries--;
2760 break;
2761 case CHARGE_OOM_DIE:
2762 css_put(&memcg->css);
2763 goto bypass;
2764 }
2765 } while (ret != CHARGE_OK);
2766
2767 if (batch > nr_pages)
2768 refill_stock(memcg, batch - nr_pages);
2769 css_put(&memcg->css);
2770done:
2771 *ptr = memcg;
2772 return 0;
2773nomem:
2774 *ptr = NULL;
2775 return -ENOMEM;
2776bypass:
2777 *ptr = root_mem_cgroup;
2778 return -EINTR;
2779}
2780
/*
 * Sometimes we have to undo a charge we got by try_charge().
 * This function is for that and does the uncharge.
 */
2786static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2787 unsigned int nr_pages)
2788{
2789 if (!mem_cgroup_is_root(memcg)) {
2790 unsigned long bytes = nr_pages * PAGE_SIZE;
2791
2792 res_counter_uncharge(&memcg->res, bytes);
2793 if (do_swap_account)
2794 res_counter_uncharge(&memcg->memsw, bytes);
2795 }
2796}
2797
2798
2799
2800
2801
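/*
 * Undo a charge in this cgroup only, leaving the hierarchical charge held by
 * its parents intact.  Used after a page's charge has been moved up to the
 * parent cgroup.
 */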
2802static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2803 unsigned int nr_pages)
2804{
2805 unsigned long bytes = nr_pages * PAGE_SIZE;
2806
2807 if (mem_cgroup_is_root(memcg))
2808 return;
2809
2810 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2811 if (do_swap_account)
2812 res_counter_uncharge_until(&memcg->memsw,
2813 memcg->memsw.parent, bytes);
2814}
2815
2816
2817
2818
2819
2820
2821
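/*
 * Look up a mem_cgroup by the css ID stored in swap_cgroup records.  Returns
 * NULL for ID 0 or when no such cgroup exists; callers that keep the result
 * must acquire their own css reference (css_tryget).
 */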
2822static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2823{
2824 struct cgroup_subsys_state *css;
2825
2826
2827 if (!id)
2828 return NULL;
2829 css = css_lookup(&mem_cgroup_subsys, id);
2830 if (!css)
2831 return NULL;
2832 return mem_cgroup_from_css(css);
2833}
2834
2835struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2836{
2837 struct mem_cgroup *memcg = NULL;
2838 struct page_cgroup *pc;
2839 unsigned short id;
2840 swp_entry_t ent;
2841
2842 VM_BUG_ON(!PageLocked(page));
2843
2844 pc = lookup_page_cgroup(page);
2845 lock_page_cgroup(pc);
2846 if (PageCgroupUsed(pc)) {
2847 memcg = pc->mem_cgroup;
2848 if (memcg && !css_tryget(&memcg->css))
2849 memcg = NULL;
2850 } else if (PageSwapCache(page)) {
2851 ent.val = page_private(page);
2852 id = lookup_swap_cgroup_id(ent);
2853 rcu_read_lock();
2854 memcg = mem_cgroup_lookup(id);
2855 if (memcg && !css_tryget(&memcg->css))
2856 memcg = NULL;
2857 rcu_read_unlock();
2858 }
2859 unlock_page_cgroup(pc);
2860 return memcg;
2861}
2862
2863static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2864 struct page *page,
2865 unsigned int nr_pages,
2866 enum charge_type ctype,
2867 bool lrucare)
2868{
2869 struct page_cgroup *pc = lookup_page_cgroup(page);
2870 struct zone *uninitialized_var(zone);
2871 struct lruvec *lruvec;
2872 bool was_on_lru = false;
2873 bool anon;
2874
2875 lock_page_cgroup(pc);
2876 VM_BUG_ON(PageCgroupUsed(pc));
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886 if (lrucare) {
2887 zone = page_zone(page);
2888 spin_lock_irq(&zone->lru_lock);
2889 if (PageLRU(page)) {
2890 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2891 ClearPageLRU(page);
2892 del_page_from_lru_list(page, lruvec, page_lru(page));
2893 was_on_lru = true;
2894 }
2895 }
2896
2897 pc->mem_cgroup = memcg;
2898
2899
2900
2901
2902
2903
2904
2905 smp_wmb();
2906 SetPageCgroupUsed(pc);
2907
2908 if (lrucare) {
2909 if (was_on_lru) {
2910 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2911 VM_BUG_ON(PageLRU(page));
2912 SetPageLRU(page);
2913 add_page_to_lru_list(page, lruvec, page_lru(page));
2914 }
2915 spin_unlock_irq(&zone->lru_lock);
2916 }
2917
2918 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2919 anon = true;
2920 else
2921 anon = false;
2922
2923 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2924 unlock_page_cgroup(pc);
2925
2926
2927
2928
2929
2930
2931 memcg_check_events(memcg, page);
2932}
2933
2934static DEFINE_MUTEX(set_limit_mutex);
2935
2936#ifdef CONFIG_MEMCG_KMEM
2937static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2938{
2939 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2940 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2941}
2942
2943
2944
2945
2946
2947static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2948{
2949 struct kmem_cache *cachep;
2950
2951 VM_BUG_ON(p->is_root_cache);
2952 cachep = p->root_cache;
2953 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2954}
2955
2956#ifdef CONFIG_SLABINFO
2957static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2958 struct seq_file *m)
2959{
2960 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2961 struct memcg_cache_params *params;
2962
2963 if (!memcg_can_account_kmem(memcg))
2964 return -EIO;
2965
2966 print_slabinfo_header(m);
2967
2968 mutex_lock(&memcg->slab_caches_mutex);
2969 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2970 cache_show(memcg_params_to_cache(params), m);
2971 mutex_unlock(&memcg->slab_caches_mutex);
2972
2973 return 0;
2974}
2975#endif
2976
2977static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2978{
2979 struct res_counter *fail_res;
2980 struct mem_cgroup *_memcg;
2981 int ret = 0;
2982 bool may_oom;
2983
2984 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2985 if (ret)
2986 return ret;
2987
2988
2989
2990
2991
2992 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2993
2994 _memcg = memcg;
2995 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2996 &_memcg, may_oom);
2997
2998 if (ret == -EINTR) {
3013
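		/*
		 * __mem_cgroup_try_charge() bypassed the charge to the root
		 * cgroup because the task is dying or was OOM-killed.  The
		 * kmem counter is already charged and the cache has already
		 * been picked, so failing is not an option here: force the
		 * res/memsw charge through and report success.
		 */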
3014 res_counter_charge_nofail(&memcg->res, size, &fail_res);
3015 if (do_swap_account)
3016 res_counter_charge_nofail(&memcg->memsw, size,
3017 &fail_res);
3018 ret = 0;
3019 } else if (ret)
3020 res_counter_uncharge(&memcg->kmem, size);
3021
3022 return ret;
3023}
3024
3025static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3026{
3027 res_counter_uncharge(&memcg->res, size);
3028 if (do_swap_account)
3029 res_counter_uncharge(&memcg->memsw, size);
3030
3031
3032 if (res_counter_uncharge(&memcg->kmem, size))
3033 return;
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043 if (memcg_kmem_test_and_clear_dead(memcg))
3044 css_put(&memcg->css);
3045}
3046
3047void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3048{
3049 if (!memcg)
3050 return;
3051
3052 mutex_lock(&memcg->slab_caches_mutex);
3053 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3054 mutex_unlock(&memcg->slab_caches_mutex);
3055}
3056
3057
3058
3059
3060
3061
3062int memcg_cache_id(struct mem_cgroup *memcg)
3063{
3064 return memcg ? memcg->kmemcg_id : -1;
3065}
3066
3067
3068
3069
3070
3071
3072
3073
3074int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3075{
3076 int num, ret;
3077
3078 num = ida_simple_get(&kmem_limited_groups,
3079 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3080 if (num < 0)
3081 return num;
3082
3083
3084
3085
3086
3087
3088
3089 memcg_kmem_set_activated(memcg);
3090
3091 ret = memcg_update_all_caches(num+1);
3092 if (ret) {
3093 ida_simple_remove(&kmem_limited_groups, num);
3094 memcg_kmem_clear_activated(memcg);
3095 return ret;
3096 }
3097
3098 memcg->kmemcg_id = num;
3099 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3100 mutex_init(&memcg->slab_caches_mutex);
3101 return 0;
3102}
3103
3104static size_t memcg_caches_array_size(int num_groups)
3105{
3106 ssize_t size;
3107 if (num_groups <= 0)
3108 return 0;
3109
3110 size = 2 * num_groups;
3111 if (size < MEMCG_CACHES_MIN_SIZE)
3112 size = MEMCG_CACHES_MIN_SIZE;
3113 else if (size > MEMCG_CACHES_MAX_SIZE)
3114 size = MEMCG_CACHES_MAX_SIZE;
3115
3116 return size;
3117}
3118
3119
3120
3121
3122
3123
3124void memcg_update_array_size(int num)
3125{
3126 if (num > memcg_limited_groups_array_size)
3127 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3128}
3129
3130static void kmem_cache_destroy_work_func(struct work_struct *w);
3131
3132int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3133{
3134 struct memcg_cache_params *cur_params = s->memcg_params;
3135
3136 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
3137
3138 if (num_groups > memcg_limited_groups_array_size) {
3139 int i;
3140 ssize_t size = memcg_caches_array_size(num_groups);
3141
3142 size *= sizeof(void *);
3143 size += sizeof(struct memcg_cache_params);
3144
3145 s->memcg_params = kzalloc(size, GFP_KERNEL);
3146 if (!s->memcg_params) {
3147 s->memcg_params = cur_params;
3148 return -ENOMEM;
3149 }
3150
3151 s->memcg_params->is_root_cache = true;
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3163 if (!cur_params->memcg_caches[i])
3164 continue;
3165 s->memcg_params->memcg_caches[i] =
3166 cur_params->memcg_caches[i];
3167 }
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178 kfree(cur_params);
3179 }
3180 return 0;
3181}
3182
3183int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3184 struct kmem_cache *root_cache)
3185{
3186 size_t size = sizeof(struct memcg_cache_params);
3187
3188 if (!memcg_kmem_enabled())
3189 return 0;
3190
3191 if (!memcg)
3192 size += memcg_limited_groups_array_size * sizeof(void *);
3193
3194 s->memcg_params = kzalloc(size, GFP_KERNEL);
3195 if (!s->memcg_params)
3196 return -ENOMEM;
3197
3198 if (memcg) {
3199 s->memcg_params->memcg = memcg;
3200 s->memcg_params->root_cache = root_cache;
3201 INIT_WORK(&s->memcg_params->destroy,
3202 kmem_cache_destroy_work_func);
3203 } else
3204 s->memcg_params->is_root_cache = true;
3205
3206 return 0;
3207}
3208
3209void memcg_release_cache(struct kmem_cache *s)
3210{
3211 struct kmem_cache *root;
3212 struct mem_cgroup *memcg;
3213 int id;
3214
3215
3216
3217
3218
3219 if (!s->memcg_params)
3220 return;
3221
3222 if (s->memcg_params->is_root_cache)
3223 goto out;
3224
3225 memcg = s->memcg_params->memcg;
3226 id = memcg_cache_id(memcg);
3227
3228 root = s->memcg_params->root_cache;
3229 root->memcg_params->memcg_caches[id] = NULL;
3230
3231 mutex_lock(&memcg->slab_caches_mutex);
3232 list_del(&s->memcg_params->list);
3233 mutex_unlock(&memcg->slab_caches_mutex);
3234
3235 css_put(&memcg->css);
3236out:
3237 kfree(s->memcg_params);
3238}
3258
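/*
 * Creating a per-memcg cache (or merely enqueueing its creation) itself
 * performs allocations, which would recurse back into the kmem accounting
 * code.  The helpers below bump a per-task counter that makes
 * __memcg_kmem_get_cache() and __memcg_kmem_newpage_charge() skip accounting
 * while such internal allocations are in flight.
 */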
3259static inline void memcg_stop_kmem_account(void)
3260{
3261 VM_BUG_ON(!current->mm);
3262 current->memcg_kmem_skip_account++;
3263}
3264
3265static inline void memcg_resume_kmem_account(void)
3266{
3267 VM_BUG_ON(!current->mm);
3268 current->memcg_kmem_skip_account--;
3269}
3270
3271static void kmem_cache_destroy_work_func(struct work_struct *w)
3272{
3273 struct kmem_cache *cachep;
3274 struct memcg_cache_params *p;
3275
3276 p = container_of(w, struct memcg_cache_params, destroy);
3277
3278 cachep = memcg_params_to_cache(p);
3295
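	/*
	 * If the cache still holds pages, try shrinking it.  Should that drop
	 * the page count to zero, the release path requeues this work, so
	 * return instead of destroying the cache from here.  Destroy directly
	 * only when no pages were left to begin with.
	 */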
3296 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3297 kmem_cache_shrink(cachep);
3298 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3299 return;
3300 } else
3301 kmem_cache_destroy(cachep);
3302}
3303
3304void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3305{
3306 if (!cachep->memcg_params->dead)
3307 return;
3326
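	/*
	 * Destruction can be requested more than once while pages drain from
	 * the cache; if the destroy work is already queued there is nothing
	 * further to do here.
	 */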
3327 if (work_pending(&cachep->memcg_params->destroy))
3328 return;
3329
3330
3331
3332
3333 schedule_work(&cachep->memcg_params->destroy);
3334}
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344static DEFINE_MUTEX(memcg_cache_mutex);
3345
3346
3347
3348
3349static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3350 struct kmem_cache *s)
3351{
3352 struct kmem_cache *new;
3353 static char *tmp_name = NULL;
3354
3355 lockdep_assert_held(&memcg_cache_mutex);
3356
3357
3358
3359
3360
3361
3362
3363 if (!tmp_name) {
3364 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3365 if (!tmp_name)
3366 return NULL;
3367 }
3368
3369 rcu_read_lock();
3370 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3371 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3372 rcu_read_unlock();
3373
3374 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3375 (s->flags & ~SLAB_PANIC), s->ctor, s);
3376
3377 if (new)
3378 new->allocflags |= __GFP_KMEMCG;
3379
3380 return new;
3381}
3382
3383static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3384 struct kmem_cache *cachep)
3385{
3386 struct kmem_cache *new_cachep;
3387 int idx;
3388
3389 BUG_ON(!memcg_can_account_kmem(memcg));
3390
3391 idx = memcg_cache_id(memcg);
3392
3393 mutex_lock(&memcg_cache_mutex);
3394 new_cachep = cachep->memcg_params->memcg_caches[idx];
3395 if (new_cachep) {
3396 css_put(&memcg->css);
3397 goto out;
3398 }
3399
3400 new_cachep = kmem_cache_dup(memcg, cachep);
3401 if (new_cachep == NULL) {
3402 new_cachep = cachep;
3403 css_put(&memcg->css);
3404 goto out;
3405 }
3406
3407 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3408
3409 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3410
3411
3412
3413
3414 wmb();
3415out:
3416 mutex_unlock(&memcg_cache_mutex);
3417 return new_cachep;
3418}
3419
3420void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3421{
3422 struct kmem_cache *c;
3423 int i;
3424
3425 if (!s->memcg_params)
3426 return;
3427 if (!s->memcg_params->is_root_cache)
3428 return;
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439 mutex_lock(&set_limit_mutex);
3440 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3441 c = s->memcg_params->memcg_caches[i];
3442 if (!c)
3443 continue;
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458 c->memcg_params->dead = false;
3459 cancel_work_sync(&c->memcg_params->destroy);
3460 kmem_cache_destroy(c);
3461 }
3462 mutex_unlock(&set_limit_mutex);
3463}
3464
3465struct create_work {
3466 struct mem_cgroup *memcg;
3467 struct kmem_cache *cachep;
3468 struct work_struct work;
3469};
3470
3471static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3472{
3473 struct kmem_cache *cachep;
3474 struct memcg_cache_params *params;
3475
3476 if (!memcg_kmem_is_active(memcg))
3477 return;
3478
3479 mutex_lock(&memcg->slab_caches_mutex);
3480 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3481 cachep = memcg_params_to_cache(params);
3482 cachep->memcg_params->dead = true;
3483 schedule_work(&cachep->memcg_params->destroy);
3484 }
3485 mutex_unlock(&memcg->slab_caches_mutex);
3486}
3487
3488static void memcg_create_cache_work_func(struct work_struct *w)
3489{
3490 struct create_work *cw;
3491
3492 cw = container_of(w, struct create_work, work);
3493 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3494 kfree(cw);
3495}
3496
3497
3498
3499
3500static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3501 struct kmem_cache *cachep)
3502{
3503 struct create_work *cw;
3504
3505 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3506 if (cw == NULL) {
3507 css_put(&memcg->css);
3508 return;
3509 }
3510
3511 cw->memcg = memcg;
3512 cw->cachep = cachep;
3513
3514 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3515 schedule_work(&cw->work);
3516}
3517
3518static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3519 struct kmem_cache *cachep)
3520{
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532 memcg_stop_kmem_account();
3533 __memcg_create_cache_enqueue(memcg, cachep);
3534 memcg_resume_kmem_account();
3535}
3548
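/*
 * Return the kmem_cache to allocate from on behalf of the current task: the
 * current memcg's copy of @cachep if it exists, otherwise @cachep itself.
 * If the per-memcg copy does not exist yet, its creation is scheduled
 * asynchronously and this allocation is served from the root cache.  The
 * lookup runs on the slab hot path and is lockless (RCU plus a dependent
 * read barrier against the publisher's write barrier).
 */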
3549struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3550 gfp_t gfp)
3551{
3552 struct mem_cgroup *memcg;
3553 int idx;
3554
3555 VM_BUG_ON(!cachep->memcg_params);
3556 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3557
3558 if (!current->mm || current->memcg_kmem_skip_account)
3559 return cachep;
3560
3561 rcu_read_lock();
3562 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3563
3564 if (!memcg_can_account_kmem(memcg))
3565 goto out;
3566
3567 idx = memcg_cache_id(memcg);
3568
3569
3570
3571
3572
3573 read_barrier_depends();
3574 if (likely(cachep->memcg_params->memcg_caches[idx])) {
3575 cachep = cachep->memcg_params->memcg_caches[idx];
3576 goto out;
3577 }
3578
3579
3580 if (!css_tryget(&memcg->css))
3581 goto out;
3582 rcu_read_unlock();
3600
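	/*
	 * The per-memcg cache is missing.  Creating it right here would mean
	 * taking locks that are not safe to acquire from an arbitrary
	 * allocation context, so defer the creation to a workqueue and let
	 * this allocation proceed from the root cache.
	 */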
3601 memcg_create_cache_enqueue(memcg, cachep);
3602 return cachep;
3603out:
3604 rcu_read_unlock();
3605 return cachep;
3606}
3607EXPORT_SYMBOL(__memcg_kmem_get_cache);
3622
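/*
 * Charge a not-yet-allocated page of the given @order against the current
 * task's memcg.  On success *_memcg is set to the memcg the caller must
 * later commit the page to; the task may migrate between cgroups before the
 * allocation completes, so the commit step cannot rely on "current" again.
 * Returns true if the allocation may proceed and false if it must fail.
 */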
3623bool
3624__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3625{
3626 struct mem_cgroup *memcg;
3627 int ret;
3628
3629 *_memcg = NULL;
3654
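	/*
	 * Skip accounting for kernel threads (no mm) and while memcg-internal
	 * allocations are in flight (memcg_kmem_skip_account).  Cache
	 * allocations are already filtered in the cache selector, but large
	 * allocations served straight from the page allocator arrive here
	 * without passing through it, so the check is repeated.
	 */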
3655 if (!current->mm || current->memcg_kmem_skip_account)
3656 return true;
3657
3658 memcg = try_get_mem_cgroup_from_mm(current->mm);
3659
3660
3661
3662
3663
3664
3665 if (unlikely(!memcg))
3666 return true;
3667
3668 if (!memcg_can_account_kmem(memcg)) {
3669 css_put(&memcg->css);
3670 return true;
3671 }
3672
3673 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3674 if (!ret)
3675 *_memcg = memcg;
3676
3677 css_put(&memcg->css);
3678 return (ret == 0);
3679}
3680
3681void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3682 int order)
3683{
3684 struct page_cgroup *pc;
3685
3686 VM_BUG_ON(mem_cgroup_is_root(memcg));
3687
3688
3689 if (!page) {
3690 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3691 return;
3692 }
3693
3694 pc = lookup_page_cgroup(page);
3695 lock_page_cgroup(pc);
3696 pc->mem_cgroup = memcg;
3697 SetPageCgroupUsed(pc);
3698 unlock_page_cgroup(pc);
3699}
3700
3701void __memcg_kmem_uncharge_pages(struct page *page, int order)
3702{
3703 struct mem_cgroup *memcg = NULL;
3704 struct page_cgroup *pc;
3705
3706
3707 pc = lookup_page_cgroup(page);
3708
3709
3710
3711
3712 if (!PageCgroupUsed(pc))
3713 return;
3714
3715 lock_page_cgroup(pc);
3716 if (PageCgroupUsed(pc)) {
3717 memcg = pc->mem_cgroup;
3718 ClearPageCgroupUsed(pc);
3719 }
3720 unlock_page_cgroup(pc);
3721
3722
3723
3724
3725
3726 if (!memcg)
3727 return;
3728
3729 VM_BUG_ON(mem_cgroup_is_root(memcg));
3730 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3731}
3732#else
3733static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3734{
3735}
3736#endif
3737
3738#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3739
3740#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3741
3742
3743
3744
3745
3746
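/*
 * When a transparent huge page is split, copy the head page_cgroup's
 * mem_cgroup (and the flags that survive a split) to every tail page_cgroup,
 * and drop the huge-page portion of the RSS_HUGE statistic.  The caller
 * holds the compound lock, so this cannot race with charge moving.
 */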
3747void mem_cgroup_split_huge_fixup(struct page *head)
3748{
3749 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3750 struct page_cgroup *pc;
3751 struct mem_cgroup *memcg;
3752 int i;
3753
3754 if (mem_cgroup_disabled())
3755 return;
3756
3757 memcg = head_pc->mem_cgroup;
3758 for (i = 1; i < HPAGE_PMD_NR; i++) {
3759 pc = head_pc + i;
3760 pc->mem_cgroup = memcg;
3761 smp_wmb();
3762 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3763 }
3764 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3765 HPAGE_PMD_NR);
3766}
3767#endif
3783
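/*
 * mem_cgroup_move_account - move the accounting of a page from @from to @to.
 * The page must be isolated from the LRU, and for huge pages (nr_pages > 1)
 * the compound lock must be held.  Only statistics and pc->mem_cgroup are
 * transferred; the caller charges @to beforehand and uncharges @from
 * afterwards.  Returns 0 on success, -EBUSY if the page was split under us,
 * or -EINVAL if the page is no longer charged to @from.
 */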
3784static int mem_cgroup_move_account(struct page *page,
3785 unsigned int nr_pages,
3786 struct page_cgroup *pc,
3787 struct mem_cgroup *from,
3788 struct mem_cgroup *to)
3789{
3790 unsigned long flags;
3791 int ret;
3792 bool anon = PageAnon(page);
3793
3794 VM_BUG_ON(from == to);
3795 VM_BUG_ON(PageLRU(page));
3796
3797
3798
3799
3800
3801
3802 ret = -EBUSY;
3803 if (nr_pages > 1 && !PageTransHuge(page))
3804 goto out;
3805
3806 lock_page_cgroup(pc);
3807
3808 ret = -EINVAL;
3809 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3810 goto unlock;
3811
3812 move_lock_mem_cgroup(from, &flags);
3813
3814 if (!anon && page_mapped(page)) {
3815
3816 preempt_disable();
3817 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3818 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3819 preempt_enable();
3820 }
3821 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3822
3823
3824 pc->mem_cgroup = to;
3825 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3826 move_unlock_mem_cgroup(from, &flags);
3827 ret = 0;
3828unlock:
3829 unlock_page_cgroup(pc);
3830
3831
3832
3833 memcg_check_events(to, page);
3834 memcg_check_events(from, page);
3835out:
3836 return ret;
3837}
3859
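/*
 * mem_cgroup_move_parent - move a page's charge from @child to its parent
 * (or to the root cgroup when @child has no parent).  Failures are temporary
 * and indicate a race with page freeing, uncharge or migration; callers
 * simply retry.  On success the child's local charge is cancelled so that
 * only the parent's counters account for the page.
 */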
3860static int mem_cgroup_move_parent(struct page *page,
3861 struct page_cgroup *pc,
3862 struct mem_cgroup *child)
3863{
3864 struct mem_cgroup *parent;
3865 unsigned int nr_pages;
3866 unsigned long uninitialized_var(flags);
3867 int ret;
3868
3869 VM_BUG_ON(mem_cgroup_is_root(child));
3870
3871 ret = -EBUSY;
3872 if (!get_page_unless_zero(page))
3873 goto out;
3874 if (isolate_lru_page(page))
3875 goto put;
3876
3877 nr_pages = hpage_nr_pages(page);
3878
3879 parent = parent_mem_cgroup(child);
3880
3881
3882
3883 if (!parent)
3884 parent = root_mem_cgroup;
3885
3886 if (nr_pages > 1) {
3887 VM_BUG_ON(!PageTransHuge(page));
3888 flags = compound_lock_irqsave(page);
3889 }
3890
3891 ret = mem_cgroup_move_account(page, nr_pages,
3892 pc, child, parent);
3893 if (!ret)
3894 __mem_cgroup_cancel_local_charge(child, nr_pages);
3895
3896 if (nr_pages > 1)
3897 compound_unlock_irqrestore(page, flags);
3898 putback_lru_page(page);
3899put:
3900 put_page(page);
3901out:
3902 return ret;
3903}
3904
3905
3906
3907
3908
3909
3910
3911static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3912 gfp_t gfp_mask, enum charge_type ctype)
3913{
3914 struct mem_cgroup *memcg = NULL;
3915 unsigned int nr_pages = 1;
3916 bool oom = true;
3917 int ret;
3918
3919 if (PageTransHuge(page)) {
3920 nr_pages <<= compound_order(page);
3921 VM_BUG_ON(!PageTransHuge(page));
3922
3923
3924
3925
3926 oom = false;
3927 }
3928
3929 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3930 if (ret == -ENOMEM)
3931 return ret;
3932 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3933 return 0;
3934}
3935
3936int mem_cgroup_newpage_charge(struct page *page,
3937 struct mm_struct *mm, gfp_t gfp_mask)
3938{
3939 if (mem_cgroup_disabled())
3940 return 0;
3941 VM_BUG_ON(page_mapped(page));
3942 VM_BUG_ON(page->mapping && !PageAnon(page));
3943 VM_BUG_ON(!mm);
3944 return mem_cgroup_charge_common(page, mm, gfp_mask,
3945 MEM_CGROUP_CHARGE_TYPE_ANON);
3946}
3947
3948
3949
3950
3951
3952
3953
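/*
 * Charge a page that is being read back in from swap.  With swap accounting
 * enabled the charge goes to the memcg recorded in the swap_cgroup entry at
 * swap-out time; otherwise, or when that memcg is gone, the faulting mm is
 * charged instead.  Pages already charged (the Used bit is set) are skipped.
 */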
3954static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3955 struct page *page,
3956 gfp_t mask,
3957 struct mem_cgroup **memcgp)
3958{
3959 struct mem_cgroup *memcg;
3960 struct page_cgroup *pc;
3961 int ret;
3962
3963 pc = lookup_page_cgroup(page);
3964
3965
3966
3967
3968
3969
3970
3971 if (PageCgroupUsed(pc))
3972 return 0;
3973 if (!do_swap_account)
3974 goto charge_cur_mm;
3975 memcg = try_get_mem_cgroup_from_page(page);
3976 if (!memcg)
3977 goto charge_cur_mm;
3978 *memcgp = memcg;
3979 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
3980 css_put(&memcg->css);
3981 if (ret == -EINTR)
3982 ret = 0;
3983 return ret;
3984charge_cur_mm:
3985 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
3986 if (ret == -EINTR)
3987 ret = 0;
3988 return ret;
3989}
3990
3991int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3992 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3993{
3994 *memcgp = NULL;
3995 if (mem_cgroup_disabled())
3996 return 0;
3997
3998
3999
4000
4001
4002
4003 if (!PageSwapCache(page)) {
4004 int ret;
4005
4006 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
4007 if (ret == -EINTR)
4008 ret = 0;
4009 return ret;
4010 }
4011 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
4012}
4013
4014void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
4015{
4016 if (mem_cgroup_disabled())
4017 return;
4018 if (!memcg)
4019 return;
4020 __mem_cgroup_cancel_charge(memcg, 1);
4021}
4022
4023static void
4024__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
4025 enum charge_type ctype)
4026{
4027 if (mem_cgroup_disabled())
4028 return;
4029 if (!memcg)
4030 return;
4031
4032 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
4033
4034
4035
4036
4037
4038
4039
4040 if (do_swap_account && PageSwapCache(page)) {
4041 swp_entry_t ent = {.val = page_private(page)};
4042 mem_cgroup_uncharge_swap(ent);
4043 }
4044}
4045
4046void mem_cgroup_commit_charge_swapin(struct page *page,
4047 struct mem_cgroup *memcg)
4048{
4049 __mem_cgroup_commit_charge_swapin(page, memcg,
4050 MEM_CGROUP_CHARGE_TYPE_ANON);
4051}
4052
4053int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4054 gfp_t gfp_mask)
4055{
4056 struct mem_cgroup *memcg = NULL;
4057 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4058 int ret;
4059
4060 if (mem_cgroup_disabled())
4061 return 0;
4062 if (PageCompound(page))
4063 return 0;
4064
4065 if (!PageSwapCache(page))
4066 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4067 else {
4068 ret = __mem_cgroup_try_charge_swapin(mm, page,
4069 gfp_mask, &memcg);
4070 if (!ret)
4071 __mem_cgroup_commit_charge_swapin(page, memcg, type);
4072 }
4073 return ret;
4074}
4075
4076static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4077 unsigned int nr_pages,
4078 const enum charge_type ctype)
4079{
4080 struct memcg_batch_info *batch = NULL;
4081 bool uncharge_memsw = true;
4082
4083
4084 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4085 uncharge_memsw = false;
4086
	batch = &current->memcg_batch;
4088
4089
4090
4091
4092
4093 if (!batch->memcg)
4094 batch->memcg = memcg;
4095
4096
4097
4098
4099
4100
4101
4102
4103 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4104 goto direct_uncharge;
4105
4106 if (nr_pages > 1)
4107 goto direct_uncharge;
4108
4109
4110
4111
4112
4113
4114 if (batch->memcg != memcg)
4115 goto direct_uncharge;
4116
4117 batch->nr_pages++;
4118 if (uncharge_memsw)
4119 batch->memsw_nr_pages++;
4120 return;
4121direct_uncharge:
4122 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4123 if (uncharge_memsw)
4124 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4125 if (unlikely(batch->memcg != memcg))
4126 memcg_oom_recover(memcg);
4127}
4128
4129
4130
4131
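/*
 * Common uncharge path for anonymous, page cache and swap-out pages.  Clears
 * the page_cgroup Used bit, updates statistics and returns the memcg the
 * page was charged to, or NULL if the page was not charged or must stay
 * charged for now (still mapped, or under migration).
 */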
4132static struct mem_cgroup *
4133__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4134 bool end_migration)
4135{
4136 struct mem_cgroup *memcg = NULL;
4137 unsigned int nr_pages = 1;
4138 struct page_cgroup *pc;
4139 bool anon;
4140
4141 if (mem_cgroup_disabled())
4142 return NULL;
4143
4144 if (PageTransHuge(page)) {
4145 nr_pages <<= compound_order(page);
4146 VM_BUG_ON(!PageTransHuge(page));
4147 }
4148
4149
4150
4151 pc = lookup_page_cgroup(page);
4152 if (unlikely(!PageCgroupUsed(pc)))
4153 return NULL;
4154
4155 lock_page_cgroup(pc);
4156
4157 memcg = pc->mem_cgroup;
4158
4159 if (!PageCgroupUsed(pc))
4160 goto unlock_out;
4161
4162 anon = PageAnon(page);
4163
4164 switch (ctype) {
4165 case MEM_CGROUP_CHARGE_TYPE_ANON:
4166
4167
4168
4169
4170
4171 anon = true;
4172
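		/* fall through */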
4173 case MEM_CGROUP_CHARGE_TYPE_DROP:
4174
4175 if (page_mapped(page))
4176 goto unlock_out;
4177
4178
4179
4180
4181
4182
4183
4184 if (!end_migration && PageCgroupMigration(pc))
4185 goto unlock_out;
4186 break;
4187 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4188 if (!PageAnon(page)) {
4189 if (page->mapping && !page_is_file_cache(page))
4190 goto unlock_out;
4191 } else if (page_mapped(page))
4192 goto unlock_out;
4193 break;
4194 default:
4195 break;
4196 }
4197
4198 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4199
4200 ClearPageCgroupUsed(pc);
4201
4202
4203
4204
4205
4206
4207
4208 unlock_page_cgroup(pc);
4209
4210
4211
4212
4213 memcg_check_events(memcg, page);
4214 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4215 mem_cgroup_swap_statistics(memcg, true);
4216 css_get(&memcg->css);
4217 }
4218
4219
4220
4221
4222
4223 if (!end_migration && !mem_cgroup_is_root(memcg))
4224 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4225
4226 return memcg;
4227
4228unlock_out:
4229 unlock_page_cgroup(pc);
4230 return NULL;
4231}
4232
4233void mem_cgroup_uncharge_page(struct page *page)
4234{
4235
4236 if (page_mapped(page))
4237 return;
4238 VM_BUG_ON(page->mapping && !PageAnon(page));
4250
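	/*
	 * A page still in the swap cache keeps its charge; it is uncharged
	 * later through the swap path (mem_cgroup_uncharge_swapcache), so
	 * there is nothing to do here yet.
	 */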
4251 if (PageSwapCache(page))
4252 return;
4253 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4254}
4255
4256void mem_cgroup_uncharge_cache_page(struct page *page)
4257{
4258 VM_BUG_ON(page_mapped(page));
4259 VM_BUG_ON(page->mapping);
4260 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4261}
4262
4263
4264
4265
4266
4267
4268
4269
4270
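/*
 * Batched uncharging: between mem_cgroup_uncharge_start() and
 * mem_cgroup_uncharge_end(), uncharges against the same memcg are only
 * counted in current->memcg_batch, and the res_counters are decremented once
 * at the end.  Useful for truncate/invalidate and unmap paths that uncharge
 * many pages in a row.
 */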
4271void mem_cgroup_uncharge_start(void)
4272{
4273 current->memcg_batch.do_batch++;
4274
4275 if (current->memcg_batch.do_batch == 1) {
4276 current->memcg_batch.memcg = NULL;
4277 current->memcg_batch.nr_pages = 0;
4278 current->memcg_batch.memsw_nr_pages = 0;
4279 }
4280}
4281
4282void mem_cgroup_uncharge_end(void)
4283{
	struct memcg_batch_info *batch = &current->memcg_batch;
4285
4286 if (!batch->do_batch)
4287 return;
4288
4289 batch->do_batch--;
4290 if (batch->do_batch)
4291 return;
4292
4293 if (!batch->memcg)
4294 return;
4295
4296
4297
4298
4299 if (batch->nr_pages)
4300 res_counter_uncharge(&batch->memcg->res,
4301 batch->nr_pages * PAGE_SIZE);
4302 if (batch->memsw_nr_pages)
4303 res_counter_uncharge(&batch->memcg->memsw,
4304 batch->memsw_nr_pages * PAGE_SIZE);
4305 memcg_oom_recover(batch->memcg);
4306
4307 batch->memcg = NULL;
4308}
4309
4310#ifdef CONFIG_SWAP
4311
4312
4313
4314
4315void
4316mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4317{
4318 struct mem_cgroup *memcg;
4319 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4320
4321 if (!swapout)
4322 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4323
4324 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4325
4326
4327
4328
4329
4330 if (do_swap_account && swapout && memcg)
4331 swap_cgroup_record(ent, css_id(&memcg->css));
4332}
4333#endif
4334
4335#ifdef CONFIG_MEMCG_SWAP
4336
4337
4338
4339
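/*
 * Called when a swap entry is freed: clear its swap_cgroup record, then drop
 * the memsw charge and the css reference that were retained at swap-out.
 */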
4340void mem_cgroup_uncharge_swap(swp_entry_t ent)
4341{
4342 struct mem_cgroup *memcg;
4343 unsigned short id;
4344
4345 if (!do_swap_account)
4346 return;
4347
4348 id = swap_cgroup_record(ent, 0);
4349 rcu_read_lock();
4350 memcg = mem_cgroup_lookup(id);
4351 if (memcg) {
4352
4353
4354
4355
4356 if (!mem_cgroup_is_root(memcg))
4357 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4358 mem_cgroup_swap_statistics(memcg, false);
4359 css_put(&memcg->css);
4360 }
4361 rcu_read_unlock();
4362}
4377
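/*
 * Reassign the swap_cgroup record for @entry from @from to @to while charges
 * are being moved between cgroups.  A css reference on @to is taken here
 * because a swap-in against the new record could otherwise see it disappear;
 * the counters and the old reference on @from are settled later by the
 * charge-moving code.  Returns -EINVAL if the record no longer points to
 * @from.
 */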
4378static int mem_cgroup_move_swap_account(swp_entry_t entry,
4379 struct mem_cgroup *from, struct mem_cgroup *to)
4380{
4381 unsigned short old_id, new_id;
4382
4383 old_id = css_id(&from->css);
4384 new_id = css_id(&to->css);
4385
4386 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4387 mem_cgroup_swap_statistics(from, false);
4388 mem_cgroup_swap_statistics(to, true);
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400 css_get(&to->css);
4401 return 0;
4402 }
4403 return -EINVAL;
4404}
4405#else
4406static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4407 struct mem_cgroup *from, struct mem_cgroup *to)
4408{
4409 return -EINVAL;
4410}
4411#endif
4412
4413
4414
4415
4416
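/*
 * Called before migrating @page to @newpage.  If the old page is charged,
 * commit the same charge to @newpage up front and report the memcg through
 * @memcgp so that mem_cgroup_end_migration() can settle the accounting once
 * the migration outcome is known.
 */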
4417void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4418 struct mem_cgroup **memcgp)
4419{
4420 struct mem_cgroup *memcg = NULL;
4421 unsigned int nr_pages = 1;
4422 struct page_cgroup *pc;
4423 enum charge_type ctype;
4424
4425 *memcgp = NULL;
4426
4427 if (mem_cgroup_disabled())
4428 return;
4429
4430 if (PageTransHuge(page))
4431 nr_pages <<= compound_order(page);
4432
4433 pc = lookup_page_cgroup(page);
4434 lock_page_cgroup(pc);
4435 if (PageCgroupUsed(pc)) {
4436 memcg = pc->mem_cgroup;
4437 css_get(&memcg->css);
4466
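		/*
		 * An anonymous page may become fully unmapped while migration
		 * is in flight, which would normally uncharge it; yet the
		 * migration can still fail and the page would then have to be
		 * charged again.  Set the Migration flag so its uncharge is
		 * deferred until mem_cgroup_end_migration().
		 */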
4467 if (PageAnon(page))
4468 SetPageCgroupMigration(pc);
4469 }
4470 unlock_page_cgroup(pc);
4471
4472
4473
4474
4475 if (!memcg)
4476 return;
4477
4478 *memcgp = memcg;
4479
4480
4481
4482
4483
4484
4485 if (PageAnon(page))
4486 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4487 else
4488 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4489
4490
4491
4492
4493
4494 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4495}
4496
4497
4498void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4499 struct page *oldpage, struct page *newpage, bool migration_ok)
4500{
4501 struct page *used, *unused;
4502 struct page_cgroup *pc;
4503 bool anon;
4504
4505 if (!memcg)
4506 return;
4507
4508 if (!migration_ok) {
4509 used = oldpage;
4510 unused = newpage;
4511 } else {
4512 used = newpage;
4513 unused = oldpage;
4514 }
4515 anon = PageAnon(used);
4516 __mem_cgroup_uncharge_common(unused,
4517 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4518 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4519 true);
4520 css_put(&memcg->css);
4521
4522
4523
4524
4525
4526 pc = lookup_page_cgroup(oldpage);
4527 lock_page_cgroup(pc);
4528 ClearPageCgroupMigration(pc);
4529 unlock_page_cgroup(pc);
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539 if (anon)
4540 mem_cgroup_uncharge_page(used);
4541}
4542
4543
4544
4545
4546
4547
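/*
 * Replace @oldpage with @newpage in the page cache accounting: the existing
 * charge is transferred to @newpage, so the res_counter usage is unchanged
 * and only the page it is attached to differs.
 */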
4548void mem_cgroup_replace_page_cache(struct page *oldpage,
4549 struct page *newpage)
4550{
4551 struct mem_cgroup *memcg = NULL;
4552 struct page_cgroup *pc;
4553 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4554
4555 if (mem_cgroup_disabled())
4556 return;
4557
4558 pc = lookup_page_cgroup(oldpage);
4559
4560 lock_page_cgroup(pc);
4561 if (PageCgroupUsed(pc)) {
4562 memcg = pc->mem_cgroup;
4563 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4564 ClearPageCgroupUsed(pc);
4565 }
4566 unlock_page_cgroup(pc);
4567
4568
4569
4570
4571
4572 if (!memcg)
4573 return;
4574
4575
4576
4577
4578
4579 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4580}
4581
4582#ifdef CONFIG_DEBUG_VM
4583static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4584{
4585 struct page_cgroup *pc;
4586
4587 pc = lookup_page_cgroup(page);
4588
4589
4590
4591
4592
4593 if (likely(pc) && PageCgroupUsed(pc))
4594 return pc;
4595 return NULL;
4596}
4597
4598bool mem_cgroup_bad_page_check(struct page *page)
4599{
4600 if (mem_cgroup_disabled())
4601 return false;
4602
4603 return lookup_page_cgroup_used(page) != NULL;
4604}
4605
4606void mem_cgroup_print_bad_page(struct page *page)
4607{
4608 struct page_cgroup *pc;
4609
4610 pc = lookup_page_cgroup_used(page);
4611 if (pc) {
4612 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4613 pc, pc->flags, pc->mem_cgroup);
4614 }
4615}
4616#endif
4617
4618static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4619 unsigned long long val)
4620{
4621 int retry_count;
4622 u64 memswlimit, memlimit;
4623 int ret = 0;
4624 int children = mem_cgroup_count_children(memcg);
4625 u64 curusage, oldusage;
4626 int enlarge;
4627
4628
4629
4630
4631
4632
4633 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4634
4635 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4636
4637 enlarge = 0;
4638 while (retry_count) {
4639 if (signal_pending(current)) {
4640 ret = -EINTR;
4641 break;
4642 }
4643
4644
4645
4646
4647
4648 mutex_lock(&set_limit_mutex);
4649 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4650 if (memswlimit < val) {
4651 ret = -EINVAL;
4652 mutex_unlock(&set_limit_mutex);
4653 break;
4654 }
4655
4656 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4657 if (memlimit < val)
4658 enlarge = 1;
4659
4660 ret = res_counter_set_limit(&memcg->res, val);
4661 if (!ret) {
4662 if (memswlimit == val)
4663 memcg->memsw_is_minimum = true;
4664 else
4665 memcg->memsw_is_minimum = false;
4666 }
4667 mutex_unlock(&set_limit_mutex);
4668
4669 if (!ret)
4670 break;
4671
4672 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4673 MEM_CGROUP_RECLAIM_SHRINK);
4674 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4675
4676 if (curusage >= oldusage)
4677 retry_count--;
4678 else
4679 oldusage = curusage;
4680 }
4681 if (!ret && enlarge)
4682 memcg_oom_recover(memcg);
4683
4684 return ret;
4685}
4686
4687static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4688 unsigned long long val)
4689{
4690 int retry_count;
4691 u64 memlimit, memswlimit, oldusage, curusage;
4692 int children = mem_cgroup_count_children(memcg);
4693 int ret = -EBUSY;
4694 int enlarge = 0;
4695
4696
4697 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4698 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4699 while (retry_count) {
4700 if (signal_pending(current)) {
4701 ret = -EINTR;
4702 break;
4703 }
4704
4705
4706
4707
4708
4709 mutex_lock(&set_limit_mutex);
4710 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4711 if (memlimit > val) {
4712 ret = -EINVAL;
4713 mutex_unlock(&set_limit_mutex);
4714 break;
4715 }
4716 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4717 if (memswlimit < val)
4718 enlarge = 1;
4719 ret = res_counter_set_limit(&memcg->memsw, val);
4720 if (!ret) {
4721 if (memlimit == val)
4722 memcg->memsw_is_minimum = true;
4723 else
4724 memcg->memsw_is_minimum = false;
4725 }
4726 mutex_unlock(&set_limit_mutex);
4727
4728 if (!ret)
4729 break;
4730
4731 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4732 MEM_CGROUP_RECLAIM_NOSWAP |
4733 MEM_CGROUP_RECLAIM_SHRINK);
4734 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4735
4736 if (curusage >= oldusage)
4737 retry_count--;
4738 else
4739 oldusage = curusage;
4740 }
4741 if (!ret && enlarge)
4742 memcg_oom_recover(memcg);
4743 return ret;
4744}
4745
4746unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4747 gfp_t gfp_mask,
4748 unsigned long *total_scanned)
4749{
4750 unsigned long nr_reclaimed = 0;
4751 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4752 unsigned long reclaimed;
4753 int loop = 0;
4754 struct mem_cgroup_tree_per_zone *mctz;
4755 unsigned long long excess;
4756 unsigned long nr_scanned;
4757
4758 if (order > 0)
4759 return 0;
4760
4761 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4762
4763
4764
4765
4766
4767 do {
4768 if (next_mz)
4769 mz = next_mz;
4770 else
4771 mz = mem_cgroup_largest_soft_limit_node(mctz);
4772 if (!mz)
4773 break;
4774
4775 nr_scanned = 0;
4776 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4777 gfp_mask, &nr_scanned);
4778 nr_reclaimed += reclaimed;
4779 *total_scanned += nr_scanned;
4780 spin_lock(&mctz->lock);
4781
4782
4783
4784
4785
4786 next_mz = NULL;
4787 if (!reclaimed) {
4788 do {
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800 next_mz =
4801 __mem_cgroup_largest_soft_limit_node(mctz);
4802 if (next_mz == mz)
4803 css_put(&next_mz->memcg->css);
4804 else
4805 break;
4806 } while (1);
4807 }
4808 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4809 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4820 spin_unlock(&mctz->lock);
4821 css_put(&mz->memcg->css);
4822 loop++;
4823
4824
4825
4826
4827
4828 if (!nr_reclaimed &&
4829 (next_mz == NULL ||
4830 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4831 break;
4832 } while (!nr_reclaimed);
4833 if (next_mz)
4834 css_put(&next_mz->memcg->css);
4835 return nr_reclaimed;
4836}
4848
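/*
 * Drain one LRU list of the given node/zone by moving each page's charge to
 * the parent cgroup.  A page that cannot be moved right now (racing with
 * uncharge or migration) is remembered and retried later; the loop ends when
 * the list is empty.
 */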
4849static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4850 int node, int zid, enum lru_list lru)
4851{
4852 struct lruvec *lruvec;
4853 unsigned long flags;
4854 struct list_head *list;
4855 struct page *busy;
4856 struct zone *zone;
4857
4858 zone = &NODE_DATA(node)->node_zones[zid];
4859 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4860 list = &lruvec->lists[lru];
4861
4862 busy = NULL;
4863 do {
4864 struct page_cgroup *pc;
4865 struct page *page;
4866
4867 spin_lock_irqsave(&zone->lru_lock, flags);
4868 if (list_empty(list)) {
4869 spin_unlock_irqrestore(&zone->lru_lock, flags);
4870 break;
4871 }
4872 page = list_entry(list->prev, struct page, lru);
4873 if (busy == page) {
4874 list_move(&page->lru, list);
4875 busy = NULL;
4876 spin_unlock_irqrestore(&zone->lru_lock, flags);
4877 continue;
4878 }
4879 spin_unlock_irqrestore(&zone->lru_lock, flags);
4880
4881 pc = lookup_page_cgroup(page);
4882
4883 if (mem_cgroup_move_parent(page, pc, memcg)) {
4884
4885 busy = page;
4886 cond_resched();
4887 } else
4888 busy = NULL;
4889 } while (!list_empty(list));
4890}
4891
4892
4893
4894
4895
4896
4897
4898
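/*
 * Move every charge in this cgroup up to its parent, repeating until the
 * non-kmem usage drops to zero.  Used when the cgroup is being emptied or
 * torn down.
 */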
4899static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4900{
4901 int node, zid;
4902 u64 usage;
4903
4904 do {
4905
4906 lru_add_drain_all();
4907 drain_all_stock_sync(memcg);
4908 mem_cgroup_start_move(memcg);
4909 for_each_node_state(node, N_MEMORY) {
4910 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4911 enum lru_list lru;
4912 for_each_lru(lru) {
4913 mem_cgroup_force_empty_list(memcg,
4914 node, zid, lru);
4915 }
4916 }
4917 }
4918 mem_cgroup_end_move(memcg);
4919 memcg_oom_recover(memcg);
4920 cond_resched();
4933
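		/*
		 * Kmem charges are not backed by LRU pages, so the loop above
		 * can never drain them; subtract them from the usage we are
		 * waiting on, or a cgroup with outstanding kernel memory
		 * would keep us spinning here forever.
		 */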
4934 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4935 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4936 } while (usage > 0);
4937}
4938
4939
4940
4941
4942
4943
4944static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4945{
4946 struct cgroup *pos;
4947
4948
4949 cgroup_for_each_child(pos, memcg->css.cgroup)
4950 return true;
4951 return false;
4952}
4953
4954
4955
4956
4957
4958
4959
4960
4961static inline bool memcg_has_children(struct mem_cgroup *memcg)
4962{
4963 return memcg->use_hierarchy && __memcg_has_children(memcg);
4964}
4965
4966
4967
4968
4969
4970
4971
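/*
 * Handler behind memory.force_empty: reclaim as many pages as possible and
 * move whatever remains to the parent cgroup.  Fails with -EBUSY while the
 * cgroup still has tasks or child cgroups.
 */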
4972static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4973{
4974 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4975 struct cgroup *cgrp = memcg->css.cgroup;
4976
4977
4978 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4979 return -EBUSY;
4980
4981
4982 lru_add_drain_all();
4983
4984 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4985 int progress;
4986
4987 if (signal_pending(current))
4988 return -EINTR;
4989
4990 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4991 false);
4992 if (!progress) {
4993 nr_retries--;
4994
4995 congestion_wait(BLK_RW_ASYNC, HZ/10);
4996 }
4997
4998 }
4999 lru_add_drain();
5000 mem_cgroup_reparent_charges(memcg);
5001
5002 return 0;
5003}
5004
5005static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
5006{
5007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5008 int ret;
5009
5010 if (mem_cgroup_is_root(memcg))
5011 return -EINVAL;
5012 css_get(&memcg->css);
5013 ret = mem_cgroup_force_empty(memcg);
5014 css_put(&memcg->css);
5015
5016 return ret;
5017}
5018
5019
5020static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
5021{
5022 return mem_cgroup_from_cont(cont)->use_hierarchy;
5023}
5024
5025static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
5026 u64 val)
5027{
5028 int retval = 0;
5029 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5030 struct cgroup *parent = cont->parent;
5031 struct mem_cgroup *parent_memcg = NULL;
5032
5033 if (parent)
5034 parent_memcg = mem_cgroup_from_cont(parent);
5035
5036 mutex_lock(&memcg_create_mutex);
5037
5038 if (memcg->use_hierarchy == val)
5039 goto out;
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
5050 (val == 1 || val == 0)) {
5051 if (!__memcg_has_children(memcg))
5052 memcg->use_hierarchy = val;
5053 else
5054 retval = -EBUSY;
5055 } else
5056 retval = -EINVAL;
5057
5058out:
5059 mutex_unlock(&memcg_create_mutex);
5060
5061 return retval;
5062}
5063
5064
5065static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
5066 enum mem_cgroup_stat_index idx)
5067{
5068 struct mem_cgroup *iter;
5069 long val = 0;
5070
5071
5072 for_each_mem_cgroup_tree(iter, memcg)
5073 val += mem_cgroup_read_stat(iter, idx);
5074
5075 if (val < 0)
5076 val = 0;
5077 return val;
5078}
5079
5080static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5081{
5082 u64 val;
5083
5084 if (!mem_cgroup_is_root(memcg)) {
5085 if (!swap)
5086 return res_counter_read_u64(&memcg->res, RES_USAGE);
5087 else
5088 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
5089 }
5090
5091
5092
5093
5094
5095 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
5096 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
5097
5098 if (swap)
5099 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
5100
5101 return val << PAGE_SHIFT;
5102}
5103
5104static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
5105 struct file *file, char __user *buf,
5106 size_t nbytes, loff_t *ppos)
5107{
5108 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5109 char str[64];
5110 u64 val;
5111 int name, len;
5112 enum res_type type;
5113
5114 type = MEMFILE_TYPE(cft->private);
5115 name = MEMFILE_ATTR(cft->private);
5116
5117 switch (type) {
5118 case _MEM:
5119 if (name == RES_USAGE)
5120 val = mem_cgroup_usage(memcg, false);
5121 else
5122 val = res_counter_read_u64(&memcg->res, name);
5123 break;
5124 case _MEMSWAP:
5125 if (name == RES_USAGE)
5126 val = mem_cgroup_usage(memcg, true);
5127 else
5128 val = res_counter_read_u64(&memcg->memsw, name);
5129 break;
5130 case _KMEM:
5131 val = res_counter_read_u64(&memcg->kmem, name);
5132 break;
5133 default:
5134 BUG();
5135 }
5136
5137 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
5138 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5139}
5140
5141static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
5142{
5143 int ret = -EINVAL;
5144#ifdef CONFIG_MEMCG_KMEM
5145 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5157
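	/*
	 * Kmem accounting is activated the first time a finite limit is set,
	 * and only while the cgroup has neither tasks nor children; otherwise
	 * allocations made before this point would be missing from
	 * kmem.usage_in_bytes.  Once active, accounting stays on and only the
	 * limit value can be changed.
	 */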
5158 mutex_lock(&memcg_create_mutex);
5159 mutex_lock(&set_limit_mutex);
5160 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
5161 if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
5162 ret = -EBUSY;
5163 goto out;
5164 }
5165 ret = res_counter_set_limit(&memcg->kmem, val);
5166 VM_BUG_ON(ret);
5167
5168 ret = memcg_update_cache_sizes(memcg);
5169 if (ret) {
5170 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
5171 goto out;
5172 }
5173 static_key_slow_inc(&memcg_kmem_enabled_key);
5174
5175
5176
5177
5178 memcg_kmem_set_active(memcg);
5179 } else
5180 ret = res_counter_set_limit(&memcg->kmem, val);
5181out:
5182 mutex_unlock(&set_limit_mutex);
5183 mutex_unlock(&memcg_create_mutex);
5184#endif
5185 return ret;
5186}
5187
5188#ifdef CONFIG_MEMCG_KMEM
5189static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5190{
5191 int ret = 0;
5192 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5193 if (!parent)
5194 goto out;
5195
5196 memcg->kmem_account_flags = parent->kmem_account_flags;
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207 if (!memcg_kmem_is_active(memcg))
5208 goto out;
5209
5210
5211
5212
5213
5214
5215 static_key_slow_inc(&memcg_kmem_enabled_key);
5216
5217 mutex_lock(&set_limit_mutex);
5218 memcg_stop_kmem_account();
5219 ret = memcg_update_cache_sizes(memcg);
5220 memcg_resume_kmem_account();
5221 mutex_unlock(&set_limit_mutex);
5222out:
5223 return ret;
5224}
5225#endif
5226
5227
5228
5229
5230
5231static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5232 const char *buffer)
5233{
5234 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5235 enum res_type type;
5236 int name;
5237 unsigned long long val;
5238 int ret;
5239
5240 type = MEMFILE_TYPE(cft->private);
5241 name = MEMFILE_ATTR(cft->private);
5242
5243 switch (name) {
5244 case RES_LIMIT:
5245 if (mem_cgroup_is_root(memcg)) {
5246 ret = -EINVAL;
5247 break;
5248 }
5249
5250 ret = res_counter_memparse_write_strategy(buffer, &val);
5251 if (ret)
5252 break;
5253 if (type == _MEM)
5254 ret = mem_cgroup_resize_limit(memcg, val);
5255 else if (type == _MEMSWAP)
5256 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5257 else if (type == _KMEM)
5258 ret = memcg_update_kmem_limit(cont, val);
5259 else
5260 return -EINVAL;
5261 break;
5262 case RES_SOFT_LIMIT:
5263 ret = res_counter_memparse_write_strategy(buffer, &val);
5264 if (ret)
5265 break;
5266
5267
5268
5269
5270
5271 if (type == _MEM)
5272 ret = res_counter_set_soft_limit(&memcg->res, val);
5273 else
5274 ret = -EINVAL;
5275 break;
5276 default:
5277 ret = -EINVAL;
5278 break;
5279 }
5280 return ret;
5281}
5282
5283static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5284 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5285{
5286 struct cgroup *cgroup;
5287 unsigned long long min_limit, min_memsw_limit, tmp;
5288
5289 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5290 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5291 cgroup = memcg->css.cgroup;
5292 if (!memcg->use_hierarchy)
5293 goto out;
5294
5295 while (cgroup->parent) {
5296 cgroup = cgroup->parent;
5297 memcg = mem_cgroup_from_cont(cgroup);
5298 if (!memcg->use_hierarchy)
5299 break;
5300 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5301 min_limit = min(min_limit, tmp);
5302 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5303 min_memsw_limit = min(min_memsw_limit, tmp);
5304 }
5305out:
5306 *mem_limit = min_limit;
5307 *memsw_limit = min_memsw_limit;
5308}
5309
5310static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5311{
5312 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5313 int name;
5314 enum res_type type;
5315
5316 type = MEMFILE_TYPE(event);
5317 name = MEMFILE_ATTR(event);
5318
5319 switch (name) {
5320 case RES_MAX_USAGE:
5321 if (type == _MEM)
5322 res_counter_reset_max(&memcg->res);
5323 else if (type == _MEMSWAP)
5324 res_counter_reset_max(&memcg->memsw);
5325 else if (type == _KMEM)
5326 res_counter_reset_max(&memcg->kmem);
5327 else
5328 return -EINVAL;
5329 break;
5330 case RES_FAILCNT:
5331 if (type == _MEM)
5332 res_counter_reset_failcnt(&memcg->res);
5333 else if (type == _MEMSWAP)
5334 res_counter_reset_failcnt(&memcg->memsw);
5335 else if (type == _KMEM)
5336 res_counter_reset_failcnt(&memcg->kmem);
5337 else
5338 return -EINVAL;
5339 break;
5340 }
5341
5342 return 0;
5343}
5344
5345static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
5346 struct cftype *cft)
5347{
5348 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
5349}
5350
5351#ifdef CONFIG_MMU
5352static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5353 struct cftype *cft, u64 val)
5354{
5355 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5356
5357 if (val >= (1 << NR_MOVE_TYPE))
5358 return -EINVAL;
5359
5360
5361
5362
5363
5364
5365
5366 memcg->move_charge_at_immigrate = val;
5367 return 0;
5368}
5369#else
5370static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5371 struct cftype *cft, u64 val)
5372{
5373 return -ENOSYS;
5374}
5375#endif
5376
5377#ifdef CONFIG_NUMA
5378static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5379 struct seq_file *m)
5380{
5381 int nid;
5382 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
5383 unsigned long node_nr;
5384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5385
5386 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
5387 seq_printf(m, "total=%lu", total_nr);
5388 for_each_node_state(nid, N_MEMORY) {
5389 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
5390 seq_printf(m, " N%d=%lu", nid, node_nr);
5391 }
5392 seq_putc(m, '\n');
5393
5394 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
5395 seq_printf(m, "file=%lu", file_nr);
5396 for_each_node_state(nid, N_MEMORY) {
5397 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5398 LRU_ALL_FILE);
5399 seq_printf(m, " N%d=%lu", nid, node_nr);
5400 }
5401 seq_putc(m, '\n');
5402
5403 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
5404 seq_printf(m, "anon=%lu", anon_nr);
5405 for_each_node_state(nid, N_MEMORY) {
5406 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5407 LRU_ALL_ANON);
5408 seq_printf(m, " N%d=%lu", nid, node_nr);
5409 }
5410 seq_putc(m, '\n');
5411
5412 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
5413 seq_printf(m, "unevictable=%lu", unevictable_nr);
5414 for_each_node_state(nid, N_MEMORY) {
5415 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5416 BIT(LRU_UNEVICTABLE));
5417 seq_printf(m, " N%d=%lu", nid, node_nr);
5418 }
5419 seq_putc(m, '\n');
5420 return 0;
5421}
5422#endif
5423
5424static inline void mem_cgroup_lru_names_not_uptodate(void)
5425{
5426 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5427}
5428
5429static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
5430 struct seq_file *m)
5431{
5432 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5433 struct mem_cgroup *mi;
5434 unsigned int i;
5435
5436 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5437 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5438 continue;
5439 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5440 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5441 }
5442
5443 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5444 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5445 mem_cgroup_read_events(memcg, i));
5446
5447 for (i = 0; i < NR_LRU_LISTS; i++)
5448 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5449 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5450
5451
5452 {
5453 unsigned long long limit, memsw_limit;
5454 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5455 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5456 if (do_swap_account)
5457 seq_printf(m, "hierarchical_memsw_limit %llu\n",
5458 memsw_limit);
5459 }
5460
5461 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5462 long long val = 0;
5463
5464 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5465 continue;
5466 for_each_mem_cgroup_tree(mi, memcg)
5467 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5468 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5469 }
5470
5471 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5472 unsigned long long val = 0;
5473
5474 for_each_mem_cgroup_tree(mi, memcg)
5475 val += mem_cgroup_read_events(mi, i);
5476 seq_printf(m, "total_%s %llu\n",
5477 mem_cgroup_events_names[i], val);
5478 }
5479
5480 for (i = 0; i < NR_LRU_LISTS; i++) {
5481 unsigned long long val = 0;
5482
5483 for_each_mem_cgroup_tree(mi, memcg)
5484 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5485 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5486 }
5487
5488#ifdef CONFIG_DEBUG_VM
5489 {
5490 int nid, zid;
5491 struct mem_cgroup_per_zone *mz;
5492 struct zone_reclaim_stat *rstat;
5493 unsigned long recent_rotated[2] = {0, 0};
5494 unsigned long recent_scanned[2] = {0, 0};
5495
5496 for_each_online_node(nid)
5497 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5498 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5499 rstat = &mz->lruvec.reclaim_stat;
5500
5501 recent_rotated[0] += rstat->recent_rotated[0];
5502 recent_rotated[1] += rstat->recent_rotated[1];
5503 recent_scanned[0] += rstat->recent_scanned[0];
5504 recent_scanned[1] += rstat->recent_scanned[1];
5505 }
5506 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5507 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5508 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5509 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5510 }
5511#endif
5512
5513 return 0;
5514}
5515
5516static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
5517{
5518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5519
5520 return mem_cgroup_swappiness(memcg);
5521}
5522
5523static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5524 u64 val)
5525{
5526 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5527 struct mem_cgroup *parent;
5528
5529 if (val > 100)
5530 return -EINVAL;
5531
5532 if (cgrp->parent == NULL)
5533 return -EINVAL;
5534
5535 parent = mem_cgroup_from_cont(cgrp->parent);
5536
5537 mutex_lock(&memcg_create_mutex);
5538
5539
5540 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5541 mutex_unlock(&memcg_create_mutex);
5542 return -EINVAL;
5543 }
5544
5545 memcg->swappiness = val;
5546
5547 mutex_unlock(&memcg_create_mutex);
5548
5549 return 0;
5550}
5551
5552static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5553{
5554 struct mem_cgroup_threshold_ary *t;
5555 u64 usage;
5556 int i;
5557
5558 rcu_read_lock();
5559 if (!swap)
5560 t = rcu_dereference(memcg->thresholds.primary);
5561 else
5562 t = rcu_dereference(memcg->memsw_thresholds.primary);
5563
5564 if (!t)
5565 goto unlock;
5566
5567 usage = mem_cgroup_usage(memcg, swap);
5568
	/*
	 * current_threshold points to the threshold just below or equal to
	 * the usage seen at the previous check.  If that no longer holds,
	 * one or more thresholds were crossed since then.
	 */
5574 i = t->current_threshold;
5575
	/*
	 * Walk backwards from current_threshold and signal every threshold
	 * that is now above the usage (i.e. was crossed downwards).  If
	 * nothing was crossed, only one array element is read here.
	 */
5582 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5583 eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
5586 i++;
5587
	/*
	 * Walk forwards from current_threshold + 1 and signal every
	 * threshold that is now at or below the usage (i.e. was crossed
	 * upwards).  Again, only one element is read if nothing changed.
	 */
5594 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5595 eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
5598 t->current_threshold = i - 1;
5599unlock:
5600 rcu_read_unlock();
5601}
5602
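/*
 * Check the thresholds of @memcg and of all of its ancestors.  The
 * memory+swap thresholds are only evaluated when swap accounting is
 * enabled.
 */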
5603static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5604{
5605 while (memcg) {
5606 __mem_cgroup_threshold(memcg, false);
5607 if (do_swap_account)
5608 __mem_cgroup_threshold(memcg, true);
5609
5610 memcg = parent_mem_cgroup(memcg);
5611 }
5612}
5613
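/* sort() comparator: order the threshold array ascending by threshold value */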
5614static int compare_thresholds(const void *a, const void *b)
5615{
5616 const struct mem_cgroup_threshold *_a = a;
5617 const struct mem_cgroup_threshold *_b = b;
5618
	/*
	 * Do not return _a->threshold - _b->threshold: the thresholds are
	 * u64 and the difference truncated to int can have the wrong sign
	 * for large values.
	 */
	if (_a->threshold > _b->threshold)
		return 1;
	if (_a->threshold < _b->threshold)
		return -1;
	return 0;
5620}
5621
5622static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5623{
5624 struct mem_cgroup_eventfd_list *ev;
5625
5626 list_for_each_entry(ev, &memcg->oom_notify, list)
5627 eventfd_signal(ev->eventfd, 1);
5628 return 0;
5629}
5630
5631static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5632{
5633 struct mem_cgroup *iter;
5634
5635 for_each_mem_cgroup_tree(iter, memcg)
5636 mem_cgroup_oom_notify_cb(iter);
5637}
5638
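/*
 * Register a usage threshold notification: parse the threshold from @args,
 * build a new sorted threshold array containing the old entries plus the
 * new one, compute current_threshold for the current usage and publish the
 * array via RCU.  Userspace typically arms this through cgroup.event_control
 * with an eventfd and memory.usage_in_bytes (or memory.memsw.usage_in_bytes).
 */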
5639static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
5640 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5641{
5642 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5643 struct mem_cgroup_thresholds *thresholds;
5644 struct mem_cgroup_threshold_ary *new;
5645 enum res_type type = MEMFILE_TYPE(cft->private);
5646 u64 threshold, usage;
5647 int i, size, ret;
5648
5649 ret = res_counter_memparse_write_strategy(args, &threshold);
5650 if (ret)
5651 return ret;
5652
5653 mutex_lock(&memcg->thresholds_lock);
5654
5655 if (type == _MEM)
5656 thresholds = &memcg->thresholds;
5657 else if (type == _MEMSWAP)
5658 thresholds = &memcg->memsw_thresholds;
5659 else
5660 BUG();
5661
5662 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5663
	/* Check if a threshold crossed before adding a new one */
5665 if (thresholds->primary)
5666 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5667
5668 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5669
5670
5671 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5672 GFP_KERNEL);
5673 if (!new) {
5674 ret = -ENOMEM;
5675 goto unlock;
5676 }
5677 new->size = size;
5678
5679
5680 if (thresholds->primary) {
5681 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5682 sizeof(struct mem_cgroup_threshold));
5683 }
5684
5685
5686 new->entries[size - 1].eventfd = eventfd;
5687 new->entries[size - 1].threshold = threshold;
5688
5689
5690 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5691 compare_thresholds, NULL);
5692
5693
5694 new->current_threshold = -1;
5695 for (i = 0; i < size; i++) {
5696 if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
5702 ++new->current_threshold;
5703 } else
5704 break;
5705 }
5706
	/* Free old spare buffer and save old primary buffer as spare */
5708 kfree(thresholds->spare);
5709 thresholds->spare = thresholds->primary;
5710
5711 rcu_assign_pointer(thresholds->primary, new);
5712
	/* Make sure no reader still uses the old primary array before it becomes the spare */
5714 synchronize_rcu();
5715
5716unlock:
5717 mutex_unlock(&memcg->thresholds_lock);
5718
5719 return ret;
5720}
5721
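/*
 * Unregister all usage threshold notifications attached to @eventfd:
 * rebuild the threshold array in the preallocated spare buffer without
 * the matching entries (or drop it entirely if none remain) and publish
 * it via RCU.
 */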
5722static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
5723 struct cftype *cft, struct eventfd_ctx *eventfd)
5724{
5725 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5726 struct mem_cgroup_thresholds *thresholds;
5727 struct mem_cgroup_threshold_ary *new;
5728 enum res_type type = MEMFILE_TYPE(cft->private);
5729 u64 usage;
5730 int i, j, size;
5731
5732 mutex_lock(&memcg->thresholds_lock);
5733 if (type == _MEM)
5734 thresholds = &memcg->thresholds;
5735 else if (type == _MEMSWAP)
5736 thresholds = &memcg->memsw_thresholds;
5737 else
5738 BUG();
5739
5740 if (!thresholds->primary)
5741 goto unlock;
5742
5743 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5744
	/* Check if a threshold crossed before removing */
5746 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5747
5748
5749 size = 0;
5750 for (i = 0; i < thresholds->primary->size; i++) {
5751 if (thresholds->primary->entries[i].eventfd != eventfd)
5752 size++;
5753 }
5754
5755 new = thresholds->spare;
5756
	/* Set the thresholds array to NULL if no thresholds remain */
5758 if (!size) {
5759 kfree(new);
5760 new = NULL;
5761 goto swap_buffers;
5762 }
5763
5764 new->size = size;
5765
	/* Copy remaining thresholds and find the current threshold */
5767 new->current_threshold = -1;
5768 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5769 if (thresholds->primary->entries[i].eventfd == eventfd)
5770 continue;
5771
5772 new->entries[j] = thresholds->primary->entries[i];
5773 if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
5779 ++new->current_threshold;
5780 }
5781 j++;
5782 }
5783
5784swap_buffers:
	/* Swap primary and spare array */
5786 thresholds->spare = thresholds->primary;
	/* If all events are unregistered, free the spare array */
5788 if (!new) {
5789 kfree(thresholds->spare);
5790 thresholds->spare = NULL;
5791 }
5792
5793 rcu_assign_pointer(thresholds->primary, new);
5794
	/* Make sure no reader still uses the old primary array */
5796 synchronize_rcu();
5797unlock:
5798 mutex_unlock(&memcg->thresholds_lock);
5799}
5800
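/*
 * Register an eventfd for memory.oom_control notifications.  The eventfd
 * is signalled when the group comes under OOM; if it already is under OOM
 * it is signalled right away.
 */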
5801static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
5802 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5803{
5804 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5805 struct mem_cgroup_eventfd_list *event;
5806 enum res_type type = MEMFILE_TYPE(cft->private);
5807
5808 BUG_ON(type != _OOM_TYPE);
5809 event = kmalloc(sizeof(*event), GFP_KERNEL);
5810 if (!event)
5811 return -ENOMEM;
5812
5813 spin_lock(&memcg_oom_lock);
5814
5815 event->eventfd = eventfd;
5816 list_add(&event->list, &memcg->oom_notify);
5817
	/* already in OOM ? */
5819 if (atomic_read(&memcg->under_oom))
5820 eventfd_signal(eventfd, 1);
5821 spin_unlock(&memcg_oom_lock);
5822
5823 return 0;
5824}
5825
5826static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
5827 struct cftype *cft, struct eventfd_ctx *eventfd)
5828{
5829 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5830 struct mem_cgroup_eventfd_list *ev, *tmp;
5831 enum res_type type = MEMFILE_TYPE(cft->private);
5832
5833 BUG_ON(type != _OOM_TYPE);
5834
5835 spin_lock(&memcg_oom_lock);
5836
5837 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5838 if (ev->eventfd == eventfd) {
5839 list_del(&ev->list);
5840 kfree(ev);
5841 }
5842 }
5843
5844 spin_unlock(&memcg_oom_lock);
5845}
5846
5847static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
5848 struct cftype *cft, struct cgroup_map_cb *cb)
5849{
5850 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5851
5852 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5853
5854 if (atomic_read(&memcg->under_oom))
5855 cb->fill(cb, "under_oom", 1);
5856 else
5857 cb->fill(cb, "under_oom", 0);
5858 return 0;
5859}
5860
5861static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5862 struct cftype *cft, u64 val)
5863{
5864 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5865 struct mem_cgroup *parent;
5866
	/* cannot be set on the root cgroup and only 0 and 1 are allowed */
5868 if (!cgrp->parent || !((val == 0) || (val == 1)))
5869 return -EINVAL;
5870
5871 parent = mem_cgroup_from_cont(cgrp->parent);
5872
5873 mutex_lock(&memcg_create_mutex);
5874
5875 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5876 mutex_unlock(&memcg_create_mutex);
5877 return -EINVAL;
5878 }
5879 memcg->oom_kill_disable = val;
5880 if (!val)
5881 memcg_oom_recover(memcg);
5882 mutex_unlock(&memcg_create_mutex);
5883 return 0;
5884}
5885
5886#ifdef CONFIG_MEMCG_KMEM
5887static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5888{
5889 int ret;
5890
5891 memcg->kmemcg_id = -1;
5892 ret = memcg_propagate_kmem(memcg);
5893 if (ret)
5894 return ret;
5895
5896 return mem_cgroup_sockets_init(memcg, ss);
5897}
5898
5899static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5900{
5901 mem_cgroup_sockets_destroy(memcg);
5902}
5903
5904static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5905{
5906 if (!memcg_kmem_is_active(memcg))
5907 return;
5908
	/*
	 * kmem charges can outlive the cgroup.  Pin the css and mark the
	 * memcg as dead: the reference is dropped either right below, if
	 * kmem usage has already reached zero, or later when the last kmem
	 * charge is uncharged (memcg_kmem_test_and_clear_dead()).
	 */
5927 css_get(&memcg->css);
5928
5929 memcg_kmem_mark_dead(memcg);
5930
5931 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5932 return;
5933
5934 if (memcg_kmem_test_and_clear_dead(memcg))
5935 css_put(&memcg->css);
5936}
5937#else
5938static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5939{
5940 return 0;
5941}
5942
5943static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5944{
5945}
5946
5947static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5948{
5949}
5950#endif
5951
5952static struct cftype mem_cgroup_files[] = {
5953 {
5954 .name = "usage_in_bytes",
5955 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5956 .read = mem_cgroup_read,
5957 .register_event = mem_cgroup_usage_register_event,
5958 .unregister_event = mem_cgroup_usage_unregister_event,
5959 },
5960 {
5961 .name = "max_usage_in_bytes",
5962 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5963 .trigger = mem_cgroup_reset,
5964 .read = mem_cgroup_read,
5965 },
5966 {
5967 .name = "limit_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5969 .write_string = mem_cgroup_write,
5970 .read = mem_cgroup_read,
5971 },
5972 {
5973 .name = "soft_limit_in_bytes",
5974 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5975 .write_string = mem_cgroup_write,
5976 .read = mem_cgroup_read,
5977 },
5978 {
5979 .name = "failcnt",
5980 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5981 .trigger = mem_cgroup_reset,
5982 .read = mem_cgroup_read,
5983 },
5984 {
5985 .name = "stat",
5986 .read_seq_string = memcg_stat_show,
5987 },
5988 {
5989 .name = "force_empty",
5990 .trigger = mem_cgroup_force_empty_write,
5991 },
5992 {
5993 .name = "use_hierarchy",
5994 .flags = CFTYPE_INSANE,
5995 .write_u64 = mem_cgroup_hierarchy_write,
5996 .read_u64 = mem_cgroup_hierarchy_read,
5997 },
5998 {
5999 .name = "swappiness",
6000 .read_u64 = mem_cgroup_swappiness_read,
6001 .write_u64 = mem_cgroup_swappiness_write,
6002 },
6003 {
6004 .name = "move_charge_at_immigrate",
6005 .read_u64 = mem_cgroup_move_charge_read,
6006 .write_u64 = mem_cgroup_move_charge_write,
6007 },
6008 {
6009 .name = "oom_control",
6010 .read_map = mem_cgroup_oom_control_read,
6011 .write_u64 = mem_cgroup_oom_control_write,
6012 .register_event = mem_cgroup_oom_register_event,
6013 .unregister_event = mem_cgroup_oom_unregister_event,
6014 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6015 },
6016 {
6017 .name = "pressure_level",
6018 .register_event = vmpressure_register_event,
6019 .unregister_event = vmpressure_unregister_event,
6020 },
6021#ifdef CONFIG_NUMA
6022 {
6023 .name = "numa_stat",
6024 .read_seq_string = memcg_numa_stat_show,
6025 },
6026#endif
6027#ifdef CONFIG_MEMCG_KMEM
6028 {
6029 .name = "kmem.limit_in_bytes",
6030 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6031 .write_string = mem_cgroup_write,
6032 .read = mem_cgroup_read,
6033 },
6034 {
6035 .name = "kmem.usage_in_bytes",
6036 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6037 .read = mem_cgroup_read,
6038 },
6039 {
6040 .name = "kmem.failcnt",
6041 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6042 .trigger = mem_cgroup_reset,
6043 .read = mem_cgroup_read,
6044 },
6045 {
6046 .name = "kmem.max_usage_in_bytes",
6047 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6048 .trigger = mem_cgroup_reset,
6049 .read = mem_cgroup_read,
6050 },
6051#ifdef CONFIG_SLABINFO
6052 {
6053 .name = "kmem.slabinfo",
6054 .read_seq_string = mem_cgroup_slabinfo_read,
6055 },
6056#endif
6057#endif
6058 { },
6059};
6060
6061#ifdef CONFIG_MEMCG_SWAP
6062static struct cftype memsw_cgroup_files[] = {
6063 {
6064 .name = "memsw.usage_in_bytes",
6065 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6066 .read = mem_cgroup_read,
6067 .register_event = mem_cgroup_usage_register_event,
6068 .unregister_event = mem_cgroup_usage_unregister_event,
6069 },
6070 {
6071 .name = "memsw.max_usage_in_bytes",
6072 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6073 .trigger = mem_cgroup_reset,
6074 .read = mem_cgroup_read,
6075 },
6076 {
6077 .name = "memsw.limit_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6079 .write_string = mem_cgroup_write,
6080 .read = mem_cgroup_read,
6081 },
6082 {
6083 .name = "memsw.failcnt",
6084 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6085 .trigger = mem_cgroup_reset,
6086 .read = mem_cgroup_read,
6087 },
6088 { },
6089};
6090#endif
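
/*
 * Allocate and initialize the per-node/per-zone lruvec and soft-limit tree
 * state for @memcg on @node.  Returns 0 on success, 1 on allocation failure.
 */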
6091static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6092{
6093 struct mem_cgroup_per_node *pn;
6094 struct mem_cgroup_per_zone *mz;
6095 int zone, tmp = node;
6096
	/*
	 * This routine is called against possible nodes.  Fall back to an
	 * unspecified node (tmp == -1) when the node has no normal memory,
	 * because kzalloc_node() against such a node is not valid.
	 */
6104 if (!node_state(node, N_NORMAL_MEMORY))
6105 tmp = -1;
6106 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6107 if (!pn)
6108 return 1;
6109
6110 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6111 mz = &pn->zoneinfo[zone];
6112 lruvec_init(&mz->lruvec);
6113 mz->usage_in_excess = 0;
6114 mz->on_tree = false;
6115 mz->memcg = memcg;
6116 }
6117 memcg->nodeinfo[node] = pn;
6118 return 0;
6119}
6120
6121static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6122{
6123 kfree(memcg->nodeinfo[node]);
6124}
6125
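/*
 * Allocate the mem_cgroup structure itself plus its per-cpu statistics.
 * The structure size depends on nr_node_ids, so fall back to vzalloc()
 * when it no longer fits into a single page.
 */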
6126static struct mem_cgroup *mem_cgroup_alloc(void)
6127{
6128 struct mem_cgroup *memcg;
6129 size_t size = memcg_size();
6130
	/* Can be very big if nr_node_ids is very big */
6132 if (size < PAGE_SIZE)
6133 memcg = kzalloc(size, GFP_KERNEL);
6134 else
6135 memcg = vzalloc(size);
6136
6137 if (!memcg)
6138 return NULL;
6139
6140 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6141 if (!memcg->stat)
6142 goto out_free;
6143 spin_lock_init(&memcg->pcp_counter_lock);
6144 return memcg;
6145
6146out_free:
6147 if (size < PAGE_SIZE)
6148 kfree(memcg);
6149 else
6150 vfree(memcg);
6151 return NULL;
6152}
6153
/*
 * Free everything that was allocated for @memcg: the soft-limit tree
 * entries, the css id, the per-node zone info and the per-cpu statistics.
 * References from swap_cgroup may still point at this memcg, so this must
 * only run once the last such reference is gone.
 */
6165static void __mem_cgroup_free(struct mem_cgroup *memcg)
6166{
6167 int node;
6168 size_t size = memcg_size();
6169
6170 mem_cgroup_remove_from_trees(memcg);
6171 free_css_id(&mem_cgroup_subsys, &memcg->css);
6172
6173 for_each_node(node)
6174 free_mem_cgroup_per_zone_info(memcg, node);
6175
6176 free_percpu(memcg->stat);
6177
	/*
	 * Disarm the static keys that were armed for this memcg (kmem and
	 * tcp memory accounting) before the structure is freed.  This has
	 * to happen late, once nobody can take new references to the memcg.
	 */
6189 disarm_static_keys(memcg);
6190 if (size < PAGE_SIZE)
6191 kfree(memcg);
6192 else
6193 vfree(memcg);
6194}
6195
6196
/*
 * Returns the parent mem_cgroup in the memcg hierarchy with hierarchy enabled.
 */
6199struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6200{
6201 if (!memcg->res.parent)
6202 return NULL;
6203 return mem_cgroup_from_res_counter(memcg->res.parent, res);
6204}
6205EXPORT_SYMBOL(parent_mem_cgroup);
6206
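/*
 * Allocate the per-node/per-zone soft limit rb-trees.  Called once at boot;
 * failure here is fatal (BUG_ON) because the memory controller cannot work
 * without them.
 */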
6207static void __init mem_cgroup_soft_limit_tree_init(void)
6208{
6209 struct mem_cgroup_tree_per_node *rtpn;
6210 struct mem_cgroup_tree_per_zone *rtpz;
6211 int tmp, node, zone;
6212
6213 for_each_node(node) {
6214 tmp = node;
6215 if (!node_state(node, N_NORMAL_MEMORY))
6216 tmp = -1;
6217 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6218 BUG_ON(!rtpn);
6219
6220 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6221
6222 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6223 rtpz = &rtpn->rb_tree_per_zone[zone];
6224 rtpz->rb_root = RB_ROOT;
6225 spin_lock_init(&rtpz->lock);
6226 }
6227 }
6228}
6229
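/*
 * css_alloc: allocate a new mem_cgroup and its per-node state.  The root
 * cgroup's res_counters are initialised here; non-root counters are set up
 * later in css_online() once the parent's use_hierarchy setting is known.
 */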
6230static struct cgroup_subsys_state * __ref
6231mem_cgroup_css_alloc(struct cgroup *cont)
6232{
6233 struct mem_cgroup *memcg;
6234 long error = -ENOMEM;
6235 int node;
6236
6237 memcg = mem_cgroup_alloc();
6238 if (!memcg)
6239 return ERR_PTR(error);
6240
6241 for_each_node(node)
6242 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6243 goto free_out;
6244
	/* root ? */
6246 if (cont->parent == NULL) {
6247 root_mem_cgroup = memcg;
6248 res_counter_init(&memcg->res, NULL);
6249 res_counter_init(&memcg->memsw, NULL);
6250 res_counter_init(&memcg->kmem, NULL);
6251 }
6252
6253 memcg->last_scanned_node = MAX_NUMNODES;
6254 INIT_LIST_HEAD(&memcg->oom_notify);
6255 memcg->move_charge_at_immigrate = 0;
6256 mutex_init(&memcg->thresholds_lock);
6257 spin_lock_init(&memcg->move_lock);
6258 vmpressure_init(&memcg->vmpressure);
6259
6260 return &memcg->css;
6261
6262free_out:
6263 __mem_cgroup_free(memcg);
6264 return ERR_PTR(error);
6265}
6266
6267static int
6268mem_cgroup_css_online(struct cgroup *cont)
6269{
6270 struct mem_cgroup *memcg, *parent;
6271 int error = 0;
6272
6273 if (!cont->parent)
6274 return 0;
6275
6276 mutex_lock(&memcg_create_mutex);
6277 memcg = mem_cgroup_from_cont(cont);
6278 parent = mem_cgroup_from_cont(cont->parent);
6279
6280 memcg->use_hierarchy = parent->use_hierarchy;
6281 memcg->oom_kill_disable = parent->oom_kill_disable;
6282 memcg->swappiness = mem_cgroup_swappiness(parent);
6283
6284 if (parent->use_hierarchy) {
6285 res_counter_init(&memcg->res, &parent->res);
6286 res_counter_init(&memcg->memsw, &parent->memsw);
6287 res_counter_init(&memcg->kmem, &parent->kmem);
6288
		/*
		 * No need to take a reference to the parent because cgroup
		 * core guarantees its existence.
		 */
6293 } else {
6294 res_counter_init(&memcg->res, NULL);
6295 res_counter_init(&memcg->memsw, NULL);
6296 res_counter_init(&memcg->kmem, NULL);
6297
		/*
		 * Deeper hierarchies with use_hierarchy == false don't make
		 * much sense, so flag the unexpected state to cgroup core
		 * via broken_hierarchy below.
		 */
6302 if (parent != root_mem_cgroup)
6303 mem_cgroup_subsys.broken_hierarchy = true;
6304 }
6305
6306 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6307 mutex_unlock(&memcg_create_mutex);
6308 return error;
6309}
6310
/*
 * Announce to all ancestors that a group from their hierarchy is going
 * away, so that their cached reclaim iterators are invalidated.
 */
6314static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6315{
6316 struct mem_cgroup *parent = memcg;
6317
6318 while ((parent = parent_mem_cgroup(parent)))
6319 mem_cgroup_iter_invalidate(parent);
6320
	/*
	 * If the root memcg is not hierarchical it is not reached by the
	 * loop above, so invalidate it explicitly.
	 */
6325 if (!root_mem_cgroup->use_hierarchy)
6326 mem_cgroup_iter_invalidate(root_mem_cgroup);
6327}
6328
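/*
 * css_offline: the cgroup is going away.  Reparent all remaining charges,
 * drop the per-memcg kmem caches and invalidate reclaim iterators that may
 * still point at this memcg.
 */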
6329static void mem_cgroup_css_offline(struct cgroup *cont)
6330{
6331 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6332
6333 kmem_cgroup_css_offline(memcg);
6334
6335 mem_cgroup_invalidate_reclaim_iterators(memcg);
6336 mem_cgroup_reparent_charges(memcg);
6337 mem_cgroup_destroy_all_caches(memcg);
6338 vmpressure_cleanup(&memcg->vmpressure);
6339}
6340
6341static void mem_cgroup_css_free(struct cgroup *cont)
6342{
6343 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6344
6345 memcg_destroy_kmem(memcg);
6346 __mem_cgroup_free(memcg);
6347}
6348
6349#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
6351#define PRECHARGE_COUNT_AT_ONCE 256
6352static int mem_cgroup_do_precharge(unsigned long count)
6353{
6354 int ret = 0;
6355 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6356 struct mem_cgroup *memcg = mc.to;
6357
6358 if (mem_cgroup_is_root(memcg)) {
6359 mc.precharge += count;
6360
6361 return ret;
6362 }
6363
6364 if (count > 1) {
6365 struct res_counter *dummy;
6366
		/*
		 * Try to charge the whole batch at once; if either charge
		 * fails, undo and fall back to charging one page at a time.
		 */
6372 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6373 goto one_by_one;
6374 if (do_swap_account && res_counter_charge(&memcg->memsw,
6375 PAGE_SIZE * count, &dummy)) {
6376 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6377 goto one_by_one;
6378 }
6379 mc.precharge += count;
6380 return ret;
6381 }
6382one_by_one:
6383
6384 while (count--) {
6385 if (signal_pending(current)) {
6386 ret = -EINTR;
6387 break;
6388 }
6389 if (!batch_count--) {
6390 batch_count = PRECHARGE_COUNT_AT_ONCE;
6391 cond_resched();
6392 }
6393 ret = __mem_cgroup_try_charge(NULL,
6394 GFP_KERNEL, 1, &memcg, false);
6395 if (ret)
6396
6397 return ret;
6398 mc.precharge++;
6399 }
6400 return ret;
6401}
6402
/*
 * get_mctgt_type - decide whether a pte is a target for move charge
 *
 * Returns
 *   MC_TARGET_NONE: the pte is not a target for move charge.
 *   MC_TARGET_PAGE: the page corresponding to the pte is a target.  If
 *     @target is not NULL the page is stored in target->page with an extra
 *     reference taken (callers must drop it).
 *   MC_TARGET_SWAP: the swap entry corresponding to the pte is a target.
 *     If @target is not NULL the entry is stored in target->ent.
 *
 * Called with the pte lock held.
 */
6421union mc_target {
6422 struct page *page;
6423 swp_entry_t ent;
6424};
6425
6426enum mc_target_type {
6427 MC_TARGET_NONE = 0,
6428 MC_TARGET_PAGE,
6429 MC_TARGET_SWAP,
6430};
6431
6432static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6433 unsigned long addr, pte_t ptent)
6434{
6435 struct page *page = vm_normal_page(vma, addr, ptent);
6436
6437 if (!page || !page_mapped(page))
6438 return NULL;
6439 if (PageAnon(page)) {
6440
6441 if (!move_anon())
6442 return NULL;
6443 } else if (!move_file())
6444
6445 return NULL;
6446 if (!get_page_unless_zero(page))
6447 return NULL;
6448
6449 return page;
6450}
6451
6452#ifdef CONFIG_SWAP
6453static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6454 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6455{
6456 struct page *page = NULL;
6457 swp_entry_t ent = pte_to_swp_entry(ptent);
6458
6459 if (!move_anon() || non_swap_entry(ent))
6460 return NULL;
	/*
	 * Because lookup_swap_cache() updates some statistics counters,
	 * call find_get_page() on the swap address space directly.
	 */
6465 page = find_get_page(swap_address_space(ent), ent.val);
6466 if (do_swap_account)
6467 entry->val = ent.val;
6468
6469 return page;
6470}
6471#else
6472static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6473 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6474{
6475 return NULL;
6476}
6477#endif
6478
6479static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6480 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6481{
6482 struct page *page = NULL;
6483 struct address_space *mapping;
6484 pgoff_t pgoff;
6485
6486 if (!vma->vm_file)
6487 return NULL;
6488 if (!move_file())
6489 return NULL;
6490
6491 mapping = vma->vm_file->f_mapping;
6492 if (pte_none(ptent))
6493 pgoff = linear_page_index(vma, addr);
6494 else
6495 pgoff = pte_to_pgoff(ptent);
6496
6497
6498 page = find_get_page(mapping, pgoff);
6499
6500#ifdef CONFIG_SWAP
6501
6502 if (radix_tree_exceptional_entry(page)) {
6503 swp_entry_t swap = radix_to_swp_entry(page);
6504 if (do_swap_account)
6505 *entry = swap;
6506 page = find_get_page(swap_address_space(swap), swap.val);
6507 }
6508#endif
6509 return page;
6510}
6511
6512static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6513 unsigned long addr, pte_t ptent, union mc_target *target)
6514{
6515 struct page *page = NULL;
6516 struct page_cgroup *pc;
6517 enum mc_target_type ret = MC_TARGET_NONE;
6518 swp_entry_t ent = { .val = 0 };
6519
6520 if (pte_present(ptent))
6521 page = mc_handle_present_pte(vma, addr, ptent);
6522 else if (is_swap_pte(ptent))
6523 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6524 else if (pte_none(ptent) || pte_file(ptent))
6525 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6526
6527 if (!page && !ent.val)
6528 return ret;
6529 if (page) {
6530 pc = lookup_page_cgroup(page);
		/*
		 * Do only a loose check without taking the page_cgroup lock;
		 * mem_cgroup_move_account() re-checks the pc under the lock.
		 */
6536 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6537 ret = MC_TARGET_PAGE;
6538 if (target)
6539 target->page = page;
6540 }
6541 if (!ret || !target)
6542 put_page(page);
6543 }
6544
6545 if (ent.val && !ret &&
6546 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
6547 ret = MC_TARGET_SWAP;
6548 if (target)
6549 target->ent = ent;
6550 }
6551 return ret;
6552}
6553
6554#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Swapped-out and file-mapped pages are not considered here because THP
 * does not support them for now.  The caller must ensure that the pmd is
 * trans-huge.
 */
6560static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6561 unsigned long addr, pmd_t pmd, union mc_target *target)
6562{
6563 struct page *page = NULL;
6564 struct page_cgroup *pc;
6565 enum mc_target_type ret = MC_TARGET_NONE;
6566
6567 page = pmd_page(pmd);
6568 VM_BUG_ON(!page || !PageHead(page));
6569 if (!move_anon())
6570 return ret;
6571 pc = lookup_page_cgroup(page);
6572 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6573 ret = MC_TARGET_PAGE;
6574 if (target) {
6575 get_page(page);
6576 target->page = page;
6577 }
6578 }
6579 return ret;
6580}
6581#else
6582static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6583 unsigned long addr, pmd_t pmd, union mc_target *target)
6584{
6585 return MC_TARGET_NONE;
6586}
6587#endif
6588
6589static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6590 unsigned long addr, unsigned long end,
6591 struct mm_walk *walk)
6592{
6593 struct vm_area_struct *vma = walk->private;
6594 pte_t *pte;
6595 spinlock_t *ptl;
6596
6597 if (pmd_trans_huge_lock(pmd, vma) == 1) {
6598 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6599 mc.precharge += HPAGE_PMD_NR;
6600 spin_unlock(&vma->vm_mm->page_table_lock);
6601 return 0;
6602 }
6603
6604 if (pmd_trans_unstable(pmd))
6605 return 0;
6606 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6607 for (; addr != end; pte++, addr += PAGE_SIZE)
6608 if (get_mctgt_type(vma, addr, *pte, NULL))
6609 mc.precharge++;
6610 pte_unmap_unlock(pte - 1, ptl);
6611 cond_resched();
6612
6613 return 0;
6614}
6615
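/*
 * Walk all VMAs of @mm and count how many pages would be moved to the
 * target memcg, so the charges can be precharged in one go.
 */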
6616static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6617{
6618 unsigned long precharge;
6619 struct vm_area_struct *vma;
6620
6621 down_read(&mm->mmap_sem);
6622 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6623 struct mm_walk mem_cgroup_count_precharge_walk = {
6624 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6625 .mm = mm,
6626 .private = vma,
6627 };
6628 if (is_vm_hugetlb_page(vma))
6629 continue;
6630 walk_page_range(vma->vm_start, vma->vm_end,
6631 &mem_cgroup_count_precharge_walk);
6632 }
6633 up_read(&mm->mmap_sem);
6634
6635 precharge = mc.precharge;
6636 mc.precharge = 0;
6637
6638 return precharge;
6639}
6640
6641static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6642{
6643 unsigned long precharge = mem_cgroup_count_precharge(mm);
6644
6645 VM_BUG_ON(mc.moving_task);
6646 mc.moving_task = current;
6647 return mem_cgroup_do_precharge(precharge);
6648}
6649
6650
6651static void __mem_cgroup_clear_mc(void)
6652{
6653 struct mem_cgroup *from = mc.from;
6654 struct mem_cgroup *to = mc.to;
6655 int i;
6656
	/* we must uncharge all the leftover precharges from mc.to */
6658 if (mc.precharge) {
6659 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6660 mc.precharge = 0;
6661 }
6662
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
6666 if (mc.moved_charge) {
6667 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6668 mc.moved_charge = 0;
6669 }
	/* we must fixup refcnts and charges */
6671 if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
6673 if (!mem_cgroup_is_root(mc.from))
6674 res_counter_uncharge(&mc.from->memsw,
6675 PAGE_SIZE * mc.moved_swap);
6676
6677 for (i = 0; i < mc.moved_swap; i++)
6678 css_put(&mc.from->css);
6679
6680 if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
6685 res_counter_uncharge(&mc.to->res,
6686 PAGE_SIZE * mc.moved_swap);
6687 }
6688
6689 mc.moved_swap = 0;
6690 }
6691 memcg_oom_recover(from);
6692 memcg_oom_recover(to);
6693 wake_up_all(&mc.waitq);
6694}
6695
6696static void mem_cgroup_clear_mc(void)
6697{
6698 struct mem_cgroup *from = mc.from;
6699
	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
6704 mc.moving_task = NULL;
6705 __mem_cgroup_clear_mc();
6706 spin_lock(&mc.lock);
6707 mc.from = NULL;
6708 mc.to = NULL;
6709 spin_unlock(&mc.lock);
6710 mem_cgroup_end_move(from);
6711}
6712
6713static int mem_cgroup_can_attach(struct cgroup *cgroup,
6714 struct cgroup_taskset *tset)
6715{
6716 struct task_struct *p = cgroup_taskset_first(tset);
6717 int ret = 0;
6718 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
6719 unsigned long move_charge_at_immigrate;
6720
	/*
	 * Read move_charge_at_immigrate once: we are committed to whatever
	 * value it has now, and later writes to the knob do not affect this
	 * migration.
	 */
6726 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6727 if (move_charge_at_immigrate) {
6728 struct mm_struct *mm;
6729 struct mem_cgroup *from = mem_cgroup_from_task(p);
6730
6731 VM_BUG_ON(from == memcg);
6732
6733 mm = get_task_mm(p);
6734 if (!mm)
6735 return 0;
6736
6737 if (mm->owner == p) {
6738 VM_BUG_ON(mc.from);
6739 VM_BUG_ON(mc.to);
6740 VM_BUG_ON(mc.precharge);
6741 VM_BUG_ON(mc.moved_charge);
6742 VM_BUG_ON(mc.moved_swap);
6743 mem_cgroup_start_move(from);
6744 spin_lock(&mc.lock);
6745 mc.from = from;
6746 mc.to = memcg;
6747 mc.immigrate_flags = move_charge_at_immigrate;
6748 spin_unlock(&mc.lock);
6749
6750
6751 ret = mem_cgroup_precharge_mc(mm);
6752 if (ret)
6753 mem_cgroup_clear_mc();
6754 }
6755 mmput(mm);
6756 }
6757 return ret;
6758}
6759
6760static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
6761 struct cgroup_taskset *tset)
6762{
6763 mem_cgroup_clear_mc();
6764}
6765
6766static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6767 unsigned long addr, unsigned long end,
6768 struct mm_walk *walk)
6769{
6770 int ret = 0;
6771 struct vm_area_struct *vma = walk->private;
6772 pte_t *pte;
6773 spinlock_t *ptl;
6774 enum mc_target_type target_type;
6775 union mc_target target;
6776 struct page *page;
6777 struct page_cgroup *pc;
6778
	/*
	 * No compound_lock is needed against THP splitting here: if
	 * pmd_trans_huge_lock() returns 1 the huge page is not under
	 * splitting, and a split that starts afterwards has to wait for the
	 * page table lock we hold before it can make progress.
	 */
6789 if (pmd_trans_huge_lock(pmd, vma) == 1) {
6790 if (mc.precharge < HPAGE_PMD_NR) {
6791 spin_unlock(&vma->vm_mm->page_table_lock);
6792 return 0;
6793 }
6794 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6795 if (target_type == MC_TARGET_PAGE) {
6796 page = target.page;
6797 if (!isolate_lru_page(page)) {
6798 pc = lookup_page_cgroup(page);
6799 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6800 pc, mc.from, mc.to)) {
6801 mc.precharge -= HPAGE_PMD_NR;
6802 mc.moved_charge += HPAGE_PMD_NR;
6803 }
6804 putback_lru_page(page);
6805 }
6806 put_page(page);
6807 }
6808 spin_unlock(&vma->vm_mm->page_table_lock);
6809 return 0;
6810 }
6811
6812 if (pmd_trans_unstable(pmd))
6813 return 0;
6814retry:
6815 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6816 for (; addr != end; addr += PAGE_SIZE) {
6817 pte_t ptent = *(pte++);
6818 swp_entry_t ent;
6819
6820 if (!mc.precharge)
6821 break;
6822
6823 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6824 case MC_TARGET_PAGE:
6825 page = target.page;
6826 if (isolate_lru_page(page))
6827 goto put;
6828 pc = lookup_page_cgroup(page);
6829 if (!mem_cgroup_move_account(page, 1, pc,
6830 mc.from, mc.to)) {
6831 mc.precharge--;
6832
6833 mc.moved_charge++;
6834 }
6835 putback_lru_page(page);
6836put:
6837 put_page(page);
6838 break;
6839 case MC_TARGET_SWAP:
6840 ent = target.ent;
6841 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6842 mc.precharge--;
6843
6844 mc.moved_swap++;
6845 }
6846 break;
6847 default:
6848 break;
6849 }
6850 }
6851 pte_unmap_unlock(pte - 1, ptl);
6852 cond_resched();
6853
6854 if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
6861 ret = mem_cgroup_do_precharge(1);
6862 if (!ret)
6863 goto retry;
6864 }
6865
6866 return ret;
6867}
6868
6869static void mem_cgroup_move_charge(struct mm_struct *mm)
6870{
6871 struct vm_area_struct *vma;
6872
6873 lru_add_drain_all();
6874retry:
6875 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem might be waiting in
		 * waitq.  So we cancel all extra charges, wake up all waiters,
		 * and retry.  Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
6883 __mem_cgroup_clear_mc();
6884 cond_resched();
6885 goto retry;
6886 }
6887 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6888 int ret;
6889 struct mm_walk mem_cgroup_move_charge_walk = {
6890 .pmd_entry = mem_cgroup_move_charge_pte_range,
6891 .mm = mm,
6892 .private = vma,
6893 };
6894 if (is_vm_hugetlb_page(vma))
6895 continue;
6896 ret = walk_page_range(vma->vm_start, vma->vm_end,
6897 &mem_cgroup_move_charge_walk);
6898 if (ret)
			/*
			 * means we have consumed all precharges and failed in
			 * doing additional charge.  Just abandon here.
			 */
6903 break;
6904 }
6905 up_read(&mm->mmap_sem);
6906}
6907
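/*
 * attach callback: the target mm was set up in can_attach(); walk its page
 * tables and actually move the charges, then tear down the move state.
 */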
6908static void mem_cgroup_move_task(struct cgroup *cont,
6909 struct cgroup_taskset *tset)
6910{
6911 struct task_struct *p = cgroup_taskset_first(tset);
6912 struct mm_struct *mm = get_task_mm(p);
6913
6914 if (mm) {
6915 if (mc.to)
6916 mem_cgroup_move_charge(mm);
6917 mmput(mm);
6918 }
6919 if (mc.to)
6920 mem_cgroup_clear_mc();
6921}
6922#else
6923static int mem_cgroup_can_attach(struct cgroup *cgroup,
6924 struct cgroup_taskset *tset)
6925{
6926 return 0;
6927}
6928static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
6929 struct cgroup_taskset *tset)
6930{
6931}
6932static void mem_cgroup_move_task(struct cgroup *cont,
6933 struct cgroup_taskset *tset)
6934{
6935}
6936#endif
6937
/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify the sane_behavior flag on each mount attempt.
 */
6942static void mem_cgroup_bind(struct cgroup *root)
6943{
	/*
	 * use_hierarchy is forced with sane_behavior.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
6949 if (cgroup_sane_behavior(root))
6950 mem_cgroup_from_cont(root)->use_hierarchy = true;
6951}
6952
6953struct cgroup_subsys mem_cgroup_subsys = {
6954 .name = "memory",
6955 .subsys_id = mem_cgroup_subsys_id,
6956 .css_alloc = mem_cgroup_css_alloc,
6957 .css_online = mem_cgroup_css_online,
6958 .css_offline = mem_cgroup_css_offline,
6959 .css_free = mem_cgroup_css_free,
6960 .can_attach = mem_cgroup_can_attach,
6961 .cancel_attach = mem_cgroup_cancel_attach,
6962 .attach = mem_cgroup_move_task,
6963 .bind = mem_cgroup_bind,
6964 .base_cftypes = mem_cgroup_files,
6965 .early_init = 0,
6966 .use_id = 1,
6967};
6968
6969#ifdef CONFIG_MEMCG_SWAP
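/*
 * Boot-time control for swap accounting: swapaccount=0 disables it,
 * swapaccount=1 enables it regardless of CONFIG_MEMCG_SWAP_ENABLED.
 */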
6970static int __init enable_swap_account(char *s)
6971{
6972 if (!strcmp(s, "1"))
6973 really_do_swap_account = 1;
6974 else if (!strcmp(s, "0"))
6975 really_do_swap_account = 0;
6976 return 1;
6977}
6978__setup("swapaccount=", enable_swap_account);
6979
6980static void __init memsw_file_init(void)
6981{
6982 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
6983}
6984
6985static void __init enable_swap_cgroup(void)
6986{
6987 if (!mem_cgroup_disabled() && really_do_swap_account) {
6988 do_swap_account = 1;
6989 memsw_file_init();
6990 }
6991}
6992
6993#else
6994static void __init enable_swap_cgroup(void)
6995{
6996}
6997#endif
6998
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like hotcpu_notifier() have to be initialized from this context
 * because of lock dependencies (cgroup_lock -> cpu hotplug), but basically
 * everything that doesn't depend on a specific mem_cgroup structure should
 * be initialized from here.
 */
7007static int __init mem_cgroup_init(void)
7008{
7009 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7010 enable_swap_cgroup();
7011 mem_cgroup_soft_limit_tree_init();
7012 memcg_stock_init();
7013 return 0;
7014}
7015subsys_initcall(mem_cgroup_init);
7016