1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28#include <linux/res_counter.h>
29#include <linux/memcontrol.h>
30#include <linux/cgroup.h>
31#include <linux/mm.h>
32#include <linux/hugetlb.h>
33#include <linux/pagemap.h>
34#include <linux/smp.h>
35#include <linux/page-flags.h>
36#include <linux/backing-dev.h>
37#include <linux/bit_spinlock.h>
38#include <linux/rcupdate.h>
39#include <linux/limits.h>
40#include <linux/export.h>
41#include <linux/mutex.h>
42#include <linux/rbtree.h>
43#include <linux/slab.h>
44#include <linux/swap.h>
45#include <linux/swapops.h>
46#include <linux/spinlock.h>
47#include <linux/eventfd.h>
48#include <linux/poll.h>
49#include <linux/sort.h>
50#include <linux/fs.h>
51#include <linux/seq_file.h>
52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h>
55#include <linux/cpu.h>
56#include <linux/oom.h>
57#include <linux/lockdep.h>
58#include <linux/file.h>
59#include "internal.h"
60#include <net/sock.h>
61#include <net/ip.h>
62#include <net/tcp_memcontrol.h>
63#include "slab.h"
64
65#include <asm/uaccess.h>
66
67#include <trace/events/vmscan.h>
68
69struct cgroup_subsys mem_cgroup_subsys __read_mostly;
70EXPORT_SYMBOL(mem_cgroup_subsys);
71
72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly;
74
75#ifdef CONFIG_MEMCG_SWAP
76
77int do_swap_account __read_mostly;
78
79
80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1;
82#else
83static int really_do_swap_account __initdata = 0;
84#endif
85
86#else
87#define do_swap_account 0
88#endif
89
90
91static const char * const mem_cgroup_stat_names[] = {
92 "cache",
93 "rss",
94 "rss_huge",
95 "mapped_file",
96 "writeback",
97 "swap",
98};
99
100enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN,
102 MEM_CGROUP_EVENTS_PGPGOUT,
103 MEM_CGROUP_EVENTS_PGFAULT,
104 MEM_CGROUP_EVENTS_PGMAJFAULT,
105 MEM_CGROUP_EVENTS_NSTATS,
106};
107
108static const char * const mem_cgroup_events_names[] = {
109 "pgpgin",
110 "pgpgout",
111 "pgfault",
112 "pgmajfault",
113};
114
115static const char * const mem_cgroup_lru_names[] = {
116 "inactive_anon",
117 "active_anon",
118 "inactive_file",
119 "active_file",
120 "unevictable",
121};
122
123
124
125
126
127
128
129enum mem_cgroup_events_target {
130 MEM_CGROUP_TARGET_THRESH,
131 MEM_CGROUP_TARGET_SOFTLIMIT,
132 MEM_CGROUP_TARGET_NUMAINFO,
133 MEM_CGROUP_NTARGETS,
134};
135#define THRESHOLDS_EVENTS_TARGET 128
136#define SOFTLIMIT_EVENTS_TARGET 1024
137#define NUMAINFO_EVENTS_TARGET 1024
138
139struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS];
144};
145
146struct mem_cgroup_reclaim_iter {
147
148
149
150
151 struct mem_cgroup *last_visited;
152 int last_dead_count;
153
154
155 unsigned int generation;
156};
157
158
159
160
161struct mem_cgroup_per_zone {
162 struct lruvec lruvec;
163 unsigned long lru_size[NR_LRU_LISTS];
164
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
166
167 struct rb_node tree_node;
168 unsigned long long usage_in_excess;
169
170 bool on_tree;
171 struct mem_cgroup *memcg;
172
173};
174
175struct mem_cgroup_per_node {
176 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
177};
178
179
180
181
182
183
184struct mem_cgroup_tree_per_zone {
185 struct rb_root rb_root;
186 spinlock_t lock;
187};
188
189struct mem_cgroup_tree_per_node {
190 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
191};
192
193struct mem_cgroup_tree {
194 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
195};
196
197static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198
199struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd;
201 u64 threshold;
202};
203
204
205struct mem_cgroup_threshold_ary {
206
207 int current_threshold;
208
209 unsigned int size;
210
211 struct mem_cgroup_threshold entries[0];
212};
213
214struct mem_cgroup_thresholds {
215
216 struct mem_cgroup_threshold_ary *primary;
217
218
219
220
221
222 struct mem_cgroup_threshold_ary *spare;
223};
224
225
226struct mem_cgroup_eventfd_list {
227 struct list_head list;
228 struct eventfd_ctx *eventfd;
229};
230
231
232
233
234struct mem_cgroup_event {
235
236
237
238 struct mem_cgroup *memcg;
239
240
241
242 struct eventfd_ctx *eventfd;
243
244
245
246 struct list_head list;
247
248
249
250
251
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254
255
256
257
258
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261
262
263
264
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
273
274
275
276
277
278
279
280
281
282
283
284
285struct mem_cgroup {
286 struct cgroup_subsys_state css;
287
288
289
290 struct res_counter res;
291
292
293 struct vmpressure vmpressure;
294
295
296
297
298 struct res_counter memsw;
299
300
301
302
303 struct res_counter kmem;
304
305
306
307 bool use_hierarchy;
308 unsigned long kmem_account_flags;
309
310 bool oom_lock;
311 atomic_t under_oom;
312 atomic_t oom_wakeups;
313
314 int swappiness;
315
316 int oom_kill_disable;
317
318
319 bool memsw_is_minimum;
320
321
322 struct mutex thresholds_lock;
323
324
325 struct mem_cgroup_thresholds thresholds;
326
327
328 struct mem_cgroup_thresholds memsw_thresholds;
329
330
331 struct list_head oom_notify;
332
333
334
335
336
337 unsigned long move_charge_at_immigrate;
338
339
340
341 atomic_t moving_account;
342
343 spinlock_t move_lock;
344
345
346
347 struct mem_cgroup_stat_cpu __percpu *stat;
348
349
350
351
352 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock;
354
355 atomic_t dead_count;
356#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem;
358#endif
359#if defined(CONFIG_MEMCG_KMEM)
360
361 struct list_head memcg_slab_caches;
362
363 struct mutex slab_caches_mutex;
364
365 int kmemcg_id;
366#endif
367
368 int last_scanned_node;
369#if MAX_NUMNODES > 1
370 nodemask_t scan_nodes;
371 atomic_t numainfo_events;
372 atomic_t numainfo_updating;
373#endif
374
375
376 struct list_head event_list;
377 spinlock_t event_list_lock;
378
379 struct mem_cgroup_per_node *nodeinfo[0];
380
381};
382
383
384enum {
385 KMEM_ACCOUNTED_ACTIVE,
386 KMEM_ACCOUNTED_DEAD,
387};
388
389#ifdef CONFIG_MEMCG_KMEM
390static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
391{
392 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
393}
394
395static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396{
397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
398}
399
400static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
401{
402
403
404
405
406 smp_wmb();
407 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
408 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
409}
410
411static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
412{
413 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
414 &memcg->kmem_account_flags);
415}
416#endif
417
418
419
420
421
422
423enum move_type {
424 MOVE_CHARGE_TYPE_ANON,
425 MOVE_CHARGE_TYPE_FILE,
426 NR_MOVE_TYPE,
427};
428
429
430static struct move_charge_struct {
431 spinlock_t lock;
432 struct mem_cgroup *from;
433 struct mem_cgroup *to;
434 unsigned long immigrate_flags;
435 unsigned long precharge;
436 unsigned long moved_charge;
437 unsigned long moved_swap;
438 struct task_struct *moving_task;
439 wait_queue_head_t waitq;
440} mc = {
441 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
442 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
443};
444
445static bool move_anon(void)
446{
447 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
448}
449
450static bool move_file(void)
451{
452 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
453}
454
455
456
457
458
459#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
460#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
461
462enum charge_type {
463 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
464 MEM_CGROUP_CHARGE_TYPE_ANON,
465 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
466 MEM_CGROUP_CHARGE_TYPE_DROP,
467 NR_CHARGE_TYPE,
468};
469
470
471enum res_type {
472 _MEM,
473 _MEMSWAP,
474 _OOM_TYPE,
475 _KMEM,
476};
477
478#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
479#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
480#define MEMFILE_ATTR(val) ((val) & 0xffff)
481
482#define OOM_CONTROL (0)
483
484
485
486
487#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
488#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
489#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
490#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
491
492
493
494
495
496
497static DEFINE_MUTEX(memcg_create_mutex);
498
499struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
500{
501 return s ? container_of(s, struct mem_cgroup, css) : NULL;
502}
503
504
505struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
506{
507 if (!memcg)
508 memcg = root_mem_cgroup;
509 return &memcg->vmpressure;
510}
511
512struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
513{
514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
515}
516
517static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
518{
519 return (memcg == root_mem_cgroup);
520}
521
522
523
524
525
526#define MEM_CGROUP_ID_MAX USHRT_MAX
527
528static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
529{
530
531
532
533
534 return memcg->css.cgroup->id + 1;
535}
536
537static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
538{
539 struct cgroup_subsys_state *css;
540
541 css = css_from_id(id - 1, &mem_cgroup_subsys);
542 return mem_cgroup_from_css(css);
543}
544
545
546#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
547
548void sock_update_memcg(struct sock *sk)
549{
550 if (mem_cgroup_sockets_enabled) {
551 struct mem_cgroup *memcg;
552 struct cg_proto *cg_proto;
553
554 BUG_ON(!sk->sk_prot->proto_cgroup);
555
556
557
558
559
560
561
562
563
564 if (sk->sk_cgrp) {
565 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
566 css_get(&sk->sk_cgrp->memcg->css);
567 return;
568 }
569
570 rcu_read_lock();
571 memcg = mem_cgroup_from_task(current);
572 cg_proto = sk->sk_prot->proto_cgroup(memcg);
573 if (!mem_cgroup_is_root(memcg) &&
574 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
575 sk->sk_cgrp = cg_proto;
576 }
577 rcu_read_unlock();
578 }
579}
580EXPORT_SYMBOL(sock_update_memcg);
581
582void sock_release_memcg(struct sock *sk)
583{
584 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
585 struct mem_cgroup *memcg;
586 WARN_ON(!sk->sk_cgrp->memcg);
587 memcg = sk->sk_cgrp->memcg;
588 css_put(&sk->sk_cgrp->memcg->css);
589 }
590}
591
592struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
593{
594 if (!memcg || mem_cgroup_is_root(memcg))
595 return NULL;
596
597 return &memcg->tcp_mem;
598}
599EXPORT_SYMBOL(tcp_proto_cgroup);
600
601static void disarm_sock_keys(struct mem_cgroup *memcg)
602{
603 if (!memcg_proto_activated(&memcg->tcp_mem))
604 return;
605 static_key_slow_dec(&memcg_socket_limit_enabled);
606}
607#else
608static void disarm_sock_keys(struct mem_cgroup *memcg)
609{
610}
611#endif
612
613#ifdef CONFIG_MEMCG_KMEM
614
615
616
617
618
619
620
621
622
623
624
625
626static DEFINE_IDA(kmem_limited_groups);
627int memcg_limited_groups_array_size;
628
629
630
631
632
633
634
635
636
637
638
639
640
641#define MEMCG_CACHES_MIN_SIZE 4
642#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
643
644
645
646
647
648
649
650struct static_key memcg_kmem_enabled_key;
651EXPORT_SYMBOL(memcg_kmem_enabled_key);
652
653static void disarm_kmem_keys(struct mem_cgroup *memcg)
654{
655 if (memcg_kmem_is_active(memcg)) {
656 static_key_slow_dec(&memcg_kmem_enabled_key);
657 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
658 }
659
660
661
662
663 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
664}
665#else
666static void disarm_kmem_keys(struct mem_cgroup *memcg)
667{
668}
669#endif
670
671static void disarm_static_keys(struct mem_cgroup *memcg)
672{
673 disarm_sock_keys(memcg);
674 disarm_kmem_keys(memcg);
675}
676
677static void drain_all_stock_async(struct mem_cgroup *memcg);
678
679static struct mem_cgroup_per_zone *
680mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
681{
682 VM_BUG_ON((unsigned)nid >= nr_node_ids);
683 return &memcg->nodeinfo[nid]->zoneinfo[zid];
684}
685
686struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
687{
688 return &memcg->css;
689}
690
691static struct mem_cgroup_per_zone *
692page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
693{
694 int nid = page_to_nid(page);
695 int zid = page_zonenum(page);
696
697 return mem_cgroup_zoneinfo(memcg, nid, zid);
698}
699
700static struct mem_cgroup_tree_per_zone *
701soft_limit_tree_node_zone(int nid, int zid)
702{
703 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
704}
705
706static struct mem_cgroup_tree_per_zone *
707soft_limit_tree_from_page(struct page *page)
708{
709 int nid = page_to_nid(page);
710 int zid = page_zonenum(page);
711
712 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
713}
714
715static void
716__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
717 struct mem_cgroup_per_zone *mz,
718 struct mem_cgroup_tree_per_zone *mctz,
719 unsigned long long new_usage_in_excess)
720{
721 struct rb_node **p = &mctz->rb_root.rb_node;
722 struct rb_node *parent = NULL;
723 struct mem_cgroup_per_zone *mz_node;
724
725 if (mz->on_tree)
726 return;
727
728 mz->usage_in_excess = new_usage_in_excess;
729 if (!mz->usage_in_excess)
730 return;
731 while (*p) {
732 parent = *p;
733 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
734 tree_node);
735 if (mz->usage_in_excess < mz_node->usage_in_excess)
736 p = &(*p)->rb_left;
737
738
739
740
741 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
742 p = &(*p)->rb_right;
743 }
744 rb_link_node(&mz->tree_node, parent, p);
745 rb_insert_color(&mz->tree_node, &mctz->rb_root);
746 mz->on_tree = true;
747}
748
749static void
750__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
751 struct mem_cgroup_per_zone *mz,
752 struct mem_cgroup_tree_per_zone *mctz)
753{
754 if (!mz->on_tree)
755 return;
756 rb_erase(&mz->tree_node, &mctz->rb_root);
757 mz->on_tree = false;
758}
759
760static void
761mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
762 struct mem_cgroup_per_zone *mz,
763 struct mem_cgroup_tree_per_zone *mctz)
764{
765 spin_lock(&mctz->lock);
766 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
767 spin_unlock(&mctz->lock);
768}
769
770
771static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
772{
773 unsigned long long excess;
774 struct mem_cgroup_per_zone *mz;
775 struct mem_cgroup_tree_per_zone *mctz;
776 int nid = page_to_nid(page);
777 int zid = page_zonenum(page);
778 mctz = soft_limit_tree_from_page(page);
779
780
781
782
783
784 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
785 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
786 excess = res_counter_soft_limit_excess(&memcg->res);
787
788
789
790
791 if (excess || mz->on_tree) {
792 spin_lock(&mctz->lock);
793
794 if (mz->on_tree)
795 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
796
797
798
799
800 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
801 spin_unlock(&mctz->lock);
802 }
803 }
804}
805
806static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
807{
808 int node, zone;
809 struct mem_cgroup_per_zone *mz;
810 struct mem_cgroup_tree_per_zone *mctz;
811
812 for_each_node(node) {
813 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
814 mz = mem_cgroup_zoneinfo(memcg, node, zone);
815 mctz = soft_limit_tree_node_zone(node, zone);
816 mem_cgroup_remove_exceeded(memcg, mz, mctz);
817 }
818 }
819}
820
821static struct mem_cgroup_per_zone *
822__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
823{
824 struct rb_node *rightmost = NULL;
825 struct mem_cgroup_per_zone *mz;
826
827retry:
828 mz = NULL;
829 rightmost = rb_last(&mctz->rb_root);
830 if (!rightmost)
831 goto done;
832
833 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
834
835
836
837
838
839 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
840 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
841 !css_tryget(&mz->memcg->css))
842 goto retry;
843done:
844 return mz;
845}
846
847static struct mem_cgroup_per_zone *
848mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
849{
850 struct mem_cgroup_per_zone *mz;
851
852 spin_lock(&mctz->lock);
853 mz = __mem_cgroup_largest_soft_limit_node(mctz);
854 spin_unlock(&mctz->lock);
855 return mz;
856}
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
878 enum mem_cgroup_stat_index idx)
879{
880 long val = 0;
881 int cpu;
882
883 get_online_cpus();
884 for_each_online_cpu(cpu)
885 val += per_cpu(memcg->stat->count[idx], cpu);
886#ifdef CONFIG_HOTPLUG_CPU
887 spin_lock(&memcg->pcp_counter_lock);
888 val += memcg->nocpu_base.count[idx];
889 spin_unlock(&memcg->pcp_counter_lock);
890#endif
891 put_online_cpus();
892 return val;
893}
894
895static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
896 bool charge)
897{
898 int val = (charge) ? 1 : -1;
899 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
900}
901
902static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
903 enum mem_cgroup_events_index idx)
904{
905 unsigned long val = 0;
906 int cpu;
907
908 get_online_cpus();
909 for_each_online_cpu(cpu)
910 val += per_cpu(memcg->stat->events[idx], cpu);
911#ifdef CONFIG_HOTPLUG_CPU
912 spin_lock(&memcg->pcp_counter_lock);
913 val += memcg->nocpu_base.events[idx];
914 spin_unlock(&memcg->pcp_counter_lock);
915#endif
916 put_online_cpus();
917 return val;
918}
919
920static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
921 struct page *page,
922 bool anon, int nr_pages)
923{
924 preempt_disable();
925
926
927
928
929
930 if (anon)
931 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
932 nr_pages);
933 else
934 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
935 nr_pages);
936
937 if (PageTransHuge(page))
938 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
939 nr_pages);
940
941
942 if (nr_pages > 0)
943 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
944 else {
945 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
946 nr_pages = -nr_pages;
947 }
948
949 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
950
951 preempt_enable();
952}
953
954unsigned long
955mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
956{
957 struct mem_cgroup_per_zone *mz;
958
959 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
960 return mz->lru_size[lru];
961}
962
963static unsigned long
964mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
965 unsigned int lru_mask)
966{
967 struct mem_cgroup_per_zone *mz;
968 enum lru_list lru;
969 unsigned long ret = 0;
970
971 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
972
973 for_each_lru(lru) {
974 if (BIT(lru) & lru_mask)
975 ret += mz->lru_size[lru];
976 }
977 return ret;
978}
979
980static unsigned long
981mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
982 int nid, unsigned int lru_mask)
983{
984 u64 total = 0;
985 int zid;
986
987 for (zid = 0; zid < MAX_NR_ZONES; zid++)
988 total += mem_cgroup_zone_nr_lru_pages(memcg,
989 nid, zid, lru_mask);
990
991 return total;
992}
993
994static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
995 unsigned int lru_mask)
996{
997 int nid;
998 u64 total = 0;
999
1000 for_each_node_state(nid, N_MEMORY)
1001 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
1002 return total;
1003}
1004
1005static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1006 enum mem_cgroup_events_target target)
1007{
1008 unsigned long val, next;
1009
1010 val = __this_cpu_read(memcg->stat->nr_page_events);
1011 next = __this_cpu_read(memcg->stat->targets[target]);
1012
1013 if ((long)next - (long)val < 0) {
1014 switch (target) {
1015 case MEM_CGROUP_TARGET_THRESH:
1016 next = val + THRESHOLDS_EVENTS_TARGET;
1017 break;
1018 case MEM_CGROUP_TARGET_SOFTLIMIT:
1019 next = val + SOFTLIMIT_EVENTS_TARGET;
1020 break;
1021 case MEM_CGROUP_TARGET_NUMAINFO:
1022 next = val + NUMAINFO_EVENTS_TARGET;
1023 break;
1024 default:
1025 break;
1026 }
1027 __this_cpu_write(memcg->stat->targets[target], next);
1028 return true;
1029 }
1030 return false;
1031}
1032
1033
1034
1035
1036
1037static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1038{
1039 preempt_disable();
1040
1041 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1042 MEM_CGROUP_TARGET_THRESH))) {
1043 bool do_softlimit;
1044 bool do_numainfo __maybe_unused;
1045
1046 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1047 MEM_CGROUP_TARGET_SOFTLIMIT);
1048#if MAX_NUMNODES > 1
1049 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1050 MEM_CGROUP_TARGET_NUMAINFO);
1051#endif
1052 preempt_enable();
1053
1054 mem_cgroup_threshold(memcg);
1055 if (unlikely(do_softlimit))
1056 mem_cgroup_update_tree(memcg, page);
1057#if MAX_NUMNODES > 1
1058 if (unlikely(do_numainfo))
1059 atomic_inc(&memcg->numainfo_events);
1060#endif
1061 } else
1062 preempt_enable();
1063}
1064
1065struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1066{
1067
1068
1069
1070
1071
1072 if (unlikely(!p))
1073 return NULL;
1074
1075 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
1076}
1077
1078struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1079{
1080 struct mem_cgroup *memcg = NULL;
1081
1082 if (!mm)
1083 return NULL;
1084
1085
1086
1087
1088
1089 rcu_read_lock();
1090 do {
1091 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1092 if (unlikely(!memcg))
1093 break;
1094 } while (!css_tryget(&memcg->css));
1095 rcu_read_unlock();
1096 return memcg;
1097}
1098
1099
1100
1101
1102
1103
1104
1105static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1106 struct mem_cgroup *last_visited)
1107{
1108 struct cgroup_subsys_state *prev_css, *next_css;
1109
1110 prev_css = last_visited ? &last_visited->css : NULL;
1111skip_node:
1112 next_css = css_next_descendant_pre(prev_css, &root->css);
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129 if (next_css) {
1130 if ((next_css == &root->css) ||
1131 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1132 return mem_cgroup_from_css(next_css);
1133
1134 prev_css = next_css;
1135 goto skip_node;
1136 }
1137
1138 return NULL;
1139}
1140
1141static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1142{
1143
1144
1145
1146
1147
1148 atomic_inc(&root->dead_count);
1149}
1150
1151static struct mem_cgroup *
1152mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1153 struct mem_cgroup *root,
1154 int *sequence)
1155{
1156 struct mem_cgroup *position = NULL;
1157
1158
1159
1160
1161
1162
1163
1164
1165 *sequence = atomic_read(&root->dead_count);
1166 if (iter->last_dead_count == *sequence) {
1167 smp_rmb();
1168 position = iter->last_visited;
1169
1170
1171
1172
1173
1174
1175
1176 if (position && position != root &&
1177 !css_tryget(&position->css))
1178 position = NULL;
1179 }
1180 return position;
1181}
1182
1183static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1184 struct mem_cgroup *last_visited,
1185 struct mem_cgroup *new_position,
1186 struct mem_cgroup *root,
1187 int sequence)
1188{
1189
1190 if (last_visited && last_visited != root)
1191 css_put(&last_visited->css);
1192
1193
1194
1195
1196
1197
1198 iter->last_visited = new_position;
1199 smp_wmb();
1200 iter->last_dead_count = sequence;
1201}
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1221 struct mem_cgroup *prev,
1222 struct mem_cgroup_reclaim_cookie *reclaim)
1223{
1224 struct mem_cgroup *memcg = NULL;
1225 struct mem_cgroup *last_visited = NULL;
1226
1227 if (mem_cgroup_disabled())
1228 return NULL;
1229
1230 if (!root)
1231 root = root_mem_cgroup;
1232
1233 if (prev && !reclaim)
1234 last_visited = prev;
1235
1236 if (!root->use_hierarchy && root != root_mem_cgroup) {
1237 if (prev)
1238 goto out_css_put;
1239 return root;
1240 }
1241
1242 rcu_read_lock();
1243 while (!memcg) {
1244 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1245 int uninitialized_var(seq);
1246
1247 if (reclaim) {
1248 int nid = zone_to_nid(reclaim->zone);
1249 int zid = zone_idx(reclaim->zone);
1250 struct mem_cgroup_per_zone *mz;
1251
1252 mz = mem_cgroup_zoneinfo(root, nid, zid);
1253 iter = &mz->reclaim_iter[reclaim->priority];
1254 if (prev && reclaim->generation != iter->generation) {
1255 iter->last_visited = NULL;
1256 goto out_unlock;
1257 }
1258
1259 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1260 }
1261
1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1263
1264 if (reclaim) {
1265 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1266 seq);
1267
1268 if (!memcg)
1269 iter->generation++;
1270 else if (!prev && memcg)
1271 reclaim->generation = iter->generation;
1272 }
1273
1274 if (prev && !memcg)
1275 goto out_unlock;
1276 }
1277out_unlock:
1278 rcu_read_unlock();
1279out_css_put:
1280 if (prev && prev != root)
1281 css_put(&prev->css);
1282
1283 return memcg;
1284}
1285
1286
1287
1288
1289
1290
1291void mem_cgroup_iter_break(struct mem_cgroup *root,
1292 struct mem_cgroup *prev)
1293{
1294 if (!root)
1295 root = root_mem_cgroup;
1296 if (prev && prev != root)
1297 css_put(&prev->css);
1298}
1299
1300
1301
1302
1303
1304
1305#define for_each_mem_cgroup_tree(iter, root) \
1306 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1307 iter != NULL; \
1308 iter = mem_cgroup_iter(root, iter, NULL))
1309
1310#define for_each_mem_cgroup(iter) \
1311 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1312 iter != NULL; \
1313 iter = mem_cgroup_iter(NULL, iter, NULL))
1314
1315void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1316{
1317 struct mem_cgroup *memcg;
1318
1319 rcu_read_lock();
1320 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1321 if (unlikely(!memcg))
1322 goto out;
1323
1324 switch (idx) {
1325 case PGFAULT:
1326 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1327 break;
1328 case PGMAJFAULT:
1329 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1330 break;
1331 default:
1332 BUG();
1333 }
1334out:
1335 rcu_read_unlock();
1336}
1337EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1349 struct mem_cgroup *memcg)
1350{
1351 struct mem_cgroup_per_zone *mz;
1352 struct lruvec *lruvec;
1353
1354 if (mem_cgroup_disabled()) {
1355 lruvec = &zone->lruvec;
1356 goto out;
1357 }
1358
1359 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1360 lruvec = &mz->lruvec;
1361out:
1362
1363
1364
1365
1366
1367 if (unlikely(lruvec->zone != zone))
1368 lruvec->zone = zone;
1369 return lruvec;
1370}
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1392{
1393 struct mem_cgroup_per_zone *mz;
1394 struct mem_cgroup *memcg;
1395 struct page_cgroup *pc;
1396 struct lruvec *lruvec;
1397
1398 if (mem_cgroup_disabled()) {
1399 lruvec = &zone->lruvec;
1400 goto out;
1401 }
1402
1403 pc = lookup_page_cgroup(page);
1404 memcg = pc->mem_cgroup;
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1416 pc->mem_cgroup = memcg = root_mem_cgroup;
1417
1418 mz = page_cgroup_zoneinfo(memcg, page);
1419 lruvec = &mz->lruvec;
1420out:
1421
1422
1423
1424
1425
1426 if (unlikely(lruvec->zone != zone))
1427 lruvec->zone = zone;
1428 return lruvec;
1429}
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1441 int nr_pages)
1442{
1443 struct mem_cgroup_per_zone *mz;
1444 unsigned long *lru_size;
1445
1446 if (mem_cgroup_disabled())
1447 return;
1448
1449 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1450 lru_size = mz->lru_size + lru;
1451 *lru_size += nr_pages;
1452 VM_BUG_ON((long)(*lru_size) < 0);
1453}
1454
1455
1456
1457
1458
1459bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1460 struct mem_cgroup *memcg)
1461{
1462 if (root_memcg == memcg)
1463 return true;
1464 if (!root_memcg->use_hierarchy || !memcg)
1465 return false;
1466 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1467}
1468
1469static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1470 struct mem_cgroup *memcg)
1471{
1472 bool ret;
1473
1474 rcu_read_lock();
1475 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1476 rcu_read_unlock();
1477 return ret;
1478}
1479
1480bool task_in_mem_cgroup(struct task_struct *task,
1481 const struct mem_cgroup *memcg)
1482{
1483 struct mem_cgroup *curr = NULL;
1484 struct task_struct *p;
1485 bool ret;
1486
1487 p = find_lock_task_mm(task);
1488 if (p) {
1489 curr = try_get_mem_cgroup_from_mm(p->mm);
1490 task_unlock(p);
1491 } else {
1492
1493
1494
1495
1496
1497 rcu_read_lock();
1498 curr = mem_cgroup_from_task(task);
1499 if (curr)
1500 css_get(&curr->css);
1501 rcu_read_unlock();
1502 }
1503 if (!curr)
1504 return false;
1505
1506
1507
1508
1509
1510
1511 ret = mem_cgroup_same_or_subtree(memcg, curr);
1512 css_put(&curr->css);
1513 return ret;
1514}
1515
1516int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1517{
1518 unsigned long inactive_ratio;
1519 unsigned long inactive;
1520 unsigned long active;
1521 unsigned long gb;
1522
1523 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1524 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1525
1526 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1527 if (gb)
1528 inactive_ratio = int_sqrt(10 * gb);
1529 else
1530 inactive_ratio = 1;
1531
1532 return inactive * inactive_ratio < active;
1533}
1534
1535#define mem_cgroup_from_res_counter(counter, member) \
1536 container_of(counter, struct mem_cgroup, member)
1537
1538
1539
1540
1541
1542
1543
1544
1545static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1546{
1547 unsigned long long margin;
1548
1549 margin = res_counter_margin(&memcg->res);
1550 if (do_swap_account)
1551 margin = min(margin, res_counter_margin(&memcg->memsw));
1552 return margin >> PAGE_SHIFT;
1553}
1554
1555int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1556{
1557
1558 if (!css_parent(&memcg->css))
1559 return vm_swappiness;
1560
1561 return memcg->swappiness;
1562}
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580atomic_t memcg_moving __read_mostly;
1581
1582static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1583{
1584 atomic_inc(&memcg_moving);
1585 atomic_inc(&memcg->moving_account);
1586 synchronize_rcu();
1587}
1588
1589static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1590{
1591
1592
1593
1594
1595 if (memcg) {
1596 atomic_dec(&memcg_moving);
1597 atomic_dec(&memcg->moving_account);
1598 }
1599}
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1614{
1615 VM_BUG_ON(!rcu_read_lock_held());
1616 return atomic_read(&memcg->moving_account) > 0;
1617}
1618
1619static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1620{
1621 struct mem_cgroup *from;
1622 struct mem_cgroup *to;
1623 bool ret = false;
1624
1625
1626
1627
1628 spin_lock(&mc.lock);
1629 from = mc.from;
1630 to = mc.to;
1631 if (!from)
1632 goto unlock;
1633
1634 ret = mem_cgroup_same_or_subtree(memcg, from)
1635 || mem_cgroup_same_or_subtree(memcg, to);
1636unlock:
1637 spin_unlock(&mc.lock);
1638 return ret;
1639}
1640
1641static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1642{
1643 if (mc.moving_task && current != mc.moving_task) {
1644 if (mem_cgroup_under_move(memcg)) {
1645 DEFINE_WAIT(wait);
1646 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1647
1648 if (mc.moving_task)
1649 schedule();
1650 finish_wait(&mc.waitq, &wait);
1651 return true;
1652 }
1653 }
1654 return false;
1655}
1656
1657
1658
1659
1660
1661
1662
1663static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1664 unsigned long *flags)
1665{
1666 spin_lock_irqsave(&memcg->move_lock, *flags);
1667}
1668
1669static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1670 unsigned long *flags)
1671{
1672 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1673}
1674
1675#define K(x) ((x) << (PAGE_SHIFT-10))
1676
1677
1678
1679
1680
1681
1682
1683
1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1685{
1686
1687
1688
1689
1690 static DEFINE_MUTEX(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1693 static char memcg_name[PATH_MAX];
1694 int ret;
1695 struct mem_cgroup *iter;
1696 unsigned int i;
1697
1698 if (!p)
1699 return;
1700
1701 mutex_lock(&oom_info_lock);
1702 rcu_read_lock();
1703
1704 mem_cgrp = memcg->css.cgroup;
1705 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1706
1707 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1708 if (ret < 0) {
1709
1710
1711
1712
1713 rcu_read_unlock();
1714 goto done;
1715 }
1716 rcu_read_unlock();
1717
1718 pr_info("Task in %s killed", memcg_name);
1719
1720 rcu_read_lock();
1721 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1722 if (ret < 0) {
1723 rcu_read_unlock();
1724 goto done;
1725 }
1726 rcu_read_unlock();
1727
1728
1729
1730
1731 pr_cont(" as a result of limit of %s\n", memcg_name);
1732done:
1733
1734 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1735 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1736 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1737 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1738 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1739 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1740 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1741 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1742 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1743 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1744 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1745 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1746
1747 for_each_mem_cgroup_tree(iter, memcg) {
1748 pr_info("Memory cgroup stats");
1749
1750 rcu_read_lock();
1751 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1752 if (!ret)
1753 pr_cont(" for %s", memcg_name);
1754 rcu_read_unlock();
1755 pr_cont(":");
1756
1757 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1758 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1759 continue;
1760 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1761 K(mem_cgroup_read_stat(iter, i)));
1762 }
1763
1764 for (i = 0; i < NR_LRU_LISTS; i++)
1765 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1766 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1767
1768 pr_cont("\n");
1769 }
1770 mutex_unlock(&oom_info_lock);
1771}
1772
1773
1774
1775
1776
1777static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1778{
1779 int num = 0;
1780 struct mem_cgroup *iter;
1781
1782 for_each_mem_cgroup_tree(iter, memcg)
1783 num++;
1784 return num;
1785}
1786
1787
1788
1789
1790static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1791{
1792 u64 limit;
1793
1794 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1795
1796
1797
1798
1799 if (mem_cgroup_swappiness(memcg)) {
1800 u64 memsw;
1801
1802 limit += total_swap_pages << PAGE_SHIFT;
1803 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1804
1805
1806
1807
1808
1809 limit = min(limit, memsw);
1810 }
1811
1812 return limit;
1813}
1814
1815static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1816 int order)
1817{
1818 struct mem_cgroup *iter;
1819 unsigned long chosen_points = 0;
1820 unsigned long totalpages;
1821 unsigned int points = 0;
1822 struct task_struct *chosen = NULL;
1823
1824
1825
1826
1827
1828
1829 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1830 set_thread_flag(TIF_MEMDIE);
1831 return;
1832 }
1833
1834 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1835 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1836 for_each_mem_cgroup_tree(iter, memcg) {
1837 struct css_task_iter it;
1838 struct task_struct *task;
1839
1840 css_task_iter_start(&iter->css, &it);
1841 while ((task = css_task_iter_next(&it))) {
1842 switch (oom_scan_process_thread(task, totalpages, NULL,
1843 false)) {
1844 case OOM_SCAN_SELECT:
1845 if (chosen)
1846 put_task_struct(chosen);
1847 chosen = task;
1848 chosen_points = ULONG_MAX;
1849 get_task_struct(chosen);
1850
1851 case OOM_SCAN_CONTINUE:
1852 continue;
1853 case OOM_SCAN_ABORT:
1854 css_task_iter_end(&it);
1855 mem_cgroup_iter_break(memcg, iter);
1856 if (chosen)
1857 put_task_struct(chosen);
1858 return;
1859 case OOM_SCAN_OK:
1860 break;
1861 };
1862 points = oom_badness(task, memcg, NULL, totalpages);
1863 if (!points || points < chosen_points)
1864 continue;
1865
1866 if (points == chosen_points &&
1867 thread_group_leader(chosen))
1868 continue;
1869
1870 if (chosen)
1871 put_task_struct(chosen);
1872 chosen = task;
1873 chosen_points = points;
1874 get_task_struct(chosen);
1875 }
1876 css_task_iter_end(&it);
1877 }
1878
1879 if (!chosen)
1880 return;
1881 points = chosen_points * 1000 / totalpages;
1882 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1883 NULL, "Memory cgroup out of memory");
1884}
1885
1886static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1887 gfp_t gfp_mask,
1888 unsigned long flags)
1889{
1890 unsigned long total = 0;
1891 bool noswap = false;
1892 int loop;
1893
1894 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1895 noswap = true;
1896 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1897 noswap = true;
1898
1899 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1900 if (loop)
1901 drain_all_stock_async(memcg);
1902 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1903
1904
1905
1906
1907
1908 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1909 break;
1910 if (mem_cgroup_margin(memcg))
1911 break;
1912
1913
1914
1915
1916 if (loop && !total)
1917 break;
1918 }
1919 return total;
1920}
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1933 int nid, bool noswap)
1934{
1935 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1936 return true;
1937 if (noswap || !total_swap_pages)
1938 return false;
1939 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1940 return true;
1941 return false;
1942
1943}
1944#if MAX_NUMNODES > 1
1945
1946
1947
1948
1949
1950
1951
1952static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1953{
1954 int nid;
1955
1956
1957
1958
1959 if (!atomic_read(&memcg->numainfo_events))
1960 return;
1961 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1962 return;
1963
1964
1965 memcg->scan_nodes = node_states[N_MEMORY];
1966
1967 for_each_node_mask(nid, node_states[N_MEMORY]) {
1968
1969 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1970 node_clear(nid, memcg->scan_nodes);
1971 }
1972
1973 atomic_set(&memcg->numainfo_events, 0);
1974 atomic_set(&memcg->numainfo_updating, 0);
1975}
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1990{
1991 int node;
1992
1993 mem_cgroup_may_update_nodemask(memcg);
1994 node = memcg->last_scanned_node;
1995
1996 node = next_node(node, memcg->scan_nodes);
1997 if (node == MAX_NUMNODES)
1998 node = first_node(memcg->scan_nodes);
1999
2000
2001
2002
2003
2004
2005 if (unlikely(node == MAX_NUMNODES))
2006 node = numa_node_id();
2007
2008 memcg->last_scanned_node = node;
2009 return node;
2010}
2011
2012
2013
2014
2015
2016
2017
2018static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2019{
2020 int nid;
2021
2022
2023
2024
2025
2026 if (!nodes_empty(memcg->scan_nodes)) {
2027 for (nid = first_node(memcg->scan_nodes);
2028 nid < MAX_NUMNODES;
2029 nid = next_node(nid, memcg->scan_nodes)) {
2030
2031 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2032 return true;
2033 }
2034 }
2035
2036
2037
2038 for_each_node_state(nid, N_MEMORY) {
2039 if (node_isset(nid, memcg->scan_nodes))
2040 continue;
2041 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2042 return true;
2043 }
2044 return false;
2045}
2046
2047#else
2048int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2049{
2050 return 0;
2051}
2052
2053static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2054{
2055 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2056}
2057#endif
2058
2059static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2060 struct zone *zone,
2061 gfp_t gfp_mask,
2062 unsigned long *total_scanned)
2063{
2064 struct mem_cgroup *victim = NULL;
2065 int total = 0;
2066 int loop = 0;
2067 unsigned long excess;
2068 unsigned long nr_scanned;
2069 struct mem_cgroup_reclaim_cookie reclaim = {
2070 .zone = zone,
2071 .priority = 0,
2072 };
2073
2074 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2075
2076 while (1) {
2077 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2078 if (!victim) {
2079 loop++;
2080 if (loop >= 2) {
2081
2082
2083
2084
2085
2086 if (!total)
2087 break;
2088
2089
2090
2091
2092
2093
2094 if (total >= (excess >> 2) ||
2095 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2096 break;
2097 }
2098 continue;
2099 }
2100 if (!mem_cgroup_reclaimable(victim, false))
2101 continue;
2102 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2103 zone, &nr_scanned);
2104 *total_scanned += nr_scanned;
2105 if (!res_counter_soft_limit_excess(&root_memcg->res))
2106 break;
2107 }
2108 mem_cgroup_iter_break(root_memcg, victim);
2109 return total;
2110}
2111
2112#ifdef CONFIG_LOCKDEP
2113static struct lockdep_map memcg_oom_lock_dep_map = {
2114 .name = "memcg_oom_lock",
2115};
2116#endif
2117
2118static DEFINE_SPINLOCK(memcg_oom_lock);
2119
2120
2121
2122
2123
2124static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2125{
2126 struct mem_cgroup *iter, *failed = NULL;
2127
2128 spin_lock(&memcg_oom_lock);
2129
2130 for_each_mem_cgroup_tree(iter, memcg) {
2131 if (iter->oom_lock) {
2132
2133
2134
2135
2136 failed = iter;
2137 mem_cgroup_iter_break(memcg, iter);
2138 break;
2139 } else
2140 iter->oom_lock = true;
2141 }
2142
2143 if (failed) {
2144
2145
2146
2147
2148 for_each_mem_cgroup_tree(iter, memcg) {
2149 if (iter == failed) {
2150 mem_cgroup_iter_break(memcg, iter);
2151 break;
2152 }
2153 iter->oom_lock = false;
2154 }
2155 } else
2156 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2157
2158 spin_unlock(&memcg_oom_lock);
2159
2160 return !failed;
2161}
2162
2163static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2164{
2165 struct mem_cgroup *iter;
2166
2167 spin_lock(&memcg_oom_lock);
2168 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
2169 for_each_mem_cgroup_tree(iter, memcg)
2170 iter->oom_lock = false;
2171 spin_unlock(&memcg_oom_lock);
2172}
2173
2174static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2175{
2176 struct mem_cgroup *iter;
2177
2178 for_each_mem_cgroup_tree(iter, memcg)
2179 atomic_inc(&iter->under_oom);
2180}
2181
2182static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2183{
2184 struct mem_cgroup *iter;
2185
2186
2187
2188
2189
2190
2191 for_each_mem_cgroup_tree(iter, memcg)
2192 atomic_add_unless(&iter->under_oom, -1, 0);
2193}
2194
2195static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2196
2197struct oom_wait_info {
2198 struct mem_cgroup *memcg;
2199 wait_queue_t wait;
2200};
2201
2202static int memcg_oom_wake_function(wait_queue_t *wait,
2203 unsigned mode, int sync, void *arg)
2204{
2205 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2206 struct mem_cgroup *oom_wait_memcg;
2207 struct oom_wait_info *oom_wait_info;
2208
2209 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2210 oom_wait_memcg = oom_wait_info->memcg;
2211
2212
2213
2214
2215
2216 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2217 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2218 return 0;
2219 return autoremove_wake_function(wait, mode, sync, arg);
2220}
2221
2222static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2223{
2224 atomic_inc(&memcg->oom_wakeups);
2225
2226 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2227}
2228
2229static void memcg_oom_recover(struct mem_cgroup *memcg)
2230{
2231 if (memcg && atomic_read(&memcg->under_oom))
2232 memcg_wakeup_oom(memcg);
2233}
2234
2235static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2236{
2237 if (!current->memcg_oom.may_oom)
2238 return;
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253 css_get(&memcg->css);
2254 current->memcg_oom.memcg = memcg;
2255 current->memcg_oom.gfp_mask = mask;
2256 current->memcg_oom.order = order;
2257}
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276bool mem_cgroup_oom_synchronize(bool handle)
2277{
2278 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2279 struct oom_wait_info owait;
2280 bool locked;
2281
2282
2283 if (!memcg)
2284 return false;
2285
2286 if (!handle)
2287 goto cleanup;
2288
2289 owait.memcg = memcg;
2290 owait.wait.flags = 0;
2291 owait.wait.func = memcg_oom_wake_function;
2292 owait.wait.private = current;
2293 INIT_LIST_HEAD(&owait.wait.task_list);
2294
2295 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2296 mem_cgroup_mark_under_oom(memcg);
2297
2298 locked = mem_cgroup_oom_trylock(memcg);
2299
2300 if (locked)
2301 mem_cgroup_oom_notify(memcg);
2302
2303 if (locked && !memcg->oom_kill_disable) {
2304 mem_cgroup_unmark_under_oom(memcg);
2305 finish_wait(&memcg_oom_waitq, &owait.wait);
2306 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2307 current->memcg_oom.order);
2308 } else {
2309 schedule();
2310 mem_cgroup_unmark_under_oom(memcg);
2311 finish_wait(&memcg_oom_waitq, &owait.wait);
2312 }
2313
2314 if (locked) {
2315 mem_cgroup_oom_unlock(memcg);
2316
2317
2318
2319
2320
2321 memcg_oom_recover(memcg);
2322 }
2323cleanup:
2324 current->memcg_oom.memcg = NULL;
2325 css_put(&memcg->css);
2326 return true;
2327}
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353void __mem_cgroup_begin_update_page_stat(struct page *page,
2354 bool *locked, unsigned long *flags)
2355{
2356 struct mem_cgroup *memcg;
2357 struct page_cgroup *pc;
2358
2359 pc = lookup_page_cgroup(page);
2360again:
2361 memcg = pc->mem_cgroup;
2362 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2363 return;
2364
2365
2366
2367
2368
2369
2370 if (!mem_cgroup_stolen(memcg))
2371 return;
2372
2373 move_lock_mem_cgroup(memcg, flags);
2374 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2375 move_unlock_mem_cgroup(memcg, flags);
2376 goto again;
2377 }
2378 *locked = true;
2379}
2380
2381void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2382{
2383 struct page_cgroup *pc = lookup_page_cgroup(page);
2384
2385
2386
2387
2388
2389
2390 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2391}
2392
2393void mem_cgroup_update_page_stat(struct page *page,
2394 enum mem_cgroup_stat_index idx, int val)
2395{
2396 struct mem_cgroup *memcg;
2397 struct page_cgroup *pc = lookup_page_cgroup(page);
2398 unsigned long uninitialized_var(flags);
2399
2400 if (mem_cgroup_disabled())
2401 return;
2402
2403 VM_BUG_ON(!rcu_read_lock_held());
2404 memcg = pc->mem_cgroup;
2405 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2406 return;
2407
2408 this_cpu_add(memcg->stat->count[idx], val);
2409}
2410
2411
2412
2413
2414
2415#define CHARGE_BATCH 32U
2416struct memcg_stock_pcp {
2417 struct mem_cgroup *cached;
2418 unsigned int nr_pages;
2419 struct work_struct work;
2420 unsigned long flags;
2421#define FLUSHING_CACHED_CHARGE 0
2422};
2423static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2424static DEFINE_MUTEX(percpu_charge_mutex);
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2438{
2439 struct memcg_stock_pcp *stock;
2440 bool ret = true;
2441
2442 if (nr_pages > CHARGE_BATCH)
2443 return false;
2444
2445 stock = &get_cpu_var(memcg_stock);
2446 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2447 stock->nr_pages -= nr_pages;
2448 else
2449 ret = false;
2450 put_cpu_var(memcg_stock);
2451 return ret;
2452}
2453
2454
2455
2456
2457static void drain_stock(struct memcg_stock_pcp *stock)
2458{
2459 struct mem_cgroup *old = stock->cached;
2460
2461 if (stock->nr_pages) {
2462 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2463
2464 res_counter_uncharge(&old->res, bytes);
2465 if (do_swap_account)
2466 res_counter_uncharge(&old->memsw, bytes);
2467 stock->nr_pages = 0;
2468 }
2469 stock->cached = NULL;
2470}
2471
2472
2473
2474
2475
2476static void drain_local_stock(struct work_struct *dummy)
2477{
2478 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2479 drain_stock(stock);
2480 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2481}
2482
2483static void __init memcg_stock_init(void)
2484{
2485 int cpu;
2486
2487 for_each_possible_cpu(cpu) {
2488 struct memcg_stock_pcp *stock =
2489 &per_cpu(memcg_stock, cpu);
2490 INIT_WORK(&stock->work, drain_local_stock);
2491 }
2492}
2493
2494
2495
2496
2497
2498static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2499{
2500 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2501
2502 if (stock->cached != memcg) {
2503 drain_stock(stock);
2504 stock->cached = memcg;
2505 }
2506 stock->nr_pages += nr_pages;
2507 put_cpu_var(memcg_stock);
2508}
2509
2510
2511
2512
2513
2514
2515static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2516{
2517 int cpu, curcpu;
2518
2519
2520 get_online_cpus();
2521 curcpu = get_cpu();
2522 for_each_online_cpu(cpu) {
2523 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2524 struct mem_cgroup *memcg;
2525
2526 memcg = stock->cached;
2527 if (!memcg || !stock->nr_pages)
2528 continue;
2529 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2530 continue;
2531 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2532 if (cpu == curcpu)
2533 drain_local_stock(&stock->work);
2534 else
2535 schedule_work_on(cpu, &stock->work);
2536 }
2537 }
2538 put_cpu();
2539
2540 if (!sync)
2541 goto out;
2542
2543 for_each_online_cpu(cpu) {
2544 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2545 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2546 flush_work(&stock->work);
2547 }
2548out:
2549 put_online_cpus();
2550}
2551
2552
2553
2554
2555
2556
2557
2558static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2559{
2560
2561
2562
2563 if (!mutex_trylock(&percpu_charge_mutex))
2564 return;
2565 drain_all_stock(root_memcg, false);
2566 mutex_unlock(&percpu_charge_mutex);
2567}
2568
2569
2570static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2571{
2572
2573 mutex_lock(&percpu_charge_mutex);
2574 drain_all_stock(root_memcg, true);
2575 mutex_unlock(&percpu_charge_mutex);
2576}
2577
2578
2579
2580
2581
2582static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2583{
2584 int i;
2585
2586 spin_lock(&memcg->pcp_counter_lock);
2587 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2588 long x = per_cpu(memcg->stat->count[i], cpu);
2589
2590 per_cpu(memcg->stat->count[i], cpu) = 0;
2591 memcg->nocpu_base.count[i] += x;
2592 }
2593 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2594 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2595
2596 per_cpu(memcg->stat->events[i], cpu) = 0;
2597 memcg->nocpu_base.events[i] += x;
2598 }
2599 spin_unlock(&memcg->pcp_counter_lock);
2600}
2601
2602static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2603 unsigned long action,
2604 void *hcpu)
2605{
2606 int cpu = (unsigned long)hcpu;
2607 struct memcg_stock_pcp *stock;
2608 struct mem_cgroup *iter;
2609
2610 if (action == CPU_ONLINE)
2611 return NOTIFY_OK;
2612
2613 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2614 return NOTIFY_OK;
2615
2616 for_each_mem_cgroup(iter)
2617 mem_cgroup_drain_pcp_counter(iter, cpu);
2618
2619 stock = &per_cpu(memcg_stock, cpu);
2620 drain_stock(stock);
2621 return NOTIFY_OK;
2622}
2623
2624
2625
2626enum {
2627 CHARGE_OK,
2628 CHARGE_RETRY,
2629 CHARGE_NOMEM,
2630 CHARGE_WOULDBLOCK,
2631};
2632
2633static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2634 unsigned int nr_pages, unsigned int min_pages,
2635 bool invoke_oom)
2636{
2637 unsigned long csize = nr_pages * PAGE_SIZE;
2638 struct mem_cgroup *mem_over_limit;
2639 struct res_counter *fail_res;
2640 unsigned long flags = 0;
2641 int ret;
2642
2643 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2644
2645 if (likely(!ret)) {
2646 if (!do_swap_account)
2647 return CHARGE_OK;
2648 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2649 if (likely(!ret))
2650 return CHARGE_OK;
2651
2652 res_counter_uncharge(&memcg->res, csize);
2653 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2654 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2655 } else
2656 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2657
2658
2659
2660
2661 if (nr_pages > min_pages)
2662 return CHARGE_RETRY;
2663
2664 if (!(gfp_mask & __GFP_WAIT))
2665 return CHARGE_WOULDBLOCK;
2666
2667 if (gfp_mask & __GFP_NORETRY)
2668 return CHARGE_NOMEM;
2669
2670 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2671 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2672 return CHARGE_RETRY;
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2683 return CHARGE_RETRY;
2684
2685
2686
2687
2688
2689 if (mem_cgroup_wait_acct_move(mem_over_limit))
2690 return CHARGE_RETRY;
2691
2692 if (invoke_oom)
2693 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2694
2695 return CHARGE_NOMEM;
2696}
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719static int __mem_cgroup_try_charge(struct mm_struct *mm,
2720 gfp_t gfp_mask,
2721 unsigned int nr_pages,
2722 struct mem_cgroup **ptr,
2723 bool oom)
2724{
2725 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2726 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2727 struct mem_cgroup *memcg = NULL;
2728 int ret;
2729
2730
2731
2732
2733
2734
2735 if (unlikely(test_thread_flag(TIF_MEMDIE)
2736 || fatal_signal_pending(current)))
2737 goto bypass;
2738
2739 if (unlikely(task_in_memcg_oom(current)))
2740 goto nomem;
2741
2742 if (gfp_mask & __GFP_NOFAIL)
2743 oom = false;
2744
2745
2746
2747
2748
2749
2750
2751 if (!*ptr && !mm)
2752 *ptr = root_mem_cgroup;
2753again:
2754 if (*ptr) {
2755 memcg = *ptr;
2756 if (mem_cgroup_is_root(memcg))
2757 goto done;
2758 if (consume_stock(memcg, nr_pages))
2759 goto done;
2760 css_get(&memcg->css);
2761 } else {
2762 struct task_struct *p;
2763
2764 rcu_read_lock();
2765 p = rcu_dereference(mm->owner);
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776 memcg = mem_cgroup_from_task(p);
2777 if (!memcg)
2778 memcg = root_mem_cgroup;
2779 if (mem_cgroup_is_root(memcg)) {
2780 rcu_read_unlock();
2781 goto done;
2782 }
2783 if (consume_stock(memcg, nr_pages)) {
2784
2785
2786
2787
2788
2789
2790
2791
2792 rcu_read_unlock();
2793 goto done;
2794 }
2795
2796 if (!css_tryget(&memcg->css)) {
2797 rcu_read_unlock();
2798 goto again;
2799 }
2800 rcu_read_unlock();
2801 }
2802
2803 do {
2804 bool invoke_oom = oom && !nr_oom_retries;
2805
2806
2807 if (fatal_signal_pending(current)) {
2808 css_put(&memcg->css);
2809 goto bypass;
2810 }
2811
2812 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2813 nr_pages, invoke_oom);
2814 switch (ret) {
2815 case CHARGE_OK:
2816 break;
2817 case CHARGE_RETRY:
2818 batch = nr_pages;
2819 css_put(&memcg->css);
2820 memcg = NULL;
2821 goto again;
2822 case CHARGE_WOULDBLOCK:
2823 css_put(&memcg->css);
2824 goto nomem;
2825 case CHARGE_NOMEM:
2826 if (!oom || invoke_oom) {
2827 css_put(&memcg->css);
2828 goto nomem;
2829 }
2830 nr_oom_retries--;
2831 break;
2832 }
2833 } while (ret != CHARGE_OK);
2834
2835 if (batch > nr_pages)
2836 refill_stock(memcg, batch - nr_pages);
2837 css_put(&memcg->css);
2838done:
2839 *ptr = memcg;
2840 return 0;
2841nomem:
2842 if (!(gfp_mask & __GFP_NOFAIL)) {
2843 *ptr = NULL;
2844 return -ENOMEM;
2845 }
2846bypass:
2847 *ptr = root_mem_cgroup;
2848 return -EINTR;
2849}
2850
2851
2852
2853
2854
2855
2856static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2857 unsigned int nr_pages)
2858{
2859 if (!mem_cgroup_is_root(memcg)) {
2860 unsigned long bytes = nr_pages * PAGE_SIZE;
2861
2862 res_counter_uncharge(&memcg->res, bytes);
2863 if (do_swap_account)
2864 res_counter_uncharge(&memcg->memsw, bytes);
2865 }
2866}
2867
2868
2869
2870
2871
2872static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2873 unsigned int nr_pages)
2874{
2875 unsigned long bytes = nr_pages * PAGE_SIZE;
2876
2877 if (mem_cgroup_is_root(memcg))
2878 return;
2879
2880 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2881 if (do_swap_account)
2882 res_counter_uncharge_until(&memcg->memsw,
2883 memcg->memsw.parent, bytes);
2884}
2885
2886
2887
2888
2889
2890
2891
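/*
 * Look up a mem_cgroup from the id stored in a swap_cgroup record.
 * An id of zero means no cgroup was recorded and NULL is returned.
 */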
2892static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2893{
2894
2895 if (!id)
2896 return NULL;
2897 return mem_cgroup_from_id(id);
2898}
2899
2900struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2901{
2902 struct mem_cgroup *memcg = NULL;
2903 struct page_cgroup *pc;
2904 unsigned short id;
2905 swp_entry_t ent;
2906
2907 VM_BUG_ON_PAGE(!PageLocked(page), page);
2908
2909 pc = lookup_page_cgroup(page);
2910 lock_page_cgroup(pc);
2911 if (PageCgroupUsed(pc)) {
2912 memcg = pc->mem_cgroup;
2913 if (memcg && !css_tryget(&memcg->css))
2914 memcg = NULL;
2915 } else if (PageSwapCache(page)) {
2916 ent.val = page_private(page);
2917 id = lookup_swap_cgroup_id(ent);
2918 rcu_read_lock();
2919 memcg = mem_cgroup_lookup(id);
2920 if (memcg && !css_tryget(&memcg->css))
2921 memcg = NULL;
2922 rcu_read_unlock();
2923 }
2924 unlock_page_cgroup(pc);
2925 return memcg;
2926}
2927
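/*
 * Commit a charge obtained with __mem_cgroup_try_charge() to @page:
 * bind the page_cgroup to @memcg, mark it used and update the per-memcg
 * statistics.  With @lrucare the page may already be on an LRU list, so
 * it is taken off the list around the ownership switch and put back on
 * the lruvec of its new memcg.
 */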
2928static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2929 struct page *page,
2930 unsigned int nr_pages,
2931 enum charge_type ctype,
2932 bool lrucare)
2933{
2934 struct page_cgroup *pc = lookup_page_cgroup(page);
2935 struct zone *uninitialized_var(zone);
2936 struct lruvec *lruvec;
2937 bool was_on_lru = false;
2938 bool anon;
2939
2940 lock_page_cgroup(pc);
2941 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951 if (lrucare) {
2952 zone = page_zone(page);
2953 spin_lock_irq(&zone->lru_lock);
2954 if (PageLRU(page)) {
2955 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2956 ClearPageLRU(page);
2957 del_page_from_lru_list(page, lruvec, page_lru(page));
2958 was_on_lru = true;
2959 }
2960 }
2961
2962 pc->mem_cgroup = memcg;
2963
2964
2965
2966
2967
2968
2969
2970 smp_wmb();
2971 SetPageCgroupUsed(pc);
2972
2973 if (lrucare) {
2974 if (was_on_lru) {
2975 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2976 VM_BUG_ON_PAGE(PageLRU(page), page);
2977 SetPageLRU(page);
2978 add_page_to_lru_list(page, lruvec, page_lru(page));
2979 }
2980 spin_unlock_irq(&zone->lru_lock);
2981 }
2982
2983 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2984 anon = true;
2985 else
2986 anon = false;
2987
2988 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2989 unlock_page_cgroup(pc);
2990
2991
2992
2993
2994
2995
2996 memcg_check_events(memcg, page);
2997}
2998
2999static DEFINE_MUTEX(set_limit_mutex);
3000
3001#ifdef CONFIG_MEMCG_KMEM
3002static DEFINE_MUTEX(activate_kmem_mutex);
3003
3004static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
3005{
3006 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
3007 memcg_kmem_is_active(memcg);
3008}
3009
3010
3011
3012
3013
3014static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
3015{
3016 struct kmem_cache *cachep;
3017
3018 VM_BUG_ON(p->is_root_cache);
3019 cachep = p->root_cache;
3020 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
3021}
3022
3023#ifdef CONFIG_SLABINFO
3024static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
3025{
3026 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3027 struct memcg_cache_params *params;
3028
3029 if (!memcg_can_account_kmem(memcg))
3030 return -EIO;
3031
3032 print_slabinfo_header(m);
3033
3034 mutex_lock(&memcg->slab_caches_mutex);
3035 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
3036 cache_show(memcg_params_to_cache(params), m);
3037 mutex_unlock(&memcg->slab_caches_mutex);
3038
3039 return 0;
3040}
3041#endif
3042
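/*
 * Charge @size bytes of kernel memory: the kmem counter is charged first,
 * then the regular res/memsw counters via __mem_cgroup_try_charge().  A
 * bypassed (-EINTR) charge is forced into res/memsw with
 * res_counter_charge_nofail() so they stay consistent with the kmem
 * counter.
 */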
3043static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
3044{
3045 struct res_counter *fail_res;
3046 struct mem_cgroup *_memcg;
3047 int ret = 0;
3048
3049 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
3050 if (ret)
3051 return ret;
3052
3053 _memcg = memcg;
3054 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
3055 &_memcg, oom_gfp_allowed(gfp));
3056
3057 if (ret == -EINTR) {
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073 res_counter_charge_nofail(&memcg->res, size, &fail_res);
3074 if (do_swap_account)
3075 res_counter_charge_nofail(&memcg->memsw, size,
3076 &fail_res);
3077 ret = 0;
3078 } else if (ret)
3079 res_counter_uncharge(&memcg->kmem, size);
3080
3081 return ret;
3082}
3083
3084static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3085{
3086 res_counter_uncharge(&memcg->res, size);
3087 if (do_swap_account)
3088 res_counter_uncharge(&memcg->memsw, size);
3089
3090
3091 if (res_counter_uncharge(&memcg->kmem, size))
3092 return;
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102 if (memcg_kmem_test_and_clear_dead(memcg))
3103 css_put(&memcg->css);
3104}
3105
3106
3107
3108
3109
3110
3111int memcg_cache_id(struct mem_cgroup *memcg)
3112{
3113 return memcg ? memcg->kmemcg_id : -1;
3114}
3115
3116static size_t memcg_caches_array_size(int num_groups)
3117{
3118 ssize_t size;
3119 if (num_groups <= 0)
3120 return 0;
3121
3122 size = 2 * num_groups;
3123 if (size < MEMCG_CACHES_MIN_SIZE)
3124 size = MEMCG_CACHES_MIN_SIZE;
3125 else if (size > MEMCG_CACHES_MAX_SIZE)
3126 size = MEMCG_CACHES_MAX_SIZE;
3127
3128 return size;
3129}
3130
3131
3132
3133
3134
3135
3136void memcg_update_array_size(int num)
3137{
3138 if (num > memcg_limited_groups_array_size)
3139 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3140}
3141
3142static void kmem_cache_destroy_work_func(struct work_struct *w);
3143
3144int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3145{
3146 struct memcg_cache_params *cur_params = s->memcg_params;
3147
3148 VM_BUG_ON(!is_root_cache(s));
3149
3150 if (num_groups > memcg_limited_groups_array_size) {
3151 int i;
3152 struct memcg_cache_params *new_params;
3153 ssize_t size = memcg_caches_array_size(num_groups);
3154
3155 size *= sizeof(void *);
3156 size += offsetof(struct memcg_cache_params, memcg_caches);
3157
3158 new_params = kzalloc(size, GFP_KERNEL);
3159 if (!new_params)
3160 return -ENOMEM;
3161
3162 new_params->is_root_cache = true;
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3174 if (!cur_params->memcg_caches[i])
3175 continue;
3176 new_params->memcg_caches[i] =
3177 cur_params->memcg_caches[i];
3178 }
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189 rcu_assign_pointer(s->memcg_params, new_params);
3190 if (cur_params)
3191 kfree_rcu(cur_params, rcu_head);
3192 }
3193 return 0;
3194}
3195
3196int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3197 struct kmem_cache *root_cache)
3198{
3199 size_t size;
3200
3201 if (!memcg_kmem_enabled())
3202 return 0;
3203
3204 if (!memcg) {
3205 size = offsetof(struct memcg_cache_params, memcg_caches);
3206 size += memcg_limited_groups_array_size * sizeof(void *);
3207 } else
3208 size = sizeof(struct memcg_cache_params);
3209
3210 s->memcg_params = kzalloc(size, GFP_KERNEL);
3211 if (!s->memcg_params)
3212 return -ENOMEM;
3213
3214 if (memcg) {
3215 s->memcg_params->memcg = memcg;
3216 s->memcg_params->root_cache = root_cache;
3217 INIT_WORK(&s->memcg_params->destroy,
3218 kmem_cache_destroy_work_func);
3219 } else
3220 s->memcg_params->is_root_cache = true;
3221
3222 return 0;
3223}
3224
3225void memcg_free_cache_params(struct kmem_cache *s)
3226{
3227 kfree(s->memcg_params);
3228}
3229
3230void memcg_register_cache(struct kmem_cache *s)
3231{
3232 struct kmem_cache *root;
3233 struct mem_cgroup *memcg;
3234 int id;
3235
3236 if (is_root_cache(s))
3237 return;
3238
3239
3240
3241
3242
3243 lockdep_assert_held(&slab_mutex);
3244
3245 root = s->memcg_params->root_cache;
3246 memcg = s->memcg_params->memcg;
3247 id = memcg_cache_id(memcg);
3248
3249 css_get(&memcg->css);
3250
3251
3252
3253
3254
3255
3256
3257 smp_wmb();
3258
3259
3260
3261
3262
3263
3264 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3265 root->memcg_params->memcg_caches[id] = s;
3266
3267 mutex_lock(&memcg->slab_caches_mutex);
3268 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3269 mutex_unlock(&memcg->slab_caches_mutex);
3270}
3271
3272void memcg_unregister_cache(struct kmem_cache *s)
3273{
3274 struct kmem_cache *root;
3275 struct mem_cgroup *memcg;
3276 int id;
3277
3278 if (is_root_cache(s))
3279 return;
3280
3281
3282
3283
3284
3285 lockdep_assert_held(&slab_mutex);
3286
3287 root = s->memcg_params->root_cache;
3288 memcg = s->memcg_params->memcg;
3289 id = memcg_cache_id(memcg);
3290
3291 mutex_lock(&memcg->slab_caches_mutex);
3292 list_del(&s->memcg_params->list);
3293 mutex_unlock(&memcg->slab_caches_mutex);
3294
3295
3296
3297
3298
3299
3300 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3301 root->memcg_params->memcg_caches[id] = NULL;
3302
3303 css_put(&memcg->css);
3304}
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
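/*
 * Creating a per-memcg kmem cache allocates memory itself.  To keep those
 * allocations from recursing into kmem accounting, the current task
 * temporarily opts out by bumping memcg_kmem_skip_account around the
 * critical section.
 */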
3325static inline void memcg_stop_kmem_account(void)
3326{
3327 VM_BUG_ON(!current->mm);
3328 current->memcg_kmem_skip_account++;
3329}
3330
3331static inline void memcg_resume_kmem_account(void)
3332{
3333 VM_BUG_ON(!current->mm);
3334 current->memcg_kmem_skip_account--;
3335}
3336
3337static void kmem_cache_destroy_work_func(struct work_struct *w)
3338{
3339 struct kmem_cache *cachep;
3340 struct memcg_cache_params *p;
3341
3342 p = container_of(w, struct memcg_cache_params, destroy);
3343
3344 cachep = memcg_params_to_cache(p);
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3363 kmem_cache_shrink(cachep);
3364 else
3365 kmem_cache_destroy(cachep);
3366}
3367
3368void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3369{
3370 if (!cachep->memcg_params->dead)
3371 return;
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391 if (work_pending(&cachep->memcg_params->destroy))
3392 return;
3393
3394
3395
3396
3397 schedule_work(&cachep->memcg_params->destroy);
3398}
3399
3400static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3401 struct kmem_cache *s)
3402{
3403 struct kmem_cache *new = NULL;
3404 static char *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex);
3406
3407 BUG_ON(!memcg_can_account_kmem(memcg));
3408
3409 mutex_lock(&mutex);
3410
3411
3412
3413
3414
3415
3416 if (!tmp_name) {
3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3418 if (!tmp_name)
3419 goto out;
3420 }
3421
3422 rcu_read_lock();
3423 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3424 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3425 rcu_read_unlock();
3426
3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3428 (s->flags & ~SLAB_PANIC), s->ctor, s);
3429 if (new)
3430 new->allocflags |= __GFP_KMEMCG;
3431 else
3432 new = s;
3433out:
3434 mutex_unlock(&mutex);
3435 return new;
3436}
3437
3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3439{
3440 struct kmem_cache *c;
3441 int i;
3442
3443 if (!s->memcg_params)
3444 return;
3445 if (!s->memcg_params->is_root_cache)
3446 return;
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458 mutex_lock(&activate_kmem_mutex);
3459 for_each_memcg_cache_index(i) {
3460 c = cache_from_memcg_idx(s, i);
3461 if (!c)
3462 continue;
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477 c->memcg_params->dead = false;
3478 cancel_work_sync(&c->memcg_params->destroy);
3479 kmem_cache_destroy(c);
3480 }
3481 mutex_unlock(&activate_kmem_mutex);
3482}
3483
3484struct create_work {
3485 struct mem_cgroup *memcg;
3486 struct kmem_cache *cachep;
3487 struct work_struct work;
3488};
3489
3490static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3491{
3492 struct kmem_cache *cachep;
3493 struct memcg_cache_params *params;
3494
3495 if (!memcg_kmem_is_active(memcg))
3496 return;
3497
3498 mutex_lock(&memcg->slab_caches_mutex);
3499 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3500 cachep = memcg_params_to_cache(params);
3501 cachep->memcg_params->dead = true;
3502 schedule_work(&cachep->memcg_params->destroy);
3503 }
3504 mutex_unlock(&memcg->slab_caches_mutex);
3505}
3506
3507static void memcg_create_cache_work_func(struct work_struct *w)
3508{
3509 struct create_work *cw;
3510
3511 cw = container_of(w, struct create_work, work);
3512 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3513 css_put(&cw->memcg->css);
3514 kfree(cw);
3515}
3516
3517
3518
3519
3520static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3521 struct kmem_cache *cachep)
3522{
3523 struct create_work *cw;
3524
3525 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3526 if (cw == NULL) {
3527 css_put(&memcg->css);
3528 return;
3529 }
3530
3531 cw->memcg = memcg;
3532 cw->cachep = cachep;
3533
3534 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3535 schedule_work(&cw->work);
3536}
3537
3538static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3539 struct kmem_cache *cachep)
3540{
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552 memcg_stop_kmem_account();
3553 __memcg_create_cache_enqueue(memcg, cachep);
3554 memcg_resume_kmem_account();
3555}
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
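/*
 * Return the kmem cache that allocations from the current task should use:
 * the per-memcg copy of @cachep if it exists, otherwise @cachep itself.
 * When the copy is missing, its creation is scheduled asynchronously and
 * the root cache is used for this allocation.
 */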
3569struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3570 gfp_t gfp)
3571{
3572 struct mem_cgroup *memcg;
3573 struct kmem_cache *memcg_cachep;
3574
3575 VM_BUG_ON(!cachep->memcg_params);
3576 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3577
3578 if (!current->mm || current->memcg_kmem_skip_account)
3579 return cachep;
3580
3581 rcu_read_lock();
3582 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3583
3584 if (!memcg_can_account_kmem(memcg))
3585 goto out;
3586
3587 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3588 if (likely(memcg_cachep)) {
3589 cachep = memcg_cachep;
3590 goto out;
3591 }
3592
3593
3594 if (!css_tryget(&memcg->css))
3595 goto out;
3596 rcu_read_unlock();
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615 memcg_create_cache_enqueue(memcg, cachep);
3616 return cachep;
3617out:
3618 rcu_read_unlock();
3619 return cachep;
3620}
3621EXPORT_SYMBOL(__memcg_kmem_get_cache);
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
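/*
 * Charge a kmem allocation of 2^@order pages to the current task's memcg
 * before the page exists.  Returns true if the allocation may proceed;
 * *_memcg is set to the charged memcg so __memcg_kmem_commit_charge() can
 * bind it to the page later, or left NULL when no accounting is needed.
 */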
3637bool
3638__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3639{
3640 struct mem_cgroup *memcg;
3641 int ret;
3642
3643 *_memcg = NULL;
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669 if (!current->mm || current->memcg_kmem_skip_account)
3670 return true;
3671
3672 memcg = try_get_mem_cgroup_from_mm(current->mm);
3673
3674
3675
3676
3677
3678
3679 if (unlikely(!memcg))
3680 return true;
3681
3682 if (!memcg_can_account_kmem(memcg)) {
3683 css_put(&memcg->css);
3684 return true;
3685 }
3686
3687 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3688 if (!ret)
3689 *_memcg = memcg;
3690
3691 css_put(&memcg->css);
3692 return (ret == 0);
3693}
3694
3695void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3696 int order)
3697{
3698 struct page_cgroup *pc;
3699
3700 VM_BUG_ON(mem_cgroup_is_root(memcg));
3701
3702
3703 if (!page) {
3704 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3705 return;
3706 }
3707
3708 pc = lookup_page_cgroup(page);
3709 lock_page_cgroup(pc);
3710 pc->mem_cgroup = memcg;
3711 SetPageCgroupUsed(pc);
3712 unlock_page_cgroup(pc);
3713}
3714
3715void __memcg_kmem_uncharge_pages(struct page *page, int order)
3716{
3717 struct mem_cgroup *memcg = NULL;
3718 struct page_cgroup *pc;
3719
3720
3721 pc = lookup_page_cgroup(page);
3722
3723
3724
3725
3726 if (!PageCgroupUsed(pc))
3727 return;
3728
3729 lock_page_cgroup(pc);
3730 if (PageCgroupUsed(pc)) {
3731 memcg = pc->mem_cgroup;
3732 ClearPageCgroupUsed(pc);
3733 }
3734 unlock_page_cgroup(pc);
3735
3736
3737
3738
3739
3740 if (!memcg)
3741 return;
3742
3743 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3744 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3745}
3746#else
3747static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3748{
3749}
3750#endif
3751
3752#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3753
3754#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3755
3756
3757
3758
3759
3760
3761void mem_cgroup_split_huge_fixup(struct page *head)
3762{
3763 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3764 struct page_cgroup *pc;
3765 struct mem_cgroup *memcg;
3766 int i;
3767
3768 if (mem_cgroup_disabled())
3769 return;
3770
3771 memcg = head_pc->mem_cgroup;
3772 for (i = 1; i < HPAGE_PMD_NR; i++) {
3773 pc = head_pc + i;
3774 pc->mem_cgroup = memcg;
3775 smp_wmb();
3776 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3777 }
3778 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3779 HPAGE_PMD_NR);
3780}
3781#endif
3782
3783static inline
3784void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
3785 struct mem_cgroup *to,
3786 unsigned int nr_pages,
3787 enum mem_cgroup_stat_index idx)
3788{
3789
3790 preempt_disable();
3791 __this_cpu_sub(from->stat->count[idx], nr_pages);
3792 __this_cpu_add(to->stat->count[idx], nr_pages);
3793 preempt_enable();
3794}
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
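/*
 * mem_cgroup_move_account - move the charge of a page between memcgs
 * @page:	page being moved; must not be on an LRU list
 * @nr_pages:	1, or HPAGE_PMD_NR for a THP held under the compound lock
 * @pc:		page_cgroup of @page
 * @from:	memcg the page is currently charged to
 * @to:		memcg to move the charge to
 *
 * Returns 0 on success, -EINVAL if @pc no longer belongs to @from, or
 * -EBUSY if a multi-page move is attempted on a non-THP page.
 */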
3811static int mem_cgroup_move_account(struct page *page,
3812 unsigned int nr_pages,
3813 struct page_cgroup *pc,
3814 struct mem_cgroup *from,
3815 struct mem_cgroup *to)
3816{
3817 unsigned long flags;
3818 int ret;
3819 bool anon = PageAnon(page);
3820
3821 VM_BUG_ON(from == to);
3822 VM_BUG_ON_PAGE(PageLRU(page), page);
3823
3824
3825
3826
3827
3828
3829 ret = -EBUSY;
3830 if (nr_pages > 1 && !PageTransHuge(page))
3831 goto out;
3832
3833 lock_page_cgroup(pc);
3834
3835 ret = -EINVAL;
3836 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3837 goto unlock;
3838
3839 move_lock_mem_cgroup(from, &flags);
3840
3841 if (!anon && page_mapped(page))
3842 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3843 MEM_CGROUP_STAT_FILE_MAPPED);
3844
3845 if (PageWriteback(page))
3846 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3847 MEM_CGROUP_STAT_WRITEBACK);
3848
3849 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3850
3851
3852 pc->mem_cgroup = to;
3853 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3854 move_unlock_mem_cgroup(from, &flags);
3855 ret = 0;
3856unlock:
3857 unlock_page_cgroup(pc);
3858
3859
3860
3861 memcg_check_events(to, page);
3862 memcg_check_events(from, page);
3863out:
3864 return ret;
3865}
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
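/*
 * Move the charge of @page from @child to its parent (root_mem_cgroup if
 * @child has no parent), isolating the page from the LRU for the duration
 * of the move.  On success the child's local charge is dropped with
 * __mem_cgroup_cancel_local_charge().  Used when emptying a cgroup so
 * its charges are not simply lost.
 */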
3888static int mem_cgroup_move_parent(struct page *page,
3889 struct page_cgroup *pc,
3890 struct mem_cgroup *child)
3891{
3892 struct mem_cgroup *parent;
3893 unsigned int nr_pages;
3894 unsigned long uninitialized_var(flags);
3895 int ret;
3896
3897 VM_BUG_ON(mem_cgroup_is_root(child));
3898
3899 ret = -EBUSY;
3900 if (!get_page_unless_zero(page))
3901 goto out;
3902 if (isolate_lru_page(page))
3903 goto put;
3904
3905 nr_pages = hpage_nr_pages(page);
3906
3907 parent = parent_mem_cgroup(child);
3908
3909
3910
3911 if (!parent)
3912 parent = root_mem_cgroup;
3913
3914 if (nr_pages > 1) {
3915 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3916 flags = compound_lock_irqsave(page);
3917 }
3918
3919 ret = mem_cgroup_move_account(page, nr_pages,
3920 pc, child, parent);
3921 if (!ret)
3922 __mem_cgroup_cancel_local_charge(child, nr_pages);
3923
3924 if (nr_pages > 1)
3925 compound_unlock_irqrestore(page, flags);
3926 putback_lru_page(page);
3927put:
3928 put_page(page);
3929out:
3930 return ret;
3931}
3932
3933
3934
3935
3936
3937
3938
3939static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3940 gfp_t gfp_mask, enum charge_type ctype)
3941{
3942 struct mem_cgroup *memcg = NULL;
3943 unsigned int nr_pages = 1;
3944 bool oom = true;
3945 int ret;
3946
3947 if (PageTransHuge(page)) {
3948 nr_pages <<= compound_order(page);
3949 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3950
3951
3952
3953
3954 oom = false;
3955 }
3956
3957 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3958 if (ret == -ENOMEM)
3959 return ret;
3960 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3961 return 0;
3962}
3963
3964int mem_cgroup_newpage_charge(struct page *page,
3965 struct mm_struct *mm, gfp_t gfp_mask)
3966{
3967 if (mem_cgroup_disabled())
3968 return 0;
3969 VM_BUG_ON_PAGE(page_mapped(page), page);
3970 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3971 VM_BUG_ON(!mm);
3972 return mem_cgroup_charge_common(page, mm, gfp_mask,
3973 MEM_CGROUP_CHARGE_TYPE_ANON);
3974}
3975
3976
3977
3978
3979
3980
3981
3982static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3983 struct page *page,
3984 gfp_t mask,
3985 struct mem_cgroup **memcgp)
3986{
3987 struct mem_cgroup *memcg;
3988 struct page_cgroup *pc;
3989 int ret;
3990
3991 pc = lookup_page_cgroup(page);
3992
3993
3994
3995
3996
3997
3998
3999 if (PageCgroupUsed(pc))
4000 return 0;
4001 if (!do_swap_account)
4002 goto charge_cur_mm;
4003 memcg = try_get_mem_cgroup_from_page(page);
4004 if (!memcg)
4005 goto charge_cur_mm;
4006 *memcgp = memcg;
4007 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
4008 css_put(&memcg->css);
4009 if (ret == -EINTR)
4010 ret = 0;
4011 return ret;
4012charge_cur_mm:
4013 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
4014 if (ret == -EINTR)
4015 ret = 0;
4016 return ret;
4017}
4018
4019int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
4020 gfp_t gfp_mask, struct mem_cgroup **memcgp)
4021{
4022 *memcgp = NULL;
4023 if (mem_cgroup_disabled())
4024 return 0;
4025
4026
4027
4028
4029
4030
4031 if (!PageSwapCache(page)) {
4032 int ret;
4033
4034 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
4035 if (ret == -EINTR)
4036 ret = 0;
4037 return ret;
4038 }
4039 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
4040}
4041
4042void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
4043{
4044 if (mem_cgroup_disabled())
4045 return;
4046 if (!memcg)
4047 return;
4048 __mem_cgroup_cancel_charge(memcg, 1);
4049}
4050
4051static void
4052__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
4053 enum charge_type ctype)
4054{
4055 if (mem_cgroup_disabled())
4056 return;
4057 if (!memcg)
4058 return;
4059
4060 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
4061
4062
4063
4064
4065
4066
4067
4068 if (do_swap_account && PageSwapCache(page)) {
4069 swp_entry_t ent = {.val = page_private(page)};
4070 mem_cgroup_uncharge_swap(ent);
4071 }
4072}
4073
4074void mem_cgroup_commit_charge_swapin(struct page *page,
4075 struct mem_cgroup *memcg)
4076{
4077 __mem_cgroup_commit_charge_swapin(page, memcg,
4078 MEM_CGROUP_CHARGE_TYPE_ANON);
4079}
4080
4081int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4082 gfp_t gfp_mask)
4083{
4084 struct mem_cgroup *memcg = NULL;
4085 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4086 int ret;
4087
4088 if (mem_cgroup_disabled())
4089 return 0;
4090 if (PageCompound(page))
4091 return 0;
4092
4093 if (!PageSwapCache(page))
4094 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4095 else {
4096 ret = __mem_cgroup_try_charge_swapin(mm, page,
4097 gfp_mask, &memcg);
4098 if (!ret)
4099 __mem_cgroup_commit_charge_swapin(page, memcg, type);
4100 }
4101 return ret;
4102}
4103
4104static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4105 unsigned int nr_pages,
4106 const enum charge_type ctype)
4107{
4108 struct memcg_batch_info *batch = NULL;
4109 bool uncharge_memsw = true;
4110
4111
4112 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4113 uncharge_memsw = false;
4114
4115	batch = &current->memcg_batch;
4116
4117
4118
4119
4120
4121 if (!batch->memcg)
4122 batch->memcg = memcg;
4123
4124
4125
4126
4127
4128
4129
4130
4131 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4132 goto direct_uncharge;
4133
4134 if (nr_pages > 1)
4135 goto direct_uncharge;
4136
4137
4138
4139
4140
4141
4142 if (batch->memcg != memcg)
4143 goto direct_uncharge;
4144
4145 batch->nr_pages++;
4146 if (uncharge_memsw)
4147 batch->memsw_nr_pages++;
4148 return;
4149direct_uncharge:
4150 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4151 if (uncharge_memsw)
4152 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4153 if (unlikely(batch->memcg != memcg))
4154 memcg_oom_recover(memcg);
4155}
4156
4157
4158
4159
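/*
 * Common uncharge path: clear the page_cgroup used bit, update statistics
 * and, unless this is the tail end of a migration or the memcg is root,
 * give the charge back through mem_cgroup_do_uncharge().  Returns the
 * memcg the page was charged to, or NULL when nothing was uncharged
 * (page not charged, still mapped, under migration, ...).
 */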
4160static struct mem_cgroup *
4161__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4162 bool end_migration)
4163{
4164 struct mem_cgroup *memcg = NULL;
4165 unsigned int nr_pages = 1;
4166 struct page_cgroup *pc;
4167 bool anon;
4168
4169 if (mem_cgroup_disabled())
4170 return NULL;
4171
4172 if (PageTransHuge(page)) {
4173 nr_pages <<= compound_order(page);
4174 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4175 }
4176
4177
4178
4179 pc = lookup_page_cgroup(page);
4180 if (unlikely(!PageCgroupUsed(pc)))
4181 return NULL;
4182
4183 lock_page_cgroup(pc);
4184
4185 memcg = pc->mem_cgroup;
4186
4187 if (!PageCgroupUsed(pc))
4188 goto unlock_out;
4189
4190 anon = PageAnon(page);
4191
4192 switch (ctype) {
4193 case MEM_CGROUP_CHARGE_TYPE_ANON:
4194
4195
4196
4197
4198
4199 anon = true;
4200
4201 case MEM_CGROUP_CHARGE_TYPE_DROP:
4202
4203 if (page_mapped(page))
4204 goto unlock_out;
4205
4206
4207
4208
4209
4210
4211
4212 if (!end_migration && PageCgroupMigration(pc))
4213 goto unlock_out;
4214 break;
4215 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4216 if (!PageAnon(page)) {
4217 if (page->mapping && !page_is_file_cache(page))
4218 goto unlock_out;
4219 } else if (page_mapped(page))
4220 goto unlock_out;
4221 break;
4222 default:
4223 break;
4224 }
4225
4226 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4227
4228 ClearPageCgroupUsed(pc);
4229
4230
4231
4232
4233
4234
4235
4236 unlock_page_cgroup(pc);
4237
4238
4239
4240
4241 memcg_check_events(memcg, page);
4242 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4243 mem_cgroup_swap_statistics(memcg, true);
4244 css_get(&memcg->css);
4245 }
4246
4247
4248
4249
4250
4251 if (!end_migration && !mem_cgroup_is_root(memcg))
4252 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4253
4254 return memcg;
4255
4256unlock_out:
4257 unlock_page_cgroup(pc);
4258 return NULL;
4259}
4260
4261void mem_cgroup_uncharge_page(struct page *page)
4262{
4263
4264 if (page_mapped(page))
4265 return;
4266 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279 if (PageSwapCache(page))
4280 return;
4281 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4282}
4283
4284void mem_cgroup_uncharge_cache_page(struct page *page)
4285{
4286 VM_BUG_ON_PAGE(page_mapped(page), page);
4287 VM_BUG_ON_PAGE(page->mapping, page);
4288 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4289}
4290
4291
4292
4293
4294
4295
4296
4297
4298
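/*
 * Batched uncharging: between mem_cgroup_uncharge_start() and
 * mem_cgroup_uncharge_end(), single-page uncharges against the same memcg
 * are accumulated in current->memcg_batch and the res_counters are
 * decremented once at the end, reducing counter lock traffic during bulk
 * page freeing such as truncation.
 */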
4299void mem_cgroup_uncharge_start(void)
4300{
4301 current->memcg_batch.do_batch++;
4302
4303 if (current->memcg_batch.do_batch == 1) {
4304 current->memcg_batch.memcg = NULL;
4305 current->memcg_batch.nr_pages = 0;
4306 current->memcg_batch.memsw_nr_pages = 0;
4307 }
4308}
4309
4310void mem_cgroup_uncharge_end(void)
4311{
4312	struct memcg_batch_info *batch = &current->memcg_batch;
4313
4314 if (!batch->do_batch)
4315 return;
4316
4317 batch->do_batch--;
4318 if (batch->do_batch)
4319 return;
4320
4321 if (!batch->memcg)
4322 return;
4323
4324
4325
4326
4327 if (batch->nr_pages)
4328 res_counter_uncharge(&batch->memcg->res,
4329 batch->nr_pages * PAGE_SIZE);
4330 if (batch->memsw_nr_pages)
4331 res_counter_uncharge(&batch->memcg->memsw,
4332 batch->memsw_nr_pages * PAGE_SIZE);
4333 memcg_oom_recover(batch->memcg);
4334
4335 batch->memcg = NULL;
4336}
4337
4338#ifdef CONFIG_SWAP
4339
4340
4341
4342
4343void
4344mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4345{
4346 struct mem_cgroup *memcg;
4347 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4348
4349 if (!swapout)
4350 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4351
4352 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4353
4354
4355
4356
4357
4358 if (do_swap_account && swapout && memcg)
4359 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4360}
4361#endif
4362
4363#ifdef CONFIG_MEMCG_SWAP
4364
4365
4366
4367
4368void mem_cgroup_uncharge_swap(swp_entry_t ent)
4369{
4370 struct mem_cgroup *memcg;
4371 unsigned short id;
4372
4373 if (!do_swap_account)
4374 return;
4375
4376 id = swap_cgroup_record(ent, 0);
4377 rcu_read_lock();
4378 memcg = mem_cgroup_lookup(id);
4379 if (memcg) {
4380
4381
4382
4383
4384 if (!mem_cgroup_is_root(memcg))
4385 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4386 mem_cgroup_swap_statistics(memcg, false);
4387 css_put(&memcg->css);
4388 }
4389 rcu_read_unlock();
4390}
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406static int mem_cgroup_move_swap_account(swp_entry_t entry,
4407 struct mem_cgroup *from, struct mem_cgroup *to)
4408{
4409 unsigned short old_id, new_id;
4410
4411 old_id = mem_cgroup_id(from);
4412 new_id = mem_cgroup_id(to);
4413
4414 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4415 mem_cgroup_swap_statistics(from, false);
4416 mem_cgroup_swap_statistics(to, true);
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428 css_get(&to->css);
4429 return 0;
4430 }
4431 return -EINVAL;
4432}
4433#else
4434static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4435 struct mem_cgroup *from, struct mem_cgroup *to)
4436{
4437 return -EINVAL;
4438}
4439#endif
4440
4441
4442
4443
4444
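/*
 * Pin the memcg that @page is charged to across a page migration and mark
 * @newpage as charged to the same memcg.  For anonymous pages the
 * PCG_MIGRATION flag keeps a concurrent unmap from uncharging the page
 * mid-migration; mem_cgroup_end_migration() clears the flag and uncharges
 * whichever of the two pages ended up unused.
 */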
4445void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4446 struct mem_cgroup **memcgp)
4447{
4448 struct mem_cgroup *memcg = NULL;
4449 unsigned int nr_pages = 1;
4450 struct page_cgroup *pc;
4451 enum charge_type ctype;
4452
4453 *memcgp = NULL;
4454
4455 if (mem_cgroup_disabled())
4456 return;
4457
4458 if (PageTransHuge(page))
4459 nr_pages <<= compound_order(page);
4460
4461 pc = lookup_page_cgroup(page);
4462 lock_page_cgroup(pc);
4463 if (PageCgroupUsed(pc)) {
4464 memcg = pc->mem_cgroup;
4465 css_get(&memcg->css);
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495 if (PageAnon(page))
4496 SetPageCgroupMigration(pc);
4497 }
4498 unlock_page_cgroup(pc);
4499
4500
4501
4502
4503 if (!memcg)
4504 return;
4505
4506 *memcgp = memcg;
4507
4508
4509
4510
4511
4512
4513 if (PageAnon(page))
4514 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4515 else
4516 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4517
4518
4519
4520
4521
4522 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4523}
4524
4525
4526void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4527 struct page *oldpage, struct page *newpage, bool migration_ok)
4528{
4529 struct page *used, *unused;
4530 struct page_cgroup *pc;
4531 bool anon;
4532
4533 if (!memcg)
4534 return;
4535
4536 if (!migration_ok) {
4537 used = oldpage;
4538 unused = newpage;
4539 } else {
4540 used = newpage;
4541 unused = oldpage;
4542 }
4543 anon = PageAnon(used);
4544 __mem_cgroup_uncharge_common(unused,
4545 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4546 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4547 true);
4548 css_put(&memcg->css);
4549
4550
4551
4552
4553
4554 pc = lookup_page_cgroup(oldpage);
4555 lock_page_cgroup(pc);
4556 ClearPageCgroupMigration(pc);
4557 unlock_page_cgroup(pc);
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567 if (anon)
4568 mem_cgroup_uncharge_page(used);
4569}
4570
4571
4572
4573
4574
4575
4576void mem_cgroup_replace_page_cache(struct page *oldpage,
4577 struct page *newpage)
4578{
4579 struct mem_cgroup *memcg = NULL;
4580 struct page_cgroup *pc;
4581 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4582
4583 if (mem_cgroup_disabled())
4584 return;
4585
4586 pc = lookup_page_cgroup(oldpage);
4587
4588 lock_page_cgroup(pc);
4589 if (PageCgroupUsed(pc)) {
4590 memcg = pc->mem_cgroup;
4591 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4592 ClearPageCgroupUsed(pc);
4593 }
4594 unlock_page_cgroup(pc);
4595
4596
4597
4598
4599
4600 if (!memcg)
4601 return;
4602
4603
4604
4605
4606
4607 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4608}
4609
4610#ifdef CONFIG_DEBUG_VM
4611static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4612{
4613 struct page_cgroup *pc;
4614
4615 pc = lookup_page_cgroup(page);
4616
4617
4618
4619
4620
4621 if (likely(pc) && PageCgroupUsed(pc))
4622 return pc;
4623 return NULL;
4624}
4625
4626bool mem_cgroup_bad_page_check(struct page *page)
4627{
4628 if (mem_cgroup_disabled())
4629 return false;
4630
4631 return lookup_page_cgroup_used(page) != NULL;
4632}
4633
4634void mem_cgroup_print_bad_page(struct page *page)
4635{
4636 struct page_cgroup *pc;
4637
4638 pc = lookup_page_cgroup_used(page);
4639 if (pc) {
4640 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4641 pc, pc->flags, pc->mem_cgroup);
4642 }
4643}
4644#endif
4645
4646static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4647 unsigned long long val)
4648{
4649 int retry_count;
4650 u64 memswlimit, memlimit;
4651 int ret = 0;
4652 int children = mem_cgroup_count_children(memcg);
4653 u64 curusage, oldusage;
4654 int enlarge;
4655
4656
4657
4658
4659
4660
4661 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4662
4663 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4664
4665 enlarge = 0;
4666 while (retry_count) {
4667 if (signal_pending(current)) {
4668 ret = -EINTR;
4669 break;
4670 }
4671
4672
4673
4674
4675
4676 mutex_lock(&set_limit_mutex);
4677 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4678 if (memswlimit < val) {
4679 ret = -EINVAL;
4680 mutex_unlock(&set_limit_mutex);
4681 break;
4682 }
4683
4684 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4685 if (memlimit < val)
4686 enlarge = 1;
4687
4688 ret = res_counter_set_limit(&memcg->res, val);
4689 if (!ret) {
4690 if (memswlimit == val)
4691 memcg->memsw_is_minimum = true;
4692 else
4693 memcg->memsw_is_minimum = false;
4694 }
4695 mutex_unlock(&set_limit_mutex);
4696
4697 if (!ret)
4698 break;
4699
4700 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4701 MEM_CGROUP_RECLAIM_SHRINK);
4702 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4703
4704 if (curusage >= oldusage)
4705 retry_count--;
4706 else
4707 oldusage = curusage;
4708 }
4709 if (!ret && enlarge)
4710 memcg_oom_recover(memcg);
4711
4712 return ret;
4713}
4714
4715static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4716 unsigned long long val)
4717{
4718 int retry_count;
4719 u64 memlimit, memswlimit, oldusage, curusage;
4720 int children = mem_cgroup_count_children(memcg);
4721 int ret = -EBUSY;
4722 int enlarge = 0;
4723
4724
4725 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4726 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4727 while (retry_count) {
4728 if (signal_pending(current)) {
4729 ret = -EINTR;
4730 break;
4731 }
4732
4733
4734
4735
4736
4737 mutex_lock(&set_limit_mutex);
4738 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4739 if (memlimit > val) {
4740 ret = -EINVAL;
4741 mutex_unlock(&set_limit_mutex);
4742 break;
4743 }
4744 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4745 if (memswlimit < val)
4746 enlarge = 1;
4747 ret = res_counter_set_limit(&memcg->memsw, val);
4748 if (!ret) {
4749 if (memlimit == val)
4750 memcg->memsw_is_minimum = true;
4751 else
4752 memcg->memsw_is_minimum = false;
4753 }
4754 mutex_unlock(&set_limit_mutex);
4755
4756 if (!ret)
4757 break;
4758
4759 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4760 MEM_CGROUP_RECLAIM_NOSWAP |
4761 MEM_CGROUP_RECLAIM_SHRINK);
4762 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4763
4764 if (curusage >= oldusage)
4765 retry_count--;
4766 else
4767 oldusage = curusage;
4768 }
4769 if (!ret && enlarge)
4770 memcg_oom_recover(memcg);
4771 return ret;
4772}
4773
4774unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4775 gfp_t gfp_mask,
4776 unsigned long *total_scanned)
4777{
4778 unsigned long nr_reclaimed = 0;
4779 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4780 unsigned long reclaimed;
4781 int loop = 0;
4782 struct mem_cgroup_tree_per_zone *mctz;
4783 unsigned long long excess;
4784 unsigned long nr_scanned;
4785
4786 if (order > 0)
4787 return 0;
4788
4789 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4790
4791
4792
4793
4794
4795 do {
4796 if (next_mz)
4797 mz = next_mz;
4798 else
4799 mz = mem_cgroup_largest_soft_limit_node(mctz);
4800 if (!mz)
4801 break;
4802
4803 nr_scanned = 0;
4804 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4805 gfp_mask, &nr_scanned);
4806 nr_reclaimed += reclaimed;
4807 *total_scanned += nr_scanned;
4808 spin_lock(&mctz->lock);
4809
4810
4811
4812
4813
4814 next_mz = NULL;
4815 if (!reclaimed) {
4816 do {
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828 next_mz =
4829 __mem_cgroup_largest_soft_limit_node(mctz);
4830 if (next_mz == mz)
4831 css_put(&next_mz->memcg->css);
4832 else
4833 break;
4834 } while (1);
4835 }
4836 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4837 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4848 spin_unlock(&mctz->lock);
4849 css_put(&mz->memcg->css);
4850 loop++;
4851
4852
4853
4854
4855
4856 if (!nr_reclaimed &&
4857 (next_mz == NULL ||
4858 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4859 break;
4860 } while (!nr_reclaimed);
4861 if (next_mz)
4862 css_put(&next_mz->memcg->css);
4863 return nr_reclaimed;
4864}
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4878 int node, int zid, enum lru_list lru)
4879{
4880 struct lruvec *lruvec;
4881 unsigned long flags;
4882 struct list_head *list;
4883 struct page *busy;
4884 struct zone *zone;
4885
4886 zone = &NODE_DATA(node)->node_zones[zid];
4887 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4888 list = &lruvec->lists[lru];
4889
4890 busy = NULL;
4891 do {
4892 struct page_cgroup *pc;
4893 struct page *page;
4894
4895 spin_lock_irqsave(&zone->lru_lock, flags);
4896 if (list_empty(list)) {
4897 spin_unlock_irqrestore(&zone->lru_lock, flags);
4898 break;
4899 }
4900 page = list_entry(list->prev, struct page, lru);
4901 if (busy == page) {
4902 list_move(&page->lru, list);
4903 busy = NULL;
4904 spin_unlock_irqrestore(&zone->lru_lock, flags);
4905 continue;
4906 }
4907 spin_unlock_irqrestore(&zone->lru_lock, flags);
4908
4909 pc = lookup_page_cgroup(page);
4910
4911 if (mem_cgroup_move_parent(page, pc, memcg)) {
4912
4913 busy = page;
4914 cond_resched();
4915 } else
4916 busy = NULL;
4917 } while (!list_empty(list));
4918}
4919
4920
4921
4922
4923
4924
4925
4926
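/*
 * Move as many charges as possible from @memcg to its parent by walking
 * every LRU list on every node and zone, retrying until the user-visible
 * usage (res minus kmem) drops to zero.  Kernel-memory charges cannot be
 * reparented this way, which is why they are subtracted from the usage
 * being waited on.
 */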
4927static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4928{
4929 int node, zid;
4930 u64 usage;
4931
4932 do {
4933
4934 lru_add_drain_all();
4935 drain_all_stock_sync(memcg);
4936 mem_cgroup_start_move(memcg);
4937 for_each_node_state(node, N_MEMORY) {
4938 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4939 enum lru_list lru;
4940 for_each_lru(lru) {
4941 mem_cgroup_force_empty_list(memcg,
4942 node, zid, lru);
4943 }
4944 }
4945 }
4946 mem_cgroup_end_move(memcg);
4947 memcg_oom_recover(memcg);
4948 cond_resched();
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4963 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4964 } while (usage > 0);
4965}
4966
4967static inline bool memcg_has_children(struct mem_cgroup *memcg)
4968{
4969 lockdep_assert_held(&memcg_create_mutex);
4970
4971
4972
4973
4974
4975
4976
4977 return memcg->use_hierarchy &&
4978 !list_empty(&memcg->css.cgroup->children);
4979}
4980
4981
4982
4983
4984
4985
4986
4987static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4988{
4989 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4990 struct cgroup *cgrp = memcg->css.cgroup;
4991
4992
4993 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4994 return -EBUSY;
4995
4996
4997 lru_add_drain_all();
4998
4999 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
5000 int progress;
5001
5002 if (signal_pending(current))
5003 return -EINTR;
5004
5005 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
5006 false);
5007 if (!progress) {
5008 nr_retries--;
5009
5010 congestion_wait(BLK_RW_ASYNC, HZ/10);
5011 }
5012
5013 }
5014 lru_add_drain();
5015 mem_cgroup_reparent_charges(memcg);
5016
5017 return 0;
5018}
5019
5020static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
5021 unsigned int event)
5022{
5023 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5024
5025 if (mem_cgroup_is_root(memcg))
5026 return -EINVAL;
5027 return mem_cgroup_force_empty(memcg);
5028}
5029
5030static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
5031 struct cftype *cft)
5032{
5033 return mem_cgroup_from_css(css)->use_hierarchy;
5034}
5035
5036static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
5037 struct cftype *cft, u64 val)
5038{
5039 int retval = 0;
5040 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5041 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5042
5043 mutex_lock(&memcg_create_mutex);
5044
5045 if (memcg->use_hierarchy == val)
5046 goto out;
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
5057 (val == 1 || val == 0)) {
5058 if (list_empty(&memcg->css.cgroup->children))
5059 memcg->use_hierarchy = val;
5060 else
5061 retval = -EBUSY;
5062 } else
5063 retval = -EINVAL;
5064
5065out:
5066 mutex_unlock(&memcg_create_mutex);
5067
5068 return retval;
5069}
5070
5071
5072static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
5073 enum mem_cgroup_stat_index idx)
5074{
5075 struct mem_cgroup *iter;
5076 long val = 0;
5077
5078
5079 for_each_mem_cgroup_tree(iter, memcg)
5080 val += mem_cgroup_read_stat(iter, idx);
5081
5082 if (val < 0)
5083 val = 0;
5084 return val;
5085}
5086
5087static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5088{
5089 u64 val;
5090
5091 if (!mem_cgroup_is_root(memcg)) {
5092 if (!swap)
5093 return res_counter_read_u64(&memcg->res, RES_USAGE);
5094 else
5095 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
5096 }
5097
5098
5099
5100
5101
5102 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
5103 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
5104
5105 if (swap)
5106 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
5107
5108 return val << PAGE_SHIFT;
5109}
5110
5111static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5112 struct cftype *cft)
5113{
5114 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5115 u64 val;
5116 int name;
5117 enum res_type type;
5118
5119 type = MEMFILE_TYPE(cft->private);
5120 name = MEMFILE_ATTR(cft->private);
5121
5122 switch (type) {
5123 case _MEM:
5124 if (name == RES_USAGE)
5125 val = mem_cgroup_usage(memcg, false);
5126 else
5127 val = res_counter_read_u64(&memcg->res, name);
5128 break;
5129 case _MEMSWAP:
5130 if (name == RES_USAGE)
5131 val = mem_cgroup_usage(memcg, true);
5132 else
5133 val = res_counter_read_u64(&memcg->memsw, name);
5134 break;
5135 case _KMEM:
5136 val = res_counter_read_u64(&memcg->kmem, name);
5137 break;
5138 default:
5139 BUG();
5140 }
5141
5142 return val;
5143}
5144
5145#ifdef CONFIG_MEMCG_KMEM
5146
5147static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5148 unsigned long long limit)
5149{
5150 int err = 0;
5151 int memcg_id;
5152
5153 if (memcg_kmem_is_active(memcg))
5154 return 0;
5155
5156
5157
5158
5159
5160 memcg_stop_kmem_account();
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174 mutex_lock(&memcg_create_mutex);
5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
5176 err = -EBUSY;
5177 mutex_unlock(&memcg_create_mutex);
5178 if (err)
5179 goto out;
5180
5181 memcg_id = ida_simple_get(&kmem_limited_groups,
5182 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5183 if (memcg_id < 0) {
5184 err = memcg_id;
5185 goto out;
5186 }
5187
5188
5189
5190
5191
5192 err = memcg_update_all_caches(memcg_id + 1);
5193 if (err)
5194 goto out_rmid;
5195
5196 memcg->kmemcg_id = memcg_id;
5197 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5198 mutex_init(&memcg->slab_caches_mutex);
5199
5200
5201
5202
5203
5204 err = res_counter_set_limit(&memcg->kmem, limit);
5205 VM_BUG_ON(err);
5206
5207 static_key_slow_inc(&memcg_kmem_enabled_key);
5208
5209
5210
5211
5212
5213 memcg_kmem_set_active(memcg);
5214out:
5215 memcg_resume_kmem_account();
5216 return err;
5217
5218out_rmid:
5219 ida_simple_remove(&kmem_limited_groups, memcg_id);
5220 goto out;
5221}
5222
5223static int memcg_activate_kmem(struct mem_cgroup *memcg,
5224 unsigned long long limit)
5225{
5226 int ret;
5227
5228 mutex_lock(&activate_kmem_mutex);
5229 ret = __memcg_activate_kmem(memcg, limit);
5230 mutex_unlock(&activate_kmem_mutex);
5231 return ret;
5232}
5233
5234static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5235 unsigned long long val)
5236{
5237 int ret;
5238
5239 if (!memcg_kmem_is_active(memcg))
5240 ret = memcg_activate_kmem(memcg, val);
5241 else
5242 ret = res_counter_set_limit(&memcg->kmem, val);
5243 return ret;
5244}
5245
5246static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5247{
5248 int ret = 0;
5249 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5250
5251 if (!parent)
5252 return 0;
5253
5254 mutex_lock(&activate_kmem_mutex);
5255
5256
5257
5258
5259 if (memcg_kmem_is_active(parent))
5260 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5261 mutex_unlock(&activate_kmem_mutex);
5262 return ret;
5263}
5264#else
5265static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5266 unsigned long long val)
5267{
5268 return -EINVAL;
5269}
5270#endif
5271
5272
5273
5274
5275
5276static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5277 const char *buffer)
5278{
5279 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5280 enum res_type type;
5281 int name;
5282 unsigned long long val;
5283 int ret;
5284
5285 type = MEMFILE_TYPE(cft->private);
5286 name = MEMFILE_ATTR(cft->private);
5287
5288 switch (name) {
5289 case RES_LIMIT:
5290 if (mem_cgroup_is_root(memcg)) {
5291 ret = -EINVAL;
5292 break;
5293 }
5294
5295 ret = res_counter_memparse_write_strategy(buffer, &val);
5296 if (ret)
5297 break;
5298 if (type == _MEM)
5299 ret = mem_cgroup_resize_limit(memcg, val);
5300 else if (type == _MEMSWAP)
5301 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5302 else if (type == _KMEM)
5303 ret = memcg_update_kmem_limit(memcg, val);
5304 else
5305 return -EINVAL;
5306 break;
5307 case RES_SOFT_LIMIT:
5308 ret = res_counter_memparse_write_strategy(buffer, &val);
5309 if (ret)
5310 break;
5311
5312
5313
5314
5315
5316 if (type == _MEM)
5317 ret = res_counter_set_soft_limit(&memcg->res, val);
5318 else
5319 ret = -EINVAL;
5320 break;
5321 default:
5322 ret = -EINVAL;
5323 break;
5324 }
5325 return ret;
5326}
5327
5328static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5329 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5330{
5331 unsigned long long min_limit, min_memsw_limit, tmp;
5332
5333 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5334 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5335 if (!memcg->use_hierarchy)
5336 goto out;
5337
5338 while (css_parent(&memcg->css)) {
5339 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5340 if (!memcg->use_hierarchy)
5341 break;
5342 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5343 min_limit = min(min_limit, tmp);
5344 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5345 min_memsw_limit = min(min_memsw_limit, tmp);
5346 }
5347out:
5348 *mem_limit = min_limit;
5349 *memsw_limit = min_memsw_limit;
5350}
5351
5352static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5353{
5354 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5355 int name;
5356 enum res_type type;
5357
5358 type = MEMFILE_TYPE(event);
5359 name = MEMFILE_ATTR(event);
5360
5361 switch (name) {
5362 case RES_MAX_USAGE:
5363 if (type == _MEM)
5364 res_counter_reset_max(&memcg->res);
5365 else if (type == _MEMSWAP)
5366 res_counter_reset_max(&memcg->memsw);
5367 else if (type == _KMEM)
5368 res_counter_reset_max(&memcg->kmem);
5369 else
5370 return -EINVAL;
5371 break;
5372 case RES_FAILCNT:
5373 if (type == _MEM)
5374 res_counter_reset_failcnt(&memcg->res);
5375 else if (type == _MEMSWAP)
5376 res_counter_reset_failcnt(&memcg->memsw);
5377 else if (type == _KMEM)
5378 res_counter_reset_failcnt(&memcg->kmem);
5379 else
5380 return -EINVAL;
5381 break;
5382 }
5383
5384 return 0;
5385}
5386
5387static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5388 struct cftype *cft)
5389{
5390 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5391}
5392
5393#ifdef CONFIG_MMU
5394static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5395 struct cftype *cft, u64 val)
5396{
5397 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5398
5399 if (val >= (1 << NR_MOVE_TYPE))
5400 return -EINVAL;
5401
5402
5403
5404
5405
5406
5407
5408 memcg->move_charge_at_immigrate = val;
5409 return 0;
5410}
5411#else
5412static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5413 struct cftype *cft, u64 val)
5414{
5415 return -ENOSYS;
5416}
5417#endif
5418
5419#ifdef CONFIG_NUMA
5420static int memcg_numa_stat_show(struct seq_file *m, void *v)
5421{
5422 struct numa_stat {
5423 const char *name;
5424 unsigned int lru_mask;
5425 };
5426
5427 static const struct numa_stat stats[] = {
5428 { "total", LRU_ALL },
5429 { "file", LRU_ALL_FILE },
5430 { "anon", LRU_ALL_ANON },
5431 { "unevictable", BIT(LRU_UNEVICTABLE) },
5432 };
5433 const struct numa_stat *stat;
5434 int nid;
5435 unsigned long nr;
5436 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5437
5438 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5439 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
5440 seq_printf(m, "%s=%lu", stat->name, nr);
5441 for_each_node_state(nid, N_MEMORY) {
5442 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5443 stat->lru_mask);
5444 seq_printf(m, " N%d=%lu", nid, nr);
5445 }
5446 seq_putc(m, '\n');
5447 }
5448
5449 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5450 struct mem_cgroup *iter;
5451
5452 nr = 0;
5453 for_each_mem_cgroup_tree(iter, memcg)
5454 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
5455 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
5456 for_each_node_state(nid, N_MEMORY) {
5457 nr = 0;
5458 for_each_mem_cgroup_tree(iter, memcg)
5459 nr += mem_cgroup_node_nr_lru_pages(
5460 iter, nid, stat->lru_mask);
5461 seq_printf(m, " N%d=%lu", nid, nr);
5462 }
5463 seq_putc(m, '\n');
5464 }
5465
5466 return 0;
5467}
5468#endif
5469
5470static inline void mem_cgroup_lru_names_not_uptodate(void)
5471{
5472 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5473}
5474
5475static int memcg_stat_show(struct seq_file *m, void *v)
5476{
5477 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5478 struct mem_cgroup *mi;
5479 unsigned int i;
5480
5481 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5482 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5483 continue;
5484 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5485 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5486 }
5487
5488 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5489 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5490 mem_cgroup_read_events(memcg, i));
5491
5492 for (i = 0; i < NR_LRU_LISTS; i++)
5493 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5494 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5495
5496
5497 {
5498 unsigned long long limit, memsw_limit;
5499 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5500 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5501 if (do_swap_account)
5502 seq_printf(m, "hierarchical_memsw_limit %llu\n",
5503 memsw_limit);
5504 }
5505
5506 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5507 long long val = 0;
5508
5509 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5510 continue;
5511 for_each_mem_cgroup_tree(mi, memcg)
5512 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5513 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5514 }
5515
5516 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5517 unsigned long long val = 0;
5518
5519 for_each_mem_cgroup_tree(mi, memcg)
5520 val += mem_cgroup_read_events(mi, i);
5521 seq_printf(m, "total_%s %llu\n",
5522 mem_cgroup_events_names[i], val);
5523 }
5524
5525 for (i = 0; i < NR_LRU_LISTS; i++) {
5526 unsigned long long val = 0;
5527
5528 for_each_mem_cgroup_tree(mi, memcg)
5529 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5530 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5531 }
5532
5533#ifdef CONFIG_DEBUG_VM
5534 {
5535 int nid, zid;
5536 struct mem_cgroup_per_zone *mz;
5537 struct zone_reclaim_stat *rstat;
5538 unsigned long recent_rotated[2] = {0, 0};
5539 unsigned long recent_scanned[2] = {0, 0};
5540
5541 for_each_online_node(nid)
5542 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5543 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5544 rstat = &mz->lruvec.reclaim_stat;
5545
5546 recent_rotated[0] += rstat->recent_rotated[0];
5547 recent_rotated[1] += rstat->recent_rotated[1];
5548 recent_scanned[0] += rstat->recent_scanned[0];
5549 recent_scanned[1] += rstat->recent_scanned[1];
5550 }
5551 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5552 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5553 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5554 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5555 }
5556#endif
5557
5558 return 0;
5559}
5560
5561static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5562 struct cftype *cft)
5563{
5564 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5565
5566 return mem_cgroup_swappiness(memcg);
5567}
5568
5569static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5570 struct cftype *cft, u64 val)
5571{
5572 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5573 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5574
5575 if (val > 100 || !parent)
5576 return -EINVAL;
5577
5578 mutex_lock(&memcg_create_mutex);
5579
5580
5581 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5582 mutex_unlock(&memcg_create_mutex);
5583 return -EINVAL;
5584 }
5585
5586 memcg->swappiness = val;
5587
5588 mutex_unlock(&memcg_create_mutex);
5589
5590 return 0;
5591}
5592
5593static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5594{
5595 struct mem_cgroup_threshold_ary *t;
5596 u64 usage;
5597 int i;
5598
5599 rcu_read_lock();
5600 if (!swap)
5601 t = rcu_dereference(memcg->thresholds.primary);
5602 else
5603 t = rcu_dereference(memcg->memsw_thresholds.primary);
5604
5605 if (!t)
5606 goto unlock;
5607
5608 usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below or equal to
	 * the usage seen last time.  If that is no longer true, a threshold
	 * was crossed after the last call of __mem_cgroup_threshold().
	 */
5615 i = t->current_threshold;

	/*
	 * Iterate backward over the array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of the thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
5623 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5624 eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
5627 i++;

	/*
	 * Iterate forward over the array of thresholds starting from
	 * current_threshold + 1 and check if a threshold is crossed.
	 * If none of the thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
5635 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5636 eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
5639 t->current_threshold = i - 1;
5640unlock:
5641 rcu_read_unlock();
5642}
5643
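/*
 * Check thresholds of @memcg and of every hierarchical parent, for both
 * memory and (when swap accounting is enabled) memory+swap usage.
 */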
5644static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5645{
5646 while (memcg) {
5647 __mem_cgroup_threshold(memcg, false);
5648 if (do_swap_account)
5649 __mem_cgroup_threshold(memcg, true);
5650
5651 memcg = parent_mem_cgroup(memcg);
5652 }
5653}
5654
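/* sort() comparator: order thresholds by ascending threshold value */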
5655static int compare_thresholds(const void *a, const void *b)
5656{
5657 const struct mem_cgroup_threshold *_a = a;
5658 const struct mem_cgroup_threshold *_b = b;
5659
5660 if (_a->threshold > _b->threshold)
5661 return 1;
5662
5663 if (_a->threshold < _b->threshold)
5664 return -1;
5665
5666 return 0;
5667}
5668
5669static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5670{
5671 struct mem_cgroup_eventfd_list *ev;
5672
5673 list_for_each_entry(ev, &memcg->oom_notify, list)
5674 eventfd_signal(ev->eventfd, 1);
5675 return 0;
5676}
5677
5678static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5679{
5680 struct mem_cgroup *iter;
5681
5682 for_each_mem_cgroup_tree(iter, memcg)
5683 mem_cgroup_oom_notify_cb(iter);
5684}
5685
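/*
 * Register an eventfd to be signalled when usage (of @type: memory or
 * memory+swap) crosses the threshold given in @args.  The sorted
 * thresholds array is rebuilt in a new RCU-protected copy, so readers in
 * __mem_cgroup_threshold() never see a half-updated array.
 */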
5686static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5687 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5688{
5689 struct mem_cgroup_thresholds *thresholds;
5690 struct mem_cgroup_threshold_ary *new;
5691 u64 threshold, usage;
5692 int i, size, ret;
5693
5694 ret = res_counter_memparse_write_strategy(args, &threshold);
5695 if (ret)
5696 return ret;
5697
5698 mutex_lock(&memcg->thresholds_lock);
5699
5700 if (type == _MEM)
5701 thresholds = &memcg->thresholds;
5702 else if (type == _MEMSWAP)
5703 thresholds = &memcg->memsw_thresholds;
5704 else
5705 BUG();
5706
5707 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before adding a new one */
5710 if (thresholds->primary)
5711 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5712
5713 size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for the new array of thresholds */
5716 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5717 GFP_KERNEL);
5718 if (!new) {
5719 ret = -ENOMEM;
5720 goto unlock;
5721 }
5722 new->size = size;

	/* Copy thresholds (if any) to the new array */
5725 if (thresholds->primary) {
5726 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5727 sizeof(struct mem_cgroup_threshold));
5728 }

	/* Add the new threshold */
5731 new->entries[size - 1].eventfd = eventfd;
5732 new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering a new threshold isn't time-critical */
5735 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5736 compare_thresholds, NULL);

	/* Find current threshold */
5739 new->current_threshold = -1;
5740 for (i = 0; i < size; i++) {
5741 if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
5747 ++new->current_threshold;
5748 } else
5749 break;
5750 }

	/* Free the old spare buffer and save the old primary buffer as spare */
5753 kfree(thresholds->spare);
5754 thresholds->spare = thresholds->primary;
5755
5756 rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody still uses the old thresholds array */
5759 synchronize_rcu();
5760
5761unlock:
5762 mutex_unlock(&memcg->thresholds_lock);
5763
5764 return ret;
5765}
5766
5767static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5768 struct eventfd_ctx *eventfd, const char *args)
5769{
5770 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5771}
5772
5773static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5777}
5778
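/*
 * Remove all thresholds registered for @eventfd.  The surviving entries
 * are copied into the spare array, which is then published via
 * rcu_assign_pointer(); the old primary array becomes the new spare.
 */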
5779static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, enum res_type type)
5781{
5782 struct mem_cgroup_thresholds *thresholds;
5783 struct mem_cgroup_threshold_ary *new;
5784 u64 usage;
5785 int i, j, size;
5786
5787 mutex_lock(&memcg->thresholds_lock);
5788 if (type == _MEM)
5789 thresholds = &memcg->thresholds;
5790 else if (type == _MEMSWAP)
5791 thresholds = &memcg->memsw_thresholds;
5792 else
5793 BUG();
5794
5795 if (!thresholds->primary)
5796 goto unlock;
5797
5798 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before removing */
5801 __mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate the new number of thresholds */
5804 size = 0;
5805 for (i = 0; i < thresholds->primary->size; i++) {
5806 if (thresholds->primary->entries[i].eventfd != eventfd)
5807 size++;
5808 }
5809
5810 new = thresholds->spare;

	/* Set the thresholds array to NULL if we don't have thresholds */
5813 if (!size) {
5814 kfree(new);
5815 new = NULL;
5816 goto swap_buffers;
5817 }
5818
5819 new->size = size;

	/* Copy thresholds and find the current threshold */
5822 new->current_threshold = -1;
5823 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5824 if (thresholds->primary->entries[i].eventfd == eventfd)
5825 continue;
5826
5827 new->entries[j] = thresholds->primary->entries[i];
5828 if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to
			 * increment it here.
			 */
5834 ++new->current_threshold;
5835 }
5836 j++;
5837 }
5838
5839swap_buffers:
	/* Swap primary and spare array */
5841 thresholds->spare = thresholds->primary;
	/* If all events are unregistered, free the spare array as well */
5843 if (!new) {
5844 kfree(thresholds->spare);
5845 thresholds->spare = NULL;
5846 }
5847
5848 rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody still uses the old thresholds array */
5851 synchronize_rcu();
5852unlock:
5853 mutex_unlock(&memcg->thresholds_lock);
5854}
5855
5856static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5857 struct eventfd_ctx *eventfd)
5858{
5859 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5860}
5861
5862static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5866}
5867
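/*
 * Register an eventfd to be signalled whenever @memcg enters OOM.  If the
 * group is already under OOM, the event is signalled immediately.
 */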
5868static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd, const char *args)
5870{
5871 struct mem_cgroup_eventfd_list *event;
5872
5873 event = kmalloc(sizeof(*event), GFP_KERNEL);
5874 if (!event)
5875 return -ENOMEM;
5876
5877 spin_lock(&memcg_oom_lock);
5878
5879 event->eventfd = eventfd;
5880 list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
5883 if (atomic_read(&memcg->under_oom))
5884 eventfd_signal(eventfd, 1);
5885 spin_unlock(&memcg_oom_lock);
5886
5887 return 0;
5888}
5889
5890static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5891 struct eventfd_ctx *eventfd)
5892{
5893 struct mem_cgroup_eventfd_list *ev, *tmp;
5894
5895 spin_lock(&memcg_oom_lock);
5896
5897 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5898 if (ev->eventfd == eventfd) {
5899 list_del(&ev->list);
5900 kfree(ev);
5901 }
5902 }
5903
5904 spin_unlock(&memcg_oom_lock);
5905}
5906
5907static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5908{
5909 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5910
5911 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5912 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5913 return 0;
5914}
5915
5916static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5917 struct cftype *cft, u64 val)
5918{
5919 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5920 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));

	/* cannot set on the root cgroup, and only 0 and 1 are allowed */
5923 if (!parent || !((val == 0) || (val == 1)))
5924 return -EINVAL;
5925
5926 mutex_lock(&memcg_create_mutex);
	/* oom-kill-disable is a flag for the whole subhierarchy */
5928 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5929 mutex_unlock(&memcg_create_mutex);
5930 return -EINVAL;
5931 }
5932 memcg->oom_kill_disable = val;
5933 if (!val)
5934 memcg_oom_recover(memcg);
5935 mutex_unlock(&memcg_create_mutex);
5936 return 0;
5937}
5938
5939#ifdef CONFIG_MEMCG_KMEM
5940static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5941{
5942 int ret;
5943
5944 memcg->kmemcg_id = -1;
5945 ret = memcg_propagate_kmem(memcg);
5946 if (ret)
5947 return ret;
5948
5949 return mem_cgroup_sockets_init(memcg, ss);
5950}
5951
5952static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5953{
5954 mem_cgroup_sockets_destroy(memcg);
5955}
5956
5957static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5958{
5959 if (!memcg_kmem_is_active(memcg))
5960 return;
5961
	/*
	 * Kmem charges can outlive the cgroup: slab pages, for instance,
	 * can hold objects from several processes and we do not take a
	 * css reference for every such allocation.  Pin the css here and
	 * mark the group dead; the last kmem uncharge that sees the dead
	 * flag (or the check right below, if kmem usage is already zero)
	 * drops this reference again.
	 */
5980 css_get(&memcg->css);
5981
5982 memcg_kmem_mark_dead(memcg);
5983
5984 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5985 return;
5986
5987 if (memcg_kmem_test_and_clear_dead(memcg))
5988 css_put(&memcg->css);
5989}
5990#else
5991static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5992{
5993 return 0;
5994}
5995
5996static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5997{
5998}
5999
6000static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
6001{
6002}
6003#endif
6004
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
6023static void memcg_event_remove(struct work_struct *work)
6024{
6025 struct mem_cgroup_event *event =
6026 container_of(work, struct mem_cgroup_event, remove);
6027 struct mem_cgroup *memcg = event->memcg;
6028
6029 remove_wait_queue(event->wqh, &event->wait);
6030
6031 event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
6034 eventfd_signal(event->eventfd, 1);
6035
6036 eventfd_ctx_put(event->eventfd);
6037 kfree(event);
6038 css_put(&memcg->css);
6039}
6040
/*
 * Gets called on POLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
6046static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6047 int sync, void *key)
6048{
6049 struct mem_cgroup_event *event =
6050 container_of(wait, struct mem_cgroup_event, wait);
6051 struct mem_cgroup *memcg = event->memcg;
6052 unsigned long flags = (unsigned long)key;
6053
6054 if (flags & POLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
6064 spin_lock(&memcg->event_list_lock);
6065 if (!list_empty(&event->list)) {
6066 list_del_init(&event->list);
			/*
			 * We are in atomic context, but
			 * memcg_event_remove() may sleep, so we have to
			 * call it in a workqueue.
			 */
6071 schedule_work(&event->remove);
6072 }
6073 spin_unlock(&memcg->event_list_lock);
6074 }
6075
6076 return 0;
6077}
6078
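/*
 * poll_table callback: remember the wait queue head of the eventfd and
 * add our wait entry to it, so memcg_event_wake() runs on POLLHUP.
 */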
6079static void memcg_event_ptable_queue_proc(struct file *file,
6080 wait_queue_head_t *wqh, poll_table *pt)
6081{
6082 struct mem_cgroup_event *event =
6083 container_of(pt, struct mem_cgroup_event, pt);
6084
6085 event->wqh = wqh;
6086 add_wait_queue(wqh, &event->wait);
6087}
6088
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer)
6099{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event;
6102 struct cgroup_subsys_state *cfile_css;
6103 unsigned int efd, cfd;
6104 struct fd efile;
6105 struct fd cfile;
6106 const char *name;
6107 char *endp;
6108 int ret;
6109
6110 efd = simple_strtoul(buffer, &endp, 10);
6111 if (*endp != ' ')
6112 return -EINVAL;
6113 buffer = endp + 1;
6114
6115 cfd = simple_strtoul(buffer, &endp, 10);
6116 if ((*endp != ' ') && (*endp != '\0'))
6117 return -EINVAL;
6118 buffer = endp + 1;
6119
6120 event = kzalloc(sizeof(*event), GFP_KERNEL);
6121 if (!event)
6122 return -ENOMEM;
6123
6124 event->memcg = memcg;
6125 INIT_LIST_HEAD(&event->list);
6126 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6127 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6128 INIT_WORK(&event->remove, memcg_event_remove);
6129
6130 efile = fdget(efd);
6131 if (!efile.file) {
6132 ret = -EBADF;
6133 goto out_kfree;
6134 }
6135
6136 event->eventfd = eventfd_ctx_fileget(efile.file);
6137 if (IS_ERR(event->eventfd)) {
6138 ret = PTR_ERR(event->eventfd);
6139 goto out_put_efile;
6140 }
6141
6142 cfile = fdget(cfd);
6143 if (!cfile.file) {
6144 ret = -EBADF;
6145 goto out_put_eventfd;
6146 }
6147
	/* the process needs read permission on the control file */
6150 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6151 if (ret < 0)
6152 goto out_put_cfile;
6153
	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
6162 name = cfile.file->f_dentry->d_name.name;
6163
6164 if (!strcmp(name, "memory.usage_in_bytes")) {
6165 event->register_event = mem_cgroup_usage_register_event;
6166 event->unregister_event = mem_cgroup_usage_unregister_event;
6167 } else if (!strcmp(name, "memory.oom_control")) {
6168 event->register_event = mem_cgroup_oom_register_event;
6169 event->unregister_event = mem_cgroup_oom_unregister_event;
6170 } else if (!strcmp(name, "memory.pressure_level")) {
6171 event->register_event = vmpressure_register_event;
6172 event->unregister_event = vmpressure_unregister_event;
6173 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6174 event->register_event = memsw_cgroup_usage_register_event;
6175 event->unregister_event = memsw_cgroup_usage_unregister_event;
6176 } else {
6177 ret = -EINVAL;
6178 goto out_put_cfile;
6179 }
6180
	/*
	 * Verify @cfile should belong to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
6186 rcu_read_lock();
6187
6188 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6190 &mem_cgroup_subsys);
6191 if (cfile_css == css && css_tryget(css))
6192 ret = 0;
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile;
6197
6198 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret)
6200 goto out_put_css;
6201
6202 efile.file->f_op->poll(efile.file, &event->pt);
6203
6204 spin_lock(&memcg->event_list_lock);
6205 list_add(&event->list, &memcg->event_list);
6206 spin_unlock(&memcg->event_list_lock);
6207
6208 fdput(cfile);
6209 fdput(efile);
6210
6211 return 0;
6212
6213out_put_css:
6214 css_put(css);
6215out_put_cfile:
6216 fdput(cfile);
6217out_put_eventfd:
6218 eventfd_ctx_put(event->eventfd);
6219out_put_efile:
6220 fdput(efile);
6221out_kfree:
6222 kfree(event);
6223
6224 return ret;
6225}
6226
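/* control files exposed by the memory controller in every cgroup directory */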
6227static struct cftype mem_cgroup_files[] = {
6228 {
6229 .name = "usage_in_bytes",
6230 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
6231 .read_u64 = mem_cgroup_read_u64,
6232 },
6233 {
6234 .name = "max_usage_in_bytes",
6235 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6236 .trigger = mem_cgroup_reset,
6237 .read_u64 = mem_cgroup_read_u64,
6238 },
6239 {
6240 .name = "limit_in_bytes",
6241 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
6242 .write_string = mem_cgroup_write,
6243 .read_u64 = mem_cgroup_read_u64,
6244 },
6245 {
6246 .name = "soft_limit_in_bytes",
6247 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
6248 .write_string = mem_cgroup_write,
6249 .read_u64 = mem_cgroup_read_u64,
6250 },
6251 {
6252 .name = "failcnt",
6253 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6254 .trigger = mem_cgroup_reset,
6255 .read_u64 = mem_cgroup_read_u64,
6256 },
6257 {
6258 .name = "stat",
6259 .seq_show = memcg_stat_show,
6260 },
6261 {
6262 .name = "force_empty",
6263 .trigger = mem_cgroup_force_empty_write,
6264 },
6265 {
6266 .name = "use_hierarchy",
6267 .flags = CFTYPE_INSANE,
6268 .write_u64 = mem_cgroup_hierarchy_write,
6269 .read_u64 = mem_cgroup_hierarchy_read,
6270 },
6271 {
6272 .name = "cgroup.event_control",
6273 .write_string = memcg_write_event_control,
6274 .flags = CFTYPE_NO_PREFIX,
6275 .mode = S_IWUGO,
6276 },
6277 {
6278 .name = "swappiness",
6279 .read_u64 = mem_cgroup_swappiness_read,
6280 .write_u64 = mem_cgroup_swappiness_write,
6281 },
6282 {
6283 .name = "move_charge_at_immigrate",
6284 .read_u64 = mem_cgroup_move_charge_read,
6285 .write_u64 = mem_cgroup_move_charge_write,
6286 },
6287 {
6288 .name = "oom_control",
6289 .seq_show = mem_cgroup_oom_control_read,
6290 .write_u64 = mem_cgroup_oom_control_write,
6291 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6292 },
6293 {
6294 .name = "pressure_level",
6295 },
6296#ifdef CONFIG_NUMA
6297 {
6298 .name = "numa_stat",
6299 .seq_show = memcg_numa_stat_show,
6300 },
6301#endif
6302#ifdef CONFIG_MEMCG_KMEM
6303 {
6304 .name = "kmem.limit_in_bytes",
6305 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6306 .write_string = mem_cgroup_write,
6307 .read_u64 = mem_cgroup_read_u64,
6308 },
6309 {
6310 .name = "kmem.usage_in_bytes",
6311 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6312 .read_u64 = mem_cgroup_read_u64,
6313 },
6314 {
6315 .name = "kmem.failcnt",
6316 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6317 .trigger = mem_cgroup_reset,
6318 .read_u64 = mem_cgroup_read_u64,
6319 },
6320 {
6321 .name = "kmem.max_usage_in_bytes",
6322 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6323 .trigger = mem_cgroup_reset,
6324 .read_u64 = mem_cgroup_read_u64,
6325 },
6326#ifdef CONFIG_SLABINFO
6327 {
6328 .name = "kmem.slabinfo",
6329 .seq_show = mem_cgroup_slabinfo_read,
6330 },
6331#endif
6332#endif
6333 { },
6334};
6335
6336#ifdef CONFIG_MEMCG_SWAP
6337static struct cftype memsw_cgroup_files[] = {
6338 {
6339 .name = "memsw.usage_in_bytes",
6340 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6341 .read_u64 = mem_cgroup_read_u64,
6342 },
6343 {
6344 .name = "memsw.max_usage_in_bytes",
6345 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6346 .trigger = mem_cgroup_reset,
6347 .read_u64 = mem_cgroup_read_u64,
6348 },
6349 {
6350 .name = "memsw.limit_in_bytes",
6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6352 .write_string = mem_cgroup_write,
6353 .read_u64 = mem_cgroup_read_u64,
6354 },
6355 {
6356 .name = "memsw.failcnt",
6357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6358 .trigger = mem_cgroup_reset,
6359 .read_u64 = mem_cgroup_read_u64,
6360 },
6361 { },
6362};
6363#endif
6364static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6365{
6366 struct mem_cgroup_per_node *pn;
6367 struct mem_cgroup_per_zone *mz;
6368 int zone, tmp = node;
6369
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against an offline node.
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug
	 *       callback function.
	 */
6377 if (!node_state(node, N_NORMAL_MEMORY))
6378 tmp = -1;
6379 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6380 if (!pn)
6381 return 1;
6382
6383 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6384 mz = &pn->zoneinfo[zone];
6385 lruvec_init(&mz->lruvec);
6386 mz->usage_in_excess = 0;
6387 mz->on_tree = false;
6388 mz->memcg = memcg;
6389 }
6390 memcg->nodeinfo[node] = pn;
6391 return 0;
6392}
6393
6394static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6395{
6396 kfree(memcg->nodeinfo[node]);
6397}
6398
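/*
 * Allocate the mem_cgroup structure itself; the nodeinfo[] array of
 * per-node pointers is appended to it, and the per-cpu statistics are
 * allocated separately.
 */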
6399static struct mem_cgroup *mem_cgroup_alloc(void)
6400{
6401 struct mem_cgroup *memcg;
6402 size_t size;
6403
6404 size = sizeof(struct mem_cgroup);
6405 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6406
6407 memcg = kzalloc(size, GFP_KERNEL);
6408 if (!memcg)
6409 return NULL;
6410
6411 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6412 if (!memcg->stat)
6413 goto out_free;
6414 spin_lock_init(&memcg->pcp_counter_lock);
6415 return memcg;
6416
6417out_free:
6418 kfree(memcg);
6419 return NULL;
6420}
6421
/*
 * Final teardown of a mem_cgroup: remove it from the soft limit trees,
 * free the per-node/per-zone info and the per-cpu statistics, and
 * release the structure itself.
 */
6433static void __mem_cgroup_free(struct mem_cgroup *memcg)
6434{
6435 int node;
6436
6437 mem_cgroup_remove_from_trees(memcg);
6438
6439 for_each_node(node)
6440 free_mem_cgroup_per_zone_info(memcg, node);
6441
6442 free_percpu(memcg->stat);
6443
	/*
	 * Disarm the static branches (kmem/tcp accounting) only here,
	 * after all other teardown: static_key_slow_dec() can sleep and
	 * takes locks (cpu hotplug, jump label mutex) that must not be
	 * acquired from the cgroup offlining paths.
	 */
6455 disarm_static_keys(memcg);
6456 kfree(memcg);
6457}
6458
/*
 * Returns the parent mem_cgroup in the memcg hierarchy with hierarchy enabled.
 */
6462struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6463{
6464 if (!memcg->res.parent)
6465 return NULL;
6466 return mem_cgroup_from_res_counter(memcg->res.parent, res);
6467}
6468EXPORT_SYMBOL(parent_mem_cgroup);
6469
6470static void __init mem_cgroup_soft_limit_tree_init(void)
6471{
6472 struct mem_cgroup_tree_per_node *rtpn;
6473 struct mem_cgroup_tree_per_zone *rtpz;
6474 int tmp, node, zone;
6475
6476 for_each_node(node) {
6477 tmp = node;
6478 if (!node_state(node, N_NORMAL_MEMORY))
6479 tmp = -1;
6480 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6481 BUG_ON(!rtpn);
6482
6483 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6484
6485 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6486 rtpz = &rtpn->rb_tree_per_zone[zone];
6487 rtpz->rb_root = RB_ROOT;
6488 spin_lock_init(&rtpz->lock);
6489 }
6490 }
6491}
6492
6493static struct cgroup_subsys_state * __ref
6494mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6495{
6496 struct mem_cgroup *memcg;
6497 long error = -ENOMEM;
6498 int node;
6499
6500 memcg = mem_cgroup_alloc();
6501 if (!memcg)
6502 return ERR_PTR(error);
6503
6504 for_each_node(node)
6505 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6506 goto free_out;

	/* root ? */
6509 if (parent_css == NULL) {
6510 root_mem_cgroup = memcg;
6511 res_counter_init(&memcg->res, NULL);
6512 res_counter_init(&memcg->memsw, NULL);
6513 res_counter_init(&memcg->kmem, NULL);
6514 }
6515
6516 memcg->last_scanned_node = MAX_NUMNODES;
6517 INIT_LIST_HEAD(&memcg->oom_notify);
6518 memcg->move_charge_at_immigrate = 0;
6519 mutex_init(&memcg->thresholds_lock);
6520 spin_lock_init(&memcg->move_lock);
6521 vmpressure_init(&memcg->vmpressure);
6522 INIT_LIST_HEAD(&memcg->event_list);
6523 spin_lock_init(&memcg->event_list_lock);
6524
6525 return &memcg->css;
6526
6527free_out:
6528 __mem_cgroup_free(memcg);
6529 return ERR_PTR(error);
6530}
6531
6532static int
6533mem_cgroup_css_online(struct cgroup_subsys_state *css)
6534{
6535 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6536 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6537
6538 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6539 return -ENOSPC;
6540
6541 if (!parent)
6542 return 0;
6543
6544 mutex_lock(&memcg_create_mutex);
6545
6546 memcg->use_hierarchy = parent->use_hierarchy;
6547 memcg->oom_kill_disable = parent->oom_kill_disable;
6548 memcg->swappiness = mem_cgroup_swappiness(parent);
6549
6550 if (parent->use_hierarchy) {
6551 res_counter_init(&memcg->res, &parent->res);
6552 res_counter_init(&memcg->memsw, &parent->memsw);
6553 res_counter_init(&memcg->kmem, &parent->kmem);
6554
		/*
		 * No need to take a reference to the parent because cgroup
		 * core guarantees its existence.
		 */
6559 } else {
6560 res_counter_init(&memcg->res, NULL);
6561 res_counter_init(&memcg->memsw, NULL);
6562 res_counter_init(&memcg->kmem, NULL);
6563
		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense, so let the cgroup subsystem know about this
		 * unexpected state via broken_hierarchy.
		 */
6568 if (parent != root_mem_cgroup)
6569 mem_cgroup_subsys.broken_hierarchy = true;
6570 }
6571 mutex_unlock(&memcg_create_mutex);
6572
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys);
6574}
6575
/*
 * Announce to all parents that a group from their hierarchy is gone.
 */
6579static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6580{
6581 struct mem_cgroup *parent = memcg;
6582
6583 while ((parent = parent_mem_cgroup(parent)))
6584 mem_cgroup_iter_invalidate(parent);
6585
	/*
	 * if the root memcg is not hierarchical we have to check it
	 * explicitly.
	 */
6590 if (!root_mem_cgroup->use_hierarchy)
6591 mem_cgroup_iter_invalidate(root_mem_cgroup);
6592}
6593
6594static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6595{
6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597 struct mem_cgroup_event *event, *tmp;
6598 struct cgroup_subsys_state *iter;
6599
	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory to avoid a race between userspace and kernelspace.
	 */
6605 spin_lock(&memcg->event_list_lock);
6606 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6607 list_del_init(&event->list);
6608 schedule_work(&event->remove);
6609 }
6610 spin_unlock(&memcg->event_list_lock);
6611
6612 kmem_cgroup_css_offline(memcg);
6613
6614 mem_cgroup_invalidate_reclaim_iterators(memcg);
6615
	/*
	 * Reparent the charges of this group and every descendant
	 * (post-order walk) so that no charge is left behind once the
	 * group goes away.
	 */
6620 css_for_each_descendant_post(iter, css)
6621 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6622
6623 mem_cgroup_destroy_all_caches(memcg);
6624 vmpressure_cleanup(&memcg->vmpressure);
6625}
6626
6627static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6628{
6629 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6630
	/*
	 * XXX: css_offline() would be where we should reparent all
	 * memory to prepare the cgroup for destruction.  However,
	 * charging can race with offlining: a swapin charge looks the
	 * memcg up from the swapout record rather than from the current
	 * task, so charges can still arrive after css_offline() has run
	 * and leak onto an offline group.  Reparent once more here, when
	 * no new charges can show up, so nothing is left behind before
	 * the structure is freed.
	 */
6665 mem_cgroup_reparent_charges(memcg);
6666
6667 memcg_destroy_kmem(memcg);
6668 __mem_cgroup_free(memcg);
6669}
6670
6671#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
6673#define PRECHARGE_COUNT_AT_ONCE 256
6674static int mem_cgroup_do_precharge(unsigned long count)
6675{
6676 int ret = 0;
6677 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6678 struct mem_cgroup *memcg = mc.to;
6679
6680 if (mem_cgroup_is_root(memcg)) {
6681 mc.precharge += count;
		/* we don't need css_get for root */
6683 return ret;
6684 }
6685
6686 if (count > 1) {
6687 struct res_counter *dummy;

		/*
		 * Try to charge the whole batch against res (and memsw if
		 * swap accounting is on) in one shot; if either charge
		 * fails, undo and fall back to charging page by page.
		 */
6694 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6695 goto one_by_one;
6696 if (do_swap_account && res_counter_charge(&memcg->memsw,
6697 PAGE_SIZE * count, &dummy)) {
6698 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6699 goto one_by_one;
6700 }
6701 mc.precharge += count;
6702 return ret;
6703 }
6704one_by_one:
	/* fall back to one by one charge */
6706 while (count--) {
6707 if (signal_pending(current)) {
6708 ret = -EINTR;
6709 break;
6710 }
6711 if (!batch_count--) {
6712 batch_count = PRECHARGE_COUNT_AT_ONCE;
6713 cond_resched();
6714 }
6715 ret = __mem_cgroup_try_charge(NULL,
6716 GFP_KERNEL, 1, &memcg, false);
6717 if (ret)
			/* mem_cgroup_clear_mc() will do the uncharge later */
6719 return ret;
6720 mc.precharge++;
6721 }
6722 return ret;
6723}
6724
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap entry will be stored in (can be NULL)
 *
 * Returns
 *   MC_TARGET_NONE: if the pte is not a target for move charge.
 *   MC_TARGET_PAGE: if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra refcount taken (callers should handle it).
 *   MC_TARGET_SWAP: if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *
 * Called with pte lock held.
 */
6743union mc_target {
6744 struct page *page;
6745 swp_entry_t ent;
6746};
6747
6748enum mc_target_type {
6749 MC_TARGET_NONE = 0,
6750 MC_TARGET_PAGE,
6751 MC_TARGET_SWAP,
6752};
6753
6754static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6755 unsigned long addr, pte_t ptent)
6756{
6757 struct page *page = vm_normal_page(vma, addr, ptent);
6758
6759 if (!page || !page_mapped(page))
6760 return NULL;
6761 if (PageAnon(page)) {
		/* anon pages are only moved if requested via move_charge_at_immigrate */
6763 if (!move_anon())
6764 return NULL;
6765 } else if (!move_file())
		/* file (and shmem) pages are only moved if requested */
6767 return NULL;
6768 if (!get_page_unless_zero(page))
6769 return NULL;
6770
6771 return page;
6772}
6773
6774#ifdef CONFIG_SWAP
6775static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6776 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6777{
6778 struct page *page = NULL;
6779 swp_entry_t ent = pte_to_swp_entry(ptent);
6780
6781 if (!move_anon() || non_swap_entry(ent))
6782 return NULL;
6783
	/*
	 * Because lookup_swap_cache() updates some statistics counters,
	 * we call find_get_page() with swapper_space directly.
	 */
6787 page = find_get_page(swap_address_space(ent), ent.val);
6788 if (do_swap_account)
6789 entry->val = ent.val;
6790
6791 return page;
6792}
6793#else
6794static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6795 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6796{
6797 return NULL;
6798}
6799#endif
6800
6801static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6802 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6803{
6804 struct page *page = NULL;
6805 struct address_space *mapping;
6806 pgoff_t pgoff;
6807
6808 if (!vma->vm_file)
6809 return NULL;
6810 if (!move_file())
6811 return NULL;
6812
6813 mapping = vma->vm_file->f_mapping;
6814 if (pte_none(ptent))
6815 pgoff = linear_page_index(vma, addr);
6816 else
6817 pgoff = pte_to_pgoff(ptent);

	/* the page is moved even if it's not RSS of this task (e.g. not yet faulted in) */
6820 page = find_get_page(mapping, pgoff);
6821
6822#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
6824 if (radix_tree_exceptional_entry(page)) {
6825 swp_entry_t swap = radix_to_swp_entry(page);
6826 if (do_swap_account)
6827 *entry = swap;
6828 page = find_get_page(swap_address_space(swap), swap.val);
6829 }
6830#endif
6831 return page;
6832}
6833
6834static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6835 unsigned long addr, pte_t ptent, union mc_target *target)
6836{
6837 struct page *page = NULL;
6838 struct page_cgroup *pc;
6839 enum mc_target_type ret = MC_TARGET_NONE;
6840 swp_entry_t ent = { .val = 0 };
6841
6842 if (pte_present(ptent))
6843 page = mc_handle_present_pte(vma, addr, ptent);
6844 else if (is_swap_pte(ptent))
6845 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6846 else if (pte_none(ptent) || pte_file(ptent))
6847 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6848
6849 if (!page && !ent.val)
6850 return ret;
6851 if (page) {
6852 pc = lookup_page_cgroup(page);
		/*
		 * Do only a loose check w/o the page_cgroup lock.
		 * mem_cgroup_move_account() checks whether the pc is valid
		 * under the lock.
		 */
6858 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6859 ret = MC_TARGET_PAGE;
6860 if (target)
6861 target->page = page;
6862 }
6863 if (!ret || !target)
6864 put_page(page);
6865 }
	/* There is a swap entry and a page doesn't exist or isn't charged */
6867 if (ent.val && !ret &&
6868 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6869 ret = MC_TARGET_SWAP;
6870 if (target)
6871 target->ent = ent;
6872 }
6873 return ret;
6874}
6875
6876#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider swapped or file mapped pages because THP does not
 * support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
6882static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6883 unsigned long addr, pmd_t pmd, union mc_target *target)
6884{
6885 struct page *page = NULL;
6886 struct page_cgroup *pc;
6887 enum mc_target_type ret = MC_TARGET_NONE;
6888
6889 page = pmd_page(pmd);
6890 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6891 if (!move_anon())
6892 return ret;
6893 pc = lookup_page_cgroup(page);
6894 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6895 ret = MC_TARGET_PAGE;
6896 if (target) {
6897 get_page(page);
6898 target->page = page;
6899 }
6900 }
6901 return ret;
6902}
6903#else
6904static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6905 unsigned long addr, pmd_t pmd, union mc_target *target)
6906{
6907 return MC_TARGET_NONE;
6908}
6909#endif
6910
6911static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6912 unsigned long addr, unsigned long end,
6913 struct mm_walk *walk)
6914{
6915 struct vm_area_struct *vma = walk->private;
6916 pte_t *pte;
6917 spinlock_t *ptl;
6918
6919 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6920 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6921 mc.precharge += HPAGE_PMD_NR;
6922 spin_unlock(ptl);
6923 return 0;
6924 }
6925
6926 if (pmd_trans_unstable(pmd))
6927 return 0;
6928 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6929 for (; addr != end; pte++, addr += PAGE_SIZE)
6930 if (get_mctgt_type(vma, addr, *pte, NULL))
6931 mc.precharge++;
6932 pte_unmap_unlock(pte - 1, ptl);
6933 cond_resched();
6934
6935 return 0;
6936}
6937
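/*
 * Walk every VMA of @mm and count the pages/swap entries that would be
 * moved to mc.to, so that the total can be precharged in one go.
 */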
6938static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6939{
6940 unsigned long precharge;
6941 struct vm_area_struct *vma;
6942
6943 down_read(&mm->mmap_sem);
6944 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6945 struct mm_walk mem_cgroup_count_precharge_walk = {
6946 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6947 .mm = mm,
6948 .private = vma,
6949 };
6950 if (is_vm_hugetlb_page(vma))
6951 continue;
6952 walk_page_range(vma->vm_start, vma->vm_end,
6953 &mem_cgroup_count_precharge_walk);
6954 }
6955 up_read(&mm->mmap_sem);
6956
6957 precharge = mc.precharge;
6958 mc.precharge = 0;
6959
6960 return precharge;
6961}
6962
6963static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6964{
6965 unsigned long precharge = mem_cgroup_count_precharge(mm);
6966
6967 VM_BUG_ON(mc.moving_task);
6968 mc.moving_task = current;
6969 return mem_cgroup_do_precharge(precharge);
6970}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6973static void __mem_cgroup_clear_mc(void)
6974{
6975 struct mem_cgroup *from = mc.from;
6976 struct mem_cgroup *to = mc.to;
6977 int i;
6978
	/* we must uncharge all the leftover precharges from mc.to */
6980 if (mc.precharge) {
6981 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6982 mc.precharge = 0;
6983 }
6984
	/*
	 * mem_cgroup_move_account() does not uncharge mc.from for the
	 * pages it moved, so drop those charges here in one batch.
	 */
6988 if (mc.moved_charge) {
6989 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6990 mc.moved_charge = 0;
6991 }
6992
6993 if (mc.moved_swap) {
		/* uncharge the swap account from the old cgroup */
6995 if (!mem_cgroup_is_root(mc.from))
6996 res_counter_uncharge(&mc.from->memsw,
6997 PAGE_SIZE * mc.moved_swap);
6998
6999 for (i = 0; i < mc.moved_swap; i++)
7000 css_put(&mc.from->css);
7001
7002 if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we
			 * should uncharge to->res.
			 */
7007 res_counter_uncharge(&mc.to->res,
7008 PAGE_SIZE * mc.moved_swap);
7009 }
7010
7011 mc.moved_swap = 0;
7012 }
7013 memcg_oom_recover(from);
7014 memcg_oom_recover(to);
7015 wake_up_all(&mc.waitq);
7016}
7017
7018static void mem_cgroup_clear_mc(void)
7019{
7020 struct mem_cgroup *from = mc.from;
7021
	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
7026 mc.moving_task = NULL;
7027 __mem_cgroup_clear_mc();
7028 spin_lock(&mc.lock);
7029 mc.from = NULL;
7030 mc.to = NULL;
7031 spin_unlock(&mc.lock);
7032 mem_cgroup_end_move(from);
7033}
7034
7035static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
7036 struct cgroup_taskset *tset)
7037{
7038 struct task_struct *p = cgroup_taskset_first(tset);
7039 int ret = 0;
7040 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7041 unsigned long move_charge_at_immigrate;
7042
	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
7048 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
7049 if (move_charge_at_immigrate) {
7050 struct mm_struct *mm;
7051 struct mem_cgroup *from = mem_cgroup_from_task(p);
7052
7053 VM_BUG_ON(from == memcg);
7054
7055 mm = get_task_mm(p);
7056 if (!mm)
7057 return 0;
7058
7059 if (mm->owner == p) {
7060 VM_BUG_ON(mc.from);
7061 VM_BUG_ON(mc.to);
7062 VM_BUG_ON(mc.precharge);
7063 VM_BUG_ON(mc.moved_charge);
7064 VM_BUG_ON(mc.moved_swap);
7065 mem_cgroup_start_move(from);
7066 spin_lock(&mc.lock);
7067 mc.from = from;
7068 mc.to = memcg;
7069 mc.immigrate_flags = move_charge_at_immigrate;
7070 spin_unlock(&mc.lock);
			/* We set mc.moving_task later */

7073 ret = mem_cgroup_precharge_mc(mm);
7074 if (ret)
7075 mem_cgroup_clear_mc();
7076 }
7077 mmput(mm);
7078 }
7079 return ret;
7080}
7081
7082static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
7083 struct cgroup_taskset *tset)
7084{
7085 mem_cgroup_clear_mc();
7086}
7087
7088static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
7089 unsigned long addr, unsigned long end,
7090 struct mm_walk *walk)
7091{
7092 int ret = 0;
7093 struct vm_area_struct *vma = walk->private;
7094 pte_t *pte;
7095 spinlock_t *ptl;
7096 enum mc_target_type target_type;
7097 union mc_target target;
7098 struct page *page;
7099 struct page_cgroup *pc;
7100
	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for the page table
	 *    lock to be unlocked in __split_huge_page_splitting(), where the
	 *    main part of the thp split is not executed yet.
	 */
7111 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
7112 if (mc.precharge < HPAGE_PMD_NR) {
7113 spin_unlock(ptl);
7114 return 0;
7115 }
7116 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
7117 if (target_type == MC_TARGET_PAGE) {
7118 page = target.page;
7119 if (!isolate_lru_page(page)) {
7120 pc = lookup_page_cgroup(page);
7121 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
7122 pc, mc.from, mc.to)) {
7123 mc.precharge -= HPAGE_PMD_NR;
7124 mc.moved_charge += HPAGE_PMD_NR;
7125 }
7126 putback_lru_page(page);
7127 }
7128 put_page(page);
7129 }
7130 spin_unlock(ptl);
7131 return 0;
7132 }
7133
7134 if (pmd_trans_unstable(pmd))
7135 return 0;
7136retry:
7137 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
7138 for (; addr != end; addr += PAGE_SIZE) {
7139 pte_t ptent = *(pte++);
7140 swp_entry_t ent;
7141
7142 if (!mc.precharge)
7143 break;
7144
7145 switch (get_mctgt_type(vma, addr, ptent, &target)) {
7146 case MC_TARGET_PAGE:
7147 page = target.page;
7148 if (isolate_lru_page(page))
7149 goto put;
7150 pc = lookup_page_cgroup(page);
7151 if (!mem_cgroup_move_account(page, 1, pc,
7152 mc.from, mc.to)) {
7153 mc.precharge--;
			/* we uncharge from mc.from later. */
7155 mc.moved_charge++;
7156 }
7157 putback_lru_page(page);
7158put:
7159 put_page(page);
7160 break;
7161 case MC_TARGET_SWAP:
7162 ent = target.ent;
7163 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
7164 mc.precharge--;
			/* we fix up refcnts and charges later. */
7166 mc.moved_swap++;
7167 }
7168 break;
7169 default:
7170 break;
7171 }
7172 }
7173 pte_unmap_unlock(pte - 1, ptl);
7174 cond_resched();
7175
7176 if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try to charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in the
		 * attach() phase.
		 */
7183 ret = mem_cgroup_do_precharge(1);
7184 if (!ret)
7185 goto retry;
7186 }
7187
7188 return ret;
7189}
7190
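/*
 * Walk every VMA of @mm and move eligible charges from mc.from to mc.to,
 * consuming the precharges taken in can_attach().
 */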
7191static void mem_cgroup_move_charge(struct mm_struct *mm)
7192{
7193 struct vm_area_struct *vma;
7194
7195 lru_add_drain_all();
7196retry:
7197 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem may be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
7205 __mem_cgroup_clear_mc();
7206 cond_resched();
7207 goto retry;
7208 }
7209 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7210 int ret;
7211 struct mm_walk mem_cgroup_move_charge_walk = {
7212 .pmd_entry = mem_cgroup_move_charge_pte_range,
7213 .mm = mm,
7214 .private = vma,
7215 };
7216 if (is_vm_hugetlb_page(vma))
7217 continue;
7218 ret = walk_page_range(vma->vm_start, vma->vm_end,
7219 &mem_cgroup_move_charge_walk);
7220 if (ret)
			/*
			 * means we have consumed all precharges and failed
			 * in doing additional charge. Just abandon here.
			 */
7225 break;
7226 }
7227 up_read(&mm->mmap_sem);
7228}
7229
7230static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7231 struct cgroup_taskset *tset)
7232{
7233 struct task_struct *p = cgroup_taskset_first(tset);
7234 struct mm_struct *mm = get_task_mm(p);
7235
7236 if (mm) {
7237 if (mc.to)
7238 mem_cgroup_move_charge(mm);
7239 mmput(mm);
7240 }
7241 if (mc.to)
7242 mem_cgroup_clear_mc();
7243}
7244#else
7245static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
7246 struct cgroup_taskset *tset)
7247{
7248 return 0;
7249}
7250static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
7251 struct cgroup_taskset *tset)
7252{
7253}
7254static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7255 struct cgroup_taskset *tset)
7256{
7257}
7258#endif
7259
/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify the sane_behavior flag on each mount attempt.
 */
7264static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7265{
	/*
	 * use_hierarchy is forced with sane_behavior.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
7271 if (cgroup_sane_behavior(root_css->cgroup))
7272 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7273}
7274
7275struct cgroup_subsys mem_cgroup_subsys = {
7276 .name = "memory",
7277 .subsys_id = mem_cgroup_subsys_id,
7278 .css_alloc = mem_cgroup_css_alloc,
7279 .css_online = mem_cgroup_css_online,
7280 .css_offline = mem_cgroup_css_offline,
7281 .css_free = mem_cgroup_css_free,
7282 .can_attach = mem_cgroup_can_attach,
7283 .cancel_attach = mem_cgroup_cancel_attach,
7284 .attach = mem_cgroup_move_task,
7285 .bind = mem_cgroup_bind,
7286 .base_cftypes = mem_cgroup_files,
7287 .early_init = 0,
7288};
7289
7290#ifdef CONFIG_MEMCG_SWAP
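/* "swapaccount=0" disables and "swapaccount=1" enables swap (memsw) accounting at boot */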
7291static int __init enable_swap_account(char *s)
7292{
7293 if (!strcmp(s, "1"))
7294 really_do_swap_account = 1;
7295 else if (!strcmp(s, "0"))
7296 really_do_swap_account = 0;
7297 return 1;
7298}
7299__setup("swapaccount=", enable_swap_account);
7300
7301static void __init memsw_file_init(void)
7302{
7303 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
7304}
7305
7306static void __init enable_swap_cgroup(void)
7307{
7308 if (!mem_cgroup_disabled() && really_do_swap_account) {
7309 do_swap_account = 1;
7310 memsw_file_init();
7311 }
7312}
7313
7314#else
7315static void __init enable_swap_cgroup(void)
7316{
7317}
7318#endif
7319
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like hotcpu_notifier() have to be initialized from this
 * context because of lock ordering between cgroup setup and cpu hotplug,
 * but basically everything that doesn't depend on a specific locking
 * policy should be initialized from within this callback, so that we can
 * obey the ordering rules.
 */
7328static int __init mem_cgroup_init(void)
7329{
7330 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7331 enable_swap_cgroup();
7332 mem_cgroup_soft_limit_tree_init();
7333 memcg_stock_init();
7334 return 0;
7335}
7336subsys_initcall(mem_cgroup_init);
7337