#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Whether memory+swap (memsw) accounting is enabled at runtime. */
int do_swap_account __read_mostly;

/* Boot-time default for do_swap_account, overridable via "swapaccount=". */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif

/* Human-readable names for the MEM_CGROUP_STAT_* counters, in enum order. */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * The per-memcg event counter is incremented at every pagein/pageout (by the
 * number of pages when THP is involved).  It is used to trigger periodic
 * per-memcg work (threshold checks, soft limit tree updates, NUMA info
 * refresh) instead of relying on jiffies; the targets below say how many
 * page events must accumulate before each kind of check is redone.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * Last scanned hierarchy member.  Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	int last_dead_count;

	/* scan generation, shared between concurrent reclaimers */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* value by which the soft
						 * limit is exceeded */
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* back pointer; we cannot
						 * use container_of */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198
199struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd;
201 u64 threshold;
202};
203
204
205struct mem_cgroup_threshold_ary {
206
207 int current_threshold;
208
209 unsigned int size;
210
211 struct mem_cgroup_threshold entries[0];
212};
213
214struct mem_cgroup_thresholds {
215
216 struct mem_cgroup_threshold_ary *primary;
217
218
219
220
221
222 struct mem_cgroup_threshold_ary *spare;
223};
224
225
226struct mem_cgroup_eventfd_list {
227 struct list_head list;
228 struct eventfd_ctx *eventfd;
229};
230
231
232
233
234struct mem_cgroup_event {
235
236
237
238 struct mem_cgroup *memcg;
239
240
241
242 struct eventfd_ctx *eventfd;
243
244
245
246 struct list_head list;
247
248
249
250
251
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254
255
256
257
258
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261
262
263
264
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
273
274
275
276
277
278
279
280
281
282
283
284
285struct mem_cgroup {
286 struct cgroup_subsys_state css;
287
288
289
290 struct res_counter res;
291
292
293 struct vmpressure vmpressure;
294
295
296
297
298 struct res_counter memsw;
299
300
301
302
303 struct res_counter kmem;
304
305
306
307 bool use_hierarchy;
308 unsigned long kmem_account_flags;
309
310 bool oom_lock;
311 atomic_t under_oom;
312 atomic_t oom_wakeups;
313
314 int swappiness;
315
316 int oom_kill_disable;
317
318
319 bool memsw_is_minimum;
320
321
322 struct mutex thresholds_lock;
323
324
325 struct mem_cgroup_thresholds thresholds;
326
327
328 struct mem_cgroup_thresholds memsw_thresholds;
329
330
331 struct list_head oom_notify;
332
333
334
335
336
337 unsigned long move_charge_at_immigrate;
338
339
340
341 atomic_t moving_account;
342
343 spinlock_t move_lock;
344
345
346
347 struct mem_cgroup_stat_cpu __percpu *stat;
348
349
350
351
352 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock;
354
355 atomic_t dead_count;
356#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem;
358#endif
359#if defined(CONFIG_MEMCG_KMEM)
360
361 struct list_head memcg_slab_caches;
362
363 struct mutex slab_caches_mutex;
364
365 int kmemcg_id;
366#endif
367
368 int last_scanned_node;
369#if MAX_NUMNODES > 1
370 nodemask_t scan_nodes;
371 atomic_t numainfo_events;
372 atomic_t numainfo_updating;
373#endif
374
375
376 struct list_head event_list;
377 spinlock_t event_list_lock;
378
379 struct mem_cgroup_per_node *nodeinfo[0];
380
381};
382
383
384enum {
385 KMEM_ACCOUNTED_ACTIVE,
386 KMEM_ACCOUNTED_DEAD,
387};
388
389#ifdef CONFIG_MEMCG_KMEM
390static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
391{
392 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
393}
394
395static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396{
397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
398}
399
400static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
401{
402
403
404
405
406 smp_wmb();
407 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
408 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
409}
410
411static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
412{
413 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
414 &memcg->kmem_account_flags);
415}
416#endif
417
418
419
420
421
422
423enum move_type {
424 MOVE_CHARGE_TYPE_ANON,
425 MOVE_CHARGE_TYPE_FILE,
426 NR_MOVE_TYPE,
427};
428
429
430static struct move_charge_struct {
431 spinlock_t lock;
432 struct mem_cgroup *from;
433 struct mem_cgroup *to;
434 unsigned long immigrate_flags;
435 unsigned long precharge;
436 unsigned long moved_charge;
437 unsigned long moved_swap;
438 struct task_struct *moving_task;
439 wait_queue_head_t waitq;
440} mc = {
441 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
442 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
443};
444
445static bool move_anon(void)
446{
447 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
448}
449
450static bool move_file(void)
451{
452 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
453}
454
455
456
457
458
459#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
460#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
461
462enum charge_type {
463 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
464 MEM_CGROUP_CHARGE_TYPE_ANON,
465 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
466 MEM_CGROUP_CHARGE_TYPE_DROP,
467 NR_CHARGE_TYPE,
468};
469
/* Used for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for the OOM notifier */
#define OOM_CONTROL		(0)
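
/*
 * Illustrative sketch: cftype->private packs a res_type into the high 16
 * bits and an RES_* attribute into the low 16 bits, so a single value
 * identifies e.g. "the memsw limit":
 *
 *	unsigned long priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
 *
 *	BUG_ON(MEMFILE_TYPE(priv) != _MEMSWAP);
 *	BUG_ON(MEMFILE_ATTR(priv) != RES_LIMIT);
 */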

/*
 * Reclaim flags, passed to mem_cgroup_reclaim().
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child
 * cgroups appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);
498
499struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
500{
501 return s ? container_of(s, struct mem_cgroup, css) : NULL;
502}
503
504
505struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
506{
507 if (!memcg)
508 memcg = root_mem_cgroup;
509 return &memcg->vmpressure;
510}
511
512struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
513{
514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
515}
516
517static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
518{
519 return (memcg == root_mem_cgroup);
520}
521
/*
 * Memcg IDs are restricted to [1, USHRT_MAX] so they fit into an unsigned
 * short; 0 is reserved to mean "no memcg" (e.g. in swap_cgroup records).
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX

static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
	/*
	 * The ID of the root cgroup is 0, but memcg treats 0 as an
	 * invalid ID, so we return (cgroup_id + 1).
	 */
	return memcg->css.cgroup->id + 1;
}

static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	struct cgroup_subsys_state *css;

	css = css_from_id(id - 1, &memory_cgrp_subsys);
	return mem_cgroup_from_css(css);
}
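
/*
 * Illustrative sketch of the id mapping above: because 0 means "no memcg",
 * the cgroup id is shifted by one in both directions,
 *
 *	unsigned short id = mem_cgroup_id(memcg);	(cgroup id + 1)
 *	struct mem_cgroup *m = mem_cgroup_from_id(id);	(css_from_id(id - 1))
 *
 * so mem_cgroup_from_id(mem_cgroup_id(memcg)) == memcg for any live memcg.
 */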
544
545
546#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
547
548void sock_update_memcg(struct sock *sk)
549{
550 if (mem_cgroup_sockets_enabled) {
551 struct mem_cgroup *memcg;
552 struct cg_proto *cg_proto;
553
554 BUG_ON(!sk->sk_prot->proto_cgroup);
555
556
557
558
559
560
561
562
563
564 if (sk->sk_cgrp) {
565 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
566 css_get(&sk->sk_cgrp->memcg->css);
567 return;
568 }
569
570 rcu_read_lock();
571 memcg = mem_cgroup_from_task(current);
572 cg_proto = sk->sk_prot->proto_cgroup(memcg);
573 if (!mem_cgroup_is_root(memcg) &&
574 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
575 sk->sk_cgrp = cg_proto;
576 }
577 rcu_read_unlock();
578 }
579}
580EXPORT_SYMBOL(sock_update_memcg);
581
582void sock_release_memcg(struct sock *sk)
583{
584 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
585 struct mem_cgroup *memcg;
586 WARN_ON(!sk->sk_cgrp->memcg);
587 memcg = sk->sk_cgrp->memcg;
588 css_put(&sk->sk_cgrp->memcg->css);
589 }
590}
591
592struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
593{
594 if (!memcg || mem_cgroup_is_root(memcg))
595 return NULL;
596
597 return &memcg->tcp_mem;
598}
599EXPORT_SYMBOL(tcp_proto_cgroup);
600
601static void disarm_sock_keys(struct mem_cgroup *memcg)
602{
603 if (!memcg_proto_activated(&memcg->tcp_mem))
604 return;
605 static_key_slow_dec(&memcg_socket_limit_enabled);
606}
607#else
608static void disarm_sock_keys(struct mem_cgroup *memcg)
609{
610}
611#endif
612
613#ifdef CONFIG_MEMCG_KMEM
614
615
616
617
618
619
620
621
622
623
624
625
626static DEFINE_IDA(kmem_limited_groups);
627int memcg_limited_groups_array_size;
628
629
630
631
632
633
634
635
636
637
638
639
640
641#define MEMCG_CACHES_MIN_SIZE 4
642#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
643
644
645
646
647
648
649
650struct static_key memcg_kmem_enabled_key;
651EXPORT_SYMBOL(memcg_kmem_enabled_key);
652
653static void disarm_kmem_keys(struct mem_cgroup *memcg)
654{
655 if (memcg_kmem_is_active(memcg)) {
656 static_key_slow_dec(&memcg_kmem_enabled_key);
657 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
658 }
659
660
661
662
663 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
664}
665#else
666static void disarm_kmem_keys(struct mem_cgroup *memcg)
667{
668}
669#endif
670
671static void disarm_static_keys(struct mem_cgroup *memcg)
672{
673 disarm_sock_keys(memcg);
674 disarm_kmem_keys(memcg);
675}
676
677static void drain_all_stock_async(struct mem_cgroup *memcg);
678
679static struct mem_cgroup_per_zone *
680mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
681{
682 VM_BUG_ON((unsigned)nid >= nr_node_ids);
683 return &memcg->nodeinfo[nid]->zoneinfo[zid];
684}
685
686struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
687{
688 return &memcg->css;
689}
690
691static struct mem_cgroup_per_zone *
692page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
693{
694 int nid = page_to_nid(page);
695 int zid = page_zonenum(page);
696
697 return mem_cgroup_zoneinfo(memcg, nid, zid);
698}
699
700static struct mem_cgroup_tree_per_zone *
701soft_limit_tree_node_zone(int nid, int zid)
702{
703 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
704}
705
706static struct mem_cgroup_tree_per_zone *
707soft_limit_tree_from_page(struct page *page)
708{
709 int nid = page_to_nid(page);
710 int zid = page_zonenum(page);
711
712 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
713}
714
715static void
716__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
717 struct mem_cgroup_per_zone *mz,
718 struct mem_cgroup_tree_per_zone *mctz,
719 unsigned long long new_usage_in_excess)
720{
721 struct rb_node **p = &mctz->rb_root.rb_node;
722 struct rb_node *parent = NULL;
723 struct mem_cgroup_per_zone *mz_node;
724
725 if (mz->on_tree)
726 return;
727
728 mz->usage_in_excess = new_usage_in_excess;
729 if (!mz->usage_in_excess)
730 return;
731 while (*p) {
732 parent = *p;
733 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
734 tree_node);
735 if (mz->usage_in_excess < mz_node->usage_in_excess)
736 p = &(*p)->rb_left;
737
738
739
740
741 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
742 p = &(*p)->rb_right;
743 }
744 rb_link_node(&mz->tree_node, parent, p);
745 rb_insert_color(&mz->tree_node, &mctz->rb_root);
746 mz->on_tree = true;
747}
748
749static void
750__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
751 struct mem_cgroup_per_zone *mz,
752 struct mem_cgroup_tree_per_zone *mctz)
753{
754 if (!mz->on_tree)
755 return;
756 rb_erase(&mz->tree_node, &mctz->rb_root);
757 mz->on_tree = false;
758}
759
760static void
761mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
762 struct mem_cgroup_per_zone *mz,
763 struct mem_cgroup_tree_per_zone *mctz)
764{
765 spin_lock(&mctz->lock);
766 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
767 spin_unlock(&mctz->lock);
768}
769
770
771static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
772{
773 unsigned long long excess;
774 struct mem_cgroup_per_zone *mz;
775 struct mem_cgroup_tree_per_zone *mctz;
776 int nid = page_to_nid(page);
777 int zid = page_zonenum(page);
778 mctz = soft_limit_tree_from_page(page);
779
780
781
782
783
784 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
785 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
786 excess = res_counter_soft_limit_excess(&memcg->res);
787
788
789
790
791 if (excess || mz->on_tree) {
792 spin_lock(&mctz->lock);
793
794 if (mz->on_tree)
795 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
796
797
798
799
800 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
801 spin_unlock(&mctz->lock);
802 }
803 }
804}
805
806static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
807{
808 int node, zone;
809 struct mem_cgroup_per_zone *mz;
810 struct mem_cgroup_tree_per_zone *mctz;
811
812 for_each_node(node) {
813 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
814 mz = mem_cgroup_zoneinfo(memcg, node, zone);
815 mctz = soft_limit_tree_node_zone(node, zone);
816 mem_cgroup_remove_exceeded(memcg, mz, mctz);
817 }
818 }
819}
820
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back; we will
	 * add it back at the end of reclaim, at its correct position in
	 * the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}
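
/*
 * Illustrative sketch: soft limit reclaim always starts from the rightmost
 * node of the per-zone tree, i.e. the memcg currently exceeding its soft
 * limit by the largest amount.  Roughly:
 *
 *	mctz = soft_limit_tree_node_zone(nid, zid);
 *	mz = mem_cgroup_largest_soft_limit_node(mctz);
 *	if (mz)
 *		reclaim from mz->memcg, then css_put(&mz->memcg->css);
 *
 * The node is removed before reclaiming and re-inserted afterwards by the
 * soft limit reclaim path, at its new (possibly smaller) excess.
 */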
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
878 enum mem_cgroup_stat_index idx)
879{
880 long val = 0;
881 int cpu;
882
883 get_online_cpus();
884 for_each_online_cpu(cpu)
885 val += per_cpu(memcg->stat->count[idx], cpu);
886#ifdef CONFIG_HOTPLUG_CPU
887 spin_lock(&memcg->pcp_counter_lock);
888 val += memcg->nocpu_base.count[idx];
889 spin_unlock(&memcg->pcp_counter_lock);
890#endif
891 put_online_cpus();
892 return val;
893}
894
895static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
896 bool charge)
897{
898 int val = (charge) ? 1 : -1;
899 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
900}
901
902static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
903 enum mem_cgroup_events_index idx)
904{
905 unsigned long val = 0;
906 int cpu;
907
908 get_online_cpus();
909 for_each_online_cpu(cpu)
910 val += per_cpu(memcg->stat->events[idx], cpu);
911#ifdef CONFIG_HOTPLUG_CPU
912 spin_lock(&memcg->pcp_counter_lock);
913 val += memcg->nocpu_base.events[idx];
914 spin_unlock(&memcg->pcp_counter_lock);
915#endif
916 put_online_cpus();
917 return val;
918}
919
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache.  Shmem/tmpfs
	 * is counted as CACHE even if it's on the ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is one event, so ignore the page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages;	/* for the event counter */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
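
/*
 * Illustrative sketch of the sign convention above: callers pass a positive
 * nr_pages when charging and a negative one when uncharging, e.g.
 *
 *	mem_cgroup_charge_statistics(memcg, page, false, 1);	(pgpgin++)
 *	mem_cgroup_charge_statistics(memcg, page, false, -1);	(pgpgout++)
 *
 * nr_page_events always accumulates the absolute number of pages and is
 * what drives mem_cgroup_event_ratelimit() below.
 */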
949
950unsigned long
951mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
952{
953 struct mem_cgroup_per_zone *mz;
954
955 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
956 return mz->lru_size[lru];
957}
958
959static unsigned long
960mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
961 unsigned int lru_mask)
962{
963 struct mem_cgroup_per_zone *mz;
964 enum lru_list lru;
965 unsigned long ret = 0;
966
967 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
968
969 for_each_lru(lru) {
970 if (BIT(lru) & lru_mask)
971 ret += mz->lru_size[lru];
972 }
973 return ret;
974}
975
976static unsigned long
977mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
978 int nid, unsigned int lru_mask)
979{
980 u64 total = 0;
981 int zid;
982
983 for (zid = 0; zid < MAX_NR_ZONES; zid++)
984 total += mem_cgroup_zone_nr_lru_pages(memcg,
985 nid, zid, lru_mask);
986
987 return total;
988}
989
990static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
991 unsigned int lru_mask)
992{
993 int nid;
994 u64 total = 0;
995
996 for_each_node_state(nid, N_MEMORY)
997 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
998 return total;
999}
1000
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}
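
/*
 * Illustrative sketch: with the targets defined above, a CPU whose
 * nr_page_events counter has reached 1500 would behave as follows:
 *
 *	threshold target was 1408 -> returns true,  next target 1628
 *	softlimit target was 1024 -> returns true,  next target 2524
 *	numainfo  target was 2048 -> returns false (not due yet)
 *
 * i.e. thresholds are re-checked roughly every 128 page events, the soft
 * limit tree and the NUMA info roughly every 1024.
 */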
1028
1029
1030
1031
1032
1033static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1034{
1035 preempt_disable();
1036
1037 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1038 MEM_CGROUP_TARGET_THRESH))) {
1039 bool do_softlimit;
1040 bool do_numainfo __maybe_unused;
1041
1042 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1043 MEM_CGROUP_TARGET_SOFTLIMIT);
1044#if MAX_NUMNODES > 1
1045 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1046 MEM_CGROUP_TARGET_NUMAINFO);
1047#endif
1048 preempt_enable();
1049
1050 mem_cgroup_threshold(memcg);
1051 if (unlikely(do_softlimit))
1052 mem_cgroup_update_tree(memcg, page);
1053#if MAX_NUMNODES > 1
1054 if (unlikely(do_numainfo))
1055 atomic_inc(&memcg->numainfo_events);
1056#endif
1057 } else
1058 preempt_enable();
1059}
1060
1061struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1062{
1063
1064
1065
1066
1067
1068 if (unlikely(!p))
1069 return NULL;
1070
1071 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1072}
1073
1074static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1075{
1076 struct mem_cgroup *memcg = NULL;
1077
1078 rcu_read_lock();
1079 do {
1080
1081
1082
1083
1084
1085 if (unlikely(!mm))
1086 memcg = root_mem_cgroup;
1087 else {
1088 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1089 if (unlikely(!memcg))
1090 memcg = root_mem_cgroup;
1091 }
1092 } while (!css_tryget(&memcg->css));
1093 rcu_read_unlock();
1094 return memcg;
1095}
1096
1097
1098
1099
1100
1101
1102
1103static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1104 struct mem_cgroup *last_visited)
1105{
1106 struct cgroup_subsys_state *prev_css, *next_css;
1107
1108 prev_css = last_visited ? &last_visited->css : NULL;
1109skip_node:
1110 next_css = css_next_descendant_pre(prev_css, &root->css);
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127 if (next_css) {
1128 if ((next_css == &root->css) ||
1129 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1130 return mem_cgroup_from_css(next_css);
1131
1132 prev_css = next_css;
1133 goto skip_node;
1134 }
1135
1136 return NULL;
1137}
1138
1139static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1140{
1141
1142
1143
1144
1145
1146 atomic_inc(&root->dead_count);
1147}
1148
1149static struct mem_cgroup *
1150mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1151 struct mem_cgroup *root,
1152 int *sequence)
1153{
1154 struct mem_cgroup *position = NULL;
1155
1156
1157
1158
1159
1160
1161
1162
1163 *sequence = atomic_read(&root->dead_count);
1164 if (iter->last_dead_count == *sequence) {
1165 smp_rmb();
1166 position = iter->last_visited;
1167
1168
1169
1170
1171
1172
1173
1174 if (position && position != root &&
1175 !css_tryget(&position->css))
1176 position = NULL;
1177 }
1178 return position;
1179}
1180
1181static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1182 struct mem_cgroup *last_visited,
1183 struct mem_cgroup *new_position,
1184 struct mem_cgroup *root,
1185 int sequence)
1186{
1187
1188 if (last_visited && last_visited != root)
1189 css_put(&last_visited->css);
1190
1191
1192
1193
1194
1195
1196 iter->last_visited = new_position;
1197 smp_wmb();
1198 iter->last_dead_count = sequence;
1199}
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1219 struct mem_cgroup *prev,
1220 struct mem_cgroup_reclaim_cookie *reclaim)
1221{
1222 struct mem_cgroup *memcg = NULL;
1223 struct mem_cgroup *last_visited = NULL;
1224
1225 if (mem_cgroup_disabled())
1226 return NULL;
1227
1228 if (!root)
1229 root = root_mem_cgroup;
1230
1231 if (prev && !reclaim)
1232 last_visited = prev;
1233
1234 if (!root->use_hierarchy && root != root_mem_cgroup) {
1235 if (prev)
1236 goto out_css_put;
1237 return root;
1238 }
1239
1240 rcu_read_lock();
1241 while (!memcg) {
1242 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1243 int uninitialized_var(seq);
1244
1245 if (reclaim) {
1246 int nid = zone_to_nid(reclaim->zone);
1247 int zid = zone_idx(reclaim->zone);
1248 struct mem_cgroup_per_zone *mz;
1249
1250 mz = mem_cgroup_zoneinfo(root, nid, zid);
1251 iter = &mz->reclaim_iter[reclaim->priority];
1252 if (prev && reclaim->generation != iter->generation) {
1253 iter->last_visited = NULL;
1254 goto out_unlock;
1255 }
1256
1257 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1258 }
1259
1260 memcg = __mem_cgroup_iter_next(root, last_visited);
1261
1262 if (reclaim) {
1263 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1264 seq);
1265
1266 if (!memcg)
1267 iter->generation++;
1268 else if (!prev && memcg)
1269 reclaim->generation = iter->generation;
1270 }
1271
1272 if (prev && !memcg)
1273 goto out_unlock;
1274 }
1275out_unlock:
1276 rcu_read_unlock();
1277out_css_put:
1278 if (prev && prev != root)
1279 css_put(&prev->css);
1280
1281 return memcg;
1282}
1283
1284
1285
1286
1287
1288
1289void mem_cgroup_iter_break(struct mem_cgroup *root,
1290 struct mem_cgroup *prev)
1291{
1292 if (!root)
1293 root = root_mem_cgroup;
1294 if (prev && prev != root)
1295 css_put(&prev->css);
1296}
1297
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for cleanup.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
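
/*
 * Illustrative usage sketch: a hierarchy walk that may bail out early has
 * to drop the reference the iterator took on the last visited group:
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 *
 * (some_condition() is a placeholder; mem_cgroup_oom_trylock() below is a
 * real in-tree example of this pattern.)
 */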
1312
1313void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1314{
1315 struct mem_cgroup *memcg;
1316
1317 rcu_read_lock();
1318 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1319 if (unlikely(!memcg))
1320 goto out;
1321
1322 switch (idx) {
1323 case PGFAULT:
1324 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1325 break;
1326 case PGMAJFAULT:
1327 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1328 break;
1329 default:
1330 BUG();
1331 }
1332out:
1333 rcu_read_unlock();
1334}
1335EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1347 struct mem_cgroup *memcg)
1348{
1349 struct mem_cgroup_per_zone *mz;
1350 struct lruvec *lruvec;
1351
1352 if (mem_cgroup_disabled()) {
1353 lruvec = &zone->lruvec;
1354 goto out;
1355 }
1356
1357 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1358 lruvec = &mz->lruvec;
1359out:
1360
1361
1362
1363
1364
1365 if (unlikely(lruvec->zone != zone))
1366 lruvec->zone = zone;
1367 return lruvec;
1368}
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1390{
1391 struct mem_cgroup_per_zone *mz;
1392 struct mem_cgroup *memcg;
1393 struct page_cgroup *pc;
1394 struct lruvec *lruvec;
1395
1396 if (mem_cgroup_disabled()) {
1397 lruvec = &zone->lruvec;
1398 goto out;
1399 }
1400
1401 pc = lookup_page_cgroup(page);
1402 memcg = pc->mem_cgroup;
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1414 pc->mem_cgroup = memcg = root_mem_cgroup;
1415
1416 mz = page_cgroup_zoneinfo(memcg, page);
1417 lruvec = &mz->lruvec;
1418out:
1419
1420
1421
1422
1423
1424 if (unlikely(lruvec->zone != zone))
1425 lruvec->zone = zone;
1426 return lruvec;
1427}
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1439 int nr_pages)
1440{
1441 struct mem_cgroup_per_zone *mz;
1442 unsigned long *lru_size;
1443
1444 if (mem_cgroup_disabled())
1445 return;
1446
1447 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1448 lru_size = mz->lru_size + lru;
1449 *lru_size += nr_pages;
1450 VM_BUG_ON((long)(*lru_size) < 0);
1451}
1452
1453
1454
1455
1456
1457bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1458 struct mem_cgroup *memcg)
1459{
1460 if (root_memcg == memcg)
1461 return true;
1462 if (!root_memcg->use_hierarchy || !memcg)
1463 return false;
1464 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1465}
1466
1467static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1468 struct mem_cgroup *memcg)
1469{
1470 bool ret;
1471
1472 rcu_read_lock();
1473 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1474 rcu_read_unlock();
1475 return ret;
1476}
1477
1478bool task_in_mem_cgroup(struct task_struct *task,
1479 const struct mem_cgroup *memcg)
1480{
1481 struct mem_cgroup *curr = NULL;
1482 struct task_struct *p;
1483 bool ret;
1484
1485 p = find_lock_task_mm(task);
1486 if (p) {
1487 curr = get_mem_cgroup_from_mm(p->mm);
1488 task_unlock(p);
1489 } else {
1490
1491
1492
1493
1494
1495 rcu_read_lock();
1496 curr = mem_cgroup_from_task(task);
1497 if (curr)
1498 css_get(&curr->css);
1499 rcu_read_unlock();
1500 }
1501
1502
1503
1504
1505
1506
1507 ret = mem_cgroup_same_or_subtree(memcg, curr);
1508 css_put(&curr->css);
1509 return ret;
1510}
1511
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}
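
/*
 * Worked example for the ratio above (illustrative): with 4GB of anon pages
 * on this lruvec, gb = 4 and inactive_ratio = int_sqrt(40) = 6, so the
 * inactive anon list is considered low (and the active list gets more aging
 * pressure) once active > 6 * inactive.  Below 1GB the ratio degenerates to
 * a plain 1:1 comparison.
 */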
1530
1531#define mem_cgroup_from_res_counter(counter, member) \
1532 container_of(counter, struct mem_cgroup, member)
1533
1534
1535
1536
1537
1538
1539
1540
1541static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1542{
1543 unsigned long long margin;
1544
1545 margin = res_counter_margin(&memcg->res);
1546 if (do_swap_account)
1547 margin = min(margin, res_counter_margin(&memcg->memsw));
1548 return margin >> PAGE_SHIFT;
1549}
1550
1551int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1552{
1553
1554 if (!css_parent(&memcg->css))
1555 return vm_swappiness;
1556
1557 return memcg->swappiness;
1558}
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576atomic_t memcg_moving __read_mostly;
1577
1578static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1579{
1580 atomic_inc(&memcg_moving);
1581 atomic_inc(&memcg->moving_account);
1582 synchronize_rcu();
1583}
1584
1585static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1586{
1587
1588
1589
1590
1591 if (memcg) {
1592 atomic_dec(&memcg_moving);
1593 atomic_dec(&memcg->moving_account);
1594 }
1595}
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1610{
1611 VM_BUG_ON(!rcu_read_lock_held());
1612 return atomic_read(&memcg->moving_account) > 0;
1613}
1614
1615static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1616{
1617 struct mem_cgroup *from;
1618 struct mem_cgroup *to;
1619 bool ret = false;
1620
1621
1622
1623
1624 spin_lock(&mc.lock);
1625 from = mc.from;
1626 to = mc.to;
1627 if (!from)
1628 goto unlock;
1629
1630 ret = mem_cgroup_same_or_subtree(memcg, from)
1631 || mem_cgroup_same_or_subtree(memcg, to);
1632unlock:
1633 spin_unlock(&mc.lock);
1634 return ret;
1635}
1636
1637static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1638{
1639 if (mc.moving_task && current != mc.moving_task) {
1640 if (mem_cgroup_under_move(memcg)) {
1641 DEFINE_WAIT(wait);
1642 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1643
1644 if (mc.moving_task)
1645 schedule();
1646 finish_wait(&mc.waitq, &wait);
1647 return true;
1648 }
1649 }
1650 return false;
1651}
1652
1653
1654
1655
1656
1657
1658
1659static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1660 unsigned long *flags)
1661{
1662 spin_lock_irqsave(&memcg->move_lock, *flags);
1663}
1664
1665static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1666 unsigned long *flags)
1667{
1668 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1669}
1670
1671#define K(x) ((x) << (PAGE_SHIFT-10))
1672
1673
1674
1675
1676
1677
1678
1679
1680void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1681{
1682
1683 static DEFINE_MUTEX(oom_info_lock);
1684 struct mem_cgroup *iter;
1685 unsigned int i;
1686
1687 if (!p)
1688 return;
1689
1690 mutex_lock(&oom_info_lock);
1691 rcu_read_lock();
1692
1693 pr_info("Task in ");
1694 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1695 pr_info(" killed as a result of limit of ");
1696 pr_cont_cgroup_path(memcg->css.cgroup);
1697 pr_info("\n");
1698
1699 rcu_read_unlock();
1700
1701 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1702 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1703 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1704 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1705 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1706 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1707 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1708 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1709 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1710 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1711 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1712 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1713
1714 for_each_mem_cgroup_tree(iter, memcg) {
1715 pr_info("Memory cgroup stats for ");
1716 pr_cont_cgroup_path(iter->css.cgroup);
1717 pr_cont(":");
1718
1719 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1720 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1721 continue;
1722 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1723 K(mem_cgroup_read_stat(iter, i)));
1724 }
1725
1726 for (i = 0; i < NR_LRU_LISTS; i++)
1727 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1728 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1729
1730 pr_cont("\n");
1731 }
1732 mutex_unlock(&oom_info_lock);
1733}
1734
1735
1736
1737
1738
1739static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1740{
1741 int num = 0;
1742 struct mem_cgroup *iter;
1743
1744 for_each_mem_cgroup_tree(iter, memcg)
1745 num++;
1746 return num;
1747}
1748
/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

	/*
	 * Do not consider swap space if we cannot swap due to swappiness.
	 */
	if (mem_cgroup_swappiness(memcg)) {
		u64 memsw;

		limit += total_swap_pages << PAGE_SHIFT;
		memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

		/*
		 * If memsw is finite and limits the amount of swap space
		 * available to this memcg, return that limit.
		 */
		limit = min(limit, memsw);
	}

	return limit;
}
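
/*
 * Worked example (illustrative): with memory.limit_in_bytes = 1G, 4G of
 * total swap and an unlimited memsw, the OOM killer sizes its badness
 * calculation against min(1G + 4G, RES_COUNTER_MAX) = 5G.  With
 * memsw.limit_in_bytes = 2G it would use min(5G, 2G) = 2G instead, and
 * with swappiness == 0 just the 1G memory limit.
 */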
1776
1777static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1778 int order)
1779{
1780 struct mem_cgroup *iter;
1781 unsigned long chosen_points = 0;
1782 unsigned long totalpages;
1783 unsigned int points = 0;
1784 struct task_struct *chosen = NULL;
1785
1786
1787
1788
1789
1790
1791 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1792 set_thread_flag(TIF_MEMDIE);
1793 return;
1794 }
1795
1796 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1797 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1798 for_each_mem_cgroup_tree(iter, memcg) {
1799 struct css_task_iter it;
1800 struct task_struct *task;
1801
1802 css_task_iter_start(&iter->css, &it);
1803 while ((task = css_task_iter_next(&it))) {
1804 switch (oom_scan_process_thread(task, totalpages, NULL,
1805 false)) {
1806 case OOM_SCAN_SELECT:
1807 if (chosen)
1808 put_task_struct(chosen);
1809 chosen = task;
1810 chosen_points = ULONG_MAX;
1811 get_task_struct(chosen);
1812
1813 case OOM_SCAN_CONTINUE:
1814 continue;
1815 case OOM_SCAN_ABORT:
1816 css_task_iter_end(&it);
1817 mem_cgroup_iter_break(memcg, iter);
1818 if (chosen)
1819 put_task_struct(chosen);
1820 return;
1821 case OOM_SCAN_OK:
1822 break;
1823 };
1824 points = oom_badness(task, memcg, NULL, totalpages);
1825 if (!points || points < chosen_points)
1826 continue;
1827
1828 if (points == chosen_points &&
1829 thread_group_leader(chosen))
1830 continue;
1831
1832 if (chosen)
1833 put_task_struct(chosen);
1834 chosen = task;
1835 chosen_points = points;
1836 get_task_struct(chosen);
1837 }
1838 css_task_iter_end(&it);
1839 }
1840
1841 if (!chosen)
1842 return;
1843 points = chosen_points * 1000 / totalpages;
1844 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1845 NULL, "Memory cgroup out of memory");
1846}
1847
1848static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1849 gfp_t gfp_mask,
1850 unsigned long flags)
1851{
1852 unsigned long total = 0;
1853 bool noswap = false;
1854 int loop;
1855
1856 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1857 noswap = true;
1858 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1859 noswap = true;
1860
1861 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1862 if (loop)
1863 drain_all_stock_async(memcg);
1864 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1865
1866
1867
1868
1869
1870 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1871 break;
1872 if (mem_cgroup_margin(memcg))
1873 break;
1874
1875
1876
1877
1878 if (loop && !total)
1879 break;
1880 }
1881 return total;
1882}
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1895 int nid, bool noswap)
1896{
1897 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1898 return true;
1899 if (noswap || !total_swap_pages)
1900 return false;
1901 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1902 return true;
1903 return false;
1904
1905}
1906#if MAX_NUMNODES > 1
1907
1908
1909
1910
1911
1912
1913
1914static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1915{
1916 int nid;
1917
1918
1919
1920
1921 if (!atomic_read(&memcg->numainfo_events))
1922 return;
1923 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1924 return;
1925
1926
1927 memcg->scan_nodes = node_states[N_MEMORY];
1928
1929 for_each_node_mask(nid, node_states[N_MEMORY]) {
1930
1931 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1932 node_clear(nid, memcg->scan_nodes);
1933 }
1934
1935 atomic_set(&memcg->numainfo_events, 0);
1936 atomic_set(&memcg->numainfo_updating, 0);
1937}
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1952{
1953 int node;
1954
1955 mem_cgroup_may_update_nodemask(memcg);
1956 node = memcg->last_scanned_node;
1957
1958 node = next_node(node, memcg->scan_nodes);
1959 if (node == MAX_NUMNODES)
1960 node = first_node(memcg->scan_nodes);
1961
1962
1963
1964
1965
1966
1967 if (unlikely(node == MAX_NUMNODES))
1968 node = numa_node_id();
1969
1970 memcg->last_scanned_node = node;
1971 return node;
1972}
1973
1974
1975
1976
1977
1978
1979
1980static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1981{
1982 int nid;
1983
1984
1985
1986
1987
1988 if (!nodes_empty(memcg->scan_nodes)) {
1989 for (nid = first_node(memcg->scan_nodes);
1990 nid < MAX_NUMNODES;
1991 nid = next_node(nid, memcg->scan_nodes)) {
1992
1993 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1994 return true;
1995 }
1996 }
1997
1998
1999
2000 for_each_node_state(nid, N_MEMORY) {
2001 if (node_isset(nid, memcg->scan_nodes))
2002 continue;
2003 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2004 return true;
2005 }
2006 return false;
2007}
2008
2009#else
2010int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2011{
2012 return 0;
2013}
2014
2015static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2016{
2017 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2018}
2019#endif
2020
2021static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2022 struct zone *zone,
2023 gfp_t gfp_mask,
2024 unsigned long *total_scanned)
2025{
2026 struct mem_cgroup *victim = NULL;
2027 int total = 0;
2028 int loop = 0;
2029 unsigned long excess;
2030 unsigned long nr_scanned;
2031 struct mem_cgroup_reclaim_cookie reclaim = {
2032 .zone = zone,
2033 .priority = 0,
2034 };
2035
2036 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2037
2038 while (1) {
2039 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2040 if (!victim) {
2041 loop++;
2042 if (loop >= 2) {
2043
2044
2045
2046
2047
2048 if (!total)
2049 break;
2050
2051
2052
2053
2054
2055
2056 if (total >= (excess >> 2) ||
2057 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2058 break;
2059 }
2060 continue;
2061 }
2062 if (!mem_cgroup_reclaimable(victim, false))
2063 continue;
2064 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2065 zone, &nr_scanned);
2066 *total_scanned += nr_scanned;
2067 if (!res_counter_soft_limit_excess(&root_memcg->res))
2068 break;
2069 }
2070 mem_cgroup_iter_break(root_memcg, victim);
2071 return total;
2072}
2073
2074#ifdef CONFIG_LOCKDEP
2075static struct lockdep_map memcg_oom_lock_dep_map = {
2076 .name = "memcg_oom_lock",
2077};
2078#endif
2079
2080static DEFINE_SPINLOCK(memcg_oom_lock);
2081
2082
2083
2084
2085
2086static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2087{
2088 struct mem_cgroup *iter, *failed = NULL;
2089
2090 spin_lock(&memcg_oom_lock);
2091
2092 for_each_mem_cgroup_tree(iter, memcg) {
2093 if (iter->oom_lock) {
2094
2095
2096
2097
2098 failed = iter;
2099 mem_cgroup_iter_break(memcg, iter);
2100 break;
2101 } else
2102 iter->oom_lock = true;
2103 }
2104
2105 if (failed) {
2106
2107
2108
2109
2110 for_each_mem_cgroup_tree(iter, memcg) {
2111 if (iter == failed) {
2112 mem_cgroup_iter_break(memcg, iter);
2113 break;
2114 }
2115 iter->oom_lock = false;
2116 }
2117 } else
2118 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2119
2120 spin_unlock(&memcg_oom_lock);
2121
2122 return !failed;
2123}
2124
2125static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2126{
2127 struct mem_cgroup *iter;
2128
2129 spin_lock(&memcg_oom_lock);
2130 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
2131 for_each_mem_cgroup_tree(iter, memcg)
2132 iter->oom_lock = false;
2133 spin_unlock(&memcg_oom_lock);
2134}
2135
2136static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2137{
2138 struct mem_cgroup *iter;
2139
2140 for_each_mem_cgroup_tree(iter, memcg)
2141 atomic_inc(&iter->under_oom);
2142}
2143
2144static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2145{
2146 struct mem_cgroup *iter;
2147
2148
2149
2150
2151
2152
2153 for_each_mem_cgroup_tree(iter, memcg)
2154 atomic_add_unless(&iter->under_oom, -1, 0);
2155}
2156
2157static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2158
2159struct oom_wait_info {
2160 struct mem_cgroup *memcg;
2161 wait_queue_t wait;
2162};
2163
2164static int memcg_oom_wake_function(wait_queue_t *wait,
2165 unsigned mode, int sync, void *arg)
2166{
2167 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2168 struct mem_cgroup *oom_wait_memcg;
2169 struct oom_wait_info *oom_wait_info;
2170
2171 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2172 oom_wait_memcg = oom_wait_info->memcg;
2173
2174
2175
2176
2177
2178 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2179 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2180 return 0;
2181 return autoremove_wake_function(wait, mode, sync, arg);
2182}
2183
2184static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2185{
2186 atomic_inc(&memcg->oom_wakeups);
2187
2188 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2189}
2190
2191static void memcg_oom_recover(struct mem_cgroup *memcg)
2192{
2193 if (memcg && atomic_read(&memcg->under_oom))
2194 memcg_wakeup_oom(memcg);
2195}
2196
2197static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2198{
2199 if (!current->memcg_oom.may_oom)
2200 return;
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215 css_get(&memcg->css);
2216 current->memcg_oom.memcg = memcg;
2217 current->memcg_oom.gfp_mask = mask;
2218 current->memcg_oom.order = order;
2219}
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238bool mem_cgroup_oom_synchronize(bool handle)
2239{
2240 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2241 struct oom_wait_info owait;
2242 bool locked;
2243
2244
2245 if (!memcg)
2246 return false;
2247
2248 if (!handle)
2249 goto cleanup;
2250
2251 owait.memcg = memcg;
2252 owait.wait.flags = 0;
2253 owait.wait.func = memcg_oom_wake_function;
2254 owait.wait.private = current;
2255 INIT_LIST_HEAD(&owait.wait.task_list);
2256
2257 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2258 mem_cgroup_mark_under_oom(memcg);
2259
2260 locked = mem_cgroup_oom_trylock(memcg);
2261
2262 if (locked)
2263 mem_cgroup_oom_notify(memcg);
2264
2265 if (locked && !memcg->oom_kill_disable) {
2266 mem_cgroup_unmark_under_oom(memcg);
2267 finish_wait(&memcg_oom_waitq, &owait.wait);
2268 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2269 current->memcg_oom.order);
2270 } else {
2271 schedule();
2272 mem_cgroup_unmark_under_oom(memcg);
2273 finish_wait(&memcg_oom_waitq, &owait.wait);
2274 }
2275
2276 if (locked) {
2277 mem_cgroup_oom_unlock(memcg);
2278
2279
2280
2281
2282
2283 memcg_oom_recover(memcg);
2284 }
2285cleanup:
2286 current->memcg_oom.memcg = NULL;
2287 css_put(&memcg->css);
2288 return true;
2289}
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315void __mem_cgroup_begin_update_page_stat(struct page *page,
2316 bool *locked, unsigned long *flags)
2317{
2318 struct mem_cgroup *memcg;
2319 struct page_cgroup *pc;
2320
2321 pc = lookup_page_cgroup(page);
2322again:
2323 memcg = pc->mem_cgroup;
2324 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2325 return;
2326
2327
2328
2329
2330
2331
2332 if (!mem_cgroup_stolen(memcg))
2333 return;
2334
2335 move_lock_mem_cgroup(memcg, flags);
2336 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2337 move_unlock_mem_cgroup(memcg, flags);
2338 goto again;
2339 }
2340 *locked = true;
2341}
2342
2343void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2344{
2345 struct page_cgroup *pc = lookup_page_cgroup(page);
2346
2347
2348
2349
2350
2351
2352 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2353}
2354
2355void mem_cgroup_update_page_stat(struct page *page,
2356 enum mem_cgroup_stat_index idx, int val)
2357{
2358 struct mem_cgroup *memcg;
2359 struct page_cgroup *pc = lookup_page_cgroup(page);
2360 unsigned long uninitialized_var(flags);
2361
2362 if (mem_cgroup_disabled())
2363 return;
2364
2365 VM_BUG_ON(!rcu_read_lock_held());
2366 memcg = pc->mem_cgroup;
2367 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2368 return;
2369
2370 this_cpu_add(memcg->stat->count[idx], val);
2371}
2372
/*
 * Size of the first charge trial.  "32" comes from vmscan.c's
 * SWAP_CLUSTER_MAX.
 */
#define CHARGE_BATCH	32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached;	/* never the root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	if (nr_pages > CHARGE_BATCH)
		return false;

	stock = &get_cpu_var(memcg_stock);
	if (memcg == stock->cached && stock->nr_pages >= nr_pages)
		stock->nr_pages -= nr_pages;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}
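
/*
 * Illustrative sketch: the stock turns most single-page charges into plain
 * per-cpu arithmetic.  The typical fast path looks like
 *
 *	if (consume_stock(memcg, 1))
 *		return 0;		(no res_counter locking at all)
 *
 * and only when the local stock is empty (or cached for another memcg) does
 * the slow path hit the res_counter, charging CHARGE_BATCH pages at once
 * and putting the surplus back via refill_stock() below.
 */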
2415
2416
2417
2418
2419static void drain_stock(struct memcg_stock_pcp *stock)
2420{
2421 struct mem_cgroup *old = stock->cached;
2422
2423 if (stock->nr_pages) {
2424 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2425
2426 res_counter_uncharge(&old->res, bytes);
2427 if (do_swap_account)
2428 res_counter_uncharge(&old->memsw, bytes);
2429 stock->nr_pages = 0;
2430 }
2431 stock->cached = NULL;
2432}
2433
2434
2435
2436
2437
2438static void drain_local_stock(struct work_struct *dummy)
2439{
2440 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2441 drain_stock(stock);
2442 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2443}
2444
2445static void __init memcg_stock_init(void)
2446{
2447 int cpu;
2448
2449 for_each_possible_cpu(cpu) {
2450 struct memcg_stock_pcp *stock =
2451 &per_cpu(memcg_stock, cpu);
2452 INIT_WORK(&stock->work, drain_local_stock);
2453 }
2454}
2455
/*
 * Cache charges obtained from the res_counter in the local per-cpu stock,
 * to be handed out later by consume_stock().
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != memcg) {	/* reset if necessary */
		drain_stock(stock);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}
2471
2472
2473
2474
2475
2476
2477static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2478{
2479 int cpu, curcpu;
2480
2481
2482 get_online_cpus();
2483 curcpu = get_cpu();
2484 for_each_online_cpu(cpu) {
2485 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2486 struct mem_cgroup *memcg;
2487
2488 memcg = stock->cached;
2489 if (!memcg || !stock->nr_pages)
2490 continue;
2491 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2492 continue;
2493 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2494 if (cpu == curcpu)
2495 drain_local_stock(&stock->work);
2496 else
2497 schedule_work_on(cpu, &stock->work);
2498 }
2499 }
2500 put_cpu();
2501
2502 if (!sync)
2503 goto out;
2504
2505 for_each_online_cpu(cpu) {
2506 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2507 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2508 flush_work(&stock->work);
2509 }
2510out:
2511 put_online_cpus();
2512}
2513
2514
2515
2516
2517
2518
2519
2520static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2521{
2522
2523
2524
2525 if (!mutex_trylock(&percpu_charge_mutex))
2526 return;
2527 drain_all_stock(root_memcg, false);
2528 mutex_unlock(&percpu_charge_mutex);
2529}
2530
2531
2532static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2533{
2534
2535 mutex_lock(&percpu_charge_mutex);
2536 drain_all_stock(root_memcg, true);
2537 mutex_unlock(&percpu_charge_mutex);
2538}
2539
2540
2541
2542
2543
2544static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2545{
2546 int i;
2547
2548 spin_lock(&memcg->pcp_counter_lock);
2549 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2550 long x = per_cpu(memcg->stat->count[i], cpu);
2551
2552 per_cpu(memcg->stat->count[i], cpu) = 0;
2553 memcg->nocpu_base.count[i] += x;
2554 }
2555 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2556 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2557
2558 per_cpu(memcg->stat->events[i], cpu) = 0;
2559 memcg->nocpu_base.events[i] += x;
2560 }
2561 spin_unlock(&memcg->pcp_counter_lock);
2562}
2563
2564static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2565 unsigned long action,
2566 void *hcpu)
2567{
2568 int cpu = (unsigned long)hcpu;
2569 struct memcg_stock_pcp *stock;
2570 struct mem_cgroup *iter;
2571
2572 if (action == CPU_ONLINE)
2573 return NOTIFY_OK;
2574
2575 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2576 return NOTIFY_OK;
2577
2578 for_each_mem_cgroup(iter)
2579 mem_cgroup_drain_pcp_counter(iter, cpu);
2580
2581 stock = &per_cpu(memcg_stock, cpu);
2582 drain_stock(stock);
2583 return NOTIFY_OK;
2584}
2585
2586
2587
2588enum {
2589 CHARGE_OK,
2590 CHARGE_RETRY,
2591 CHARGE_NOMEM,
2592 CHARGE_WOULDBLOCK,
2593};
2594
2595static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2596 unsigned int nr_pages, unsigned int min_pages,
2597 bool invoke_oom)
2598{
2599 unsigned long csize = nr_pages * PAGE_SIZE;
2600 struct mem_cgroup *mem_over_limit;
2601 struct res_counter *fail_res;
2602 unsigned long flags = 0;
2603 int ret;
2604
2605 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2606
2607 if (likely(!ret)) {
2608 if (!do_swap_account)
2609 return CHARGE_OK;
2610 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2611 if (likely(!ret))
2612 return CHARGE_OK;
2613
2614 res_counter_uncharge(&memcg->res, csize);
2615 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2616 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2617 } else
2618 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2619
2620
2621
2622
2623 if (nr_pages > min_pages)
2624 return CHARGE_RETRY;
2625
2626 if (!(gfp_mask & __GFP_WAIT))
2627 return CHARGE_WOULDBLOCK;
2628
2629 if (gfp_mask & __GFP_NORETRY)
2630 return CHARGE_NOMEM;
2631
2632 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2633 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2634 return CHARGE_RETRY;
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2645 return CHARGE_RETRY;
2646
2647
2648
2649
2650
2651 if (mem_cgroup_wait_acct_move(mem_over_limit))
2652 return CHARGE_RETRY;
2653
2654 if (invoke_oom)
2655 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2656
2657 return CHARGE_NOMEM;
2658}
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2670 gfp_t gfp_mask,
2671 unsigned int nr_pages,
2672 bool oom)
2673{
2674 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2675 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2676 int ret;
2677
2678 if (mem_cgroup_is_root(memcg))
2679 goto done;
2680
2681
2682
2683
2684
2685
2686 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2687 fatal_signal_pending(current)))
2688 goto bypass;
2689
2690 if (unlikely(task_in_memcg_oom(current)))
2691 goto nomem;
2692
2693 if (gfp_mask & __GFP_NOFAIL)
2694 oom = false;
2695again:
2696 if (consume_stock(memcg, nr_pages))
2697 goto done;
2698
2699 do {
2700 bool invoke_oom = oom && !nr_oom_retries;
2701
2702
2703 if (fatal_signal_pending(current))
2704 goto bypass;
2705
2706 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2707 nr_pages, invoke_oom);
2708 switch (ret) {
2709 case CHARGE_OK:
2710 break;
2711 case CHARGE_RETRY:
2712 batch = nr_pages;
2713 goto again;
2714 case CHARGE_WOULDBLOCK:
2715 goto nomem;
2716 case CHARGE_NOMEM:
2717 if (!oom || invoke_oom)
2718 goto nomem;
2719 nr_oom_retries--;
2720 break;
2721 }
2722 } while (ret != CHARGE_OK);
2723
2724 if (batch > nr_pages)
2725 refill_stock(memcg, batch - nr_pages);
2726done:
2727 return 0;
2728nomem:
2729 if (!(gfp_mask & __GFP_NOFAIL))
2730 return -ENOMEM;
2731bypass:
2732 return -EINTR;
2733}
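
/*
 * Worked example (illustrative): a single-page charge that misses the stock
 * charges batch = CHARGE_BATCH = 32 pages against the res_counter, then
 * calls refill_stock(memcg, 31) so the next 31 single-page charges on this
 * CPU are served by consume_stock() alone.  Only if the 32-page charge
 * fails does mem_cgroup_do_charge() return CHARGE_RETRY and the loop retry
 * with the precise nr_pages, falling back to reclaim and finally the OOM
 * path.
 */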
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2745 gfp_t gfp_mask,
2746 unsigned int nr_pages,
2747 bool oom)
2748
2749{
2750 struct mem_cgroup *memcg;
2751 int ret;
2752
2753 memcg = get_mem_cgroup_from_mm(mm);
2754 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
2755 css_put(&memcg->css);
2756 if (ret == -EINTR)
2757 memcg = root_mem_cgroup;
2758 else if (ret)
2759 memcg = NULL;
2760
2761 return memcg;
2762}
2763
2764
2765
2766
2767
2768
2769static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2770 unsigned int nr_pages)
2771{
2772 if (!mem_cgroup_is_root(memcg)) {
2773 unsigned long bytes = nr_pages * PAGE_SIZE;
2774
2775 res_counter_uncharge(&memcg->res, bytes);
2776 if (do_swap_account)
2777 res_counter_uncharge(&memcg->memsw, bytes);
2778 }
2779}
2780
2781
2782
2783
2784
2785static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2786 unsigned int nr_pages)
2787{
2788 unsigned long bytes = nr_pages * PAGE_SIZE;
2789
2790 if (mem_cgroup_is_root(memcg))
2791 return;
2792
2793 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2794 if (do_swap_account)
2795 res_counter_uncharge_until(&memcg->memsw,
2796 memcg->memsw.parent, bytes);
2797}
2798
2799
2800
2801
2802
2803
2804
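/*
 * Look up a memcg by the id recorded in swap_cgroup.  An id of 0 means
 * "not recorded" and yields NULL.  The caller is responsible for holding
 * rcu_read_lock() and pinning the result with css_tryget().
 */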
2805static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2806{
2807
2808 if (!id)
2809 return NULL;
2810 return mem_cgroup_from_id(id);
2811}
2812
2813struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2814{
2815 struct mem_cgroup *memcg = NULL;
2816 struct page_cgroup *pc;
2817 unsigned short id;
2818 swp_entry_t ent;
2819
2820 VM_BUG_ON_PAGE(!PageLocked(page), page);
2821
2822 pc = lookup_page_cgroup(page);
2823 lock_page_cgroup(pc);
2824 if (PageCgroupUsed(pc)) {
2825 memcg = pc->mem_cgroup;
2826 if (memcg && !css_tryget(&memcg->css))
2827 memcg = NULL;
2828 } else if (PageSwapCache(page)) {
2829 ent.val = page_private(page);
2830 id = lookup_swap_cgroup_id(ent);
2831 rcu_read_lock();
2832 memcg = mem_cgroup_lookup(id);
2833 if (memcg && !css_tryget(&memcg->css))
2834 memcg = NULL;
2835 rcu_read_unlock();
2836 }
2837 unlock_page_cgroup(pc);
2838 return memcg;
2839}
2840
2841static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2842 struct page *page,
2843 unsigned int nr_pages,
2844 enum charge_type ctype,
2845 bool lrucare)
2846{
2847 struct page_cgroup *pc = lookup_page_cgroup(page);
2848 struct zone *uninitialized_var(zone);
2849 struct lruvec *lruvec;
2850 bool was_on_lru = false;
2851 bool anon;
2852
2853 lock_page_cgroup(pc);
2854 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2855
2856
2857
2858
2859
2860
2861
2862
2863
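	/*
	 * With lrucare, the page may already sit on an LRU list that is
	 * keyed to a different memcg.  Take it off that list under
	 * zone->lru_lock before rewriting pc->mem_cgroup, and put it
	 * back on the new memcg's lruvec afterwards.
	 */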
2864 if (lrucare) {
2865 zone = page_zone(page);
2866 spin_lock_irq(&zone->lru_lock);
2867 if (PageLRU(page)) {
2868 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2869 ClearPageLRU(page);
2870 del_page_from_lru_list(page, lruvec, page_lru(page));
2871 was_on_lru = true;
2872 }
2873 }
2874
2875 pc->mem_cgroup = memcg;
2876
2877
2878
2879
2880
2881
2882
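	/*
	 * Publish pc->mem_cgroup before setting the Used bit, so that a
	 * reader who observes PCG_USED also sees the new owner.
	 */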
2883 smp_wmb();
2884 SetPageCgroupUsed(pc);
2885
2886 if (lrucare) {
2887 if (was_on_lru) {
2888 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2889 VM_BUG_ON_PAGE(PageLRU(page), page);
2890 SetPageLRU(page);
2891 add_page_to_lru_list(page, lruvec, page_lru(page));
2892 }
2893 spin_unlock_irq(&zone->lru_lock);
2894 }
2895
2896 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2897 anon = true;
2898 else
2899 anon = false;
2900
2901 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2902 unlock_page_cgroup(pc);
2903
2904
2905
2906
2907
2908
2909 memcg_check_events(memcg, page);
2910}
2911
2912static DEFINE_MUTEX(set_limit_mutex);
2913
2914#ifdef CONFIG_MEMCG_KMEM
2915static DEFINE_MUTEX(activate_kmem_mutex);
2916
2917static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2918{
2919 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2920 memcg_kmem_is_active(memcg);
2921}
2922
2923
2924
2925
2926
2927static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2928{
2929 struct kmem_cache *cachep;
2930
2931 VM_BUG_ON(p->is_root_cache);
2932 cachep = p->root_cache;
2933 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2934}
2935
2936#ifdef CONFIG_SLABINFO
2937static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2938{
2939 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2940 struct memcg_cache_params *params;
2941
2942 if (!memcg_can_account_kmem(memcg))
2943 return -EIO;
2944
2945 print_slabinfo_header(m);
2946
2947 mutex_lock(&memcg->slab_caches_mutex);
2948 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2949 cache_show(memcg_params_to_cache(params), m);
2950 mutex_unlock(&memcg->slab_caches_mutex);
2951
2952 return 0;
2953}
2954#endif
2955
2956static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2957{
2958 struct res_counter *fail_res;
2959 int ret = 0;
2960
2961 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2962 if (ret)
2963 return ret;
2964
2965 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
2966 oom_gfp_allowed(gfp));
2967 if (ret == -EINTR) {
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2984 if (do_swap_account)
2985 res_counter_charge_nofail(&memcg->memsw, size,
2986 &fail_res);
2987 ret = 0;
2988 } else if (ret)
2989 res_counter_uncharge(&memcg->kmem, size);
2990
2991 return ret;
2992}
2993
2994static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2995{
2996 res_counter_uncharge(&memcg->res, size);
2997 if (do_swap_account)
2998 res_counter_uncharge(&memcg->memsw, size);
2999
3000
3001 if (res_counter_uncharge(&memcg->kmem, size))
3002 return;
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012 if (memcg_kmem_test_and_clear_dead(memcg))
3013 css_put(&memcg->css);
3014}
3015
3016
3017
3018
3019
3020
3021int memcg_cache_id(struct mem_cgroup *memcg)
3022{
3023 return memcg ? memcg->kmemcg_id : -1;
3024}
3025
3026static size_t memcg_caches_array_size(int num_groups)
3027{
3028 ssize_t size;
3029 if (num_groups <= 0)
3030 return 0;
3031
3032 size = 2 * num_groups;
3033 if (size < MEMCG_CACHES_MIN_SIZE)
3034 size = MEMCG_CACHES_MIN_SIZE;
3035 else if (size > MEMCG_CACHES_MAX_SIZE)
3036 size = MEMCG_CACHES_MAX_SIZE;
3037
3038 return size;
3039}
3040
3041
3042
3043
3044
3045
3046void memcg_update_array_size(int num)
3047{
3048 if (num > memcg_limited_groups_array_size)
3049 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3050}
3051
3052static void kmem_cache_destroy_work_func(struct work_struct *w);
3053
3054int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3055{
3056 struct memcg_cache_params *cur_params = s->memcg_params;
3057
3058 VM_BUG_ON(!is_root_cache(s));
3059
3060 if (num_groups > memcg_limited_groups_array_size) {
3061 int i;
3062 struct memcg_cache_params *new_params;
3063 ssize_t size = memcg_caches_array_size(num_groups);
3064
3065 size *= sizeof(void *);
3066 size += offsetof(struct memcg_cache_params, memcg_caches);
3067
3068 new_params = kzalloc(size, GFP_KERNEL);
3069 if (!new_params)
3070 return -ENOMEM;
3071
3072 new_params->is_root_cache = true;
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3084 if (!cur_params->memcg_caches[i])
3085 continue;
3086 new_params->memcg_caches[i] =
3087 cur_params->memcg_caches[i];
3088 }
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099 rcu_assign_pointer(s->memcg_params, new_params);
3100 if (cur_params)
3101 kfree_rcu(cur_params, rcu_head);
3102 }
3103 return 0;
3104}
3105
3106char *memcg_create_cache_name(struct mem_cgroup *memcg,
3107 struct kmem_cache *root_cache)
3108{
3109 static char *buf = NULL;
3110
3111
3112
3113
3114
3115
3116 lockdep_assert_held(&slab_mutex);
3117
3118 if (!buf) {
3119 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3120 if (!buf)
3121 return NULL;
3122 }
3123
3124 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
3125 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
3126 memcg_cache_id(memcg), buf);
3127}
3128
3129int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3130 struct kmem_cache *root_cache)
3131{
3132 size_t size;
3133
3134 if (!memcg_kmem_enabled())
3135 return 0;
3136
3137 if (!memcg) {
3138 size = offsetof(struct memcg_cache_params, memcg_caches);
3139 size += memcg_limited_groups_array_size * sizeof(void *);
3140 } else
3141 size = sizeof(struct memcg_cache_params);
3142
3143 s->memcg_params = kzalloc(size, GFP_KERNEL);
3144 if (!s->memcg_params)
3145 return -ENOMEM;
3146
3147 if (memcg) {
3148 s->memcg_params->memcg = memcg;
3149 s->memcg_params->root_cache = root_cache;
3150 INIT_WORK(&s->memcg_params->destroy,
3151 kmem_cache_destroy_work_func);
3152 css_get(&memcg->css);
3153 } else
3154 s->memcg_params->is_root_cache = true;
3155
3156 return 0;
3157}
3158
3159void memcg_free_cache_params(struct kmem_cache *s)
3160{
3161 if (!s->memcg_params)
3162 return;
3163 if (!s->memcg_params->is_root_cache)
3164 css_put(&s->memcg_params->memcg->css);
3165 kfree(s->memcg_params);
3166}
3167
3168void memcg_register_cache(struct kmem_cache *s)
3169{
3170 struct kmem_cache *root;
3171 struct mem_cgroup *memcg;
3172 int id;
3173
3174 if (is_root_cache(s))
3175 return;
3176
3177
3178
3179
3180
3181 lockdep_assert_held(&slab_mutex);
3182
3183 root = s->memcg_params->root_cache;
3184 memcg = s->memcg_params->memcg;
3185 id = memcg_cache_id(memcg);
3186
3187
3188
3189
3190
3191
3192 smp_wmb();
3193
3194
3195
3196
3197
3198
3199 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3200 root->memcg_params->memcg_caches[id] = s;
3201
3202 mutex_lock(&memcg->slab_caches_mutex);
3203 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3204 mutex_unlock(&memcg->slab_caches_mutex);
3205}
3206
3207void memcg_unregister_cache(struct kmem_cache *s)
3208{
3209 struct kmem_cache *root;
3210 struct mem_cgroup *memcg;
3211 int id;
3212
3213 if (is_root_cache(s))
3214 return;
3215
3216
3217
3218
3219
3220 lockdep_assert_held(&slab_mutex);
3221
3222 root = s->memcg_params->root_cache;
3223 memcg = s->memcg_params->memcg;
3224 id = memcg_cache_id(memcg);
3225
3226 mutex_lock(&memcg->slab_caches_mutex);
3227 list_del(&s->memcg_params->list);
3228 mutex_unlock(&memcg->slab_caches_mutex);
3229
3230
3231
3232
3233
3234
3235 VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
3236 root->memcg_params->memcg_caches[id] = NULL;
3237}
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
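/*
 * memcg_kmem_skip_account is a per-task recursion guard: while it is
 * non-zero, kmem allocations by this task are not routed to a per-memcg
 * cache.  This keeps the cache-creation path itself from recursively
 * triggering more per-memcg accounting.
 */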
3258static inline void memcg_stop_kmem_account(void)
3259{
3260 VM_BUG_ON(!current->mm);
3261 current->memcg_kmem_skip_account++;
3262}
3263
3264static inline void memcg_resume_kmem_account(void)
3265{
3266 VM_BUG_ON(!current->mm);
3267 current->memcg_kmem_skip_account--;
3268}
3269
3270static void kmem_cache_destroy_work_func(struct work_struct *w)
3271{
3272 struct kmem_cache *cachep;
3273 struct memcg_cache_params *p;
3274
3275 p = container_of(w, struct memcg_cache_params, destroy);
3276
3277 cachep = memcg_params_to_cache(p);
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3296 kmem_cache_shrink(cachep);
3297 else
3298 kmem_cache_destroy(cachep);
3299}
3300
3301void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3302{
3303 if (!cachep->memcg_params->dead)
3304 return;
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324 if (work_pending(&cachep->memcg_params->destroy))
3325 return;
3326
3327
3328
3329
3330 schedule_work(&cachep->memcg_params->destroy);
3331}
3332
3333int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3334{
3335 struct kmem_cache *c;
3336 int i, failed = 0;
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348 mutex_lock(&activate_kmem_mutex);
3349 for_each_memcg_cache_index(i) {
3350 c = cache_from_memcg_idx(s, i);
3351 if (!c)
3352 continue;
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367 c->memcg_params->dead = false;
3368 cancel_work_sync(&c->memcg_params->destroy);
3369 kmem_cache_destroy(c);
3370
3371 if (cache_from_memcg_idx(s, i))
3372 failed++;
3373 }
3374 mutex_unlock(&activate_kmem_mutex);
3375 return failed;
3376}
3377
3378static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3379{
3380 struct kmem_cache *cachep;
3381 struct memcg_cache_params *params;
3382
3383 if (!memcg_kmem_is_active(memcg))
3384 return;
3385
3386 mutex_lock(&memcg->slab_caches_mutex);
3387 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3388 cachep = memcg_params_to_cache(params);
3389 cachep->memcg_params->dead = true;
3390 schedule_work(&cachep->memcg_params->destroy);
3391 }
3392 mutex_unlock(&memcg->slab_caches_mutex);
3393}
3394
3395struct create_work {
3396 struct mem_cgroup *memcg;
3397 struct kmem_cache *cachep;
3398 struct work_struct work;
3399};
3400
3401static void memcg_create_cache_work_func(struct work_struct *w)
3402{
3403 struct create_work *cw = container_of(w, struct create_work, work);
3404 struct mem_cgroup *memcg = cw->memcg;
3405 struct kmem_cache *cachep = cw->cachep;
3406
3407 kmem_cache_create_memcg(memcg, cachep);
3408 css_put(&memcg->css);
3409 kfree(cw);
3410}
3411
3412
3413
3414
3415static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3416 struct kmem_cache *cachep)
3417{
3418 struct create_work *cw;
3419
3420 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3421 if (cw == NULL) {
3422 css_put(&memcg->css);
3423 return;
3424 }
3425
3426 cw->memcg = memcg;
3427 cw->cachep = cachep;
3428
3429 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3430 schedule_work(&cw->work);
3431}
3432
3433static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3434 struct kmem_cache *cachep)
3435{
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447 memcg_stop_kmem_account();
3448 __memcg_create_cache_enqueue(memcg, cachep);
3449 memcg_resume_kmem_account();
3450}
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
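/*
 * Return the per-memcg copy of @cachep for the current task's memcg.
 * If no copy exists yet, schedule its creation asynchronously and fall
 * back to the root cache for this allocation.
 */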
3464struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3465 gfp_t gfp)
3466{
3467 struct mem_cgroup *memcg;
3468 struct kmem_cache *memcg_cachep;
3469
3470 VM_BUG_ON(!cachep->memcg_params);
3471 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3472
3473 if (!current->mm || current->memcg_kmem_skip_account)
3474 return cachep;
3475
3476 rcu_read_lock();
3477 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3478
3479 if (!memcg_can_account_kmem(memcg))
3480 goto out;
3481
3482 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3483 if (likely(memcg_cachep)) {
3484 cachep = memcg_cachep;
3485 goto out;
3486 }
3487
3488
3489 if (!css_tryget(&memcg->css))
3490 goto out;
3491 rcu_read_unlock();
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510 memcg_create_cache_enqueue(memcg, cachep);
3511 return cachep;
3512out:
3513 rcu_read_unlock();
3514 return cachep;
3515}
3516EXPORT_SYMBOL(__memcg_kmem_get_cache);
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
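/*
 * Charge an allocation of 2^order pages to the current task's memcg.
 * On success, *_memcg is set to the charged memcg for the later commit.
 * A true return value means the allocation may proceed, either because
 * the charge succeeded or because accounting is being skipped.
 */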
3532bool
3533__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3534{
3535 struct mem_cgroup *memcg;
3536 int ret;
3537
3538 *_memcg = NULL;
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564 if (!current->mm || current->memcg_kmem_skip_account)
3565 return true;
3566
3567 memcg = get_mem_cgroup_from_mm(current->mm);
3568
3569 if (!memcg_can_account_kmem(memcg)) {
3570 css_put(&memcg->css);
3571 return true;
3572 }
3573
3574 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3575 if (!ret)
3576 *_memcg = memcg;
3577
3578 css_put(&memcg->css);
3579 return (ret == 0);
3580}
3581
3582void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3583 int order)
3584{
3585 struct page_cgroup *pc;
3586
3587 VM_BUG_ON(mem_cgroup_is_root(memcg));
3588
3589
3590 if (!page) {
3591 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3592 return;
3593 }
3594
3595 pc = lookup_page_cgroup(page);
3596 lock_page_cgroup(pc);
3597 pc->mem_cgroup = memcg;
3598 SetPageCgroupUsed(pc);
3599 unlock_page_cgroup(pc);
3600}
3601
3602void __memcg_kmem_uncharge_pages(struct page *page, int order)
3603{
3604 struct mem_cgroup *memcg = NULL;
3605 struct page_cgroup *pc;
3606
3607
3608 pc = lookup_page_cgroup(page);
3609
3610
3611
3612
3613 if (!PageCgroupUsed(pc))
3614 return;
3615
3616 lock_page_cgroup(pc);
3617 if (PageCgroupUsed(pc)) {
3618 memcg = pc->mem_cgroup;
3619 ClearPageCgroupUsed(pc);
3620 }
3621 unlock_page_cgroup(pc);
3622
3623
3624
3625
3626
3627 if (!memcg)
3628 return;
3629
3630 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3631 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3632}
3633#else
3634static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3635{
3636}
3637#endif
3638
3639#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3640
3641#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3642
3643
3644
3645
3646
3647
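/*
 * Called when a transparent huge page is split: propagate the head
 * page's memcg to every tail page_cgroup and adjust the RSS_HUGE
 * statistic accordingly.
 */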
3648void mem_cgroup_split_huge_fixup(struct page *head)
3649{
3650 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3651 struct page_cgroup *pc;
3652 struct mem_cgroup *memcg;
3653 int i;
3654
3655 if (mem_cgroup_disabled())
3656 return;
3657
3658 memcg = head_pc->mem_cgroup;
3659 for (i = 1; i < HPAGE_PMD_NR; i++) {
3660 pc = head_pc + i;
3661 pc->mem_cgroup = memcg;
3662 smp_wmb();
3663 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3664 }
3665 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3666 HPAGE_PMD_NR);
3667}
3668#endif
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
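/*
 * Move the accounting of @page (described by @pc) from @from to @to.
 * The caller must have isolated the page from the LRU.  Returns -EBUSY
 * or -EINVAL when the move cannot be performed.
 */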
3685static int mem_cgroup_move_account(struct page *page,
3686 unsigned int nr_pages,
3687 struct page_cgroup *pc,
3688 struct mem_cgroup *from,
3689 struct mem_cgroup *to)
3690{
3691 unsigned long flags;
3692 int ret;
3693 bool anon = PageAnon(page);
3694
3695 VM_BUG_ON(from == to);
3696 VM_BUG_ON_PAGE(PageLRU(page), page);
3697
3698
3699
3700
3701
3702
3703 ret = -EBUSY;
3704 if (nr_pages > 1 && !PageTransHuge(page))
3705 goto out;
3706
3707 lock_page_cgroup(pc);
3708
3709 ret = -EINVAL;
3710 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3711 goto unlock;
3712
3713 move_lock_mem_cgroup(from, &flags);
3714
3715 if (!anon && page_mapped(page)) {
3716 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3717 nr_pages);
3718 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3719 nr_pages);
3720 }
3721
3722 if (PageWriteback(page)) {
3723 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3724 nr_pages);
3725 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3726 nr_pages);
3727 }
3728
3729 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3730
3731
3732 pc->mem_cgroup = to;
3733 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3734 move_unlock_mem_cgroup(from, &flags);
3735 ret = 0;
3736unlock:
3737 unlock_page_cgroup(pc);
3738
3739
3740
3741 memcg_check_events(to, page);
3742 memcg_check_events(from, page);
3743out:
3744 return ret;
3745}
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
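/*
 * Move the charge of @page from @child to its parent, or to
 * root_mem_cgroup when there is no parent.  Used when a memcg is being
 * emptied and its remaining pages are reparented.
 */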
3768static int mem_cgroup_move_parent(struct page *page,
3769 struct page_cgroup *pc,
3770 struct mem_cgroup *child)
3771{
3772 struct mem_cgroup *parent;
3773 unsigned int nr_pages;
3774 unsigned long uninitialized_var(flags);
3775 int ret;
3776
3777 VM_BUG_ON(mem_cgroup_is_root(child));
3778
3779 ret = -EBUSY;
3780 if (!get_page_unless_zero(page))
3781 goto out;
3782 if (isolate_lru_page(page))
3783 goto put;
3784
3785 nr_pages = hpage_nr_pages(page);
3786
3787 parent = parent_mem_cgroup(child);
3788
3789
3790
3791 if (!parent)
3792 parent = root_mem_cgroup;
3793
3794 if (nr_pages > 1) {
3795 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3796 flags = compound_lock_irqsave(page);
3797 }
3798
3799 ret = mem_cgroup_move_account(page, nr_pages,
3800 pc, child, parent);
3801 if (!ret)
3802 __mem_cgroup_cancel_local_charge(child, nr_pages);
3803
3804 if (nr_pages > 1)
3805 compound_unlock_irqrestore(page, flags);
3806 putback_lru_page(page);
3807put:
3808 put_page(page);
3809out:
3810 return ret;
3811}
3812
3813int mem_cgroup_charge_anon(struct page *page,
3814 struct mm_struct *mm, gfp_t gfp_mask)
3815{
3816 unsigned int nr_pages = 1;
3817 struct mem_cgroup *memcg;
3818 bool oom = true;
3819
3820 if (mem_cgroup_disabled())
3821 return 0;
3822
3823 VM_BUG_ON_PAGE(page_mapped(page), page);
3824 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3825 VM_BUG_ON(!mm);
3826
3827 if (PageTransHuge(page)) {
3828 nr_pages <<= compound_order(page);
3829 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3830
3831
3832
3833
3834 oom = false;
3835 }
3836
3837 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
3838 if (!memcg)
3839 return -ENOMEM;
3840 __mem_cgroup_commit_charge(memcg, page, nr_pages,
3841 MEM_CGROUP_CHARGE_TYPE_ANON, false);
3842 return 0;
3843}
3844
3845
3846
3847
3848
3849
3850
3851static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3852 struct page *page,
3853 gfp_t mask,
3854 struct mem_cgroup **memcgp)
3855{
3856 struct mem_cgroup *memcg = NULL;
3857 struct page_cgroup *pc;
3858 int ret;
3859
3860 pc = lookup_page_cgroup(page);
3861
3862
3863
3864
3865
3866
3867
3868 if (PageCgroupUsed(pc))
3869 goto out;
3870 if (do_swap_account)
3871 memcg = try_get_mem_cgroup_from_page(page);
3872 if (!memcg)
3873 memcg = get_mem_cgroup_from_mm(mm);
3874 ret = mem_cgroup_try_charge(memcg, mask, 1, true);
3875 css_put(&memcg->css);
3876 if (ret == -EINTR)
3877 memcg = root_mem_cgroup;
3878 else if (ret)
3879 return ret;
3880out:
3881 *memcgp = memcg;
3882 return 0;
3883}
3884
3885int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3886 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3887{
3888 if (mem_cgroup_disabled()) {
3889 *memcgp = NULL;
3890 return 0;
3891 }
3892
3893
3894
3895
3896
3897
3898 if (!PageSwapCache(page)) {
3899 struct mem_cgroup *memcg;
3900
3901 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3902 if (!memcg)
3903 return -ENOMEM;
3904 *memcgp = memcg;
3905 return 0;
3906 }
3907 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3908}
3909
3910void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3911{
3912 if (mem_cgroup_disabled())
3913 return;
3914 if (!memcg)
3915 return;
3916 __mem_cgroup_cancel_charge(memcg, 1);
3917}
3918
3919static void
3920__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
3921 enum charge_type ctype)
3922{
3923 if (mem_cgroup_disabled())
3924 return;
3925 if (!memcg)
3926 return;
3927
3928 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
3929
3930
3931
3932
3933
3934
3935
3936 if (do_swap_account && PageSwapCache(page)) {
3937 swp_entry_t ent = {.val = page_private(page)};
3938 mem_cgroup_uncharge_swap(ent);
3939 }
3940}
3941
3942void mem_cgroup_commit_charge_swapin(struct page *page,
3943 struct mem_cgroup *memcg)
3944{
3945 __mem_cgroup_commit_charge_swapin(page, memcg,
3946 MEM_CGROUP_CHARGE_TYPE_ANON);
3947}
3948
3949int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3950 gfp_t gfp_mask)
3951{
3952 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3953 struct mem_cgroup *memcg;
3954 int ret;
3955
3956 if (mem_cgroup_disabled())
3957 return 0;
3958 if (PageCompound(page))
3959 return 0;
3960
3961 if (PageSwapCache(page)) {
3962 ret = __mem_cgroup_try_charge_swapin(mm, page,
3963 gfp_mask, &memcg);
3964 if (ret)
3965 return ret;
3966 __mem_cgroup_commit_charge_swapin(page, memcg, type);
3967 return 0;
3968 }
3969
3970 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3971 if (!memcg)
3972 return -ENOMEM;
3973 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3974 return 0;
3975}
3976
3977static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3978 unsigned int nr_pages,
3979 const enum charge_type ctype)
3980{
3981 struct memcg_batch_info *batch = NULL;
3982 bool uncharge_memsw = true;
3983
3984
3985 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3986 uncharge_memsw = false;
3987
3988 batch = &current->memcg_batch;
3989
3990
3991
3992
3993
3994 if (!batch->memcg)
3995 batch->memcg = memcg;
3996
3997
3998
3999
4000
4001
4002
4003
4004 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4005 goto direct_uncharge;
4006
4007 if (nr_pages > 1)
4008 goto direct_uncharge;
4009
4010
4011
4012
4013
4014
4015 if (batch->memcg != memcg)
4016 goto direct_uncharge;
4017
4018 batch->nr_pages++;
4019 if (uncharge_memsw)
4020 batch->memsw_nr_pages++;
4021 return;
4022direct_uncharge:
4023 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4024 if (uncharge_memsw)
4025 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4026 if (unlikely(batch->memcg != memcg))
4027 memcg_oom_recover(memcg);
4028}
4029
4030
4031
4032
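/*
 * Common uncharge path.  Clears the page_cgroup's Used bit and returns
 * the memcg that was charged, or NULL when nothing had to be done.
 */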
4033static struct mem_cgroup *
4034__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4035 bool end_migration)
4036{
4037 struct mem_cgroup *memcg = NULL;
4038 unsigned int nr_pages = 1;
4039 struct page_cgroup *pc;
4040 bool anon;
4041
4042 if (mem_cgroup_disabled())
4043 return NULL;
4044
4045 if (PageTransHuge(page)) {
4046 nr_pages <<= compound_order(page);
4047 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4048 }
4049
4050
4051
4052 pc = lookup_page_cgroup(page);
4053 if (unlikely(!PageCgroupUsed(pc)))
4054 return NULL;
4055
4056 lock_page_cgroup(pc);
4057
4058 memcg = pc->mem_cgroup;
4059
4060 if (!PageCgroupUsed(pc))
4061 goto unlock_out;
4062
4063 anon = PageAnon(page);
4064
4065 switch (ctype) {
4066 case MEM_CGROUP_CHARGE_TYPE_ANON:
4067
4068
4069
4070
4071
4072 anon = true;
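		/* fall through */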
4073
4074 case MEM_CGROUP_CHARGE_TYPE_DROP:
4075
4076 if (page_mapped(page))
4077 goto unlock_out;
4078
4079
4080
4081
4082
4083
4084
4085 if (!end_migration && PageCgroupMigration(pc))
4086 goto unlock_out;
4087 break;
4088 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4089 if (!PageAnon(page)) {
4090 if (page->mapping && !page_is_file_cache(page))
4091 goto unlock_out;
4092 } else if (page_mapped(page))
4093 goto unlock_out;
4094 break;
4095 default:
4096 break;
4097 }
4098
4099 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4100
4101 ClearPageCgroupUsed(pc);
4102
4103
4104
4105
4106
4107
4108
4109 unlock_page_cgroup(pc);
4110
4111
4112
4113
4114 memcg_check_events(memcg, page);
4115 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4116 mem_cgroup_swap_statistics(memcg, true);
4117 css_get(&memcg->css);
4118 }
4119
4120
4121
4122
4123
4124 if (!end_migration && !mem_cgroup_is_root(memcg))
4125 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4126
4127 return memcg;
4128
4129unlock_out:
4130 unlock_page_cgroup(pc);
4131 return NULL;
4132}
4133
4134void mem_cgroup_uncharge_page(struct page *page)
4135{
4136
4137 if (page_mapped(page))
4138 return;
4139 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152 if (PageSwapCache(page))
4153 return;
4154 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4155}
4156
4157void mem_cgroup_uncharge_cache_page(struct page *page)
4158{
4159 VM_BUG_ON_PAGE(page_mapped(page), page);
4160 VM_BUG_ON_PAGE(page->mapping, page);
4161 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4162}
4163
4164
4165
4166
4167
4168
4169
4170
4171
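/*
 * Batched uncharge.  Between mem_cgroup_uncharge_start() and
 * mem_cgroup_uncharge_end(), single-page uncharges against the same
 * memcg are accumulated in current->memcg_batch, and the res counters
 * are only touched once at the end.
 */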
4172void mem_cgroup_uncharge_start(void)
4173{
4174 current->memcg_batch.do_batch++;
4175
4176 if (current->memcg_batch.do_batch == 1) {
4177 current->memcg_batch.memcg = NULL;
4178 current->memcg_batch.nr_pages = 0;
4179 current->memcg_batch.memsw_nr_pages = 0;
4180 }
4181}
4182
4183void mem_cgroup_uncharge_end(void)
4184{
4185 struct memcg_batch_info *batch = &current->memcg_batch;
4186
4187 if (!batch->do_batch)
4188 return;
4189
4190 batch->do_batch--;
4191 if (batch->do_batch)
4192 return;
4193
4194 if (!batch->memcg)
4195 return;
4196
4197
4198
4199
4200 if (batch->nr_pages)
4201 res_counter_uncharge(&batch->memcg->res,
4202 batch->nr_pages * PAGE_SIZE);
4203 if (batch->memsw_nr_pages)
4204 res_counter_uncharge(&batch->memcg->memsw,
4205 batch->memsw_nr_pages * PAGE_SIZE);
4206 memcg_oom_recover(batch->memcg);
4207
4208 batch->memcg = NULL;
4209}
4210
4211#ifdef CONFIG_SWAP
4212
4213
4214
4215
4216void
4217mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4218{
4219 struct mem_cgroup *memcg;
4220 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4221
4222 if (!swapout)
4223 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4224
4225 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4226
4227
4228
4229
4230
4231 if (do_swap_account && swapout && memcg)
4232 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4233}
4234#endif
4235
4236#ifdef CONFIG_MEMCG_SWAP
4237
4238
4239
4240
4241void mem_cgroup_uncharge_swap(swp_entry_t ent)
4242{
4243 struct mem_cgroup *memcg;
4244 unsigned short id;
4245
4246 if (!do_swap_account)
4247 return;
4248
4249 id = swap_cgroup_record(ent, 0);
4250 rcu_read_lock();
4251 memcg = mem_cgroup_lookup(id);
4252 if (memcg) {
4253
4254
4255
4256
4257 if (!mem_cgroup_is_root(memcg))
4258 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4259 mem_cgroup_swap_statistics(memcg, false);
4260 css_put(&memcg->css);
4261 }
4262 rcu_read_unlock();
4263}
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
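/*
 * Rewrite the swap_cgroup record for @entry from @from's id to @to's id.
 * On success the swap statistics are updated and a css reference is
 * taken on @to; dropping the corresponding reference on @from is left
 * to the caller.
 */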
4279static int mem_cgroup_move_swap_account(swp_entry_t entry,
4280 struct mem_cgroup *from, struct mem_cgroup *to)
4281{
4282 unsigned short old_id, new_id;
4283
4284 old_id = mem_cgroup_id(from);
4285 new_id = mem_cgroup_id(to);
4286
4287 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4288 mem_cgroup_swap_statistics(from, false);
4289 mem_cgroup_swap_statistics(to, true);
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301 css_get(&to->css);
4302 return 0;
4303 }
4304 return -EINVAL;
4305}
4306#else
4307static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4308 struct mem_cgroup *from, struct mem_cgroup *to)
4309{
4310 return -EINVAL;
4311}
4312#endif
4313
4314
4315
4316
4317
4318void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4319 struct mem_cgroup **memcgp)
4320{
4321 struct mem_cgroup *memcg = NULL;
4322 unsigned int nr_pages = 1;
4323 struct page_cgroup *pc;
4324 enum charge_type ctype;
4325
4326 *memcgp = NULL;
4327
4328 if (mem_cgroup_disabled())
4329 return;
4330
4331 if (PageTransHuge(page))
4332 nr_pages <<= compound_order(page);
4333
4334 pc = lookup_page_cgroup(page);
4335 lock_page_cgroup(pc);
4336 if (PageCgroupUsed(pc)) {
4337 memcg = pc->mem_cgroup;
4338 css_get(&memcg->css);
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368 if (PageAnon(page))
4369 SetPageCgroupMigration(pc);
4370 }
4371 unlock_page_cgroup(pc);
4372
4373
4374
4375
4376 if (!memcg)
4377 return;
4378
4379 *memcgp = memcg;
4380
4381
4382
4383
4384
4385
4386 if (PageAnon(page))
4387 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4388 else
4389 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4390
4391
4392
4393
4394
4395 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4396}
4397
4398
4399void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4400 struct page *oldpage, struct page *newpage, bool migration_ok)
4401{
4402 struct page *used, *unused;
4403 struct page_cgroup *pc;
4404 bool anon;
4405
4406 if (!memcg)
4407 return;
4408
4409 if (!migration_ok) {
4410 used = oldpage;
4411 unused = newpage;
4412 } else {
4413 used = newpage;
4414 unused = oldpage;
4415 }
4416 anon = PageAnon(used);
4417 __mem_cgroup_uncharge_common(unused,
4418 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4419 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4420 true);
4421 css_put(&memcg->css);
4422
4423
4424
4425
4426
4427 pc = lookup_page_cgroup(oldpage);
4428 lock_page_cgroup(pc);
4429 ClearPageCgroupMigration(pc);
4430 unlock_page_cgroup(pc);
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440 if (anon)
4441 mem_cgroup_uncharge_page(used);
4442}
4443
4444
4445
4446
4447
4448
4449void mem_cgroup_replace_page_cache(struct page *oldpage,
4450 struct page *newpage)
4451{
4452 struct mem_cgroup *memcg = NULL;
4453 struct page_cgroup *pc;
4454 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4455
4456 if (mem_cgroup_disabled())
4457 return;
4458
4459 pc = lookup_page_cgroup(oldpage);
4460
4461 lock_page_cgroup(pc);
4462 if (PageCgroupUsed(pc)) {
4463 memcg = pc->mem_cgroup;
4464 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4465 ClearPageCgroupUsed(pc);
4466 }
4467 unlock_page_cgroup(pc);
4468
4469
4470
4471
4472
4473 if (!memcg)
4474 return;
4475
4476
4477
4478
4479
4480 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4481}
4482
4483#ifdef CONFIG_DEBUG_VM
4484static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4485{
4486 struct page_cgroup *pc;
4487
4488 pc = lookup_page_cgroup(page);
4489
4490
4491
4492
4493
4494 if (likely(pc) && PageCgroupUsed(pc))
4495 return pc;
4496 return NULL;
4497}
4498
4499bool mem_cgroup_bad_page_check(struct page *page)
4500{
4501 if (mem_cgroup_disabled())
4502 return false;
4503
4504 return lookup_page_cgroup_used(page) != NULL;
4505}
4506
4507void mem_cgroup_print_bad_page(struct page *page)
4508{
4509 struct page_cgroup *pc;
4510
4511 pc = lookup_page_cgroup_used(page);
4512 if (pc) {
4513 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4514 pc, pc->flags, pc->mem_cgroup);
4515 }
4516}
4517#endif
4518
4519static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4520 unsigned long long val)
4521{
4522 int retry_count;
4523 u64 memswlimit, memlimit;
4524 int ret = 0;
4525 int children = mem_cgroup_count_children(memcg);
4526 u64 curusage, oldusage;
4527 int enlarge;
4528
4529
4530
4531
4532
4533
4534 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4535
4536 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4537
4538 enlarge = 0;
4539 while (retry_count) {
4540 if (signal_pending(current)) {
4541 ret = -EINTR;
4542 break;
4543 }
4544
4545
4546
4547
4548
4549 mutex_lock(&set_limit_mutex);
4550 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4551 if (memswlimit < val) {
4552 ret = -EINVAL;
4553 mutex_unlock(&set_limit_mutex);
4554 break;
4555 }
4556
4557 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4558 if (memlimit < val)
4559 enlarge = 1;
4560
4561 ret = res_counter_set_limit(&memcg->res, val);
4562 if (!ret) {
4563 if (memswlimit == val)
4564 memcg->memsw_is_minimum = true;
4565 else
4566 memcg->memsw_is_minimum = false;
4567 }
4568 mutex_unlock(&set_limit_mutex);
4569
4570 if (!ret)
4571 break;
4572
4573 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4574 MEM_CGROUP_RECLAIM_SHRINK);
4575 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4576
4577 if (curusage >= oldusage)
4578 retry_count--;
4579 else
4580 oldusage = curusage;
4581 }
4582 if (!ret && enlarge)
4583 memcg_oom_recover(memcg);
4584
4585 return ret;
4586}
4587
4588static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4589 unsigned long long val)
4590{
4591 int retry_count;
4592 u64 memlimit, memswlimit, oldusage, curusage;
4593 int children = mem_cgroup_count_children(memcg);
4594 int ret = -EBUSY;
4595 int enlarge = 0;
4596
4597
4598 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4599 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4600 while (retry_count) {
4601 if (signal_pending(current)) {
4602 ret = -EINTR;
4603 break;
4604 }
4605
4606
4607
4608
4609
4610 mutex_lock(&set_limit_mutex);
4611 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4612 if (memlimit > val) {
4613 ret = -EINVAL;
4614 mutex_unlock(&set_limit_mutex);
4615 break;
4616 }
4617 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4618 if (memswlimit < val)
4619 enlarge = 1;
4620 ret = res_counter_set_limit(&memcg->memsw, val);
4621 if (!ret) {
4622 if (memlimit == val)
4623 memcg->memsw_is_minimum = true;
4624 else
4625 memcg->memsw_is_minimum = false;
4626 }
4627 mutex_unlock(&set_limit_mutex);
4628
4629 if (!ret)
4630 break;
4631
4632 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4633 MEM_CGROUP_RECLAIM_NOSWAP |
4634 MEM_CGROUP_RECLAIM_SHRINK);
4635 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4636
4637 if (curusage >= oldusage)
4638 retry_count--;
4639 else
4640 oldusage = curusage;
4641 }
4642 if (!ret && enlarge)
4643 memcg_oom_recover(memcg);
4644 return ret;
4645}
4646
4647unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4648 gfp_t gfp_mask,
4649 unsigned long *total_scanned)
4650{
4651 unsigned long nr_reclaimed = 0;
4652 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4653 unsigned long reclaimed;
4654 int loop = 0;
4655 struct mem_cgroup_tree_per_zone *mctz;
4656 unsigned long long excess;
4657 unsigned long nr_scanned;
4658
4659 if (order > 0)
4660 return 0;
4661
4662 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4663
4664
4665
4666
4667
4668 do {
4669 if (next_mz)
4670 mz = next_mz;
4671 else
4672 mz = mem_cgroup_largest_soft_limit_node(mctz);
4673 if (!mz)
4674 break;
4675
4676 nr_scanned = 0;
4677 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4678 gfp_mask, &nr_scanned);
4679 nr_reclaimed += reclaimed;
4680 *total_scanned += nr_scanned;
4681 spin_lock(&mctz->lock);
4682
4683
4684
4685
4686
4687 next_mz = NULL;
4688 if (!reclaimed) {
4689 do {
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701 next_mz =
4702 __mem_cgroup_largest_soft_limit_node(mctz);
4703 if (next_mz == mz)
4704 css_put(&next_mz->memcg->css);
4705 else
4706 break;
4707 } while (1);
4708 }
4709 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4710 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4721 spin_unlock(&mctz->lock);
4722 css_put(&mz->memcg->css);
4723 loop++;
4724
4725
4726
4727
4728
4729 if (!nr_reclaimed &&
4730 (next_mz == NULL ||
4731 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4732 break;
4733 } while (!nr_reclaimed);
4734 if (next_mz)
4735 css_put(&next_mz->memcg->css);
4736 return nr_reclaimed;
4737}
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
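/*
 * Walk one LRU list of a zone and try to move every page on it to the
 * parent memcg.  Pages that cannot be moved right now are remembered in
 * @busy and rotated to the other end of the list so the loop can still
 * make progress.
 */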
4750static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4751 int node, int zid, enum lru_list lru)
4752{
4753 struct lruvec *lruvec;
4754 unsigned long flags;
4755 struct list_head *list;
4756 struct page *busy;
4757 struct zone *zone;
4758
4759 zone = &NODE_DATA(node)->node_zones[zid];
4760 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4761 list = &lruvec->lists[lru];
4762
4763 busy = NULL;
4764 do {
4765 struct page_cgroup *pc;
4766 struct page *page;
4767
4768 spin_lock_irqsave(&zone->lru_lock, flags);
4769 if (list_empty(list)) {
4770 spin_unlock_irqrestore(&zone->lru_lock, flags);
4771 break;
4772 }
4773 page = list_entry(list->prev, struct page, lru);
4774 if (busy == page) {
4775 list_move(&page->lru, list);
4776 busy = NULL;
4777 spin_unlock_irqrestore(&zone->lru_lock, flags);
4778 continue;
4779 }
4780 spin_unlock_irqrestore(&zone->lru_lock, flags);
4781
4782 pc = lookup_page_cgroup(page);
4783
4784 if (mem_cgroup_move_parent(page, pc, memcg)) {
4785
4786 busy = page;
4787 cond_resched();
4788 } else
4789 busy = NULL;
4790 } while (!list_empty(list));
4791}
4792
4793
4794
4795
4796
4797
4798
4799
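/*
 * Reparent all charges of @memcg by moving every page on its LRU lists
 * to the parent, looping until the non-kmem usage drops to zero.
 */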
4800static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4801{
4802 int node, zid;
4803 u64 usage;
4804
4805 do {
4806
4807 lru_add_drain_all();
4808 drain_all_stock_sync(memcg);
4809 mem_cgroup_start_move(memcg);
4810 for_each_node_state(node, N_MEMORY) {
4811 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4812 enum lru_list lru;
4813 for_each_lru(lru) {
4814 mem_cgroup_force_empty_list(memcg,
4815 node, zid, lru);
4816 }
4817 }
4818 }
4819 mem_cgroup_end_move(memcg);
4820 memcg_oom_recover(memcg);
4821 cond_resched();
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4836 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4837 } while (usage > 0);
4838}
4839
4840static inline bool memcg_has_children(struct mem_cgroup *memcg)
4841{
4842 lockdep_assert_held(&memcg_create_mutex);
4843
4844
4845
4846
4847
4848
4849
4850 return memcg->use_hierarchy &&
4851 !list_empty(&memcg->css.cgroup->children);
4852}
4853
4854
4855
4856
4857
4858
4859
4860static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4861{
4862 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4863 struct cgroup *cgrp = memcg->css.cgroup;
4864
4865
4866 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
4867 return -EBUSY;
4868
4869
4870 lru_add_drain_all();
4871
4872 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4873 int progress;
4874
4875 if (signal_pending(current))
4876 return -EINTR;
4877
4878 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4879 false);
4880 if (!progress) {
4881 nr_retries--;
4882
4883 congestion_wait(BLK_RW_ASYNC, HZ/10);
4884 }
4885
4886 }
4887 lru_add_drain();
4888 mem_cgroup_reparent_charges(memcg);
4889
4890 return 0;
4891}
4892
4893static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4894 unsigned int event)
4895{
4896 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4897
4898 if (mem_cgroup_is_root(memcg))
4899 return -EINVAL;
4900 return mem_cgroup_force_empty(memcg);
4901}
4902
4903static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
4904 struct cftype *cft)
4905{
4906 return mem_cgroup_from_css(css)->use_hierarchy;
4907}
4908
4909static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4910 struct cftype *cft, u64 val)
4911{
4912 int retval = 0;
4913 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4914 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
4915
4916 mutex_lock(&memcg_create_mutex);
4917
4918 if (memcg->use_hierarchy == val)
4919 goto out;
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4930 (val == 1 || val == 0)) {
4931 if (list_empty(&memcg->css.cgroup->children))
4932 memcg->use_hierarchy = val;
4933 else
4934 retval = -EBUSY;
4935 } else
4936 retval = -EINVAL;
4937
4938out:
4939 mutex_unlock(&memcg_create_mutex);
4940
4941 return retval;
4942}
4943
4944
4945static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4946 enum mem_cgroup_stat_index idx)
4947{
4948 struct mem_cgroup *iter;
4949 long val = 0;
4950
4951
4952 for_each_mem_cgroup_tree(iter, memcg)
4953 val += mem_cgroup_read_stat(iter, idx);
4954
4955 if (val < 0)
4956 val = 0;
4957 return val;
4958}
4959
4960static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4961{
4962 u64 val;
4963
4964 if (!mem_cgroup_is_root(memcg)) {
4965 if (!swap)
4966 return res_counter_read_u64(&memcg->res, RES_USAGE);
4967 else
4968 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4969 }
4970
4971
4972
4973
4974
4975 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4976 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4977
4978 if (swap)
4979 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4980
4981 return val << PAGE_SHIFT;
4982}
4983
4984static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4985 struct cftype *cft)
4986{
4987 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4988 u64 val;
4989 int name;
4990 enum res_type type;
4991
4992 type = MEMFILE_TYPE(cft->private);
4993 name = MEMFILE_ATTR(cft->private);
4994
4995 switch (type) {
4996 case _MEM:
4997 if (name == RES_USAGE)
4998 val = mem_cgroup_usage(memcg, false);
4999 else
5000 val = res_counter_read_u64(&memcg->res, name);
5001 break;
5002 case _MEMSWAP:
5003 if (name == RES_USAGE)
5004 val = mem_cgroup_usage(memcg, true);
5005 else
5006 val = res_counter_read_u64(&memcg->memsw, name);
5007 break;
5008 case _KMEM:
5009 val = res_counter_read_u64(&memcg->kmem, name);
5010 break;
5011 default:
5012 BUG();
5013 }
5014
5015 return val;
5016}
5017
5018#ifdef CONFIG_MEMCG_KMEM
5019
5020static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5021 unsigned long long limit)
5022{
5023 int err = 0;
5024 int memcg_id;
5025
5026 if (memcg_kmem_is_active(memcg))
5027 return 0;
5028
5029
5030
5031
5032
5033 memcg_stop_kmem_account();
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047 mutex_lock(&memcg_create_mutex);
5048 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
5049 err = -EBUSY;
5050 mutex_unlock(&memcg_create_mutex);
5051 if (err)
5052 goto out;
5053
5054 memcg_id = ida_simple_get(&kmem_limited_groups,
5055 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5056 if (memcg_id < 0) {
5057 err = memcg_id;
5058 goto out;
5059 }
5060
5061
5062
5063
5064
5065 err = memcg_update_all_caches(memcg_id + 1);
5066 if (err)
5067 goto out_rmid;
5068
5069 memcg->kmemcg_id = memcg_id;
5070 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5071 mutex_init(&memcg->slab_caches_mutex);
5072
5073
5074
5075
5076
5077 err = res_counter_set_limit(&memcg->kmem, limit);
5078 VM_BUG_ON(err);
5079
5080 static_key_slow_inc(&memcg_kmem_enabled_key);
5081
5082
5083
5084
5085
5086 memcg_kmem_set_active(memcg);
5087out:
5088 memcg_resume_kmem_account();
5089 return err;
5090
5091out_rmid:
5092 ida_simple_remove(&kmem_limited_groups, memcg_id);
5093 goto out;
5094}
5095
5096static int memcg_activate_kmem(struct mem_cgroup *memcg,
5097 unsigned long long limit)
5098{
5099 int ret;
5100
5101 mutex_lock(&activate_kmem_mutex);
5102 ret = __memcg_activate_kmem(memcg, limit);
5103 mutex_unlock(&activate_kmem_mutex);
5104 return ret;
5105}
5106
5107static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5108 unsigned long long val)
5109{
5110 int ret;
5111
5112 if (!memcg_kmem_is_active(memcg))
5113 ret = memcg_activate_kmem(memcg, val);
5114 else
5115 ret = res_counter_set_limit(&memcg->kmem, val);
5116 return ret;
5117}
5118
5119static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5120{
5121 int ret = 0;
5122 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5123
5124 if (!parent)
5125 return 0;
5126
5127 mutex_lock(&activate_kmem_mutex);
5128
5129
5130
5131
5132 if (memcg_kmem_is_active(parent))
5133 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5134 mutex_unlock(&activate_kmem_mutex);
5135 return ret;
5136}
5137#else
5138static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5139 unsigned long long val)
5140{
5141 return -EINVAL;
5142}
5143#endif
5144
5145
5146
5147
5148
5149static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5150 char *buffer)
5151{
5152 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5153 enum res_type type;
5154 int name;
5155 unsigned long long val;
5156 int ret;
5157
5158 type = MEMFILE_TYPE(cft->private);
5159 name = MEMFILE_ATTR(cft->private);
5160
5161 switch (name) {
5162 case RES_LIMIT:
5163 if (mem_cgroup_is_root(memcg)) {
5164 ret = -EINVAL;
5165 break;
5166 }
5167
5168 ret = res_counter_memparse_write_strategy(buffer, &val);
5169 if (ret)
5170 break;
5171 if (type == _MEM)
5172 ret = mem_cgroup_resize_limit(memcg, val);
5173 else if (type == _MEMSWAP)
5174 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5175 else if (type == _KMEM)
5176 ret = memcg_update_kmem_limit(memcg, val);
5177 else
5178 return -EINVAL;
5179 break;
5180 case RES_SOFT_LIMIT:
5181 ret = res_counter_memparse_write_strategy(buffer, &val);
5182 if (ret)
5183 break;
5184
5185
5186
5187
5188
5189 if (type == _MEM)
5190 ret = res_counter_set_soft_limit(&memcg->res, val);
5191 else
5192 ret = -EINVAL;
5193 break;
5194 default:
5195 ret = -EINVAL;
5196 break;
5197 }
5198 return ret;
5199}
5200
5201static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5202 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5203{
5204 unsigned long long min_limit, min_memsw_limit, tmp;
5205
5206 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5207 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5208 if (!memcg->use_hierarchy)
5209 goto out;
5210
5211 while (css_parent(&memcg->css)) {
5212 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5213 if (!memcg->use_hierarchy)
5214 break;
5215 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5216 min_limit = min(min_limit, tmp);
5217 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5218 min_memsw_limit = min(min_memsw_limit, tmp);
5219 }
5220out:
5221 *mem_limit = min_limit;
5222 *memsw_limit = min_memsw_limit;
5223}
5224
5225static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5226{
5227 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5228 int name;
5229 enum res_type type;
5230
5231 type = MEMFILE_TYPE(event);
5232 name = MEMFILE_ATTR(event);
5233
5234 switch (name) {
5235 case RES_MAX_USAGE:
5236 if (type == _MEM)
5237 res_counter_reset_max(&memcg->res);
5238 else if (type == _MEMSWAP)
5239 res_counter_reset_max(&memcg->memsw);
5240 else if (type == _KMEM)
5241 res_counter_reset_max(&memcg->kmem);
5242 else
5243 return -EINVAL;
5244 break;
5245 case RES_FAILCNT:
5246 if (type == _MEM)
5247 res_counter_reset_failcnt(&memcg->res);
5248 else if (type == _MEMSWAP)
5249 res_counter_reset_failcnt(&memcg->memsw);
5250 else if (type == _KMEM)
5251 res_counter_reset_failcnt(&memcg->kmem);
5252 else
5253 return -EINVAL;
5254 break;
5255 }
5256
5257 return 0;
5258}
5259
5260static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5261 struct cftype *cft)
5262{
5263 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5264}
5265
5266#ifdef CONFIG_MMU
5267static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5268 struct cftype *cft, u64 val)
5269{
5270 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5271
5272 if (val >= (1 << NR_MOVE_TYPE))
5273 return -EINVAL;
5274
5275
5276
5277
5278
5279
5280
5281 memcg->move_charge_at_immigrate = val;
5282 return 0;
5283}
5284#else
5285static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5286 struct cftype *cft, u64 val)
5287{
5288 return -ENOSYS;
5289}
5290#endif
5291
5292#ifdef CONFIG_NUMA
5293static int memcg_numa_stat_show(struct seq_file *m, void *v)
5294{
5295 struct numa_stat {
5296 const char *name;
5297 unsigned int lru_mask;
5298 };
5299
5300 static const struct numa_stat stats[] = {
5301 { "total", LRU_ALL },
5302 { "file", LRU_ALL_FILE },
5303 { "anon", LRU_ALL_ANON },
5304 { "unevictable", BIT(LRU_UNEVICTABLE) },
5305 };
5306 const struct numa_stat *stat;
5307 int nid;
5308 unsigned long nr;
5309 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5310
5311 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5312 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
5313 seq_printf(m, "%s=%lu", stat->name, nr);
5314 for_each_node_state(nid, N_MEMORY) {
5315 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5316 stat->lru_mask);
5317 seq_printf(m, " N%d=%lu", nid, nr);
5318 }
5319 seq_putc(m, '\n');
5320 }
5321
5322 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5323 struct mem_cgroup *iter;
5324
5325 nr = 0;
5326 for_each_mem_cgroup_tree(iter, memcg)
5327 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
5328 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
5329 for_each_node_state(nid, N_MEMORY) {
5330 nr = 0;
5331 for_each_mem_cgroup_tree(iter, memcg)
5332 nr += mem_cgroup_node_nr_lru_pages(
5333 iter, nid, stat->lru_mask);
5334 seq_printf(m, " N%d=%lu", nid, nr);
5335 }
5336 seq_putc(m, '\n');
5337 }
5338
5339 return 0;
5340}
5341#endif
5342
5343static inline void mem_cgroup_lru_names_not_uptodate(void)
5344{
5345 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5346}
5347
5348static int memcg_stat_show(struct seq_file *m, void *v)
5349{
5350 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5351 struct mem_cgroup *mi;
5352 unsigned int i;
5353
5354 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5355 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5356 continue;
5357 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5358 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5359 }
5360
5361 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5362 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5363 mem_cgroup_read_events(memcg, i));
5364
5365 for (i = 0; i < NR_LRU_LISTS; i++)
5366 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5367 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5368
5369
5370 {
5371 unsigned long long limit, memsw_limit;
5372 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5373 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5374 if (do_swap_account)
5375 seq_printf(m, "hierarchical_memsw_limit %llu\n",
5376 memsw_limit);
5377 }
5378
5379 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5380 long long val = 0;
5381
5382 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5383 continue;
5384 for_each_mem_cgroup_tree(mi, memcg)
5385 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5386 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5387 }
5388
5389 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5390 unsigned long long val = 0;
5391
5392 for_each_mem_cgroup_tree(mi, memcg)
5393 val += mem_cgroup_read_events(mi, i);
5394 seq_printf(m, "total_%s %llu\n",
5395 mem_cgroup_events_names[i], val);
5396 }
5397
5398 for (i = 0; i < NR_LRU_LISTS; i++) {
5399 unsigned long long val = 0;
5400
5401 for_each_mem_cgroup_tree(mi, memcg)
5402 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5403 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5404 }
5405
5406#ifdef CONFIG_DEBUG_VM
5407 {
5408 int nid, zid;
5409 struct mem_cgroup_per_zone *mz;
5410 struct zone_reclaim_stat *rstat;
5411 unsigned long recent_rotated[2] = {0, 0};
5412 unsigned long recent_scanned[2] = {0, 0};
5413
5414 for_each_online_node(nid)
5415 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5416 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5417 rstat = &mz->lruvec.reclaim_stat;
5418
5419 recent_rotated[0] += rstat->recent_rotated[0];
5420 recent_rotated[1] += rstat->recent_rotated[1];
5421 recent_scanned[0] += rstat->recent_scanned[0];
5422 recent_scanned[1] += rstat->recent_scanned[1];
5423 }
5424 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5425 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5426 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5427 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5428 }
5429#endif
5430
5431 return 0;
5432}
5433
5434static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5435 struct cftype *cft)
5436{
5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5438
5439 return mem_cgroup_swappiness(memcg);
5440}
5441
5442static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5443 struct cftype *cft, u64 val)
5444{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5446 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5447
5448 if (val > 100 || !parent)
5449 return -EINVAL;
5450
5451 mutex_lock(&memcg_create_mutex);
5452
5453
5454 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5455 mutex_unlock(&memcg_create_mutex);
5456 return -EINVAL;
5457 }
5458
5459 memcg->swappiness = val;
5460
5461 mutex_unlock(&memcg_create_mutex);
5462
5463 return 0;
5464}
5465
5466static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5467{
5468 struct mem_cgroup_threshold_ary *t;
5469 u64 usage;
5470 int i;
5471
5472 rcu_read_lock();
5473 if (!swap)
5474 t = rcu_dereference(memcg->thresholds.primary);
5475 else
5476 t = rcu_dereference(memcg->memsw_thresholds.primary);
5477
5478 if (!t)
5479 goto unlock;
5480
5481 usage = mem_cgroup_usage(memcg, swap);
5482
5483
5484
5485
5486
5487
5488 i = t->current_threshold;
5489
5490
5491
5492
5493
5494
5495
5496 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5497 eventfd_signal(t->entries[i].eventfd, 1);
5498
5499
5500 i++;
5501
5502
5503
5504
5505
5506
5507
5508 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5509 eventfd_signal(t->entries[i].eventfd, 1);
5510
5511
5512 t->current_threshold = i - 1;
5513unlock:
5514 rcu_read_unlock();
5515}
5516
5517static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5518{
5519 while (memcg) {
5520 __mem_cgroup_threshold(memcg, false);
5521 if (do_swap_account)
5522 __mem_cgroup_threshold(memcg, true);
5523
5524 memcg = parent_mem_cgroup(memcg);
5525 }
5526}
5527
5528static int compare_thresholds(const void *a, const void *b)
5529{
5530 const struct mem_cgroup_threshold *_a = a;
5531 const struct mem_cgroup_threshold *_b = b;
5532
5533 if (_a->threshold > _b->threshold)
5534 return 1;
5535
5536 if (_a->threshold < _b->threshold)
5537 return -1;
5538
5539 return 0;
5540}
5541
5542static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5543{
5544 struct mem_cgroup_eventfd_list *ev;
5545
5546 list_for_each_entry(ev, &memcg->oom_notify, list)
5547 eventfd_signal(ev->eventfd, 1);
5548 return 0;
5549}
5550
5551static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5552{
5553 struct mem_cgroup *iter;
5554
5555 for_each_mem_cgroup_tree(iter, memcg)
5556 mem_cgroup_oom_notify_cb(iter);
5557}
5558
5559static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5560 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5561{
5562 struct mem_cgroup_thresholds *thresholds;
5563 struct mem_cgroup_threshold_ary *new;
5564 u64 threshold, usage;
5565 int i, size, ret;
5566
5567 ret = res_counter_memparse_write_strategy(args, &threshold);
5568 if (ret)
5569 return ret;
5570
5571 mutex_lock(&memcg->thresholds_lock);
5572
5573 if (type == _MEM)
5574 thresholds = &memcg->thresholds;
5575 else if (type == _MEMSWAP)
5576 thresholds = &memcg->memsw_thresholds;
5577 else
5578 BUG();
5579
5580 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5581
5582
5583 if (thresholds->primary)
5584 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5585
5586 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5587
5588
5589 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5590 GFP_KERNEL);
5591 if (!new) {
5592 ret = -ENOMEM;
5593 goto unlock;
5594 }
5595 new->size = size;
5596
5597
5598 if (thresholds->primary) {
5599 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5600 sizeof(struct mem_cgroup_threshold));
5601 }
5602
5603
5604 new->entries[size - 1].eventfd = eventfd;
5605 new->entries[size - 1].threshold = threshold;
5606
5607
5608 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5609 compare_thresholds, NULL);
5610
5611
5612 new->current_threshold = -1;
5613 for (i = 0; i < size; i++) {
5614 if (new->entries[i].threshold <= usage) {
 /*
  * new->current_threshold is not visible to readers until
  * rcu_assign_pointer() below, so it is safe to adjust it here.
  */
5620 ++new->current_threshold;
5621 } else
5622 break;
5623 }
5624
5625
5626 kfree(thresholds->spare);
5627 thresholds->spare = thresholds->primary;
5628
5629 rcu_assign_pointer(thresholds->primary, new);
5630
 /* Make sure no reader is still using the old primary array. */
5632 synchronize_rcu();
5633
5634unlock:
5635 mutex_unlock(&memcg->thresholds_lock);
5636
5637 return ret;
5638}
5639
5640static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5641 struct eventfd_ctx *eventfd, const char *args)
5642{
5643 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5644}
5645
5646static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5647 struct eventfd_ctx *eventfd, const char *args)
5648{
5649 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5650}
5651
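/*
 * Unregister a usage threshold event.  The entries that remain are copied
 * into the spare buffer saved at registration time, so this path never
 * needs to allocate memory and therefore cannot fail.
 */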
5652static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5653 struct eventfd_ctx *eventfd, enum res_type type)
5654{
5655 struct mem_cgroup_thresholds *thresholds;
5656 struct mem_cgroup_threshold_ary *new;
5657 u64 usage;
5658 int i, j, size;
5659
5660 mutex_lock(&memcg->thresholds_lock);
5661 if (type == _MEM)
5662 thresholds = &memcg->thresholds;
5663 else if (type == _MEMSWAP)
5664 thresholds = &memcg->memsw_thresholds;
5665 else
5666 BUG();
5667
5668 if (!thresholds->primary)
5669 goto unlock;
5670
5671 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5672
5673
5674 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5675
5676
5677 size = 0;
5678 for (i = 0; i < thresholds->primary->size; i++) {
5679 if (thresholds->primary->entries[i].eventfd != eventfd)
5680 size++;
5681 }
5682
5683 new = thresholds->spare;
5684
5685
5686 if (!size) {
5687 kfree(new);
5688 new = NULL;
5689 goto swap_buffers;
5690 }
5691
5692 new->size = size;
5693
5694
5695 new->current_threshold = -1;
5696 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5697 if (thresholds->primary->entries[i].eventfd == eventfd)
5698 continue;
5699
5700 new->entries[j] = thresholds->primary->entries[i];
5701 if (new->entries[j].threshold <= usage) {
 /*
  * new->current_threshold is not visible to readers until
  * rcu_assign_pointer() below, so it is safe to adjust it here.
  */
5707 ++new->current_threshold;
5708 }
5709 j++;
5710 }
5711
5712swap_buffers:
 /* Swap primary and spare array. */
5714 thresholds->spare = thresholds->primary;
 /* If all events are unregistered, free the spare array as well. */
5716 if (!new) {
5717 kfree(thresholds->spare);
5718 thresholds->spare = NULL;
5719 }
5720
5721 rcu_assign_pointer(thresholds->primary, new);
5722
 /* Make sure no reader is still using the old primary array. */
5724 synchronize_rcu();
5725unlock:
5726 mutex_unlock(&memcg->thresholds_lock);
5727}
5728
5729static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5730 struct eventfd_ctx *eventfd)
5731{
5732 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5733}
5734
5735static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5736 struct eventfd_ctx *eventfd)
5737{
5738 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5739}
5740
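/*
 * OOM notification: eventfds are kept on memcg->oom_notify and signalled
 * from mem_cgroup_oom_notify() for the whole subtree.
 */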
5741static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5742 struct eventfd_ctx *eventfd, const char *args)
5743{
5744 struct mem_cgroup_eventfd_list *event;
5745
5746 event = kmalloc(sizeof(*event), GFP_KERNEL);
5747 if (!event)
5748 return -ENOMEM;
5749
5750 spin_lock(&memcg_oom_lock);
5751
5752 event->eventfd = eventfd;
5753 list_add(&event->list, &memcg->oom_notify);
5754
 /* If the memcg is already under OOM, notify the new listener right away. */
5756 if (atomic_read(&memcg->under_oom))
5757 eventfd_signal(eventfd, 1);
5758 spin_unlock(&memcg_oom_lock);
5759
5760 return 0;
5761}
5762
5763static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5764 struct eventfd_ctx *eventfd)
5765{
5766 struct mem_cgroup_eventfd_list *ev, *tmp;
5767
5768 spin_lock(&memcg_oom_lock);
5769
5770 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5771 if (ev->eventfd == eventfd) {
5772 list_del(&ev->list);
5773 kfree(ev);
5774 }
5775 }
5776
5777 spin_unlock(&memcg_oom_lock);
5778}
5779
5780static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5781{
5782 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5783
5784 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5785 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5786 return 0;
5787}
5788
5789static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5790 struct cftype *cft, u64 val)
5791{
5792 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5793 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5794
 /* oom_kill_disable must be 0 or 1 and cannot be set on the root cgroup. */
5796 if (!parent || !((val == 0) || (val == 1)))
5797 return -EINVAL;
5798
5799 mutex_lock(&memcg_create_mutex);
 /* Only allowed when not inside a hierarchy and without children. */
5801 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5802 mutex_unlock(&memcg_create_mutex);
5803 return -EINVAL;
5804 }
5805 memcg->oom_kill_disable = val;
5806 if (!val)
5807 memcg_oom_recover(memcg);
5808 mutex_unlock(&memcg_create_mutex);
5809 return 0;
5810}
5811
5812#ifdef CONFIG_MEMCG_KMEM
5813static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5814{
5815 int ret;
5816
5817 memcg->kmemcg_id = -1;
5818 ret = memcg_propagate_kmem(memcg);
5819 if (ret)
5820 return ret;
5821
5822 return mem_cgroup_sockets_init(memcg, ss);
5823}
5824
5825static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5826{
5827 mem_cgroup_sockets_destroy(memcg);
5828}
5829
5830static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5831{
5832 if (!memcg_kmem_is_active(memcg))
5833 return;
5834
 /*
  * Kmem charges can outlive the cgroup, so the memcg must not be freed
  * while kmem pages are still charged to it.  Take an extra css
  * reference before marking the memcg dead; the last kmem uncharge
  * (or this function, if kmem usage is already zero) drops it again
  * via memcg_kmem_test_and_clear_dead().
  */
5853 css_get(&memcg->css);
5854
5855 memcg_kmem_mark_dead(memcg);
5856
5857 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5858 return;
5859
5860 if (memcg_kmem_test_and_clear_dead(memcg))
5861 css_put(&memcg->css);
5862}
5863#else
5864static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5865{
5866 return 0;
5867}
5868
5869static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5870{
5871}
5872
5873static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5874{
5875}
5876#endif
5877
/*
 * "cgroup.event_control" implementation.  DO NOT USE IN NEW FILES.
 *
 * An event is tied to the eventfd and control file descriptors passed in by
 * userspace and is torn down either explicitly, when the eventfd is closed,
 * or implicitly on cgroup removal.
 */

/*
 * Unregister the event and release its resources.  Runs from a workqueue,
 * see memcg_event_wake() and mem_cgroup_css_offline().
 */
5896static void memcg_event_remove(struct work_struct *work)
5897{
5898 struct mem_cgroup_event *event =
5899 container_of(work, struct mem_cgroup_event, remove);
5900 struct mem_cgroup *memcg = event->memcg;
5901
5902 remove_wait_queue(event->wqh, &event->wait);
5903
5904 event->unregister_event(memcg, event->eventfd);
5905
 /* Notify userspace that the event is being removed. */
5907 eventfd_signal(event->eventfd, 1);
5908
5909 eventfd_ctx_put(event->eventfd);
5910 kfree(event);
5911 css_put(&memcg->css);
5912}
5913
/*
 * Called when the eventfd is POLLHUPed (closed by userspace).
 *
 * Runs with wqh->lock held and interrupts disabled.
 */
5919static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
5920 int sync, void *key)
5921{
5922 struct mem_cgroup_event *event =
5923 container_of(wait, struct mem_cgroup_event, wait);
5924 struct mem_cgroup *memcg = event->memcg;
5925 unsigned long flags = (unsigned long)key;
5926
5927 if (flags & POLLHUP) {
 /*
  * If the event was already detached by cgroup removal, the other
  * side will clean up for us.  There is no race with event freeing:
  * removal needs remove_wait_queue(), which takes wqh->lock, and we
  * hold that lock here.
  */
5937 spin_lock(&memcg->event_list_lock);
5938 if (!list_empty(&event->list)) {
5939 list_del_init(&event->list);
 /*
  * We are in atomic context, but the unregister callbacks may
  * sleep, so defer the teardown to a workqueue.
  */
5944 schedule_work(&event->remove);
5945 }
5946 spin_unlock(&memcg->event_list_lock);
5947 }
5948
5949 return 0;
5950}
5951
5952static void memcg_event_ptable_queue_proc(struct file *file,
5953 wait_queue_head_t *wqh, poll_table *pt)
5954{
5955 struct mem_cgroup_event *event =
5956 container_of(pt, struct mem_cgroup_event, pt);
5957
5958 event->wqh = wqh;
5959 add_wait_queue(wqh, &event->wait);
5960}
5961
/*
 * Parse input and register a new cgroup event handler.
 * DO NOT USE IN NEW FILES.
 *
 * Input must be in the format '<event_fd> <control_fd> <args>'.
 * Interpretation of @args is up to the control file implementation.
 */
5970static int memcg_write_event_control(struct cgroup_subsys_state *css,
5971 struct cftype *cft, char *buffer)
5972{
5973 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5974 struct mem_cgroup_event *event;
5975 struct cgroup_subsys_state *cfile_css;
5976 unsigned int efd, cfd;
5977 struct fd efile;
5978 struct fd cfile;
5979 const char *name;
5980 char *endp;
5981 int ret;
5982
5983 efd = simple_strtoul(buffer, &endp, 10);
5984 if (*endp != ' ')
5985 return -EINVAL;
5986 buffer = endp + 1;
5987
5988 cfd = simple_strtoul(buffer, &endp, 10);
5989 if ((*endp != ' ') && (*endp != '\0'))
5990 return -EINVAL;
5991 buffer = endp + 1;
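 /*
  * Whatever follows the two file descriptors is passed verbatim to the
  * control file's register_event() callback as its args string.
  */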
5992
5993 event = kzalloc(sizeof(*event), GFP_KERNEL);
5994 if (!event)
5995 return -ENOMEM;
5996
5997 event->memcg = memcg;
5998 INIT_LIST_HEAD(&event->list);
5999 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6000 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6001 INIT_WORK(&event->remove, memcg_event_remove);
6002
6003 efile = fdget(efd);
6004 if (!efile.file) {
6005 ret = -EBADF;
6006 goto out_kfree;
6007 }
6008
6009 event->eventfd = eventfd_ctx_fileget(efile.file);
6010 if (IS_ERR(event->eventfd)) {
6011 ret = PTR_ERR(event->eventfd);
6012 goto out_put_efile;
6013 }
6014
6015 cfile = fdget(cfd);
6016 if (!cfile.file) {
6017 ret = -EBADF;
6018 goto out_put_eventfd;
6019 }
6020
 /* The caller needs read permission on the control file. */
6023 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6024 if (ret < 0)
6025 goto out_put_cfile;
6026
 /*
  * Determine the event callbacks from the control file's name and set
  * them in @event.  This used to go through struct cftype, but cgroup
  * core no longer knows about these events.  DO NOT ADD NEW FILES.
  */
6035 name = cfile.file->f_dentry->d_name.name;
6036
6037 if (!strcmp(name, "memory.usage_in_bytes")) {
6038 event->register_event = mem_cgroup_usage_register_event;
6039 event->unregister_event = mem_cgroup_usage_unregister_event;
6040 } else if (!strcmp(name, "memory.oom_control")) {
6041 event->register_event = mem_cgroup_oom_register_event;
6042 event->unregister_event = mem_cgroup_oom_unregister_event;
6043 } else if (!strcmp(name, "memory.pressure_level")) {
6044 event->register_event = vmpressure_register_event;
6045 event->unregister_event = vmpressure_unregister_event;
6046 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6047 event->register_event = memsw_cgroup_usage_register_event;
6048 event->unregister_event = memsw_cgroup_usage_unregister_event;
6049 } else {
6050 ret = -EINVAL;
6051 goto out_put_cfile;
6052 }
6053
 /*
  * Verify that @cfile belongs to @css's cgroup.  Remaining events are
  * removed on cgroup destruction, but that removal is asynchronous,
  * so take an extra reference on @css.
  */
6059 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
6060 &memory_cgrp_subsys);
6061 ret = -EINVAL;
6062 if (IS_ERR(cfile_css))
6063 goto out_put_cfile;
6064 if (cfile_css != css) {
6065 css_put(cfile_css);
6066 goto out_put_cfile;
6067 }
6068
6069 ret = event->register_event(memcg, event->eventfd, buffer);
6070 if (ret)
6071 goto out_put_css;
6072
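 /*
  * Polling with event->pt adds the event to the eventfd's waitqueue via
  * memcg_event_ptable_queue_proc(), so memcg_event_wake() sees POLLHUP
  * when the eventfd is closed.
  */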
6073 efile.file->f_op->poll(efile.file, &event->pt);
6074
6075 spin_lock(&memcg->event_list_lock);
6076 list_add(&event->list, &memcg->event_list);
6077 spin_unlock(&memcg->event_list_lock);
6078
6079 fdput(cfile);
6080 fdput(efile);
6081
6082 return 0;
6083
6084out_put_css:
6085 css_put(css);
6086out_put_cfile:
6087 fdput(cfile);
6088out_put_eventfd:
6089 eventfd_ctx_put(event->eventfd);
6090out_put_efile:
6091 fdput(efile);
6092out_kfree:
6093 kfree(event);
6094
6095 return ret;
6096}
6097
6098static struct cftype mem_cgroup_files[] = {
6099 {
6100 .name = "usage_in_bytes",
6101 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
6102 .read_u64 = mem_cgroup_read_u64,
6103 },
6104 {
6105 .name = "max_usage_in_bytes",
6106 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6107 .trigger = mem_cgroup_reset,
6108 .read_u64 = mem_cgroup_read_u64,
6109 },
6110 {
6111 .name = "limit_in_bytes",
6112 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
6113 .write_string = mem_cgroup_write,
6114 .read_u64 = mem_cgroup_read_u64,
6115 },
6116 {
6117 .name = "soft_limit_in_bytes",
6118 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
6119 .write_string = mem_cgroup_write,
6120 .read_u64 = mem_cgroup_read_u64,
6121 },
6122 {
6123 .name = "failcnt",
6124 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6125 .trigger = mem_cgroup_reset,
6126 .read_u64 = mem_cgroup_read_u64,
6127 },
6128 {
6129 .name = "stat",
6130 .seq_show = memcg_stat_show,
6131 },
6132 {
6133 .name = "force_empty",
6134 .trigger = mem_cgroup_force_empty_write,
6135 },
6136 {
6137 .name = "use_hierarchy",
6138 .flags = CFTYPE_INSANE,
6139 .write_u64 = mem_cgroup_hierarchy_write,
6140 .read_u64 = mem_cgroup_hierarchy_read,
6141 },
6142 {
6143 .name = "cgroup.event_control",
6144 .write_string = memcg_write_event_control,
6145 .flags = CFTYPE_NO_PREFIX,
6146 .mode = S_IWUGO,
6147 },
6148 {
6149 .name = "swappiness",
6150 .read_u64 = mem_cgroup_swappiness_read,
6151 .write_u64 = mem_cgroup_swappiness_write,
6152 },
6153 {
6154 .name = "move_charge_at_immigrate",
6155 .read_u64 = mem_cgroup_move_charge_read,
6156 .write_u64 = mem_cgroup_move_charge_write,
6157 },
6158 {
6159 .name = "oom_control",
6160 .seq_show = mem_cgroup_oom_control_read,
6161 .write_u64 = mem_cgroup_oom_control_write,
6162 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6163 },
6164 {
6165 .name = "pressure_level",
6166 },
6167#ifdef CONFIG_NUMA
6168 {
6169 .name = "numa_stat",
6170 .seq_show = memcg_numa_stat_show,
6171 },
6172#endif
6173#ifdef CONFIG_MEMCG_KMEM
6174 {
6175 .name = "kmem.limit_in_bytes",
6176 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6177 .write_string = mem_cgroup_write,
6178 .read_u64 = mem_cgroup_read_u64,
6179 },
6180 {
6181 .name = "kmem.usage_in_bytes",
6182 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6183 .read_u64 = mem_cgroup_read_u64,
6184 },
6185 {
6186 .name = "kmem.failcnt",
6187 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6188 .trigger = mem_cgroup_reset,
6189 .read_u64 = mem_cgroup_read_u64,
6190 },
6191 {
6192 .name = "kmem.max_usage_in_bytes",
6193 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6194 .trigger = mem_cgroup_reset,
6195 .read_u64 = mem_cgroup_read_u64,
6196 },
6197#ifdef CONFIG_SLABINFO
6198 {
6199 .name = "kmem.slabinfo",
6200 .seq_show = mem_cgroup_slabinfo_read,
6201 },
6202#endif
6203#endif
6204 { },
6205};
6206
6207#ifdef CONFIG_MEMCG_SWAP
6208static struct cftype memsw_cgroup_files[] = {
6209 {
6210 .name = "memsw.usage_in_bytes",
6211 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6212 .read_u64 = mem_cgroup_read_u64,
6213 },
6214 {
6215 .name = "memsw.max_usage_in_bytes",
6216 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6217 .trigger = mem_cgroup_reset,
6218 .read_u64 = mem_cgroup_read_u64,
6219 },
6220 {
6221 .name = "memsw.limit_in_bytes",
6222 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6223 .write_string = mem_cgroup_write,
6224 .read_u64 = mem_cgroup_read_u64,
6225 },
6226 {
6227 .name = "memsw.failcnt",
6228 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6229 .trigger = mem_cgroup_reset,
6230 .read_u64 = mem_cgroup_read_u64,
6231 },
6232 { },
6233};
6234#endif
6235static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6236{
6237 struct mem_cgroup_per_node *pn;
6238 struct mem_cgroup_per_zone *mz;
6239 int zone, tmp = node;
6240
 /*
  * This routine is called against possible nodes, but it is a bug to
  * call kmalloc() against an offline node, so fall back to any node
  * (-1) when the node has no normal memory.
  */
6248 if (!node_state(node, N_NORMAL_MEMORY))
6249 tmp = -1;
6250 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6251 if (!pn)
6252 return 1;
6253
6254 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6255 mz = &pn->zoneinfo[zone];
6256 lruvec_init(&mz->lruvec);
6257 mz->usage_in_excess = 0;
6258 mz->on_tree = false;
6259 mz->memcg = memcg;
6260 }
6261 memcg->nodeinfo[node] = pn;
6262 return 0;
6263}
6264
6265static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6266{
6267 kfree(memcg->nodeinfo[node]);
6268}
6269
6270static struct mem_cgroup *mem_cgroup_alloc(void)
6271{
6272 struct mem_cgroup *memcg;
6273 size_t size;
6274
6275 size = sizeof(struct mem_cgroup);
6276 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
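 /*
  * The per-node pointers (memcg->nodeinfo[]) are laid out as a tail
  * array of struct mem_cgroup, hence the size computation above.
  */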
6277
6278 memcg = kzalloc(size, GFP_KERNEL);
6279 if (!memcg)
6280 return NULL;
6281
6282 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6283 if (!memcg->stat)
6284 goto out_free;
6285 spin_lock_init(&memcg->pcp_counter_lock);
6286 return memcg;
6287
6288out_free:
6289 kfree(memcg);
6290 return NULL;
6291}
6292
/*
 * At mem_cgroup destruction, references from swap_cgroup records can still
 * remain.  Instead of scanning and clearing them all at force_empty time,
 * the number of swap references is tracked and the mem_cgroup is only freed
 * once it drops to zero; removal of the cgroup itself succeeds regardless
 * of outstanding swap references.
 */
6304static void __mem_cgroup_free(struct mem_cgroup *memcg)
6305{
6306 int node;
6307
6308 mem_cgroup_remove_from_trees(memcg);
6309
6310 for_each_node(node)
6311 free_mem_cgroup_per_zone_info(memcg, node);
6312
6313 free_percpu(memcg->stat);
6314
 /*
  * Tear down the static branches used for kmem/tcp accounting last and
  * outside of the cgroup locks: the static key code may need
  * get_online_cpus(), and doing this at free time keeps the lock
  * ordering safe.
  */
6326 disarm_static_keys(memcg);
6327 kfree(memcg);
6328}
6329
6330
/*
 * Returns the parent mem_cgroup of @memcg, or NULL for the root.
 */
6333struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6334{
6335 if (!memcg->res.parent)
6336 return NULL;
6337 return mem_cgroup_from_res_counter(memcg->res.parent, res);
6338}
6339EXPORT_SYMBOL(parent_mem_cgroup);
6340
6341static void __init mem_cgroup_soft_limit_tree_init(void)
6342{
6343 struct mem_cgroup_tree_per_node *rtpn;
6344 struct mem_cgroup_tree_per_zone *rtpz;
6345 int tmp, node, zone;
6346
6347 for_each_node(node) {
6348 tmp = node;
6349 if (!node_state(node, N_NORMAL_MEMORY))
6350 tmp = -1;
6351 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6352 BUG_ON(!rtpn);
6353
6354 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6355
6356 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6357 rtpz = &rtpn->rb_tree_per_zone[zone];
6358 rtpz->rb_root = RB_ROOT;
6359 spin_lock_init(&rtpz->lock);
6360 }
6361 }
6362}
6363
6364static struct cgroup_subsys_state * __ref
6365mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6366{
6367 struct mem_cgroup *memcg;
6368 long error = -ENOMEM;
6369 int node;
6370
6371 memcg = mem_cgroup_alloc();
6372 if (!memcg)
6373 return ERR_PTR(error);
6374
6375 for_each_node(node)
6376 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6377 goto free_out;
6378
 /* root cgroup? */
6380 if (parent_css == NULL) {
6381 root_mem_cgroup = memcg;
6382 res_counter_init(&memcg->res, NULL);
6383 res_counter_init(&memcg->memsw, NULL);
6384 res_counter_init(&memcg->kmem, NULL);
6385 }
6386
6387 memcg->last_scanned_node = MAX_NUMNODES;
6388 INIT_LIST_HEAD(&memcg->oom_notify);
6389 memcg->move_charge_at_immigrate = 0;
6390 mutex_init(&memcg->thresholds_lock);
6391 spin_lock_init(&memcg->move_lock);
6392 vmpressure_init(&memcg->vmpressure);
6393 INIT_LIST_HEAD(&memcg->event_list);
6394 spin_lock_init(&memcg->event_list_lock);
6395
6396 return &memcg->css;
6397
6398free_out:
6399 __mem_cgroup_free(memcg);
6400 return ERR_PTR(error);
6401}
6402
6403static int
6404mem_cgroup_css_online(struct cgroup_subsys_state *css)
6405{
6406 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6407 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6408
6409 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6410 return -ENOSPC;
6411
6412 if (!parent)
6413 return 0;
6414
6415 mutex_lock(&memcg_create_mutex);
6416
6417 memcg->use_hierarchy = parent->use_hierarchy;
6418 memcg->oom_kill_disable = parent->oom_kill_disable;
6419 memcg->swappiness = mem_cgroup_swappiness(parent);
6420
6421 if (parent->use_hierarchy) {
6422 res_counter_init(&memcg->res, &parent->res);
6423 res_counter_init(&memcg->memsw, &parent->memsw);
6424 res_counter_init(&memcg->kmem, &parent->kmem);
 /*
  * No need to take a reference to the parent: cgroup core
  * guarantees its existence.
  */
6430 } else {
6431 res_counter_init(&memcg->res, NULL);
6432 res_counter_init(&memcg->memsw, NULL);
6433 res_counter_init(&memcg->kmem, NULL);
 /*
  * A deeper hierarchy with use_hierarchy == false doesn't make much
  * sense, so let the cgroup subsystem know about this unexpected
  * state by flagging the hierarchy as broken.
  */
6439 if (parent != root_mem_cgroup)
6440 memory_cgrp_subsys.broken_hierarchy = true;
6441 }
6442 mutex_unlock(&memcg_create_mutex);
6443
6444 return memcg_init_kmem(memcg, &memory_cgrp_subsys);
6445}
6446
/*
 * Announce to all ancestors that a group from their hierarchy is gone.
 */
6450static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6451{
6452 struct mem_cgroup *parent = memcg;
6453
6454 while ((parent = parent_mem_cgroup(parent)))
6455 mem_cgroup_iter_invalidate(parent);
6456
 /*
  * If the root memcg is not hierarchical it is not visited by the
  * loop above and has to be invalidated explicitly.
  */
6461 if (!root_mem_cgroup->use_hierarchy)
6462 mem_cgroup_iter_invalidate(root_mem_cgroup);
6463}
6464
6465static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6466{
6467 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6468 struct mem_cgroup_event *event, *tmp;
6469 struct cgroup_subsys_state *iter;
6470
 /*
  * Unregister events and notify userspace.  Userspace is notified
  * about the cgroup's removal only after rmdir of the cgroup directory
  * to avoid races between userspace and kernelspace.
  */
6476 spin_lock(&memcg->event_list_lock);
6477 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6478 list_del_init(&event->list);
6479 schedule_work(&event->remove);
6480 }
6481 spin_unlock(&memcg->event_list_lock);
6482
6483 kmem_cgroup_css_offline(memcg);
6484
6485 mem_cgroup_invalidate_reclaim_iterators(memcg);
6486
 /*
  * Reparent the charges of this memcg and all of its descendants.
  * This relies on offlining being serialized by the cgroup core.
  */
6491 css_for_each_descendant_post(iter, css)
6492 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6493
6494 mem_cgroup_destroy_all_caches(memcg);
6495 vmpressure_cleanup(&memcg->vmpressure);
6496}
6497
6498static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6499{
6500 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6501
 /*
  * Most charges are reparented in css_offline(), but a charge from the
  * swapin path can race with offlining: the target memcg is looked up
  * from the swapout record rather than from current, so it may be
  * charged after its charges were already reparented.  Reparent once
  * more here to catch anything that slipped in after offlining, before
  * the memcg is finally freed.
  */
6536 mem_cgroup_reparent_charges(memcg);
6537
6538 memcg_destroy_kmem(memcg);
6539 __mem_cgroup_free(memcg);
6540}
6541
6542#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
6544#define PRECHARGE_COUNT_AT_ONCE 256
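/*
 * When the batch charge fails, mem_cgroup_do_precharge() falls back to
 * charging one page at a time and calls cond_resched() every
 * PRECHARGE_COUNT_AT_ONCE pages.
 */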
6545static int mem_cgroup_do_precharge(unsigned long count)
6546{
6547 int ret = 0;
6548 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6549 struct mem_cgroup *memcg = mc.to;
6550
6551 if (mem_cgroup_is_root(memcg)) {
6552 mc.precharge += count;
6553
6554 return ret;
6555 }
6556
6557 if (count > 1) {
6558 struct res_counter *dummy;
 /*
  * Try to charge the whole batch against mc.to in one go; if either
  * the memory or the memsw counter refuses, undo and fall back to
  * charging one page at a time below.
  */
6565 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6566 goto one_by_one;
6567 if (do_swap_account && res_counter_charge(&memcg->memsw,
6568 PAGE_SIZE * count, &dummy)) {
6569 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6570 goto one_by_one;
6571 }
6572 mc.precharge += count;
6573 return ret;
6574 }
6575one_by_one:
 /* fall back to charging one page at a time */
6577 while (count--) {
6578 if (signal_pending(current)) {
6579 ret = -EINTR;
6580 break;
6581 }
6582 if (!batch_count--) {
6583 batch_count = PRECHARGE_COUNT_AT_ONCE;
6584 cond_resched();
6585 }
6586 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
6587 if (ret)
6588
6589 return ret;
6590 mc.precharge++;
6591 }
6592 return ret;
6593}
6594
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap entry will be stored in
 *          (can be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE: the pte is not a target for move charge.
 *   MC_TARGET_PAGE: the page corresponding to the pte is a target; if
 *     @target is not NULL, the page is stored in target->page with an
 *     extra refcount taken (callers must handle it).
 *   MC_TARGET_SWAP: the swap entry corresponding to the pte is a target;
 *     if @target is not NULL, the entry is stored in target->ent.
 *
 * Called with pte lock held.
 */
6613union mc_target {
6614 struct page *page;
6615 swp_entry_t ent;
6616};
6617
6618enum mc_target_type {
6619 MC_TARGET_NONE = 0,
6620 MC_TARGET_PAGE,
6621 MC_TARGET_SWAP,
6622};
6623
6624static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6625 unsigned long addr, pte_t ptent)
6626{
6627 struct page *page = vm_normal_page(vma, addr, ptent);
6628
6629 if (!page || !page_mapped(page))
6630 return NULL;
6631 if (PageAnon(page)) {
6632
6633 if (!move_anon())
6634 return NULL;
6635 } else if (!move_file())
6636
6637 return NULL;
6638 if (!get_page_unless_zero(page))
6639 return NULL;
6640
6641 return page;
6642}
6643
6644#ifdef CONFIG_SWAP
6645static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6646 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6647{
6648 struct page *page = NULL;
6649 swp_entry_t ent = pte_to_swp_entry(ptent);
6650
6651 if (!move_anon() || non_swap_entry(ent))
6652 return NULL;
6653
 /*
  * Use find_get_page() on the swap address space directly instead of
  * lookup_swap_cache(), which would update swap-cache statistics.
  */
6657 page = find_get_page(swap_address_space(ent), ent.val);
6658 if (do_swap_account)
6659 entry->val = ent.val;
6660
6661 return page;
6662}
6663#else
6664static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6665 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6666{
6667 return NULL;
6668}
6669#endif
6670
6671static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6672 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6673{
6674 struct page *page = NULL;
6675 struct address_space *mapping;
6676 pgoff_t pgoff;
6677
6678 if (!vma->vm_file)
6679 return NULL;
6680 if (!move_file())
6681 return NULL;
6682
6683 mapping = vma->vm_file->f_mapping;
6684 if (pte_none(ptent))
6685 pgoff = linear_page_index(vma, addr);
6686 else
6687 pgoff = pte_to_pgoff(ptent);
6688
6689
6690#ifdef CONFIG_SWAP
6691
6692 if (shmem_mapping(mapping)) {
6693 page = find_get_entry(mapping, pgoff);
6694 if (radix_tree_exceptional_entry(page)) {
6695 swp_entry_t swp = radix_to_swp_entry(page);
6696 if (do_swap_account)
6697 *entry = swp;
6698 page = find_get_page(swap_address_space(swp), swp.val);
6699 }
6700 } else
6701 page = find_get_page(mapping, pgoff);
6702#else
6703 page = find_get_page(mapping, pgoff);
6704#endif
6705 return page;
6706}
6707
6708static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6709 unsigned long addr, pte_t ptent, union mc_target *target)
6710{
6711 struct page *page = NULL;
6712 struct page_cgroup *pc;
6713 enum mc_target_type ret = MC_TARGET_NONE;
6714 swp_entry_t ent = { .val = 0 };
6715
6716 if (pte_present(ptent))
6717 page = mc_handle_present_pte(vma, addr, ptent);
6718 else if (is_swap_pte(ptent))
6719 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6720 else if (pte_none(ptent) || pte_file(ptent))
6721 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6722
6723 if (!page && !ent.val)
6724 return ret;
6725 if (page) {
6726 pc = lookup_page_cgroup(page);
 /*
  * Only a loose, unserialized check here;
  * mem_cgroup_move_account() re-checks the page_cgroup under the
  * proper locks.
  */
6732 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6733 ret = MC_TARGET_PAGE;
6734 if (target)
6735 target->page = page;
6736 }
6737 if (!ret || !target)
6738 put_page(page);
6739 }
6740
6741 if (ent.val && !ret &&
6742 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6743 ret = MC_TARGET_SWAP;
6744 if (target)
6745 target->ent = ent;
6746 }
6747 return ret;
6748}
6749
6750#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Swap and file-mapped pages are not considered here because THP does not
 * support them for now.  The caller must ensure pmd_trans_huge(pmd) is true.
 */
6756static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6757 unsigned long addr, pmd_t pmd, union mc_target *target)
6758{
6759 struct page *page = NULL;
6760 struct page_cgroup *pc;
6761 enum mc_target_type ret = MC_TARGET_NONE;
6762
6763 page = pmd_page(pmd);
6764 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6765 if (!move_anon())
6766 return ret;
6767 pc = lookup_page_cgroup(page);
6768 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6769 ret = MC_TARGET_PAGE;
6770 if (target) {
6771 get_page(page);
6772 target->page = page;
6773 }
6774 }
6775 return ret;
6776}
6777#else
6778static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6779 unsigned long addr, pmd_t pmd, union mc_target *target)
6780{
6781 return MC_TARGET_NONE;
6782}
6783#endif
6784
6785static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6786 unsigned long addr, unsigned long end,
6787 struct mm_walk *walk)
6788{
6789 struct vm_area_struct *vma = walk->private;
6790 pte_t *pte;
6791 spinlock_t *ptl;
6792
6793 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6794 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6795 mc.precharge += HPAGE_PMD_NR;
6796 spin_unlock(ptl);
6797 return 0;
6798 }
6799
6800 if (pmd_trans_unstable(pmd))
6801 return 0;
6802 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6803 for (; addr != end; pte++, addr += PAGE_SIZE)
6804 if (get_mctgt_type(vma, addr, *pte, NULL))
6805 mc.precharge++;
6806 pte_unmap_unlock(pte - 1, ptl);
6807 cond_resched();
6808
6809 return 0;
6810}
6811
6812static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6813{
6814 unsigned long precharge;
6815 struct vm_area_struct *vma;
6816
6817 down_read(&mm->mmap_sem);
6818 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6819 struct mm_walk mem_cgroup_count_precharge_walk = {
6820 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6821 .mm = mm,
6822 .private = vma,
6823 };
6824 if (is_vm_hugetlb_page(vma))
6825 continue;
6826 walk_page_range(vma->vm_start, vma->vm_end,
6827 &mem_cgroup_count_precharge_walk);
6828 }
6829 up_read(&mm->mmap_sem);
6830
6831 precharge = mc.precharge;
6832 mc.precharge = 0;
6833
6834 return precharge;
6835}
6836
6837static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6838{
6839 unsigned long precharge = mem_cgroup_count_precharge(mm);
6840
6841 VM_BUG_ON(mc.moving_task);
6842 mc.moving_task = current;
6843 return mem_cgroup_do_precharge(precharge);
6844}
6845
/* Cancel leftover precharges and fix up charges moved so far on mc.from/mc.to. */
6847static void __mem_cgroup_clear_mc(void)
6848{
6849 struct mem_cgroup *from = mc.from;
6850 struct mem_cgroup *to = mc.to;
6851 int i;
6852
 /* Uncharge all leftover precharges from mc.to. */
6854 if (mc.precharge) {
6855 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6856 mc.precharge = 0;
6857 }
6858
 /*
  * mem_cgroup_move_account() did not uncharge mc.from for the pages
  * it moved, so that has to be done here.
  */
6862 if (mc.moved_charge) {
6863 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6864 mc.moved_charge = 0;
6865 }
6866
6867 if (mc.moved_swap) {
 /* Uncharge the swap account from the old cgroup. */
6869 if (!mem_cgroup_is_root(mc.from))
6870 res_counter_uncharge(&mc.from->memsw,
6871 PAGE_SIZE * mc.moved_swap);
6872
6873 for (i = 0; i < mc.moved_swap; i++)
6874 css_put(&mc.from->css);
6875
6876 if (!mem_cgroup_is_root(mc.to)) {
 /*
  * Both to->res and to->memsw were charged for the moved swap
  * entries, so to->res has to be uncharged here.
  */
6881 res_counter_uncharge(&mc.to->res,
6882 PAGE_SIZE * mc.moved_swap);
6883 }
6884
6885 mc.moved_swap = 0;
6886 }
6887 memcg_oom_recover(from);
6888 memcg_oom_recover(to);
6889 wake_up_all(&mc.waitq);
6890}
6891
6892static void mem_cgroup_clear_mc(void)
6893{
6894 struct mem_cgroup *from = mc.from;
6895
 /*
  * Clear moving_task before waking up the waiters at the end of task
  * migration.
  */
6900 mc.moving_task = NULL;
6901 __mem_cgroup_clear_mc();
6902 spin_lock(&mc.lock);
6903 mc.from = NULL;
6904 mc.to = NULL;
6905 spin_unlock(&mc.lock);
6906 mem_cgroup_end_move(from);
6907}
6908
6909static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6910 struct cgroup_taskset *tset)
6911{
6912 struct task_struct *p = cgroup_taskset_first(tset);
6913 int ret = 0;
6914 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6915 unsigned long move_charge_at_immigrate;
6916
 /*
  * Commit to the current value of move_charge_at_immigrate; changes to
  * the tunable only affect future migrations, not this one.
  */
6922 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6923 if (move_charge_at_immigrate) {
6924 struct mm_struct *mm;
6925 struct mem_cgroup *from = mem_cgroup_from_task(p);
6926
6927 VM_BUG_ON(from == memcg);
6928
6929 mm = get_task_mm(p);
6930 if (!mm)
6931 return 0;
6932
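 /* Only move charges when the mm's owner itself is migrating. */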
6933 if (mm->owner == p) {
6934 VM_BUG_ON(mc.from);
6935 VM_BUG_ON(mc.to);
6936 VM_BUG_ON(mc.precharge);
6937 VM_BUG_ON(mc.moved_charge);
6938 VM_BUG_ON(mc.moved_swap);
6939 mem_cgroup_start_move(from);
6940 spin_lock(&mc.lock);
6941 mc.from = from;
6942 mc.to = memcg;
6943 mc.immigrate_flags = move_charge_at_immigrate;
6944 spin_unlock(&mc.lock);
6945
 /* mc.moving_task is set later, in mem_cgroup_precharge_mc(). */
6947 ret = mem_cgroup_precharge_mc(mm);
6948 if (ret)
6949 mem_cgroup_clear_mc();
6950 }
6951 mmput(mm);
6952 }
6953 return ret;
6954}
6955
6956static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6957 struct cgroup_taskset *tset)
6958{
6959 mem_cgroup_clear_mc();
6960}
6961
6962static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6963 unsigned long addr, unsigned long end,
6964 struct mm_walk *walk)
6965{
6966 int ret = 0;
6967 struct vm_area_struct *vma = walk->private;
6968 pte_t *pte;
6969 spinlock_t *ptl;
6970 enum mc_target_type target_type;
6971 union mc_target target;
6972 struct page *page;
6973 struct page_cgroup *pc;
6974
 /*
  * No compound_lock() is taken here, but there is no race with a THP
  * split: if pmd_trans_huge_lock() returns 1 the huge page is not
  * under splitting, and a split started after that must wait for the
  * page table lock we hold until we drop it.
  */
6985 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6986 if (mc.precharge < HPAGE_PMD_NR) {
6987 spin_unlock(ptl);
6988 return 0;
6989 }
6990 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6991 if (target_type == MC_TARGET_PAGE) {
6992 page = target.page;
6993 if (!isolate_lru_page(page)) {
6994 pc = lookup_page_cgroup(page);
6995 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6996 pc, mc.from, mc.to)) {
6997 mc.precharge -= HPAGE_PMD_NR;
6998 mc.moved_charge += HPAGE_PMD_NR;
6999 }
7000 putback_lru_page(page);
7001 }
7002 put_page(page);
7003 }
7004 spin_unlock(ptl);
7005 return 0;
7006 }
7007
7008 if (pmd_trans_unstable(pmd))
7009 return 0;
7010retry:
7011 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
7012 for (; addr != end; addr += PAGE_SIZE) {
7013 pte_t ptent = *(pte++);
7014 swp_entry_t ent;
7015
7016 if (!mc.precharge)
7017 break;
7018
7019 switch (get_mctgt_type(vma, addr, ptent, &target)) {
7020 case MC_TARGET_PAGE:
7021 page = target.page;
7022 if (isolate_lru_page(page))
7023 goto put;
7024 pc = lookup_page_cgroup(page);
7025 if (!mem_cgroup_move_account(page, 1, pc,
7026 mc.from, mc.to)) {
7027 mc.precharge--;
7028
7029 mc.moved_charge++;
7030 }
7031 putback_lru_page(page);
7032put:
7033 put_page(page);
7034 break;
7035 case MC_TARGET_SWAP:
7036 ent = target.ent;
7037 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
7038 mc.precharge--;
7039
7040 mc.moved_swap++;
7041 }
7042 break;
7043 default:
7044 break;
7045 }
7046 }
7047 pte_unmap_unlock(pte - 1, ptl);
7048 cond_resched();
7049
7050 if (addr != end) {
 /*
  * All precharges obtained in can_attach() have been consumed.  Try
  * to charge one more page; if that fails, give up on the rest.
  */
7057 ret = mem_cgroup_do_precharge(1);
7058 if (!ret)
7059 goto retry;
7060 }
7061
7062 return ret;
7063}
7064
7065static void mem_cgroup_move_charge(struct mm_struct *mm)
7066{
7067 struct vm_area_struct *vma;
7068
7069 lru_add_drain_all();
7070retry:
7071 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 /*
  * Someone holding mmap_sem may be waiting in our waitq.  Cancel all
  * extra charges, wake the waiters up and retry.  We may then lack
  * precharges to move everything, but charge moving is a best-effort
  * feature anyway.
  */
7079 __mem_cgroup_clear_mc();
7080 cond_resched();
7081 goto retry;
7082 }
7083 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7084 int ret;
7085 struct mm_walk mem_cgroup_move_charge_walk = {
7086 .pmd_entry = mem_cgroup_move_charge_pte_range,
7087 .mm = mm,
7088 .private = vma,
7089 };
7090 if (is_vm_hugetlb_page(vma))
7091 continue;
7092 ret = walk_page_range(vma->vm_start, vma->vm_end,
7093 &mem_cgroup_move_charge_walk);
7094 if (ret)
 /*
  * All precharges are consumed and an additional charge failed;
  * just abandon the walk here.
  */
7099 break;
7100 }
7101 up_read(&mm->mmap_sem);
7102}
7103
7104static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7105 struct cgroup_taskset *tset)
7106{
7107 struct task_struct *p = cgroup_taskset_first(tset);
7108 struct mm_struct *mm = get_task_mm(p);
7109
7110 if (mm) {
7111 if (mc.to)
7112 mem_cgroup_move_charge(mm);
7113 mmput(mm);
7114 }
7115 if (mc.to)
7116 mem_cgroup_clear_mc();
7117}
7118#else
7119static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
7120 struct cgroup_taskset *tset)
7121{
7122 return 0;
7123}
7124static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
7125 struct cgroup_taskset *tset)
7126{
7127}
7128static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7129 struct cgroup_taskset *tset)
7130{
7131}
7132#endif
7133
/*
 * Cgroup core retains root cgroups across [un]mount cycles, so the
 * sane_behavior flag has to be re-checked on every mount (bind).
 */
7138static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7139{
 /*
  * use_hierarchy is forced with sane_behavior.  Cgroup core guarantees
  * that @root_css has no children yet, so turning it on for the root
  * memcg is enough.
  */
7145 if (cgroup_sane_behavior(root_css->cgroup))
7146 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7147}
7148
7149struct cgroup_subsys memory_cgrp_subsys = {
7150 .css_alloc = mem_cgroup_css_alloc,
7151 .css_online = mem_cgroup_css_online,
7152 .css_offline = mem_cgroup_css_offline,
7153 .css_free = mem_cgroup_css_free,
7154 .can_attach = mem_cgroup_can_attach,
7155 .cancel_attach = mem_cgroup_cancel_attach,
7156 .attach = mem_cgroup_move_task,
7157 .bind = mem_cgroup_bind,
7158 .base_cftypes = mem_cgroup_files,
7159 .early_init = 0,
7160};
7161
7162#ifdef CONFIG_MEMCG_SWAP
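/* Parse the "swapaccount=" boot option to enable or disable swap accounting. */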
7163static int __init enable_swap_account(char *s)
7164{
7165 if (!strcmp(s, "1"))
7166 really_do_swap_account = 1;
7167 else if (!strcmp(s, "0"))
7168 really_do_swap_account = 0;
7169 return 1;
7170}
7171__setup("swapaccount=", enable_swap_account);
7172
7173static void __init memsw_file_init(void)
7174{
7175 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
7176}
7177
7178static void __init enable_swap_cgroup(void)
7179{
7180 if (!mem_cgroup_disabled() && really_do_swap_account) {
7181 do_swap_account = 1;
7182 memsw_file_init();
7183 }
7184}
7185
7186#else
7187static void __init enable_swap_cgroup(void)
7188{
7189}
7190#endif
7191
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts, like hotcpu_notifier(), have to be initialized from this
 * context because of lock dependencies, but basically everything that does
 * not depend on a specific locking policy can be initialized from an
 * initcall.
 */
7200static int __init mem_cgroup_init(void)
7201{
7202 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7203 enable_swap_cgroup();
7204 mem_cgroup_soft_limit_tree_init();
7205 memcg_stock_init();
7206 return 0;
7207}
7208subsys_initcall(mem_cgroup_init);
7209