/* Memory resource controller (memcg) implementation */
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>
68
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Whether accounting and control of the mem+swap counter is enabled */
int do_swap_account __read_mostly;

/* Default state of swap accounting, selectable at build/boot time */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

#else
#define do_swap_account		0
#endif

/* Human-readable names for the MEM_CGROUP_STAT_* counters */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};
99
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,
	MEM_CGROUP_EVENTS_PGPGOUT,
	MEM_CGROUP_EVENTS_PGFAULT,
	MEM_CGROUP_EVENTS_PGMAJFAULT,
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * The per-memcg page-event counter is incremented on every charge and
 * uncharge by the number of pages involved and is used to rate-limit
 * periodic work: threshold checks, soft-limit tree updates and NUMA-info
 * refreshes.  The targets below define how many page events must elapse
 * before each of those is triggered again.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET	128
#define SOFTLIMIT_EVENTS_TARGET		1024
#define NUMAINFO_EVENTS_TARGET		1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
145
struct mem_cgroup_reclaim_iter {
	/*
	 * Last scanned hierarchy member; only valid while last_dead_count
	 * still matches the dead_count of the hierarchy root.
	 */
	struct mem_cgroup *last_visited;
	int last_dead_count;

	/* scan generation, bumped on every full round-trip */
	unsigned int generation;
};

/*
 * Per-zone state of a memory cgroup: the LRU lists themselves plus the
 * shared reclaim iterators and the soft-limit tree linkage.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* soft-limit RB-tree node */
	unsigned long long	usage_in_excess;/* bytes over the soft limit */
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* back pointer to the owner */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their soft limit are kept in a per-node, per-zone RB-tree,
 * sorted by how far they exceed it, independent of the hierarchy.
 */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198
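/*
 * A single usage threshold registered through the eventfd interface; the
 * eventfd is signalled once memory usage crosses @threshold bytes.
 */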
199struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd;
201 u64 threshold;
202};
203
204
205struct mem_cgroup_threshold_ary {
206
207 int current_threshold;
208
209 unsigned int size;
210
211 struct mem_cgroup_threshold entries[0];
212};
213
214struct mem_cgroup_thresholds {
215
216 struct mem_cgroup_threshold_ary *primary;
217
218
219
220
221
222 struct mem_cgroup_threshold_ary *spare;
223};
224
225
226struct mem_cgroup_eventfd_list {
227 struct list_head list;
228 struct eventfd_ctx *eventfd;
229};
230
231
232
233
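/*
 * A userspace-requested notification: couples an eventfd with one of the
 * memcg control files (usage thresholds, oom_control, pressure) and is
 * torn down from a workqueue when the eventfd or the cgroup goes away.
 */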
234struct mem_cgroup_event {
235
236
237
238 struct mem_cgroup *memcg;
239
240
241
242 struct eventfd_ctx *eventfd;
243
244
245
246 struct list_head list;
247
248
249
250
251
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254
255
256
257
258
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261
262
263
264
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
273
274
275
276
277
278
279
280
281
282
283
284
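/*
 * The memory controller data structure.  The res, memsw and kmem counters
 * track memory, memory+swap and kernel memory usage and limits; nodeinfo[]
 * carries the per-node, per-zone LRU state; the remaining fields implement
 * per-cpu statistics, usage thresholds, OOM handling and charge moving.
 */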
285struct mem_cgroup {
286 struct cgroup_subsys_state css;
287
288
289
290 struct res_counter res;
291
292
293 struct vmpressure vmpressure;
294
295
296 int initialized;
297
298
299
300
301 struct res_counter memsw;
302
303
304
305
306 struct res_counter kmem;
307
308
309
310 bool use_hierarchy;
311 unsigned long kmem_account_flags;
312
313 bool oom_lock;
314 atomic_t under_oom;
315 atomic_t oom_wakeups;
316
317 int swappiness;
318
319 int oom_kill_disable;
320
321
322 struct mutex thresholds_lock;
323
324
325 struct mem_cgroup_thresholds thresholds;
326
327
328 struct mem_cgroup_thresholds memsw_thresholds;
329
330
331 struct list_head oom_notify;
332
333
334
335
336
337 unsigned long move_charge_at_immigrate;
338
339
340
341 atomic_t moving_account;
342
343 spinlock_t move_lock;
344
345
346
347 struct mem_cgroup_stat_cpu __percpu *stat;
348
349
350
351
352 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock;
354
355 atomic_t dead_count;
356#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem;
358#endif
359#if defined(CONFIG_MEMCG_KMEM)
360
361
362 struct list_head memcg_slab_caches;
363
364 int kmemcg_id;
365#endif
366
367 int last_scanned_node;
368#if MAX_NUMNODES > 1
369 nodemask_t scan_nodes;
370 atomic_t numainfo_events;
371 atomic_t numainfo_updating;
372#endif
373
374
375 struct list_head event_list;
376 spinlock_t event_list_lock;
377
378 struct mem_cgroup_per_node *nodeinfo[0];
379
380};
381
382
383enum {
384 KMEM_ACCOUNTED_ACTIVE,
385 KMEM_ACCOUNTED_DEAD,
386};
387
388#ifdef CONFIG_MEMCG_KMEM
389static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
390{
391 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
392}
393
394static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
395{
396 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
397}
398
399static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
400{
401
402
403
404
405 smp_wmb();
406 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
407 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
408}
409
410static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
411{
412 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
413 &memcg->kmem_account_flags);
414}
415#endif
416
417
418
419
420
421
422enum move_type {
423 MOVE_CHARGE_TYPE_ANON,
424 MOVE_CHARGE_TYPE_FILE,
425 NR_MOVE_TYPE,
426};
427
428
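/*
 * State used when moving charges at task migration; mc.from and mc.to are
 * read under mc.lock.
 */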
429static struct move_charge_struct {
430 spinlock_t lock;
431 struct mem_cgroup *from;
432 struct mem_cgroup *to;
433 unsigned long immigrate_flags;
434 unsigned long precharge;
435 unsigned long moved_charge;
436 unsigned long moved_swap;
437 struct task_struct *moving_task;
438 wait_queue_head_t waitq;
439} mc = {
440 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
441 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
442};
443
444static bool move_anon(void)
445{
446 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
447}
448
449static bool move_file(void)
450{
451 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
452}
453
454
455
456
457
458#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
459#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
460
461enum charge_type {
462 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
463 MEM_CGROUP_CHARGE_TYPE_ANON,
464 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
465 MEM_CGROUP_CHARGE_TYPE_DROP,
466 NR_CHARGE_TYPE,
467};
468
469
470enum res_type {
471 _MEM,
472 _MEMSWAP,
473 _OOM_TYPE,
474 _KMEM,
475};
476
477#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
478#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
479#define MEMFILE_ATTR(val) ((val) & 0xffff)
480
481#define OOM_CONTROL (0)
482
483
484
485
486
487
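/*
 * Serializes creation of new memory cgroups so that hierarchy-wide settings
 * cannot change while a child is being brought online.
 */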
488static DEFINE_MUTEX(memcg_create_mutex);
489
490struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
491{
492 return s ? container_of(s, struct mem_cgroup, css) : NULL;
493}
494
495
496struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
497{
498 if (!memcg)
499 memcg = root_mem_cgroup;
500 return &memcg->vmpressure;
501}
502
503struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
504{
505 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
506}
507
508static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
509{
510 return (memcg == root_mem_cgroup);
511}
512
513
514
515
516
517#define MEM_CGROUP_ID_MAX USHRT_MAX
518
519static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
520{
521 return memcg->css.id;
522}
523
524static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
525{
526 struct cgroup_subsys_state *css;
527
528 css = css_from_id(id, &memory_cgrp_subsys);
529 return mem_cgroup_from_css(css);
530}
531
532
533#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
534
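/*
 * Associate a newly created (or cloned) socket with the current task's
 * memcg: if TCP accounting is active for that memcg, take a css reference
 * and point sk->sk_cgrp at its cg_proto state.
 */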
535void sock_update_memcg(struct sock *sk)
536{
537 if (mem_cgroup_sockets_enabled) {
538 struct mem_cgroup *memcg;
539 struct cg_proto *cg_proto;
540
541 BUG_ON(!sk->sk_prot->proto_cgroup);
542
543
544
545
546
547
548
549
550
551 if (sk->sk_cgrp) {
552 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
553 css_get(&sk->sk_cgrp->memcg->css);
554 return;
555 }
556
557 rcu_read_lock();
558 memcg = mem_cgroup_from_task(current);
559 cg_proto = sk->sk_prot->proto_cgroup(memcg);
560 if (!mem_cgroup_is_root(memcg) &&
561 memcg_proto_active(cg_proto) &&
562 css_tryget_online(&memcg->css)) {
563 sk->sk_cgrp = cg_proto;
564 }
565 rcu_read_unlock();
566 }
567}
568EXPORT_SYMBOL(sock_update_memcg);
569
570void sock_release_memcg(struct sock *sk)
571{
572 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
573 struct mem_cgroup *memcg;
574 WARN_ON(!sk->sk_cgrp->memcg);
575 memcg = sk->sk_cgrp->memcg;
576 css_put(&sk->sk_cgrp->memcg->css);
577 }
578}
579
580struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
581{
582 if (!memcg || mem_cgroup_is_root(memcg))
583 return NULL;
584
585 return &memcg->tcp_mem;
586}
587EXPORT_SYMBOL(tcp_proto_cgroup);
588
589static void disarm_sock_keys(struct mem_cgroup *memcg)
590{
591 if (!memcg_proto_activated(&memcg->tcp_mem))
592 return;
593 static_key_slow_dec(&memcg_socket_limit_enabled);
594}
595#else
596static void disarm_sock_keys(struct mem_cgroup *memcg)
597{
598}
599#endif
600
601#ifdef CONFIG_MEMCG_KMEM
602
603
604
605
606
607
608
609
610
611
612
613
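/*
 * IDA handing out per-memcg kmem cache indexes.  A kmem-active memcg gets
 * an index into the memcg_caches[] array of every root cache;
 * memcg_limited_groups_array_size is the current length of those arrays.
 */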
614static DEFINE_IDA(kmem_limited_groups);
615int memcg_limited_groups_array_size;
616
617
618
619
620
621
622
623
624
625
626
627
628
629#define MEMCG_CACHES_MIN_SIZE 4
630#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
631
632
633
634
635
636
637
638struct static_key memcg_kmem_enabled_key;
639EXPORT_SYMBOL(memcg_kmem_enabled_key);
640
641static void memcg_free_cache_id(int id);
642
643static void disarm_kmem_keys(struct mem_cgroup *memcg)
644{
645 if (memcg_kmem_is_active(memcg)) {
646 static_key_slow_dec(&memcg_kmem_enabled_key);
647 memcg_free_cache_id(memcg->kmemcg_id);
648 }
649
650
651
652
653 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
654}
655#else
656static void disarm_kmem_keys(struct mem_cgroup *memcg)
657{
658}
659#endif
660
661static void disarm_static_keys(struct mem_cgroup *memcg)
662{
663 disarm_sock_keys(memcg);
664 disarm_kmem_keys(memcg);
665}
666
667static void drain_all_stock_async(struct mem_cgroup *memcg);
668
669static struct mem_cgroup_per_zone *
670mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
671{
672 int nid = zone_to_nid(zone);
673 int zid = zone_idx(zone);
674
675 return &memcg->nodeinfo[nid]->zoneinfo[zid];
676}
677
678struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
679{
680 return &memcg->css;
681}
682
683static struct mem_cgroup_per_zone *
684mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
685{
686 int nid = page_to_nid(page);
687 int zid = page_zonenum(page);
688
689 return &memcg->nodeinfo[nid]->zoneinfo[zid];
690}
691
692static struct mem_cgroup_tree_per_zone *
693soft_limit_tree_node_zone(int nid, int zid)
694{
695 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
696}
697
698static struct mem_cgroup_tree_per_zone *
699soft_limit_tree_from_page(struct page *page)
700{
701 int nid = page_to_nid(page);
702 int zid = page_zonenum(page);
703
704 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
705}
706
707static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
708 struct mem_cgroup_tree_per_zone *mctz,
709 unsigned long long new_usage_in_excess)
710{
711 struct rb_node **p = &mctz->rb_root.rb_node;
712 struct rb_node *parent = NULL;
713 struct mem_cgroup_per_zone *mz_node;
714
715 if (mz->on_tree)
716 return;
717
718 mz->usage_in_excess = new_usage_in_excess;
719 if (!mz->usage_in_excess)
720 return;
721 while (*p) {
722 parent = *p;
723 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
724 tree_node);
725 if (mz->usage_in_excess < mz_node->usage_in_excess)
726 p = &(*p)->rb_left;
727
728
729
730
731 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
732 p = &(*p)->rb_right;
733 }
734 rb_link_node(&mz->tree_node, parent, p);
735 rb_insert_color(&mz->tree_node, &mctz->rb_root);
736 mz->on_tree = true;
737}
738
739static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
740 struct mem_cgroup_tree_per_zone *mctz)
741{
742 if (!mz->on_tree)
743 return;
744 rb_erase(&mz->tree_node, &mctz->rb_root);
745 mz->on_tree = false;
746}
747
748static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
749 struct mem_cgroup_tree_per_zone *mctz)
750{
751 unsigned long flags;
752
753 spin_lock_irqsave(&mctz->lock, flags);
754 __mem_cgroup_remove_exceeded(mz, mctz);
755 spin_unlock_irqrestore(&mctz->lock, flags);
756}
757
758
759static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
760{
761 unsigned long long excess;
762 struct mem_cgroup_per_zone *mz;
763 struct mem_cgroup_tree_per_zone *mctz;
764
765 mctz = soft_limit_tree_from_page(page);
766
767
768
769
770 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
771 mz = mem_cgroup_page_zoneinfo(memcg, page);
772 excess = res_counter_soft_limit_excess(&memcg->res);
773
774
775
776
777 if (excess || mz->on_tree) {
778 unsigned long flags;
779
780 spin_lock_irqsave(&mctz->lock, flags);
781
782 if (mz->on_tree)
783 __mem_cgroup_remove_exceeded(mz, mctz);
784
785
786
787
788 __mem_cgroup_insert_exceeded(mz, mctz, excess);
789 spin_unlock_irqrestore(&mctz->lock, flags);
790 }
791 }
792}
793
794static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
795{
796 struct mem_cgroup_tree_per_zone *mctz;
797 struct mem_cgroup_per_zone *mz;
798 int nid, zid;
799
800 for_each_node(nid) {
801 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
802 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
803 mctz = soft_limit_tree_node_zone(nid, zid);
804 mem_cgroup_remove_exceeded(mz, mctz);
805 }
806 }
807}
808
809static struct mem_cgroup_per_zone *
810__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
811{
812 struct rb_node *rightmost = NULL;
813 struct mem_cgroup_per_zone *mz;
814
815retry:
816 mz = NULL;
817 rightmost = rb_last(&mctz->rb_root);
818 if (!rightmost)
819 goto done;
820
821 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
822
823
824
825
826
827 __mem_cgroup_remove_exceeded(mz, mctz);
828 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
829 !css_tryget_online(&mz->memcg->css))
830 goto retry;
831done:
832 return mz;
833}
834
835static struct mem_cgroup_per_zone *
836mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
837{
838 struct mem_cgroup_per_zone *mz;
839
840 spin_lock_irq(&mctz->lock);
841 mz = __mem_cgroup_largest_soft_limit_node(mctz);
842 spin_unlock_irq(&mctz->lock);
843 return mz;
844}
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
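/*
 * Sum a per-cpu statistics counter over all online cpus, adding whatever
 * the CPU hotplug callback folded into nocpu_base for dead cpus.
 */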
865static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
866 enum mem_cgroup_stat_index idx)
867{
868 long val = 0;
869 int cpu;
870
871 get_online_cpus();
872 for_each_online_cpu(cpu)
873 val += per_cpu(memcg->stat->count[idx], cpu);
874#ifdef CONFIG_HOTPLUG_CPU
875 spin_lock(&memcg->pcp_counter_lock);
876 val += memcg->nocpu_base.count[idx];
877 spin_unlock(&memcg->pcp_counter_lock);
878#endif
879 put_online_cpus();
880 return val;
881}
882
883static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
884 enum mem_cgroup_events_index idx)
885{
886 unsigned long val = 0;
887 int cpu;
888
889 get_online_cpus();
890 for_each_online_cpu(cpu)
891 val += per_cpu(memcg->stat->events[idx], cpu);
892#ifdef CONFIG_HOTPLUG_CPU
893 spin_lock(&memcg->pcp_counter_lock);
894 val += memcg->nocpu_base.events[idx];
895 spin_unlock(&memcg->pcp_counter_lock);
896#endif
897 put_online_cpus();
898 return val;
899}
900
901static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
902 struct page *page,
903 int nr_pages)
904{
905
906
907
908
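	/*
	 * Anonymous pages are accounted as RSS and everything else as page
	 * cache; transparent huge pages additionally bump RSS_HUGE.
	 */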
909 if (PageAnon(page))
910 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
911 nr_pages);
912 else
913 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
914 nr_pages);
915
916 if (PageTransHuge(page))
917 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
918 nr_pages);
919
920
921 if (nr_pages > 0)
922 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
923 else {
924 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
925 nr_pages = -nr_pages;
926 }
927
928 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
929}
930
931unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
932{
933 struct mem_cgroup_per_zone *mz;
934
935 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
936 return mz->lru_size[lru];
937}
938
939static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
940 int nid,
941 unsigned int lru_mask)
942{
943 unsigned long nr = 0;
944 int zid;
945
946 VM_BUG_ON((unsigned)nid >= nr_node_ids);
947
948 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
949 struct mem_cgroup_per_zone *mz;
950 enum lru_list lru;
951
952 for_each_lru(lru) {
953 if (!(BIT(lru) & lru_mask))
954 continue;
955 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
956 nr += mz->lru_size[lru];
957 }
958 }
959 return nr;
960}
961
962static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
963 unsigned int lru_mask)
964{
965 unsigned long nr = 0;
966 int nid;
967
968 for_each_node_state(nid, N_MEMORY)
969 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
970 return nr;
971}
972
973static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
974 enum mem_cgroup_events_target target)
975{
976 unsigned long val, next;
977
978 val = __this_cpu_read(memcg->stat->nr_page_events);
979 next = __this_cpu_read(memcg->stat->targets[target]);
980
981 if ((long)next - (long)val < 0) {
982 switch (target) {
983 case MEM_CGROUP_TARGET_THRESH:
984 next = val + THRESHOLDS_EVENTS_TARGET;
985 break;
986 case MEM_CGROUP_TARGET_SOFTLIMIT:
987 next = val + SOFTLIMIT_EVENTS_TARGET;
988 break;
989 case MEM_CGROUP_TARGET_NUMAINFO:
990 next = val + NUMAINFO_EVENTS_TARGET;
991 break;
992 default:
993 break;
994 }
995 __this_cpu_write(memcg->stat->targets[target], next);
996 return true;
997 }
998 return false;
999}
1000
1001
1002
1003
1004
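/*
 * Check whether the rate-limited event targets (thresholds, soft limit
 * tree, NUMA info) have been crossed and run the corresponding updates.
 */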
1005static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1006{
1007
1008 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1009 MEM_CGROUP_TARGET_THRESH))) {
1010 bool do_softlimit;
1011 bool do_numainfo __maybe_unused;
1012
1013 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1014 MEM_CGROUP_TARGET_SOFTLIMIT);
1015#if MAX_NUMNODES > 1
1016 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1017 MEM_CGROUP_TARGET_NUMAINFO);
1018#endif
1019 mem_cgroup_threshold(memcg);
1020 if (unlikely(do_softlimit))
1021 mem_cgroup_update_tree(memcg, page);
1022#if MAX_NUMNODES > 1
1023 if (unlikely(do_numainfo))
1024 atomic_inc(&memcg->numainfo_events);
1025#endif
1026 }
1027}
1028
1029struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1030{
1031
1032
1033
1034
1035
1036 if (unlikely(!p))
1037 return NULL;
1038
1039 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1040}
1041
1042static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1043{
1044 struct mem_cgroup *memcg = NULL;
1045
1046 rcu_read_lock();
1047 do {
1048
1049
1050
1051
1052
1053 if (unlikely(!mm))
1054 memcg = root_mem_cgroup;
1055 else {
1056 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1057 if (unlikely(!memcg))
1058 memcg = root_mem_cgroup;
1059 }
1060 } while (!css_tryget_online(&memcg->css));
1061 rcu_read_unlock();
1062 return memcg;
1063}
1064
1065
1066
1067
1068
1069
1070
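/*
 * Return the next memcg in a pre-order walk below @root after @last_visited,
 * skipping groups that are not yet fully initialized or already offline.
 * A css reference is taken on the returned group unless it is @root itself.
 */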
1071static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1072 struct mem_cgroup *last_visited)
1073{
1074 struct cgroup_subsys_state *prev_css, *next_css;
1075
1076 prev_css = last_visited ? &last_visited->css : NULL;
1077skip_node:
1078 next_css = css_next_descendant_pre(prev_css, &root->css);
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095 if (next_css) {
1096 struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
1097
1098 if (next_css == &root->css)
1099 return memcg;
1100
1101 if (css_tryget_online(next_css)) {
1102
1103
1104
1105
1106
1107 if (smp_load_acquire(&memcg->initialized))
1108 return memcg;
1109 css_put(next_css);
1110 }
1111
1112 prev_css = next_css;
1113 goto skip_node;
1114 }
1115
1116 return NULL;
1117}
1118
1119static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1120{
1121
1122
1123
1124
1125
1126 atomic_inc(&root->dead_count);
1127}
1128
1129static struct mem_cgroup *
1130mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1131 struct mem_cgroup *root,
1132 int *sequence)
1133{
1134 struct mem_cgroup *position = NULL;
1135
1136
1137
1138
1139
1140
1141
1142
1143 *sequence = atomic_read(&root->dead_count);
1144 if (iter->last_dead_count == *sequence) {
1145 smp_rmb();
1146 position = iter->last_visited;
1147
1148
1149
1150
1151
1152
1153
1154 if (position && position != root &&
1155 !css_tryget_online(&position->css))
1156 position = NULL;
1157 }
1158 return position;
1159}
1160
1161static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1162 struct mem_cgroup *last_visited,
1163 struct mem_cgroup *new_position,
1164 struct mem_cgroup *root,
1165 int sequence)
1166{
1167
1168 if (last_visited && last_visited != root)
1169 css_put(&last_visited->css);
1170
1171
1172
1173
1174
1175
1176 iter->last_visited = new_position;
1177 smp_wmb();
1178 iter->last_dead_count = sequence;
1179}
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
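/*
 * mem_cgroup_iter - iterate over the memory cgroup hierarchy
 * @root: hierarchy root (NULL means root_mem_cgroup)
 * @prev: the cgroup returned by the previous call, or NULL on the first one
 * @reclaim: cookie for shared reclaim walks, NULL for a full round-trip
 *
 * Returns the groups of the hierarchy in pre-order and NULL after a
 * complete round-trip.  The reference on @prev is dropped here, so callers
 * that stop iterating early must use mem_cgroup_iter_break() instead.
 */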
1198struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1199 struct mem_cgroup *prev,
1200 struct mem_cgroup_reclaim_cookie *reclaim)
1201{
1202 struct mem_cgroup *memcg = NULL;
1203 struct mem_cgroup *last_visited = NULL;
1204
1205 if (mem_cgroup_disabled())
1206 return NULL;
1207
1208 if (!root)
1209 root = root_mem_cgroup;
1210
1211 if (prev && !reclaim)
1212 last_visited = prev;
1213
1214 if (!root->use_hierarchy && root != root_mem_cgroup) {
1215 if (prev)
1216 goto out_css_put;
1217 return root;
1218 }
1219
1220 rcu_read_lock();
1221 while (!memcg) {
1222 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1223 int uninitialized_var(seq);
1224
1225 if (reclaim) {
1226 struct mem_cgroup_per_zone *mz;
1227
1228 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1229 iter = &mz->reclaim_iter[reclaim->priority];
1230 if (prev && reclaim->generation != iter->generation) {
1231 iter->last_visited = NULL;
1232 goto out_unlock;
1233 }
1234
1235 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1236 }
1237
1238 memcg = __mem_cgroup_iter_next(root, last_visited);
1239
1240 if (reclaim) {
1241 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1242 seq);
1243
1244 if (!memcg)
1245 iter->generation++;
1246 else if (!prev && memcg)
1247 reclaim->generation = iter->generation;
1248 }
1249
1250 if (prev && !memcg)
1251 goto out_unlock;
1252 }
1253out_unlock:
1254 rcu_read_unlock();
1255out_css_put:
1256 if (prev && prev != root)
1257 css_put(&prev->css);
1258
1259 return memcg;
1260}
1261
1262
1263
1264
1265
1266
1267void mem_cgroup_iter_break(struct mem_cgroup *root,
1268 struct mem_cgroup *prev)
1269{
1270 if (!root)
1271 root = root_mem_cgroup;
1272 if (prev && prev != root)
1273 css_put(&prev->css);
1274}
1275
1276
1277
1278
1279
1280
1281#define for_each_mem_cgroup_tree(iter, root) \
1282 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1283 iter != NULL; \
1284 iter = mem_cgroup_iter(root, iter, NULL))
1285
1286#define for_each_mem_cgroup(iter) \
1287 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1288 iter != NULL; \
1289 iter = mem_cgroup_iter(NULL, iter, NULL))
1290
1291void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1292{
1293 struct mem_cgroup *memcg;
1294
1295 rcu_read_lock();
1296 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1297 if (unlikely(!memcg))
1298 goto out;
1299
1300 switch (idx) {
1301 case PGFAULT:
1302 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1303 break;
1304 case PGMAJFAULT:
1305 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1306 break;
1307 default:
1308 BUG();
1309 }
1310out:
1311 rcu_read_unlock();
1312}
1313EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1325 struct mem_cgroup *memcg)
1326{
1327 struct mem_cgroup_per_zone *mz;
1328 struct lruvec *lruvec;
1329
1330 if (mem_cgroup_disabled()) {
1331 lruvec = &zone->lruvec;
1332 goto out;
1333 }
1334
1335 mz = mem_cgroup_zone_zoneinfo(memcg, zone);
1336 lruvec = &mz->lruvec;
1337out:
1338
1339
1340
1341
1342
1343 if (unlikely(lruvec->zone != zone))
1344 lruvec->zone = zone;
1345 return lruvec;
1346}
1347
1348
1349
1350
1351
1352
1353struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1354{
1355 struct mem_cgroup_per_zone *mz;
1356 struct mem_cgroup *memcg;
1357 struct page_cgroup *pc;
1358 struct lruvec *lruvec;
1359
1360 if (mem_cgroup_disabled()) {
1361 lruvec = &zone->lruvec;
1362 goto out;
1363 }
1364
1365 pc = lookup_page_cgroup(page);
1366 memcg = pc->mem_cgroup;
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1378 pc->mem_cgroup = memcg = root_mem_cgroup;
1379
1380 mz = mem_cgroup_page_zoneinfo(memcg, page);
1381 lruvec = &mz->lruvec;
1382out:
1383
1384
1385
1386
1387
1388 if (unlikely(lruvec->zone != zone))
1389 lruvec->zone = zone;
1390 return lruvec;
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1403 int nr_pages)
1404{
1405 struct mem_cgroup_per_zone *mz;
1406 unsigned long *lru_size;
1407
1408 if (mem_cgroup_disabled())
1409 return;
1410
1411 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1412 lru_size = mz->lru_size + lru;
1413 *lru_size += nr_pages;
1414 VM_BUG_ON((long)(*lru_size) < 0);
1415}
1416
1417
1418
1419
1420
1421bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1422 struct mem_cgroup *memcg)
1423{
1424 if (root_memcg == memcg)
1425 return true;
1426 if (!root_memcg->use_hierarchy || !memcg)
1427 return false;
1428 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1429}
1430
1431static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1432 struct mem_cgroup *memcg)
1433{
1434 bool ret;
1435
1436 rcu_read_lock();
1437 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1438 rcu_read_unlock();
1439 return ret;
1440}
1441
1442bool task_in_mem_cgroup(struct task_struct *task,
1443 const struct mem_cgroup *memcg)
1444{
1445 struct mem_cgroup *curr = NULL;
1446 struct task_struct *p;
1447 bool ret;
1448
1449 p = find_lock_task_mm(task);
1450 if (p) {
1451 curr = get_mem_cgroup_from_mm(p->mm);
1452 task_unlock(p);
1453 } else {
1454
1455
1456
1457
1458
1459 rcu_read_lock();
1460 curr = mem_cgroup_from_task(task);
1461 if (curr)
1462 css_get(&curr->css);
1463 rcu_read_unlock();
1464 }
1465
1466
1467
1468
1469
1470
1471 ret = mem_cgroup_same_or_subtree(memcg, curr);
1472 css_put(&curr->css);
1473 return ret;
1474}
1475
1476int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1477{
1478 unsigned long inactive_ratio;
1479 unsigned long inactive;
1480 unsigned long active;
1481 unsigned long gb;
1482
1483 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1484 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1485
1486 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1487 if (gb)
1488 inactive_ratio = int_sqrt(10 * gb);
1489 else
1490 inactive_ratio = 1;
1491
1492 return inactive * inactive_ratio < active;
1493}
1494
1495#define mem_cgroup_from_res_counter(counter, member) \
1496 container_of(counter, struct mem_cgroup, member)
1497
1498
1499
1500
1501
1502
1503
1504
1505static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1506{
1507 unsigned long long margin;
1508
1509 margin = res_counter_margin(&memcg->res);
1510 if (do_swap_account)
1511 margin = min(margin, res_counter_margin(&memcg->memsw));
1512 return margin >> PAGE_SHIFT;
1513}
1514
1515int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1516{
1517
1518 if (mem_cgroup_disabled() || !memcg->css.parent)
1519 return vm_swappiness;
1520
1521 return memcg->swappiness;
1522}
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1540{
1541 atomic_inc(&memcg->moving_account);
1542 synchronize_rcu();
1543}
1544
1545static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1546{
1547
1548
1549
1550
1551 if (memcg)
1552 atomic_dec(&memcg->moving_account);
1553}
1554
1555
1556
1557
1558
1559
1560
1561
1562static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1563{
1564 struct mem_cgroup *from;
1565 struct mem_cgroup *to;
1566 bool ret = false;
1567
1568
1569
1570
1571 spin_lock(&mc.lock);
1572 from = mc.from;
1573 to = mc.to;
1574 if (!from)
1575 goto unlock;
1576
1577 ret = mem_cgroup_same_or_subtree(memcg, from)
1578 || mem_cgroup_same_or_subtree(memcg, to);
1579unlock:
1580 spin_unlock(&mc.lock);
1581 return ret;
1582}
1583
1584static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1585{
1586 if (mc.moving_task && current != mc.moving_task) {
1587 if (mem_cgroup_under_move(memcg)) {
1588 DEFINE_WAIT(wait);
1589 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1590
1591 if (mc.moving_task)
1592 schedule();
1593 finish_wait(&mc.waitq, &wait);
1594 return true;
1595 }
1596 }
1597 return false;
1598}
1599
1600
1601
1602
1603
1604
1605static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1606 unsigned long *flags)
1607{
1608 spin_lock_irqsave(&memcg->move_lock, *flags);
1609}
1610
1611static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1612 unsigned long *flags)
1613{
1614 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1615}
1616
1617#define K(x) ((x) << (PAGE_SHIFT-10))
1618
1619
1620
1621
1622
1623
1624
1625
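/*
 * Dump the memory-cgroup state relevant to an OOM kill: the victim's and
 * the limit-hitting cgroup's paths, the res/memsw/kmem counters and the
 * per-cgroup statistics of every group in the sub-hierarchy.
 */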
1626void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1627{
1628
1629 static DEFINE_MUTEX(oom_info_lock);
1630 struct mem_cgroup *iter;
1631 unsigned int i;
1632
1633 if (!p)
1634 return;
1635
1636 mutex_lock(&oom_info_lock);
1637 rcu_read_lock();
1638
1639 pr_info("Task in ");
1640 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1641 pr_info(" killed as a result of limit of ");
1642 pr_cont_cgroup_path(memcg->css.cgroup);
1643 pr_info("\n");
1644
1645 rcu_read_unlock();
1646
1647 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1648 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1649 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1650 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1651 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1652 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1653 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1654 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1655 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1656 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1657 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1658 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1659
1660 for_each_mem_cgroup_tree(iter, memcg) {
1661 pr_info("Memory cgroup stats for ");
1662 pr_cont_cgroup_path(iter->css.cgroup);
1663 pr_cont(":");
1664
1665 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1666 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1667 continue;
1668 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1669 K(mem_cgroup_read_stat(iter, i)));
1670 }
1671
1672 for (i = 0; i < NR_LRU_LISTS; i++)
1673 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1674 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1675
1676 pr_cont("\n");
1677 }
1678 mutex_unlock(&oom_info_lock);
1679}
1680
1681
1682
1683
1684
1685static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1686{
1687 int num = 0;
1688 struct mem_cgroup *iter;
1689
1690 for_each_mem_cgroup_tree(iter, memcg)
1691 num++;
1692 return num;
1693}
1694
1695
1696
1697
1698static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1699{
1700 u64 limit;
1701
1702 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1703
1704
1705
1706
1707 if (mem_cgroup_swappiness(memcg)) {
1708 u64 memsw;
1709
1710 limit += total_swap_pages << PAGE_SHIFT;
1711 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1712
1713
1714
1715
1716
1717 limit = min(limit, memsw);
1718 }
1719
1720 return limit;
1721}
1722
1723static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1724 int order)
1725{
1726 struct mem_cgroup *iter;
1727 unsigned long chosen_points = 0;
1728 unsigned long totalpages;
1729 unsigned int points = 0;
1730 struct task_struct *chosen = NULL;
1731
1732
1733
1734
1735
1736
1737 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1738 set_thread_flag(TIF_MEMDIE);
1739 return;
1740 }
1741
1742 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1743 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1744 for_each_mem_cgroup_tree(iter, memcg) {
1745 struct css_task_iter it;
1746 struct task_struct *task;
1747
1748 css_task_iter_start(&iter->css, &it);
1749 while ((task = css_task_iter_next(&it))) {
1750 switch (oom_scan_process_thread(task, totalpages, NULL,
1751 false)) {
1752 case OOM_SCAN_SELECT:
1753 if (chosen)
1754 put_task_struct(chosen);
1755 chosen = task;
1756 chosen_points = ULONG_MAX;
1757 get_task_struct(chosen);
			/* fall through */
1759 case OOM_SCAN_CONTINUE:
1760 continue;
1761 case OOM_SCAN_ABORT:
1762 css_task_iter_end(&it);
1763 mem_cgroup_iter_break(memcg, iter);
1764 if (chosen)
1765 put_task_struct(chosen);
1766 return;
1767 case OOM_SCAN_OK:
1768 break;
1769 };
1770 points = oom_badness(task, memcg, NULL, totalpages);
1771 if (!points || points < chosen_points)
1772 continue;
1773
1774 if (points == chosen_points &&
1775 thread_group_leader(chosen))
1776 continue;
1777
1778 if (chosen)
1779 put_task_struct(chosen);
1780 chosen = task;
1781 chosen_points = points;
1782 get_task_struct(chosen);
1783 }
1784 css_task_iter_end(&it);
1785 }
1786
1787 if (!chosen)
1788 return;
1789 points = chosen_points * 1000 / totalpages;
1790 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1791 NULL, "Memory cgroup out of memory");
1792}
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1805 int nid, bool noswap)
1806{
1807 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1808 return true;
1809 if (noswap || !total_swap_pages)
1810 return false;
1811 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1812 return true;
1813 return false;
1814
1815}
1816#if MAX_NUMNODES > 1
1817
1818
1819
1820
1821
1822
1823
1824static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1825{
1826 int nid;
1827
1828
1829
1830
1831 if (!atomic_read(&memcg->numainfo_events))
1832 return;
1833 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1834 return;
1835
1836
1837 memcg->scan_nodes = node_states[N_MEMORY];
1838
1839 for_each_node_mask(nid, node_states[N_MEMORY]) {
1840
1841 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1842 node_clear(nid, memcg->scan_nodes);
1843 }
1844
1845 atomic_set(&memcg->numainfo_events, 0);
1846 atomic_set(&memcg->numainfo_updating, 0);
1847}
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1862{
1863 int node;
1864
1865 mem_cgroup_may_update_nodemask(memcg);
1866 node = memcg->last_scanned_node;
1867
1868 node = next_node(node, memcg->scan_nodes);
1869 if (node == MAX_NUMNODES)
1870 node = first_node(memcg->scan_nodes);
1871
1872
1873
1874
1875
1876
1877 if (unlikely(node == MAX_NUMNODES))
1878 node = numa_node_id();
1879
1880 memcg->last_scanned_node = node;
1881 return node;
1882}
1883
1884
1885
1886
1887
1888
1889
1890static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1891{
1892 int nid;
1893
1894
1895
1896
1897
1898 if (!nodes_empty(memcg->scan_nodes)) {
1899 for (nid = first_node(memcg->scan_nodes);
1900 nid < MAX_NUMNODES;
1901 nid = next_node(nid, memcg->scan_nodes)) {
1902
1903 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1904 return true;
1905 }
1906 }
1907
1908
1909
1910 for_each_node_state(nid, N_MEMORY) {
1911 if (node_isset(nid, memcg->scan_nodes))
1912 continue;
1913 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1914 return true;
1915 }
1916 return false;
1917}
1918
1919#else
1920int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1921{
1922 return 0;
1923}
1924
1925static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1926{
1927 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1928}
1929#endif
1930
1931static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1932 struct zone *zone,
1933 gfp_t gfp_mask,
1934 unsigned long *total_scanned)
1935{
1936 struct mem_cgroup *victim = NULL;
1937 int total = 0;
1938 int loop = 0;
1939 unsigned long excess;
1940 unsigned long nr_scanned;
1941 struct mem_cgroup_reclaim_cookie reclaim = {
1942 .zone = zone,
1943 .priority = 0,
1944 };
1945
1946 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1947
1948 while (1) {
1949 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1950 if (!victim) {
1951 loop++;
1952 if (loop >= 2) {
1953
1954
1955
1956
1957
1958 if (!total)
1959 break;
1960
1961
1962
1963
1964
1965
1966 if (total >= (excess >> 2) ||
1967 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1968 break;
1969 }
1970 continue;
1971 }
1972 if (!mem_cgroup_reclaimable(victim, false))
1973 continue;
1974 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1975 zone, &nr_scanned);
1976 *total_scanned += nr_scanned;
1977 if (!res_counter_soft_limit_excess(&root_memcg->res))
1978 break;
1979 }
1980 mem_cgroup_iter_break(root_memcg, victim);
1981 return total;
1982}
1983
1984#ifdef CONFIG_LOCKDEP
1985static struct lockdep_map memcg_oom_lock_dep_map = {
1986 .name = "memcg_oom_lock",
1987};
1988#endif
1989
1990static DEFINE_SPINLOCK(memcg_oom_lock);
1991
1992
1993
1994
1995
1996static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1997{
1998 struct mem_cgroup *iter, *failed = NULL;
1999
2000 spin_lock(&memcg_oom_lock);
2001
2002 for_each_mem_cgroup_tree(iter, memcg) {
2003 if (iter->oom_lock) {
2004
2005
2006
2007
2008 failed = iter;
2009 mem_cgroup_iter_break(memcg, iter);
2010 break;
2011 } else
2012 iter->oom_lock = true;
2013 }
2014
2015 if (failed) {
2016
2017
2018
2019
2020 for_each_mem_cgroup_tree(iter, memcg) {
2021 if (iter == failed) {
2022 mem_cgroup_iter_break(memcg, iter);
2023 break;
2024 }
2025 iter->oom_lock = false;
2026 }
2027 } else
2028 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2029
2030 spin_unlock(&memcg_oom_lock);
2031
2032 return !failed;
2033}
2034
2035static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2036{
2037 struct mem_cgroup *iter;
2038
2039 spin_lock(&memcg_oom_lock);
2040 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
2041 for_each_mem_cgroup_tree(iter, memcg)
2042 iter->oom_lock = false;
2043 spin_unlock(&memcg_oom_lock);
2044}
2045
2046static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2047{
2048 struct mem_cgroup *iter;
2049
2050 for_each_mem_cgroup_tree(iter, memcg)
2051 atomic_inc(&iter->under_oom);
2052}
2053
2054static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2055{
2056 struct mem_cgroup *iter;
2057
2058
2059
2060
2061
2062
2063 for_each_mem_cgroup_tree(iter, memcg)
2064 atomic_add_unless(&iter->under_oom, -1, 0);
2065}
2066
2067static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2068
2069struct oom_wait_info {
2070 struct mem_cgroup *memcg;
2071 wait_queue_t wait;
2072};
2073
2074static int memcg_oom_wake_function(wait_queue_t *wait,
2075 unsigned mode, int sync, void *arg)
2076{
2077 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2078 struct mem_cgroup *oom_wait_memcg;
2079 struct oom_wait_info *oom_wait_info;
2080
2081 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2082 oom_wait_memcg = oom_wait_info->memcg;
2083
2084
2085
2086
2087
2088 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2089 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2090 return 0;
2091 return autoremove_wake_function(wait, mode, sync, arg);
2092}
2093
2094static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2095{
2096 atomic_inc(&memcg->oom_wakeups);
2097
2098 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2099}
2100
2101static void memcg_oom_recover(struct mem_cgroup *memcg)
2102{
2103 if (memcg && atomic_read(&memcg->under_oom))
2104 memcg_wakeup_oom(memcg);
2105}
2106
2107static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2108{
2109 if (!current->memcg_oom.may_oom)
2110 return;
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125 css_get(&memcg->css);
2126 current->memcg_oom.memcg = memcg;
2127 current->memcg_oom.gfp_mask = mask;
2128 current->memcg_oom.order = order;
2129}
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
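/*
 * Complete the OOM handling started by mem_cgroup_oom(): with the
 * hierarchical OOM lock held the OOM killer is invoked directly, otherwise
 * the task sleeps on the OOM waitqueue until an uncharge wakes it up.
 * Returns true if an OOM context was set up and has now been cleaned up;
 * when @handle is false only the per-task OOM state is cleared.
 */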
2148bool mem_cgroup_oom_synchronize(bool handle)
2149{
2150 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2151 struct oom_wait_info owait;
2152 bool locked;
2153
2154
2155 if (!memcg)
2156 return false;
2157
2158 if (!handle)
2159 goto cleanup;
2160
2161 owait.memcg = memcg;
2162 owait.wait.flags = 0;
2163 owait.wait.func = memcg_oom_wake_function;
2164 owait.wait.private = current;
2165 INIT_LIST_HEAD(&owait.wait.task_list);
2166
2167 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2168 mem_cgroup_mark_under_oom(memcg);
2169
2170 locked = mem_cgroup_oom_trylock(memcg);
2171
2172 if (locked)
2173 mem_cgroup_oom_notify(memcg);
2174
2175 if (locked && !memcg->oom_kill_disable) {
2176 mem_cgroup_unmark_under_oom(memcg);
2177 finish_wait(&memcg_oom_waitq, &owait.wait);
2178 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2179 current->memcg_oom.order);
2180 } else {
2181 schedule();
2182 mem_cgroup_unmark_under_oom(memcg);
2183 finish_wait(&memcg_oom_waitq, &owait.wait);
2184 }
2185
2186 if (locked) {
2187 mem_cgroup_oom_unlock(memcg);
2188
2189
2190
2191
2192
2193 memcg_oom_recover(memcg);
2194 }
2195cleanup:
2196 current->memcg_oom.memcg = NULL;
2197 css_put(&memcg->css);
2198 return true;
2199}
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2227 bool *locked,
2228 unsigned long *flags)
2229{
2230 struct mem_cgroup *memcg;
2231 struct page_cgroup *pc;
2232
2233 rcu_read_lock();
2234
2235 if (mem_cgroup_disabled())
2236 return NULL;
2237
2238 pc = lookup_page_cgroup(page);
2239again:
2240 memcg = pc->mem_cgroup;
2241 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2242 return NULL;
2243
2244 *locked = false;
2245 if (atomic_read(&memcg->moving_account) <= 0)
2246 return memcg;
2247
2248 move_lock_mem_cgroup(memcg, flags);
2249 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2250 move_unlock_mem_cgroup(memcg, flags);
2251 goto again;
2252 }
2253 *locked = true;
2254
2255 return memcg;
2256}
2257
2258
2259
2260
2261
2262
2263
2264void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
2265 unsigned long flags)
2266{
2267 if (memcg && locked)
2268 move_unlock_mem_cgroup(memcg, &flags);
2269
2270 rcu_read_unlock();
2271}
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2282 enum mem_cgroup_stat_index idx, int val)
2283{
2284 VM_BUG_ON(!rcu_read_lock_held());
2285
2286 if (memcg)
2287 this_cpu_add(memcg->stat->count[idx], val);
2288}
2289
2290
2291
2292
2293
2294#define CHARGE_BATCH 32U
2295struct memcg_stock_pcp {
2296 struct mem_cgroup *cached;
2297 unsigned int nr_pages;
2298 struct work_struct work;
2299 unsigned long flags;
2300#define FLUSHING_CACHED_CHARGE 0
2301};
2302static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2303static DEFINE_MUTEX(percpu_charge_mutex);
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
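/*
 * Try to satisfy a charge of @nr_pages from the per-cpu charge cache.
 * Returns true on success, false if the cache belongs to another memcg
 * or does not hold enough pre-charged pages.
 */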
2316static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2317{
2318 struct memcg_stock_pcp *stock;
2319 bool ret = true;
2320
2321 if (nr_pages > CHARGE_BATCH)
2322 return false;
2323
2324 stock = &get_cpu_var(memcg_stock);
2325 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2326 stock->nr_pages -= nr_pages;
2327 else
2328 ret = false;
2329 put_cpu_var(memcg_stock);
2330 return ret;
2331}
2332
2333
2334
2335
2336static void drain_stock(struct memcg_stock_pcp *stock)
2337{
2338 struct mem_cgroup *old = stock->cached;
2339
2340 if (stock->nr_pages) {
2341 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2342
2343 res_counter_uncharge(&old->res, bytes);
2344 if (do_swap_account)
2345 res_counter_uncharge(&old->memsw, bytes);
2346 stock->nr_pages = 0;
2347 }
2348 stock->cached = NULL;
2349}
2350
2351
2352
2353
2354
2355static void drain_local_stock(struct work_struct *dummy)
2356{
2357 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2358 drain_stock(stock);
2359 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2360}
2361
2362static void __init memcg_stock_init(void)
2363{
2364 int cpu;
2365
2366 for_each_possible_cpu(cpu) {
2367 struct memcg_stock_pcp *stock =
2368 &per_cpu(memcg_stock, cpu);
2369 INIT_WORK(&stock->work, drain_local_stock);
2370 }
2371}
2372
2373
2374
2375
2376
2377static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2378{
2379 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2380
2381 if (stock->cached != memcg) {
2382 drain_stock(stock);
2383 stock->cached = memcg;
2384 }
2385 stock->nr_pages += nr_pages;
2386 put_cpu_var(memcg_stock);
2387}
2388
2389
2390
2391
2392
2393
2394static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2395{
2396 int cpu, curcpu;
2397
2398
2399 get_online_cpus();
2400 curcpu = get_cpu();
2401 for_each_online_cpu(cpu) {
2402 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2403 struct mem_cgroup *memcg;
2404
2405 memcg = stock->cached;
2406 if (!memcg || !stock->nr_pages)
2407 continue;
2408 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2409 continue;
2410 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2411 if (cpu == curcpu)
2412 drain_local_stock(&stock->work);
2413 else
2414 schedule_work_on(cpu, &stock->work);
2415 }
2416 }
2417 put_cpu();
2418
2419 if (!sync)
2420 goto out;
2421
2422 for_each_online_cpu(cpu) {
2423 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2424 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2425 flush_work(&stock->work);
2426 }
2427out:
2428 put_online_cpus();
2429}
2430
2431
2432
2433
2434
2435
2436
2437static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2438{
2439
2440
2441
2442 if (!mutex_trylock(&percpu_charge_mutex))
2443 return;
2444 drain_all_stock(root_memcg, false);
2445 mutex_unlock(&percpu_charge_mutex);
2446}
2447
2448
2449static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2450{
2451
2452 mutex_lock(&percpu_charge_mutex);
2453 drain_all_stock(root_memcg, true);
2454 mutex_unlock(&percpu_charge_mutex);
2455}
2456
2457
2458
2459
2460
2461static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2462{
2463 int i;
2464
2465 spin_lock(&memcg->pcp_counter_lock);
2466 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2467 long x = per_cpu(memcg->stat->count[i], cpu);
2468
2469 per_cpu(memcg->stat->count[i], cpu) = 0;
2470 memcg->nocpu_base.count[i] += x;
2471 }
2472 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2473 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2474
2475 per_cpu(memcg->stat->events[i], cpu) = 0;
2476 memcg->nocpu_base.events[i] += x;
2477 }
2478 spin_unlock(&memcg->pcp_counter_lock);
2479}
2480
2481static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2482 unsigned long action,
2483 void *hcpu)
2484{
2485 int cpu = (unsigned long)hcpu;
2486 struct memcg_stock_pcp *stock;
2487 struct mem_cgroup *iter;
2488
2489 if (action == CPU_ONLINE)
2490 return NOTIFY_OK;
2491
2492 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2493 return NOTIFY_OK;
2494
2495 for_each_mem_cgroup(iter)
2496 mem_cgroup_drain_pcp_counter(iter, cpu);
2497
2498 stock = &per_cpu(memcg_stock, cpu);
2499 drain_stock(stock);
2500 return NOTIFY_OK;
2501}
2502
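/*
 * Charge @nr_pages to @memcg, in CHARGE_BATCH-sized batches when possible,
 * falling back to direct reclaim, stock draining and finally the memcg OOM
 * killer.  Returns 0 on success, -ENOMEM on failure and -EINTR when the
 * charge is allowed to bypass the limit.
 */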
2503static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2504 unsigned int nr_pages)
2505{
2506 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2507 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2508 struct mem_cgroup *mem_over_limit;
2509 struct res_counter *fail_res;
2510 unsigned long nr_reclaimed;
2511 unsigned long long size;
2512 bool may_swap = true;
2513 bool drained = false;
2514 int ret = 0;
2515
2516 if (mem_cgroup_is_root(memcg))
2517 goto done;
2518retry:
2519 if (consume_stock(memcg, nr_pages))
2520 goto done;
2521
2522 size = batch * PAGE_SIZE;
2523 if (!do_swap_account ||
2524 !res_counter_charge(&memcg->memsw, size, &fail_res)) {
2525 if (!res_counter_charge(&memcg->res, size, &fail_res))
2526 goto done_restock;
2527 if (do_swap_account)
2528 res_counter_uncharge(&memcg->memsw, size);
2529 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2530 } else {
2531 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2532 may_swap = false;
2533 }
2534
2535 if (batch > nr_pages) {
2536 batch = nr_pages;
2537 goto retry;
2538 }
2539
2540
2541
2542
2543
2544
2545
2546 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2547 fatal_signal_pending(current) ||
2548 current->flags & PF_EXITING))
2549 goto bypass;
2550
2551 if (unlikely(task_in_memcg_oom(current)))
2552 goto nomem;
2553
2554 if (!(gfp_mask & __GFP_WAIT))
2555 goto nomem;
2556
2557 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2558 gfp_mask, may_swap);
2559
2560 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2561 goto retry;
2562
2563 if (!drained) {
2564 drain_all_stock_async(mem_over_limit);
2565 drained = true;
2566 goto retry;
2567 }
2568
2569 if (gfp_mask & __GFP_NORETRY)
2570 goto nomem;
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2581 goto retry;
2582
2583
2584
2585
2586 if (mem_cgroup_wait_acct_move(mem_over_limit))
2587 goto retry;
2588
2589 if (nr_retries--)
2590 goto retry;
2591
2592 if (gfp_mask & __GFP_NOFAIL)
2593 goto bypass;
2594
2595 if (fatal_signal_pending(current))
2596 goto bypass;
2597
2598 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2599nomem:
2600 if (!(gfp_mask & __GFP_NOFAIL))
2601 return -ENOMEM;
2602bypass:
2603 return -EINTR;
2604
2605done_restock:
2606 if (batch > nr_pages)
2607 refill_stock(memcg, batch - nr_pages);
2608done:
2609 return ret;
2610}
2611
2612static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2613{
2614 unsigned long bytes = nr_pages * PAGE_SIZE;
2615
2616 if (mem_cgroup_is_root(memcg))
2617 return;
2618
2619 res_counter_uncharge(&memcg->res, bytes);
2620 if (do_swap_account)
2621 res_counter_uncharge(&memcg->memsw, bytes);
2622}
2623
2624
2625
2626
2627
2628static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2629 unsigned int nr_pages)
2630{
2631 unsigned long bytes = nr_pages * PAGE_SIZE;
2632
2633 if (mem_cgroup_is_root(memcg))
2634 return;
2635
2636 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2637 if (do_swap_account)
2638 res_counter_uncharge_until(&memcg->memsw,
2639 memcg->memsw.parent, bytes);
2640}
2641
2642
2643
2644
2645
2646
2647
2648static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2649{
2650
2651 if (!id)
2652 return NULL;
2653 return mem_cgroup_from_id(id);
2654}
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2667{
2668 struct mem_cgroup *memcg = NULL;
2669 struct page_cgroup *pc;
2670 unsigned short id;
2671 swp_entry_t ent;
2672
2673 VM_BUG_ON_PAGE(!PageLocked(page), page);
2674
2675 pc = lookup_page_cgroup(page);
2676 if (PageCgroupUsed(pc)) {
2677 memcg = pc->mem_cgroup;
2678 if (memcg && !css_tryget_online(&memcg->css))
2679 memcg = NULL;
2680 } else if (PageSwapCache(page)) {
2681 ent.val = page_private(page);
2682 id = lookup_swap_cgroup_id(ent);
2683 rcu_read_lock();
2684 memcg = mem_cgroup_lookup(id);
2685 if (memcg && !css_tryget_online(&memcg->css))
2686 memcg = NULL;
2687 rcu_read_unlock();
2688 }
2689 return memcg;
2690}
2691
2692static void lock_page_lru(struct page *page, int *isolated)
2693{
2694 struct zone *zone = page_zone(page);
2695
2696 spin_lock_irq(&zone->lru_lock);
2697 if (PageLRU(page)) {
2698 struct lruvec *lruvec;
2699
2700 lruvec = mem_cgroup_page_lruvec(page, zone);
2701 ClearPageLRU(page);
2702 del_page_from_lru_list(page, lruvec, page_lru(page));
2703 *isolated = 1;
2704 } else
2705 *isolated = 0;
2706}
2707
2708static void unlock_page_lru(struct page *page, int isolated)
2709{
2710 struct zone *zone = page_zone(page);
2711
2712 if (isolated) {
2713 struct lruvec *lruvec;
2714
2715 lruvec = mem_cgroup_page_lruvec(page, zone);
2716 VM_BUG_ON_PAGE(PageLRU(page), page);
2717 SetPageLRU(page);
2718 add_page_to_lru_list(page, lruvec, page_lru(page));
2719 }
2720 spin_unlock_irq(&zone->lru_lock);
2721}
2722
2723static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2724 bool lrucare)
2725{
2726 struct page_cgroup *pc = lookup_page_cgroup(page);
2727 int isolated;
2728
2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739 if (lrucare)
2740 lock_page_lru(page, &isolated);
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756 pc->mem_cgroup = memcg;
2757 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2758
2759 if (lrucare)
2760 unlock_page_lru(page, isolated);
2761}
2762
2763static DEFINE_MUTEX(set_limit_mutex);
2764
2765#ifdef CONFIG_MEMCG_KMEM
2766
2767
2768
2769
2770static DEFINE_MUTEX(memcg_slab_mutex);
2771
2772static DEFINE_MUTEX(activate_kmem_mutex);
2773
2774
2775
2776
2777
2778static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2779{
2780 struct kmem_cache *cachep;
2781
2782 VM_BUG_ON(p->is_root_cache);
2783 cachep = p->root_cache;
2784 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2785}
2786
2787#ifdef CONFIG_SLABINFO
2788static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2789{
2790 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2791 struct memcg_cache_params *params;
2792
2793 if (!memcg_kmem_is_active(memcg))
2794 return -EIO;
2795
2796 print_slabinfo_header(m);
2797
2798 mutex_lock(&memcg_slab_mutex);
2799 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2800 cache_show(memcg_params_to_cache(params), m);
2801 mutex_unlock(&memcg_slab_mutex);
2802
2803 return 0;
2804}
2805#endif
2806
2807static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2808{
2809 struct res_counter *fail_res;
2810 int ret = 0;
2811
2812 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2813 if (ret)
2814 return ret;
2815
2816 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
2817 if (ret == -EINTR) {
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
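/*
 * try_charge() chose to bypass the charge to root (the task is dying
 * or is the OOM victim).  Mirror that decision on the memory and
 * memsw counters with a nofail charge so they stay consistent with
 * the kmem counter charged above, and report success.
 */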
2833 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2834 if (do_swap_account)
2835 res_counter_charge_nofail(&memcg->memsw, size,
2836 &fail_res);
2837 ret = 0;
2838 } else if (ret)
2839 res_counter_uncharge(&memcg->kmem, size);
2840
2841 return ret;
2842}
2843
2844static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2845{
2846 res_counter_uncharge(&memcg->res, size);
2847 if (do_swap_account)
2848 res_counter_uncharge(&memcg->memsw, size);
2849
2850
2851 if (res_counter_uncharge(&memcg->kmem, size))
2852 return;
2853
2854
2855
2856
2857
2858
2859
2860
2861
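/*
 * The kmem usage just dropped to zero.  If the memcg was marked dead
 * at offline time, drop the css reference that kmem_cgroup_css_offline()
 * took to keep the cgroup alive while kmem charges were outstanding.
 */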
2862 if (memcg_kmem_test_and_clear_dead(memcg))
2863 css_put(&memcg->css);
2864}
2865
2866
2867
2868
2869
2870
2871int memcg_cache_id(struct mem_cgroup *memcg)
2872{
2873 return memcg ? memcg->kmemcg_id : -1;
2874}
2875
2876static int memcg_alloc_cache_id(void)
2877{
2878 int id, size;
2879 int err;
2880
2881 id = ida_simple_get(&kmem_limited_groups,
2882 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2883 if (id < 0)
2884 return id;
2885
2886 if (id < memcg_limited_groups_array_size)
2887 return id;
2888
2889
2890
2891
2892
2893
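/*
 * The new id falls outside the currently allocated per-memcg cache
 * arrays; grow the arrays of all root caches.  Double the size,
 * clamped to [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE].
 */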
2894 size = 2 * (id + 1);
2895 if (size < MEMCG_CACHES_MIN_SIZE)
2896 size = MEMCG_CACHES_MIN_SIZE;
2897 else if (size > MEMCG_CACHES_MAX_SIZE)
2898 size = MEMCG_CACHES_MAX_SIZE;
2899
2900 mutex_lock(&memcg_slab_mutex);
2901 err = memcg_update_all_caches(size);
2902 mutex_unlock(&memcg_slab_mutex);
2903
2904 if (err) {
2905 ida_simple_remove(&kmem_limited_groups, id);
2906 return err;
2907 }
2908 return id;
2909}
2910
2911static void memcg_free_cache_id(int id)
2912{
2913 ida_simple_remove(&kmem_limited_groups, id);
2914}
2915
2916
2917
2918
2919
2920
2921void memcg_update_array_size(int num)
2922{
2923 memcg_limited_groups_array_size = num;
2924}
2925
2926static void memcg_register_cache(struct mem_cgroup *memcg,
2927 struct kmem_cache *root_cache)
2928{
2929 static char memcg_name_buf[NAME_MAX + 1]; /* protected by memcg_slab_mutex */
2930
2931 struct kmem_cache *cachep;
2932 int id;
2933
2934 lockdep_assert_held(&memcg_slab_mutex);
2935
2936 id = memcg_cache_id(memcg);
2937
2938
2939
2940
2941
2942
2943 if (cache_from_memcg_idx(root_cache, id))
2944 return;
2945
2946 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
2947 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2948
2949
2950
2951
2952
2953 if (!cachep)
2954 return;
2955
2956 css_get(&memcg->css);
2957 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2958
2959
2960
2961
2962
2963
2964 smp_wmb();
2965
2966 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
2967 root_cache->memcg_params->memcg_caches[id] = cachep;
2968}
2969
2970static void memcg_unregister_cache(struct kmem_cache *cachep)
2971{
2972 struct kmem_cache *root_cache;
2973 struct mem_cgroup *memcg;
2974 int id;
2975
2976 lockdep_assert_held(&memcg_slab_mutex);
2977
2978 BUG_ON(is_root_cache(cachep));
2979
2980 root_cache = cachep->memcg_params->root_cache;
2981 memcg = cachep->memcg_params->memcg;
2982 id = memcg_cache_id(memcg);
2983
2984 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
2985 root_cache->memcg_params->memcg_caches[id] = NULL;
2986
2987 list_del(&cachep->memcg_params->list);
2988
2989 kmem_cache_destroy(cachep);
2990
2991
2992 css_put(&memcg->css);
2993}
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
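/*
 * Creating a per-memcg cache itself allocates memory, which would
 * recurse into __memcg_kmem_get_cache() and try to create yet more
 * caches.  memcg_stop/resume_kmem_account() bump a per-task counter
 * that makes kmem accounting skip the current task while such
 * allocations are in flight; the counter nests.
 */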
3014static inline void memcg_stop_kmem_account(void)
3015{
3016 VM_BUG_ON(!current->mm);
3017 current->memcg_kmem_skip_account++;
3018}
3019
3020static inline void memcg_resume_kmem_account(void)
3021{
3022 VM_BUG_ON(!current->mm);
3023 current->memcg_kmem_skip_account--;
3024}
3025
3026int __memcg_cleanup_cache_params(struct kmem_cache *s)
3027{
3028 struct kmem_cache *c;
3029 int i, failed = 0;
3030
3031 mutex_lock(&memcg_slab_mutex);
3032 for_each_memcg_cache_index(i) {
3033 c = cache_from_memcg_idx(s, i);
3034 if (!c)
3035 continue;
3036
3037 memcg_unregister_cache(c);
3038
3039 if (cache_from_memcg_idx(s, i))
3040 failed++;
3041 }
3042 mutex_unlock(&memcg_slab_mutex);
3043 return failed;
3044}
3045
3046static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3047{
3048 struct kmem_cache *cachep;
3049 struct memcg_cache_params *params, *tmp;
3050
3051 if (!memcg_kmem_is_active(memcg))
3052 return;
3053
3054 mutex_lock(&memcg_slab_mutex);
3055 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
3056 cachep = memcg_params_to_cache(params);
3057 kmem_cache_shrink(cachep);
3058 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3059 memcg_unregister_cache(cachep);
3060 }
3061 mutex_unlock(&memcg_slab_mutex);
3062}
3063
3064struct memcg_register_cache_work {
3065 struct mem_cgroup *memcg;
3066 struct kmem_cache *cachep;
3067 struct work_struct work;
3068};
3069
3070static void memcg_register_cache_func(struct work_struct *w)
3071{
3072 struct memcg_register_cache_work *cw =
3073 container_of(w, struct memcg_register_cache_work, work);
3074 struct mem_cgroup *memcg = cw->memcg;
3075 struct kmem_cache *cachep = cw->cachep;
3076
3077 mutex_lock(&memcg_slab_mutex);
3078 memcg_register_cache(memcg, cachep);
3079 mutex_unlock(&memcg_slab_mutex);
3080
3081 css_put(&memcg->css);
3082 kfree(cw);
3083}
3084
3085
3086
3087
3088static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
3089 struct kmem_cache *cachep)
3090{
3091 struct memcg_register_cache_work *cw;
3092
3093 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
3094 if (cw == NULL) {
3095 css_put(&memcg->css);
3096 return;
3097 }
3098
3099 cw->memcg = memcg;
3100 cw->cachep = cachep;
3101
3102 INIT_WORK(&cw->work, memcg_register_cache_func);
3103 schedule_work(&cw->work);
3104}
3105
3106static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3107 struct kmem_cache *cachep)
3108{
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
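/*
 * Stop accounting for the current task while the registration work
 * item is allocated and queued, so those allocations cannot recurse
 * back into per-memcg cache creation.
 */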
3120 memcg_stop_kmem_account();
3121 __memcg_schedule_register_cache(memcg, cachep);
3122 memcg_resume_kmem_account();
3123}
3124
3125int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3126{
3127 int res;
3128
3129 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
3130 PAGE_SIZE << order);
3131 if (!res)
3132 atomic_add(1 << order, &cachep->memcg_params->nr_pages);
3133 return res;
3134}
3135
3136void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3137{
3138 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
3139 atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
3140}
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
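/*
 * Return the cache that should be used for an allocation from @cachep
 * on behalf of the current task.  If the task's memcg has kmem
 * accounting active and its copy of the cache already exists, that
 * copy is returned; otherwise creation of the per-memcg cache is
 * scheduled and the root cache is used for this allocation.
 */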
3155struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3156 gfp_t gfp)
3157{
3158 struct mem_cgroup *memcg;
3159 struct kmem_cache *memcg_cachep;
3160
3161 VM_BUG_ON(!cachep->memcg_params);
3162 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3163
3164 if (!current->mm || current->memcg_kmem_skip_account)
3165 return cachep;
3166
3167 rcu_read_lock();
3168 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3169
3170 if (!memcg_kmem_is_active(memcg))
3171 goto out;
3172
3173 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3174 if (likely(memcg_cachep)) {
3175 cachep = memcg_cachep;
3176 goto out;
3177 }
3178
3179
3180 if (!css_tryget_online(&memcg->css))
3181 goto out;
3182 rcu_read_unlock();
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196 memcg_schedule_register_cache(memcg, cachep);
3197 return cachep;
3198out:
3199 rcu_read_unlock();
3200 return cachep;
3201}
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
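/*
 * Charge a page allocation of 2^@order pages against the current
 * task's memcg.  On success *_memcg is set to the cgroup that must
 * later be passed to __memcg_kmem_commit_charge().  Returns true if
 * the allocation may proceed (including the unaccounted cases).
 */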
3217bool
3218__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3219{
3220 struct mem_cgroup *memcg;
3221 int ret;
3222
3223 *_memcg = NULL;
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250 if (!current->mm || current->memcg_kmem_skip_account)
3251 return true;
3252
3253 memcg = get_mem_cgroup_from_mm(current->mm);
3254
3255 if (!memcg_kmem_is_active(memcg)) {
3256 css_put(&memcg->css);
3257 return true;
3258 }
3259
3260 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3261 if (!ret)
3262 *_memcg = memcg;
3263
3264 css_put(&memcg->css);
3265 return (ret == 0);
3266}
3267
3268void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3269 int order)
3270{
3271 struct page_cgroup *pc;
3272
3273 VM_BUG_ON(mem_cgroup_is_root(memcg));
3274
3275
3276 if (!page) {
3277 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3278 return;
3279 }
3280
3281
3282
3283
3284 pc = lookup_page_cgroup(page);
3285 pc->mem_cgroup = memcg;
3286 pc->flags = PCG_USED;
3287}
3288
3289void __memcg_kmem_uncharge_pages(struct page *page, int order)
3290{
3291 struct mem_cgroup *memcg = NULL;
3292 struct page_cgroup *pc;
3293
3294
3295 pc = lookup_page_cgroup(page);
3296 if (!PageCgroupUsed(pc))
3297 return;
3298
3299 memcg = pc->mem_cgroup;
3300 pc->flags = 0;
3301
3302
3303
3304
3305
3306 if (!memcg)
3307 return;
3308
3309 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3310 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3311}
3312#else
3313static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3314{
3315}
3316#endif
3317
3318#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3319
3320
3321
3322
3323
3324
3325
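/*
 * Called when a transparent huge page is split: copy the head page's
 * cgroup ownership and flags to every tail page's page_cgroup and
 * take the compound page out of the RSS_HUGE statistic.
 */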
3326void mem_cgroup_split_huge_fixup(struct page *head)
3327{
3328 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3329 struct page_cgroup *pc;
3330 struct mem_cgroup *memcg;
3331 int i;
3332
3333 if (mem_cgroup_disabled())
3334 return;
3335
3336 memcg = head_pc->mem_cgroup;
3337 for (i = 1; i < HPAGE_PMD_NR; i++) {
3338 pc = head_pc + i;
3339 pc->mem_cgroup = memcg;
3340 pc->flags = head_pc->flags;
3341 }
3342 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3343 HPAGE_PMD_NR);
3344}
3345#endif
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
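/*
 * Move the charge for @page (of size @nr_pages) from @from to @to,
 * updating the FILE_MAPPED and WRITEBACK statistics as needed.  The
 * page must be off the LRU, and the compound lock must be held for
 * huge pages.  Returns 0 on success, -EINVAL if the page is not
 * charged to @from, or -EBUSY if it cannot be moved right now.
 */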
3362static int mem_cgroup_move_account(struct page *page,
3363 unsigned int nr_pages,
3364 struct page_cgroup *pc,
3365 struct mem_cgroup *from,
3366 struct mem_cgroup *to)
3367{
3368 unsigned long flags;
3369 int ret;
3370
3371 VM_BUG_ON(from == to);
3372 VM_BUG_ON_PAGE(PageLRU(page), page);
3373
3374
3375
3376
3377
3378
3379 ret = -EBUSY;
3380 if (nr_pages > 1 && !PageTransHuge(page))
3381 goto out;
3382
3383
3384
3385
3386
3387
3388 if (!trylock_page(page))
3389 goto out;
3390
3391 ret = -EINVAL;
3392 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3393 goto out_unlock;
3394
3395 move_lock_mem_cgroup(from, &flags);
3396
3397 if (!PageAnon(page) && page_mapped(page)) {
3398 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3399 nr_pages);
3400 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3401 nr_pages);
3402 }
3403
3404 if (PageWriteback(page)) {
3405 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3406 nr_pages);
3407 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3408 nr_pages);
3409 }
3410
3411
3412
3413
3414
3415
3416
3417
3418 pc->mem_cgroup = to;
3419 move_unlock_mem_cgroup(from, &flags);
3420 ret = 0;
3421
3422 local_irq_disable();
3423 mem_cgroup_charge_statistics(to, page, nr_pages);
3424 memcg_check_events(to, page);
3425 mem_cgroup_charge_statistics(from, page, -nr_pages);
3426 memcg_check_events(from, page);
3427 local_irq_enable();
3428out_unlock:
3429 unlock_page(page);
3430out:
3431 return ret;
3432}
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
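/*
 * Move the charge for @page from @child to its parent (or to the root
 * cgroup if there is no parent).  Used when emptying a cgroup: the
 * page is pinned and isolated from the LRU, huge pages take the
 * compound lock, and on success the child's local charge is cancelled.
 */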
3455static int mem_cgroup_move_parent(struct page *page,
3456 struct page_cgroup *pc,
3457 struct mem_cgroup *child)
3458{
3459 struct mem_cgroup *parent;
3460 unsigned int nr_pages;
3461 unsigned long uninitialized_var(flags);
3462 int ret;
3463
3464 VM_BUG_ON(mem_cgroup_is_root(child));
3465
3466 ret = -EBUSY;
3467 if (!get_page_unless_zero(page))
3468 goto out;
3469 if (isolate_lru_page(page))
3470 goto put;
3471
3472 nr_pages = hpage_nr_pages(page);
3473
3474 parent = parent_mem_cgroup(child);
3475
3476
3477
3478 if (!parent)
3479 parent = root_mem_cgroup;
3480
3481 if (nr_pages > 1) {
3482 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3483 flags = compound_lock_irqsave(page);
3484 }
3485
3486 ret = mem_cgroup_move_account(page, nr_pages,
3487 pc, child, parent);
3488 if (!ret)
3489 __mem_cgroup_cancel_local_charge(child, nr_pages);
3490
3491 if (nr_pages > 1)
3492 compound_unlock_irqrestore(page, flags);
3493 putback_lru_page(page);
3494put:
3495 put_page(page);
3496out:
3497 return ret;
3498}
3499
3500#ifdef CONFIG_MEMCG_SWAP
3501static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3502 bool charge)
3503{
3504 int val = (charge) ? 1 : -1;
3505 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
3506}
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
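/*
 * Re-point the swap cgroup record for @entry from @from to @to and
 * adjust the swap statistics.  A css reference on @to is taken here
 * because the record now keeps @to's ID alive; the corresponding
 * reference on @from is dropped later when the move completes.
 * Returns -EINVAL if the record no longer belongs to @from.
 */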
3522static int mem_cgroup_move_swap_account(swp_entry_t entry,
3523 struct mem_cgroup *from, struct mem_cgroup *to)
3524{
3525 unsigned short old_id, new_id;
3526
3527 old_id = mem_cgroup_id(from);
3528 new_id = mem_cgroup_id(to);
3529
3530 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3531 mem_cgroup_swap_statistics(from, false);
3532 mem_cgroup_swap_statistics(to, true);
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544 css_get(&to->css);
3545 return 0;
3546 }
3547 return -EINVAL;
3548}
3549#else
3550static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3551 struct mem_cgroup *from, struct mem_cgroup *to)
3552{
3553 return -EINVAL;
3554}
3555#endif
3556
3557#ifdef CONFIG_DEBUG_VM
3558static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3559{
3560 struct page_cgroup *pc;
3561
3562 pc = lookup_page_cgroup(page);
3563
3564
3565
3566
3567
3568 if (likely(pc) && PageCgroupUsed(pc))
3569 return pc;
3570 return NULL;
3571}
3572
3573bool mem_cgroup_bad_page_check(struct page *page)
3574{
3575 if (mem_cgroup_disabled())
3576 return false;
3577
3578 return lookup_page_cgroup_used(page) != NULL;
3579}
3580
3581void mem_cgroup_print_bad_page(struct page *page)
3582{
3583 struct page_cgroup *pc;
3584
3585 pc = lookup_page_cgroup_used(page);
3586 if (pc) {
3587 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3588 pc, pc->flags, pc->mem_cgroup);
3589 }
3590}
3591#endif
3592
3593static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3594 unsigned long long val)
3595{
3596 int retry_count;
3597 int ret = 0;
3598 int children = mem_cgroup_count_children(memcg);
3599 u64 curusage, oldusage;
3600 int enlarge;
3601
3602
3603
3604
3605
3606
3607 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3608
3609 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3610
3611 enlarge = 0;
3612 while (retry_count) {
3613 if (signal_pending(current)) {
3614 ret = -EINTR;
3615 break;
3616 }
3617
3618
3619
3620
3621
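/*
 * memory.limit must never exceed memsw.limit; set_limit_mutex keeps
 * the two limits stable while they are compared and updated.
 */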
3622 mutex_lock(&set_limit_mutex);
3623 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3624 ret = -EINVAL;
3625 mutex_unlock(&set_limit_mutex);
3626 break;
3627 }
3628
3629 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
3630 enlarge = 1;
3631
3632 ret = res_counter_set_limit(&memcg->res, val);
3633 mutex_unlock(&set_limit_mutex);
3634
3635 if (!ret)
3636 break;
3637
3638 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3639
3640 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3641
3642 if (curusage >= oldusage)
3643 retry_count--;
3644 else
3645 oldusage = curusage;
3646 }
3647 if (!ret && enlarge)
3648 memcg_oom_recover(memcg);
3649
3650 return ret;
3651}
3652
3653static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3654 unsigned long long val)
3655{
3656 int retry_count;
3657 u64 oldusage, curusage;
3658 int children = mem_cgroup_count_children(memcg);
3659 int ret = -EBUSY;
3660 int enlarge = 0;
3661
3662
3663 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3664 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3665 while (retry_count) {
3666 if (signal_pending(current)) {
3667 ret = -EINTR;
3668 break;
3669 }
3670
3671
3672
3673
3674
3675 mutex_lock(&set_limit_mutex);
3676 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3677 ret = -EINVAL;
3678 mutex_unlock(&set_limit_mutex);
3679 break;
3680 }
3681 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
3682 enlarge = 1;
3683 ret = res_counter_set_limit(&memcg->memsw, val);
3684 mutex_unlock(&set_limit_mutex);
3685
3686 if (!ret)
3687 break;
3688
3689 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3690
3691 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3692
3693 if (curusage >= oldusage)
3694 retry_count--;
3695 else
3696 oldusage = curusage;
3697 }
3698 if (!ret && enlarge)
3699 memcg_oom_recover(memcg);
3700 return ret;
3701}
3702
3703unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3704 gfp_t gfp_mask,
3705 unsigned long *total_scanned)
3706{
3707 unsigned long nr_reclaimed = 0;
3708 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3709 unsigned long reclaimed;
3710 int loop = 0;
3711 struct mem_cgroup_tree_per_zone *mctz;
3712 unsigned long long excess;
3713 unsigned long nr_scanned;
3714
3715 if (order > 0)
3716 return 0;
3717
3718 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3719
3720
3721
3722
3723
3724 do {
3725 if (next_mz)
3726 mz = next_mz;
3727 else
3728 mz = mem_cgroup_largest_soft_limit_node(mctz);
3729 if (!mz)
3730 break;
3731
3732 nr_scanned = 0;
3733 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3734 gfp_mask, &nr_scanned);
3735 nr_reclaimed += reclaimed;
3736 *total_scanned += nr_scanned;
3737 spin_lock_irq(&mctz->lock);
3738
3739
3740
3741
3742
3743 next_mz = NULL;
3744 if (!reclaimed) {
3745 do {
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757 next_mz =
3758 __mem_cgroup_largest_soft_limit_node(mctz);
3759 if (next_mz == mz)
3760 css_put(&next_mz->memcg->css);
3761 else
3762 break;
3763 } while (1);
3764 }
3765 __mem_cgroup_remove_exceeded(mz, mctz);
3766 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3777 spin_unlock_irq(&mctz->lock);
3778 css_put(&mz->memcg->css);
3779 loop++;
3780
3781
3782
3783
3784
3785 if (!nr_reclaimed &&
3786 (next_mz == NULL ||
3787 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3788 break;
3789 } while (!nr_reclaimed);
3790 if (next_mz)
3791 css_put(&next_mz->memcg->css);
3792 return nr_reclaimed;
3793}
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
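/*
 * Scan one LRU list of one zone and try to move every page on it to
 * the parent cgroup.  A page that cannot be moved right now is
 * rotated and retried; the loop ends when the list is empty.
 */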
3806static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3807 int node, int zid, enum lru_list lru)
3808{
3809 struct lruvec *lruvec;
3810 unsigned long flags;
3811 struct list_head *list;
3812 struct page *busy;
3813 struct zone *zone;
3814
3815 zone = &NODE_DATA(node)->node_zones[zid];
3816 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3817 list = &lruvec->lists[lru];
3818
3819 busy = NULL;
3820 do {
3821 struct page_cgroup *pc;
3822 struct page *page;
3823
3824 spin_lock_irqsave(&zone->lru_lock, flags);
3825 if (list_empty(list)) {
3826 spin_unlock_irqrestore(&zone->lru_lock, flags);
3827 break;
3828 }
3829 page = list_entry(list->prev, struct page, lru);
3830 if (busy == page) {
3831 list_move(&page->lru, list);
3832 busy = NULL;
3833 spin_unlock_irqrestore(&zone->lru_lock, flags);
3834 continue;
3835 }
3836 spin_unlock_irqrestore(&zone->lru_lock, flags);
3837
3838 pc = lookup_page_cgroup(page);
3839
3840 if (mem_cgroup_move_parent(page, pc, memcg)) {
3841
3842 busy = page;
3843 } else
3844 busy = NULL;
3845 cond_resched();
3846 } while (!list_empty(list));
3847}
3848
3849
3850
3851
3852
3853
3854
3855
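/*
 * Move all charges of @memcg to its parent by walking every LRU list
 * of every node, repeating until nothing but kernel-memory charges
 * (which are not reparented) remains.
 */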
3856static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3857{
3858 int node, zid;
3859 u64 usage;
3860
3861 do {
3862
3863 lru_add_drain_all();
3864 drain_all_stock_sync(memcg);
3865 mem_cgroup_start_move(memcg);
3866 for_each_node_state(node, N_MEMORY) {
3867 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3868 enum lru_list lru;
3869 for_each_lru(lru) {
3870 mem_cgroup_force_empty_list(memcg,
3871 node, zid, lru);
3872 }
3873 }
3874 }
3875 mem_cgroup_end_move(memcg);
3876 memcg_oom_recover(memcg);
3877 cond_resched();
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
3892 res_counter_read_u64(&memcg->kmem, RES_USAGE);
3893 } while (usage > 0);
3894}
3895
3896
3897
3898
3899
3900
3901
3902static inline bool memcg_has_children(struct mem_cgroup *memcg)
3903{
3904 bool ret;
3905
3906
3907
3908
3909
3910
3911
3912 lockdep_assert_held(&memcg_create_mutex);
3913
3914 rcu_read_lock();
3915 ret = css_next_child(NULL, &memcg->css);
3916 rcu_read_unlock();
3917 return ret;
3918}
3919
3920
3921
3922
3923
3924
3925
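/*
 * Reclaim memory from @memcg until its usage drops to zero or the
 * retry budget is exhausted.  Triggered by writing to
 * memory.force_empty.
 */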
3926static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3927{
3928 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3929
3930
3931 lru_add_drain_all();
3932
3933 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3934 int progress;
3935
3936 if (signal_pending(current))
3937 return -EINTR;
3938
3939 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3940 GFP_KERNEL, true);
3941 if (!progress) {
3942 nr_retries--;
3943
3944 congestion_wait(BLK_RW_ASYNC, HZ/10);
3945 }
3946
3947 }
3948
3949 return 0;
3950}
3951
3952static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3953 char *buf, size_t nbytes,
3954 loff_t off)
3955{
3956 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3957
3958 if (mem_cgroup_is_root(memcg))
3959 return -EINVAL;
3960 return mem_cgroup_force_empty(memcg) ?: nbytes;
3961}
3962
3963static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3964 struct cftype *cft)
3965{
3966 return mem_cgroup_from_css(css)->use_hierarchy;
3967}
3968
3969static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3970 struct cftype *cft, u64 val)
3971{
3972 int retval = 0;
3973 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3974 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3975
3976 mutex_lock(&memcg_create_mutex);
3977
3978 if (memcg->use_hierarchy == val)
3979 goto out;
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3990 (val == 1 || val == 0)) {
3991 if (!memcg_has_children(memcg))
3992 memcg->use_hierarchy = val;
3993 else
3994 retval = -EBUSY;
3995 } else
3996 retval = -EINVAL;
3997
3998out:
3999 mutex_unlock(&memcg_create_mutex);
4000
4001 return retval;
4002}
4003
4004static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4005 enum mem_cgroup_stat_index idx)
4006{
4007 struct mem_cgroup *iter;
4008 long val = 0;
4009
4010
4011 for_each_mem_cgroup_tree(iter, memcg)
4012 val += mem_cgroup_read_stat(iter, idx);
4013
4014 if (val < 0)
4015 val = 0;
4016 return val;
4017}
4018
4019static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4020{
4021 u64 val;
4022
4023 if (!mem_cgroup_is_root(memcg)) {
4024 if (!swap)
4025 return res_counter_read_u64(&memcg->res, RES_USAGE);
4026 else
4027 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4028 }
4029
4030
4031
4032
4033
4034 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4035 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4036
4037 if (swap)
4038 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4039
4040 return val << PAGE_SHIFT;
4041}
4042
4043
4044static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4045 struct cftype *cft)
4046{
4047 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4048 enum res_type type = MEMFILE_TYPE(cft->private);
4049 int name = MEMFILE_ATTR(cft->private);
4050
4051 switch (type) {
4052 case _MEM:
4053 if (name == RES_USAGE)
4054 return mem_cgroup_usage(memcg, false);
4055 return res_counter_read_u64(&memcg->res, name);
4056 case _MEMSWAP:
4057 if (name == RES_USAGE)
4058 return mem_cgroup_usage(memcg, true);
4059 return res_counter_read_u64(&memcg->memsw, name);
4060 case _KMEM:
4061 return res_counter_read_u64(&memcg->kmem, name);
4063 default:
4064 BUG();
4065 }
4066}
4067
4068#ifdef CONFIG_MEMCG_KMEM
4069
4070static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4071 unsigned long long limit)
4072{
4073 int err = 0;
4074 int memcg_id;
4075
4076 if (memcg_kmem_is_active(memcg))
4077 return 0;
4078
4079
4080
4081
4082
4083 memcg_stop_kmem_account();
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
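/*
 * Kmem accounting can only be activated for an empty cgroup: if it
 * already has tasks, or has children while use_hierarchy is set,
 * allocations made before activation would never be accounted, so
 * refuse with -EBUSY.
 */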
4097 mutex_lock(&memcg_create_mutex);
4098 if (cgroup_has_tasks(memcg->css.cgroup) ||
4099 (memcg->use_hierarchy && memcg_has_children(memcg)))
4100 err = -EBUSY;
4101 mutex_unlock(&memcg_create_mutex);
4102 if (err)
4103 goto out;
4104
4105 memcg_id = memcg_alloc_cache_id();
4106 if (memcg_id < 0) {
4107 err = memcg_id;
4108 goto out;
4109 }
4110
4111 memcg->kmemcg_id = memcg_id;
4112 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4113
4114
4115
4116
4117
4118 err = res_counter_set_limit(&memcg->kmem, limit);
4119 VM_BUG_ON(err);
4120
4121 static_key_slow_inc(&memcg_kmem_enabled_key);
4122
4123
4124
4125
4126
4127 memcg_kmem_set_active(memcg);
4128out:
4129 memcg_resume_kmem_account();
4130 return err;
4131}
4132
4133static int memcg_activate_kmem(struct mem_cgroup *memcg,
4134 unsigned long long limit)
4135{
4136 int ret;
4137
4138 mutex_lock(&activate_kmem_mutex);
4139 ret = __memcg_activate_kmem(memcg, limit);
4140 mutex_unlock(&activate_kmem_mutex);
4141 return ret;
4142}
4143
4144static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4145 unsigned long long val)
4146{
4147 int ret;
4148
4149 if (!memcg_kmem_is_active(memcg))
4150 ret = memcg_activate_kmem(memcg, val);
4151 else
4152 ret = res_counter_set_limit(&memcg->kmem, val);
4153 return ret;
4154}
4155
4156static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4157{
4158 int ret = 0;
4159 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4160
4161 if (!parent)
4162 return 0;
4163
4164 mutex_lock(&activate_kmem_mutex);
4165
4166
4167
4168
4169 if (memcg_kmem_is_active(parent))
4170 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
4171 mutex_unlock(&activate_kmem_mutex);
4172 return ret;
4173}
4174#else
4175static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4176 unsigned long long val)
4177{
4178 return -EINVAL;
4179}
4180#endif
4181
4182
4183
4184
4185
4186static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4187 char *buf, size_t nbytes, loff_t off)
4188{
4189 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4190 enum res_type type;
4191 int name;
4192 unsigned long long val;
4193 int ret;
4194
4195 buf = strstrip(buf);
4196 type = MEMFILE_TYPE(of_cft(of)->private);
4197 name = MEMFILE_ATTR(of_cft(of)->private);
4198
4199 switch (name) {
4200 case RES_LIMIT:
4201 if (mem_cgroup_is_root(memcg)) {
4202 ret = -EINVAL;
4203 break;
4204 }
4205
4206 ret = res_counter_memparse_write_strategy(buf, &val);
4207 if (ret)
4208 break;
4209 if (type == _MEM)
4210 ret = mem_cgroup_resize_limit(memcg, val);
4211 else if (type == _MEMSWAP)
4212 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4213 else if (type == _KMEM)
4214 ret = memcg_update_kmem_limit(memcg, val);
4215 else
4216 return -EINVAL;
4217 break;
4218 case RES_SOFT_LIMIT:
4219 ret = res_counter_memparse_write_strategy(buf, &val);
4220 if (ret)
4221 break;
4222
4223
4224
4225
4226
4227 if (type == _MEM)
4228 ret = res_counter_set_soft_limit(&memcg->res, val);
4229 else
4230 ret = -EINVAL;
4231 break;
4232 default:
4233 ret = -EINVAL;
4234 break;
4235 }
4236 return ret ?: nbytes;
4237}
4238
4239static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4240 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4241{
4242 unsigned long long min_limit, min_memsw_limit, tmp;
4243
4244 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4245 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4246 if (!memcg->use_hierarchy)
4247 goto out;
4248
4249 while (memcg->css.parent) {
4250 memcg = mem_cgroup_from_css(memcg->css.parent);
4251 if (!memcg->use_hierarchy)
4252 break;
4253 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4254 min_limit = min(min_limit, tmp);
4255 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4256 min_memsw_limit = min(min_memsw_limit, tmp);
4257 }
4258out:
4259 *mem_limit = min_limit;
4260 *memsw_limit = min_memsw_limit;
4261}
4262
4263static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4264 size_t nbytes, loff_t off)
4265{
4266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4267 int name;
4268 enum res_type type;
4269
4270 type = MEMFILE_TYPE(of_cft(of)->private);
4271 name = MEMFILE_ATTR(of_cft(of)->private);
4272
4273 switch (name) {
4274 case RES_MAX_USAGE:
4275 if (type == _MEM)
4276 res_counter_reset_max(&memcg->res);
4277 else if (type == _MEMSWAP)
4278 res_counter_reset_max(&memcg->memsw);
4279 else if (type == _KMEM)
4280 res_counter_reset_max(&memcg->kmem);
4281 else
4282 return -EINVAL;
4283 break;
4284 case RES_FAILCNT:
4285 if (type == _MEM)
4286 res_counter_reset_failcnt(&memcg->res);
4287 else if (type == _MEMSWAP)
4288 res_counter_reset_failcnt(&memcg->memsw);
4289 else if (type == _KMEM)
4290 res_counter_reset_failcnt(&memcg->kmem);
4291 else
4292 return -EINVAL;
4293 break;
4294 }
4295
4296 return nbytes;
4297}
4298
4299static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
4300 struct cftype *cft)
4301{
4302 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
4303}
4304
4305#ifdef CONFIG_MMU
4306static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
4307 struct cftype *cft, u64 val)
4308{
4309 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4310
4311 if (val >= (1 << NR_MOVE_TYPE))
4312 return -EINVAL;
4313
4314
4315
4316
4317
4318
4319
4320 memcg->move_charge_at_immigrate = val;
4321 return 0;
4322}
4323#else
4324static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
4325 struct cftype *cft, u64 val)
4326{
4327 return -ENOSYS;
4328}
4329#endif
4330
4331#ifdef CONFIG_NUMA
4332static int memcg_numa_stat_show(struct seq_file *m, void *v)
4333{
4334 struct numa_stat {
4335 const char *name;
4336 unsigned int lru_mask;
4337 };
4338
4339 static const struct numa_stat stats[] = {
4340 { "total", LRU_ALL },
4341 { "file", LRU_ALL_FILE },
4342 { "anon", LRU_ALL_ANON },
4343 { "unevictable", BIT(LRU_UNEVICTABLE) },
4344 };
4345 const struct numa_stat *stat;
4346 int nid;
4347 unsigned long nr;
4348 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4349
4350 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4351 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
4352 seq_printf(m, "%s=%lu", stat->name, nr);
4353 for_each_node_state(nid, N_MEMORY) {
4354 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4355 stat->lru_mask);
4356 seq_printf(m, " N%d=%lu", nid, nr);
4357 }
4358 seq_putc(m, '\n');
4359 }
4360
4361 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4362 struct mem_cgroup *iter;
4363
4364 nr = 0;
4365 for_each_mem_cgroup_tree(iter, memcg)
4366 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
4367 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
4368 for_each_node_state(nid, N_MEMORY) {
4369 nr = 0;
4370 for_each_mem_cgroup_tree(iter, memcg)
4371 nr += mem_cgroup_node_nr_lru_pages(
4372 iter, nid, stat->lru_mask);
4373 seq_printf(m, " N%d=%lu", nid, nr);
4374 }
4375 seq_putc(m, '\n');
4376 }
4377
4378 return 0;
4379}
4380#endif
4381
4382static inline void mem_cgroup_lru_names_not_uptodate(void)
4383{
4384 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4385}
4386
4387static int memcg_stat_show(struct seq_file *m, void *v)
4388{
4389 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4390 struct mem_cgroup *mi;
4391 unsigned int i;
4392
4393 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4394 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4395 continue;
4396 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4397 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4398 }
4399
4400 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4401 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4402 mem_cgroup_read_events(memcg, i));
4403
4404 for (i = 0; i < NR_LRU_LISTS; i++)
4405 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4406 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4407
4408
4409 {
4410 unsigned long long limit, memsw_limit;
4411 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4412 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4413 if (do_swap_account)
4414 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4415 memsw_limit);
4416 }
4417
4418 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4419 long long val = 0;
4420
4421 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4422 continue;
4423 for_each_mem_cgroup_tree(mi, memcg)
4424 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4425 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4426 }
4427
4428 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4429 unsigned long long val = 0;
4430
4431 for_each_mem_cgroup_tree(mi, memcg)
4432 val += mem_cgroup_read_events(mi, i);
4433 seq_printf(m, "total_%s %llu\n",
4434 mem_cgroup_events_names[i], val);
4435 }
4436
4437 for (i = 0; i < NR_LRU_LISTS; i++) {
4438 unsigned long long val = 0;
4439
4440 for_each_mem_cgroup_tree(mi, memcg)
4441 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4442 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4443 }
4444
4445#ifdef CONFIG_DEBUG_VM
4446 {
4447 int nid, zid;
4448 struct mem_cgroup_per_zone *mz;
4449 struct zone_reclaim_stat *rstat;
4450 unsigned long recent_rotated[2] = {0, 0};
4451 unsigned long recent_scanned[2] = {0, 0};
4452
4453 for_each_online_node(nid)
4454 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4455 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
4456 rstat = &mz->lruvec.reclaim_stat;
4457
4458 recent_rotated[0] += rstat->recent_rotated[0];
4459 recent_rotated[1] += rstat->recent_rotated[1];
4460 recent_scanned[0] += rstat->recent_scanned[0];
4461 recent_scanned[1] += rstat->recent_scanned[1];
4462 }
4463 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4464 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4465 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4466 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4467 }
4468#endif
4469
4470 return 0;
4471}
4472
4473static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4474 struct cftype *cft)
4475{
4476 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4477
4478 return mem_cgroup_swappiness(memcg);
4479}
4480
4481static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4482 struct cftype *cft, u64 val)
4483{
4484 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4485
4486 if (val > 100)
4487 return -EINVAL;
4488
4489 if (css->parent)
4490 memcg->swappiness = val;
4491 else
4492 vm_swappiness = val;
4493
4494 return 0;
4495}
4496
4497static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4498{
4499 struct mem_cgroup_threshold_ary *t;
4500 u64 usage;
4501 int i;
4502
4503 rcu_read_lock();
4504 if (!swap)
4505 t = rcu_dereference(memcg->thresholds.primary);
4506 else
4507 t = rcu_dereference(memcg->memsw_thresholds.primary);
4508
4509 if (!t)
4510 goto unlock;
4511
4512 usage = mem_cgroup_usage(memcg, swap);
4513
4514
4515
4516
4517
4518
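/*
 * current_threshold points at the last threshold that was at or below
 * the previously seen usage.  Walk backwards signalling every
 * threshold that is now above the usage, then forwards signalling
 * every threshold that is now at or below it, and remember the new
 * position.
 */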
4519 i = t->current_threshold;
4520
4521
4522
4523
4524
4525
4526
4527 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4528 eventfd_signal(t->entries[i].eventfd, 1);
4529
4530
4531 i++;
4532
4533
4534
4535
4536
4537
4538
4539 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4540 eventfd_signal(t->entries[i].eventfd, 1);
4541
4542
4543 t->current_threshold = i - 1;
4544unlock:
4545 rcu_read_unlock();
4546}
4547
4548static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4549{
4550 while (memcg) {
4551 __mem_cgroup_threshold(memcg, false);
4552 if (do_swap_account)
4553 __mem_cgroup_threshold(memcg, true);
4554
4555 memcg = parent_mem_cgroup(memcg);
4556 }
4557}
4558
4559static int compare_thresholds(const void *a, const void *b)
4560{
4561 const struct mem_cgroup_threshold *_a = a;
4562 const struct mem_cgroup_threshold *_b = b;
4563
4564 if (_a->threshold > _b->threshold)
4565 return 1;
4566
4567 if (_a->threshold < _b->threshold)
4568 return -1;
4569
4570 return 0;
4571}
4572
4573static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4574{
4575 struct mem_cgroup_eventfd_list *ev;
4576
4577 spin_lock(&memcg_oom_lock);
4578
4579 list_for_each_entry(ev, &memcg->oom_notify, list)
4580 eventfd_signal(ev->eventfd, 1);
4581
4582 spin_unlock(&memcg_oom_lock);
4583 return 0;
4584}
4585
4586static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4587{
4588 struct mem_cgroup *iter;
4589
4590 for_each_mem_cgroup_tree(iter, memcg)
4591 mem_cgroup_oom_notify_cb(iter);
4592}
4593
4594static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4595 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4596{
4597 struct mem_cgroup_thresholds *thresholds;
4598 struct mem_cgroup_threshold_ary *new;
4599 u64 threshold, usage;
4600 int i, size, ret;
4601
4602 ret = res_counter_memparse_write_strategy(args, &threshold);
4603 if (ret)
4604 return ret;
4605
4606 mutex_lock(&memcg->thresholds_lock);
4607
4608 if (type == _MEM) {
4609 thresholds = &memcg->thresholds;
4610 usage = mem_cgroup_usage(memcg, false);
4611 } else if (type == _MEMSWAP) {
4612 thresholds = &memcg->memsw_thresholds;
4613 usage = mem_cgroup_usage(memcg, true);
4614 } else
4615 BUG();
4616
4617
4618 if (thresholds->primary)
4619 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4620
4621 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4622
4623
4624 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4625 GFP_KERNEL);
4626 if (!new) {
4627 ret = -ENOMEM;
4628 goto unlock;
4629 }
4630 new->size = size;
4631
4632
4633 if (thresholds->primary) {
4634 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4635 sizeof(struct mem_cgroup_threshold));
4636 }
4637
4638
4639 new->entries[size - 1].eventfd = eventfd;
4640 new->entries[size - 1].threshold = threshold;
4641
4642
4643 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4644 compare_thresholds, NULL);
4645
4646
4647 new->current_threshold = -1;
4648 for (i = 0; i < size; i++) {
4649 if (new->entries[i].threshold <= usage) {
4650
4651
4652
4653
4654
4655 ++new->current_threshold;
4656 } else
4657 break;
4658 }
4659
4660
4661 kfree(thresholds->spare);
4662 thresholds->spare = thresholds->primary;
4663
4664 rcu_assign_pointer(thresholds->primary, new);
4665
4666
4667 synchronize_rcu();
4668
4669unlock:
4670 mutex_unlock(&memcg->thresholds_lock);
4671
4672 return ret;
4673}
4674
4675static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4676 struct eventfd_ctx *eventfd, const char *args)
4677{
4678 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4679}
4680
4681static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4682 struct eventfd_ctx *eventfd, const char *args)
4683{
4684 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4685}
4686
4687static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4688 struct eventfd_ctx *eventfd, enum res_type type)
4689{
4690 struct mem_cgroup_thresholds *thresholds;
4691 struct mem_cgroup_threshold_ary *new;
4692 u64 usage;
4693 int i, j, size;
4694
4695 mutex_lock(&memcg->thresholds_lock);
4696
4697 if (type == _MEM) {
4698 thresholds = &memcg->thresholds;
4699 usage = mem_cgroup_usage(memcg, false);
4700 } else if (type == _MEMSWAP) {
4701 thresholds = &memcg->memsw_thresholds;
4702 usage = mem_cgroup_usage(memcg, true);
4703 } else
4704 BUG();
4705
4706 if (!thresholds->primary)
4707 goto unlock;
4708
4709
4710 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4711
4712
4713 size = 0;
4714 for (i = 0; i < thresholds->primary->size; i++) {
4715 if (thresholds->primary->entries[i].eventfd != eventfd)
4716 size++;
4717 }
4718
4719 new = thresholds->spare;
4720
4721
4722 if (!size) {
4723 kfree(new);
4724 new = NULL;
4725 goto swap_buffers;
4726 }
4727
4728 new->size = size;
4729
4730
4731 new->current_threshold = -1;
4732 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4733 if (thresholds->primary->entries[i].eventfd == eventfd)
4734 continue;
4735
4736 new->entries[j] = thresholds->primary->entries[i];
4737 if (new->entries[j].threshold <= usage) {
4738
4739
4740
4741
4742
4743 ++new->current_threshold;
4744 }
4745 j++;
4746 }
4747
4748swap_buffers:
4749
4750 thresholds->spare = thresholds->primary;
4751
4752 if (!new) {
4753 kfree(thresholds->spare);
4754 thresholds->spare = NULL;
4755 }
4756
4757 rcu_assign_pointer(thresholds->primary, new);
4758
4759
4760 synchronize_rcu();
4761unlock:
4762 mutex_unlock(&memcg->thresholds_lock);
4763}
4764
4765static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4766 struct eventfd_ctx *eventfd)
4767{
4768 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4769}
4770
4771static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4772 struct eventfd_ctx *eventfd)
4773{
4774 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4775}
4776
4777static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4778 struct eventfd_ctx *eventfd, const char *args)
4779{
4780 struct mem_cgroup_eventfd_list *event;
4781
4782 event = kmalloc(sizeof(*event), GFP_KERNEL);
4783 if (!event)
4784 return -ENOMEM;
4785
4786 spin_lock(&memcg_oom_lock);
4787
4788 event->eventfd = eventfd;
4789 list_add(&event->list, &memcg->oom_notify);
4790
4791
4792 if (atomic_read(&memcg->under_oom))
4793 eventfd_signal(eventfd, 1);
4794 spin_unlock(&memcg_oom_lock);
4795
4796 return 0;
4797}
4798
4799static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4800 struct eventfd_ctx *eventfd)
4801{
4802 struct mem_cgroup_eventfd_list *ev, *tmp;
4803
4804 spin_lock(&memcg_oom_lock);
4805
4806 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4807 if (ev->eventfd == eventfd) {
4808 list_del(&ev->list);
4809 kfree(ev);
4810 }
4811 }
4812
4813 spin_unlock(&memcg_oom_lock);
4814}
4815
4816static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4817{
4818 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4819
4820 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4821 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
4822 return 0;
4823}
4824
4825static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4826 struct cftype *cft, u64 val)
4827{
4828 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4829
4830
4831 if (!css->parent || !((val == 0) || (val == 1)))
4832 return -EINVAL;
4833
4834 memcg->oom_kill_disable = val;
4835 if (!val)
4836 memcg_oom_recover(memcg);
4837
4838 return 0;
4839}
4840
4841#ifdef CONFIG_MEMCG_KMEM
4842static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4843{
4844 int ret;
4845
4846 memcg->kmemcg_id = -1;
4847 ret = memcg_propagate_kmem(memcg);
4848 if (ret)
4849 return ret;
4850
4851 return mem_cgroup_sockets_init(memcg, ss);
4852}
4853
4854static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4855{
4856 mem_cgroup_sockets_destroy(memcg);
4857}
4858
4859static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4860{
4861 if (!memcg_kmem_is_active(memcg))
4862 return;
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
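/*
 * Kmem charges can outlive the cgroup, so pin the css and mark the
 * memcg dead.  The reference is dropped right below if no kmem
 * charges are left, or from memcg_uncharge_kmem() when the last
 * charge goes away.
 */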
4882 css_get(&memcg->css);
4883
4884 memcg_kmem_mark_dead(memcg);
4885
4886 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
4887 return;
4888
4889 if (memcg_kmem_test_and_clear_dead(memcg))
4890 css_put(&memcg->css);
4891}
4892#else
4893static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4894{
4895 return 0;
4896}
4897
4898static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4899{
4900}
4901
4902static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4903{
4904}
4905#endif
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
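/*
 * Legacy cgroup.event_control interface: user space registers an
 * eventfd notification on one of the memory control files by writing
 * "<event_fd> <control_fd> [args]" to cgroup.event_control, e.g.
 *
 *	echo "$event_fd $control_fd" > cgroup.event_control
 *
 * memcg_event_remove() runs from a work item when an event is torn
 * down: it unregisters the event, signals the eventfd one last time
 * and drops the references taken at registration.
 */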
4925static void memcg_event_remove(struct work_struct *work)
4926{
4927 struct mem_cgroup_event *event =
4928 container_of(work, struct mem_cgroup_event, remove);
4929 struct mem_cgroup *memcg = event->memcg;
4930
4931 remove_wait_queue(event->wqh, &event->wait);
4932
4933 event->unregister_event(memcg, event->eventfd);
4934
4935
4936 eventfd_signal(event->eventfd, 1);
4937
4938 eventfd_ctx_put(event->eventfd);
4939 kfree(event);
4940 css_put(&memcg->css);
4941}
4942
4943
4944
4945
4946
4947
4948static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
4949 int sync, void *key)
4950{
4951 struct mem_cgroup_event *event =
4952 container_of(wait, struct mem_cgroup_event, wait);
4953 struct mem_cgroup *memcg = event->memcg;
4954 unsigned long flags = (unsigned long)key;
4955
4956 if (flags & POLLHUP) {
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966 spin_lock(&memcg->event_list_lock);
4967 if (!list_empty(&event->list)) {
4968 list_del_init(&event->list);
4969
4970
4971
4972
4973 schedule_work(&event->remove);
4974 }
4975 spin_unlock(&memcg->event_list_lock);
4976 }
4977
4978 return 0;
4979}
4980
4981static void memcg_event_ptable_queue_proc(struct file *file,
4982 wait_queue_head_t *wqh, poll_table *pt)
4983{
4984 struct mem_cgroup_event *event =
4985 container_of(pt, struct mem_cgroup_event, pt);
4986
4987 event->wqh = wqh;
4988 add_wait_queue(wqh, &event->wait);
4989}
4990
4991
4992
4993
4994
4995
4996
4997
4998
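/*
 * Parse "<event_fd> <control_fd> [args]", pick the register/unregister
 * callbacks from the control file's name, verify that the control file
 * belongs to this cgroup, and arm the eventfd.  The remaining buffer
 * is passed through to the register callback as its arguments.
 */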
4999static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
5000 char *buf, size_t nbytes, loff_t off)
5001{
5002 struct cgroup_subsys_state *css = of_css(of);
5003 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5004 struct mem_cgroup_event *event;
5005 struct cgroup_subsys_state *cfile_css;
5006 unsigned int efd, cfd;
5007 struct fd efile;
5008 struct fd cfile;
5009 const char *name;
5010 char *endp;
5011 int ret;
5012
5013 buf = strstrip(buf);
5014
5015 efd = simple_strtoul(buf, &endp, 10);
5016 if (*endp != ' ')
5017 return -EINVAL;
5018 buf = endp + 1;
5019
5020 cfd = simple_strtoul(buf, &endp, 10);
5021 if ((*endp != ' ') && (*endp != '\0'))
5022 return -EINVAL;
5023 buf = endp + 1;
5024
5025 event = kzalloc(sizeof(*event), GFP_KERNEL);
5026 if (!event)
5027 return -ENOMEM;
5028
5029 event->memcg = memcg;
5030 INIT_LIST_HEAD(&event->list);
5031 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
5032 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
5033 INIT_WORK(&event->remove, memcg_event_remove);
5034
5035 efile = fdget(efd);
5036 if (!efile.file) {
5037 ret = -EBADF;
5038 goto out_kfree;
5039 }
5040
5041 event->eventfd = eventfd_ctx_fileget(efile.file);
5042 if (IS_ERR(event->eventfd)) {
5043 ret = PTR_ERR(event->eventfd);
5044 goto out_put_efile;
5045 }
5046
5047 cfile = fdget(cfd);
5048 if (!cfile.file) {
5049 ret = -EBADF;
5050 goto out_put_eventfd;
5051 }
5052
5053
5054
5055 ret = inode_permission(file_inode(cfile.file), MAY_READ);
5056 if (ret < 0)
5057 goto out_put_cfile;
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067 name = cfile.file->f_dentry->d_name.name;
5068
5069 if (!strcmp(name, "memory.usage_in_bytes")) {
5070 event->register_event = mem_cgroup_usage_register_event;
5071 event->unregister_event = mem_cgroup_usage_unregister_event;
5072 } else if (!strcmp(name, "memory.oom_control")) {
5073 event->register_event = mem_cgroup_oom_register_event;
5074 event->unregister_event = mem_cgroup_oom_unregister_event;
5075 } else if (!strcmp(name, "memory.pressure_level")) {
5076 event->register_event = vmpressure_register_event;
5077 event->unregister_event = vmpressure_unregister_event;
5078 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
5079 event->register_event = memsw_cgroup_usage_register_event;
5080 event->unregister_event = memsw_cgroup_usage_unregister_event;
5081 } else {
5082 ret = -EINVAL;
5083 goto out_put_cfile;
5084 }
5085
5086
5087
5088
5089
5090
5091 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
5092 &memory_cgrp_subsys);
5093 ret = -EINVAL;
5094 if (IS_ERR(cfile_css))
5095 goto out_put_cfile;
5096 if (cfile_css != css) {
5097 css_put(cfile_css);
5098 goto out_put_cfile;
5099 }
5100
5101 ret = event->register_event(memcg, event->eventfd, buf);
5102 if (ret)
5103 goto out_put_css;
5104
5105 efile.file->f_op->poll(efile.file, &event->pt);
5106
5107 spin_lock(&memcg->event_list_lock);
5108 list_add(&event->list, &memcg->event_list);
5109 spin_unlock(&memcg->event_list_lock);
5110
5111 fdput(cfile);
5112 fdput(efile);
5113
5114 return nbytes;
5115
5116out_put_css:
5117 css_put(css);
5118out_put_cfile:
5119 fdput(cfile);
5120out_put_eventfd:
5121 eventfd_ctx_put(event->eventfd);
5122out_put_efile:
5123 fdput(efile);
5124out_kfree:
5125 kfree(event);
5126
5127 return ret;
5128}
5129
5130static struct cftype mem_cgroup_files[] = {
5131 {
5132 .name = "usage_in_bytes",
5133 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5134 .read_u64 = mem_cgroup_read_u64,
5135 },
5136 {
5137 .name = "max_usage_in_bytes",
5138 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5139 .write = mem_cgroup_reset,
5140 .read_u64 = mem_cgroup_read_u64,
5141 },
5142 {
5143 .name = "limit_in_bytes",
5144 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5145 .write = mem_cgroup_write,
5146 .read_u64 = mem_cgroup_read_u64,
5147 },
5148 {
5149 .name = "soft_limit_in_bytes",
5150 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5151 .write = mem_cgroup_write,
5152 .read_u64 = mem_cgroup_read_u64,
5153 },
5154 {
5155 .name = "failcnt",
5156 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5157 .write = mem_cgroup_reset,
5158 .read_u64 = mem_cgroup_read_u64,
5159 },
5160 {
5161 .name = "stat",
5162 .seq_show = memcg_stat_show,
5163 },
5164 {
5165 .name = "force_empty",
5166 .write = mem_cgroup_force_empty_write,
5167 },
5168 {
5169 .name = "use_hierarchy",
5170 .write_u64 = mem_cgroup_hierarchy_write,
5171 .read_u64 = mem_cgroup_hierarchy_read,
5172 },
5173 {
5174 .name = "cgroup.event_control",
5175 .write = memcg_write_event_control,
5176 .flags = CFTYPE_NO_PREFIX,
5177 .mode = S_IWUGO,
5178 },
5179 {
5180 .name = "swappiness",
5181 .read_u64 = mem_cgroup_swappiness_read,
5182 .write_u64 = mem_cgroup_swappiness_write,
5183 },
5184 {
5185 .name = "move_charge_at_immigrate",
5186 .read_u64 = mem_cgroup_move_charge_read,
5187 .write_u64 = mem_cgroup_move_charge_write,
5188 },
5189 {
5190 .name = "oom_control",
5191 .seq_show = mem_cgroup_oom_control_read,
5192 .write_u64 = mem_cgroup_oom_control_write,
5193 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5194 },
5195 {
5196 .name = "pressure_level",
5197 },
5198#ifdef CONFIG_NUMA
5199 {
5200 .name = "numa_stat",
5201 .seq_show = memcg_numa_stat_show,
5202 },
5203#endif
5204#ifdef CONFIG_MEMCG_KMEM
5205 {
5206 .name = "kmem.limit_in_bytes",
5207 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5208 .write = mem_cgroup_write,
5209 .read_u64 = mem_cgroup_read_u64,
5210 },
5211 {
5212 .name = "kmem.usage_in_bytes",
5213 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5214 .read_u64 = mem_cgroup_read_u64,
5215 },
5216 {
5217 .name = "kmem.failcnt",
5218 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5219 .write = mem_cgroup_reset,
5220 .read_u64 = mem_cgroup_read_u64,
5221 },
5222 {
5223 .name = "kmem.max_usage_in_bytes",
5224 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5225 .write = mem_cgroup_reset,
5226 .read_u64 = mem_cgroup_read_u64,
5227 },
5228#ifdef CONFIG_SLABINFO
5229 {
5230 .name = "kmem.slabinfo",
5231 .seq_show = mem_cgroup_slabinfo_read,
5232 },
5233#endif
5234#endif
5235 { },
5236};
5237
5238#ifdef CONFIG_MEMCG_SWAP
5239static struct cftype memsw_cgroup_files[] = {
5240 {
5241 .name = "memsw.usage_in_bytes",
5242 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5243 .read_u64 = mem_cgroup_read_u64,
5244 },
5245 {
5246 .name = "memsw.max_usage_in_bytes",
5247 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5248 .write = mem_cgroup_reset,
5249 .read_u64 = mem_cgroup_read_u64,
5250 },
5251 {
5252 .name = "memsw.limit_in_bytes",
5253 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5254 .write = mem_cgroup_write,
5255 .read_u64 = mem_cgroup_read_u64,
5256 },
5257 {
5258 .name = "memsw.failcnt",
5259 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5260 .write = mem_cgroup_reset,
5261 .read_u64 = mem_cgroup_read_u64,
5262 },
5263 { },
5264};
5265#endif
5266static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5267{
5268 struct mem_cgroup_per_node *pn;
5269 struct mem_cgroup_per_zone *mz;
5270 int zone, tmp = node;
5271
5272
5273
5274
5275
5276
5277
5278
5279 if (!node_state(node, N_NORMAL_MEMORY))
5280 tmp = -1;
5281 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5282 if (!pn)
5283 return 1;
5284
5285 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5286 mz = &pn->zoneinfo[zone];
5287 lruvec_init(&mz->lruvec);
5288 mz->usage_in_excess = 0;
5289 mz->on_tree = false;
5290 mz->memcg = memcg;
5291 }
5292 memcg->nodeinfo[node] = pn;
5293 return 0;
5294}
5295
5296static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5297{
5298 kfree(memcg->nodeinfo[node]);
5299}
5300
5301static struct mem_cgroup *mem_cgroup_alloc(void)
5302{
5303 struct mem_cgroup *memcg;
5304 size_t size;
5305
5306 size = sizeof(struct mem_cgroup);
5307 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5308
5309 memcg = kzalloc(size, GFP_KERNEL);
5310 if (!memcg)
5311 return NULL;
5312
5313 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
5314 if (!memcg->stat)
5315 goto out_free;
5316 spin_lock_init(&memcg->pcp_counter_lock);
5317 return memcg;
5318
5319out_free:
5320 kfree(memcg);
5321 return NULL;
5322}
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335static void __mem_cgroup_free(struct mem_cgroup *memcg)
5336{
5337 int node;
5338
5339 mem_cgroup_remove_from_trees(memcg);
5340
5341 for_each_node(node)
5342 free_mem_cgroup_per_zone_info(memcg, node);
5343
5344 free_percpu(memcg->stat);
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357 disarm_static_keys(memcg);
5358 kfree(memcg);
5359}
5360
5361
5362
5363
5364struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5365{
5366 if (!memcg->res.parent)
5367 return NULL;
5368 return mem_cgroup_from_res_counter(memcg->res.parent, res);
5369}
5370EXPORT_SYMBOL(parent_mem_cgroup);
5371
5372static void __init mem_cgroup_soft_limit_tree_init(void)
5373{
5374 struct mem_cgroup_tree_per_node *rtpn;
5375 struct mem_cgroup_tree_per_zone *rtpz;
5376 int tmp, node, zone;
5377
5378 for_each_node(node) {
5379 tmp = node;
5380 if (!node_state(node, N_NORMAL_MEMORY))
5381 tmp = -1;
5382 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5383 BUG_ON(!rtpn);
5384
5385 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5386
5387 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5388 rtpz = &rtpn->rb_tree_per_zone[zone];
5389 rtpz->rb_root = RB_ROOT;
5390 spin_lock_init(&rtpz->lock);
5391 }
5392 }
5393}
5394
5395static struct cgroup_subsys_state * __ref
5396mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5397{
5398 struct mem_cgroup *memcg;
5399 long error = -ENOMEM;
5400 int node;
5401
5402 memcg = mem_cgroup_alloc();
5403 if (!memcg)
5404 return ERR_PTR(error);
5405
5406 for_each_node(node)
5407 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5408 goto free_out;
5409
5410
5411 if (parent_css == NULL) {
5412 root_mem_cgroup = memcg;
5413 res_counter_init(&memcg->res, NULL);
5414 res_counter_init(&memcg->memsw, NULL);
5415 res_counter_init(&memcg->kmem, NULL);
5416 }
5417
5418 memcg->last_scanned_node = MAX_NUMNODES;
5419 INIT_LIST_HEAD(&memcg->oom_notify);
5420 memcg->move_charge_at_immigrate = 0;
5421 mutex_init(&memcg->thresholds_lock);
5422 spin_lock_init(&memcg->move_lock);
5423 vmpressure_init(&memcg->vmpressure);
5424 INIT_LIST_HEAD(&memcg->event_list);
5425 spin_lock_init(&memcg->event_list_lock);
5426
5427 return &memcg->css;
5428
5429free_out:
5430 __mem_cgroup_free(memcg);
5431 return ERR_PTR(error);
5432}
5433
5434static int
5435mem_cgroup_css_online(struct cgroup_subsys_state *css)
5436{
5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5438 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
5439 int ret;
5440
5441 if (css->id > MEM_CGROUP_ID_MAX)
5442 return -ENOSPC;
5443
5444 if (!parent)
5445 return 0;
5446
5447 mutex_lock(&memcg_create_mutex);
5448
5449 memcg->use_hierarchy = parent->use_hierarchy;
5450 memcg->oom_kill_disable = parent->oom_kill_disable;
5451 memcg->swappiness = mem_cgroup_swappiness(parent);
5452
5453 if (parent->use_hierarchy) {
5454 res_counter_init(&memcg->res, &parent->res);
5455 res_counter_init(&memcg->memsw, &parent->memsw);
5456 res_counter_init(&memcg->kmem, &parent->kmem);
		/*
		 * No need to take a reference to the parent here: the
		 * cgroup core guarantees that a parent css outlives all
		 * of its children.
		 */
5462 } else {
5463 res_counter_init(&memcg->res, NULL);
5464 res_counter_init(&memcg->memsw, NULL);
5465 res_counter_init(&memcg->kmem, NULL);

		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense, so let the cgroup core know about this
		 * unfortunate state in our controller.
		 */
5471 if (parent != root_mem_cgroup)
5472 memory_cgrp_subsys.broken_hierarchy = true;
5473 }
5474 mutex_unlock(&memcg_create_mutex);
5475
5476 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
5477 if (ret)
5478 return ret;
5479
	/*
	 * Make sure the memcg is initialized: mem_cgroup_iter()
	 * orders reading memcg->initialized against its callers
	 * reading the memcg members.
	 */
5485 smp_store_release(&memcg->initialized, 1);
5486
5487 return 0;
5488}
5489
/*
 * Announce to all ancestors that a group from their hierarchy is gone,
 * so they drop their cached reclaim iterator positions.
 */
5493static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
5494{
5495 struct mem_cgroup *parent = memcg;
5496
5497 while ((parent = parent_mem_cgroup(parent)))
5498 mem_cgroup_iter_invalidate(parent);
5499
	/*
	 * If the root memcg is not hierarchical it is not reached by the
	 * loop above, so invalidate its iterators explicitly.
	 */
5504 if (!root_mem_cgroup->use_hierarchy)
5505 mem_cgroup_iter_invalidate(root_mem_cgroup);
5506}
5507
5508static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5509{
5510 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5511 struct mem_cgroup_event *event, *tmp;
5512 struct cgroup_subsys_state *iter;
5513
	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
5519 spin_lock(&memcg->event_list_lock);
5520 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5521 list_del_init(&event->list);
5522 schedule_work(&event->remove);
5523 }
5524 spin_unlock(&memcg->event_list_lock);
5525
5526 kmem_cgroup_css_offline(memcg);
5527
5528 mem_cgroup_invalidate_reclaim_iterators(memcg);
5529
	/*
	 * Reparent the charges of this memcg and all of its descendants
	 * while they are going offline.
	 */
5534 css_for_each_descendant_post(iter, css)
5535 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5536
5537 memcg_unregister_all_caches(memcg);
5538 vmpressure_cleanup(&memcg->vmpressure);
5539}
5540
5541static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5542{
5543 struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/*
	 * Most of the charges were reparented in css_offline(), but
	 * charging is not serialized against offlining: a charger looks
	 * up the memcg and charges the res_counter in separate RCU
	 * sections.  In particular, swapin charges find their memcg via
	 * the swapout record instead of the current task, and can
	 * therefore target a memcg whose tasks are all gone and which is
	 * already offline.  Reparent once more here so that any charges
	 * which slipped in after css_offline() are not leaked.
	 */
5579 mem_cgroup_reparent_charges(memcg);
5580
5581 memcg_destroy_kmem(memcg);
5582 __mem_cgroup_free(memcg);
5583}
5584
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations:
 * the memory, memsw and kmem limits and the soft limit.
 */
5598static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5599{
5600 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5601
5602 mem_cgroup_resize_limit(memcg, ULLONG_MAX);
5603 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
5604 memcg_update_kmem_limit(memcg, ULLONG_MAX);
5605 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
5606}
5607
5608#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
5610static int mem_cgroup_do_precharge(unsigned long count)
5611{
5612 int ret;
5613
	/* Try a single bulk charge without reclaim first */
5615 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
5616 if (!ret) {
5617 mc.precharge += count;
5618 return ret;
5619 }
5620 if (ret == -EINTR) {
5621 cancel_charge(root_mem_cgroup, count);
5622 return ret;
5623 }
5624
	/* Try charges one by one with reclaim */
5626 while (count--) {
5627 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
		/*
		 * In case of failure, any residual charges against
		 * mc.to will be dropped by mem_cgroup_clear_mc()
		 * later on.  However, cancel any charges that are
		 * bypassed to root right away or they'll be lost.
		 */
5634 if (ret == -EINTR)
5635 cancel_charge(root_mem_cgroup, 1);
5636 if (ret)
5637 return ret;
5638 mc.precharge++;
5639 cond_resched();
5640 }
5641 return 0;
5642}
5643
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored (can be NULL)
 *
 * Returns
 *   MC_TARGET_NONE: if the pte is not a target for move charge.
 *   MC_TARGET_PAGE: if the page corresponding to this pte is a target for
 *     move charge.  If @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers must drop it).
 *   MC_TARGET_SWAP: if the swap entry corresponding to this pte is a
 *     target for charge migration.  If @target is not NULL, the entry is
 *     stored in target->ent.
 *
 * Called with pte lock held.
 */
5662union mc_target {
5663 struct page *page;
5664 swp_entry_t ent;
5665};
5666
5667enum mc_target_type {
5668 MC_TARGET_NONE = 0,
5669 MC_TARGET_PAGE,
5670 MC_TARGET_SWAP,
5671};
5672
5673static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5674 unsigned long addr, pte_t ptent)
5675{
5676 struct page *page = vm_normal_page(vma, addr, ptent);
5677
5678 if (!page || !page_mapped(page))
5679 return NULL;
5680 if (PageAnon(page)) {
5681
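		/* Skip anon pages unless anon charge moving is enabled. */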
5682 if (!move_anon())
5683 return NULL;
5684 } else if (!move_file())
5685
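		/* Skip file pages unless file charge moving is enabled. */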
5686 return NULL;
5687 if (!get_page_unless_zero(page))
5688 return NULL;
5689
5690 return page;
5691}
5692
5693#ifdef CONFIG_SWAP
5694static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5695 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5696{
5697 struct page *page = NULL;
5698 swp_entry_t ent = pte_to_swp_entry(ptent);
5699
5700 if (!move_anon() || non_swap_entry(ent))
5701 return NULL;
	/*
	 * Because lookup_swap_cache() updates some statistics counters,
	 * we call find_get_page() on the swap address space directly.
	 */
5706 page = find_get_page(swap_address_space(ent), ent.val);
5707 if (do_swap_account)
5708 entry->val = ent.val;
5709
5710 return page;
5711}
5712#else
5713static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5714 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5715{
5716 return NULL;
5717}
5718#endif
5719
5720static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5721 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5722{
5723 struct page *page = NULL;
5724 struct address_space *mapping;
5725 pgoff_t pgoff;
5726
5727 if (!vma->vm_file)
5728 return NULL;
5729 if (!move_file())
5730 return NULL;
5731
5732 mapping = vma->vm_file->f_mapping;
5733 if (pte_none(ptent))
5734 pgoff = linear_page_index(vma, addr);
5735 else
5736 pgoff = pte_to_pgoff(ptent);

	/* A page cache page is moved even if this task didn't fault it in. */
5739#ifdef CONFIG_SWAP
	/* shmem/tmpfs may have the page out on swap: account for that too. */
5741 if (shmem_mapping(mapping)) {
5742 page = find_get_entry(mapping, pgoff);
5743 if (radix_tree_exceptional_entry(page)) {
5744 swp_entry_t swp = radix_to_swp_entry(page);
5745 if (do_swap_account)
5746 *entry = swp;
5747 page = find_get_page(swap_address_space(swp), swp.val);
5748 }
5749 } else
5750 page = find_get_page(mapping, pgoff);
5751#else
5752 page = find_get_page(mapping, pgoff);
5753#endif
5754 return page;
5755}
5756
5757static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5758 unsigned long addr, pte_t ptent, union mc_target *target)
5759{
5760 struct page *page = NULL;
5761 struct page_cgroup *pc;
5762 enum mc_target_type ret = MC_TARGET_NONE;
5763 swp_entry_t ent = { .val = 0 };
5764
5765 if (pte_present(ptent))
5766 page = mc_handle_present_pte(vma, addr, ptent);
5767 else if (is_swap_pte(ptent))
5768 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5769 else if (pte_none(ptent) || pte_file(ptent))
5770 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5771
5772 if (!page && !ent.val)
5773 return ret;
5774 if (page) {
5775 pc = lookup_page_cgroup(page);
		/*
		 * Do only a loose check without serialization.
		 * mem_cgroup_move_account() checks the pc is valid or
		 * not under LRU exclusion.
		 */
5781 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5782 ret = MC_TARGET_PAGE;
5783 if (target)
5784 target->page = page;
5785 }
5786 if (!ret || !target)
5787 put_page(page);
5788 }
	/* There is a swap entry and a page doesn't exist or isn't charged */
5790 if (ent.val && !ret &&
5791 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5792 ret = MC_TARGET_SWAP;
5793 if (target)
5794 target->ent = ent;
5795 }
5796 return ret;
5797}
5798
5799#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider swapping or file mapped pages because THP does not
 * support them for now.
 * The caller should make sure that pmd_trans_huge(pmd) is true.
 */
5805static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5806 unsigned long addr, pmd_t pmd, union mc_target *target)
5807{
5808 struct page *page = NULL;
5809 struct page_cgroup *pc;
5810 enum mc_target_type ret = MC_TARGET_NONE;
5811
5812 page = pmd_page(pmd);
5813 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5814 if (!move_anon())
5815 return ret;
5816 pc = lookup_page_cgroup(page);
5817 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5818 ret = MC_TARGET_PAGE;
5819 if (target) {
5820 get_page(page);
5821 target->page = page;
5822 }
5823 }
5824 return ret;
5825}
5826#else
5827static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5828 unsigned long addr, pmd_t pmd, union mc_target *target)
5829{
5830 return MC_TARGET_NONE;
5831}
5832#endif
5833
5834static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5835 unsigned long addr, unsigned long end,
5836 struct mm_walk *walk)
5837{
5838 struct vm_area_struct *vma = walk->private;
5839 pte_t *pte;
5840 spinlock_t *ptl;
5841
5842 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
5843 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5844 mc.precharge += HPAGE_PMD_NR;
5845 spin_unlock(ptl);
5846 return 0;
5847 }
5848
5849 if (pmd_trans_unstable(pmd))
5850 return 0;
5851 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5852 for (; addr != end; pte++, addr += PAGE_SIZE)
5853 if (get_mctgt_type(vma, addr, *pte, NULL))
5854 mc.precharge++;
5855 pte_unmap_unlock(pte - 1, ptl);
5856 cond_resched();
5857
5858 return 0;
5859}
5860
5861static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5862{
5863 unsigned long precharge;
5864 struct vm_area_struct *vma;
5865
5866 down_read(&mm->mmap_sem);
5867 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5868 struct mm_walk mem_cgroup_count_precharge_walk = {
5869 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5870 .mm = mm,
5871 .private = vma,
5872 };
5873 if (is_vm_hugetlb_page(vma))
5874 continue;
5875 walk_page_range(vma->vm_start, vma->vm_end,
5876 &mem_cgroup_count_precharge_walk);
5877 }
5878 up_read(&mm->mmap_sem);
5879
5880 precharge = mc.precharge;
5881 mc.precharge = 0;
5882
5883 return precharge;
5884}
5885
5886static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5887{
5888 unsigned long precharge = mem_cgroup_count_precharge(mm);
5889
5890 VM_BUG_ON(mc.moving_task);
5891 mc.moving_task = current;
5892 return mem_cgroup_do_precharge(precharge);
5893}
5894
5895
5896static void __mem_cgroup_clear_mc(void)
5897{
5898 struct mem_cgroup *from = mc.from;
5899 struct mem_cgroup *to = mc.to;
5900 int i;
5901
	/* we must uncharge all the leftover precharges from mc.to */
5903 if (mc.precharge) {
5904 cancel_charge(mc.to, mc.precharge);
5905 mc.precharge = 0;
5906 }
5907
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge it here.
	 */
5911 if (mc.moved_charge) {
5912 cancel_charge(mc.from, mc.moved_charge);
5913 mc.moved_charge = 0;
5914 }
5915
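	/* we must fix up refcounts and memsw charges for moved swap entries */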
5916 if (mc.moved_swap) {
5917
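		/* uncharge the moved swap entries from the old cgroup's memsw counter */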
5918 if (!mem_cgroup_is_root(mc.from))
5919 res_counter_uncharge(&mc.from->memsw,
5920 PAGE_SIZE * mc.moved_swap);
5921
5922 for (i = 0; i < mc.moved_swap; i++)
5923 css_put(&mc.from->css);
5924
		/*
		 * we charged both to->res and to->memsw, so we should
		 * uncharge to->res here.
		 */
5929 if (!mem_cgroup_is_root(mc.to))
5930 res_counter_uncharge(&mc.to->res,
5931 PAGE_SIZE * mc.moved_swap);
5932
5933 mc.moved_swap = 0;
5934 }
5935 memcg_oom_recover(from);
5936 memcg_oom_recover(to);
5937 wake_up_all(&mc.waitq);
5938}
5939
5940static void mem_cgroup_clear_mc(void)
5941{
5942 struct mem_cgroup *from = mc.from;
5943
	/*
	 * We must clear mc.moving_task before waking up the waiters in
	 * __mem_cgroup_clear_mc().
	 */
5948 mc.moving_task = NULL;
5949 __mem_cgroup_clear_mc();
5950 spin_lock(&mc.lock);
5951 mc.from = NULL;
5952 mc.to = NULL;
5953 spin_unlock(&mc.lock);
5954 mem_cgroup_end_move(from);
5955}
5956
5957static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5958 struct cgroup_taskset *tset)
5959{
5960 struct task_struct *p = cgroup_taskset_first(tset);
5961 int ret = 0;
5962 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5963 unsigned long move_charge_at_immigrate;
5964
	/*
	 * We are now committed to this value whatever it is.  Changes in
	 * this tunable will only affect upcoming migrations, not the
	 * current one.  So we need to save it, and keep it going.
	 */
5970 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
5971 if (move_charge_at_immigrate) {
5972 struct mm_struct *mm;
5973 struct mem_cgroup *from = mem_cgroup_from_task(p);
5974
5975 VM_BUG_ON(from == memcg);
5976
5977 mm = get_task_mm(p);
5978 if (!mm)
5979 return 0;
5980
5981 if (mm->owner == p) {
5982 VM_BUG_ON(mc.from);
5983 VM_BUG_ON(mc.to);
5984 VM_BUG_ON(mc.precharge);
5985 VM_BUG_ON(mc.moved_charge);
5986 VM_BUG_ON(mc.moved_swap);
5987 mem_cgroup_start_move(from);
5988 spin_lock(&mc.lock);
5989 mc.from = from;
5990 mc.to = memcg;
5991 mc.immigrate_flags = move_charge_at_immigrate;
5992 spin_unlock(&mc.lock);
5993
5994
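			/* mc.moving_task is set later, in mem_cgroup_precharge_mc() */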
5995 ret = mem_cgroup_precharge_mc(mm);
5996 if (ret)
5997 mem_cgroup_clear_mc();
5998 }
5999 mmput(mm);
6000 }
6001 return ret;
6002}
6003
6004static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6005 struct cgroup_taskset *tset)
6006{
6007 mem_cgroup_clear_mc();
6008}
6009
6010static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6011 unsigned long addr, unsigned long end,
6012 struct mm_walk *walk)
6013{
6014 int ret = 0;
6015 struct vm_area_struct *vma = walk->private;
6016 pte_t *pte;
6017 spinlock_t *ptl;
6018 enum mc_target_type target_type;
6019 union mc_target target;
6020 struct page *page;
6021 struct page_cgroup *pc;
6022
	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for the page table
	 *    lock to be unlocked in __split_huge_page_splitting(), where the
	 *    main part of thp split is not executed yet.
	 */
6033 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6034 if (mc.precharge < HPAGE_PMD_NR) {
6035 spin_unlock(ptl);
6036 return 0;
6037 }
6038 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6039 if (target_type == MC_TARGET_PAGE) {
6040 page = target.page;
6041 if (!isolate_lru_page(page)) {
6042 pc = lookup_page_cgroup(page);
6043 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6044 pc, mc.from, mc.to)) {
6045 mc.precharge -= HPAGE_PMD_NR;
6046 mc.moved_charge += HPAGE_PMD_NR;
6047 }
6048 putback_lru_page(page);
6049 }
6050 put_page(page);
6051 }
6052 spin_unlock(ptl);
6053 return 0;
6054 }
6055
6056 if (pmd_trans_unstable(pmd))
6057 return 0;
6058retry:
6059 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6060 for (; addr != end; addr += PAGE_SIZE) {
6061 pte_t ptent = *(pte++);
6062 swp_entry_t ent;
6063
6064 if (!mc.precharge)
6065 break;
6066
6067 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6068 case MC_TARGET_PAGE:
6069 page = target.page;
6070 if (isolate_lru_page(page))
6071 goto put;
6072 pc = lookup_page_cgroup(page);
6073 if (!mem_cgroup_move_account(page, 1, pc,
6074 mc.from, mc.to)) {
6075 mc.precharge--;
6076
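				/* the charge is uncharged from mc.from later, in __mem_cgroup_clear_mc() */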
6077 mc.moved_charge++;
6078 }
6079 putback_lru_page(page);
6080put:
6081 put_page(page);
6082 break;
6083 case MC_TARGET_SWAP:
6084 ent = target.ent;
6085 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6086 mc.precharge--;
6087
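				/* refcounts and memsw charges are fixed up later, in __mem_cgroup_clear_mc() */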
6088 mc.moved_swap++;
6089 }
6090 break;
6091 default:
6092 break;
6093 }
6094 }
6095 pte_unmap_unlock(pte - 1, ptl);
6096 cond_resched();
6097
6098 if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in
		 * attach() phase.
		 */
6105 ret = mem_cgroup_do_precharge(1);
6106 if (!ret)
6107 goto retry;
6108 }
6109
6110 return ret;
6111}
6112
6113static void mem_cgroup_move_charge(struct mm_struct *mm)
6114{
6115 struct vm_area_struct *vma;
6116
6117 lru_add_drain_all();
6118retry:
6119 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem may be waiting in
		 * the waitq.  So we cancel all extra charges, wake up all
		 * waiters, and retry.  Because we cancel precharges, we
		 * might not be able to move enough charges, but moving
		 * charge is a best-effort feature anyway, so it wouldn't
		 * be a big problem.
		 */
6127 __mem_cgroup_clear_mc();
6128 cond_resched();
6129 goto retry;
6130 }
6131 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6132 int ret;
6133 struct mm_walk mem_cgroup_move_charge_walk = {
6134 .pmd_entry = mem_cgroup_move_charge_pte_range,
6135 .mm = mm,
6136 .private = vma,
6137 };
6138 if (is_vm_hugetlb_page(vma))
6139 continue;
6140 ret = walk_page_range(vma->vm_start, vma->vm_end,
6141 &mem_cgroup_move_charge_walk);
6142 if (ret)
			/*
			 * means we have consumed all precharges and failed in
			 * doing additional charge.  Just abandon here.
			 */
6147 break;
6148 }
6149 up_read(&mm->mmap_sem);
6150}
6151
6152static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6153 struct cgroup_taskset *tset)
6154{
6155 struct task_struct *p = cgroup_taskset_first(tset);
6156 struct mm_struct *mm = get_task_mm(p);
6157
6158 if (mm) {
6159 if (mc.to)
6160 mem_cgroup_move_charge(mm);
6161 mmput(mm);
6162 }
6163 if (mc.to)
6164 mem_cgroup_clear_mc();
6165}
6166#else
6167static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6168 struct cgroup_taskset *tset)
6169{
6170 return 0;
6171}
6172static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6173 struct cgroup_taskset *tset)
6174{
6175}
6176static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6177 struct cgroup_taskset *tset)
6178{
6179}
6180#endif
6181
6182
/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify whether we're attached to the default hierarchy on each mount
 * attempt.
 */
6187static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6188{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
6194 if (cgroup_on_dfl(root_css->cgroup))
6195 mem_cgroup_from_css(root_css)->use_hierarchy = true;
6196}
6197
6198struct cgroup_subsys memory_cgrp_subsys = {
6199 .css_alloc = mem_cgroup_css_alloc,
6200 .css_online = mem_cgroup_css_online,
6201 .css_offline = mem_cgroup_css_offline,
6202 .css_free = mem_cgroup_css_free,
6203 .css_reset = mem_cgroup_css_reset,
6204 .can_attach = mem_cgroup_can_attach,
6205 .cancel_attach = mem_cgroup_cancel_attach,
6206 .attach = mem_cgroup_move_task,
6207 .bind = mem_cgroup_bind,
6208 .legacy_cftypes = mem_cgroup_files,
6209 .early_init = 0,
6210};
6211
6212#ifdef CONFIG_MEMCG_SWAP
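/*
 * Parse the "swapaccount=" boot parameter: "swapaccount=0" disables and
 * "swapaccount=1" enables memory+swap accounting at boot.
 */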
6213static int __init enable_swap_account(char *s)
6214{
6215 if (!strcmp(s, "1"))
6216 really_do_swap_account = 1;
6217 else if (!strcmp(s, "0"))
6218 really_do_swap_account = 0;
6219 return 1;
6220}
6221__setup("swapaccount=", enable_swap_account);
6222
6223static void __init memsw_file_init(void)
6224{
6225 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6226 memsw_cgroup_files));
6227}
6228
6229static void __init enable_swap_cgroup(void)
6230{
6231 if (!mem_cgroup_disabled() && really_do_swap_account) {
6232 do_swap_account = 1;
6233 memsw_file_init();
6234 }
6235}
6236
6237#else
6238static void __init enable_swap_cgroup(void)
6239{
6240}
6241#endif
6242
6243#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
6251void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6252{
6253 struct page_cgroup *pc;
6254 unsigned short oldid;
6255
6256 VM_BUG_ON_PAGE(PageLRU(page), page);
6257 VM_BUG_ON_PAGE(page_count(page), page);
6258
6259 if (!do_swap_account)
6260 return;
6261
6262 pc = lookup_page_cgroup(page);
6263
6264
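	/* Readahead page, never charged */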
6265 if (!PageCgroupUsed(pc))
6266 return;
6267
6268 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
6269
6270 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6271 VM_BUG_ON_PAGE(oldid, page);
6272
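	/*
	 * The memsw charge now lives in the swap record set up above.  Keep
	 * a css reference so the memcg stays around until the entry is
	 * released in mem_cgroup_uncharge_swap().
	 */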
6273 pc->flags &= ~PCG_MEMSW;
6274 css_get(&pc->mem_cgroup->css);
6275 mem_cgroup_swap_statistics(pc->mem_cgroup, true);
6276}
6277
/**
 * mem_cgroup_uncharge_swap - uncharge a swap entry
 * @entry: swap entry to uncharge
 *
 * Drop the memsw charge associated with @entry.
 */
6284void mem_cgroup_uncharge_swap(swp_entry_t entry)
6285{
6286 struct mem_cgroup *memcg;
6287 unsigned short id;
6288
6289 if (!do_swap_account)
6290 return;
6291
6292 id = swap_cgroup_record(entry, 0);
6293 rcu_read_lock();
6294 memcg = mem_cgroup_lookup(id);
6295 if (memcg) {
6296 if (!mem_cgroup_is_root(memcg))
6297 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
6298 mem_cgroup_swap_statistics(memcg, false);
6299 css_put(&memcg->css);
6300 }
6301 rcu_read_unlock();
6302}
6303#endif
6304
/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge().  Or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
6322int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6323 gfp_t gfp_mask, struct mem_cgroup **memcgp)
6324{
6325 struct mem_cgroup *memcg = NULL;
6326 unsigned int nr_pages = 1;
6327 int ret = 0;
6328
6329 if (mem_cgroup_disabled())
6330 goto out;
6331
6332 if (PageSwapCache(page)) {
6333 struct page_cgroup *pc = lookup_page_cgroup(page);
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
6341 if (PageCgroupUsed(pc))
6342 goto out;
6343 }
6344
6345 if (PageTransHuge(page)) {
6346 nr_pages <<= compound_order(page);
6347 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6348 }
6349
6350 if (do_swap_account && PageSwapCache(page))
6351 memcg = try_get_mem_cgroup_from_page(page);
6352 if (!memcg)
6353 memcg = get_mem_cgroup_from_mm(mm);
6354
6355 ret = try_charge(memcg, gfp_mask, nr_pages);
6356
6357 css_put(&memcg->css);
6358
6359 if (ret == -EINTR) {
6360 memcg = root_mem_cgroup;
6361 ret = 0;
6362 }
6363out:
6364 *memcgp = memcg;
6365 return ret;
6366}
6367
/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * Pass @lrucare if the page might already be on an LRU list (e.g. a
 * swapcache page); otherwise the page must not be on the LRU during
 * the commit.
 *
 * Use mem_cgroup_cancel_charge() to abort the transaction instead.
 */
6384void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6385 bool lrucare)
6386{
6387 unsigned int nr_pages = 1;
6388
6389 VM_BUG_ON_PAGE(!page->mapping, page);
6390 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6391
6392 if (mem_cgroup_disabled())
6393 return;
6394
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
6399 if (!memcg)
6400 return;
6401
6402 commit_charge(page, memcg, lrucare);
6403
6404 if (PageTransHuge(page)) {
6405 nr_pages <<= compound_order(page);
6406 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6407 }
6408
6409 local_irq_disable();
6410 mem_cgroup_charge_statistics(memcg, page, nr_pages);
6411 memcg_check_events(memcg, page);
6412 local_irq_enable();
6413
6414 if (do_swap_account && PageSwapCache(page)) {
6415 swp_entry_t entry = { .val = page_private(page) };
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
6421 mem_cgroup_uncharge_swap(entry);
6422 }
6423}
6424
/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
6432void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6433{
6434 unsigned int nr_pages = 1;
6435
6436 if (mem_cgroup_disabled())
6437 return;
6438
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
6443 if (!memcg)
6444 return;
6445
6446 if (PageTransHuge(page)) {
6447 nr_pages <<= compound_order(page);
6448 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6449 }
6450
6451 cancel_charge(memcg, nr_pages);
6452}
6453
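/*
 * Flush one batch of uncharges that all belong to the same memcg: return
 * the pages to the res_counters and update the per-cpu statistics.
 */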
6454static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6455 unsigned long nr_mem, unsigned long nr_memsw,
6456 unsigned long nr_anon, unsigned long nr_file,
6457 unsigned long nr_huge, struct page *dummy_page)
6458{
6459 unsigned long flags;
6460
6461 if (!mem_cgroup_is_root(memcg)) {
6462 if (nr_mem)
6463 res_counter_uncharge(&memcg->res,
6464 nr_mem * PAGE_SIZE);
6465 if (nr_memsw)
6466 res_counter_uncharge(&memcg->memsw,
6467 nr_memsw * PAGE_SIZE);
6468 memcg_oom_recover(memcg);
6469 }
6470
6471 local_irq_save(flags);
6472 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
6473 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
6474 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
6475 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
6476 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
6477 memcg_check_events(memcg, dummy_page);
6478 local_irq_restore(flags);
6479}
6480
6481static void uncharge_list(struct list_head *page_list)
6482{
6483 struct mem_cgroup *memcg = NULL;
6484 unsigned long nr_memsw = 0;
6485 unsigned long nr_anon = 0;
6486 unsigned long nr_file = 0;
6487 unsigned long nr_huge = 0;
6488 unsigned long pgpgout = 0;
6489 unsigned long nr_mem = 0;
6490 struct list_head *next;
6491 struct page *page;
6492
6493 next = page_list->next;
6494 do {
6495 unsigned int nr_pages = 1;
6496 struct page_cgroup *pc;
6497
6498 page = list_entry(next, struct page, lru);
6499 next = page->lru.next;
6500
6501 VM_BUG_ON_PAGE(PageLRU(page), page);
6502 VM_BUG_ON_PAGE(page_count(page), page);
6503
6504 pc = lookup_page_cgroup(page);
6505 if (!PageCgroupUsed(pc))
6506 continue;
6507
		/*
		 * Nobody should be changing or seriously looking at
		 * pc->mem_cgroup and pc->flags at this point, we have
		 * fully exclusive access to the page.
		 */
6514 if (memcg != pc->mem_cgroup) {
6515 if (memcg) {
6516 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
6517 nr_anon, nr_file, nr_huge, page);
6518 pgpgout = nr_mem = nr_memsw = 0;
6519 nr_anon = nr_file = nr_huge = 0;
6520 }
6521 memcg = pc->mem_cgroup;
6522 }
6523
6524 if (PageTransHuge(page)) {
6525 nr_pages <<= compound_order(page);
6526 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6527 nr_huge += nr_pages;
6528 }
6529
6530 if (PageAnon(page))
6531 nr_anon += nr_pages;
6532 else
6533 nr_file += nr_pages;
6534
6535 if (pc->flags & PCG_MEM)
6536 nr_mem += nr_pages;
6537 if (pc->flags & PCG_MEMSW)
6538 nr_memsw += nr_pages;
6539 pc->flags = 0;
6540
6541 pgpgout++;
6542 } while (next != page_list);
6543
6544 if (memcg)
6545 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
6546 nr_anon, nr_file, nr_huge, page);
6547}
6548
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
6556void mem_cgroup_uncharge(struct page *page)
6557{
6558 struct page_cgroup *pc;
6559
6560 if (mem_cgroup_disabled())
6561 return;
6562
6563
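	/* Don't touch page->lru of any random page, pre-check: */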
6564 pc = lookup_page_cgroup(page);
6565 if (!PageCgroupUsed(pc))
6566 return;
6567
6568 INIT_LIST_HEAD(&page->lru);
6569 uncharge_list(&page->lru);
6570}
6571
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
6579void mem_cgroup_uncharge_list(struct list_head *page_list)
6580{
6581 if (mem_cgroup_disabled())
6582 return;
6583
6584 if (!list_empty(page_list))
6585 uncharge_list(page_list);
6586}
6587
/**
 * mem_cgroup_migrate - migrate a charge to another page
 * @oldpage: currently charged page
 * @newpage: page to transfer the charge to
 * @lrucare: either or both pages might be on the LRU already
 *
 * Migrate the charge from @oldpage to @newpage.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6598void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6599 bool lrucare)
6600{
6601 struct page_cgroup *pc;
6602 int isolated;
6603
6604 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6605 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6606 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
6607 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
6608 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6609 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6610 newpage);
6611
6612 if (mem_cgroup_disabled())
6613 return;
6614
6615
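	/* Page cache replacement: new page already charged? */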
6616 pc = lookup_page_cgroup(newpage);
6617 if (PageCgroupUsed(pc))
6618 return;
6619
6620
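	/* Re-entrant migration: old page already uncharged? */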
6621 pc = lookup_page_cgroup(oldpage);
6622 if (!PageCgroupUsed(pc))
6623 return;
6624
6625 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6626 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6627
6628 if (lrucare)
6629 lock_page_lru(oldpage, &isolated);
6630
6631 pc->flags = 0;
6632
6633 if (lrucare)
6634 unlock_page_lru(oldpage, isolated);
6635
6636 commit_charge(newpage, pc->mem_cgroup, lrucare);
6637}
6638
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like hotcpu_notifier() have to be initialized from this
 * context because of lock dependencies, but basically everything that
 * doesn't depend on a specific mem_cgroup structure should be
 * initialized from here.
 */
6647static int __init mem_cgroup_init(void)
6648{
6649 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6650 enable_swap_cgroup();
6651 mem_cgroup_soft_limit_tree_init();
6652 memcg_stock_init();
6653 return 0;
6654}
6655subsys_initcall(mem_cgroup_init);
6656