#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/tcp_memcontrol.h>

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Whether memory+swap accounting is active. */
int do_swap_account __read_mostly;

/* Remembers the boot-time default for swap accounting. */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif
/*
 * Per-memcg statistics, maintained as per-cpu counters.
 */
enum mem_cgroup_stat_index {
	/*
	 * For charge accounting: usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file mapped */
	MEM_CGROUP_STAT_SWAP,		/* # of swap entries charged */
	MEM_CGROUP_STAT_NSTATS,
};

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"mapped_file",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

/*
 * Per-memcg event targets: the expensive checks (threshold notification,
 * soft-limit tree update, per-node scan info) are only performed once
 * every N page events, where N is the per-target value below.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/* css_id of the last scanned hierarchy member */
	int position;
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare array used while thresholds are being added or removed;
	 * it is swapped with the primary array under thresholds_lock.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller accounts
 * both page cache and RSS per cgroup: each mem_cgroup carries a pair of
 * res_counters (memory and memory+swap), per-node/per-zone LRU state,
 * per-cpu statistics, and the machinery for thresholds, OOM handling and
 * charge moving at task migration.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	union {
		/*
		 * the counter to account for mem+swap usage.
		 */
		struct res_counter memsw;

		/*
		 * rcu_freeing is used only when freeing struct mem_cgroup,
		 * so put it into a union to avoid wasting more memory.
		 * It must be disjoint from the css field; sharing space
		 * with the less interesting memsw counter keeps res intact
		 * for debugging even at free time.
		 */
		struct rcu_head rcu_freeing;
		/*
		 * We also need some space for a worker in deferred freeing.
		 * By the time we call it, rcu_freeing is no longer in use.
		 */
		struct work_struct work_freeing;
	};

	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;
	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;

	bool		oom_lock;
	atomic_t	under_oom;

	atomic_t	refcnt;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

#ifdef CONFIG_INET
	struct tcp_memcontrol tcp_mem;
#endif
};

/*
 * Stuff for moving charges at task migration.
 *
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as
 * a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
					&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in soft limit reclaim, used to prevent infinite loops
 * if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/*
 * for encoding cft->private value on file,
 * e.g. a cftype encodes .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT)
 */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);

static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return container_of(s, struct mem_cgroup, css);
}
413
414
415#ifdef CONFIG_MEMCG_KMEM
416#include <net/sock.h>
417#include <net/ip.h>
418
419static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
420void sock_update_memcg(struct sock *sk)
421{
422 if (mem_cgroup_sockets_enabled) {
423 struct mem_cgroup *memcg;
424 struct cg_proto *cg_proto;
425
426 BUG_ON(!sk->sk_prot->proto_cgroup);
427
428
429
430
431
432
433
434
435
436 if (sk->sk_cgrp) {
437 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
438 mem_cgroup_get(sk->sk_cgrp->memcg);
439 return;
440 }
441
442 rcu_read_lock();
443 memcg = mem_cgroup_from_task(current);
444 cg_proto = sk->sk_prot->proto_cgroup(memcg);
445 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
446 mem_cgroup_get(memcg);
447 sk->sk_cgrp = cg_proto;
448 }
449 rcu_read_unlock();
450 }
451}
452EXPORT_SYMBOL(sock_update_memcg);
453
454void sock_release_memcg(struct sock *sk)
455{
456 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
457 struct mem_cgroup *memcg;
458 WARN_ON(!sk->sk_cgrp->memcg);
459 memcg = sk->sk_cgrp->memcg;
460 mem_cgroup_put(memcg);
461 }
462}
463
464#ifdef CONFIG_INET
465struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
466{
467 if (!memcg || mem_cgroup_is_root(memcg))
468 return NULL;
469
470 return &memcg->tcp_mem.cg_proto;
471}
472EXPORT_SYMBOL(tcp_proto_cgroup);
473#endif
474#endif
475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg)
478{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
480 return;
481 static_key_slow_dec(&memcg_socket_limit_enabled);
482}
483#else
484static void disarm_sock_keys(struct mem_cgroup *memcg)
485{
486}
487#endif
488
489static void drain_all_stock_async(struct mem_cgroup *memcg);
490
491static struct mem_cgroup_per_zone *
492mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
493{
494 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
495}
496
497struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
498{
499 return &memcg->css;
500}
501
502static struct mem_cgroup_per_zone *
503page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
504{
505 int nid = page_to_nid(page);
506 int zid = page_zonenum(page);
507
508 return mem_cgroup_zoneinfo(memcg, nid, zid);
509}
510
511static struct mem_cgroup_tree_per_zone *
512soft_limit_tree_node_zone(int nid, int zid)
513{
514 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
515}
516
517static struct mem_cgroup_tree_per_zone *
518soft_limit_tree_from_page(struct page *page)
519{
520 int nid = page_to_nid(page);
521 int zid = page_zonenum(page);
522
523 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
524}
525
526static void
527__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
528 struct mem_cgroup_per_zone *mz,
529 struct mem_cgroup_tree_per_zone *mctz,
530 unsigned long long new_usage_in_excess)
531{
532 struct rb_node **p = &mctz->rb_root.rb_node;
533 struct rb_node *parent = NULL;
534 struct mem_cgroup_per_zone *mz_node;
535
536 if (mz->on_tree)
537 return;
538
539 mz->usage_in_excess = new_usage_in_excess;
540 if (!mz->usage_in_excess)
541 return;
542 while (*p) {
543 parent = *p;
544 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
545 tree_node);
546 if (mz->usage_in_excess < mz_node->usage_in_excess)
547 p = &(*p)->rb_left;
548
549
550
551
552 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
553 p = &(*p)->rb_right;
554 }
555 rb_link_node(&mz->tree_node, parent, p);
556 rb_insert_color(&mz->tree_node, &mctz->rb_root);
557 mz->on_tree = true;
558}
559
560static void
561__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
562 struct mem_cgroup_per_zone *mz,
563 struct mem_cgroup_tree_per_zone *mctz)
564{
565 if (!mz->on_tree)
566 return;
567 rb_erase(&mz->tree_node, &mctz->rb_root);
568 mz->on_tree = false;
569}
570
571static void
572mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
573 struct mem_cgroup_per_zone *mz,
574 struct mem_cgroup_tree_per_zone *mctz)
575{
576 spin_lock(&mctz->lock);
577 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
578 spin_unlock(&mctz->lock);
579}
580
581
582static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
583{
584 unsigned long long excess;
585 struct mem_cgroup_per_zone *mz;
586 struct mem_cgroup_tree_per_zone *mctz;
587 int nid = page_to_nid(page);
588 int zid = page_zonenum(page);
589 mctz = soft_limit_tree_from_page(page);
590
591
592
593
594
595 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
596 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
597 excess = res_counter_soft_limit_excess(&memcg->res);
598
599
600
601
602 if (excess || mz->on_tree) {
603 spin_lock(&mctz->lock);
604
605 if (mz->on_tree)
606 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
607
608
609
610
611 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
612 spin_unlock(&mctz->lock);
613 }
614 }
615}
616
617static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
618{
619 int node, zone;
620 struct mem_cgroup_per_zone *mz;
621 struct mem_cgroup_tree_per_zone *mctz;
622
623 for_each_node(node) {
624 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
625 mz = mem_cgroup_zoneinfo(memcg, node, zone);
626 mctz = soft_limit_tree_node_zone(node, zone);
627 mem_cgroup_remove_exceeded(memcg, mz, mctz);
628 }
629 }
630}
631
632static struct mem_cgroup_per_zone *
633__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
634{
635 struct rb_node *rightmost = NULL;
636 struct mem_cgroup_per_zone *mz;
637
638retry:
639 mz = NULL;
640 rightmost = rb_last(&mctz->rb_root);
641 if (!rightmost)
642 goto done;
643
644 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
645
646
647
648
649
650 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
651 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
652 !css_tryget(&mz->memcg->css))
653 goto retry;
654done:
655 return mz;
656}
657
658static struct mem_cgroup_per_zone *
659mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
660{
661 struct mem_cgroup_per_zone *mz;
662
663 spin_lock(&mctz->lock);
664 mz = __mem_cgroup_largest_soft_limit_node(mctz);
665 spin_unlock(&mctz->lock);
666 return mz;
667}
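
/*
 * Implementation note: per-memcg statistics are kept in per-cpu counters
 * and summed up on read.  When a CPU goes offline its contribution is
 * folded into nocpu_base under pcp_counter_lock (see the CPU hotplug
 * callback below), so mem_cgroup_read_stat() adds that base in as well.
 */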
688static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
689 enum mem_cgroup_stat_index idx)
690{
691 long val = 0;
692 int cpu;
693
694 get_online_cpus();
695 for_each_online_cpu(cpu)
696 val += per_cpu(memcg->stat->count[idx], cpu);
697#ifdef CONFIG_HOTPLUG_CPU
698 spin_lock(&memcg->pcp_counter_lock);
699 val += memcg->nocpu_base.count[idx];
700 spin_unlock(&memcg->pcp_counter_lock);
701#endif
702 put_online_cpus();
703 return val;
704}
705
706static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
707 bool charge)
708{
709 int val = (charge) ? 1 : -1;
710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
711}
712
713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
714 enum mem_cgroup_events_index idx)
715{
716 unsigned long val = 0;
717 int cpu;
718
719 for_each_online_cpu(cpu)
720 val += per_cpu(memcg->stat->events[idx], cpu);
721#ifdef CONFIG_HOTPLUG_CPU
722 spin_lock(&memcg->pcp_counter_lock);
723 val += memcg->nocpu_base.events[idx];
724 spin_unlock(&memcg->pcp_counter_lock);
725#endif
726 return val;
727}
728
729static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
730 bool anon, int nr_pages)
731{
732 preempt_disable();
733
734
735
736
737
738 if (anon)
739 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
740 nr_pages);
741 else
742 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
743 nr_pages);
744
745
746 if (nr_pages > 0)
747 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
748 else {
749 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
750 nr_pages = -nr_pages;
751 }
752
753 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
754
755 preempt_enable();
756}
757
758unsigned long
759mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
760{
761 struct mem_cgroup_per_zone *mz;
762
763 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
764 return mz->lru_size[lru];
765}
766
767static unsigned long
768mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
769 unsigned int lru_mask)
770{
771 struct mem_cgroup_per_zone *mz;
772 enum lru_list lru;
773 unsigned long ret = 0;
774
775 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
776
777 for_each_lru(lru) {
778 if (BIT(lru) & lru_mask)
779 ret += mz->lru_size[lru];
780 }
781 return ret;
782}
783
784static unsigned long
785mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
786 int nid, unsigned int lru_mask)
787{
788 u64 total = 0;
789 int zid;
790
791 for (zid = 0; zid < MAX_NR_ZONES; zid++)
792 total += mem_cgroup_zone_nr_lru_pages(memcg,
793 nid, zid, lru_mask);
794
795 return total;
796}
797
798static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
799 unsigned int lru_mask)
800{
801 int nid;
802 u64 total = 0;
803
804 for_each_node_state(nid, N_HIGH_MEMORY)
805 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
806 return total;
807}
808
809static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
810 enum mem_cgroup_events_target target)
811{
812 unsigned long val, next;
813
814 val = __this_cpu_read(memcg->stat->nr_page_events);
815 next = __this_cpu_read(memcg->stat->targets[target]);
816
817 if ((long)next - (long)val < 0) {
818 switch (target) {
819 case MEM_CGROUP_TARGET_THRESH:
820 next = val + THRESHOLDS_EVENTS_TARGET;
821 break;
822 case MEM_CGROUP_TARGET_SOFTLIMIT:
823 next = val + SOFTLIMIT_EVENTS_TARGET;
824 break;
825 case MEM_CGROUP_TARGET_NUMAINFO:
826 next = val + NUMAINFO_EVENTS_TARGET;
827 break;
828 default:
829 break;
830 }
831 __this_cpu_write(memcg->stat->targets[target], next);
832 return true;
833 }
834 return false;
835}
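
/*
 * Check events in order.  Called on every charge/uncharge;
 * mem_cgroup_event_ratelimit() decides whether the expensive
 * threshold/soft-limit/numainfo work is done on this event.
 */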
841static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
842{
843 preempt_disable();
844
845 if (unlikely(mem_cgroup_event_ratelimit(memcg,
846 MEM_CGROUP_TARGET_THRESH))) {
847 bool do_softlimit;
848 bool do_numainfo __maybe_unused;
849
850 do_softlimit = mem_cgroup_event_ratelimit(memcg,
851 MEM_CGROUP_TARGET_SOFTLIMIT);
852#if MAX_NUMNODES > 1
853 do_numainfo = mem_cgroup_event_ratelimit(memcg,
854 MEM_CGROUP_TARGET_NUMAINFO);
855#endif
856 preempt_enable();
857
858 mem_cgroup_threshold(memcg);
859 if (unlikely(do_softlimit))
860 mem_cgroup_update_tree(memcg, page);
861#if MAX_NUMNODES > 1
862 if (unlikely(do_numainfo))
863 atomic_inc(&memcg->numainfo_events);
864#endif
865 } else
866 preempt_enable();
867}
868
869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
870{
871 return mem_cgroup_from_css(
872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
873}
874
875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
876{
877
878
879
880
881
882 if (unlikely(!p))
883 return NULL;
884
885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
886}
887
888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
889{
890 struct mem_cgroup *memcg = NULL;
891
892 if (!mm)
893 return NULL;
894
895
896
897
898
899 rcu_read_lock();
900 do {
901 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
902 if (unlikely(!memcg))
903 break;
904 } while (!css_tryget(&memcg->css));
905 rcu_read_unlock();
906 return memcg;
907}
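
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent invocations
 * for reference counting, or use mem_cgroup_iter_break() to cancel a
 * hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent reclaimers
 * operating on the same zone and priority.
 */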
926struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
927 struct mem_cgroup *prev,
928 struct mem_cgroup_reclaim_cookie *reclaim)
929{
930 struct mem_cgroup *memcg = NULL;
931 int id = 0;
932
933 if (mem_cgroup_disabled())
934 return NULL;
935
936 if (!root)
937 root = root_mem_cgroup;
938
939 if (prev && !reclaim)
940 id = css_id(&prev->css);
941
942 if (prev && prev != root)
943 css_put(&prev->css);
944
945 if (!root->use_hierarchy && root != root_mem_cgroup) {
946 if (prev)
947 return NULL;
948 return root;
949 }
950
951 while (!memcg) {
952 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
953 struct cgroup_subsys_state *css;
954
955 if (reclaim) {
956 int nid = zone_to_nid(reclaim->zone);
957 int zid = zone_idx(reclaim->zone);
958 struct mem_cgroup_per_zone *mz;
959
960 mz = mem_cgroup_zoneinfo(root, nid, zid);
961 iter = &mz->reclaim_iter[reclaim->priority];
962 if (prev && reclaim->generation != iter->generation)
963 return NULL;
964 id = iter->position;
965 }
966
967 rcu_read_lock();
968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
969 if (css) {
970 if (css == &root->css || css_tryget(css))
971 memcg = mem_cgroup_from_css(css);
972 } else
973 id = 0;
974 rcu_read_unlock();
975
976 if (reclaim) {
977 iter->position = id;
978 if (!css)
979 iter->generation++;
980 else if (!prev && memcg)
981 reclaim->generation = iter->generation;
982 }
983
984 if (prev && !css)
985 return NULL;
986 }
987 return memcg;
988}
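
/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */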
995void mem_cgroup_iter_break(struct mem_cgroup *root,
996 struct mem_cgroup *prev)
997{
998 if (!root)
999 root = root_mem_cgroup;
1000 if (prev && prev != root)
1001 css_put(&prev->css);
1002}
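
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used.
 */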
1009#define for_each_mem_cgroup_tree(iter, root) \
1010 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1011 iter != NULL; \
1012 iter = mem_cgroup_iter(root, iter, NULL))
1013
1014#define for_each_mem_cgroup(iter) \
1015 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1016 iter != NULL; \
1017 iter = mem_cgroup_iter(NULL, iter, NULL))
1018
1019static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1020{
1021 return (memcg == root_mem_cgroup);
1022}
1023
1024void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1025{
1026 struct mem_cgroup *memcg;
1027
1028 if (!mm)
1029 return;
1030
1031 rcu_read_lock();
1032 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1033 if (unlikely(!memcg))
1034 goto out;
1035
1036 switch (idx) {
1037 case PGFAULT:
1038 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1039 break;
1040 case PGMAJFAULT:
1041 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1042 break;
1043 default:
1044 BUG();
1045 }
1046out:
1047 rcu_read_unlock();
1048}
1049EXPORT_SYMBOL(mem_cgroup_count_vm_event);
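
/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg; this is the zone's own lruvec if the memory controller is
 * disabled.
 */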
1060struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1061 struct mem_cgroup *memcg)
1062{
1063 struct mem_cgroup_per_zone *mz;
1064
1065 if (mem_cgroup_disabled())
1066 return &zone->lruvec;
1067
1068 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1069 return &mz->lruvec;
1070}
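
/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 *
 * Pages that are off the LRU and not charged (!PageCgroupUsed) are
 * accounted to the root memcg here, so the returned lruvec is always
 * valid.
 */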
1091struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1092{
1093 struct mem_cgroup_per_zone *mz;
1094 struct mem_cgroup *memcg;
1095 struct page_cgroup *pc;
1096
1097 if (mem_cgroup_disabled())
1098 return &zone->lruvec;
1099
1100 pc = lookup_page_cgroup(page);
1101 memcg = pc->mem_cgroup;
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1113 pc->mem_cgroup = memcg = root_mem_cgroup;
1114
1115 mz = page_cgroup_zoneinfo(memcg, page);
1116 return &mz->lruvec;
1117}
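
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */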
1128void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1129 int nr_pages)
1130{
1131 struct mem_cgroup_per_zone *mz;
1132 unsigned long *lru_size;
1133
1134 if (mem_cgroup_disabled())
1135 return;
1136
1137 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1138 lru_size = mz->lru_size + lru;
1139 *lru_size += nr_pages;
1140 VM_BUG_ON((long)(*lru_size) < 0);
1141}
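
/*
 * Checks whether given memcg is the same as root_memcg or lives in
 * root_memcg's hierarchy subtree.
 */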
1147bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1148 struct mem_cgroup *memcg)
1149{
1150 if (root_memcg == memcg)
1151 return true;
1152 if (!root_memcg->use_hierarchy || !memcg)
1153 return false;
1154 return css_is_ancestor(&memcg->css, &root_memcg->css);
1155}
1156
1157static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1158 struct mem_cgroup *memcg)
1159{
1160 bool ret;
1161
1162 rcu_read_lock();
1163 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1164 rcu_read_unlock();
1165 return ret;
1166}
1167
1168int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1169{
1170 int ret;
1171 struct mem_cgroup *curr = NULL;
1172 struct task_struct *p;
1173
1174 p = find_lock_task_mm(task);
1175 if (p) {
1176 curr = try_get_mem_cgroup_from_mm(p->mm);
1177 task_unlock(p);
1178 } else {
1179
1180
1181
1182
1183
1184 task_lock(task);
1185 curr = mem_cgroup_from_task(task);
1186 if (curr)
1187 css_get(&curr->css);
1188 task_unlock(task);
1189 }
1190 if (!curr)
1191 return 0;
1192
1193
1194
1195
1196
1197
1198 ret = mem_cgroup_same_or_subtree(memcg, curr);
1199 css_put(&curr->css);
1200 return ret;
1201}
1202
1203int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1204{
1205 unsigned long inactive_ratio;
1206 unsigned long inactive;
1207 unsigned long active;
1208 unsigned long gb;
1209
1210 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1211 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1212
1213 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1214 if (gb)
1215 inactive_ratio = int_sqrt(10 * gb);
1216 else
1217 inactive_ratio = 1;
1218
1219 return inactive * inactive_ratio < active;
1220}
1221
1222int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1223{
1224 unsigned long active;
1225 unsigned long inactive;
1226
1227 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1228 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1229
1230 return (active > inactive);
1231}
1232
1233#define mem_cgroup_from_res_counter(counter, member) \
1234 container_of(counter, struct mem_cgroup, member)
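
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */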
1243static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1244{
1245 unsigned long long margin;
1246
1247 margin = res_counter_margin(&memcg->res);
1248 if (do_swap_account)
1249 margin = min(margin, res_counter_margin(&memcg->memsw));
1250 return margin >> PAGE_SHIFT;
1251}
1252
1253int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1254{
1255 struct cgroup *cgrp = memcg->css.cgroup;
1256
1257
1258 if (cgrp->parent == NULL)
1259 return vm_swappiness;
1260
1261 return memcg->swappiness;
1262}
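
/*
 * memcg->moving_account is used to check whether some thread may be
 * moving pages out of this memcg.  A mover bumps the counter and then
 * calls synchronize_rcu() before it starts; updaters check the counter
 * under rcu_read_lock() and only then take the heavier move_lock.
 */
/* for quick checking without looking up memcg */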
1280atomic_t memcg_moving __read_mostly;
1281
1282static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1283{
1284 atomic_inc(&memcg_moving);
1285 atomic_inc(&memcg->moving_account);
1286 synchronize_rcu();
1287}
1288
1289static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1290{
1291
1292
1293
1294
1295 if (memcg) {
1296 atomic_dec(&memcg_moving);
1297 atomic_dec(&memcg->moving_account);
1298 }
1299}
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1314{
1315 VM_BUG_ON(!rcu_read_lock_held());
1316 return atomic_read(&memcg->moving_account) > 0;
1317}
1318
1319static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1320{
1321 struct mem_cgroup *from;
1322 struct mem_cgroup *to;
1323 bool ret = false;
1324
1325
1326
1327
1328 spin_lock(&mc.lock);
1329 from = mc.from;
1330 to = mc.to;
1331 if (!from)
1332 goto unlock;
1333
1334 ret = mem_cgroup_same_or_subtree(memcg, from)
1335 || mem_cgroup_same_or_subtree(memcg, to);
1336unlock:
1337 spin_unlock(&mc.lock);
1338 return ret;
1339}
1340
1341static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1342{
1343 if (mc.moving_task && current != mc.moving_task) {
1344 if (mem_cgroup_under_move(memcg)) {
1345 DEFINE_WAIT(wait);
1346 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1347
1348 if (mc.moving_task)
1349 schedule();
1350 finish_wait(&mc.waitq, &wait);
1351 return true;
1352 }
1353 }
1354 return false;
1355}
1356
1357
1358
1359
1360
1361
1362
1363static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1364 unsigned long *flags)
1365{
1366 spin_lock_irqsave(&memcg->move_lock, *flags);
1367}
1368
1369static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1370 unsigned long *flags)
1371{
1372 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1373}
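
/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */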
1383void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1384{
1385 struct cgroup *task_cgrp;
1386 struct cgroup *mem_cgrp;
1387
1388
1389
1390
1391
1392 static char memcg_name[PATH_MAX];
1393 int ret;
1394
1395 if (!memcg || !p)
1396 return;
1397
1398 rcu_read_lock();
1399
1400 mem_cgrp = memcg->css.cgroup;
1401 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1402
1403 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1404 if (ret < 0) {
1405
1406
1407
1408
1409 rcu_read_unlock();
1410 goto done;
1411 }
1412 rcu_read_unlock();
1413
1414 printk(KERN_INFO "Task in %s killed", memcg_name);
1415
1416 rcu_read_lock();
1417 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1418 if (ret < 0) {
1419 rcu_read_unlock();
1420 goto done;
1421 }
1422 rcu_read_unlock();
1423
1424
1425
1426
1427 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1428done:
1429
1430 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1431 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1432 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1433 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1434 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1435 "failcnt %llu\n",
1436 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1437 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1438 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1439}
1440
1441
1442
1443
1444
1445static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1446{
1447 int num = 0;
1448 struct mem_cgroup *iter;
1449
1450 for_each_mem_cgroup_tree(iter, memcg)
1451 num++;
1452 return num;
1453}
1454
1455
1456
1457
1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1459{
1460 u64 limit;
1461 u64 memsw;
1462
1463 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1464 limit += total_swap_pages << PAGE_SHIFT;
1465
1466 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1467
1468
1469
1470
1471 return min(limit, memsw);
1472}
1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483
1484
1485
1486
1487
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
				/* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1542 gfp_t gfp_mask,
1543 unsigned long flags)
1544{
1545 unsigned long total = 0;
1546 bool noswap = false;
1547 int loop;
1548
1549 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1550 noswap = true;
1551 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1552 noswap = true;
1553
1554 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1555 if (loop)
1556 drain_all_stock_async(memcg);
1557 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1558
1559
1560
1561
1562
1563 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1564 break;
1565 if (mem_cgroup_margin(memcg))
1566 break;
1567
1568
1569
1570
1571 if (loop && !total)
1572 break;
1573 }
1574 return total;
1575}
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1588 int nid, bool noswap)
1589{
1590 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1591 return true;
1592 if (noswap || !total_swap_pages)
1593 return false;
1594 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1595 return true;
1596 return false;
1597
1598}
1599#if MAX_NUMNODES > 1
1600
1601
1602
1603
1604
1605
1606
1607static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1608{
1609 int nid;
1610
1611
1612
1613
1614 if (!atomic_read(&memcg->numainfo_events))
1615 return;
1616 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1617 return;
1618
1619
1620 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1621
1622 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1623
1624 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1625 node_clear(nid, memcg->scan_nodes);
1626 }
1627
1628 atomic_set(&memcg->numainfo_events, 0);
1629 atomic_set(&memcg->numainfo_updating, 0);
1630}
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1645{
1646 int node;
1647
1648 mem_cgroup_may_update_nodemask(memcg);
1649 node = memcg->last_scanned_node;
1650
1651 node = next_node(node, memcg->scan_nodes);
1652 if (node == MAX_NUMNODES)
1653 node = first_node(memcg->scan_nodes);
1654
1655
1656
1657
1658
1659
1660 if (unlikely(node == MAX_NUMNODES))
1661 node = numa_node_id();
1662
1663 memcg->last_scanned_node = node;
1664 return node;
1665}
1666
1667
1668
1669
1670
1671
1672
1673static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1674{
1675 int nid;
1676
1677
1678
1679
1680
1681 if (!nodes_empty(memcg->scan_nodes)) {
1682 for (nid = first_node(memcg->scan_nodes);
1683 nid < MAX_NUMNODES;
1684 nid = next_node(nid, memcg->scan_nodes)) {
1685
1686 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1687 return true;
1688 }
1689 }
1690
1691
1692
1693 for_each_node_state(nid, N_HIGH_MEMORY) {
1694 if (node_isset(nid, memcg->scan_nodes))
1695 continue;
1696 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1697 return true;
1698 }
1699 return false;
1700}
1701
1702#else
1703int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1704{
1705 return 0;
1706}
1707
1708static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1709{
1710 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1711}
1712#endif
1713
1714static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1715 struct zone *zone,
1716 gfp_t gfp_mask,
1717 unsigned long *total_scanned)
1718{
1719 struct mem_cgroup *victim = NULL;
1720 int total = 0;
1721 int loop = 0;
1722 unsigned long excess;
1723 unsigned long nr_scanned;
1724 struct mem_cgroup_reclaim_cookie reclaim = {
1725 .zone = zone,
1726 .priority = 0,
1727 };
1728
1729 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1730
1731 while (1) {
1732 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1733 if (!victim) {
1734 loop++;
1735 if (loop >= 2) {
1736
1737
1738
1739
1740
1741 if (!total)
1742 break;
1743
1744
1745
1746
1747
1748
1749 if (total >= (excess >> 2) ||
1750 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1751 break;
1752 }
1753 continue;
1754 }
1755 if (!mem_cgroup_reclaimable(victim, false))
1756 continue;
1757 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1758 zone, &nr_scanned);
1759 *total_scanned += nr_scanned;
1760 if (!res_counter_soft_limit_excess(&root_memcg->res))
1761 break;
1762 }
1763 mem_cgroup_iter_break(root_memcg, victim);
1764 return total;
1765}
1766
1767
1768
1769
1770
1771
1772static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1773{
1774 struct mem_cgroup *iter, *failed = NULL;
1775
1776 for_each_mem_cgroup_tree(iter, memcg) {
1777 if (iter->oom_lock) {
1778
1779
1780
1781
1782 failed = iter;
1783 mem_cgroup_iter_break(memcg, iter);
1784 break;
1785 } else
1786 iter->oom_lock = true;
1787 }
1788
1789 if (!failed)
1790 return true;
1791
1792
1793
1794
1795
1796 for_each_mem_cgroup_tree(iter, memcg) {
1797 if (iter == failed) {
1798 mem_cgroup_iter_break(memcg, iter);
1799 break;
1800 }
1801 iter->oom_lock = false;
1802 }
1803 return false;
1804}
1805
1806
1807
1808
1809static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1810{
1811 struct mem_cgroup *iter;
1812
1813 for_each_mem_cgroup_tree(iter, memcg)
1814 iter->oom_lock = false;
1815 return 0;
1816}
1817
1818static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1819{
1820 struct mem_cgroup *iter;
1821
1822 for_each_mem_cgroup_tree(iter, memcg)
1823 atomic_inc(&iter->under_oom);
1824}
1825
1826static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1827{
1828 struct mem_cgroup *iter;
1829
1830
1831
1832
1833
1834
1835 for_each_mem_cgroup_tree(iter, memcg)
1836 atomic_add_unless(&iter->under_oom, -1, 0);
1837}
1838
1839static DEFINE_SPINLOCK(memcg_oom_lock);
1840static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1841
1842struct oom_wait_info {
1843 struct mem_cgroup *memcg;
1844 wait_queue_t wait;
1845};
1846
1847static int memcg_oom_wake_function(wait_queue_t *wait,
1848 unsigned mode, int sync, void *arg)
1849{
1850 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1851 struct mem_cgroup *oom_wait_memcg;
1852 struct oom_wait_info *oom_wait_info;
1853
1854 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1855 oom_wait_memcg = oom_wait_info->memcg;
1856
1857
1858
1859
1860
1861 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1862 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1863 return 0;
1864 return autoremove_wake_function(wait, mode, sync, arg);
1865}
1866
1867static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1868{
1869
1870 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1871}
1872
1873static void memcg_oom_recover(struct mem_cgroup *memcg)
1874{
1875 if (memcg && atomic_read(&memcg->under_oom))
1876 memcg_wakeup_oom(memcg);
1877}
1878
1879
1880
1881
1882static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1883 int order)
1884{
1885 struct oom_wait_info owait;
1886 bool locked, need_to_kill;
1887
1888 owait.memcg = memcg;
1889 owait.wait.flags = 0;
1890 owait.wait.func = memcg_oom_wake_function;
1891 owait.wait.private = current;
1892 INIT_LIST_HEAD(&owait.wait.task_list);
1893 need_to_kill = true;
1894 mem_cgroup_mark_under_oom(memcg);
1895
1896
1897 spin_lock(&memcg_oom_lock);
1898 locked = mem_cgroup_oom_lock(memcg);
1899
1900
1901
1902
1903
1904 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1905 if (!locked || memcg->oom_kill_disable)
1906 need_to_kill = false;
1907 if (locked)
1908 mem_cgroup_oom_notify(memcg);
1909 spin_unlock(&memcg_oom_lock);
1910
1911 if (need_to_kill) {
1912 finish_wait(&memcg_oom_waitq, &owait.wait);
1913 mem_cgroup_out_of_memory(memcg, mask, order);
1914 } else {
1915 schedule();
1916 finish_wait(&memcg_oom_waitq, &owait.wait);
1917 }
1918 spin_lock(&memcg_oom_lock);
1919 if (locked)
1920 mem_cgroup_oom_unlock(memcg);
1921 memcg_wakeup_oom(memcg);
1922 spin_unlock(&memcg_oom_lock);
1923
1924 mem_cgroup_unmark_under_oom(memcg);
1925
1926 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1927 return false;
1928
1929 schedule_timeout_uninterruptible(1);
1930 return true;
1931}
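
/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to other per-page stats.  Callers bracket the update with
 * __mem_cgroup_begin_update_page_stat()/__mem_cgroup_end_update_page_stat():
 * if charges are currently being moved out of this memcg
 * (mem_cgroup_stolen()), the per-memcg move_lock is taken so that
 * pc->mem_cgroup cannot change under the updater; otherwise RCU alone is
 * enough and no lock is taken.
 */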
1957void __mem_cgroup_begin_update_page_stat(struct page *page,
1958 bool *locked, unsigned long *flags)
1959{
1960 struct mem_cgroup *memcg;
1961 struct page_cgroup *pc;
1962
1963 pc = lookup_page_cgroup(page);
1964again:
1965 memcg = pc->mem_cgroup;
1966 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1967 return;
1968
1969
1970
1971
1972
1973
1974 if (!mem_cgroup_stolen(memcg))
1975 return;
1976
1977 move_lock_mem_cgroup(memcg, flags);
1978 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1979 move_unlock_mem_cgroup(memcg, flags);
1980 goto again;
1981 }
1982 *locked = true;
1983}
1984
1985void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1986{
1987 struct page_cgroup *pc = lookup_page_cgroup(page);
1988
1989
1990
1991
1992
1993
1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1995}
1996
1997void mem_cgroup_update_page_stat(struct page *page,
1998 enum mem_cgroup_page_stat_item idx, int val)
1999{
2000 struct mem_cgroup *memcg;
2001 struct page_cgroup *pc = lookup_page_cgroup(page);
2002 unsigned long uninitialized_var(flags);
2003
2004 if (mem_cgroup_disabled())
2005 return;
2006
2007 memcg = pc->mem_cgroup;
2008 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2009 return;
2010
2011 switch (idx) {
2012 case MEMCG_NR_FILE_MAPPED:
2013 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2014 break;
2015 default:
2016 BUG();
2017 }
2018
2019 this_cpu_add(memcg->stat->count[idx], val);
2020}
2021
2022
2023
2024
2025
2026#define CHARGE_BATCH 32U
2027struct memcg_stock_pcp {
2028 struct mem_cgroup *cached;
2029 unsigned int nr_pages;
2030 struct work_struct work;
2031 unsigned long flags;
2032#define FLUSHING_CACHED_CHARGE 0
2033};
2034static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2035static DEFINE_MUTEX(percpu_charge_mutex);
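
/*
 * Try to consume stocked charge on this cpu.  Returns true and consumes
 * one page from the local stock if the cached memcg matches @memcg and
 * the stock is not empty; returns false otherwise, in which case the
 * caller must fall back to res_counter charging.
 */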
2043static bool consume_stock(struct mem_cgroup *memcg)
2044{
2045 struct memcg_stock_pcp *stock;
2046 bool ret = true;
2047
2048 stock = &get_cpu_var(memcg_stock);
2049 if (memcg == stock->cached && stock->nr_pages)
2050 stock->nr_pages--;
2051 else
2052 ret = false;
2053 put_cpu_var(memcg_stock);
2054 return ret;
2055}
2056
2057
2058
2059
2060static void drain_stock(struct memcg_stock_pcp *stock)
2061{
2062 struct mem_cgroup *old = stock->cached;
2063
2064 if (stock->nr_pages) {
2065 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2066
2067 res_counter_uncharge(&old->res, bytes);
2068 if (do_swap_account)
2069 res_counter_uncharge(&old->memsw, bytes);
2070 stock->nr_pages = 0;
2071 }
2072 stock->cached = NULL;
2073}
2074
2075
2076
2077
2078
2079static void drain_local_stock(struct work_struct *dummy)
2080{
2081 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2082 drain_stock(stock);
2083 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2084}
2085
2086
2087
2088
2089
2090static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2091{
2092 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2093
2094 if (stock->cached != memcg) {
2095 drain_stock(stock);
2096 stock->cached = memcg;
2097 }
2098 stock->nr_pages += nr_pages;
2099 put_cpu_var(memcg_stock);
2100}
2101
2102
2103
2104
2105
2106
2107static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2108{
2109 int cpu, curcpu;
2110
2111
2112 get_online_cpus();
2113 curcpu = get_cpu();
2114 for_each_online_cpu(cpu) {
2115 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2116 struct mem_cgroup *memcg;
2117
2118 memcg = stock->cached;
2119 if (!memcg || !stock->nr_pages)
2120 continue;
2121 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2122 continue;
2123 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2124 if (cpu == curcpu)
2125 drain_local_stock(&stock->work);
2126 else
2127 schedule_work_on(cpu, &stock->work);
2128 }
2129 }
2130 put_cpu();
2131
2132 if (!sync)
2133 goto out;
2134
2135 for_each_online_cpu(cpu) {
2136 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2137 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2138 flush_work(&stock->work);
2139 }
2140out:
2141 put_online_cpus();
2142}
2143
2144
2145
2146
2147
2148
2149
2150static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2151{
2152
2153
2154
2155 if (!mutex_trylock(&percpu_charge_mutex))
2156 return;
2157 drain_all_stock(root_memcg, false);
2158 mutex_unlock(&percpu_charge_mutex);
2159}
2160
2161
2162static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2163{
2164
2165 mutex_lock(&percpu_charge_mutex);
2166 drain_all_stock(root_memcg, true);
2167 mutex_unlock(&percpu_charge_mutex);
2168}
2169
2170
2171
2172
2173
2174static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2175{
2176 int i;
2177
2178 spin_lock(&memcg->pcp_counter_lock);
2179 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2180 long x = per_cpu(memcg->stat->count[i], cpu);
2181
2182 per_cpu(memcg->stat->count[i], cpu) = 0;
2183 memcg->nocpu_base.count[i] += x;
2184 }
2185 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2186 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2187
2188 per_cpu(memcg->stat->events[i], cpu) = 0;
2189 memcg->nocpu_base.events[i] += x;
2190 }
2191 spin_unlock(&memcg->pcp_counter_lock);
2192}
2193
2194static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2195 unsigned long action,
2196 void *hcpu)
2197{
2198 int cpu = (unsigned long)hcpu;
2199 struct memcg_stock_pcp *stock;
2200 struct mem_cgroup *iter;
2201
2202 if (action == CPU_ONLINE)
2203 return NOTIFY_OK;
2204
2205 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2206 return NOTIFY_OK;
2207
2208 for_each_mem_cgroup(iter)
2209 mem_cgroup_drain_pcp_counter(iter, cpu);
2210
2211 stock = &per_cpu(memcg_stock, cpu);
2212 drain_stock(stock);
2213 return NOTIFY_OK;
2214}
2215
2216
2217
2218enum {
2219 CHARGE_OK,
2220 CHARGE_RETRY,
2221 CHARGE_NOMEM,
2222 CHARGE_WOULDBLOCK,
2223 CHARGE_OOM_DIE,
2224};
2225
2226static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2227 unsigned int nr_pages, bool oom_check)
2228{
2229 unsigned long csize = nr_pages * PAGE_SIZE;
2230 struct mem_cgroup *mem_over_limit;
2231 struct res_counter *fail_res;
2232 unsigned long flags = 0;
2233 int ret;
2234
2235 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2236
2237 if (likely(!ret)) {
2238 if (!do_swap_account)
2239 return CHARGE_OK;
2240 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2241 if (likely(!ret))
2242 return CHARGE_OK;
2243
2244 res_counter_uncharge(&memcg->res, csize);
2245 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2246 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2247 } else
2248 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2249
2250
2251
2252
2253
2254
2255
2256 if (nr_pages == CHARGE_BATCH)
2257 return CHARGE_RETRY;
2258
2259 if (!(gfp_mask & __GFP_WAIT))
2260 return CHARGE_WOULDBLOCK;
2261
2262 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2263 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2264 return CHARGE_RETRY;
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274 if (nr_pages == 1 && ret)
2275 return CHARGE_RETRY;
2276
2277
2278
2279
2280
2281 if (mem_cgroup_wait_acct_move(mem_over_limit))
2282 return CHARGE_RETRY;
2283
2284
2285 if (!oom_check)
2286 return CHARGE_NOMEM;
2287
2288 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2289 return CHARGE_OOM_DIE;
2290
2291 return CHARGE_RETRY;
2292}
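
/*
 * __mem_cgroup_try_charge() does
 * 1. detect memcg to be charged against from passed *mm and *ptr,
 * 2. update res_counter
 * 3. call memory reclaim if necessary.
 *
 * On success, *ptr holds a reference-counted memcg.  If the current task
 * is killed or is an OOM victim, the charge is bypassed to the root memcg
 * and -EINTR is returned; -ENOMEM is returned when the charge cannot be
 * satisfied.
 */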
2315static int __mem_cgroup_try_charge(struct mm_struct *mm,
2316 gfp_t gfp_mask,
2317 unsigned int nr_pages,
2318 struct mem_cgroup **ptr,
2319 bool oom)
2320{
2321 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2322 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2323 struct mem_cgroup *memcg = NULL;
2324 int ret;
2325
2326
2327
2328
2329
2330
2331 if (unlikely(test_thread_flag(TIF_MEMDIE)
2332 || fatal_signal_pending(current)))
2333 goto bypass;
2334
2335
2336
2337
2338
2339
2340
2341 if (!*ptr && !mm)
2342 *ptr = root_mem_cgroup;
2343again:
2344 if (*ptr) {
2345 memcg = *ptr;
2346 VM_BUG_ON(css_is_removed(&memcg->css));
2347 if (mem_cgroup_is_root(memcg))
2348 goto done;
2349 if (nr_pages == 1 && consume_stock(memcg))
2350 goto done;
2351 css_get(&memcg->css);
2352 } else {
2353 struct task_struct *p;
2354
2355 rcu_read_lock();
2356 p = rcu_dereference(mm->owner);
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367 memcg = mem_cgroup_from_task(p);
2368 if (!memcg)
2369 memcg = root_mem_cgroup;
2370 if (mem_cgroup_is_root(memcg)) {
2371 rcu_read_unlock();
2372 goto done;
2373 }
2374 if (nr_pages == 1 && consume_stock(memcg)) {
2375
2376
2377
2378
2379
2380
2381
2382
2383 rcu_read_unlock();
2384 goto done;
2385 }
2386
2387 if (!css_tryget(&memcg->css)) {
2388 rcu_read_unlock();
2389 goto again;
2390 }
2391 rcu_read_unlock();
2392 }
2393
2394 do {
2395 bool oom_check;
2396
2397
2398 if (fatal_signal_pending(current)) {
2399 css_put(&memcg->css);
2400 goto bypass;
2401 }
2402
2403 oom_check = false;
2404 if (oom && !nr_oom_retries) {
2405 oom_check = true;
2406 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2407 }
2408
2409 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2410 switch (ret) {
2411 case CHARGE_OK:
2412 break;
2413 case CHARGE_RETRY:
2414 batch = nr_pages;
2415 css_put(&memcg->css);
2416 memcg = NULL;
2417 goto again;
2418 case CHARGE_WOULDBLOCK:
2419 css_put(&memcg->css);
2420 goto nomem;
2421 case CHARGE_NOMEM:
2422 if (!oom) {
2423 css_put(&memcg->css);
2424 goto nomem;
2425 }
2426
2427 nr_oom_retries--;
2428 break;
2429 case CHARGE_OOM_DIE:
2430 css_put(&memcg->css);
2431 goto bypass;
2432 }
2433 } while (ret != CHARGE_OK);
2434
2435 if (batch > nr_pages)
2436 refill_stock(memcg, batch - nr_pages);
2437 css_put(&memcg->css);
2438done:
2439 *ptr = memcg;
2440 return 0;
2441nomem:
2442 *ptr = NULL;
2443 return -ENOMEM;
2444bypass:
2445 *ptr = root_mem_cgroup;
2446 return -EINTR;
2447}
2448
2449
2450
2451
2452
2453
2454static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2455 unsigned int nr_pages)
2456{
2457 if (!mem_cgroup_is_root(memcg)) {
2458 unsigned long bytes = nr_pages * PAGE_SIZE;
2459
2460 res_counter_uncharge(&memcg->res, bytes);
2461 if (do_swap_account)
2462 res_counter_uncharge(&memcg->memsw, bytes);
2463 }
2464}
2465
2466
2467
2468
2469
2470static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2471 unsigned int nr_pages)
2472{
2473 unsigned long bytes = nr_pages * PAGE_SIZE;
2474
2475 if (mem_cgroup_is_root(memcg))
2476 return;
2477
2478 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2479 if (do_swap_account)
2480 res_counter_uncharge_until(&memcg->memsw,
2481 memcg->memsw.parent, bytes);
2482}
2483
2484
2485
2486
2487
2488
2489
2490static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2491{
2492 struct cgroup_subsys_state *css;
2493
2494
2495 if (!id)
2496 return NULL;
2497 css = css_lookup(&mem_cgroup_subsys, id);
2498 if (!css)
2499 return NULL;
2500 return mem_cgroup_from_css(css);
2501}
2502
2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2504{
2505 struct mem_cgroup *memcg = NULL;
2506 struct page_cgroup *pc;
2507 unsigned short id;
2508 swp_entry_t ent;
2509
2510 VM_BUG_ON(!PageLocked(page));
2511
2512 pc = lookup_page_cgroup(page);
2513 lock_page_cgroup(pc);
2514 if (PageCgroupUsed(pc)) {
2515 memcg = pc->mem_cgroup;
2516 if (memcg && !css_tryget(&memcg->css))
2517 memcg = NULL;
2518 } else if (PageSwapCache(page)) {
2519 ent.val = page_private(page);
2520 id = lookup_swap_cgroup_id(ent);
2521 rcu_read_lock();
2522 memcg = mem_cgroup_lookup(id);
2523 if (memcg && !css_tryget(&memcg->css))
2524 memcg = NULL;
2525 rcu_read_unlock();
2526 }
2527 unlock_page_cgroup(pc);
2528 return memcg;
2529}
2530
2531static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2532 struct page *page,
2533 unsigned int nr_pages,
2534 enum charge_type ctype,
2535 bool lrucare)
2536{
2537 struct page_cgroup *pc = lookup_page_cgroup(page);
2538 struct zone *uninitialized_var(zone);
2539 struct lruvec *lruvec;
2540 bool was_on_lru = false;
2541 bool anon;
2542
2543 lock_page_cgroup(pc);
2544 VM_BUG_ON(PageCgroupUsed(pc));
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554 if (lrucare) {
2555 zone = page_zone(page);
2556 spin_lock_irq(&zone->lru_lock);
2557 if (PageLRU(page)) {
2558 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2559 ClearPageLRU(page);
2560 del_page_from_lru_list(page, lruvec, page_lru(page));
2561 was_on_lru = true;
2562 }
2563 }
2564
2565 pc->mem_cgroup = memcg;
2566
2567
2568
2569
2570
2571
2572
2573 smp_wmb();
2574 SetPageCgroupUsed(pc);
2575
2576 if (lrucare) {
2577 if (was_on_lru) {
2578 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2579 VM_BUG_ON(PageLRU(page));
2580 SetPageLRU(page);
2581 add_page_to_lru_list(page, lruvec, page_lru(page));
2582 }
2583 spin_unlock_irq(&zone->lru_lock);
2584 }
2585
2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2587 anon = true;
2588 else
2589 anon = false;
2590
2591 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2592 unlock_page_cgroup(pc);
2593
2594
2595
2596
2597
2598
2599 memcg_check_events(memcg, page);
2600}
2601
2602#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2603
2604#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2605
2606
2607
2608
2609
2610
2611void mem_cgroup_split_huge_fixup(struct page *head)
2612{
2613 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2614 struct page_cgroup *pc;
2615 int i;
2616
2617 if (mem_cgroup_disabled())
2618 return;
2619 for (i = 1; i < HPAGE_PMD_NR; i++) {
2620 pc = head_pc + i;
2621 pc->mem_cgroup = head_pc->mem_cgroup;
2622 smp_wmb();
2623 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2624 }
2625}
2626#endif
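
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @pc: page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following.
 * - page is not on LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function does not take the charge for @to nor drop the charge for
 * @from; both are left to the caller.
 */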
2643static int mem_cgroup_move_account(struct page *page,
2644 unsigned int nr_pages,
2645 struct page_cgroup *pc,
2646 struct mem_cgroup *from,
2647 struct mem_cgroup *to)
2648{
2649 unsigned long flags;
2650 int ret;
2651 bool anon = PageAnon(page);
2652
2653 VM_BUG_ON(from == to);
2654 VM_BUG_ON(PageLRU(page));
2655
2656
2657
2658
2659
2660
2661 ret = -EBUSY;
2662 if (nr_pages > 1 && !PageTransHuge(page))
2663 goto out;
2664
2665 lock_page_cgroup(pc);
2666
2667 ret = -EINVAL;
2668 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2669 goto unlock;
2670
2671 move_lock_mem_cgroup(from, &flags);
2672
2673 if (!anon && page_mapped(page)) {
2674
2675 preempt_disable();
2676 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2677 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2678 preempt_enable();
2679 }
2680 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2681
2682
2683 pc->mem_cgroup = to;
2684 mem_cgroup_charge_statistics(to, anon, nr_pages);
2685
2686
2687
2688
2689
2690
2691
2692 move_unlock_mem_cgroup(from, &flags);
2693 ret = 0;
2694unlock:
2695 unlock_page_cgroup(pc);
2696
2697
2698
2699 memcg_check_events(to, page);
2700 memcg_check_events(from, page);
2701out:
2702 return ret;
2703}
2704
2705
2706
2707
2708
2709static int mem_cgroup_move_parent(struct page *page,
2710 struct page_cgroup *pc,
2711 struct mem_cgroup *child)
2712{
2713 struct mem_cgroup *parent;
2714 unsigned int nr_pages;
2715 unsigned long uninitialized_var(flags);
2716 int ret;
2717
2718
2719 if (mem_cgroup_is_root(child))
2720 return -EINVAL;
2721
2722 ret = -EBUSY;
2723 if (!get_page_unless_zero(page))
2724 goto out;
2725 if (isolate_lru_page(page))
2726 goto put;
2727
2728 nr_pages = hpage_nr_pages(page);
2729
2730 parent = parent_mem_cgroup(child);
2731
2732
2733
2734 if (!parent)
2735 parent = root_mem_cgroup;
2736
2737 if (nr_pages > 1)
2738 flags = compound_lock_irqsave(page);
2739
2740 ret = mem_cgroup_move_account(page, nr_pages,
2741 pc, child, parent);
2742 if (!ret)
2743 __mem_cgroup_cancel_local_charge(child, nr_pages);
2744
2745 if (nr_pages > 1)
2746 compound_unlock_irqrestore(page, flags);
2747 putback_lru_page(page);
2748put:
2749 put_page(page);
2750out:
2751 return ret;
2752}
2753
2754
2755
2756
2757
2758
2759
2760static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2761 gfp_t gfp_mask, enum charge_type ctype)
2762{
2763 struct mem_cgroup *memcg = NULL;
2764 unsigned int nr_pages = 1;
2765 bool oom = true;
2766 int ret;
2767
2768 if (PageTransHuge(page)) {
2769 nr_pages <<= compound_order(page);
2770 VM_BUG_ON(!PageTransHuge(page));
2771
2772
2773
2774
2775 oom = false;
2776 }
2777
2778 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2779 if (ret == -ENOMEM)
2780 return ret;
2781 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
2782 return 0;
2783}
2784
2785int mem_cgroup_newpage_charge(struct page *page,
2786 struct mm_struct *mm, gfp_t gfp_mask)
2787{
2788 if (mem_cgroup_disabled())
2789 return 0;
2790 VM_BUG_ON(page_mapped(page));
2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2792 VM_BUG_ON(!mm);
2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2795}
2796
2797
2798
2799
2800
2801
2802
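/*
 * Charge a page that is being swapped in.  Nothing is done if another
 * thread already completed the swap-in and charged the page.  With
 * swap accounting enabled the charge goes to the memcg recorded for
 * the swap entry, otherwise to the memcg of the faulting mm.
 */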
2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2804 struct page *page,
2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2807{
2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2810 int ret;
2811
2812 pc = lookup_page_cgroup(page);
2813
2814
2815
2816
2817
2818
2819
2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2823 goto charge_cur_mm;
2824 memcg = try_get_mem_cgroup_from_page(page);
2825 if (!memcg)
2826 goto charge_cur_mm;
2827 *memcgp = memcg;
2828 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2829 css_put(&memcg->css);
2830 if (ret == -EINTR)
2831 ret = 0;
2832 return ret;
2833charge_cur_mm:
2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2835 if (ret == -EINTR)
2836 ret = 0;
2837 return ret;
2838}
2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846
2847
2848
2849
2850
2851
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
2872static void
2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2874 enum charge_type ctype)
2875{
2876 if (mem_cgroup_disabled())
2877 return;
2878 if (!memcg)
2879 return;
2880 cgroup_exclude_rmdir(&memcg->css);
2881
2882 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2883
2884
2885
2886
2887
2888
2889
2890 if (do_swap_account && PageSwapCache(page)) {
2891 swp_entry_t ent = {.val = page_private(page)};
2892 mem_cgroup_uncharge_swap(ent);
2893 }
2894
2895
2896
2897
2898
2899 cgroup_release_and_wakeup_rmdir(&memcg->css);
2900}
2901
2902void mem_cgroup_commit_charge_swapin(struct page *page,
2903 struct mem_cgroup *memcg)
2904{
2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2907}
2908
2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2916 if (mem_cgroup_disabled())
2917 return 0;
2918 if (PageCompound(page))
2919 return 0;
2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else {
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2930}
2931
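/*
 * Uncharge @nr_pages from @memcg.  When the caller has bracketed a run
 * of uncharges with mem_cgroup_uncharge_start()/end(), single-page
 * uncharges of the same memcg are only accumulated in
 * current->memcg_batch and applied to the res_counters in one go.
 */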
2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2933 unsigned int nr_pages,
2934 const enum charge_type ctype)
2935{
2936 struct memcg_batch_info *batch = NULL;
2937 bool uncharge_memsw = true;
2938
2939
2940 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2941 uncharge_memsw = false;
2942
2943 batch = &current->memcg_batch;
2944
2945
2946
2947
2948
2949 if (!batch->memcg)
2950 batch->memcg = memcg;
2951
2952
2953
2954
2955
2956
2957
2958
2959 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2960 goto direct_uncharge;
2961
2962 if (nr_pages > 1)
2963 goto direct_uncharge;
2964
2965
2966
2967
2968
2969
2970 if (batch->memcg != memcg)
2971 goto direct_uncharge;
2972
2973 batch->nr_pages++;
2974 if (uncharge_memsw)
2975 batch->memsw_nr_pages++;
2976 return;
2977direct_uncharge:
2978 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
2979 if (uncharge_memsw)
2980 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2981 if (unlikely(batch->memcg != memcg))
2982 memcg_oom_recover(memcg);
2983}
2984
2985
2986
2987
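/*
 * Common uncharge work: if the page is no longer used for the given
 * charge type, clear its page_cgroup, update statistics and drop the
 * res_counter charge (unless this is the tail end of a migration).
 * Returns the memcg the page was charged to, or NULL if nothing was
 * uncharged.
 */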
2988static struct mem_cgroup *
2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2991{
2992 struct mem_cgroup *memcg = NULL;
2993 unsigned int nr_pages = 1;
2994 struct page_cgroup *pc;
2995 bool anon;
2996
2997 if (mem_cgroup_disabled())
2998 return NULL;
2999
3000 VM_BUG_ON(PageSwapCache(page));
3001
3002 if (PageTransHuge(page)) {
3003 nr_pages <<= compound_order(page);
3004 VM_BUG_ON(!PageTransHuge(page));
3005 }
3006
3007
3008
3009 pc = lookup_page_cgroup(page);
3010 if (unlikely(!PageCgroupUsed(pc)))
3011 return NULL;
3012
3013 lock_page_cgroup(pc);
3014
3015 memcg = pc->mem_cgroup;
3016
3017 if (!PageCgroupUsed(pc))
3018 goto unlock_out;
3019
3020 anon = PageAnon(page);
3021
3022 switch (ctype) {
3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
3024
3025
3026
3027
3028
3029 anon = true;
3030
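	/* fall through */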
3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
3032
3033 if (page_mapped(page))
3034 goto unlock_out;
3035
3036
3037
3038
3039
3040
3041
3042 if (!end_migration && PageCgroupMigration(pc))
3043 goto unlock_out;
3044 break;
3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3046 if (!PageAnon(page)) {
3047 if (page->mapping && !page_is_file_cache(page))
3048 goto unlock_out;
3049 } else if (page_mapped(page))
3050 goto unlock_out;
3051 break;
3052 default:
3053 break;
3054 }
3055
3056 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
3057
3058 ClearPageCgroupUsed(pc);
3059
3060
3061
3062
3063
3064
3065
3066 unlock_page_cgroup(pc);
3067
3068
3069
3070
3071 memcg_check_events(memcg, page);
3072 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3073 mem_cgroup_swap_statistics(memcg, true);
3074 mem_cgroup_get(memcg);
3075 }
3076
3077
3078
3079
3080
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3083
3084 return memcg;
3085
3086unlock_out:
3087 unlock_page_cgroup(pc);
3088 return NULL;
3089}
3090
3091void mem_cgroup_uncharge_page(struct page *page)
3092{
3093
3094 if (page_mapped(page))
3095 return;
3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3100}
3101
3102void mem_cgroup_uncharge_cache_page(struct page *page)
3103{
3104 VM_BUG_ON(page_mapped(page));
3105 VM_BUG_ON(page->mapping);
3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3107}
3108
3109
3110
3111
3112
3113
3114
3115
3116
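/*
 * Callers that uncharge many pages in a row (truncate, unmap, ...) can
 * bracket the loop with mem_cgroup_uncharge_start() and
 * mem_cgroup_uncharge_end() so the res_counters are only touched once
 * for the whole batch, see mem_cgroup_do_uncharge().
 */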
3117void mem_cgroup_uncharge_start(void)
3118{
3119 current->memcg_batch.do_batch++;
3120
3121 if (current->memcg_batch.do_batch == 1) {
3122 current->memcg_batch.memcg = NULL;
3123 current->memcg_batch.nr_pages = 0;
3124 current->memcg_batch.memsw_nr_pages = 0;
3125 }
3126}
3127
3128void mem_cgroup_uncharge_end(void)
3129{
3130 struct memcg_batch_info *batch = &current->memcg_batch;
3131
3132 if (!batch->do_batch)
3133 return;
3134
3135 batch->do_batch--;
3136 if (batch->do_batch)
3137 return;
3138
3139 if (!batch->memcg)
3140 return;
3141
3142
3143
3144
3145 if (batch->nr_pages)
3146 res_counter_uncharge(&batch->memcg->res,
3147 batch->nr_pages * PAGE_SIZE);
3148 if (batch->memsw_nr_pages)
3149 res_counter_uncharge(&batch->memcg->memsw,
3150 batch->memsw_nr_pages * PAGE_SIZE);
3151 memcg_oom_recover(batch->memcg);
3152
3153 batch->memcg = NULL;
3154}
3155
3156#ifdef CONFIG_SWAP
3157
3158
3159
3160
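/*
 * Called when a page is removed from the swap cache.  If the page was
 * really swapped out (@swapout) and swap accounting is enabled, record
 * the owning memcg in the swap_cgroup map so its memsw charge can be
 * released when the swap entry is finally freed.
 */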
3161void
3162mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3163{
3164 struct mem_cgroup *memcg;
3165 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3166
3167 if (!swapout)
3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3169
3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3171
3172
3173
3174
3175
3176 if (do_swap_account && swapout && memcg)
3177 swap_cgroup_record(ent, css_id(&memcg->css));
3178}
3179#endif
3180
3181#ifdef CONFIG_MEMCG_SWAP
3182
3183
3184
3185
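/*
 * mem_cgroup_uncharge_swap - drop the charge recorded for a freed swap entry
 * @ent: swap entry being freed
 */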
3186void mem_cgroup_uncharge_swap(swp_entry_t ent)
3187{
3188 struct mem_cgroup *memcg;
3189 unsigned short id;
3190
3191 if (!do_swap_account)
3192 return;
3193
3194 id = swap_cgroup_record(ent, 0);
3195 rcu_read_lock();
3196 memcg = mem_cgroup_lookup(id);
3197 if (memcg) {
3198
3199
3200
3201
3202 if (!mem_cgroup_is_root(memcg))
3203 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3204 mem_cgroup_swap_statistics(memcg, false);
3205 mem_cgroup_put(memcg);
3206 }
3207 rcu_read_unlock();
3208}
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
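/*
 * mem_cgroup_move_swap_account - move the swap_cgroup record for @entry
 * @entry: swap entry to be moved
 * @from: memcg the entry is currently recorded against
 * @to: memcg the record should be moved to
 *
 * The record is switched with a cmpxchg, so this fails with -EINVAL if
 * it no longer points at @from.  On success a reference on @to is
 * taken to keep it alive for as long as the swap entry exists.
 */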
3224static int mem_cgroup_move_swap_account(swp_entry_t entry,
3225 struct mem_cgroup *from, struct mem_cgroup *to)
3226{
3227 unsigned short old_id, new_id;
3228
3229 old_id = css_id(&from->css);
3230 new_id = css_id(&to->css);
3231
3232 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3233 mem_cgroup_swap_statistics(from, false);
3234 mem_cgroup_swap_statistics(to, true);
3235
3236
3237
3238
3239
3240
3241
3242
3243 mem_cgroup_get(to);
3244 return 0;
3245 }
3246 return -EINVAL;
3247}
3248#else
3249static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3250 struct mem_cgroup *from, struct mem_cgroup *to)
3251{
3252 return -EINVAL;
3253}
3254#endif
3255
3256
3257
3258
3259
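/*
 * Called before migrating @page: commit a charge for @newpage against
 * the memcg of @page (returned in *memcgp) so the copy is accounted
 * from the start.  See mem_cgroup_end_migration() for the cleanup of
 * whichever page ends up unused.
 */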
3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3261 struct mem_cgroup **memcgp)
3262{
3263 struct mem_cgroup *memcg = NULL;
3264 struct page_cgroup *pc;
3265 enum charge_type ctype;
3266
3267 *memcgp = NULL;
3268
3269 VM_BUG_ON(PageTransHuge(page));
3270 if (mem_cgroup_disabled())
3271 return;
3272
3273 pc = lookup_page_cgroup(page);
3274 lock_page_cgroup(pc);
3275 if (PageCgroupUsed(pc)) {
3276 memcg = pc->mem_cgroup;
3277 css_get(&memcg->css);
		/*
		 * An anonymous page may become fully unmapped while the
		 * migration copy is in flight, which would normally trigger
		 * an uncharge.  Set PCG_MIGRATION so the uncharge is deferred
		 * to mem_cgroup_end_migration(); file pages keep their
		 * mapping and do not need this.
		 */
3307 if (PageAnon(page))
3308 SetPageCgroupMigration(pc);
3309 }
3310 unlock_page_cgroup(pc);
3311
3312
3313
3314
3315 if (!memcg)
3316 return;
3317
3318 *memcgp = memcg;
3319
3320
3321
3322
3323
3324
3325 if (PageAnon(page))
3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3327 else
3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329
3330
3331
3332
3333
3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3335}
3336
3337
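/*
 * Finish migration: uncharge the page that ended up unused (the new
 * page if migration failed, the old one otherwise) and clear the
 * PCG_MIGRATION flag set by mem_cgroup_prepare_migration().
 */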
3338void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3339 struct page *oldpage, struct page *newpage, bool migration_ok)
3340{
3341 struct page *used, *unused;
3342 struct page_cgroup *pc;
3343 bool anon;
3344
3345 if (!memcg)
3346 return;
3347
3348 cgroup_exclude_rmdir(&memcg->css);
3349 if (!migration_ok) {
3350 used = oldpage;
3351 unused = newpage;
3352 } else {
3353 used = newpage;
3354 unused = oldpage;
3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3362
3363
3364
3365
3366
3367 pc = lookup_page_cgroup(oldpage);
3368 lock_page_cgroup(pc);
3369 ClearPageCgroupMigration(pc);
3370 unlock_page_cgroup(pc);
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380 if (anon)
3381 mem_cgroup_uncharge_page(used);
3382
3383
3384
3385
3386
3387
3388 cgroup_release_and_wakeup_rmdir(&memcg->css);
3389}
3390
3391
3392
3393
3394
3395
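/*
 * Used when a page-cache page is replaced in place: transfer the
 * charge from @oldpage to @newpage without changing res_counter usage.
 */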
3396void mem_cgroup_replace_page_cache(struct page *oldpage,
3397 struct page *newpage)
3398{
3399 struct mem_cgroup *memcg = NULL;
3400 struct page_cgroup *pc;
3401 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3402
3403 if (mem_cgroup_disabled())
3404 return;
3405
3406 pc = lookup_page_cgroup(oldpage);
3407
3408 lock_page_cgroup(pc);
3409 if (PageCgroupUsed(pc)) {
3410 memcg = pc->mem_cgroup;
3411 mem_cgroup_charge_statistics(memcg, false, -1);
3412 ClearPageCgroupUsed(pc);
3413 }
3414 unlock_page_cgroup(pc);
3415
3416
3417
3418
3419
3420 if (!memcg)
3421 return;
3422
3423
3424
3425
3426
3427 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
3428}
3429
3430#ifdef CONFIG_DEBUG_VM
3431static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3432{
3433 struct page_cgroup *pc;
3434
3435 pc = lookup_page_cgroup(page);
3436
3437
3438
3439
3440
3441 if (likely(pc) && PageCgroupUsed(pc))
3442 return pc;
3443 return NULL;
3444}
3445
3446bool mem_cgroup_bad_page_check(struct page *page)
3447{
3448 if (mem_cgroup_disabled())
3449 return false;
3450
3451 return lookup_page_cgroup_used(page) != NULL;
3452}
3453
3454void mem_cgroup_print_bad_page(struct page *page)
3455{
3456 struct page_cgroup *pc;
3457
3458 pc = lookup_page_cgroup_used(page);
3459 if (pc) {
3460 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3461 pc, pc->flags, pc->mem_cgroup);
3462 }
3463}
3464#endif
3465
3466static DEFINE_MUTEX(set_limit_mutex);
3467
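/*
 * Lower or raise memory.limit_in_bytes.  memory.limit may never exceed
 * memsw.limit, and lowering it below the current usage only succeeds
 * if reclaim makes enough progress within a bounded number of retries.
 */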
3468static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3469 unsigned long long val)
3470{
3471 int retry_count;
3472 u64 memswlimit, memlimit;
3473 int ret = 0;
3474 int children = mem_cgroup_count_children(memcg);
3475 u64 curusage, oldusage;
3476 int enlarge;
3477
3478
3479
3480
3481
3482
3483 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3484
3485 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3486
3487 enlarge = 0;
3488 while (retry_count) {
3489 if (signal_pending(current)) {
3490 ret = -EINTR;
3491 break;
3492 }
3493
3494
3495
3496
3497
3498 mutex_lock(&set_limit_mutex);
3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3500 if (memswlimit < val) {
3501 ret = -EINVAL;
3502 mutex_unlock(&set_limit_mutex);
3503 break;
3504 }
3505
3506 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3507 if (memlimit < val)
3508 enlarge = 1;
3509
3510 ret = res_counter_set_limit(&memcg->res, val);
3511 if (!ret) {
3512 if (memswlimit == val)
3513 memcg->memsw_is_minimum = true;
3514 else
3515 memcg->memsw_is_minimum = false;
3516 }
3517 mutex_unlock(&set_limit_mutex);
3518
3519 if (!ret)
3520 break;
3521
3522 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3523 MEM_CGROUP_RECLAIM_SHRINK);
3524 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3525
3526 if (curusage >= oldusage)
3527 retry_count--;
3528 else
3529 oldusage = curusage;
3530 }
3531 if (!ret && enlarge)
3532 memcg_oom_recover(memcg);
3533
3534 return ret;
3535}
3536
3537static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3538 unsigned long long val)
3539{
3540 int retry_count;
3541 u64 memlimit, memswlimit, oldusage, curusage;
3542 int children = mem_cgroup_count_children(memcg);
3543 int ret = -EBUSY;
3544 int enlarge = 0;
3545
3546
3547 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3548 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3549 while (retry_count) {
3550 if (signal_pending(current)) {
3551 ret = -EINTR;
3552 break;
3553 }
3554
3555
3556
3557
3558
3559 mutex_lock(&set_limit_mutex);
3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3561 if (memlimit > val) {
3562 ret = -EINVAL;
3563 mutex_unlock(&set_limit_mutex);
3564 break;
3565 }
3566 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3567 if (memswlimit < val)
3568 enlarge = 1;
3569 ret = res_counter_set_limit(&memcg->memsw, val);
3570 if (!ret) {
3571 if (memlimit == val)
3572 memcg->memsw_is_minimum = true;
3573 else
3574 memcg->memsw_is_minimum = false;
3575 }
3576 mutex_unlock(&set_limit_mutex);
3577
3578 if (!ret)
3579 break;
3580
3581 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3582 MEM_CGROUP_RECLAIM_NOSWAP |
3583 MEM_CGROUP_RECLAIM_SHRINK);
3584 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3585
3586 if (curusage >= oldusage)
3587 retry_count--;
3588 else
3589 oldusage = curusage;
3590 }
3591 if (!ret && enlarge)
3592 memcg_oom_recover(memcg);
3593 return ret;
3594}
3595
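/*
 * Global reclaim helper: pick the memcgs that exceed their soft limit
 * the most (via the per-zone soft-limit rb-tree) and reclaim from
 * them.  Only used for order-0 reclaim; returns the number of pages
 * reclaimed and adds the pages scanned to *total_scanned.
 */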
3596unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3597 gfp_t gfp_mask,
3598 unsigned long *total_scanned)
3599{
3600 unsigned long nr_reclaimed = 0;
3601 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3602 unsigned long reclaimed;
3603 int loop = 0;
3604 struct mem_cgroup_tree_per_zone *mctz;
3605 unsigned long long excess;
3606 unsigned long nr_scanned;
3607
3608 if (order > 0)
3609 return 0;
3610
3611 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3612
3613
3614
3615
3616
3617 do {
3618 if (next_mz)
3619 mz = next_mz;
3620 else
3621 mz = mem_cgroup_largest_soft_limit_node(mctz);
3622 if (!mz)
3623 break;
3624
3625 nr_scanned = 0;
3626 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3627 gfp_mask, &nr_scanned);
3628 nr_reclaimed += reclaimed;
3629 *total_scanned += nr_scanned;
3630 spin_lock(&mctz->lock);
3631
3632
3633
3634
3635
3636 next_mz = NULL;
3637 if (!reclaimed) {
3638 do {
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650 next_mz =
3651 __mem_cgroup_largest_soft_limit_node(mctz);
3652 if (next_mz == mz)
3653 css_put(&next_mz->memcg->css);
3654 else
3655 break;
3656 } while (1);
3657 }
3658 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3659 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3670 spin_unlock(&mctz->lock);
3671 css_put(&mz->memcg->css);
3672 loop++;
3673
3674
3675
3676
3677
3678 if (!nr_reclaimed &&
3679 (next_mz == NULL ||
3680 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3681 break;
3682 } while (!nr_reclaimed);
3683 if (next_mz)
3684 css_put(&next_mz->memcg->css);
3685 return nr_reclaimed;
3686}
3687
3688
3689
3690
3691
3692
3693
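/*
 * Walk one LRU list of @memcg in the given node/zone and try to move
 * every page to the parent cgroup.  Returns true if the list is still
 * non-empty afterwards so the caller can retry.
 */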
3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3695 int node, int zid, enum lru_list lru)
3696{
3697 struct mem_cgroup_per_zone *mz;
3698 unsigned long flags, loop;
3699 struct list_head *list;
3700 struct page *busy;
3701 struct zone *zone;
3702
3703 zone = &NODE_DATA(node)->node_zones[zid];
3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3705 list = &mz->lruvec.lists[lru];
3706
3707 loop = mz->lru_size[lru];
3708
3709 loop += 256;
3710 busy = NULL;
3711 while (loop--) {
3712 struct page_cgroup *pc;
3713 struct page *page;
3714
3715 spin_lock_irqsave(&zone->lru_lock, flags);
3716 if (list_empty(list)) {
3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
3718 break;
3719 }
3720 page = list_entry(list->prev, struct page, lru);
3721 if (busy == page) {
3722 list_move(&page->lru, list);
3723 busy = NULL;
3724 spin_unlock_irqrestore(&zone->lru_lock, flags);
3725 continue;
3726 }
3727 spin_unlock_irqrestore(&zone->lru_lock, flags);
3728
3729 pc = lookup_page_cgroup(page);
3730
3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3732
3733 busy = page;
3734 cond_resched();
3735 } else
3736 busy = NULL;
3737 }
3738 return !list_empty(list);
3739}
3740
3741
3742
3743
3744
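/*
 * Make the charge of @memcg zero: with free_all the pages are first
 * reclaimed and the rest reparented, otherwise everything is moved to
 * the parent right away.  Fails with -EBUSY while the cgroup still has
 * tasks or children.
 */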
3745static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3746{
3747 int ret;
3748 int node, zid, shrink;
3749 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3750 struct cgroup *cgrp = memcg->css.cgroup;
3751
3752 css_get(&memcg->css);
3753
3754 shrink = 0;
3755
3756 if (free_all)
3757 goto try_to_free;
3758move_account:
3759 do {
3760 ret = -EBUSY;
3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3762 goto out;
3763
3764 lru_add_drain_all();
3765 drain_all_stock_sync(memcg);
3766 ret = 0;
3767 mem_cgroup_start_move(memcg);
3768 for_each_node_state(node, N_HIGH_MEMORY) {
3769 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3770 enum lru_list lru;
3771 for_each_lru(lru) {
3772 ret = mem_cgroup_force_empty_list(memcg,
3773 node, zid, lru);
3774 if (ret)
3775 break;
3776 }
3777 }
3778 if (ret)
3779 break;
3780 }
3781 mem_cgroup_end_move(memcg);
3782 memcg_oom_recover(memcg);
3783 cond_resched();
3784
3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3786out:
3787 css_put(&memcg->css);
3788 return ret;
3789
3790try_to_free:
3791
3792 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3793 ret = -EBUSY;
3794 goto out;
3795 }
3796
3797 lru_add_drain_all();
3798
3799 shrink = 1;
3800 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3801 int progress;
3802
3803 if (signal_pending(current)) {
3804 ret = -EINTR;
3805 goto out;
3806 }
3807 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3808 false);
3809 if (!progress) {
3810 nr_retries--;
3811
3812 congestion_wait(BLK_RW_ASYNC, HZ/10);
3813 }
3814
3815 }
3816 lru_add_drain();
3817
3818 goto move_account;
3819}
3820
3821static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3822{
3823 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3824}
3825
3826
3827static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3828{
3829 return mem_cgroup_from_cont(cont)->use_hierarchy;
3830}
3831
3832static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3833 u64 val)
3834{
3835 int retval = 0;
3836 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3837 struct cgroup *parent = cont->parent;
3838 struct mem_cgroup *parent_memcg = NULL;
3839
3840 if (parent)
3841 parent_memcg = mem_cgroup_from_cont(parent);
3842
3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3857 (val == 1 || val == 0)) {
3858 if (list_empty(&cont->children))
3859 memcg->use_hierarchy = val;
3860 else
3861 retval = -EBUSY;
3862 } else
3863 retval = -EINVAL;
3864
3865out:
3866 cgroup_unlock();
3867
3868 return retval;
3869}
3870
3871
3872static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3873 enum mem_cgroup_stat_index idx)
3874{
3875 struct mem_cgroup *iter;
3876 long val = 0;
3877
3878
3879 for_each_mem_cgroup_tree(iter, memcg)
3880 val += mem_cgroup_read_stat(iter, idx);
3881
3882 if (val < 0)
3883 val = 0;
3884 return val;
3885}
3886
3887static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3888{
3889 u64 val;
3890
3891 if (!mem_cgroup_is_root(memcg)) {
3892 if (!swap)
3893 return res_counter_read_u64(&memcg->res, RES_USAGE);
3894 else
3895 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3896 }
3897
3898 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3900
3901 if (swap)
3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3903
3904 return val << PAGE_SHIFT;
3905}
3906
3907static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3908 struct file *file, char __user *buf,
3909 size_t nbytes, loff_t *ppos)
3910{
3911 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3912 char str[64];
3913 u64 val;
3914 int type, name, len;
3915
3916 type = MEMFILE_TYPE(cft->private);
3917 name = MEMFILE_ATTR(cft->private);
3918
3919 if (!do_swap_account && type == _MEMSWAP)
3920 return -EOPNOTSUPP;
3921
3922 switch (type) {
3923 case _MEM:
3924 if (name == RES_USAGE)
3925 val = mem_cgroup_usage(memcg, false);
3926 else
3927 val = res_counter_read_u64(&memcg->res, name);
3928 break;
3929 case _MEMSWAP:
3930 if (name == RES_USAGE)
3931 val = mem_cgroup_usage(memcg, true);
3932 else
3933 val = res_counter_read_u64(&memcg->memsw, name);
3934 break;
3935 default:
3936 BUG();
3937 }
3938
3939 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3940 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3941}
3942
3943
3944
3945
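/*
 * Handler for writes to the limit files: parse the value with
 * res_counter_memparse_write_strategy() and apply it to the memory
 * limit, memsw limit or soft limit selected by cft->private.
 */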
3946static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3947 const char *buffer)
3948{
3949 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3950 int type, name;
3951 unsigned long long val;
3952 int ret;
3953
3954 type = MEMFILE_TYPE(cft->private);
3955 name = MEMFILE_ATTR(cft->private);
3956
3957 if (!do_swap_account && type == _MEMSWAP)
3958 return -EOPNOTSUPP;
3959
3960 switch (name) {
3961 case RES_LIMIT:
3962 if (mem_cgroup_is_root(memcg)) {
3963 ret = -EINVAL;
3964 break;
3965 }
3966
3967 ret = res_counter_memparse_write_strategy(buffer, &val);
3968 if (ret)
3969 break;
3970 if (type == _MEM)
3971 ret = mem_cgroup_resize_limit(memcg, val);
3972 else
3973 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3974 break;
3975 case RES_SOFT_LIMIT:
3976 ret = res_counter_memparse_write_strategy(buffer, &val);
3977 if (ret)
3978 break;
3979
3980
3981
3982
3983
3984 if (type == _MEM)
3985 ret = res_counter_set_soft_limit(&memcg->res, val);
3986 else
3987 ret = -EINVAL;
3988 break;
3989 default:
3990 ret = -EINVAL;
3991 break;
3992 }
3993 return ret;
3994}
3995
3996static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3997 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3998{
3999 struct cgroup *cgroup;
4000 unsigned long long min_limit, min_memsw_limit, tmp;
4001
4002 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4003 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4004 cgroup = memcg->css.cgroup;
4005 if (!memcg->use_hierarchy)
4006 goto out;
4007
4008 while (cgroup->parent) {
4009 cgroup = cgroup->parent;
4010 memcg = mem_cgroup_from_cont(cgroup);
4011 if (!memcg->use_hierarchy)
4012 break;
4013 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4014 min_limit = min(min_limit, tmp);
4015 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4016 min_memsw_limit = min(min_memsw_limit, tmp);
4017 }
4018out:
4019 *mem_limit = min_limit;
4020 *memsw_limit = min_memsw_limit;
4021}
4022
4023static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4024{
4025 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4026 int type, name;
4027
4028 type = MEMFILE_TYPE(event);
4029 name = MEMFILE_ATTR(event);
4030
4031 if (!do_swap_account && type == _MEMSWAP)
4032 return -EOPNOTSUPP;
4033
4034 switch (name) {
4035 case RES_MAX_USAGE:
4036 if (type == _MEM)
4037 res_counter_reset_max(&memcg->res);
4038 else
4039 res_counter_reset_max(&memcg->memsw);
4040 break;
4041 case RES_FAILCNT:
4042 if (type == _MEM)
4043 res_counter_reset_failcnt(&memcg->res);
4044 else
4045 res_counter_reset_failcnt(&memcg->memsw);
4046 break;
4047 }
4048
4049 return 0;
4050}
4051
4052static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4053 struct cftype *cft)
4054{
4055 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4056}
4057
4058#ifdef CONFIG_MMU
4059static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4060 struct cftype *cft, u64 val)
4061{
4062 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4063
4064 if (val >= (1 << NR_MOVE_TYPE))
4065 return -EINVAL;
4066
4067
4068
4069
4070
4071 cgroup_lock();
4072 memcg->move_charge_at_immigrate = val;
4073 cgroup_unlock();
4074
4075 return 0;
4076}
4077#else
4078static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4079 struct cftype *cft, u64 val)
4080{
4081 return -ENOSYS;
4082}
4083#endif
4084
4085#ifdef CONFIG_NUMA
4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4087 struct seq_file *m)
4088{
4089 int nid;
4090 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4091 unsigned long node_nr;
4092 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4093
4094 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4095 seq_printf(m, "total=%lu", total_nr);
4096 for_each_node_state(nid, N_HIGH_MEMORY) {
4097 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4098 seq_printf(m, " N%d=%lu", nid, node_nr);
4099 }
4100 seq_putc(m, '\n');
4101
4102 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4103 seq_printf(m, "file=%lu", file_nr);
4104 for_each_node_state(nid, N_HIGH_MEMORY) {
4105 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4106 LRU_ALL_FILE);
4107 seq_printf(m, " N%d=%lu", nid, node_nr);
4108 }
4109 seq_putc(m, '\n');
4110
4111 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4112 seq_printf(m, "anon=%lu", anon_nr);
4113 for_each_node_state(nid, N_HIGH_MEMORY) {
4114 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4115 LRU_ALL_ANON);
4116 seq_printf(m, " N%d=%lu", nid, node_nr);
4117 }
4118 seq_putc(m, '\n');
4119
4120 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4121 seq_printf(m, "unevictable=%lu", unevictable_nr);
4122 for_each_node_state(nid, N_HIGH_MEMORY) {
4123 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4124 BIT(LRU_UNEVICTABLE));
4125 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 }
4127 seq_putc(m, '\n');
4128 return 0;
4129}
4130#endif
4131
4132static const char * const mem_cgroup_lru_names[] = {
4133 "inactive_anon",
4134 "active_anon",
4135 "inactive_file",
4136 "active_file",
4137 "unevictable",
4138};
4139
4140static inline void mem_cgroup_lru_names_not_uptodate(void)
4141{
4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4143}
4144
4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4146 struct seq_file *m)
4147{
4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4149 struct mem_cgroup *mi;
4150 unsigned int i;
4151
4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4154 continue;
4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4157 }
4158
4159 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4160 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4161 mem_cgroup_read_events(memcg, i));
4162
4163 for (i = 0; i < NR_LRU_LISTS; i++)
4164 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4165 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4166
4167
4168 {
4169 unsigned long long limit, memsw_limit;
4170 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4171 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4172 if (do_swap_account)
4173 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4174 memsw_limit);
4175 }
4176
4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4178 long long val = 0;
4179
4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4181 continue;
4182 for_each_mem_cgroup_tree(mi, memcg)
4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4184 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4185 }
4186
4187 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4188 unsigned long long val = 0;
4189
4190 for_each_mem_cgroup_tree(mi, memcg)
4191 val += mem_cgroup_read_events(mi, i);
4192 seq_printf(m, "total_%s %llu\n",
4193 mem_cgroup_events_names[i], val);
4194 }
4195
4196 for (i = 0; i < NR_LRU_LISTS; i++) {
4197 unsigned long long val = 0;
4198
4199 for_each_mem_cgroup_tree(mi, memcg)
4200 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4201 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4202 }
4203
4204#ifdef CONFIG_DEBUG_VM
4205 {
4206 int nid, zid;
4207 struct mem_cgroup_per_zone *mz;
4208 struct zone_reclaim_stat *rstat;
4209 unsigned long recent_rotated[2] = {0, 0};
4210 unsigned long recent_scanned[2] = {0, 0};
4211
4212 for_each_online_node(nid)
4213 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4214 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4215 rstat = &mz->lruvec.reclaim_stat;
4216
4217 recent_rotated[0] += rstat->recent_rotated[0];
4218 recent_rotated[1] += rstat->recent_rotated[1];
4219 recent_scanned[0] += rstat->recent_scanned[0];
4220 recent_scanned[1] += rstat->recent_scanned[1];
4221 }
4222 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4223 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4224 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4225 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4226 }
4227#endif
4228
4229 return 0;
4230}
4231
4232static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4233{
4234 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4235
4236 return mem_cgroup_swappiness(memcg);
4237}
4238
4239static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4240 u64 val)
4241{
4242 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4243 struct mem_cgroup *parent;
4244
4245 if (val > 100)
4246 return -EINVAL;
4247
4248 if (cgrp->parent == NULL)
4249 return -EINVAL;
4250
4251 parent = mem_cgroup_from_cont(cgrp->parent);
4252
4253 cgroup_lock();
4254
4255
4256 if ((parent->use_hierarchy) ||
4257 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4258 cgroup_unlock();
4259 return -EINVAL;
4260 }
4261
4262 memcg->swappiness = val;
4263
4264 cgroup_unlock();
4265
4266 return 0;
4267}
4268
4269static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4270{
4271 struct mem_cgroup_threshold_ary *t;
4272 u64 usage;
4273 int i;
4274
4275 rcu_read_lock();
4276 if (!swap)
4277 t = rcu_dereference(memcg->thresholds.primary);
4278 else
4279 t = rcu_dereference(memcg->memsw_thresholds.primary);
4280
4281 if (!t)
4282 goto unlock;
4283
4284 usage = mem_cgroup_usage(memcg, swap);
4285
4286
4287
4288
4289
4290
4291 i = t->current_threshold;
4292
4293
4294
4295
4296
4297
4298
4299 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4300 eventfd_signal(t->entries[i].eventfd, 1);
4301
4302
4303 i++;
4304
4305
4306
4307
4308
4309
4310
4311 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4312 eventfd_signal(t->entries[i].eventfd, 1);
4313
4314
4315 t->current_threshold = i - 1;
4316unlock:
4317 rcu_read_unlock();
4318}
4319
4320static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4321{
4322 while (memcg) {
4323 __mem_cgroup_threshold(memcg, false);
4324 if (do_swap_account)
4325 __mem_cgroup_threshold(memcg, true);
4326
4327 memcg = parent_mem_cgroup(memcg);
4328 }
4329}
4330
4331static int compare_thresholds(const void *a, const void *b)
4332{
4333 const struct mem_cgroup_threshold *_a = a;
4334 const struct mem_cgroup_threshold *_b = b;
4335
4336 return _a->threshold - _b->threshold;
4337}
4338
4339static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4340{
4341 struct mem_cgroup_eventfd_list *ev;
4342
4343 list_for_each_entry(ev, &memcg->oom_notify, list)
4344 eventfd_signal(ev->eventfd, 1);
4345 return 0;
4346}
4347
4348static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4349{
4350 struct mem_cgroup *iter;
4351
4352 for_each_mem_cgroup_tree(iter, memcg)
4353 mem_cgroup_oom_notify_cb(iter);
4354}
4355
4356static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4357 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4358{
4359 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4360 struct mem_cgroup_thresholds *thresholds;
4361 struct mem_cgroup_threshold_ary *new;
4362 int type = MEMFILE_TYPE(cft->private);
4363 u64 threshold, usage;
4364 int i, size, ret;
4365
4366 ret = res_counter_memparse_write_strategy(args, &threshold);
4367 if (ret)
4368 return ret;
4369
4370 mutex_lock(&memcg->thresholds_lock);
4371
4372 if (type == _MEM)
4373 thresholds = &memcg->thresholds;
4374 else if (type == _MEMSWAP)
4375 thresholds = &memcg->memsw_thresholds;
4376 else
4377 BUG();
4378
4379 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4380
4381
4382 if (thresholds->primary)
4383 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4384
4385 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4386
4387
4388 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4389 GFP_KERNEL);
4390 if (!new) {
4391 ret = -ENOMEM;
4392 goto unlock;
4393 }
4394 new->size = size;
4395
4396
4397 if (thresholds->primary) {
4398 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4399 sizeof(struct mem_cgroup_threshold));
4400 }
4401
4402
4403 new->entries[size - 1].eventfd = eventfd;
4404 new->entries[size - 1].threshold = threshold;
4405
4406
4407 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4408 compare_thresholds, NULL);
4409
4410
4411 new->current_threshold = -1;
4412 for (i = 0; i < size; i++) {
4413 if (new->entries[i].threshold <= usage) {
4414
4415
4416
4417
4418
4419 ++new->current_threshold;
4420 } else
4421 break;
4422 }
4423
4424
4425 kfree(thresholds->spare);
4426 thresholds->spare = thresholds->primary;
4427
4428 rcu_assign_pointer(thresholds->primary, new);
4429
4430
4431 synchronize_rcu();
4432
4433unlock:
4434 mutex_unlock(&memcg->thresholds_lock);
4435
4436 return ret;
4437}
4438
4439static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4440 struct cftype *cft, struct eventfd_ctx *eventfd)
4441{
4442 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4443 struct mem_cgroup_thresholds *thresholds;
4444 struct mem_cgroup_threshold_ary *new;
4445 int type = MEMFILE_TYPE(cft->private);
4446 u64 usage;
4447 int i, j, size;
4448
4449 mutex_lock(&memcg->thresholds_lock);
4450 if (type == _MEM)
4451 thresholds = &memcg->thresholds;
4452 else if (type == _MEMSWAP)
4453 thresholds = &memcg->memsw_thresholds;
4454 else
4455 BUG();
4456
4457 if (!thresholds->primary)
4458 goto unlock;
4459
4460 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4461
4462
4463 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4464
4465
4466 size = 0;
4467 for (i = 0; i < thresholds->primary->size; i++) {
4468 if (thresholds->primary->entries[i].eventfd != eventfd)
4469 size++;
4470 }
4471
4472 new = thresholds->spare;
4473
4474
4475 if (!size) {
4476 kfree(new);
4477 new = NULL;
4478 goto swap_buffers;
4479 }
4480
4481 new->size = size;
4482
4483
4484 new->current_threshold = -1;
4485 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4486 if (thresholds->primary->entries[i].eventfd == eventfd)
4487 continue;
4488
4489 new->entries[j] = thresholds->primary->entries[i];
4490 if (new->entries[j].threshold <= usage) {
4491
4492
4493
4494
4495
4496 ++new->current_threshold;
4497 }
4498 j++;
4499 }
4500
4501swap_buffers:
4502
4503 thresholds->spare = thresholds->primary;
4504
4505 if (!new) {
4506 kfree(thresholds->spare);
4507 thresholds->spare = NULL;
4508 }
4509
4510 rcu_assign_pointer(thresholds->primary, new);
4511
4512
4513 synchronize_rcu();
4514unlock:
4515 mutex_unlock(&memcg->thresholds_lock);
4516}
4517
4518static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4519 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4520{
4521 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4522 struct mem_cgroup_eventfd_list *event;
4523 int type = MEMFILE_TYPE(cft->private);
4524
4525 BUG_ON(type != _OOM_TYPE);
4526 event = kmalloc(sizeof(*event), GFP_KERNEL);
4527 if (!event)
4528 return -ENOMEM;
4529
4530 spin_lock(&memcg_oom_lock);
4531
4532 event->eventfd = eventfd;
4533 list_add(&event->list, &memcg->oom_notify);
4534
4535
4536 if (atomic_read(&memcg->under_oom))
4537 eventfd_signal(eventfd, 1);
4538 spin_unlock(&memcg_oom_lock);
4539
4540 return 0;
4541}
4542
4543static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4544 struct cftype *cft, struct eventfd_ctx *eventfd)
4545{
4546 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4547 struct mem_cgroup_eventfd_list *ev, *tmp;
4548 int type = MEMFILE_TYPE(cft->private);
4549
4550 BUG_ON(type != _OOM_TYPE);
4551
4552 spin_lock(&memcg_oom_lock);
4553
4554 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4555 if (ev->eventfd == eventfd) {
4556 list_del(&ev->list);
4557 kfree(ev);
4558 }
4559 }
4560
4561 spin_unlock(&memcg_oom_lock);
4562}
4563
4564static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4565 struct cftype *cft, struct cgroup_map_cb *cb)
4566{
4567 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4568
4569 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4570
4571 if (atomic_read(&memcg->under_oom))
4572 cb->fill(cb, "under_oom", 1);
4573 else
4574 cb->fill(cb, "under_oom", 0);
4575 return 0;
4576}
4577
4578static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4579 struct cftype *cft, u64 val)
4580{
4581 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4582 struct mem_cgroup *parent;
4583
4584
4585 if (!cgrp->parent || !((val == 0) || (val == 1)))
4586 return -EINVAL;
4587
4588 parent = mem_cgroup_from_cont(cgrp->parent);
4589
4590 cgroup_lock();
4591
4592 if ((parent->use_hierarchy) ||
4593 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4594 cgroup_unlock();
4595 return -EINVAL;
4596 }
4597 memcg->oom_kill_disable = val;
4598 if (!val)
4599 memcg_oom_recover(memcg);
4600 cgroup_unlock();
4601 return 0;
4602}
4603
4604#ifdef CONFIG_MEMCG_KMEM
4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4606{
4607 return mem_cgroup_sockets_init(memcg, ss);
4608};
4609
4610static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4611{
4612 mem_cgroup_sockets_destroy(memcg);
4613}
4614#else
4615static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4616{
4617 return 0;
4618}
4619
4620static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4621{
4622}
4623#endif
4624
4625static struct cftype mem_cgroup_files[] = {
4626 {
4627 .name = "usage_in_bytes",
4628 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4629 .read = mem_cgroup_read,
4630 .register_event = mem_cgroup_usage_register_event,
4631 .unregister_event = mem_cgroup_usage_unregister_event,
4632 },
4633 {
4634 .name = "max_usage_in_bytes",
4635 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4636 .trigger = mem_cgroup_reset,
4637 .read = mem_cgroup_read,
4638 },
4639 {
4640 .name = "limit_in_bytes",
4641 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4642 .write_string = mem_cgroup_write,
4643 .read = mem_cgroup_read,
4644 },
4645 {
4646 .name = "soft_limit_in_bytes",
4647 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4648 .write_string = mem_cgroup_write,
4649 .read = mem_cgroup_read,
4650 },
4651 {
4652 .name = "failcnt",
4653 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4654 .trigger = mem_cgroup_reset,
4655 .read = mem_cgroup_read,
4656 },
4657 {
4658 .name = "stat",
4659 .read_seq_string = memcg_stat_show,
4660 },
4661 {
4662 .name = "force_empty",
4663 .trigger = mem_cgroup_force_empty_write,
4664 },
4665 {
4666 .name = "use_hierarchy",
4667 .write_u64 = mem_cgroup_hierarchy_write,
4668 .read_u64 = mem_cgroup_hierarchy_read,
4669 },
4670 {
4671 .name = "swappiness",
4672 .read_u64 = mem_cgroup_swappiness_read,
4673 .write_u64 = mem_cgroup_swappiness_write,
4674 },
4675 {
4676 .name = "move_charge_at_immigrate",
4677 .read_u64 = mem_cgroup_move_charge_read,
4678 .write_u64 = mem_cgroup_move_charge_write,
4679 },
4680 {
4681 .name = "oom_control",
4682 .read_map = mem_cgroup_oom_control_read,
4683 .write_u64 = mem_cgroup_oom_control_write,
4684 .register_event = mem_cgroup_oom_register_event,
4685 .unregister_event = mem_cgroup_oom_unregister_event,
4686 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4687 },
4688#ifdef CONFIG_NUMA
4689 {
4690 .name = "numa_stat",
4691 .read_seq_string = memcg_numa_stat_show,
4692 },
4693#endif
4694#ifdef CONFIG_MEMCG_SWAP
4695 {
4696 .name = "memsw.usage_in_bytes",
4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4698 .read = mem_cgroup_read,
4699 .register_event = mem_cgroup_usage_register_event,
4700 .unregister_event = mem_cgroup_usage_unregister_event,
4701 },
4702 {
4703 .name = "memsw.max_usage_in_bytes",
4704 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4705 .trigger = mem_cgroup_reset,
4706 .read = mem_cgroup_read,
4707 },
4708 {
4709 .name = "memsw.limit_in_bytes",
4710 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4711 .write_string = mem_cgroup_write,
4712 .read = mem_cgroup_read,
4713 },
4714 {
4715 .name = "memsw.failcnt",
4716 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4717 .trigger = mem_cgroup_reset,
4718 .read = mem_cgroup_read,
4719 },
4720#endif
4721 { },
4722};
4723
4724static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4725{
4726 struct mem_cgroup_per_node *pn;
4727 struct mem_cgroup_per_zone *mz;
4728 int zone, tmp = node;
4729
4730
4731
4732
4733
4734
4735
4736
4737 if (!node_state(node, N_NORMAL_MEMORY))
4738 tmp = -1;
4739 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4740 if (!pn)
4741 return 1;
4742
4743 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4744 mz = &pn->zoneinfo[zone];
4745 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4746 mz->usage_in_excess = 0;
4747 mz->on_tree = false;
4748 mz->memcg = memcg;
4749 }
4750 memcg->info.nodeinfo[node] = pn;
4751 return 0;
4752}
4753
4754static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4755{
4756 kfree(memcg->info.nodeinfo[node]);
4757}
4758
4759static struct mem_cgroup *mem_cgroup_alloc(void)
4760{
4761 struct mem_cgroup *memcg;
4762 int size = sizeof(struct mem_cgroup);
4763
4764
4765 if (size < PAGE_SIZE)
4766 memcg = kzalloc(size, GFP_KERNEL);
4767 else
4768 memcg = vzalloc(size);
4769
4770 if (!memcg)
4771 return NULL;
4772
4773 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4774 if (!memcg->stat)
4775 goto out_free;
4776 spin_lock_init(&memcg->pcp_counter_lock);
4777 return memcg;
4778
4779out_free:
4780 if (size < PAGE_SIZE)
4781 kfree(memcg);
4782 else
4783 vfree(memcg);
4784 return NULL;
4785}
4786
4787
4788
4789
4790
4791
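/*
 * Freeing is done in two deferred steps: free_rcu() runs after an RCU
 * grace period and schedules free_work(), which may sleep (vfree(),
 * disarm_sock_keys()) and therefore cannot run directly from the RCU
 * callback.
 */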
4792static void free_work(struct work_struct *work)
4793{
4794 struct mem_cgroup *memcg;
4795 int size = sizeof(struct mem_cgroup);
4796
4797 memcg = container_of(work, struct mem_cgroup, work_freeing);
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809 disarm_sock_keys(memcg);
4810 if (size < PAGE_SIZE)
4811 kfree(memcg);
4812 else
4813 vfree(memcg);
4814}
4815
4816static void free_rcu(struct rcu_head *rcu_head)
4817{
4818 struct mem_cgroup *memcg;
4819
4820 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4821 INIT_WORK(&memcg->work_freeing, free_work);
4822 schedule_work(&memcg->work_freeing);
4823}
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
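/*
 * Final teardown of a mem_cgroup.  The structure is reference counted
 * (mem_cgroup_get/put below) because swap_cgroup records can outlive
 * the cgroup; the memory itself is released via free_rcu()/free_work()
 * above so that vfree() never runs from RCU callback context.
 */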
4836static void __mem_cgroup_free(struct mem_cgroup *memcg)
4837{
4838 int node;
4839
4840 mem_cgroup_remove_from_trees(memcg);
4841 free_css_id(&mem_cgroup_subsys, &memcg->css);
4842
4843 for_each_node(node)
4844 free_mem_cgroup_per_zone_info(memcg, node);
4845
4846 free_percpu(memcg->stat);
4847 call_rcu(&memcg->rcu_freeing, free_rcu);
4848}
4849
4850static void mem_cgroup_get(struct mem_cgroup *memcg)
4851{
4852 atomic_inc(&memcg->refcnt);
4853}
4854
4855static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4856{
4857 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4858 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4859 __mem_cgroup_free(memcg);
4860 if (parent)
4861 mem_cgroup_put(parent);
4862 }
4863}
4864
4865static void mem_cgroup_put(struct mem_cgroup *memcg)
4866{
4867 __mem_cgroup_put(memcg, 1);
4868}
4869
4870
4871
4872
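/*
 * Returns the parent memcg, or NULL for the root or for a cgroup whose
 * parent does not use the hierarchy (res.parent is only set up for
 * hierarchical children in mem_cgroup_create()).
 */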
4873struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4874{
4875 if (!memcg->res.parent)
4876 return NULL;
4877 return mem_cgroup_from_res_counter(memcg->res.parent, res);
4878}
4879EXPORT_SYMBOL(parent_mem_cgroup);
4880
4881#ifdef CONFIG_MEMCG_SWAP
4882static void __init enable_swap_cgroup(void)
4883{
4884 if (!mem_cgroup_disabled() && really_do_swap_account)
4885 do_swap_account = 1;
4886}
4887#else
4888static void __init enable_swap_cgroup(void)
4889{
4890}
4891#endif
4892
4893static int mem_cgroup_soft_limit_tree_init(void)
4894{
4895 struct mem_cgroup_tree_per_node *rtpn;
4896 struct mem_cgroup_tree_per_zone *rtpz;
4897 int tmp, node, zone;
4898
4899 for_each_node(node) {
4900 tmp = node;
4901 if (!node_state(node, N_NORMAL_MEMORY))
4902 tmp = -1;
4903 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4904 if (!rtpn)
4905 goto err_cleanup;
4906
4907 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4908
4909 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4910 rtpz = &rtpn->rb_tree_per_zone[zone];
4911 rtpz->rb_root = RB_ROOT;
4912 spin_lock_init(&rtpz->lock);
4913 }
4914 }
4915 return 0;
4916
4917err_cleanup:
4918 for_each_node(node) {
4919 if (!soft_limit_tree.rb_tree_per_node[node])
4920 break;
4921 kfree(soft_limit_tree.rb_tree_per_node[node]);
4922 soft_limit_tree.rb_tree_per_node[node] = NULL;
4923 }
4924 return 1;
4925
4926}
4927
4928static struct cgroup_subsys_state * __ref
4929mem_cgroup_create(struct cgroup *cont)
4930{
4931 struct mem_cgroup *memcg, *parent;
4932 long error = -ENOMEM;
4933 int node;
4934
4935 memcg = mem_cgroup_alloc();
4936 if (!memcg)
4937 return ERR_PTR(error);
4938
4939 for_each_node(node)
4940 if (alloc_mem_cgroup_per_zone_info(memcg, node))
4941 goto free_out;
4942
4943
4944 if (cont->parent == NULL) {
4945 int cpu;
4946 enable_swap_cgroup();
4947 parent = NULL;
4948 if (mem_cgroup_soft_limit_tree_init())
4949 goto free_out;
4950 root_mem_cgroup = memcg;
4951 for_each_possible_cpu(cpu) {
4952 struct memcg_stock_pcp *stock =
4953 &per_cpu(memcg_stock, cpu);
4954 INIT_WORK(&stock->work, drain_local_stock);
4955 }
4956 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4957 } else {
4958 parent = mem_cgroup_from_cont(cont->parent);
4959 memcg->use_hierarchy = parent->use_hierarchy;
4960 memcg->oom_kill_disable = parent->oom_kill_disable;
4961 }
4962
4963 if (parent && parent->use_hierarchy) {
4964 res_counter_init(&memcg->res, &parent->res);
4965 res_counter_init(&memcg->memsw, &parent->memsw);
4966
4967
4968
4969
4970
4971
4972 mem_cgroup_get(parent);
4973 } else {
4974 res_counter_init(&memcg->res, NULL);
4975 res_counter_init(&memcg->memsw, NULL);
4976 }
4977 memcg->last_scanned_node = MAX_NUMNODES;
4978 INIT_LIST_HEAD(&memcg->oom_notify);
4979
4980 if (parent)
4981 memcg->swappiness = mem_cgroup_swappiness(parent);
4982 atomic_set(&memcg->refcnt, 1);
4983 memcg->move_charge_at_immigrate = 0;
4984 mutex_init(&memcg->thresholds_lock);
4985 spin_lock_init(&memcg->move_lock);
4986
4987 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
4988 if (error) {
4989
4990
4991
4992
4993
4994 mem_cgroup_put(memcg);
4995 return ERR_PTR(error);
4996 }
4997 return &memcg->css;
4998free_out:
4999 __mem_cgroup_free(memcg);
5000 return ERR_PTR(error);
5001}
5002
5003static int mem_cgroup_pre_destroy(struct cgroup *cont)
5004{
5005 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5006
5007 return mem_cgroup_force_empty(memcg, false);
5008}
5009
5010static void mem_cgroup_destroy(struct cgroup *cont)
5011{
5012 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5013
5014 kmem_cgroup_destroy(memcg);
5015
5016 mem_cgroup_put(memcg);
5017}
5018
5019#ifdef CONFIG_MMU
5020
5021#define PRECHARGE_COUNT_AT_ONCE 256
5022static int mem_cgroup_do_precharge(unsigned long count)
5023{
5024 int ret = 0;
5025 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5026 struct mem_cgroup *memcg = mc.to;
5027
5028 if (mem_cgroup_is_root(memcg)) {
5029 mc.precharge += count;
5030
5031 return ret;
5032 }
5033
5034 if (count > 1) {
5035 struct res_counter *dummy;
5036
5037
5038
5039
5040
5041
5042 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5043 goto one_by_one;
5044 if (do_swap_account && res_counter_charge(&memcg->memsw,
5045 PAGE_SIZE * count, &dummy)) {
5046 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5047 goto one_by_one;
5048 }
5049 mc.precharge += count;
5050 return ret;
5051 }
5052one_by_one:
5053
5054 while (count--) {
5055 if (signal_pending(current)) {
5056 ret = -EINTR;
5057 break;
5058 }
5059 if (!batch_count--) {
5060 batch_count = PRECHARGE_COUNT_AT_ONCE;
5061 cond_resched();
5062 }
5063 ret = __mem_cgroup_try_charge(NULL,
5064 GFP_KERNEL, 1, &memcg, false);
5065 if (ret)
5066
5067 return ret;
5068 mc.precharge++;
5069 }
5070 return ret;
5071}
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
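/*
 * "Move charge" support: while a task is migrated between cgroups, its
 * page tables are walked and each pte is classified by get_mctgt_type()
 * as MC_TARGET_NONE (nothing to move), MC_TARGET_PAGE (a page charged
 * to mc.from, returned in target->page with a reference held) or
 * MC_TARGET_SWAP (a swap entry accounted to mc.from, returned in
 * target->ent).
 */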
5091union mc_target {
5092 struct page *page;
5093 swp_entry_t ent;
5094};
5095
5096enum mc_target_type {
5097 MC_TARGET_NONE = 0,
5098 MC_TARGET_PAGE,
5099 MC_TARGET_SWAP,
5100};
5101
5102static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5103 unsigned long addr, pte_t ptent)
5104{
5105 struct page *page = vm_normal_page(vma, addr, ptent);
5106
5107 if (!page || !page_mapped(page))
5108 return NULL;
5109 if (PageAnon(page)) {
5110
5111 if (!move_anon())
5112 return NULL;
5113 } else if (!move_file())
5114
5115 return NULL;
5116 if (!get_page_unless_zero(page))
5117 return NULL;
5118
5119 return page;
5120}
5121
5122#ifdef CONFIG_SWAP
5123static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5124 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5125{
5126 struct page *page = NULL;
5127 swp_entry_t ent = pte_to_swp_entry(ptent);
5128
5129 if (!move_anon() || non_swap_entry(ent))
5130 return NULL;
5131
5132
5133
5134
5135 page = find_get_page(&swapper_space, ent.val);
5136 if (do_swap_account)
5137 entry->val = ent.val;
5138
5139 return page;
5140}
5141#else
5142static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5143 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5144{
5145 return NULL;
5146}
5147#endif
5148
5149static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5150 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5151{
5152 struct page *page = NULL;
5153 struct address_space *mapping;
5154 pgoff_t pgoff;
5155
5156 if (!vma->vm_file)
5157 return NULL;
5158 if (!move_file())
5159 return NULL;
5160
5161 mapping = vma->vm_file->f_mapping;
5162 if (pte_none(ptent))
5163 pgoff = linear_page_index(vma, addr);
5164 else
5165 pgoff = pte_to_pgoff(ptent);
5166
5167
5168 page = find_get_page(mapping, pgoff);
5169
5170#ifdef CONFIG_SWAP
5171
5172 if (radix_tree_exceptional_entry(page)) {
5173 swp_entry_t swap = radix_to_swp_entry(page);
5174 if (do_swap_account)
5175 *entry = swap;
5176 page = find_get_page(&swapper_space, swap.val);
5177 }
5178#endif
5179 return page;
5180}
5181
5182static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, union mc_target *target)
5184{
5185 struct page *page = NULL;
5186 struct page_cgroup *pc;
5187 enum mc_target_type ret = MC_TARGET_NONE;
5188 swp_entry_t ent = { .val = 0 };
5189
5190 if (pte_present(ptent))
5191 page = mc_handle_present_pte(vma, addr, ptent);
5192 else if (is_swap_pte(ptent))
5193 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5194 else if (pte_none(ptent) || pte_file(ptent))
5195 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5196
5197 if (!page && !ent.val)
5198 return ret;
5199 if (page) {
5200 pc = lookup_page_cgroup(page);
5201
5202
5203
5204
5205
5206 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5207 ret = MC_TARGET_PAGE;
5208 if (target)
5209 target->page = page;
5210 }
5211 if (!ret || !target)
5212 put_page(page);
5213 }
5214
5215 if (ent.val && !ret &&
5216 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
5217 ret = MC_TARGET_SWAP;
5218 if (target)
5219 target->ent = ent;
5220 }
5221 return ret;
5222}
5223
5224#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5225
5226
5227
5228
5229
5230static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5231 unsigned long addr, pmd_t pmd, union mc_target *target)
5232{
5233 struct page *page = NULL;
5234 struct page_cgroup *pc;
5235 enum mc_target_type ret = MC_TARGET_NONE;
5236
5237 page = pmd_page(pmd);
5238 VM_BUG_ON(!page || !PageHead(page));
5239 if (!move_anon())
5240 return ret;
5241 pc = lookup_page_cgroup(page);
5242 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5243 ret = MC_TARGET_PAGE;
5244 if (target) {
5245 get_page(page);
5246 target->page = page;
5247 }
5248 }
5249 return ret;
5250}
5251#else
5252static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5253 unsigned long addr, pmd_t pmd, union mc_target *target)
5254{
5255 return MC_TARGET_NONE;
5256}
5257#endif
5258
5259static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5260 unsigned long addr, unsigned long end,
5261 struct mm_walk *walk)
5262{
5263 struct vm_area_struct *vma = walk->private;
5264 pte_t *pte;
5265 spinlock_t *ptl;
5266
5267 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5268 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5269 mc.precharge += HPAGE_PMD_NR;
5270 spin_unlock(&vma->vm_mm->page_table_lock);
5271 return 0;
5272 }
5273
5274 if (pmd_trans_unstable(pmd))
5275 return 0;
5276 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5277 for (; addr != end; pte++, addr += PAGE_SIZE)
5278 if (get_mctgt_type(vma, addr, *pte, NULL))
5279 mc.precharge++;
5280 pte_unmap_unlock(pte - 1, ptl);
5281 cond_resched();
5282
5283 return 0;
5284}
5285
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
        unsigned long precharge;
        struct vm_area_struct *vma;

        down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                struct mm_walk mem_cgroup_count_precharge_walk = {
                        .pmd_entry = mem_cgroup_count_precharge_pte_range,
                        .mm = mm,
                        .private = vma,
                };
                if (is_vm_hugetlb_page(vma))
                        continue;
                walk_page_range(vma->vm_start, vma->vm_end,
                                &mem_cgroup_count_precharge_walk);
        }
        up_read(&mm->mmap_sem);

        precharge = mc.precharge;
        mc.precharge = 0;

        return precharge;
}

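/*
 * Reserve (precharge) against mc.to everything that will be moved from @mm,
 * and mark the current task as the one performing the move.
 */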
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
        unsigned long precharge = mem_cgroup_count_precharge(mm);

        VM_BUG_ON(mc.moving_task);
        mc.moving_task = current;
        return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
        struct mem_cgroup *from = mc.from;
        struct mem_cgroup *to = mc.to;

        /* we must uncharge all the leftover precharges from mc.to */
        if (mc.precharge) {
                __mem_cgroup_cancel_charge(mc.to, mc.precharge);
                mc.precharge = 0;
        }
        /*
         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
         * we must uncharge here.
         */
        if (mc.moved_charge) {
                __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                mc.moved_charge = 0;
        }
        /* we must fix up refcounts and charges for the moved swap entries */
        if (mc.moved_swap) {
                /* uncharge swap account from the old cgroup */
                if (!mem_cgroup_is_root(mc.from))
                        res_counter_uncharge(&mc.from->memsw,
                                                PAGE_SIZE * mc.moved_swap);
                __mem_cgroup_put(mc.from, mc.moved_swap);

                if (!mem_cgroup_is_root(mc.to)) {
                        /*
                         * we charged both to->res and to->memsw, so we should
                         * uncharge to->res.
                         */
                        res_counter_uncharge(&mc.to->res,
                                                PAGE_SIZE * mc.moved_swap);
                }
                /* mem_cgroup_get(mc.to) was already done at move time */
                mc.moved_swap = 0;
        }
        memcg_oom_recover(from);
        memcg_oom_recover(to);
        wake_up_all(&mc.waitq);
}

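/*
 * Tear down the move_charge context: clear the moving task, cancel any
 * leftover charges, reset mc.from/mc.to under mc.lock, and end the
 * "move in progress" state on the source memcg.
 */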
static void mem_cgroup_clear_mc(void)
{
        struct mem_cgroup *from = mc.from;

        /*
         * we must clear moving_task before waking up waiters at the end of
         * task migration.
         */
        mc.moving_task = NULL;
        __mem_cgroup_clear_mc();
        spin_lock(&mc.lock);
        mc.from = NULL;
        mc.to = NULL;
        spin_unlock(&mc.lock);
        mem_cgroup_end_move(from);
}

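/*
 * cgroup ->can_attach handler: if move_charge_at_immigrate is set and the
 * task being attached owns its mm, record the source and destination memcgs
 * in "mc" and precharge the destination.  A non-zero return vetoes the
 * attach.
 */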
static int mem_cgroup_can_attach(struct cgroup *cgroup,
                                 struct cgroup_taskset *tset)
{
        struct task_struct *p = cgroup_taskset_first(tset);
        int ret = 0;
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);

        if (memcg->move_charge_at_immigrate) {
                struct mm_struct *mm;
                struct mem_cgroup *from = mem_cgroup_from_task(p);

                VM_BUG_ON(from == memcg);

                mm = get_task_mm(p);
                if (!mm)
                        return 0;
                /* We move charges only when we move the owner of the mm */
                if (mm->owner == p) {
                        VM_BUG_ON(mc.from);
                        VM_BUG_ON(mc.to);
                        VM_BUG_ON(mc.precharge);
                        VM_BUG_ON(mc.moved_charge);
                        VM_BUG_ON(mc.moved_swap);
                        mem_cgroup_start_move(from);
                        spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = memcg;
                        spin_unlock(&mc.lock);
                        /* mc.moving_task is set later, in mem_cgroup_precharge_mc() */

                        ret = mem_cgroup_precharge_mc(mm);
                        if (ret)
                                mem_cgroup_clear_mc();
                }
                mmput(mm);
        }
        return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
                                     struct cgroup_taskset *tset)
{
        mem_cgroup_clear_mc();
}

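/*
 * Move the charges for one pmd range from mc.from to mc.to, consuming the
 * precharge reserved earlier.  Huge pmds are moved as a whole; regular ptes
 * are moved one page (or swap entry) at a time.
 */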
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        int ret = 0;
        struct vm_area_struct *vma = walk->private;
        pte_t *pte;
        spinlock_t *ptl;
        enum mc_target_type target_type;
        union mc_target target;
        struct page *page;
        struct page_cgroup *pc;

        /*
         * We don't take compound_lock() here but no race with splitting thp
         * happens because:
         *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
         *    under splitting, which means there's no concurrent thp split,
         *  - if another thread runs into split_huge_page() just after we
         *    entered this if-block, the thread must wait for the page table
         *    lock to be unlocked in __split_huge_page_splitting(), where the
         *    main part of the thp split is not executed yet.
         */
        if (pmd_trans_huge_lock(pmd, vma) == 1) {
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(&vma->vm_mm->page_table_lock);
                        return 0;
                }
                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
                if (target_type == MC_TARGET_PAGE) {
                        page = target.page;
                        if (!isolate_lru_page(page)) {
                                pc = lookup_page_cgroup(page);
                                if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
                                                        pc, mc.from, mc.to)) {
                                        mc.precharge -= HPAGE_PMD_NR;
                                        mc.moved_charge += HPAGE_PMD_NR;
                                }
                                putback_lru_page(page);
                        }
                        put_page(page);
                }
                spin_unlock(&vma->vm_mm->page_table_lock);
                return 0;
        }

        if (pmd_trans_unstable(pmd))
                return 0;
retry:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                pte_t ptent = *(pte++);
                swp_entry_t ent;

                if (!mc.precharge)
                        break;

                switch (get_mctgt_type(vma, addr, ptent, &target)) {
                case MC_TARGET_PAGE:
                        page = target.page;
                        if (isolate_lru_page(page))
                                goto put;
                        pc = lookup_page_cgroup(page);
                        if (!mem_cgroup_move_account(page, 1, pc,
                                                     mc.from, mc.to)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
                        }
                        putback_lru_page(page);
put:                    /* get_mctgt_type() took a reference on the page */
                        put_page(page);
                        break;
                case MC_TARGET_SWAP:
                        ent = target.ent;
                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
                                mc.precharge--;
                                /* refcounts and charges are fixed up later. */
                                mc.moved_swap++;
                        }
                        break;
                default:
                        break;
                }
        }
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();

        if (addr != end) {
                /*
                 * We have consumed all precharges we got in can_attach().
                 * We try to charge one by one, but don't do any additional
                 * charges to mc.to if we have failed in a charge once in the
                 * attach() phase.
                 */
                ret = mem_cgroup_do_precharge(1);
                if (!ret)
                        goto retry;
        }

        return ret;
}

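/*
 * Walk every VMA of @mm and move the charges found there from mc.from to
 * mc.to.  Called from the cgroup attach path once the precharge has been
 * set up by mem_cgroup_can_attach().
 */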
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        lru_add_drain_all();
retry:
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                /*
                 * Someone who is holding the mmap_sem may be waiting in
                 * waitq.  So we cancel all extra charges, wake up all
                 * waiters, and retry.  Because we cancel precharges, we
                 * might not be able to move enough charges, but moving
                 * charge is a best-effort feature anyway, so it wouldn't be
                 * a big problem.
                 */
                __mem_cgroup_clear_mc();
                cond_resched();
                goto retry;
        }
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                int ret;
                struct mm_walk mem_cgroup_move_charge_walk = {
                        .pmd_entry = mem_cgroup_move_charge_pte_range,
                        .mm = mm,
                        .private = vma,
                };
                if (is_vm_hugetlb_page(vma))
                        continue;
                ret = walk_page_range(vma->vm_start, vma->vm_end,
                                      &mem_cgroup_move_charge_walk);
                if (ret)
                        /*
                         * means we have consumed all precharges and failed
                         * in doing an additional charge. Just abandon here.
                         */
                        break;
        }
        up_read(&mm->mmap_sem);
}

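/*
 * cgroup ->attach handler: perform the actual charge moving for the attached
 * task's mm, then drop the move_charge context set up in ->can_attach.
 */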
static void mem_cgroup_move_task(struct cgroup *cont,
                                 struct cgroup_taskset *tset)
{
        struct task_struct *p = cgroup_taskset_first(tset);
        struct mm_struct *mm = get_task_mm(p);

        if (mm) {
                if (mc.to)
                        mem_cgroup_move_charge(mm);
                mmput(mm);
        }
        if (mc.to)
                mem_cgroup_clear_mc();
}
#else
static int mem_cgroup_can_attach(struct cgroup *cgroup,
                                 struct cgroup_taskset *tset)
{
        return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
                                     struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(struct cgroup *cont,
                                 struct cgroup_taskset *tset)
{
}
#endif

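/*
 * The memory controller's cgroup_subsys operations.  The can_attach,
 * cancel_attach and attach callbacks implement "move charge at task
 * migration" above.
 */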
struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .pre_destroy = mem_cgroup_pre_destroy,
        .destroy = mem_cgroup_destroy,
        .can_attach = mem_cgroup_can_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .attach = mem_cgroup_move_task,
        .base_cftypes = mem_cgroup_files,
        .early_init = 0,
        .use_id = 1,
        .__DEPRECATED_clear_css_refs = true,
};

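/*
 * Parse the "swapaccount=" boot parameter: "1" enables and "0" disables
 * accounting of swap (memsw) usage.
 */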
#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{
        /* "1" enables, "0" disables; anything else keeps the default */
        if (!strcmp(s, "1"))
                really_do_swap_account = 1;
        else if (!strcmp(s, "0"))
                really_do_swap_account = 0;
        return 1;
}
__setup("swapaccount=", enable_swap_account);

#endif /* CONFIG_MEMCG_SWAP */
