#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
int do_swap_account __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		(0)
#endif
75
76
77
78
79
80
81
82
83#define THRESHOLDS_EVENTS_THRESH (7)
84#define SOFTLIMIT_EVENTS_THRESH (10)
85
86
87
88
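/*
 * Statistics for a memory cgroup, kept in per-cpu counters (see
 * struct mem_cgroup_stat_cpu below).  Indices before MEM_CGROUP_STAT_DATA
 * are real statistics; the remaining slots are bookkeeping values used for
 * event rate limiting (EVENTS) and for detecting that charges are being
 * moved to another group (ON_MOVE).
 */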
89enum mem_cgroup_stat_index {
90
91
92
93 MEM_CGROUP_STAT_CACHE,
94 MEM_CGROUP_STAT_RSS,
95 MEM_CGROUP_STAT_FILE_MAPPED,
96 MEM_CGROUP_STAT_PGPGIN_COUNT,
97 MEM_CGROUP_STAT_PGPGOUT_COUNT,
98 MEM_CGROUP_STAT_SWAPOUT,
99 MEM_CGROUP_STAT_DATA,
100
101 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
102 MEM_CGROUP_ON_MOVE,
103
104 MEM_CGROUP_STAT_NSTATS,
105};
106
107struct mem_cgroup_stat_cpu {
108 s64 count[MEM_CGROUP_STAT_NSTATS];
109};
110
111
112
113
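/*
 * Per-zone state of a memory cgroup: private LRU lists with per-list page
 * counts, reclaim statistics, and the node linking this zone into the
 * soft-limit tree when usage exceeds the soft limit.
 */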
114struct mem_cgroup_per_zone {
115
116
117
118 struct list_head lists[NR_LRU_LISTS];
119 unsigned long count[NR_LRU_LISTS];
120
121 struct zone_reclaim_stat reclaim_stat;
122 struct rb_node tree_node;
123 unsigned long long usage_in_excess;
124
125 bool on_tree;
126 struct mem_cgroup *mem;
127
128};
129
130#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
131
132struct mem_cgroup_per_node {
133 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
134};
135
136struct mem_cgroup_lru_info {
137 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
138};
139
140
141
142
143
144
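/*
 * Cgroups above their soft limit are kept in a per-node, per-zone
 * red-black tree ordered by usage_in_excess, so soft-limit reclaim can
 * always pick the group that is furthest over its limit.
 */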
145struct mem_cgroup_tree_per_zone {
146 struct rb_root rb_root;
147 spinlock_t lock;
148};
149
150struct mem_cgroup_tree_per_node {
151 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
152};
153
154struct mem_cgroup_tree {
155 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
156};
157
158static struct mem_cgroup_tree soft_limit_tree __read_mostly;
159
160struct mem_cgroup_threshold {
161 struct eventfd_ctx *eventfd;
162 u64 threshold;
163};
164
165
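/*
 * Array of usage thresholds registered via eventfd for this cgroup,
 * ordered by threshold value; current_threshold caches the index of the
 * last threshold at or below the current usage so that crossings can be
 * detected without scanning the whole array.
 */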
166struct mem_cgroup_threshold_ary {
167
168 int current_threshold;
169
170 unsigned int size;
171
172 struct mem_cgroup_threshold entries[0];
173};
174
175struct mem_cgroup_thresholds {
176
177 struct mem_cgroup_threshold_ary *primary;
178
179
180
181
182
183 struct mem_cgroup_threshold_ary *spare;
184};
185
186
187struct mem_cgroup_eventfd_list {
188 struct list_head list;
189 struct eventfd_ctx *eventfd;
190};
191
192static void mem_cgroup_threshold(struct mem_cgroup *mem);
193static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
194
195
196
197
198
199
200
201
202
203
204
205
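/*
 * The memory controller data structure.  Accounting is done with two
 * resource counters: "res" for memory and "memsw" for memory + swap.
 * The rest tracks per-zone LRU state, hierarchy and reclaim parameters,
 * OOM handling, eventfd thresholds and per-cpu statistics.
 */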
206struct mem_cgroup {
207 struct cgroup_subsys_state css;
208
209
210
211 struct res_counter res;
212
213
214
215 struct res_counter memsw;
216
217
218
219
220 struct mem_cgroup_lru_info info;
221
222
223
224
225 spinlock_t reclaim_param_lock;
226
227
228
229
230
231 int last_scanned_child;
232
233
234
235 bool use_hierarchy;
236 atomic_t oom_lock;
237 atomic_t refcnt;
238
239 unsigned int swappiness;
240
241 int oom_kill_disable;
242
243
244 bool memsw_is_minimum;
245
246
247 struct mutex thresholds_lock;
248
249
250 struct mem_cgroup_thresholds thresholds;
251
252
253 struct mem_cgroup_thresholds memsw_thresholds;
254
255
256 struct list_head oom_notify;
257
258
259
260
261
262 unsigned long move_charge_at_immigrate;
263
264
265
266 struct mem_cgroup_stat_cpu *stat;
267
268
269
270
271 struct mem_cgroup_stat_cpu nocpu_base;
272 spinlock_t pcp_counter_lock;
273};
274
275
276
277
278
279
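/*
 * Support for moving charges when a task migrates between cgroups
 * ("move_charge_at_immigrate").  Only one move can be in progress at any
 * time; its state lives in the global "mc" structure below, and charge
 * paths wait on mc.waitq while a move is running.
 */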
280enum move_type {
281 MOVE_CHARGE_TYPE_ANON,
282 MOVE_CHARGE_TYPE_FILE,
283 NR_MOVE_TYPE,
284};
285
286
287static struct move_charge_struct {
288 spinlock_t lock;
289 struct mem_cgroup *from;
290 struct mem_cgroup *to;
291 unsigned long precharge;
292 unsigned long moved_charge;
293 unsigned long moved_swap;
294 struct task_struct *moving_task;
295 wait_queue_head_t waitq;
296} mc = {
297 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
298 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
299};
300
301static bool move_anon(void)
302{
303 return test_bit(MOVE_CHARGE_TYPE_ANON,
304 &mc.to->move_charge_at_immigrate);
305}
306
307static bool move_file(void)
308{
309 return test_bit(MOVE_CHARGE_TYPE_FILE,
310 &mc.to->move_charge_at_immigrate);
311}
312
313
314
315
316
317#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
318#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
319
320enum charge_type {
321 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
322 MEM_CGROUP_CHARGE_TYPE_MAPPED,
323 MEM_CGROUP_CHARGE_TYPE_SHMEM,
324 MEM_CGROUP_CHARGE_TYPE_FORCE,
325 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
326 MEM_CGROUP_CHARGE_TYPE_DROP,
327 NR_CHARGE_TYPE,
328};
329
330
331#define PCGF_CACHE (1UL << PCG_CACHE)
332#define PCGF_USED (1UL << PCG_USED)
333#define PCGF_LOCK (1UL << PCG_LOCK)
334
335#define PCGF_ACCT (1UL << PCG_ACCT)
336
337
338#define _MEM (0)
339#define _MEMSWAP (1)
340#define _OOM_TYPE (2)
341#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
342#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
343#define MEMFILE_ATTR(val) ((val) & 0xffff)
344
345#define OOM_CONTROL (0)
346
347
348
349
350#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
351#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
352#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
353#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
354#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
355#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
356
357static void mem_cgroup_get(struct mem_cgroup *mem);
358static void mem_cgroup_put(struct mem_cgroup *mem);
359static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
360static void drain_all_stock_async(void);
361
362static struct mem_cgroup_per_zone *
363mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
364{
365 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
366}
367
368struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
369{
370 return &mem->css;
371}
372
373static struct mem_cgroup_per_zone *
374page_cgroup_zoneinfo(struct page_cgroup *pc)
375{
376 struct mem_cgroup *mem = pc->mem_cgroup;
377 int nid = page_cgroup_nid(pc);
378 int zid = page_cgroup_zid(pc);
379
380 if (!mem)
381 return NULL;
382
383 return mem_cgroup_zoneinfo(mem, nid, zid);
384}
385
386static struct mem_cgroup_tree_per_zone *
387soft_limit_tree_node_zone(int nid, int zid)
388{
389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
390}
391
392static struct mem_cgroup_tree_per_zone *
393soft_limit_tree_from_page(struct page *page)
394{
395 int nid = page_to_nid(page);
396 int zid = page_zonenum(page);
397
398 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
399}
400
401static void
402__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
403 struct mem_cgroup_per_zone *mz,
404 struct mem_cgroup_tree_per_zone *mctz,
405 unsigned long long new_usage_in_excess)
406{
407 struct rb_node **p = &mctz->rb_root.rb_node;
408 struct rb_node *parent = NULL;
409 struct mem_cgroup_per_zone *mz_node;
410
411 if (mz->on_tree)
412 return;
413
414 mz->usage_in_excess = new_usage_in_excess;
415 if (!mz->usage_in_excess)
416 return;
417 while (*p) {
418 parent = *p;
419 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
420 tree_node);
421 if (mz->usage_in_excess < mz_node->usage_in_excess)
422 p = &(*p)->rb_left;
423
424
425
426
427 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
428 p = &(*p)->rb_right;
429 }
430 rb_link_node(&mz->tree_node, parent, p);
431 rb_insert_color(&mz->tree_node, &mctz->rb_root);
432 mz->on_tree = true;
433}
434
435static void
436__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
437 struct mem_cgroup_per_zone *mz,
438 struct mem_cgroup_tree_per_zone *mctz)
439{
440 if (!mz->on_tree)
441 return;
442 rb_erase(&mz->tree_node, &mctz->rb_root);
443 mz->on_tree = false;
444}
445
446static void
447mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
448 struct mem_cgroup_per_zone *mz,
449 struct mem_cgroup_tree_per_zone *mctz)
450{
451 spin_lock(&mctz->lock);
452 __mem_cgroup_remove_exceeded(mem, mz, mctz);
453 spin_unlock(&mctz->lock);
454}
455
456
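/*
 * Update the soft-limit tree for the zone this page belongs to.  The
 * charged memcg and each of its ancestors is reinserted at the position
 * matching its current excess over the soft limit, or removed when it is
 * on the tree but no longer exceeds its limit.
 */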
457static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
458{
459 unsigned long long excess;
460 struct mem_cgroup_per_zone *mz;
461 struct mem_cgroup_tree_per_zone *mctz;
462 int nid = page_to_nid(page);
463 int zid = page_zonenum(page);
464 mctz = soft_limit_tree_from_page(page);
465
466
467
468
469
470 for (; mem; mem = parent_mem_cgroup(mem)) {
471 mz = mem_cgroup_zoneinfo(mem, nid, zid);
472 excess = res_counter_soft_limit_excess(&mem->res);
473
474
475
476
477 if (excess || mz->on_tree) {
478 spin_lock(&mctz->lock);
479
480 if (mz->on_tree)
481 __mem_cgroup_remove_exceeded(mem, mz, mctz);
482
483
484
485
486 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
487 spin_unlock(&mctz->lock);
488 }
489 }
490}
491
492static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
493{
494 int node, zone;
495 struct mem_cgroup_per_zone *mz;
496 struct mem_cgroup_tree_per_zone *mctz;
497
498 for_each_node_state(node, N_POSSIBLE) {
499 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
500 mz = mem_cgroup_zoneinfo(mem, node, zone);
501 mctz = soft_limit_tree_node_zone(node, zone);
502 mem_cgroup_remove_exceeded(mem, mz, mctz);
503 }
504 }
505}
506
507static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
508{
509 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
510}
511
512static struct mem_cgroup_per_zone *
513__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
514{
515 struct rb_node *rightmost = NULL;
516 struct mem_cgroup_per_zone *mz;
517
518retry:
519 mz = NULL;
520 rightmost = rb_last(&mctz->rb_root);
521 if (!rightmost)
522 goto done;
523
524 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
525
526
527
528
529
530 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
531 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
532 !css_tryget(&mz->mem->css))
533 goto retry;
534done:
535 return mz;
536}
537
538static struct mem_cgroup_per_zone *
539mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
540{
541 struct mem_cgroup_per_zone *mz;
542
543 spin_lock(&mctz->lock);
544 mz = __mem_cgroup_largest_soft_limit_node(mctz);
545 spin_unlock(&mctz->lock);
546 return mz;
547}
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
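/*
 * Sum one statistics counter over all online CPUs.  Counts accumulated on
 * CPUs that have since gone offline are folded into nocpu_base by the
 * hotplug callback and added back here under pcp_counter_lock.
 */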
568static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
569 enum mem_cgroup_stat_index idx)
570{
571 int cpu;
572 s64 val = 0;
573
574 get_online_cpus();
575 for_each_online_cpu(cpu)
576 val += per_cpu(mem->stat->count[idx], cpu);
577#ifdef CONFIG_HOTPLUG_CPU
578 spin_lock(&mem->pcp_counter_lock);
579 val += mem->nocpu_base.count[idx];
580 spin_unlock(&mem->pcp_counter_lock);
581#endif
582 put_online_cpus();
583 return val;
584}
585
586static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
587{
588 s64 ret;
589
590 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
591 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
592 return ret;
593}
594
595static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
596 bool charge)
597{
598 int val = (charge) ? 1 : -1;
599 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
600}
601
602static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
603 bool file, int nr_pages)
604{
605 preempt_disable();
606
607 if (file)
608 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
609 else
610 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
611
612
613 if (nr_pages > 0)
614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
615 else {
616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
617 nr_pages = -nr_pages;
618 }
619
620 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
621
622 preempt_enable();
623}
624
625static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
626 enum lru_list idx)
627{
628 int nid, zid;
629 struct mem_cgroup_per_zone *mz;
630 u64 total = 0;
631
632 for_each_online_node(nid)
633 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
634 mz = mem_cgroup_zoneinfo(mem, nid, zid);
635 total += MEM_CGROUP_ZSTAT(mz, idx);
636 }
637 return total;
638}
639
640static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
641{
642 s64 val;
643
644 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
645
646 return !(val & ((1 << event_mask_shift) - 1));
647}
648
649
650
651
652
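/*
 * Rate-limited event processing, driven by the MEM_CGROUP_EVENTS counter:
 * roughly every 2^THRESHOLDS_EVENTS_THRESH (128) charge/uncharge events
 * the eventfd thresholds are rechecked, and every
 * 2^SOFTLIMIT_EVENTS_THRESH (1024) events the soft-limit tree is updated.
 */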
653static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
654{
655
656 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
657 mem_cgroup_threshold(mem);
658 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
659 mem_cgroup_update_tree(mem, page);
660 }
661}
662
663static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
664{
665 return container_of(cgroup_subsys_state(cont,
666 mem_cgroup_subsys_id), struct mem_cgroup,
667 css);
668}
669
670struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
671{
672
673
674
675
676
677 if (unlikely(!p))
678 return NULL;
679
680 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
681 struct mem_cgroup, css);
682}
683
684static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
685{
686 struct mem_cgroup *mem = NULL;
687
688 if (!mm)
689 return NULL;
690
691
692
693
694
695 rcu_read_lock();
696 do {
697 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
698 if (unlikely(!mem))
699 break;
700 } while (!css_tryget(&mem->css));
701 rcu_read_unlock();
702 return mem;
703}
704
705
706static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
707{
708 struct cgroup_subsys_state *css;
709 int found;
710
711 if (!mem)
712 return root_mem_cgroup;
713 if (!mem->use_hierarchy) {
714 if (css_tryget(&mem->css))
715 return mem;
716 return NULL;
717 }
718 rcu_read_lock();
719
720
721
722
723 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
724 if (css && css_tryget(css))
725 mem = container_of(css, struct mem_cgroup, css);
726 else
727 mem = NULL;
728 rcu_read_unlock();
729 return mem;
730}
731
732static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
733 struct mem_cgroup *root,
734 bool cond)
735{
736 int nextid = css_id(&iter->css) + 1;
737 int found;
738 int hierarchy_used;
739 struct cgroup_subsys_state *css;
740
741 hierarchy_used = iter->use_hierarchy;
742
743 css_put(&iter->css);
744
745 if (!cond || (root && !hierarchy_used))
746 return NULL;
747
748 if (!root)
749 root = root_mem_cgroup;
750
751 do {
752 iter = NULL;
753 rcu_read_lock();
754
755 css = css_get_next(&mem_cgroup_subsys, nextid,
756 &root->css, &found);
757 if (css && css_tryget(css))
758 iter = container_of(css, struct mem_cgroup, css);
759 rcu_read_unlock();
760
761 nextid = found + 1;
762 } while (css && !iter);
763
764 return iter;
765}
766
767
768
769
770
771#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
772 for (iter = mem_cgroup_start_loop(root);\
773 iter != NULL;\
774 iter = mem_cgroup_get_next(iter, root, cond))
775
776#define for_each_mem_cgroup_tree(iter, root) \
777 for_each_mem_cgroup_tree_cond(iter, root, true)
778
779#define for_each_mem_cgroup_all(iter) \
780 for_each_mem_cgroup_tree_cond(iter, NULL, true)
781
782
783static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
784{
785 return (mem == root_mem_cgroup);
786}
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
803{
804 struct page_cgroup *pc;
805 struct mem_cgroup_per_zone *mz;
806
807 if (mem_cgroup_disabled())
808 return;
809 pc = lookup_page_cgroup(page);
810
811 if (!TestClearPageCgroupAcctLRU(pc))
812 return;
813 VM_BUG_ON(!pc->mem_cgroup);
814
815
816
817
818 mz = page_cgroup_zoneinfo(pc);
819
820 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
821 if (mem_cgroup_is_root(pc->mem_cgroup))
822 return;
823 VM_BUG_ON(list_empty(&pc->lru));
824 list_del_init(&pc->lru);
825}
826
827void mem_cgroup_del_lru(struct page *page)
828{
829 mem_cgroup_del_lru_list(page, page_lru(page));
830}
831
832void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
833{
834 struct mem_cgroup_per_zone *mz;
835 struct page_cgroup *pc;
836
837 if (mem_cgroup_disabled())
838 return;
839
840 pc = lookup_page_cgroup(page);
841
842 if (!PageCgroupUsed(pc))
843 return;
844
845 smp_rmb();
846 if (mem_cgroup_is_root(pc->mem_cgroup))
847 return;
848 mz = page_cgroup_zoneinfo(pc);
849 list_move(&pc->lru, &mz->lists[lru]);
850}
851
852void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
853{
854 struct page_cgroup *pc;
855 struct mem_cgroup_per_zone *mz;
856
857 if (mem_cgroup_disabled())
858 return;
859 pc = lookup_page_cgroup(page);
860 VM_BUG_ON(PageCgroupAcctLRU(pc));
861 if (!PageCgroupUsed(pc))
862 return;
863
864 smp_rmb();
865 mz = page_cgroup_zoneinfo(pc);
866
867 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
868 SetPageCgroupAcctLRU(pc);
869 if (mem_cgroup_is_root(pc->mem_cgroup))
870 return;
871 list_add(&pc->lru, &mz->lists[lru]);
872}
873
874
875
876
877
878
879
880
881static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
882{
883 unsigned long flags;
884 struct zone *zone = page_zone(page);
885 struct page_cgroup *pc = lookup_page_cgroup(page);
886
887 spin_lock_irqsave(&zone->lru_lock, flags);
888
889
890
891
892 if (!PageCgroupUsed(pc))
893 mem_cgroup_del_lru_list(page, page_lru(page));
894 spin_unlock_irqrestore(&zone->lru_lock, flags);
895}
896
897static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
898{
899 unsigned long flags;
900 struct zone *zone = page_zone(page);
901 struct page_cgroup *pc = lookup_page_cgroup(page);
902
903 spin_lock_irqsave(&zone->lru_lock, flags);
904
905 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
906 mem_cgroup_add_lru_list(page, page_lru(page));
907 spin_unlock_irqrestore(&zone->lru_lock, flags);
908}
909
910
911void mem_cgroup_move_lists(struct page *page,
912 enum lru_list from, enum lru_list to)
913{
914 if (mem_cgroup_disabled())
915 return;
916 mem_cgroup_del_lru_list(page, from);
917 mem_cgroup_add_lru_list(page, to);
918}
919
920int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
921{
922 int ret;
923 struct mem_cgroup *curr = NULL;
924 struct task_struct *p;
925
926 p = find_lock_task_mm(task);
927 if (!p)
928 return 0;
929 curr = try_get_mem_cgroup_from_mm(p->mm);
930 task_unlock(p);
931 if (!curr)
932 return 0;
933
934
935
936
937
938
939 if (mem->use_hierarchy)
940 ret = css_is_ancestor(&curr->css, &mem->css);
941 else
942 ret = (curr == mem);
943 css_put(&curr->css);
944 return ret;
945}
946
947static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
948{
949 unsigned long active;
950 unsigned long inactive;
951 unsigned long gb;
952 unsigned long inactive_ratio;
953
954 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
955 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
956
957 gb = (inactive + active) >> (30 - PAGE_SHIFT);
958 if (gb)
959 inactive_ratio = int_sqrt(10 * gb);
960 else
961 inactive_ratio = 1;
962
963 if (present_pages) {
964 present_pages[0] = inactive;
965 present_pages[1] = active;
966 }
967
968 return inactive_ratio;
969}
970
971int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
972{
973 unsigned long active;
974 unsigned long inactive;
975 unsigned long present_pages[2];
976 unsigned long inactive_ratio;
977
978 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
979
980 inactive = present_pages[0];
981 active = present_pages[1];
982
983 if (inactive * inactive_ratio < active)
984 return 1;
985
986 return 0;
987}
988
989int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
990{
991 unsigned long active;
992 unsigned long inactive;
993
994 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
995 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
996
997 return (active > inactive);
998}
999
1000unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
1001 struct zone *zone,
1002 enum lru_list lru)
1003{
1004 int nid = zone_to_nid(zone);
1005 int zid = zone_idx(zone);
1006 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1007
1008 return MEM_CGROUP_ZSTAT(mz, lru);
1009}
1010
1011struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1012 struct zone *zone)
1013{
1014 int nid = zone_to_nid(zone);
1015 int zid = zone_idx(zone);
1016 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1017
1018 return &mz->reclaim_stat;
1019}
1020
1021struct zone_reclaim_stat *
1022mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1023{
1024 struct page_cgroup *pc;
1025 struct mem_cgroup_per_zone *mz;
1026
1027 if (mem_cgroup_disabled())
1028 return NULL;
1029
1030 pc = lookup_page_cgroup(page);
1031 if (!PageCgroupUsed(pc))
1032 return NULL;
1033
1034 smp_rmb();
1035 mz = page_cgroup_zoneinfo(pc);
1036 if (!mz)
1037 return NULL;
1038
1039 return &mz->reclaim_stat;
1040}
1041
1042unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1043 struct list_head *dst,
1044 unsigned long *scanned, int order,
1045 int mode, struct zone *z,
1046 struct mem_cgroup *mem_cont,
1047 int active, int file)
1048{
1049 unsigned long nr_taken = 0;
1050 struct page *page;
1051 unsigned long scan;
1052 LIST_HEAD(pc_list);
1053 struct list_head *src;
1054 struct page_cgroup *pc, *tmp;
1055 int nid = zone_to_nid(z);
1056 int zid = zone_idx(z);
1057 struct mem_cgroup_per_zone *mz;
1058 int lru = LRU_FILE * file + active;
1059 int ret;
1060
1061 BUG_ON(!mem_cont);
1062 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1063 src = &mz->lists[lru];
1064
1065 scan = 0;
1066 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1067 if (scan >= nr_to_scan)
1068 break;
1069
1070 page = pc->page;
1071 if (unlikely(!PageCgroupUsed(pc)))
1072 continue;
1073 if (unlikely(!PageLRU(page)))
1074 continue;
1075
1076 scan++;
1077 ret = __isolate_lru_page(page, mode, file);
1078 switch (ret) {
1079 case 0:
1080 list_move(&page->lru, dst);
1081 mem_cgroup_del_lru(page);
1082 nr_taken += hpage_nr_pages(page);
1083 break;
1084 case -EBUSY:
1085
1086 mem_cgroup_rotate_lru_list(page, page_lru(page));
1087 break;
1088 default:
1089 break;
1090 }
1091 }
1092
1093 *scanned = scan;
1094
1095 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1096 0, 0, 0, mode);
1097
1098 return nr_taken;
1099}
1100
1101#define mem_cgroup_from_res_counter(counter, member) \
1102 container_of(counter, struct mem_cgroup, member)
1103
1104static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1105{
1106 if (do_swap_account) {
1107 if (res_counter_check_under_limit(&mem->res) &&
1108 res_counter_check_under_limit(&mem->memsw))
1109 return true;
1110 } else
1111 if (res_counter_check_under_limit(&mem->res))
1112 return true;
1113 return false;
1114}
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
1125{
1126 if (!res_counter_check_margin(&mem->res, bytes))
1127 return false;
1128 if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
1129 return false;
1130 return true;
1131}
1132
1133static unsigned int get_swappiness(struct mem_cgroup *memcg)
1134{
1135 struct cgroup *cgrp = memcg->css.cgroup;
1136 unsigned int swappiness;
1137
1138
1139 if (cgrp->parent == NULL)
1140 return vm_swappiness;
1141
1142 spin_lock(&memcg->reclaim_param_lock);
1143 swappiness = memcg->swappiness;
1144 spin_unlock(&memcg->reclaim_param_lock);
1145
1146 return swappiness;
1147}
1148
1149static void mem_cgroup_start_move(struct mem_cgroup *mem)
1150{
1151 int cpu;
1152
1153 get_online_cpus();
1154 spin_lock(&mem->pcp_counter_lock);
1155 for_each_online_cpu(cpu)
1156 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1157 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1158 spin_unlock(&mem->pcp_counter_lock);
1159 put_online_cpus();
1160
1161 synchronize_rcu();
1162}
1163
1164static void mem_cgroup_end_move(struct mem_cgroup *mem)
1165{
1166 int cpu;
1167
1168 if (!mem)
1169 return;
1170 get_online_cpus();
1171 spin_lock(&mem->pcp_counter_lock);
1172 for_each_online_cpu(cpu)
1173 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1174 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1175 spin_unlock(&mem->pcp_counter_lock);
1176 put_online_cpus();
1177}
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
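/*
 * mem_cgroup_start_move() raises the per-cpu MEM_CGROUP_ON_MOVE counters,
 * so a non-zero value here means charges may currently be migrating away
 * from this memcg ("stealed") and per-page statistics must be updated
 * under the page_cgroup move lock rather than relying on PCG_USED alone.
 */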
1190static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1191{
1192 VM_BUG_ON(!rcu_read_lock_held());
1193 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1194}
1195
1196static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1197{
1198 struct mem_cgroup *from;
1199 struct mem_cgroup *to;
1200 bool ret = false;
1201
1202
1203
1204
1205 spin_lock(&mc.lock);
1206 from = mc.from;
1207 to = mc.to;
1208 if (!from)
1209 goto unlock;
1210 if (from == mem || to == mem
1211 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1212 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1213 ret = true;
1214unlock:
1215 spin_unlock(&mc.lock);
1216 return ret;
1217}
1218
1219static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1220{
1221 if (mc.moving_task && current != mc.moving_task) {
1222 if (mem_cgroup_under_move(mem)) {
1223 DEFINE_WAIT(wait);
1224 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1225
1226 if (mc.moving_task)
1227 schedule();
1228 finish_wait(&mc.waitq, &wait);
1229 return true;
1230 }
1231 }
1232 return false;
1233}
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1244{
1245 struct cgroup *task_cgrp;
1246 struct cgroup *mem_cgrp;
1247
1248
1249
1250
1251
1252 static char memcg_name[PATH_MAX];
1253 int ret;
1254
1255 if (!memcg || !p)
1256 return;
1257
1258
1259 rcu_read_lock();
1260
1261 mem_cgrp = memcg->css.cgroup;
1262 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1263
1264 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1265 if (ret < 0) {
1266
1267
1268
1269
1270 rcu_read_unlock();
1271 goto done;
1272 }
1273 rcu_read_unlock();
1274
1275 printk(KERN_INFO "Task in %s killed", memcg_name);
1276
1277 rcu_read_lock();
1278 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1279 if (ret < 0) {
1280 rcu_read_unlock();
1281 goto done;
1282 }
1283 rcu_read_unlock();
1284
1285
1286
1287
1288 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1289done:
1290
1291 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1292 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1293 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1294 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1295 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1296 "failcnt %llu\n",
1297 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1298 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1299 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1300}
1301
1302
1303
1304
1305
1306static int mem_cgroup_count_children(struct mem_cgroup *mem)
1307{
1308 int num = 0;
1309 struct mem_cgroup *iter;
1310
1311 for_each_mem_cgroup_tree(iter, mem)
1312 num++;
1313 return num;
1314}
1315
1316
1317
1318
1319u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1320{
1321 u64 limit;
1322 u64 memsw;
1323
1324 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1325 limit += total_swap_pages << PAGE_SHIFT;
1326
1327 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1328
1329
1330
1331
1332 return min(limit, memsw);
1333}
1334
1335
1336
1337
1338
1339
1340static struct mem_cgroup *
1341mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1342{
1343 struct mem_cgroup *ret = NULL;
1344 struct cgroup_subsys_state *css;
1345 int nextid, found;
1346
1347 if (!root_mem->use_hierarchy) {
1348 css_get(&root_mem->css);
1349 ret = root_mem;
1350 }
1351
1352 while (!ret) {
1353 rcu_read_lock();
1354 nextid = root_mem->last_scanned_child + 1;
1355 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1356 &found);
1357 if (css && css_tryget(css))
1358 ret = container_of(css, struct mem_cgroup, css);
1359
1360 rcu_read_unlock();
1361
1362 spin_lock(&root_mem->reclaim_param_lock);
1363 if (!css) {
1364
1365 root_mem->last_scanned_child = 0;
1366 } else
1367 root_mem->last_scanned_child = found;
1368 spin_unlock(&root_mem->reclaim_param_lock);
1369 }
1370
1371 return ret;
1372}
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
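/*
 * Reclaim pages from the hierarchy below root_mem.  Victim groups are
 * picked round-robin by mem_cgroup_select_victim().  reclaim_options can
 * request no-swap reclaim, "shrink" mode (a single victim pass), or
 * soft-limit reclaim restricted to one zone.  Returns the number of pages
 * reclaimed, plus one once the group is back under its limit.
 */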
1386static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1387 struct zone *zone,
1388 gfp_t gfp_mask,
1389 unsigned long reclaim_options)
1390{
1391 struct mem_cgroup *victim;
1392 int ret, total = 0;
1393 int loop = 0;
1394 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1395 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1396 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1397 unsigned long excess = mem_cgroup_get_excess(root_mem);
1398
1399
1400 if (root_mem->memsw_is_minimum)
1401 noswap = true;
1402
1403 while (1) {
1404 victim = mem_cgroup_select_victim(root_mem);
1405 if (victim == root_mem) {
1406 loop++;
1407 if (loop >= 1)
1408 drain_all_stock_async();
1409 if (loop >= 2) {
1410
1411
1412
1413
1414
1415 if (!check_soft || !total) {
1416 css_put(&victim->css);
1417 break;
1418 }
1419
1420
1421
1422
1423
1424
1425 if (total >= (excess >> 2) ||
1426 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1427 css_put(&victim->css);
1428 break;
1429 }
1430 }
1431 }
1432 if (!mem_cgroup_local_usage(victim)) {
1433
1434 css_put(&victim->css);
1435 continue;
1436 }
1437
1438 if (check_soft)
1439 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1440 noswap, get_swappiness(victim), zone);
1441 else
1442 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1443 noswap, get_swappiness(victim));
1444 css_put(&victim->css);
1445
1446
1447
1448
1449
1450 if (shrink)
1451 return ret;
1452 total += ret;
1453 if (check_soft) {
1454 if (res_counter_check_under_soft_limit(&root_mem->res))
1455 return total;
1456 } else if (mem_cgroup_check_under_limit(root_mem))
1457 return 1 + total;
1458 }
1459 return total;
1460}
1461
1462
1463
1464
1465
1466static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1467{
1468 int x, lock_count = 0;
1469 struct mem_cgroup *iter;
1470
1471 for_each_mem_cgroup_tree(iter, mem) {
1472 x = atomic_inc_return(&iter->oom_lock);
1473 lock_count = max(x, lock_count);
1474 }
1475
1476 if (lock_count == 1)
1477 return true;
1478 return false;
1479}
1480
1481static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1482{
1483 struct mem_cgroup *iter;
1484
1485
1486
1487
1488
1489
1490 for_each_mem_cgroup_tree(iter, mem)
1491 atomic_add_unless(&iter->oom_lock, -1, 0);
1492 return 0;
1493}
1494
1495
1496static DEFINE_MUTEX(memcg_oom_mutex);
1497static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1498
1499struct oom_wait_info {
1500 struct mem_cgroup *mem;
1501 wait_queue_t wait;
1502};
1503
1504static int memcg_oom_wake_function(wait_queue_t *wait,
1505 unsigned mode, int sync, void *arg)
1506{
1507 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1508 struct oom_wait_info *oom_wait_info;
1509
1510 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1511
1512 if (oom_wait_info->mem == wake_mem)
1513 goto wakeup;
1514
1515 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1516 return 0;
1517
1518
1519
1520
1521 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1522 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1523 return 0;
1524
1525wakeup:
1526 return autoremove_wake_function(wait, mode, sync, arg);
1527}
1528
1529static void memcg_wakeup_oom(struct mem_cgroup *mem)
1530{
1531
1532 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1533}
1534
1535static void memcg_oom_recover(struct mem_cgroup *mem)
1536{
1537 if (mem && atomic_read(&mem->oom_lock))
1538 memcg_wakeup_oom(mem);
1539}
1540
1541
1542
1543
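/*
 * Handle an OOM situation inside a memcg.  The first caller to take the
 * hierarchy-wide oom_lock invokes mem_cgroup_out_of_memory() (unless
 * oom_kill_disable is set); everyone else sleeps on memcg_oom_waitq until
 * the lock holder finishes and wakes the hierarchy up.  Returns false when
 * the current task has itself been killed and should simply bail out.
 */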
1544bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1545{
1546 struct oom_wait_info owait;
1547 bool locked, need_to_kill;
1548
1549 owait.mem = mem;
1550 owait.wait.flags = 0;
1551 owait.wait.func = memcg_oom_wake_function;
1552 owait.wait.private = current;
1553 INIT_LIST_HEAD(&owait.wait.task_list);
1554 need_to_kill = true;
1555
1556 mutex_lock(&memcg_oom_mutex);
1557 locked = mem_cgroup_oom_lock(mem);
1558
1559
1560
1561
1562
1563 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1564 if (!locked || mem->oom_kill_disable)
1565 need_to_kill = false;
1566 if (locked)
1567 mem_cgroup_oom_notify(mem);
1568 mutex_unlock(&memcg_oom_mutex);
1569
1570 if (need_to_kill) {
1571 finish_wait(&memcg_oom_waitq, &owait.wait);
1572 mem_cgroup_out_of_memory(mem, mask);
1573 } else {
1574 schedule();
1575 finish_wait(&memcg_oom_waitq, &owait.wait);
1576 }
1577 mutex_lock(&memcg_oom_mutex);
1578 mem_cgroup_oom_unlock(mem);
1579 memcg_wakeup_oom(mem);
1580 mutex_unlock(&memcg_oom_mutex);
1581
1582 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1583 return false;
1584
1585 schedule_timeout(1);
1586 return true;
1587}
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613void mem_cgroup_update_page_stat(struct page *page,
1614 enum mem_cgroup_page_stat_item idx, int val)
1615{
1616 struct mem_cgroup *mem;
1617 struct page_cgroup *pc = lookup_page_cgroup(page);
1618 bool need_unlock = false;
1619 unsigned long uninitialized_var(flags);
1620
1621 if (unlikely(!pc))
1622 return;
1623
1624 rcu_read_lock();
1625 mem = pc->mem_cgroup;
1626 if (unlikely(!mem || !PageCgroupUsed(pc)))
1627 goto out;
1628
1629 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1630
1631 move_lock_page_cgroup(pc, &flags);
1632 need_unlock = true;
1633 mem = pc->mem_cgroup;
1634 if (!mem || !PageCgroupUsed(pc))
1635 goto out;
1636 }
1637
1638 switch (idx) {
1639 case MEMCG_NR_FILE_MAPPED:
1640 if (val > 0)
1641 SetPageCgroupFileMapped(pc);
1642 else if (!page_mapped(page))
1643 ClearPageCgroupFileMapped(pc);
1644 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1645 break;
1646 default:
1647 BUG();
1648 }
1649
1650 this_cpu_add(mem->stat->count[idx], val);
1651
1652out:
1653 if (unlikely(need_unlock))
1654 move_unlock_page_cgroup(pc, &flags);
1655 rcu_read_unlock();
1656 return;
1657}
1658EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1659
1660
1661
1662
1663
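/*
 * Per-cpu charge cache ("stock"): __mem_cgroup_try_charge() charges the
 * res_counter in CHARGE_SIZE batches and parks the surplus here, so
 * subsequent single-page charges on this CPU can be served without
 * touching the shared counters.
 */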
1664#define CHARGE_SIZE (32 * PAGE_SIZE)
1665struct memcg_stock_pcp {
1666 struct mem_cgroup *cached;
1667 int charge;
1668 struct work_struct work;
1669};
1670static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1671static atomic_t memcg_drain_count;
1672
1673
1674
1675
1676
1677
1678
1679static bool consume_stock(struct mem_cgroup *mem)
1680{
1681 struct memcg_stock_pcp *stock;
1682 bool ret = true;
1683
1684 stock = &get_cpu_var(memcg_stock);
1685 if (mem == stock->cached && stock->charge)
1686 stock->charge -= PAGE_SIZE;
1687 else
1688 ret = false;
1689 put_cpu_var(memcg_stock);
1690 return ret;
1691}
1692
1693
1694
1695
1696static void drain_stock(struct memcg_stock_pcp *stock)
1697{
1698 struct mem_cgroup *old = stock->cached;
1699
1700 if (stock->charge) {
1701 res_counter_uncharge(&old->res, stock->charge);
1702 if (do_swap_account)
1703 res_counter_uncharge(&old->memsw, stock->charge);
1704 }
1705 stock->cached = NULL;
1706 stock->charge = 0;
1707}
1708
1709
1710
1711
1712
1713static void drain_local_stock(struct work_struct *dummy)
1714{
1715 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1716 drain_stock(stock);
1717}
1718
1719
1720
1721
1722
1723static void refill_stock(struct mem_cgroup *mem, int val)
1724{
1725 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1726
1727 if (stock->cached != mem) {
1728 drain_stock(stock);
1729 stock->cached = mem;
1730 }
1731 stock->charge += val;
1732 put_cpu_var(memcg_stock);
1733}
1734
1735
1736
1737
1738
1739
1740
1741static void drain_all_stock_async(void)
1742{
1743 int cpu;
1744
1745
1746
1747
1748
1749
1750 if (atomic_read(&memcg_drain_count))
1751 return;
1752
1753 atomic_inc(&memcg_drain_count);
1754 get_online_cpus();
1755 for_each_online_cpu(cpu) {
1756 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1757 schedule_work_on(cpu, &stock->work);
1758 }
1759 put_online_cpus();
1760 atomic_dec(&memcg_drain_count);
1761
1762}
1763
1764
1765static void drain_all_stock_sync(void)
1766{
1767
1768 atomic_inc(&memcg_drain_count);
1769 schedule_on_each_cpu(drain_local_stock);
1770 atomic_dec(&memcg_drain_count);
1771}
1772
1773
1774
1775
1776
1777static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1778{
1779 int i;
1780
1781 spin_lock(&mem->pcp_counter_lock);
1782 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1783 s64 x = per_cpu(mem->stat->count[i], cpu);
1784
1785 per_cpu(mem->stat->count[i], cpu) = 0;
1786 mem->nocpu_base.count[i] += x;
1787 }
1788
1789 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1790 spin_unlock(&mem->pcp_counter_lock);
1791}
1792
1793static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1794{
1795 int idx = MEM_CGROUP_ON_MOVE;
1796
1797 spin_lock(&mem->pcp_counter_lock);
1798 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1799 spin_unlock(&mem->pcp_counter_lock);
1800}
1801
1802static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1803 unsigned long action,
1804 void *hcpu)
1805{
1806 int cpu = (unsigned long)hcpu;
1807 struct memcg_stock_pcp *stock;
1808 struct mem_cgroup *iter;
1809
	if (action == CPU_ONLINE) {
		for_each_mem_cgroup_all(iter)
			synchronize_mem_cgroup_on_move(iter, cpu);
		return NOTIFY_OK;
	}

	/* Only drain the per-cpu stock and counters when a CPU goes away. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;
1818
1819 for_each_mem_cgroup_all(iter)
1820 mem_cgroup_drain_pcp_counter(iter, cpu);
1821
1822 stock = &per_cpu(memcg_stock, cpu);
1823 drain_stock(stock);
1824 return NOTIFY_OK;
1825}
1826
1827
1828
1829enum {
1830 CHARGE_OK,
1831 CHARGE_RETRY,
1832 CHARGE_NOMEM,
1833 CHARGE_WOULDBLOCK,
1834 CHARGE_OOM_DIE,
1835};
1836
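/*
 * Try to charge csize bytes to mem (and to memsw when swap accounting is
 * enabled).  On failure, reclaim runs against the hierarchy of the counter
 * that hit its limit, and one of the CHARGE_* codes above tells the caller
 * whether to retry (possibly falling back to a single page), give up, or
 * invoke the memcg OOM handler.
 */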
1837static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1838 int csize, bool oom_check)
1839{
1840 struct mem_cgroup *mem_over_limit;
1841 struct res_counter *fail_res;
1842 unsigned long flags = 0;
1843 int ret;
1844
1845 ret = res_counter_charge(&mem->res, csize, &fail_res);
1846
1847 if (likely(!ret)) {
1848 if (!do_swap_account)
1849 return CHARGE_OK;
1850 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1851 if (likely(!ret))
1852 return CHARGE_OK;
1853
1854 res_counter_uncharge(&mem->res, csize);
1855 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1856 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1857 } else
1858 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1859
1860
1861
1862
1863
1864
1865
1866
1867 if (csize == CHARGE_SIZE)
1868 return CHARGE_RETRY;
1869
1870 if (!(gfp_mask & __GFP_WAIT))
1871 return CHARGE_WOULDBLOCK;
1872
1873 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1874 gfp_mask, flags);
1875 if (mem_cgroup_check_margin(mem_over_limit, csize))
1876 return CHARGE_RETRY;
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886 if (csize == PAGE_SIZE && ret)
1887 return CHARGE_RETRY;
1888
1889
1890
1891
1892
1893 if (mem_cgroup_wait_acct_move(mem_over_limit))
1894 return CHARGE_RETRY;
1895
1896
1897 if (!oom_check)
1898 return CHARGE_NOMEM;
1899
1900 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1901 return CHARGE_OOM_DIE;
1902
1903 return CHARGE_RETRY;
1904}
1905
1906
1907
1908
1909
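/*
 * Charge page_size bytes to *memcg, or to the mm owner's memcg when
 * *memcg is NULL.  Regular-page charges are served from the per-cpu stock
 * when possible; otherwise the charge is retried with reclaim and, as a
 * last resort, the memcg OOM handler.  Tasks that have already been
 * OOM-killed bypass accounting entirely ("bypass" label).
 */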
1910static int __mem_cgroup_try_charge(struct mm_struct *mm,
1911 gfp_t gfp_mask,
1912 struct mem_cgroup **memcg, bool oom,
1913 int page_size)
1914{
1915 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1916 struct mem_cgroup *mem = NULL;
1917 int ret;
1918 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1919
1920
1921
1922
1923
1924
1925 if (unlikely(test_thread_flag(TIF_MEMDIE)
1926 || fatal_signal_pending(current)))
1927 goto bypass;
1928
1929
1930
1931
1932
1933
1934
1935 if (!*memcg && !mm)
1936 goto bypass;
1937again:
1938 if (*memcg) {
1939 mem = *memcg;
1940 VM_BUG_ON(css_is_removed(&mem->css));
1941 if (mem_cgroup_is_root(mem))
1942 goto done;
1943 if (page_size == PAGE_SIZE && consume_stock(mem))
1944 goto done;
1945 css_get(&mem->css);
1946 } else {
1947 struct task_struct *p;
1948
1949 rcu_read_lock();
1950 p = rcu_dereference(mm->owner);
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961 mem = mem_cgroup_from_task(p);
1962 if (!mem || mem_cgroup_is_root(mem)) {
1963 rcu_read_unlock();
1964 goto done;
1965 }
1966 if (page_size == PAGE_SIZE && consume_stock(mem)) {
1967
1968
1969
1970
1971
1972
1973
1974
1975 rcu_read_unlock();
1976 goto done;
1977 }
1978
1979 if (!css_tryget(&mem->css)) {
1980 rcu_read_unlock();
1981 goto again;
1982 }
1983 rcu_read_unlock();
1984 }
1985
1986 do {
1987 bool oom_check;
1988
1989
1990 if (fatal_signal_pending(current)) {
1991 css_put(&mem->css);
1992 goto bypass;
1993 }
1994
1995 oom_check = false;
1996 if (oom && !nr_oom_retries) {
1997 oom_check = true;
1998 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1999 }
2000
2001 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
2002
2003 switch (ret) {
2004 case CHARGE_OK:
2005 break;
2006 case CHARGE_RETRY:
2007 csize = page_size;
2008 css_put(&mem->css);
2009 mem = NULL;
2010 goto again;
2011 case CHARGE_WOULDBLOCK:
2012 css_put(&mem->css);
2013 goto nomem;
2014 case CHARGE_NOMEM:
2015 if (!oom) {
2016 css_put(&mem->css);
2017 goto nomem;
2018 }
2019
2020 nr_oom_retries--;
2021 break;
2022 case CHARGE_OOM_DIE:
2023 css_put(&mem->css);
2024 goto bypass;
2025 }
2026 } while (ret != CHARGE_OK);
2027
2028 if (csize > page_size)
2029 refill_stock(mem, csize - page_size);
2030 css_put(&mem->css);
2031done:
2032 *memcg = mem;
2033 return 0;
2034nomem:
2035 *memcg = NULL;
2036 return -ENOMEM;
2037bypass:
2038 *memcg = NULL;
2039 return 0;
2040}
2041
2042
2043
2044
2045
2046
2047static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2048 unsigned long count)
2049{
2050 if (!mem_cgroup_is_root(mem)) {
2051 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
2052 if (do_swap_account)
2053 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
2054 }
2055}
2056
2057static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2058 int page_size)
2059{
2060 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2061}
2062
2063
2064
2065
2066
2067
2068
2069static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2070{
2071 struct cgroup_subsys_state *css;
2072
2073
2074 if (!id)
2075 return NULL;
2076 css = css_lookup(&mem_cgroup_subsys, id);
2077 if (!css)
2078 return NULL;
2079 return container_of(css, struct mem_cgroup, css);
2080}
2081
2082struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2083{
2084 struct mem_cgroup *mem = NULL;
2085 struct page_cgroup *pc;
2086 unsigned short id;
2087 swp_entry_t ent;
2088
2089 VM_BUG_ON(!PageLocked(page));
2090
2091 pc = lookup_page_cgroup(page);
2092 lock_page_cgroup(pc);
2093 if (PageCgroupUsed(pc)) {
2094 mem = pc->mem_cgroup;
2095 if (mem && !css_tryget(&mem->css))
2096 mem = NULL;
2097 } else if (PageSwapCache(page)) {
2098 ent.val = page_private(page);
2099 id = lookup_swap_cgroup(ent);
2100 rcu_read_lock();
2101 mem = mem_cgroup_lookup(id);
2102 if (mem && !css_tryget(&mem->css))
2103 mem = NULL;
2104 rcu_read_unlock();
2105 }
2106 unlock_page_cgroup(pc);
2107 return mem;
2108}
2109
2110static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2111 struct page_cgroup *pc,
2112 enum charge_type ctype,
2113 int page_size)
2114{
2115 int nr_pages = page_size >> PAGE_SHIFT;
2116
2117
2118 if (!mem)
2119 return;
2120
2121 lock_page_cgroup(pc);
2122 if (unlikely(PageCgroupUsed(pc))) {
2123 unlock_page_cgroup(pc);
2124 mem_cgroup_cancel_charge(mem, page_size);
2125 return;
2126 }
2127
2128
2129
2130
2131 pc->mem_cgroup = mem;
2132
2133
2134
2135
2136
2137
2138
2139 smp_wmb();
2140 switch (ctype) {
2141 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2142 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2143 SetPageCgroupCache(pc);
2144 SetPageCgroupUsed(pc);
2145 break;
2146 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2147 ClearPageCgroupCache(pc);
2148 SetPageCgroupUsed(pc);
2149 break;
2150 default:
2151 break;
2152 }
2153
2154 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2155 unlock_page_cgroup(pc);
2156
2157
2158
2159
2160
2161 memcg_check_events(mem, pc->page);
2162}
2163
2164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2165
2166#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2167 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2168
2169
2170
2171
2172void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2173{
2174 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2175 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2176 unsigned long flags;
2177
2178 if (mem_cgroup_disabled())
2179 return;
2180
2181
2182
2183
2184 move_lock_page_cgroup(head_pc, &flags);
2185
2186 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2187 smp_wmb();
2188 if (PageCgroupAcctLRU(head_pc)) {
2189 enum lru_list lru;
2190 struct mem_cgroup_per_zone *mz;
2191
2192
2193
2194
2195
2196
2197 lru = page_lru(head);
2198 mz = page_cgroup_zoneinfo(head_pc);
2199 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2200 }
2201 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2202 move_unlock_page_cgroup(head_pc, &flags);
2203}
2204#endif
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
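/*
 * Move the accounting of a charged page from "from" to "to".  The caller
 * holds the page_cgroup lock and the move lock; file-mapped and charge
 * statistics are transferred, and when "uncharge" is true the charge is
 * released from "from" instead of being left for the caller.
 */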
2223static void __mem_cgroup_move_account(struct page_cgroup *pc,
2224 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
2225 int charge_size)
2226{
2227 int nr_pages = charge_size >> PAGE_SHIFT;
2228
2229 VM_BUG_ON(from == to);
2230 VM_BUG_ON(PageLRU(pc->page));
2231 VM_BUG_ON(!page_is_cgroup_locked(pc));
2232 VM_BUG_ON(!PageCgroupUsed(pc));
2233 VM_BUG_ON(pc->mem_cgroup != from);
2234
2235 if (PageCgroupFileMapped(pc)) {
2236
2237 preempt_disable();
2238 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2239 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2240 preempt_enable();
2241 }
2242 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2243 if (uncharge)
2244
2245 mem_cgroup_cancel_charge(from, charge_size);
2246
2247
2248 pc->mem_cgroup = to;
2249 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2250
2251
2252
2253
2254
2255
2256
2257}
2258
2259
2260
2261
2262
2263static int mem_cgroup_move_account(struct page_cgroup *pc,
2264 struct mem_cgroup *from, struct mem_cgroup *to,
2265 bool uncharge, int charge_size)
2266{
2267 int ret = -EINVAL;
2268 unsigned long flags;
2269
2270
2271
2272
2273
2274
2275 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
2276 return -EBUSY;
2277
2278 lock_page_cgroup(pc);
2279 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2280 move_lock_page_cgroup(pc, &flags);
2281 __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
2282 move_unlock_page_cgroup(pc, &flags);
2283 ret = 0;
2284 }
2285 unlock_page_cgroup(pc);
2286
2287
2288
2289 memcg_check_events(to, pc->page);
2290 memcg_check_events(from, pc->page);
2291 return ret;
2292}
2293
2294
2295
2296
2297
2298static int mem_cgroup_move_parent(struct page_cgroup *pc,
2299 struct mem_cgroup *child,
2300 gfp_t gfp_mask)
2301{
2302 struct page *page = pc->page;
2303 struct cgroup *cg = child->css.cgroup;
2304 struct cgroup *pcg = cg->parent;
2305 struct mem_cgroup *parent;
2306 int page_size = PAGE_SIZE;
2307 unsigned long flags;
2308 int ret;
2309
2310
2311 if (!pcg)
2312 return -EINVAL;
2313
2314 ret = -EBUSY;
2315 if (!get_page_unless_zero(page))
2316 goto out;
2317 if (isolate_lru_page(page))
2318 goto put;
2319
2320 if (PageTransHuge(page))
2321 page_size = HPAGE_SIZE;
2322
2323 parent = mem_cgroup_from_cont(pcg);
2324 ret = __mem_cgroup_try_charge(NULL, gfp_mask,
2325 &parent, false, page_size);
2326 if (ret || !parent)
2327 goto put_back;
2328
2329 if (page_size > PAGE_SIZE)
2330 flags = compound_lock_irqsave(page);
2331
2332 ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
2333 if (ret)
2334 mem_cgroup_cancel_charge(parent, page_size);
2335
2336 if (page_size > PAGE_SIZE)
2337 compound_unlock_irqrestore(page, flags);
2338put_back:
2339 putback_lru_page(page);
2340put:
2341 put_page(page);
2342out:
2343 return ret;
2344}
2345
2346
2347
2348
2349
2350
2351
2352static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2353 gfp_t gfp_mask, enum charge_type ctype)
2354{
2355 struct mem_cgroup *mem = NULL;
2356 int page_size = PAGE_SIZE;
2357 struct page_cgroup *pc;
2358 bool oom = true;
2359 int ret;
2360
2361 if (PageTransHuge(page)) {
2362 page_size <<= compound_order(page);
2363 VM_BUG_ON(!PageTransHuge(page));
2364
2365
2366
2367
2368 oom = false;
2369 }
2370
2371 pc = lookup_page_cgroup(page);
2372
2373 if (unlikely(!pc))
2374 return 0;
2375 prefetchw(pc);
2376
2377 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
2378 if (ret || !mem)
2379 return ret;
2380
2381 __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
2382 return 0;
2383}
2384
2385int mem_cgroup_newpage_charge(struct page *page,
2386 struct mm_struct *mm, gfp_t gfp_mask)
2387{
2388 if (mem_cgroup_disabled())
2389 return 0;
2390
2391
2392
2393
2394
2395
2396
2397 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2398 return 0;
2399 if (unlikely(!mm))
2400 mm = &init_mm;
2401 return mem_cgroup_charge_common(page, mm, gfp_mask,
2402 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2403}
2404
2405static void
2406__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2407 enum charge_type ctype);
2408
2409int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2410 gfp_t gfp_mask)
2411{
2412 int ret;
2413
2414 if (mem_cgroup_disabled())
2415 return 0;
2416 if (PageCompound(page))
2417 return 0;
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429 if (!(gfp_mask & __GFP_WAIT)) {
2430 struct page_cgroup *pc;
2431
2432 pc = lookup_page_cgroup(page);
2433 if (!pc)
2434 return 0;
2435 lock_page_cgroup(pc);
2436 if (PageCgroupUsed(pc)) {
2437 unlock_page_cgroup(pc);
2438 return 0;
2439 }
2440 unlock_page_cgroup(pc);
2441 }
2442
2443 if (unlikely(!mm))
2444 mm = &init_mm;
2445
2446 if (page_is_file_cache(page))
2447 return mem_cgroup_charge_common(page, mm, gfp_mask,
2448 MEM_CGROUP_CHARGE_TYPE_CACHE);
2449
2450
2451 if (PageSwapCache(page)) {
2452 struct mem_cgroup *mem = NULL;
2453
2454 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2455 if (!ret)
2456 __mem_cgroup_commit_charge_swapin(page, mem,
2457 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2458 } else
2459 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2460 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2461
2462 return ret;
2463}
2464
2465
2466
2467
2468
2469
2470
2471int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2472 struct page *page,
2473 gfp_t mask, struct mem_cgroup **ptr)
2474{
2475 struct mem_cgroup *mem;
2476 int ret;
2477
2478 if (mem_cgroup_disabled())
2479 return 0;
2480
2481 if (!do_swap_account)
2482 goto charge_cur_mm;
2483
2484
2485
2486
2487
2488
2489 if (!PageSwapCache(page))
2490 goto charge_cur_mm;
2491 mem = try_get_mem_cgroup_from_page(page);
2492 if (!mem)
2493 goto charge_cur_mm;
2494 *ptr = mem;
2495 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
2496 css_put(&mem->css);
2497 return ret;
2498charge_cur_mm:
2499 if (unlikely(!mm))
2500 mm = &init_mm;
2501 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
2502}
2503
2504static void
2505__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2506 enum charge_type ctype)
2507{
2508 struct page_cgroup *pc;
2509
2510 if (mem_cgroup_disabled())
2511 return;
2512 if (!ptr)
2513 return;
2514 cgroup_exclude_rmdir(&ptr->css);
2515 pc = lookup_page_cgroup(page);
2516 mem_cgroup_lru_del_before_commit_swapcache(page);
2517 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2518 mem_cgroup_lru_add_after_commit_swapcache(page);
2519
2520
2521
2522
2523
2524
2525
2526 if (do_swap_account && PageSwapCache(page)) {
2527 swp_entry_t ent = {.val = page_private(page)};
2528 unsigned short id;
2529 struct mem_cgroup *memcg;
2530
2531 id = swap_cgroup_record(ent, 0);
2532 rcu_read_lock();
2533 memcg = mem_cgroup_lookup(id);
2534 if (memcg) {
2535
2536
2537
2538
2539 if (!mem_cgroup_is_root(memcg))
2540 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2541 mem_cgroup_swap_statistics(memcg, false);
2542 mem_cgroup_put(memcg);
2543 }
2544 rcu_read_unlock();
2545 }
2546
2547
2548
2549
2550
2551 cgroup_release_and_wakeup_rmdir(&ptr->css);
2552}
2553
2554void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2555{
2556 __mem_cgroup_commit_charge_swapin(page, ptr,
2557 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2558}
2559
2560void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2561{
2562 if (mem_cgroup_disabled())
2563 return;
2564 if (!mem)
2565 return;
2566 mem_cgroup_cancel_charge(mem, PAGE_SIZE);
2567}
2568
2569static void
2570__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2571 int page_size)
2572{
2573 struct memcg_batch_info *batch = NULL;
2574 bool uncharge_memsw = true;
2575
2576 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2577 uncharge_memsw = false;
2578
	batch = &current->memcg_batch;
2580
2581
2582
2583
2584
2585 if (!batch->memcg)
2586 batch->memcg = mem;
2587
2588
2589
2590
2591
2592
2593
2594
2595 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2596 goto direct_uncharge;
2597
2598 if (page_size != PAGE_SIZE)
2599 goto direct_uncharge;
2600
2601
2602
2603
2604
2605
2606 if (batch->memcg != mem)
2607 goto direct_uncharge;
2608
2609 batch->bytes += PAGE_SIZE;
2610 if (uncharge_memsw)
2611 batch->memsw_bytes += PAGE_SIZE;
2612 return;
2613direct_uncharge:
2614 res_counter_uncharge(&mem->res, page_size);
2615 if (uncharge_memsw)
2616 res_counter_uncharge(&mem->memsw, page_size);
2617 if (unlikely(batch->memcg != mem))
2618 memcg_oom_recover(mem);
2619 return;
2620}
2621
2622
2623
2624
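/*
 * Common uncharge path.  Clears PCG_USED and, depending on the charge
 * type, either uncharges the res_counter(s) right away or adds the pages
 * to the per-task batch set up by mem_cgroup_uncharge_start().  Returns
 * the memcg the page was charged to, or NULL when nothing was uncharged.
 */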
2625static struct mem_cgroup *
2626__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2627{
2628 int count;
2629 struct page_cgroup *pc;
2630 struct mem_cgroup *mem = NULL;
2631 int page_size = PAGE_SIZE;
2632
2633 if (mem_cgroup_disabled())
2634 return NULL;
2635
2636 if (PageSwapCache(page))
2637 return NULL;
2638
2639 if (PageTransHuge(page)) {
2640 page_size <<= compound_order(page);
2641 VM_BUG_ON(!PageTransHuge(page));
2642 }
2643
2644 count = page_size >> PAGE_SHIFT;
2645
2646
2647
2648 pc = lookup_page_cgroup(page);
2649 if (unlikely(!pc || !PageCgroupUsed(pc)))
2650 return NULL;
2651
2652 lock_page_cgroup(pc);
2653
2654 mem = pc->mem_cgroup;
2655
2656 if (!PageCgroupUsed(pc))
2657 goto unlock_out;
2658
2659 switch (ctype) {
2660 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2661 case MEM_CGROUP_CHARGE_TYPE_DROP:
2662
2663 if (page_mapped(page) || PageCgroupMigration(pc))
2664 goto unlock_out;
2665 break;
2666 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2667 if (!PageAnon(page)) {
2668 if (page->mapping && !page_is_file_cache(page))
2669 goto unlock_out;
2670 } else if (page_mapped(page))
2671 goto unlock_out;
2672 break;
2673 default:
2674 break;
2675 }
2676
2677 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
2678
2679 ClearPageCgroupUsed(pc);
2680
2681
2682
2683
2684
2685
2686
2687 unlock_page_cgroup(pc);
2688
2689
2690
2691
2692 memcg_check_events(mem, page);
2693 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2694 mem_cgroup_swap_statistics(mem, true);
2695 mem_cgroup_get(mem);
2696 }
2697 if (!mem_cgroup_is_root(mem))
2698 __do_uncharge(mem, ctype, page_size);
2699
2700 return mem;
2701
2702unlock_out:
2703 unlock_page_cgroup(pc);
2704 return NULL;
2705}
2706
2707void mem_cgroup_uncharge_page(struct page *page)
2708{
2709
2710 if (page_mapped(page))
2711 return;
2712 if (page->mapping && !PageAnon(page))
2713 return;
2714 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2715}
2716
2717void mem_cgroup_uncharge_cache_page(struct page *page)
2718{
2719 VM_BUG_ON(page_mapped(page));
2720 VM_BUG_ON(page->mapping);
2721 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2722}
2723
2724
2725
2726
2727
2728
2729
2730
2731
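/*
 * Batched uncharge API.  Callers that uncharge many pages in a row
 * (e.g. while unmapping or truncating) bracket the loop with
 * mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() so that the
 * res_counters are updated once per batch instead of once per page.
 * Batching nests via the do_batch counter.
 */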
2732void mem_cgroup_uncharge_start(void)
2733{
2734 current->memcg_batch.do_batch++;
2735
2736 if (current->memcg_batch.do_batch == 1) {
2737 current->memcg_batch.memcg = NULL;
2738 current->memcg_batch.bytes = 0;
2739 current->memcg_batch.memsw_bytes = 0;
2740 }
2741}
2742
2743void mem_cgroup_uncharge_end(void)
2744{
2745	struct memcg_batch_info *batch = &current->memcg_batch;
2746
2747 if (!batch->do_batch)
2748 return;
2749
2750 batch->do_batch--;
2751 if (batch->do_batch)
2752 return;
2753
2754 if (!batch->memcg)
2755 return;
2756
2757
2758
2759
2760 if (batch->bytes)
2761 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2762 if (batch->memsw_bytes)
2763 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2764 memcg_oom_recover(batch->memcg);
2765
2766 batch->memcg = NULL;
2767}
2768
2769#ifdef CONFIG_SWAP
2770
2771
2772
2773
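/*
 * Called after a page has been removed from the swap cache.  If the
 * page really went out to swap (@swapout), the memcg's css id is
 * recorded in the swap_cgroup map and the reference taken in
 * __mem_cgroup_uncharge_common() is kept, so that
 * mem_cgroup_uncharge_swap() can find and release it later.
 */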
2774void
2775mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2776{
2777 struct mem_cgroup *memcg;
2778 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2779
2780 if (!swapout)
2781 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2782
2783 memcg = __mem_cgroup_uncharge_common(page, ctype);
2784
2785
2786
2787
2788
2789 if (do_swap_account && swapout && memcg)
2790 swap_cgroup_record(ent, css_id(&memcg->css));
2791}
2792#endif
2793
2794#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2795
2796
2797
2798
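/*
 * Called when a swap entry is finally freed.  Uncharges memsw for the
 * memcg recorded at swap-out time and drops the reference that was
 * taken then.
 */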
2799void mem_cgroup_uncharge_swap(swp_entry_t ent)
2800{
2801 struct mem_cgroup *memcg;
2802 unsigned short id;
2803
2804 if (!do_swap_account)
2805 return;
2806
2807 id = swap_cgroup_record(ent, 0);
2808 rcu_read_lock();
2809 memcg = mem_cgroup_lookup(id);
2810 if (memcg) {
2811
2812
2813
2814
2815 if (!mem_cgroup_is_root(memcg))
2816 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2817 mem_cgroup_swap_statistics(memcg, false);
2818 mem_cgroup_put(memcg);
2819 }
2820 rcu_read_unlock();
2821}
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
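/*
 * Move the swap_cgroup record for @entry from @from to @to.  Succeeds
 * only if the record still carries @from's css id.  The caller must
 * already hold a charge on @to; with @need_fixup the memsw charge is
 * released from @from and the now-duplicated res charge on @to is
 * dropped here.  Returns 0 on success, -EINVAL otherwise.
 */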
2838static int mem_cgroup_move_swap_account(swp_entry_t entry,
2839 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2840{
2841 unsigned short old_id, new_id;
2842
2843 old_id = css_id(&from->css);
2844 new_id = css_id(&to->css);
2845
2846 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2847 mem_cgroup_swap_statistics(from, false);
2848 mem_cgroup_swap_statistics(to, true);
2849
2850
2851
2852
2853
2854
2855
2856
2857 mem_cgroup_get(to);
2858 if (need_fixup) {
2859 if (!mem_cgroup_is_root(from))
2860 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2861 mem_cgroup_put(from);
2862
2863
2864
2865
2866 if (!mem_cgroup_is_root(to))
2867 res_counter_uncharge(&to->res, PAGE_SIZE);
2868 }
2869 return 0;
2870 }
2871 return -EINVAL;
2872}
2873#else
2874static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2875 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2876{
2877 return -EINVAL;
2878}
2879#endif
2880
2881
2882
2883
2884
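/*
 * Prepare for page migration: charge @newpage to the memcg that owns
 * @page and commit it up front, so accounting stays correct whether or
 * not the migration succeeds.  The leftover charge on whichever page
 * ends up unused is cleaned up in mem_cgroup_end_migration().
 */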
2885int mem_cgroup_prepare_migration(struct page *page,
2886 struct page *newpage, struct mem_cgroup **ptr)
2887{
2888 struct page_cgroup *pc;
2889 struct mem_cgroup *mem = NULL;
2890 enum charge_type ctype;
2891 int ret = 0;
2892
2893 VM_BUG_ON(PageTransHuge(page));
2894 if (mem_cgroup_disabled())
2895 return 0;
2896
2897 pc = lookup_page_cgroup(page);
2898 lock_page_cgroup(pc);
2899 if (PageCgroupUsed(pc)) {
2900 mem = pc->mem_cgroup;
2901 css_get(&mem->css);
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
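		/*
		 * An anonymous page is only charged while it is mapped.
		 * Migration unmaps it, so mark the page_cgroup as being
		 * under migration to keep __mem_cgroup_uncharge_common()
		 * from dropping the charge before end_migration() runs.
		 */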
2931 if (PageAnon(page))
2932 SetPageCgroupMigration(pc);
2933 }
2934 unlock_page_cgroup(pc);
2935
2936
2937
2938
2939 if (!mem)
2940 return 0;
2941
2942 *ptr = mem;
2943 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
2944 css_put(&mem->css);
2945 if (ret || *ptr == NULL) {
2946 if (PageAnon(page)) {
2947 lock_page_cgroup(pc);
2948 ClearPageCgroupMigration(pc);
2949 unlock_page_cgroup(pc);
2950
2951
2952
2953 mem_cgroup_uncharge_page(page);
2954 }
2955 return -ENOMEM;
2956 }
2957
2958
2959
2960
2961
2962
2963 pc = lookup_page_cgroup(newpage);
2964 if (PageAnon(page))
2965 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2966 else if (page_is_file_cache(page))
2967 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2968 else
2969 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2970 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
2971 return ret;
2972}
2973
2974
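/* Remove the redundant charge left on whichever page ends up unused. */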
2975void mem_cgroup_end_migration(struct mem_cgroup *mem,
2976 struct page *oldpage, struct page *newpage, bool migration_ok)
2977{
2978 struct page *used, *unused;
2979 struct page_cgroup *pc;
2980
2981 if (!mem)
2982 return;
2983
2984 cgroup_exclude_rmdir(&mem->css);
2985 if (!migration_ok) {
2986 used = oldpage;
2987 unused = newpage;
2988 } else {
2989 used = newpage;
2990 unused = oldpage;
2991 }
2992
2993
2994
2995
2996
2997 pc = lookup_page_cgroup(oldpage);
2998 lock_page_cgroup(pc);
2999 ClearPageCgroupMigration(pc);
3000 unlock_page_cgroup(pc);
3001
3002 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012 if (PageAnon(used))
3013 mem_cgroup_uncharge_page(used);
3014
3015
3016
3017
3018
3019
3020 cgroup_release_and_wakeup_rmdir(&mem->css);
3021}
3022
3023
3024
3025
3026
3027
3028
3029
3030
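/*
 * Called by shmem when a swap-in charge fails.  Going through
 * mem_cgroup_try_charge_swapin() reclaims from the memcg hierarchy that
 * is actually over its limit; the charge is cancelled immediately, so
 * only the reclaim side effect remains.
 */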
3031int mem_cgroup_shmem_charge_fallback(struct page *page,
3032 struct mm_struct *mm,
3033 gfp_t gfp_mask)
3034{
3035 struct mem_cgroup *mem = NULL;
3036 int ret;
3037
3038 if (mem_cgroup_disabled())
3039 return 0;
3040
3041 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3042 if (!ret)
3043 mem_cgroup_cancel_charge_swapin(mem);
3044
3045 return ret;
3046}
3047
3048static DEFINE_MUTEX(set_limit_mutex);
3049
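/*
 * Update memory.limit_in_bytes.  The new limit must not exceed the
 * memsw limit; when shrinking, reclaim is retried until usage fits or
 * the retry budget (scaled by the number of children) is exhausted.
 */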
3050static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3051 unsigned long long val)
3052{
3053 int retry_count;
3054 u64 memswlimit, memlimit;
3055 int ret = 0;
3056 int children = mem_cgroup_count_children(memcg);
3057 u64 curusage, oldusage;
3058 int enlarge;
3059
3060
3061
3062
3063
3064
3065 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3066
3067 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3068
3069 enlarge = 0;
3070 while (retry_count) {
3071 if (signal_pending(current)) {
3072 ret = -EINTR;
3073 break;
3074 }
3075
3076
3077
3078
3079
3080 mutex_lock(&set_limit_mutex);
3081 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3082 if (memswlimit < val) {
3083 ret = -EINVAL;
3084 mutex_unlock(&set_limit_mutex);
3085 break;
3086 }
3087
3088 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3089 if (memlimit < val)
3090 enlarge = 1;
3091
3092 ret = res_counter_set_limit(&memcg->res, val);
3093 if (!ret) {
3094 if (memswlimit == val)
3095 memcg->memsw_is_minimum = true;
3096 else
3097 memcg->memsw_is_minimum = false;
3098 }
3099 mutex_unlock(&set_limit_mutex);
3100
3101 if (!ret)
3102 break;
3103
3104 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3105 MEM_CGROUP_RECLAIM_SHRINK);
3106 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3107
3108 if (curusage >= oldusage)
3109 retry_count--;
3110 else
3111 oldusage = curusage;
3112 }
3113 if (!ret && enlarge)
3114 memcg_oom_recover(memcg);
3115
3116 return ret;
3117}
3118
3119static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3120 unsigned long long val)
3121{
3122 int retry_count;
3123 u64 memlimit, memswlimit, oldusage, curusage;
3124 int children = mem_cgroup_count_children(memcg);
3125 int ret = -EBUSY;
3126 int enlarge = 0;
3127
3128
3129 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3130 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3131 while (retry_count) {
3132 if (signal_pending(current)) {
3133 ret = -EINTR;
3134 break;
3135 }
3136
3137
3138
3139
3140
3141 mutex_lock(&set_limit_mutex);
3142 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3143 if (memlimit > val) {
3144 ret = -EINVAL;
3145 mutex_unlock(&set_limit_mutex);
3146 break;
3147 }
3148 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3149 if (memswlimit < val)
3150 enlarge = 1;
3151 ret = res_counter_set_limit(&memcg->memsw, val);
3152 if (!ret) {
3153 if (memlimit == val)
3154 memcg->memsw_is_minimum = true;
3155 else
3156 memcg->memsw_is_minimum = false;
3157 }
3158 mutex_unlock(&set_limit_mutex);
3159
3160 if (!ret)
3161 break;
3162
3163 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3164 MEM_CGROUP_RECLAIM_NOSWAP |
3165 MEM_CGROUP_RECLAIM_SHRINK);
3166 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3167
3168 if (curusage >= oldusage)
3169 retry_count--;
3170 else
3171 oldusage = curusage;
3172 }
3173 if (!ret && enlarge)
3174 memcg_oom_recover(memcg);
3175 return ret;
3176}
3177
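/*
 * Soft limit reclaim for order-0 allocations: repeatedly pick the memcg
 * that exceeds its soft limit by the largest amount from this zone's
 * RB-tree, reclaim from it, and re-insert it keyed by its new excess.
 */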
3178unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3179 gfp_t gfp_mask)
3180{
3181 unsigned long nr_reclaimed = 0;
3182 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3183 unsigned long reclaimed;
3184 int loop = 0;
3185 struct mem_cgroup_tree_per_zone *mctz;
3186 unsigned long long excess;
3187
3188 if (order > 0)
3189 return 0;
3190
3191 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3192
3193
3194
3195
3196
3197 do {
3198 if (next_mz)
3199 mz = next_mz;
3200 else
3201 mz = mem_cgroup_largest_soft_limit_node(mctz);
3202 if (!mz)
3203 break;
3204
3205 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3206 gfp_mask,
3207 MEM_CGROUP_RECLAIM_SOFT);
3208 nr_reclaimed += reclaimed;
3209 spin_lock(&mctz->lock);
3210
3211
3212
3213
3214
3215 next_mz = NULL;
3216 if (!reclaimed) {
3217 do {
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229 next_mz =
3230 __mem_cgroup_largest_soft_limit_node(mctz);
3231 if (next_mz == mz) {
3232 css_put(&next_mz->mem->css);
3233 next_mz = NULL;
3234 } else
3235 break;
3236 } while (1);
3237 }
3238 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3239 excess = res_counter_soft_limit_excess(&mz->mem->res);
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3250 spin_unlock(&mctz->lock);
3251 css_put(&mz->mem->css);
3252 loop++;
3253
3254
3255
3256
3257
3258 if (!nr_reclaimed &&
3259 (next_mz == NULL ||
3260 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3261 break;
3262 } while (!nr_reclaimed);
3263 if (next_mz)
3264 css_put(&next_mz->mem->css);
3265 return nr_reclaimed;
3266}
3267
3268
3269
3270
3271
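/*
 * Drain one LRU list of a memcg/zone by moving each page's charge to
 * the parent.  Busy pages are retried; returns -EBUSY if the list could
 * not be emptied.
 */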
3272static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3273 int node, int zid, enum lru_list lru)
3274{
3275 struct zone *zone;
3276 struct mem_cgroup_per_zone *mz;
3277 struct page_cgroup *pc, *busy;
3278 unsigned long flags, loop;
3279 struct list_head *list;
3280 int ret = 0;
3281
3282 zone = &NODE_DATA(node)->node_zones[zid];
3283 mz = mem_cgroup_zoneinfo(mem, node, zid);
3284 list = &mz->lists[lru];
3285
3286 loop = MEM_CGROUP_ZSTAT(mz, lru);
3287
3288 loop += 256;
3289 busy = NULL;
3290 while (loop--) {
3291 ret = 0;
3292 spin_lock_irqsave(&zone->lru_lock, flags);
3293 if (list_empty(list)) {
3294 spin_unlock_irqrestore(&zone->lru_lock, flags);
3295 break;
3296 }
3297 pc = list_entry(list->prev, struct page_cgroup, lru);
3298 if (busy == pc) {
3299 list_move(&pc->lru, list);
3300 busy = NULL;
3301 spin_unlock_irqrestore(&zone->lru_lock, flags);
3302 continue;
3303 }
3304 spin_unlock_irqrestore(&zone->lru_lock, flags);
3305
3306 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
3307 if (ret == -ENOMEM)
3308 break;
3309
3310 if (ret == -EBUSY || ret == -EINVAL) {
3311
3312 busy = pc;
3313 cond_resched();
3314 } else
3315 busy = NULL;
3316 }
3317
3318 if (!ret && !list_empty(list))
3319 return -EBUSY;
3320 return ret;
3321}
3322
3323
3324
3325
3326
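/*
 * Make the memcg empty.  With @free_all (memory.force_empty) the pages
 * are reclaimed outright; otherwise (rmdir path) their charges are
 * moved to the parent memcg.  Fails with -EBUSY while the group still
 * has tasks or children.
 */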
3327static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3328{
3329 int ret;
3330 int node, zid, shrink;
3331 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3332 struct cgroup *cgrp = mem->css.cgroup;
3333
3334 css_get(&mem->css);
3335
3336 shrink = 0;
3337
3338 if (free_all)
3339 goto try_to_free;
3340move_account:
3341 do {
3342 ret = -EBUSY;
3343 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3344 goto out;
3345 ret = -EINTR;
3346 if (signal_pending(current))
3347 goto out;
3348
3349 lru_add_drain_all();
3350 drain_all_stock_sync();
3351 ret = 0;
3352 mem_cgroup_start_move(mem);
3353 for_each_node_state(node, N_HIGH_MEMORY) {
3354 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3355 enum lru_list l;
3356 for_each_lru(l) {
3357 ret = mem_cgroup_force_empty_list(mem,
3358 node, zid, l);
3359 if (ret)
3360 break;
3361 }
3362 }
3363 if (ret)
3364 break;
3365 }
3366 mem_cgroup_end_move(mem);
3367 memcg_oom_recover(mem);
3368
3369 if (ret == -ENOMEM)
3370 goto try_to_free;
3371 cond_resched();
3372
3373 } while (mem->res.usage > 0 || ret);
3374out:
3375 css_put(&mem->css);
3376 return ret;
3377
3378try_to_free:
3379
3380 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3381 ret = -EBUSY;
3382 goto out;
3383 }
3384
3385 lru_add_drain_all();
3386
3387 shrink = 1;
3388 while (nr_retries && mem->res.usage > 0) {
3389 int progress;
3390
3391 if (signal_pending(current)) {
3392 ret = -EINTR;
3393 goto out;
3394 }
3395 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3396 false, get_swappiness(mem));
3397 if (!progress) {
3398 nr_retries--;
3399
3400 congestion_wait(BLK_RW_ASYNC, HZ/10);
3401 }
3402
3403 }
3404 lru_add_drain();
3405
3406 goto move_account;
3407}
3408
3409int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3410{
3411 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3412}
3413
3414
3415static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3416{
3417 return mem_cgroup_from_cont(cont)->use_hierarchy;
3418}
3419
3420static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3421 u64 val)
3422{
3423 int retval = 0;
3424 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3425 struct cgroup *parent = cont->parent;
3426 struct mem_cgroup *parent_mem = NULL;
3427
3428 if (parent)
3429 parent_mem = mem_cgroup_from_cont(parent);
3430
3431 cgroup_lock();
3432
3433
3434
3435
3436
3437
3438
3439
3440 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3441 (val == 1 || val == 0)) {
3442 if (list_empty(&cont->children))
3443 mem->use_hierarchy = val;
3444 else
3445 retval = -EBUSY;
3446 } else
3447 retval = -EINVAL;
3448 cgroup_unlock();
3449
3450 return retval;
3451}
3452
3453
3454static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3455 enum mem_cgroup_stat_index idx)
3456{
3457 struct mem_cgroup *iter;
3458 s64 val = 0;
3459
3460
3461 for_each_mem_cgroup_tree(iter, mem)
3462 val += mem_cgroup_read_stat(iter, idx);
3463
3464 if (val < 0)
3465 val = 0;
3466 return val;
3467}
3468
3469static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3470{
3471 u64 val;
3472
3473 if (!mem_cgroup_is_root(mem)) {
3474 if (!swap)
3475 return res_counter_read_u64(&mem->res, RES_USAGE);
3476 else
3477 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3478 }
3479
3480 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
3481 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
3482
3483 if (swap)
3484 val += mem_cgroup_get_recursive_idx_stat(mem,
3485 MEM_CGROUP_STAT_SWAPOUT);
3486
3487 return val << PAGE_SHIFT;
3488}
3489
3490static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3491{
3492 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3493 u64 val;
3494 int type, name;
3495
3496 type = MEMFILE_TYPE(cft->private);
3497 name = MEMFILE_ATTR(cft->private);
3498 switch (type) {
3499 case _MEM:
3500 if (name == RES_USAGE)
3501 val = mem_cgroup_usage(mem, false);
3502 else
3503 val = res_counter_read_u64(&mem->res, name);
3504 break;
3505 case _MEMSWAP:
3506 if (name == RES_USAGE)
3507 val = mem_cgroup_usage(mem, true);
3508 else
3509 val = res_counter_read_u64(&mem->memsw, name);
3510 break;
3511 default:
3512 BUG();
3513 break;
3514 }
3515 return val;
3516}
3517
3518
3519
3520
3521static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3522 const char *buffer)
3523{
3524 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3525 int type, name;
3526 unsigned long long val;
3527 int ret;
3528
3529 type = MEMFILE_TYPE(cft->private);
3530 name = MEMFILE_ATTR(cft->private);
3531 switch (name) {
3532 case RES_LIMIT:
3533 if (mem_cgroup_is_root(memcg)) {
3534 ret = -EINVAL;
3535 break;
3536 }
3537
3538 ret = res_counter_memparse_write_strategy(buffer, &val);
3539 if (ret)
3540 break;
3541 if (type == _MEM)
3542 ret = mem_cgroup_resize_limit(memcg, val);
3543 else
3544 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3545 break;
3546 case RES_SOFT_LIMIT:
3547 ret = res_counter_memparse_write_strategy(buffer, &val);
3548 if (ret)
3549 break;
3550
3551
3552
3553
3554
3555 if (type == _MEM)
3556 ret = res_counter_set_soft_limit(&memcg->res, val);
3557 else
3558 ret = -EINVAL;
3559 break;
3560 default:
3561 ret = -EINVAL;
3562 break;
3563 }
3564 return ret;
3565}
3566
3567static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3568 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3569{
3570 struct cgroup *cgroup;
3571 unsigned long long min_limit, min_memsw_limit, tmp;
3572
3573 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3574 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3575 cgroup = memcg->css.cgroup;
3576 if (!memcg->use_hierarchy)
3577 goto out;
3578
3579 while (cgroup->parent) {
3580 cgroup = cgroup->parent;
3581 memcg = mem_cgroup_from_cont(cgroup);
3582 if (!memcg->use_hierarchy)
3583 break;
3584 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3585 min_limit = min(min_limit, tmp);
3586 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3587 min_memsw_limit = min(min_memsw_limit, tmp);
3588 }
3589out:
3590 *mem_limit = min_limit;
3591 *memsw_limit = min_memsw_limit;
3592 return;
3593}
3594
3595static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3596{
3597 struct mem_cgroup *mem;
3598 int type, name;
3599
3600 mem = mem_cgroup_from_cont(cont);
3601 type = MEMFILE_TYPE(event);
3602 name = MEMFILE_ATTR(event);
3603 switch (name) {
3604 case RES_MAX_USAGE:
3605 if (type == _MEM)
3606 res_counter_reset_max(&mem->res);
3607 else
3608 res_counter_reset_max(&mem->memsw);
3609 break;
3610 case RES_FAILCNT:
3611 if (type == _MEM)
3612 res_counter_reset_failcnt(&mem->res);
3613 else
3614 res_counter_reset_failcnt(&mem->memsw);
3615 break;
3616 }
3617
3618 return 0;
3619}
3620
3621static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3622 struct cftype *cft)
3623{
3624 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3625}
3626
3627#ifdef CONFIG_MMU
3628static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3629 struct cftype *cft, u64 val)
3630{
3631 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3632
3633 if (val >= (1 << NR_MOVE_TYPE))
3634 return -EINVAL;
3635
3636
3637
3638
3639
3640 cgroup_lock();
3641 mem->move_charge_at_immigrate = val;
3642 cgroup_unlock();
3643
3644 return 0;
3645}
3646#else
3647static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3648 struct cftype *cft, u64 val)
3649{
3650 return -ENOSYS;
3651}
3652#endif
3653
3654
3655
3656enum {
3657 MCS_CACHE,
3658 MCS_RSS,
3659 MCS_FILE_MAPPED,
3660 MCS_PGPGIN,
3661 MCS_PGPGOUT,
3662 MCS_SWAP,
3663 MCS_INACTIVE_ANON,
3664 MCS_ACTIVE_ANON,
3665 MCS_INACTIVE_FILE,
3666 MCS_ACTIVE_FILE,
3667 MCS_UNEVICTABLE,
3668 NR_MCS_STAT,
3669};
3670
3671struct mcs_total_stat {
3672 s64 stat[NR_MCS_STAT];
3673};
3674
3675struct {
3676 char *local_name;
3677 char *total_name;
3678} memcg_stat_strings[NR_MCS_STAT] = {
3679 {"cache", "total_cache"},
3680 {"rss", "total_rss"},
3681 {"mapped_file", "total_mapped_file"},
3682 {"pgpgin", "total_pgpgin"},
3683 {"pgpgout", "total_pgpgout"},
3684 {"swap", "total_swap"},
3685 {"inactive_anon", "total_inactive_anon"},
3686 {"active_anon", "total_active_anon"},
3687 {"inactive_file", "total_inactive_file"},
3688 {"active_file", "total_active_file"},
3689 {"unevictable", "total_unevictable"}
3690};
3691
3692
3693static void
3694mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3695{
3696 s64 val;
3697
3698
3699 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3700 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3701 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3702 s->stat[MCS_RSS] += val * PAGE_SIZE;
3703 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3704 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3705 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3706 s->stat[MCS_PGPGIN] += val;
3707 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3708 s->stat[MCS_PGPGOUT] += val;
3709 if (do_swap_account) {
3710 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3711 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3712 }
3713
3714
3715 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3716 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3717 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3718 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3719 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3720 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3721 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3722 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3723 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3724 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3725}
3726
3727static void
3728mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3729{
3730 struct mem_cgroup *iter;
3731
3732 for_each_mem_cgroup_tree(iter, mem)
3733 mem_cgroup_get_local_stat(iter, s);
3734}
3735
3736static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3737 struct cgroup_map_cb *cb)
3738{
3739 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3740 struct mcs_total_stat mystat;
3741 int i;
3742
3743 memset(&mystat, 0, sizeof(mystat));
3744 mem_cgroup_get_local_stat(mem_cont, &mystat);
3745
3746 for (i = 0; i < NR_MCS_STAT; i++) {
3747 if (i == MCS_SWAP && !do_swap_account)
3748 continue;
3749 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3750 }
3751
3752
3753 {
3754 unsigned long long limit, memsw_limit;
3755 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3756 cb->fill(cb, "hierarchical_memory_limit", limit);
3757 if (do_swap_account)
3758 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3759 }
3760
3761 memset(&mystat, 0, sizeof(mystat));
3762 mem_cgroup_get_total_stat(mem_cont, &mystat);
3763 for (i = 0; i < NR_MCS_STAT; i++) {
3764 if (i == MCS_SWAP && !do_swap_account)
3765 continue;
3766 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3767 }
3768
3769#ifdef CONFIG_DEBUG_VM
3770 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3771
3772 {
3773 int nid, zid;
3774 struct mem_cgroup_per_zone *mz;
3775 unsigned long recent_rotated[2] = {0, 0};
3776 unsigned long recent_scanned[2] = {0, 0};
3777
3778 for_each_online_node(nid)
3779 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3780 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3781
3782 recent_rotated[0] +=
3783 mz->reclaim_stat.recent_rotated[0];
3784 recent_rotated[1] +=
3785 mz->reclaim_stat.recent_rotated[1];
3786 recent_scanned[0] +=
3787 mz->reclaim_stat.recent_scanned[0];
3788 recent_scanned[1] +=
3789 mz->reclaim_stat.recent_scanned[1];
3790 }
3791 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3792 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3793 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3794 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3795 }
3796#endif
3797
3798 return 0;
3799}
3800
3801static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3802{
3803 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3804
3805 return get_swappiness(memcg);
3806}
3807
3808static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3809 u64 val)
3810{
3811 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3812 struct mem_cgroup *parent;
3813
3814 if (val > 100)
3815 return -EINVAL;
3816
3817 if (cgrp->parent == NULL)
3818 return -EINVAL;
3819
3820 parent = mem_cgroup_from_cont(cgrp->parent);
3821
3822 cgroup_lock();
3823
3824
3825 if ((parent->use_hierarchy) ||
3826 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3827 cgroup_unlock();
3828 return -EINVAL;
3829 }
3830
3831 spin_lock(&memcg->reclaim_param_lock);
3832 memcg->swappiness = val;
3833 spin_unlock(&memcg->reclaim_param_lock);
3834
3835 cgroup_unlock();
3836
3837 return 0;
3838}
3839
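/*
 * Walk the sorted threshold array outwards from current_threshold and
 * signal the eventfd of every threshold that current usage has crossed,
 * in either direction, then record the new current_threshold.
 */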
3840static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3841{
3842 struct mem_cgroup_threshold_ary *t;
3843 u64 usage;
3844 int i;
3845
3846 rcu_read_lock();
3847 if (!swap)
3848 t = rcu_dereference(memcg->thresholds.primary);
3849 else
3850 t = rcu_dereference(memcg->memsw_thresholds.primary);
3851
3852 if (!t)
3853 goto unlock;
3854
3855 usage = mem_cgroup_usage(memcg, swap);
3856
3857
3858
3859
3860
3861
3862 i = t->current_threshold;
3863
3864
3865
3866
3867
3868
3869
3870 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3871 eventfd_signal(t->entries[i].eventfd, 1);
3872
3873
3874 i++;
3875
3876
3877
3878
3879
3880
3881
3882 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3883 eventfd_signal(t->entries[i].eventfd, 1);
3884
3885
3886 t->current_threshold = i - 1;
3887unlock:
3888 rcu_read_unlock();
3889}
3890
3891static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3892{
3893 while (memcg) {
3894 __mem_cgroup_threshold(memcg, false);
3895 if (do_swap_account)
3896 __mem_cgroup_threshold(memcg, true);
3897
3898 memcg = parent_mem_cgroup(memcg);
3899 }
3900}
3901
3902static int compare_thresholds(const void *a, const void *b)
3903{
3904 const struct mem_cgroup_threshold *_a = a;
3905 const struct mem_cgroup_threshold *_b = b;
3906
3907 return _a->threshold - _b->threshold;
3908}
3909
3910static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3911{
3912 struct mem_cgroup_eventfd_list *ev;
3913
3914 list_for_each_entry(ev, &mem->oom_notify, list)
3915 eventfd_signal(ev->eventfd, 1);
3916 return 0;
3917}
3918
3919static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3920{
3921 struct mem_cgroup *iter;
3922
3923 for_each_mem_cgroup_tree(iter, mem)
3924 mem_cgroup_oom_notify_cb(iter);
3925}
3926
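/*
 * Register a usage threshold: allocate a copy of the primary array with
 * room for the new entry, sort it, recompute current_threshold and
 * publish it with rcu_assign_pointer(); the old array is kept as the
 * spare for the next unregister.
 */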
3927static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3928 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3929{
3930 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3931 struct mem_cgroup_thresholds *thresholds;
3932 struct mem_cgroup_threshold_ary *new;
3933 int type = MEMFILE_TYPE(cft->private);
3934 u64 threshold, usage;
3935 int i, size, ret;
3936
3937 ret = res_counter_memparse_write_strategy(args, &threshold);
3938 if (ret)
3939 return ret;
3940
3941 mutex_lock(&memcg->thresholds_lock);
3942
3943 if (type == _MEM)
3944 thresholds = &memcg->thresholds;
3945 else if (type == _MEMSWAP)
3946 thresholds = &memcg->memsw_thresholds;
3947 else
3948 BUG();
3949
3950 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3951
3952
3953 if (thresholds->primary)
3954 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3955
3956 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3957
3958
3959 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3960 GFP_KERNEL);
3961 if (!new) {
3962 ret = -ENOMEM;
3963 goto unlock;
3964 }
3965 new->size = size;
3966
3967
3968 if (thresholds->primary) {
3969 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3970 sizeof(struct mem_cgroup_threshold));
3971 }
3972
3973
3974 new->entries[size - 1].eventfd = eventfd;
3975 new->entries[size - 1].threshold = threshold;
3976
3977
3978 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3979 compare_thresholds, NULL);
3980
3981
3982 new->current_threshold = -1;
3983 for (i = 0; i < size; i++) {
3984 if (new->entries[i].threshold < usage) {
3985
3986
3987
3988
3989
3990 ++new->current_threshold;
3991 }
3992 }
3993
3994
3995 kfree(thresholds->spare);
3996 thresholds->spare = thresholds->primary;
3997
3998 rcu_assign_pointer(thresholds->primary, new);
3999
4000
4001 synchronize_rcu();
4002
4003unlock:
4004 mutex_unlock(&memcg->thresholds_lock);
4005
4006 return ret;
4007}
4008
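/*
 * Unregister a usage threshold: rebuild the array into the spare
 * buffer, skipping every entry that belongs to @eventfd, and swap it in
 * under RCU.
 */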
4009static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4010 struct cftype *cft, struct eventfd_ctx *eventfd)
4011{
4012 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4013 struct mem_cgroup_thresholds *thresholds;
4014 struct mem_cgroup_threshold_ary *new;
4015 int type = MEMFILE_TYPE(cft->private);
4016 u64 usage;
4017 int i, j, size;
4018
4019 mutex_lock(&memcg->thresholds_lock);
4020 if (type == _MEM)
4021 thresholds = &memcg->thresholds;
4022 else if (type == _MEMSWAP)
4023 thresholds = &memcg->memsw_thresholds;
4024 else
4025 BUG();
4026
4027
4028
4029
4030
4031 BUG_ON(!thresholds);
4032
4033 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4034
4035
4036 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4037
4038
4039 size = 0;
4040 for (i = 0; i < thresholds->primary->size; i++) {
4041 if (thresholds->primary->entries[i].eventfd != eventfd)
4042 size++;
4043 }
4044
4045 new = thresholds->spare;
4046
4047
4048 if (!size) {
4049 kfree(new);
4050 new = NULL;
4051 goto swap_buffers;
4052 }
4053
4054 new->size = size;
4055
4056
4057 new->current_threshold = -1;
4058 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4059 if (thresholds->primary->entries[i].eventfd == eventfd)
4060 continue;
4061
4062 new->entries[j] = thresholds->primary->entries[i];
4063 if (new->entries[j].threshold < usage) {
4064
4065
4066
4067
4068
4069 ++new->current_threshold;
4070 }
4071 j++;
4072 }
4073
4074swap_buffers:
4075
4076 thresholds->spare = thresholds->primary;
4077 rcu_assign_pointer(thresholds->primary, new);
4078
4079
4080 synchronize_rcu();
4081
4082 mutex_unlock(&memcg->thresholds_lock);
4083}
4084
4085static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4086 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4087{
4088 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4089 struct mem_cgroup_eventfd_list *event;
4090 int type = MEMFILE_TYPE(cft->private);
4091
4092 BUG_ON(type != _OOM_TYPE);
4093 event = kmalloc(sizeof(*event), GFP_KERNEL);
4094 if (!event)
4095 return -ENOMEM;
4096
4097 mutex_lock(&memcg_oom_mutex);
4098
4099 event->eventfd = eventfd;
4100 list_add(&event->list, &memcg->oom_notify);
4101
4102
4103 if (atomic_read(&memcg->oom_lock))
4104 eventfd_signal(eventfd, 1);
4105 mutex_unlock(&memcg_oom_mutex);
4106
4107 return 0;
4108}
4109
4110static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4111 struct cftype *cft, struct eventfd_ctx *eventfd)
4112{
4113 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4114 struct mem_cgroup_eventfd_list *ev, *tmp;
4115 int type = MEMFILE_TYPE(cft->private);
4116
4117 BUG_ON(type != _OOM_TYPE);
4118
4119 mutex_lock(&memcg_oom_mutex);
4120
4121 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4122 if (ev->eventfd == eventfd) {
4123 list_del(&ev->list);
4124 kfree(ev);
4125 }
4126 }
4127
4128 mutex_unlock(&memcg_oom_mutex);
4129}
4130
4131static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4132 struct cftype *cft, struct cgroup_map_cb *cb)
4133{
4134 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4135
4136 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4137
4138 if (atomic_read(&mem->oom_lock))
4139 cb->fill(cb, "under_oom", 1);
4140 else
4141 cb->fill(cb, "under_oom", 0);
4142 return 0;
4143}
4144
4145static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4146 struct cftype *cft, u64 val)
4147{
4148 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4149 struct mem_cgroup *parent;
4150
4151
4152 if (!cgrp->parent || !((val == 0) || (val == 1)))
4153 return -EINVAL;
4154
4155 parent = mem_cgroup_from_cont(cgrp->parent);
4156
4157 cgroup_lock();
4158
4159 if ((parent->use_hierarchy) ||
4160 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4161 cgroup_unlock();
4162 return -EINVAL;
4163 }
4164 mem->oom_kill_disable = val;
4165 if (!val)
4166 memcg_oom_recover(mem);
4167 cgroup_unlock();
4168 return 0;
4169}
4170
4171static struct cftype mem_cgroup_files[] = {
4172 {
4173 .name = "usage_in_bytes",
4174 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4175 .read_u64 = mem_cgroup_read,
4176 .register_event = mem_cgroup_usage_register_event,
4177 .unregister_event = mem_cgroup_usage_unregister_event,
4178 },
4179 {
4180 .name = "max_usage_in_bytes",
4181 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4182 .trigger = mem_cgroup_reset,
4183 .read_u64 = mem_cgroup_read,
4184 },
4185 {
4186 .name = "limit_in_bytes",
4187 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4188 .write_string = mem_cgroup_write,
4189 .read_u64 = mem_cgroup_read,
4190 },
4191 {
4192 .name = "soft_limit_in_bytes",
4193 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4194 .write_string = mem_cgroup_write,
4195 .read_u64 = mem_cgroup_read,
4196 },
4197 {
4198 .name = "failcnt",
4199 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4200 .trigger = mem_cgroup_reset,
4201 .read_u64 = mem_cgroup_read,
4202 },
4203 {
4204 .name = "stat",
4205 .read_map = mem_control_stat_show,
4206 },
4207 {
4208 .name = "force_empty",
4209 .trigger = mem_cgroup_force_empty_write,
4210 },
4211 {
4212 .name = "use_hierarchy",
4213 .write_u64 = mem_cgroup_hierarchy_write,
4214 .read_u64 = mem_cgroup_hierarchy_read,
4215 },
4216 {
4217 .name = "swappiness",
4218 .read_u64 = mem_cgroup_swappiness_read,
4219 .write_u64 = mem_cgroup_swappiness_write,
4220 },
4221 {
4222 .name = "move_charge_at_immigrate",
4223 .read_u64 = mem_cgroup_move_charge_read,
4224 .write_u64 = mem_cgroup_move_charge_write,
4225 },
4226 {
4227 .name = "oom_control",
4228 .read_map = mem_cgroup_oom_control_read,
4229 .write_u64 = mem_cgroup_oom_control_write,
4230 .register_event = mem_cgroup_oom_register_event,
4231 .unregister_event = mem_cgroup_oom_unregister_event,
4232 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4233 },
4234};
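/*
 * The entries above show up under the cgroup mount as files prefixed
 * with the subsystem name, e.g. (illustrative mount point and group):
 *
 *	echo 256M > /cgroup/memory/group0/memory.limit_in_bytes
 *	cat /cgroup/memory/group0/memory.usage_in_bytes
 *	echo 60 > /cgroup/memory/group0/memory.swappiness
 */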
4235
4236#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4237static struct cftype memsw_cgroup_files[] = {
4238 {
4239 .name = "memsw.usage_in_bytes",
4240 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4241 .read_u64 = mem_cgroup_read,
4242 .register_event = mem_cgroup_usage_register_event,
4243 .unregister_event = mem_cgroup_usage_unregister_event,
4244 },
4245 {
4246 .name = "memsw.max_usage_in_bytes",
4247 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4248 .trigger = mem_cgroup_reset,
4249 .read_u64 = mem_cgroup_read,
4250 },
4251 {
4252 .name = "memsw.limit_in_bytes",
4253 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4254 .write_string = mem_cgroup_write,
4255 .read_u64 = mem_cgroup_read,
4256 },
4257 {
4258 .name = "memsw.failcnt",
4259 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4260 .trigger = mem_cgroup_reset,
4261 .read_u64 = mem_cgroup_read,
4262 },
4263};
4264
4265static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4266{
4267 if (!do_swap_account)
4268 return 0;
4269 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4270 ARRAY_SIZE(memsw_cgroup_files));
4271};
4272#else
4273static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4274{
4275 return 0;
4276}
4277#endif
4278
4279static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4280{
4281 struct mem_cgroup_per_node *pn;
4282 struct mem_cgroup_per_zone *mz;
4283 enum lru_list l;
4284 int zone, tmp = node;
4285
4286
4287
4288
4289
4290
4291
4292
4293 if (!node_state(node, N_NORMAL_MEMORY))
4294 tmp = -1;
4295 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4296 if (!pn)
4297 return 1;
4298
4299 mem->info.nodeinfo[node] = pn;
4300 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4301 mz = &pn->zoneinfo[zone];
4302 for_each_lru(l)
4303 INIT_LIST_HEAD(&mz->lists[l]);
4304 mz->usage_in_excess = 0;
4305 mz->on_tree = false;
4306 mz->mem = mem;
4307 }
4308 return 0;
4309}
4310
4311static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4312{
4313 kfree(mem->info.nodeinfo[node]);
4314}
4315
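/*
 * struct mem_cgroup scales with MAX_NUMNODES and can exceed a page, so
 * fall back to vzalloc() when kzalloc() would need a high-order
 * allocation.
 */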
4316static struct mem_cgroup *mem_cgroup_alloc(void)
4317{
4318 struct mem_cgroup *mem;
4319 int size = sizeof(struct mem_cgroup);
4320
4321
4322 if (size < PAGE_SIZE)
4323 mem = kzalloc(size, GFP_KERNEL);
4324 else
4325 mem = vzalloc(size);
4326
4327 if (!mem)
4328 return NULL;
4329
4330 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4331 if (!mem->stat)
4332 goto out_free;
4333 spin_lock_init(&mem->pcp_counter_lock);
4334 return mem;
4335
4336out_free:
4337 if (size < PAGE_SIZE)
4338 kfree(mem);
4339 else
4340 vfree(mem);
4341 return NULL;
4342}
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
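/*
 * Final teardown of a memcg.  In addition to the css reference the
 * memcg has its own refcount: swap entries recorded against it and
 * hierarchical children each hold one, so the structure may outlive
 * rmdir of its cgroup and is freed only when that count reaches zero.
 */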
4355static void __mem_cgroup_free(struct mem_cgroup *mem)
4356{
4357 int node;
4358
4359 mem_cgroup_remove_from_trees(mem);
4360 free_css_id(&mem_cgroup_subsys, &mem->css);
4361
4362 for_each_node_state(node, N_POSSIBLE)
4363 free_mem_cgroup_per_zone_info(mem, node);
4364
4365 free_percpu(mem->stat);
4366 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4367 kfree(mem);
4368 else
4369 vfree(mem);
4370}
4371
4372static void mem_cgroup_get(struct mem_cgroup *mem)
4373{
4374 atomic_inc(&mem->refcnt);
4375}
4376
4377static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4378{
4379 if (atomic_sub_and_test(count, &mem->refcnt)) {
4380 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4381 __mem_cgroup_free(mem);
4382 if (parent)
4383 mem_cgroup_put(parent);
4384 }
4385}
4386
4387static void mem_cgroup_put(struct mem_cgroup *mem)
4388{
4389 __mem_cgroup_put(mem, 1);
4390}
4391
4392
4393
4394
4395static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4396{
4397 if (!mem->res.parent)
4398 return NULL;
4399 return mem_cgroup_from_res_counter(mem->res.parent, res);
4400}
4401
4402#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4403static void __init enable_swap_cgroup(void)
4404{
4405 if (!mem_cgroup_disabled() && really_do_swap_account)
4406 do_swap_account = 1;
4407}
4408#else
4409static void __init enable_swap_cgroup(void)
4410{
4411}
4412#endif
4413
4414static int mem_cgroup_soft_limit_tree_init(void)
4415{
4416 struct mem_cgroup_tree_per_node *rtpn;
4417 struct mem_cgroup_tree_per_zone *rtpz;
4418 int tmp, node, zone;
4419
4420 for_each_node_state(node, N_POSSIBLE) {
4421 tmp = node;
4422 if (!node_state(node, N_NORMAL_MEMORY))
4423 tmp = -1;
4424 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4425 if (!rtpn)
4426 return 1;
4427
4428 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4429
4430 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4431 rtpz = &rtpn->rb_tree_per_zone[zone];
4432 rtpz->rb_root = RB_ROOT;
4433 spin_lock_init(&rtpz->lock);
4434 }
4435 }
4436 return 0;
4437}
4438
4439static struct cgroup_subsys_state * __ref
4440mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4441{
4442 struct mem_cgroup *mem, *parent;
4443 long error = -ENOMEM;
4444 int node;
4445
4446 mem = mem_cgroup_alloc();
4447 if (!mem)
4448 return ERR_PTR(error);
4449
4450 for_each_node_state(node, N_POSSIBLE)
4451 if (alloc_mem_cgroup_per_zone_info(mem, node))
4452 goto free_out;
4453
4454
4455 if (cont->parent == NULL) {
4456 int cpu;
4457 enable_swap_cgroup();
4458 parent = NULL;
4459 root_mem_cgroup = mem;
4460 if (mem_cgroup_soft_limit_tree_init())
4461 goto free_out;
4462 for_each_possible_cpu(cpu) {
4463 struct memcg_stock_pcp *stock =
4464 &per_cpu(memcg_stock, cpu);
4465 INIT_WORK(&stock->work, drain_local_stock);
4466 }
4467 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4468 } else {
4469 parent = mem_cgroup_from_cont(cont->parent);
4470 mem->use_hierarchy = parent->use_hierarchy;
4471 mem->oom_kill_disable = parent->oom_kill_disable;
4472 }
4473
4474 if (parent && parent->use_hierarchy) {
4475 res_counter_init(&mem->res, &parent->res);
4476 res_counter_init(&mem->memsw, &parent->memsw);
4477
4478
4479
4480
4481
4482
4483 mem_cgroup_get(parent);
4484 } else {
4485 res_counter_init(&mem->res, NULL);
4486 res_counter_init(&mem->memsw, NULL);
4487 }
4488 mem->last_scanned_child = 0;
4489 spin_lock_init(&mem->reclaim_param_lock);
4490 INIT_LIST_HEAD(&mem->oom_notify);
4491
4492 if (parent)
4493 mem->swappiness = get_swappiness(parent);
4494 atomic_set(&mem->refcnt, 1);
4495 mem->move_charge_at_immigrate = 0;
4496 mutex_init(&mem->thresholds_lock);
4497 return &mem->css;
4498free_out:
4499 __mem_cgroup_free(mem);
4500 root_mem_cgroup = NULL;
4501 return ERR_PTR(error);
4502}
4503
4504static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4505 struct cgroup *cont)
4506{
4507 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4508
4509 return mem_cgroup_force_empty(mem, false);
4510}
4511
4512static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4513 struct cgroup *cont)
4514{
4515 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4516
4517 mem_cgroup_put(mem);
4518}
4519
4520static int mem_cgroup_populate(struct cgroup_subsys *ss,
4521 struct cgroup *cont)
4522{
4523 int ret;
4524
4525 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4526 ARRAY_SIZE(mem_cgroup_files));
4527
4528 if (!ret)
4529 ret = register_memsw_files(cont, ss);
4530 return ret;
4531}
4532
4533#ifdef CONFIG_MMU
4534
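/*
 * Handlers for moving charges at task migration
 * ("move_charge_at_immigrate").  mem_cgroup_do_precharge() charges
 * @count pages to mc.to up front, in one res_counter call when
 * possible, falling back to page-by-page charging (which may reclaim).
 */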
4535#define PRECHARGE_COUNT_AT_ONCE 256
4536static int mem_cgroup_do_precharge(unsigned long count)
4537{
4538 int ret = 0;
4539 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4540 struct mem_cgroup *mem = mc.to;
4541
4542 if (mem_cgroup_is_root(mem)) {
4543 mc.precharge += count;
4544
4545 return ret;
4546 }
4547
4548 if (count > 1) {
4549 struct res_counter *dummy;
4550
4551
4552
4553
4554
4555
4556 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4557 goto one_by_one;
4558 if (do_swap_account && res_counter_charge(&mem->memsw,
4559 PAGE_SIZE * count, &dummy)) {
4560 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4561 goto one_by_one;
4562 }
4563 mc.precharge += count;
4564 return ret;
4565 }
4566one_by_one:
4567
4568 while (count--) {
4569 if (signal_pending(current)) {
4570 ret = -EINTR;
4571 break;
4572 }
4573 if (!batch_count--) {
4574 batch_count = PRECHARGE_COUNT_AT_ONCE;
4575 cond_resched();
4576 }
4577 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
4578 PAGE_SIZE);
4579 if (ret || !mem)
4580
4581 return -ENOMEM;
4582 mc.precharge++;
4583 }
4584 return ret;
4585}
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
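/*
 * is_target_pte_for_mc() decides whether a pte points at something that
 * should be moved: it returns MC_TARGET_PAGE with a pinned page when a
 * present (or file/swapcache backed) page is charged to mc.from,
 * MC_TARGET_SWAP with the swap entry when only the swap charge should
 * move, and 0 otherwise.
 */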
4605union mc_target {
4606 struct page *page;
4607 swp_entry_t ent;
4608};
4609
4610enum mc_target_type {
4611 MC_TARGET_NONE,
4612 MC_TARGET_PAGE,
4613 MC_TARGET_SWAP,
4614};
4615
4616static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4617 unsigned long addr, pte_t ptent)
4618{
4619 struct page *page = vm_normal_page(vma, addr, ptent);
4620
4621 if (!page || !page_mapped(page))
4622 return NULL;
4623 if (PageAnon(page)) {
4624
4625 if (!move_anon() || page_mapcount(page) > 2)
4626 return NULL;
4627 } else if (!move_file())
4628
4629 return NULL;
4630 if (!get_page_unless_zero(page))
4631 return NULL;
4632
4633 return page;
4634}
4635
4636static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4637 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4638{
4639 int usage_count;
4640 struct page *page = NULL;
4641 swp_entry_t ent = pte_to_swp_entry(ptent);
4642
4643 if (!move_anon() || non_swap_entry(ent))
4644 return NULL;
4645 usage_count = mem_cgroup_count_swap_user(ent, &page);
4646 if (usage_count > 1) {
4647 if (page)
4648 put_page(page);
4649 return NULL;
4650 }
4651 if (do_swap_account)
4652 entry->val = ent.val;
4653
4654 return page;
4655}
4656
4657static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4658 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4659{
4660 struct page *page = NULL;
4661 struct inode *inode;
4662 struct address_space *mapping;
4663 pgoff_t pgoff;
4664
4665 if (!vma->vm_file)
4666 return NULL;
4667 if (!move_file())
4668 return NULL;
4669
4670 inode = vma->vm_file->f_path.dentry->d_inode;
4671 mapping = vma->vm_file->f_mapping;
4672 if (pte_none(ptent))
4673 pgoff = linear_page_index(vma, addr);
4674 else
4675 pgoff = pte_to_pgoff(ptent);
4676
4677
4678 if (!mapping_cap_swap_backed(mapping)) {
4679 page = find_get_page(mapping, pgoff);
4680 } else {
4681 swp_entry_t ent;
4682 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4683 if (do_swap_account)
4684 entry->val = ent.val;
4685 }
4686
4687 return page;
4688}
4689
4690static int is_target_pte_for_mc(struct vm_area_struct *vma,
4691 unsigned long addr, pte_t ptent, union mc_target *target)
4692{
4693 struct page *page = NULL;
4694 struct page_cgroup *pc;
4695 int ret = 0;
4696 swp_entry_t ent = { .val = 0 };
4697
4698 if (pte_present(ptent))
4699 page = mc_handle_present_pte(vma, addr, ptent);
4700 else if (is_swap_pte(ptent))
4701 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4702 else if (pte_none(ptent) || pte_file(ptent))
4703 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4704
4705 if (!page && !ent.val)
4706 return 0;
4707 if (page) {
4708 pc = lookup_page_cgroup(page);
4709
4710
4711
4712
4713
4714 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4715 ret = MC_TARGET_PAGE;
4716 if (target)
4717 target->page = page;
4718 }
4719 if (!ret || !target)
4720 put_page(page);
4721 }
4722
4723 if (ent.val && !ret &&
4724 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4725 ret = MC_TARGET_SWAP;
4726 if (target)
4727 target->ent = ent;
4728 }
4729 return ret;
4730}
4731
4732static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4733 unsigned long addr, unsigned long end,
4734 struct mm_walk *walk)
4735{
4736 struct vm_area_struct *vma = walk->private;
4737 pte_t *pte;
4738 spinlock_t *ptl;
4739
4740 VM_BUG_ON(pmd_trans_huge(*pmd));
4741 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4742 for (; addr != end; pte++, addr += PAGE_SIZE)
4743 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4744 mc.precharge++;
4745 pte_unmap_unlock(pte - 1, ptl);
4746 cond_resched();
4747
4748 return 0;
4749}
4750
4751static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4752{
4753 unsigned long precharge;
4754 struct vm_area_struct *vma;
4755
4756 down_read(&mm->mmap_sem);
4757 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4758 struct mm_walk mem_cgroup_count_precharge_walk = {
4759 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4760 .mm = mm,
4761 .private = vma,
4762 };
4763 if (is_vm_hugetlb_page(vma))
4764 continue;
4765 walk_page_range(vma->vm_start, vma->vm_end,
4766 &mem_cgroup_count_precharge_walk);
4767 }
4768 up_read(&mm->mmap_sem);
4769
4770 precharge = mc.precharge;
4771 mc.precharge = 0;
4772
4773 return precharge;
4774}
4775
4776static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4777{
4778 unsigned long precharge = mem_cgroup_count_precharge(mm);
4779
4780 VM_BUG_ON(mc.moving_task);
4781 mc.moving_task = current;
4782 return mem_cgroup_do_precharge(precharge);
4783}
4784
4785
4786static void __mem_cgroup_clear_mc(void)
4787{
4788 struct mem_cgroup *from = mc.from;
4789 struct mem_cgroup *to = mc.to;
4790
4791
4792 if (mc.precharge) {
4793 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4794 mc.precharge = 0;
4795 }
4796
4797
4798
4799
4800 if (mc.moved_charge) {
4801 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4802 mc.moved_charge = 0;
4803 }
4804
4805 if (mc.moved_swap) {
4806
4807 if (!mem_cgroup_is_root(mc.from))
4808 res_counter_uncharge(&mc.from->memsw,
4809 PAGE_SIZE * mc.moved_swap);
4810 __mem_cgroup_put(mc.from, mc.moved_swap);
4811
4812 if (!mem_cgroup_is_root(mc.to)) {
4813
4814
4815
4816
4817 res_counter_uncharge(&mc.to->res,
4818 PAGE_SIZE * mc.moved_swap);
4819 }
4820
4821 mc.moved_swap = 0;
4822 }
4823 memcg_oom_recover(from);
4824 memcg_oom_recover(to);
4825 wake_up_all(&mc.waitq);
4826}
4827
4828static void mem_cgroup_clear_mc(void)
4829{
4830 struct mem_cgroup *from = mc.from;
4831
4832
4833
4834
4835
4836 mc.moving_task = NULL;
4837 __mem_cgroup_clear_mc();
4838 spin_lock(&mc.lock);
4839 mc.from = NULL;
4840 mc.to = NULL;
4841 spin_unlock(&mc.lock);
4842 mem_cgroup_end_move(from);
4843}
4844
4845static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4846 struct cgroup *cgroup,
4847 struct task_struct *p,
4848 bool threadgroup)
4849{
4850 int ret = 0;
4851 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4852
4853 if (mem->move_charge_at_immigrate) {
4854 struct mm_struct *mm;
4855 struct mem_cgroup *from = mem_cgroup_from_task(p);
4856
4857 VM_BUG_ON(from == mem);
4858
4859 mm = get_task_mm(p);
4860 if (!mm)
4861 return 0;
4862
4863 if (mm->owner == p) {
4864 VM_BUG_ON(mc.from);
4865 VM_BUG_ON(mc.to);
4866 VM_BUG_ON(mc.precharge);
4867 VM_BUG_ON(mc.moved_charge);
4868 VM_BUG_ON(mc.moved_swap);
4869 mem_cgroup_start_move(from);
4870 spin_lock(&mc.lock);
4871 mc.from = from;
4872 mc.to = mem;
4873 spin_unlock(&mc.lock);
4874
4875
4876 ret = mem_cgroup_precharge_mc(mm);
4877 if (ret)
4878 mem_cgroup_clear_mc();
4879 }
4880 mmput(mm);
4881 }
4882 return ret;
4883}
4884
4885static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4886 struct cgroup *cgroup,
4887 struct task_struct *p,
4888 bool threadgroup)
4889{
4890 mem_cgroup_clear_mc();
4891}
4892
4893static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4894 unsigned long addr, unsigned long end,
4895 struct mm_walk *walk)
4896{
4897 int ret = 0;
4898 struct vm_area_struct *vma = walk->private;
4899 pte_t *pte;
4900 spinlock_t *ptl;
4901
4902retry:
4903 VM_BUG_ON(pmd_trans_huge(*pmd));
4904 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4905 for (; addr != end; addr += PAGE_SIZE) {
4906 pte_t ptent = *(pte++);
4907 union mc_target target;
4908 int type;
4909 struct page *page;
4910 struct page_cgroup *pc;
4911 swp_entry_t ent;
4912
4913 if (!mc.precharge)
4914 break;
4915
4916 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4917 switch (type) {
4918 case MC_TARGET_PAGE:
4919 page = target.page;
4920 if (isolate_lru_page(page))
4921 goto put;
4922 pc = lookup_page_cgroup(page);
4923 if (!mem_cgroup_move_account(pc,
4924 mc.from, mc.to, false, PAGE_SIZE)) {
4925 mc.precharge--;
4926
4927 mc.moved_charge++;
4928 }
4929 putback_lru_page(page);
4930put:
4931 put_page(page);
4932 break;
4933 case MC_TARGET_SWAP:
4934 ent = target.ent;
4935 if (!mem_cgroup_move_swap_account(ent,
4936 mc.from, mc.to, false)) {
4937 mc.precharge--;
4938
4939 mc.moved_swap++;
4940 }
4941 break;
4942 default:
4943 break;
4944 }
4945 }
4946 pte_unmap_unlock(pte - 1, ptl);
4947 cond_resched();
4948
4949 if (addr != end) {
4950
4951
4952
4953
4954
4955
4956 ret = mem_cgroup_do_precharge(1);
4957 if (!ret)
4958 goto retry;
4959 }
4960
4961 return ret;
4962}
4963
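/*
 * Walk every VMA of @mm and move each movable pte's charge from mc.from
 * to mc.to, consuming one precharge per page or swap entry.  If the
 * precharge pool runs dry mid-range, one more page is precharged and
 * the range is retried.
 */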
4964static void mem_cgroup_move_charge(struct mm_struct *mm)
4965{
4966 struct vm_area_struct *vma;
4967
4968 lru_add_drain_all();
4969retry:
4970 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
4971
4972
4973
4974
4975
4976
4977
4978 __mem_cgroup_clear_mc();
4979 cond_resched();
4980 goto retry;
4981 }
4982 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4983 int ret;
4984 struct mm_walk mem_cgroup_move_charge_walk = {
4985 .pmd_entry = mem_cgroup_move_charge_pte_range,
4986 .mm = mm,
4987 .private = vma,
4988 };
4989 if (is_vm_hugetlb_page(vma))
4990 continue;
4991 ret = walk_page_range(vma->vm_start, vma->vm_end,
4992 &mem_cgroup_move_charge_walk);
4993 if (ret)
4994
4995
4996
4997
4998 break;
4999 }
5000 up_read(&mm->mmap_sem);
5001}
5002
5003static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5004 struct cgroup *cont,
5005 struct cgroup *old_cont,
5006 struct task_struct *p,
5007 bool threadgroup)
5008{
5009 struct mm_struct *mm;
5010
5011 if (!mc.to)
5012
5013 return;
5014
5015 mm = get_task_mm(p);
5016 if (mm) {
5017 mem_cgroup_move_charge(mm);
5018 mmput(mm);
5019 }
5020 mem_cgroup_clear_mc();
5021}
5022#else
5023static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5024 struct cgroup *cgroup,
5025 struct task_struct *p,
5026 bool threadgroup)
5027{
5028 return 0;
5029}
5030static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5031 struct cgroup *cgroup,
5032 struct task_struct *p,
5033 bool threadgroup)
5034{
5035}
5036static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5037 struct cgroup *cont,
5038 struct cgroup *old_cont,
5039 struct task_struct *p,
5040 bool threadgroup)
5041{
5042}
5043#endif
5044
5045struct cgroup_subsys mem_cgroup_subsys = {
5046 .name = "memory",
5047 .subsys_id = mem_cgroup_subsys_id,
5048 .create = mem_cgroup_create,
5049 .pre_destroy = mem_cgroup_pre_destroy,
5050 .destroy = mem_cgroup_destroy,
5051 .populate = mem_cgroup_populate,
5052 .can_attach = mem_cgroup_can_attach,
5053 .cancel_attach = mem_cgroup_cancel_attach,
5054 .attach = mem_cgroup_move_task,
5055 .early_init = 0,
5056 .use_id = 1,
5057};
5058
5059#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5060static int __init enable_swap_account(char *s)
5061{
5062
5063 if (!(*s) || !strcmp(s, "=1"))
5064 really_do_swap_account = 1;
5065 else if (!strcmp(s, "=0"))
5066 really_do_swap_account = 0;
5067 return 1;
5068}
5069__setup("swapaccount", enable_swap_account);
5070
5071static int __init disable_swap_account(char *s)
5072{
5073 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5074 enable_swap_account("=0");
5075 return 1;
5076}
5077__setup("noswapaccount", disable_swap_account);
5078#endif
5079