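/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 */
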
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
int do_swap_account __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		(0)
#endif

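/*
 * Per-memcg statistics and events, kept as per-cpu counters and summed
 * up when read.  MEM_CGROUP_ON_MOVE signals that charges are being
 * moved out of this memcg; see mem_cgroup_start_move()/end_move().
 */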
enum mem_cgroup_stat_index {
	MEM_CGROUP_STAT_CACHE,
	MEM_CGROUP_STAT_RSS,
	MEM_CGROUP_STAT_FILE_MAPPED,
	MEM_CGROUP_STAT_SWAPOUT,
	MEM_CGROUP_STAT_DATA,
	MEM_CGROUP_ON_MOVE,
	MEM_CGROUP_STAT_NSTATS,
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,
	MEM_CGROUP_EVENTS_PGPGOUT,
	MEM_CGROUP_EVENTS_COUNT,
	MEM_CGROUP_EVENTS_NSTATS,
};

enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

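/*
 * Per-zone state of a memcg: LRU lists and per-LRU page counts, reclaim
 * statistics, and this zone's node in the soft-limit RB-tree.
 */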
struct mem_cgroup_per_zone {
	struct list_head lists[NR_LRU_LISTS];
	unsigned long count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node tree_node;
	unsigned long long usage_in_excess;

	bool on_tree;
	struct mem_cgroup *mem;
};

#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

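/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 */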
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

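/*
 * Memory thresholds: eventfd notifications fired when memory usage
 * crosses a registered threshold.
 */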
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

struct mem_cgroup_threshold_ary {
	int current_threshold;
	unsigned int size;
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	struct mem_cgroup_threshold_ary *primary;
	struct mem_cgroup_threshold_ary *spare;
};

struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

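/*
 * The memory controller data structure. The memory controller controls
 * both page cache and RSS per cgroup.
 */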
struct mem_cgroup {
	struct cgroup_subsys_state css;
	struct res_counter res;
	struct res_counter memsw;
	struct mem_cgroup_lru_info info;
	int last_scanned_child;
	bool use_hierarchy;
	atomic_t oom_lock;
	atomic_t refcnt;

	unsigned int swappiness;
	int oom_kill_disable;
	bool memsw_is_minimum;

	struct mutex thresholds_lock;
	struct mem_cgroup_thresholds thresholds;
	struct mem_cgroup_thresholds memsw_thresholds;
	struct list_head oom_notify;

	unsigned long move_charge_at_immigrate;

	struct mem_cgroup_stat_cpu *stat;
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
};

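/*
 * Stuff for moving charges at task migration: the types of charges to
 * be moved and the state of the move in flight.
 */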
enum move_type {
	MOVE_CHARGE_TYPE_ANON,
	MOVE_CHARGE_TYPE_FILE,
	NR_MOVE_TYPE,
};

static struct move_charge_struct {
	spinlock_t lock;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;
	wait_queue_head_t waitq;
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
					&mc.to->move_charge_at_immigrate);
}

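/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */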
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,
	MEM_CGROUP_CHARGE_TYPE_FORCE,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
	MEM_CGROUP_CHARGE_TYPE_DROP,
	NR_CHARGE_TYPE,
};

#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

#define OOM_CONTROL		(0)

#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT	(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
						tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}

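/*
 * Re-position this memcg, and every ancestor of it, in the soft-limit
 * tree of the zone this page belongs to, according to the current
 * excess over the soft limit.
 */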
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		excess = res_counter_soft_limit_excess(&mem->res);
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node_state(node, N_POSSIBLE) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(mem, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(mem, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

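/*
 * Implementation note: reading per-cpu statistics for memcg.
 *
 * The per-cpu counters are summed without serializing against writers,
 * so the result is only approximate; this imprecision is tolerated for
 * statistics.  Counters of offlined cpus are folded into nocpu_base.
 */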
static long mem_cgroup_read_stat(struct mem_cgroup *mem,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static long mem_cgroup_local_usage(struct mem_cgroup *mem)
{
	long ret;

	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	return ret;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.events[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 bool file, int nr_pages)
{
	preempt_disable();

	if (file)
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);

	if (nr_pages > 0)
		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages;
	}

	__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);

	preempt_enable();
}

static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static bool __memcg_event_check(struct mem_cgroup *mem, int target)
{
	unsigned long val, next;

	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
	next = this_cpu_read(mem->stat->targets[target]);

	return ((long)next - (long)val < 0);
}

static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
{
	unsigned long val, next;

	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);

	switch (target) {
	case MEM_CGROUP_TARGET_THRESH:
		next = val + THRESHOLDS_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_SOFTLIMIT:
		next = val + SOFTLIMIT_EVENTS_TARGET;
		break;
	default:
		return;
	}

	this_cpu_write(mem->stat->targets[target], next);
}

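/*
 * Check events in order.
 */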
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
	if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
		mem_cgroup_threshold(mem);
		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
		if (unlikely(__memcg_event_check(mem,
				MEM_CGROUP_TARGET_SOFTLIMIT))) {
			mem_cgroup_update_tree(mem, page);
			__mem_cgroup_target_update(mem,
				MEM_CGROUP_TARGET_SOFTLIMIT);
		}
	}
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;

	if (!mm)
		return NULL;

	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}

static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
	struct cgroup_subsys_state *css;
	int found;

	if (!mem)
		return root_mem_cgroup;
	if (!mem->use_hierarchy) {
		if (css_tryget(&mem->css))
			return mem;
		return NULL;
	}
	rcu_read_lock();
	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
	if (css && css_tryget(css))
		mem = container_of(css, struct mem_cgroup, css);
	else
		mem = NULL;
	rcu_read_unlock();
	return mem;
}

static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
					struct mem_cgroup *root,
					bool cond)
{
	int nextid = css_id(&iter->css) + 1;
	int found;
	int hierarchy_used;
	struct cgroup_subsys_state *css;

	hierarchy_used = iter->use_hierarchy;

	css_put(&iter->css);

	if (!cond || (root && !hierarchy_used))
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	do {
		iter = NULL;
		rcu_read_lock();

		css = css_get_next(&mem_cgroup_subsys, nextid,
				&root->css, &found);
		if (css && css_tryget(css))
			iter = container_of(css, struct mem_cgroup, css);
		rcu_read_unlock();

		nextid = found + 1;
	} while (css && !iter);

	return iter;
}

#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
	for (iter = mem_cgroup_start_loop(root);\
	     iter != NULL;\
	     iter = mem_cgroup_get_next(iter, root, cond))

#define for_each_mem_cgroup_tree(iter, root) \
	for_each_mem_cgroup_tree_cond(iter, root, true)

#define for_each_mem_cgroup_all(iter) \
	for_each_mem_cgroup_tree_cond(iter, NULL, true)

static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
	return (mem == root_mem_cgroup);
}

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);

	if (!TestClearPageCgroupAcctLRU(pc))
		return;
	VM_BUG_ON(!pc->mem_cgroup);
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);

	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	VM_BUG_ON(list_empty(&pc->lru));
	list_del_init(&pc->lru);
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_reclaimable_page(struct page *page)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;
	enum lru_list lru = page_lru(page);

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);

	if (!PageCgroupUsed(pc))
		return;

	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move_tail(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);

	if (!PageCgroupUsed(pc))
		return;

	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	VM_BUG_ON(PageCgroupAcctLRU(pc));
	if (!PageCgroupUsed(pc))
		return;

	smp_rmb();
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);

	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}

static void mem_cgroup_lru_del_before_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	if (likely(!PageLRU(page)))
		return;

	spin_lock_irqsave(&zone->lru_lock, flags);
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	if (likely(!PageLRU(page)))
		return;
	spin_lock_irqsave(&zone->lru_lock, flags);
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;

	p = find_lock_task_mm(task);
	if (!p)
		return 0;
	curr = try_get_mem_cgroup_from_mm(p->mm);
	task_unlock(p);
	if (!curr)
		return 0;
	if (mem->use_hierarchy)
		ret = css_is_ancestor(&curr->css, &mem->css);
	else
		ret = (curr == mem);
	css_put(&curr->css);
	return ret;
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);

	return (active > inactive);
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	if (!PageCgroupUsed(pc))
		return NULL;

	smp_rmb();
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = zone_to_nid(z);
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * file + active;
	int ret;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		if (unlikely(!PageCgroupUsed(pc)))
			continue;

		page = lookup_cgroup_page(pc);

		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		ret = __isolate_lru_page(page, mode, file);
		switch (ret) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken += hpage_nr_pages(page);
			break;
		case -EBUSY:
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			break;
		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

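/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @mem: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */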
static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
	unsigned long long margin;

	margin = res_counter_margin(&mem->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&mem->memsw));
	return margin >> PAGE_SHIFT;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;

	if (cgrp->parent == NULL)
		return vm_swappiness;

	return memcg->swappiness;
}

static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
	int cpu;

	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();

	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
	int cpu;

	if (!mem)
		return;
	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();
}

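/*
 * Two routines for checking whether "mem" is under move_account():
 *
 * mem_cgroup_stealed() - check whether a cgroup is mc.from, to avoid
 *	races in accounting; if true, pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - check whether a cgroup is mc.from, mc.to,
 *	or in the hierarchy of a moving cgroup; used for waiting out
 *	memory pressure caused by a move.
 */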
static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;

	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;
	if (from == mem || to == mem
	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
		ret = true;
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(mem)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;

	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		num++;
	return num;
}

u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;
	u64 memsw;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
	limit += total_swap_pages << PAGE_SHIFT;

	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	return min(limit, memsw);
}

static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();

		if (!css) {
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
	}

	return ret;
}

static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess;

	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;

	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			css_put(&victim->css);
			continue;
		}
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (!res_counter_soft_limit_excess(&root_mem->res))
				return total;
		} else if (mem_cgroup_margin(root_mem))
			return 1 + total;
	}
	return total;
}

static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int x, lock_count = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem) {
		x = atomic_inc_return(&iter->oom_lock);
		lock_count = max(x, lock_count);
	}

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		atomic_add_unless(&iter->oom_lock, -1, 0);
	return 0;
}

static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *mem;
	wait_queue_t wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);

	if (oom_wait_info->mem == wake_mem)
		goto wakeup;

	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
		return 0;

	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
		return 0;

wakeup:
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}

static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (mem && atomic_read(&mem->oom_lock))
		memcg_wakeup_oom(mem);
}

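/*
 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
 */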
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	owait.mem = mem;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;

	mutex_lock(&memcg_oom_mutex);
	locked = mem_cgroup_oom_lock(mem);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || mem->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(mem, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	mutex_lock(&memcg_oom_mutex);
	mem_cgroup_oom_unlock(mem);
	memcg_wakeup_oom(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;

	schedule_timeout(1);
	return true;
}

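/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.  Racing with charge
 * moving is handled by taking the page_cgroup move lock when a move may
 * be in progress (see mem_cgroup_stealed()).
 */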
void mem_cgroup_update_page_stat(struct page *page,
				 enum mem_cgroup_page_stat_item idx, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc = lookup_page_cgroup(page);
	bool need_unlock = false;
	unsigned long uninitialized_var(flags);

	if (unlikely(!pc))
		return;

	rcu_read_lock();
	mem = pc->mem_cgroup;
	if (unlikely(!mem || !PageCgroupUsed(pc)))
		goto out;

	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
		move_lock_page_cgroup(pc, &flags);
		need_unlock = true;
		mem = pc->mem_cgroup;
		if (!mem || !PageCgroupUsed(pc))
			goto out;
	}

	switch (idx) {
	case MEMCG_NR_FILE_MAPPED:
		if (val > 0)
			SetPageCgroupFileMapped(pc);
		else if (!page_mapped(page))
			ClearPageCgroupFileMapped(pc);
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;
	default:
		BUG();
	}

	this_cpu_add(mem->stat->count[idx], val);

out:
	if (unlikely(need_unlock))
		move_unlock_page_cgroup(pc, &flags);
	rcu_read_unlock();
	return;
}
EXPORT_SYMBOL(mem_cgroup_update_page_stat);

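/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */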
#define CHARGE_BATCH	32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached;
	unsigned int nr_pages;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;

static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->nr_pages)
		stock->nr_pages--;
	else
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}

static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->nr_pages) {
		unsigned long bytes = stock->nr_pages * PAGE_SIZE;

		res_counter_uncharge(&old->res, bytes);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, bytes);
		stock->nr_pages = 0;
	}
	stock->cached = NULL;
}

static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) {
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}

static void drain_all_stock_async(void)
{
	int cpu;

	if (atomic_read(&memcg_drain_count))
		return;

	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
}

static void drain_all_stock_sync(void)
{
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
{
	int i;

	spin_lock(&mem->pcp_counter_lock);
	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
		long x = per_cpu(mem->stat->count[i], cpu);

		per_cpu(mem->stat->count[i], cpu) = 0;
		mem->nocpu_base.count[i] += x;
	}
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
		unsigned long x = per_cpu(mem->stat->events[i], cpu);

		per_cpu(mem->stat->events[i], cpu) = 0;
		mem->nocpu_base.events[i] += x;
	}

	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
	spin_unlock(&mem->pcp_counter_lock);
}

static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
{
	int idx = MEM_CGROUP_ON_MOVE;

	spin_lock(&mem->pcp_counter_lock);
	per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
}

static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	if (action == CPU_ONLINE) {
		for_each_mem_cgroup_all(iter)
			synchronize_mem_cgroup_on_move(iter, cpu);
		return NOTIFY_OK;
	}

	/* The original "||" made this always true, so the dead cpu's
	 * counters were never drained; it must be "&&". */
	if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
		return NOTIFY_OK;

	for_each_mem_cgroup_all(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}

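/* See __mem_cgroup_try_charge() for details. */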
enum {
	CHARGE_OK,
	CHARGE_RETRY,
	CHARGE_NOMEM,
	CHARGE_WOULDBLOCK,
	CHARGE_OOM_DIE,
};

static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
				unsigned int nr_pages, bool oom_check)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&mem->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		res_counter_uncharge(&mem->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);

	if (nr_pages == CHARGE_BATCH)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
					      gfp_mask, flags);
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;

	if (nr_pages == 1 && ret)
		return CHARGE_RETRY;

	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	if (!oom_check)
		return CHARGE_NOMEM;

	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}

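/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the OOM killer may be invoked.
 */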
static int __mem_cgroup_try_charge(struct mm_struct *mm,
				   gfp_t gfp_mask,
				   unsigned int nr_pages,
				   struct mem_cgroup **memcg,
				   bool oom)
{
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem = NULL;
	int ret;

	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	if (!*memcg && !mm)
		goto bypass;
again:
	if (*memcg) {
		mem = *memcg;
		VM_BUG_ON(css_is_removed(&mem->css));
		if (mem_cgroup_is_root(mem))
			goto done;
		if (nr_pages == 1 && consume_stock(mem))
			goto done;
		css_get(&mem->css);
	} else {
		struct task_struct *p;

		rcu_read_lock();
		p = rcu_dereference(mm->owner);
		mem = mem_cgroup_from_task(p);
		if (!mem || mem_cgroup_is_root(mem)) {
			rcu_read_unlock();
			goto done;
		}
		if (nr_pages == 1 && consume_stock(mem)) {
			rcu_read_unlock();
			goto done;
		}
		if (!css_tryget(&mem->css)) {
			rcu_read_unlock();
			goto again;
		}
		rcu_read_unlock();
	}

	do {
		bool oom_check;

		if (fatal_signal_pending(current)) {
			css_put(&mem->css);
			goto bypass;
		}

		oom_check = false;
		if (oom && !nr_oom_retries) {
			oom_check = true;
			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
		}

		ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:
			batch = nr_pages;
			css_put(&mem->css);
			mem = NULL;
			goto again;
		case CHARGE_WOULDBLOCK:
			css_put(&mem->css);
			goto nomem;
		case CHARGE_NOMEM:
			if (!oom) {
				css_put(&mem->css);
				goto nomem;
			}
			nr_oom_retries--;
			break;
		case CHARGE_OOM_DIE:
			css_put(&mem->css);
			goto bypass;
		}
	} while (ret != CHARGE_OK);

	if (batch > nr_pages)
		refill_stock(mem, batch - nr_pages);
	css_put(&mem->css);
done:
	*memcg = mem;
	return 0;
nomem:
	*memcg = NULL;
	return -ENOMEM;
bypass:
	*memcg = NULL;
	return 0;
}

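/*
 * Undo a charge of @nr_pages pages: give the pages back to the res
 * counter and, with swap accounting, to the memsw counter as well.
 */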
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
				       unsigned int nr_pages)
{
	if (!mem_cgroup_is_root(mem)) {
		unsigned long bytes = nr_pages * PAGE_SIZE;

		res_counter_uncharge(&mem->res, bytes);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, bytes);
	}
}

static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return container_of(css, struct mem_cgroup, css);
}

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup(ent);
		rcu_read_lock();
		mem = mem_cgroup_lookup(id);
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return mem;
}

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				       struct page *page,
				       unsigned int nr_pages,
				       struct page_cgroup *pc,
				       enum charge_type ctype)
{
	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		__mem_cgroup_cancel_charge(mem, nr_pages);
		return;
	}

	pc->mem_cgroup = mem;
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
	unlock_page_cgroup(pc);

	memcg_check_events(mem, page);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
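/*
 * Because tail pages are not marked as "used", set it. We're under
 * zone->lru_lock, 'splitting on pmd' and compound_lock.
 */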
void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
	unsigned long flags;

	if (mem_cgroup_disabled())
		return;

	move_lock_page_cgroup(head_pc, &flags);

	tail_pc->mem_cgroup = head_pc->mem_cgroup;
	smp_wmb();
	if (PageCgroupAcctLRU(head_pc)) {
		enum lru_list lru;
		struct mem_cgroup_per_zone *mz;

		lru = page_lru(head);
		mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	}
	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	move_unlock_page_cgroup(head_pc, &flags);
}
#endif

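/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 * @uncharge: whether we should call uncharge and css_put against @from.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_lru_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't do "charge" nor css_get to new cgroup. It should be
 * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is
 * true, this function does "uncharge" from old cgroup; otherwise the caller
 * should do the uncharge.
 */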
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct page_cgroup *pc,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to,
				   bool uncharge)
{
	unsigned long flags;
	int ret;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(page));

	ret = -EBUSY;
	if (nr_pages > 1 && !PageTransHuge(page))
		goto out;

	lock_page_cgroup(pc);

	ret = -EINVAL;
	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
		goto unlock;

	move_lock_page_cgroup(pc, &flags);

	if (PageCgroupFileMapped(pc)) {
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
	if (uncharge)
		__mem_cgroup_cancel_charge(from, nr_pages);

	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
	move_unlock_page_cgroup(pc, &flags);
	ret = 0;
unlock:
	unlock_page_cgroup(pc);
	memcg_check_events(to, page);
	memcg_check_events(from, page);
out:
	return ret;
}

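/*
 * move charges to its parent.
 */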
static int mem_cgroup_move_parent(struct page *page,
				  struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	unsigned int nr_pages;
	unsigned long uninitialized_var(flags);
	int ret;

	if (!pcg)
		return -EINVAL;

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	nr_pages = hpage_nr_pages(page);

	parent = mem_cgroup_from_cont(pcg);
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
	if (ret || !parent)
		goto put_back;

	if (nr_pages > 1)
		flags = compound_lock_irqsave(page);

	ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
	if (ret)
		__mem_cgroup_cancel_charge(parent, nr_pages);

	if (nr_pages > 1)
		compound_unlock_irqrestore(page, flags);
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}

static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype)
{
	struct mem_cgroup *mem = NULL;
	unsigned int nr_pages = 1;
	struct page_cgroup *pc;
	bool oom = true;
	int ret;

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON(!PageTransHuge(page));
		oom = false;
	}

	pc = lookup_page_cgroup(page);
	BUG_ON(!pc);

	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;

	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype);

static void
__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
					enum charge_type ctype)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);

	mem_cgroup_lru_del_before_commit(page);
	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
	mem_cgroup_lru_add_after_commit(page);
	return;
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;

	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page)) {
		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
		if (ret || !mem)
			return ret;

		__mem_cgroup_commit_charge_lrucare(page, mem,
						MEM_CGROUP_CHARGE_TYPE_CACHE);
		return ret;
	}

	if (PageSwapCache(page)) {
		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
		if (!ret)
			__mem_cgroup_commit_charge_swapin(page, mem,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);
	} else
		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);

	return ret;
}

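/*
 * During swap-in (try_charge -> commit or cancel), the page is locked.
 * When try_charge() succeeds, one reference to the memcg is taken without
 * a struct page_cgroup; it is consumed later by commit() or cancel().
 */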
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	int ret;

	*ptr = NULL;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;

	if (!PageSwapCache(page))
		goto charge_cur_mm;
	mem = try_get_mem_cgroup_from_page(page);
	if (!mem)
		goto charge_cur_mm;
	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
	css_put(&mem->css);
	return ret;
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
}

static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype)
{
	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	cgroup_exclude_rmdir(&ptr->css);

	__mem_cgroup_commit_charge_lrucare(page, ptr, ctype);

	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		unsigned short id;
		struct mem_cgroup *memcg;

		id = swap_cgroup_record(ent, 0);
		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);
		if (memcg) {
			if (!mem_cgroup_is_root(memcg))
				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_swap_statistics(memcg, false);
			mem_cgroup_put(memcg);
		}
		rcu_read_unlock();
	}

	cgroup_release_and_wakeup_rmdir(&ptr->css);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	__mem_cgroup_commit_charge_swapin(page, ptr,
					  MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	__mem_cgroup_cancel_charge(mem, 1);
}

static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
				   unsigned int nr_pages,
				   const enum charge_type ctype)
{
	struct memcg_batch_info *batch = NULL;
	bool uncharge_memsw = true;

	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
		uncharge_memsw = false;
	batch = &current->memcg_batch;
2642
2643
2644
2645
2646
2647 if (!batch->memcg)
2648 batch->memcg = mem;
2649
2650
2651
2652
2653
2654
2655
2656
2657 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2658 goto direct_uncharge;
2659
2660 if (nr_pages > 1)
2661 goto direct_uncharge;
2662
2663
2664
2665
2666
2667
2668 if (batch->memcg != mem)
2669 goto direct_uncharge;
2670
2671 batch->nr_pages++;
2672 if (uncharge_memsw)
2673 batch->memsw_nr_pages++;
2674 return;
2675direct_uncharge:
2676 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2677 if (uncharge_memsw)
2678 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2679 if (unlikely(batch->memcg != mem))
2680 memcg_oom_recover(mem);
2681 return;
2682}
2683
2684
2685
2686
2687static struct mem_cgroup *
2688__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2689{
2690 struct mem_cgroup *mem = NULL;
2691 unsigned int nr_pages = 1;
2692 struct page_cgroup *pc;
2693
2694 if (mem_cgroup_disabled())
2695 return NULL;
2696
2697 if (PageSwapCache(page))
2698 return NULL;
2699
2700 if (PageTransHuge(page)) {
2701 nr_pages <<= compound_order(page);
2702 VM_BUG_ON(!PageTransHuge(page));
2703 }
2704
2705
2706
2707 pc = lookup_page_cgroup(page);
2708 if (unlikely(!pc || !PageCgroupUsed(pc)))
2709 return NULL;
2710
2711 lock_page_cgroup(pc);
2712
2713 mem = pc->mem_cgroup;
2714
2715 if (!PageCgroupUsed(pc))
2716 goto unlock_out;
2717
2718 switch (ctype) {
2719 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2720 case MEM_CGROUP_CHARGE_TYPE_DROP:
2721
2722 if (page_mapped(page) || PageCgroupMigration(pc))
2723 goto unlock_out;
2724 break;
2725 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2726 if (!PageAnon(page)) {
2727 if (page->mapping && !page_is_file_cache(page))
2728 goto unlock_out;
2729 } else if (page_mapped(page))
2730 goto unlock_out;
2731 break;
2732 default:
2733 break;
2734 }
2735
2736 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2737
2738 ClearPageCgroupUsed(pc);
2739
2740
2741
2742
2743
2744
2745
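 /*
 * pc->mem_cgroup is deliberately left intact: it may still be
 * consulted while the page sits on an LRU list. Clearing the
 * USED bit under lock_page_cgroup() is what commits the
 * uncharge.
 */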
2746 unlock_page_cgroup(pc);
2747
2748
2749
2750
2751 memcg_check_events(mem, page);
2752 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2753 mem_cgroup_swap_statistics(mem, true);
2754 mem_cgroup_get(mem);
2755 }
2756 if (!mem_cgroup_is_root(mem))
2757 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2758
2759 return mem;
2760
2761unlock_out:
2762 unlock_page_cgroup(pc);
2763 return NULL;
2764}
2765
2766void mem_cgroup_uncharge_page(struct page *page)
2767{
2768
2769 if (page_mapped(page))
2770 return;
2771 if (page->mapping && !PageAnon(page))
2772 return;
2773 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2774}
2775
2776void mem_cgroup_uncharge_cache_page(struct page *page)
2777{
2778 VM_BUG_ON(page_mapped(page));
2779 VM_BUG_ON(page->mapping);
2780 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2781}
2782
2783
2784
2785
2786
2787
2788
2789
2790
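/*
 * Batched uncharging. Callers wrap a burst of uncharges (unmap,
 * truncate, invalidate) in uncharge_start()/uncharge_end() so that the
 * res_counter is touched once per batch instead of once per page. The
 * section may nest, hence do_batch is a counter rather than a flag.
 */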
2791void mem_cgroup_uncharge_start(void)
2792{
2793 current->memcg_batch.do_batch++;
2794
2795 if (current->memcg_batch.do_batch == 1) {
2796 current->memcg_batch.memcg = NULL;
2797 current->memcg_batch.nr_pages = 0;
2798 current->memcg_batch.memsw_nr_pages = 0;
2799 }
2800}
2801
2802void mem_cgroup_uncharge_end(void)
2803{
 struct memcg_batch_info *batch = &current->memcg_batch;
2805
2806 if (!batch->do_batch)
2807 return;
2808
2809 batch->do_batch--;
2810 if (batch->do_batch)
2811 return;
2812
2813 if (!batch->memcg)
2814 return;
2815
2816
2817
2818
2819 if (batch->nr_pages)
2820 res_counter_uncharge(&batch->memcg->res,
2821 batch->nr_pages * PAGE_SIZE);
2822 if (batch->memsw_nr_pages)
2823 res_counter_uncharge(&batch->memcg->memsw,
2824 batch->memsw_nr_pages * PAGE_SIZE);
2825 memcg_oom_recover(batch->memcg);
2826
2827 batch->memcg = NULL;
2828}
2829
2830#ifdef CONFIG_SWAP
2831
2832
2833
2834
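/*
 * Called after __delete_from_swap_cache() to drop the page's charge.
 * When the page really went out to swap, its memcg id is recorded in
 * swap_cgroup so that the memsw charge can be tracked per entry.
 */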
2835void
2836mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2837{
2838 struct mem_cgroup *memcg;
2839 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2840
2841 if (!swapout)
2842 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2843
2844 memcg = __mem_cgroup_uncharge_common(page, ctype);
2845
2846
2847
2848
2849
2850 if (do_swap_account && swapout && memcg)
2851 swap_cgroup_record(ent, css_id(&memcg->css));
2852}
2853#endif
2854
2855#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2856
2857
2858
2859
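/*
 * Called from swap_entry_free(): clear the swap_cgroup record and
 * uncharge memsw from the memcg it pointed to.
 */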
2860void mem_cgroup_uncharge_swap(swp_entry_t ent)
2861{
2862 struct mem_cgroup *memcg;
2863 unsigned short id;
2864
2865 if (!do_swap_account)
2866 return;
2867
2868 id = swap_cgroup_record(ent, 0);
2869 rcu_read_lock();
2870 memcg = mem_cgroup_lookup(id);
2871 if (memcg) {
2872
2873
2874
2875
2876 if (!mem_cgroup_is_root(memcg))
2877 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2878 mem_cgroup_swap_statistics(memcg, false);
2879 mem_cgroup_put(memcg);
2880 }
2881 rcu_read_unlock();
2882}
2883
2898
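/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 * @need_fixup: whether we should fixup res_counters and refcounts.
 *
 * It succeeds only when the swap_cgroup's record for this entry is the
 * same as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, i.e. called res_counter_charge()
 * for this swap entry beforehand.
 */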
2899static int mem_cgroup_move_swap_account(swp_entry_t entry,
2900 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2901{
2902 unsigned short old_id, new_id;
2903
2904 old_id = css_id(&from->css);
2905 new_id = css_id(&to->css);
2906
2907 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2908 mem_cgroup_swap_statistics(from, false);
2909 mem_cgroup_swap_statistics(to, true);
2910
2911
2912
2913
2914
2915
2916
2917
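 /*
 * Take a reference on @to so it cannot disappear while the
 * swap record points at it; the matching put happens when
 * the entry is freed or the move is rolled back.
 */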
2918 mem_cgroup_get(to);
2919 if (need_fixup) {
2920 if (!mem_cgroup_is_root(from))
2921 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2922 mem_cgroup_put(from);
2923
2924
2925
2926
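 /*
 * The charge to @to was made against both res and memsw,
 * but the entry keeps only its memsw charge, so drop
 * to->res here.
 */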
2927 if (!mem_cgroup_is_root(to))
2928 res_counter_uncharge(&to->res, PAGE_SIZE);
2929 }
2930 return 0;
2931 }
2932 return -EINVAL;
2933}
2934#else
2935static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2936 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2937{
2938 return -EINVAL;
2939}
2940#endif
2941
2942
2943
2944
2945
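/*
 * Before migration starts, charge the new page against the old page's
 * memcg so the charge cannot be lost while both pages exist.
 */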
2946int mem_cgroup_prepare_migration(struct page *page,
2947 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2948{
2949 struct mem_cgroup *mem = NULL;
2950 struct page_cgroup *pc;
2951 enum charge_type ctype;
2952 int ret = 0;
2953
2954 *ptr = NULL;
2955
2956 VM_BUG_ON(PageTransHuge(page));
2957 if (mem_cgroup_disabled())
2958 return 0;
2959
2960 pc = lookup_page_cgroup(page);
2961 lock_page_cgroup(pc);
2962 if (PageCgroupUsed(pc)) {
2963 mem = pc->mem_cgroup;
2964 css_get(&mem->css);
2993
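 /*
 * An anonymous page may become fully unmapped during
 * migration, which would normally uncharge it even though
 * migration can still fail and the old page be reused. Set
 * PCG_MIGRATION to hold the charge until end_migration()
 * decides which page survives. File pages keep their charge
 * via page->mapping, so only anon pages need the flag.
 */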
2994 if (PageAnon(page))
2995 SetPageCgroupMigration(pc);
2996 }
2997 unlock_page_cgroup(pc);
2998
2999
3000
3001
3002 if (!mem)
3003 return 0;
3004
3005 *ptr = mem;
3006 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3007 css_put(&mem->css);
3008 if (ret || *ptr == NULL) {
3009 if (PageAnon(page)) {
3010 lock_page_cgroup(pc);
3011 ClearPageCgroupMigration(pc);
3012 unlock_page_cgroup(pc);
3013
3014
3015
3016 mem_cgroup_uncharge_page(page);
3017 }
3018 return -ENOMEM;
3019 }
3020
3021
3022
3023
3024
3025
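 /*
 * Commit the precharge to the new page before it is used or
 * mapped; PCG_MIGRATION keeps racing uncharges away. The
 * charge type follows the old page's role.
 */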
3026 pc = lookup_page_cgroup(newpage);
3027 if (PageAnon(page))
3028 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3029 else if (page_is_file_cache(page))
3030 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3031 else
3032 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3033 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3034 return ret;
3035}
3036
3037
3038void mem_cgroup_end_migration(struct mem_cgroup *mem,
3039 struct page *oldpage, struct page *newpage, bool migration_ok)
3040{
3041 struct page *used, *unused;
3042 struct page_cgroup *pc;
3043
3044 if (!mem)
3045 return;
3046
3047 cgroup_exclude_rmdir(&mem->css);
3048 if (!migration_ok) {
3049 used = oldpage;
3050 unused = newpage;
3051 } else {
3052 used = newpage;
3053 unused = oldpage;
3054 }
3055
3056
3057
3058
3059
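 /*
 * Uncharging was blocked by PCG_MIGRATION while the pages were
 * in flux; clear the flag before dropping the unused page.
 */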
3060 pc = lookup_page_cgroup(oldpage);
3061 lock_page_cgroup(pc);
3062 ClearPageCgroupMigration(pc);
3063 unlock_page_cgroup(pc);
3064
3065 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3066
3067
3068
3069
3070
3071
3072
3073
3074
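 /*
 * A file page is uncharged through its mapping when freed, but
 * an anon "used" page whose mapcount already reached zero is
 * still holding its charge because of PCG_MIGRATION, so
 * uncharge it here.
 */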
3075 if (PageAnon(used))
3076 mem_cgroup_uncharge_page(used);
3077
3078
3079
3080
3081
3082
3083 cgroup_release_and_wakeup_rmdir(&mem->css);
3084}
3085
3086
3087
3088
3089
3090
3091
3092
3093
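/*
 * Fallback for a failed shmem swap-in charge: try_charge_swapin()
 * retries the charge against the correct (hierarchical) memcg and runs
 * its reclaim/OOM bookkeeping; on success the charge is cancelled again
 * because only the reclaim side effect is wanted here.
 */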
3094int mem_cgroup_shmem_charge_fallback(struct page *page,
3095 struct mm_struct *mm,
3096 gfp_t gfp_mask)
3097{
3098 struct mem_cgroup *mem;
3099 int ret;
3100
3101 if (mem_cgroup_disabled())
3102 return 0;
3103
3104 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3105 if (!ret)
3106 mem_cgroup_cancel_charge_swapin(mem);
3107
3108 return ret;
3109}
3110
3111#ifdef CONFIG_DEBUG_VM
3112static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3113{
3114 struct page_cgroup *pc;
3115
3116 pc = lookup_page_cgroup(page);
3117 if (likely(pc) && PageCgroupUsed(pc))
3118 return pc;
3119 return NULL;
3120}
3121
3122bool mem_cgroup_bad_page_check(struct page *page)
3123{
3124 if (mem_cgroup_disabled())
3125 return false;
3126
3127 return lookup_page_cgroup_used(page) != NULL;
3128}
3129
3130void mem_cgroup_print_bad_page(struct page *page)
3131{
3132 struct page_cgroup *pc;
3133
3134 pc = lookup_page_cgroup_used(page);
3135 if (pc) {
3136 int ret = -1;
3137 char *path;
3138
3139 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3140 pc, pc->flags, pc->mem_cgroup);
3141
3142 path = kmalloc(PATH_MAX, GFP_KERNEL);
3143 if (path) {
3144 rcu_read_lock();
3145 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3146 path, PATH_MAX);
3147 rcu_read_unlock();
3148 }
3149
3150 printk(KERN_CONT "(%s)\n",
3151 (ret < 0) ? "cannot get the path" : path);
3152 kfree(path);
3153 }
3154}
3155#endif
3156
3157static DEFINE_MUTEX(set_limit_mutex);
3158
3159static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3160 unsigned long long val)
3161{
3162 int retry_count;
3163 u64 memswlimit, memlimit;
3164 int ret = 0;
3165 int children = mem_cgroup_count_children(memcg);
3166 u64 curusage, oldusage;
3167 int enlarge;
3168
3169
3170
3171
3172
3173
3174 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3175
3176 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3177
3178 enlarge = 0;
3179 while (retry_count) {
3180 if (signal_pending(current)) {
3181 ret = -EINTR;
3182 break;
3183 }
3184
3185
3186
3187
3188
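 /*
 * The memsw limit must never drop below the mem limit; check
 * the pair under set_limit_mutex so concurrent writers of the
 * two limits stay consistent.
 */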
3189 mutex_lock(&set_limit_mutex);
3190 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3191 if (memswlimit < val) {
3192 ret = -EINVAL;
3193 mutex_unlock(&set_limit_mutex);
3194 break;
3195 }
3196
3197 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3198 if (memlimit < val)
3199 enlarge = 1;
3200
3201 ret = res_counter_set_limit(&memcg->res, val);
3202 if (!ret) {
3203 if (memswlimit == val)
3204 memcg->memsw_is_minimum = true;
3205 else
3206 memcg->memsw_is_minimum = false;
3207 }
3208 mutex_unlock(&set_limit_mutex);
3209
3210 if (!ret)
3211 break;
3212
3213 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3214 MEM_CGROUP_RECLAIM_SHRINK);
3215 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3216
3217 if (curusage >= oldusage)
3218 retry_count--;
3219 else
3220 oldusage = curusage;
3221 }
3222 if (!ret && enlarge)
3223 memcg_oom_recover(memcg);
3224
3225 return ret;
3226}
3227
3228static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3229 unsigned long long val)
3230{
3231 int retry_count;
3232 u64 memlimit, memswlimit, oldusage, curusage;
3233 int children = mem_cgroup_count_children(memcg);
3234 int ret = -EBUSY;
3235 int enlarge = 0;
3236
3237
3238 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3239 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3240 while (retry_count) {
3241 if (signal_pending(current)) {
3242 ret = -EINTR;
3243 break;
3244 }
3245
3246
3247
3248
3249
3250 mutex_lock(&set_limit_mutex);
3251 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3252 if (memlimit > val) {
3253 ret = -EINVAL;
3254 mutex_unlock(&set_limit_mutex);
3255 break;
3256 }
3257 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3258 if (memswlimit < val)
3259 enlarge = 1;
3260 ret = res_counter_set_limit(&memcg->memsw, val);
3261 if (!ret) {
3262 if (memlimit == val)
3263 memcg->memsw_is_minimum = true;
3264 else
3265 memcg->memsw_is_minimum = false;
3266 }
3267 mutex_unlock(&set_limit_mutex);
3268
3269 if (!ret)
3270 break;
3271
3272 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3273 MEM_CGROUP_RECLAIM_NOSWAP |
3274 MEM_CGROUP_RECLAIM_SHRINK);
3275 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3276
3277 if (curusage >= oldusage)
3278 retry_count--;
3279 else
3280 oldusage = curusage;
3281 }
3282 if (!ret && enlarge)
3283 memcg_oom_recover(memcg);
3284 return ret;
3285}
3286
3287unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3288 gfp_t gfp_mask)
3289{
3290 unsigned long nr_reclaimed = 0;
3291 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3292 unsigned long reclaimed;
3293 int loop = 0;
3294 struct mem_cgroup_tree_per_zone *mctz;
3295 unsigned long long excess;
3296
3297 if (order > 0)
3298 return 0;
3299
3300 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3301
3302
3303
3304
3305
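 /*
 * This loop can run for a while, especially when many memcgs
 * sit continuously above their soft limit.
 */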
3306 do {
3307 if (next_mz)
3308 mz = next_mz;
3309 else
3310 mz = mem_cgroup_largest_soft_limit_node(mctz);
3311 if (!mz)
3312 break;
3313
3314 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3315 gfp_mask,
3316 MEM_CGROUP_RECLAIM_SOFT);
3317 nr_reclaimed += reclaimed;
3318 spin_lock(&mctz->lock);
3319
3320
3321
3322
3323
3324 next_mz = NULL;
3325 if (!reclaimed) {
3326 do {
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338 next_mz =
3339 __mem_cgroup_largest_soft_limit_node(mctz);
3340 if (next_mz == mz) {
3341 css_put(&next_mz->mem->css);
3342 next_mz = NULL;
3343 } else
3344 break;
3345 } while (1);
3346 }
3347 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3348 excess = res_counter_soft_limit_excess(&mz->mem->res);
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3359 spin_unlock(&mctz->lock);
3360 css_put(&mz->mem->css);
3361 loop++;
3362
3363
3364
3365
3366
3367 if (!nr_reclaimed &&
3368 (next_mz == NULL ||
3369 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3370 break;
3371 } while (!nr_reclaimed);
3372 if (next_mz)
3373 css_put(&next_mz->mem->css);
3374 return nr_reclaimed;
3375}
3376
3377
3378
3379
3380
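/*
 * Drain one per-zone LRU list by moving each charged page to the
 * parent memcg. Returns -EBUSY if pages remain after the pass.
 */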
3381static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3382 int node, int zid, enum lru_list lru)
3383{
3384 struct zone *zone;
3385 struct mem_cgroup_per_zone *mz;
3386 struct page_cgroup *pc, *busy;
3387 unsigned long flags, loop;
3388 struct list_head *list;
3389 int ret = 0;
3390
3391 zone = &NODE_DATA(node)->node_zones[zid];
3392 mz = mem_cgroup_zoneinfo(mem, node, zid);
3393 list = &mz->lists[lru];
3394
3395 loop = MEM_CGROUP_ZSTAT(mz, lru);
3396
3397 loop += 256;
3398 busy = NULL;
3399 while (loop--) {
3400 struct page *page;
3401
3402 ret = 0;
3403 spin_lock_irqsave(&zone->lru_lock, flags);
3404 if (list_empty(list)) {
3405 spin_unlock_irqrestore(&zone->lru_lock, flags);
3406 break;
3407 }
3408 pc = list_entry(list->prev, struct page_cgroup, lru);
3409 if (busy == pc) {
3410 list_move(&pc->lru, list);
3411 busy = NULL;
3412 spin_unlock_irqrestore(&zone->lru_lock, flags);
3413 continue;
3414 }
3415 spin_unlock_irqrestore(&zone->lru_lock, flags);
3416
3417 page = lookup_cgroup_page(pc);
3418
3419 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3420 if (ret == -ENOMEM)
3421 break;
3422
3423 if (ret == -EBUSY || ret == -EINVAL) {
3424
3425 busy = pc;
3426 cond_resched();
3427 } else
3428 busy = NULL;
3429 }
3430
3431 if (!ret && !list_empty(list))
3432 return -EBUSY;
3433 return ret;
3434}
3435
3436
3437
3438
3439
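/*
 * Make the memcg's charge zero when it holds no tasks: either move all
 * charges to the parent or, for force_empty writes, reclaim everything.
 * This is what makes rmdir() of the group possible.
 */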
3440static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3441{
3442 int ret;
3443 int node, zid, shrink;
3444 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3445 struct cgroup *cgrp = mem->css.cgroup;
3446
3447 css_get(&mem->css);
3448
3449 shrink = 0;
3450
3451 if (free_all)
3452 goto try_to_free;
3453move_account:
3454 do {
3455 ret = -EBUSY;
3456 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3457 goto out;
3458 ret = -EINTR;
3459 if (signal_pending(current))
3460 goto out;
3461
3462 lru_add_drain_all();
3463 drain_all_stock_sync();
3464 ret = 0;
3465 mem_cgroup_start_move(mem);
3466 for_each_node_state(node, N_HIGH_MEMORY) {
3467 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3468 enum lru_list l;
3469 for_each_lru(l) {
3470 ret = mem_cgroup_force_empty_list(mem,
3471 node, zid, l);
3472 if (ret)
3473 break;
3474 }
3475 }
3476 if (ret)
3477 break;
3478 }
3479 mem_cgroup_end_move(mem);
3480 memcg_oom_recover(mem);
3481
3482 if (ret == -ENOMEM)
3483 goto try_to_free;
3484 cond_resched();
3485
3486 } while (mem->res.usage > 0 || ret);
3487out:
3488 css_put(&mem->css);
3489 return ret;
3490
3491try_to_free:
3492
3493 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3494 ret = -EBUSY;
3495 goto out;
3496 }
3497
3498 lru_add_drain_all();
3499
3500 shrink = 1;
3501 while (nr_retries && mem->res.usage > 0) {
3502 int progress;
3503
3504 if (signal_pending(current)) {
3505 ret = -EINTR;
3506 goto out;
3507 }
3508 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3509 false, get_swappiness(mem));
3510 if (!progress) {
3511 nr_retries--;
3512
3513 congestion_wait(BLK_RW_ASYNC, HZ/10);
3514 }
3515
3516 }
3517 lru_add_drain();
3518
3519 goto move_account;
3520}
3521
3522int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3523{
3524 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3525}
3526
3527
3528static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3529{
3530 return mem_cgroup_from_cont(cont)->use_hierarchy;
3531}
3532
3533static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3534 u64 val)
3535{
3536 int retval = 0;
3537 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3538 struct cgroup *parent = cont->parent;
3539 struct mem_cgroup *parent_mem = NULL;
3540
3541 if (parent)
3542 parent_mem = mem_cgroup_from_cont(parent);
3543
3544 cgroup_lock();
3545
3546
3547
3548
3549
3550
3551
3552
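 /*
 * use_hierarchy may only be flipped while the parent does not
 * enforce hierarchy on us and we have no children yet;
 * otherwise the accounting of the existing tree would break.
 */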
3553 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3554 (val == 1 || val == 0)) {
3555 if (list_empty(&cont->children))
3556 mem->use_hierarchy = val;
3557 else
3558 retval = -EBUSY;
3559 } else
3560 retval = -EINVAL;
3561 cgroup_unlock();
3562
3563 return retval;
3564}
3565
3566
3567static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3568 enum mem_cgroup_stat_index idx)
3569{
3570 struct mem_cgroup *iter;
3571 long val = 0;
3572
3573
3574 for_each_mem_cgroup_tree(iter, mem)
3575 val += mem_cgroup_read_stat(iter, idx);
3576
3577 if (val < 0)
3578 val = 0;
3579 return val;
3580}
3581
3582static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3583{
3584 u64 val;
3585
3586 if (!mem_cgroup_is_root(mem)) {
3587 if (!swap)
3588 return res_counter_read_u64(&mem->res, RES_USAGE);
3589 else
3590 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3591 }
3592
3593 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3594 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3595
3596 if (swap)
3597 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3598
3599 return val << PAGE_SHIFT;
3600}
3601
3602static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3603{
3604 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3605 u64 val;
3606 int type, name;
3607
3608 type = MEMFILE_TYPE(cft->private);
3609 name = MEMFILE_ATTR(cft->private);
3610 switch (type) {
3611 case _MEM:
3612 if (name == RES_USAGE)
3613 val = mem_cgroup_usage(mem, false);
3614 else
3615 val = res_counter_read_u64(&mem->res, name);
3616 break;
3617 case _MEMSWAP:
3618 if (name == RES_USAGE)
3619 val = mem_cgroup_usage(mem, true);
3620 else
3621 val = res_counter_read_u64(&mem->memsw, name);
3622 break;
3623 default:
3624 BUG();
3625 break;
3626 }
3627 return val;
3628}
3629
3630
3631
3632
3633static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3634 const char *buffer)
3635{
3636 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3637 int type, name;
3638 unsigned long long val;
3639 int ret;
3640
3641 type = MEMFILE_TYPE(cft->private);
3642 name = MEMFILE_ATTR(cft->private);
3643 switch (name) {
3644 case RES_LIMIT:
3645 if (mem_cgroup_is_root(memcg)) {
3646 ret = -EINVAL;
3647 break;
3648 }
3649
3650 ret = res_counter_memparse_write_strategy(buffer, &val);
3651 if (ret)
3652 break;
3653 if (type == _MEM)
3654 ret = mem_cgroup_resize_limit(memcg, val);
3655 else
3656 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3657 break;
3658 case RES_SOFT_LIMIT:
3659 ret = res_counter_memparse_write_strategy(buffer, &val);
3660 if (ret)
3661 break;
3662
3663
3664
3665
3666
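 /*
 * Soft limits have no clear meaning for mem+swap, so they are
 * only supported on the plain memory counter.
 */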
3667 if (type == _MEM)
3668 ret = res_counter_set_soft_limit(&memcg->res, val);
3669 else
3670 ret = -EINVAL;
3671 break;
3672 default:
3673 ret = -EINVAL;
3674 break;
3675 }
3676 return ret;
3677}
3678
3679static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3680 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3681{
3682 struct cgroup *cgroup;
3683 unsigned long long min_limit, min_memsw_limit, tmp;
3684
3685 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3686 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3687 cgroup = memcg->css.cgroup;
3688 if (!memcg->use_hierarchy)
3689 goto out;
3690
3691 while (cgroup->parent) {
3692 cgroup = cgroup->parent;
3693 memcg = mem_cgroup_from_cont(cgroup);
3694 if (!memcg->use_hierarchy)
3695 break;
3696 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3697 min_limit = min(min_limit, tmp);
3698 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3699 min_memsw_limit = min(min_memsw_limit, tmp);
3700 }
3701out:
3702 *mem_limit = min_limit;
3703 *memsw_limit = min_memsw_limit;
3704 return;
3705}
3706
3707static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3708{
3709 struct mem_cgroup *mem;
3710 int type, name;
3711
3712 mem = mem_cgroup_from_cont(cont);
3713 type = MEMFILE_TYPE(event);
3714 name = MEMFILE_ATTR(event);
3715 switch (name) {
3716 case RES_MAX_USAGE:
3717 if (type == _MEM)
3718 res_counter_reset_max(&mem->res);
3719 else
3720 res_counter_reset_max(&mem->memsw);
3721 break;
3722 case RES_FAILCNT:
3723 if (type == _MEM)
3724 res_counter_reset_failcnt(&mem->res);
3725 else
3726 res_counter_reset_failcnt(&mem->memsw);
3727 break;
3728 }
3729
3730 return 0;
3731}
3732
3733static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3734 struct cftype *cft)
3735{
3736 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3737}
3738
3739#ifdef CONFIG_MMU
3740static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3741 struct cftype *cft, u64 val)
3742{
3743 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3744
3745 if (val >= (1 << NR_MOVE_TYPE))
3746 return -EINVAL;
3747
3748
3749
3750
3751
3752 cgroup_lock();
3753 mem->move_charge_at_immigrate = val;
3754 cgroup_unlock();
3755
3756 return 0;
3757}
3758#else
3759static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3760 struct cftype *cft, u64 val)
3761{
3762 return -ENOSYS;
3763}
3764#endif
3765
3766
3767
3768enum {
3769 MCS_CACHE,
3770 MCS_RSS,
3771 MCS_FILE_MAPPED,
3772 MCS_PGPGIN,
3773 MCS_PGPGOUT,
3774 MCS_SWAP,
3775 MCS_INACTIVE_ANON,
3776 MCS_ACTIVE_ANON,
3777 MCS_INACTIVE_FILE,
3778 MCS_ACTIVE_FILE,
3779 MCS_UNEVICTABLE,
3780 NR_MCS_STAT,
3781};
3782
3783struct mcs_total_stat {
3784 s64 stat[NR_MCS_STAT];
3785};
3786
3787struct {
3788 char *local_name;
3789 char *total_name;
3790} memcg_stat_strings[NR_MCS_STAT] = {
3791 {"cache", "total_cache"},
3792 {"rss", "total_rss"},
3793 {"mapped_file", "total_mapped_file"},
3794 {"pgpgin", "total_pgpgin"},
3795 {"pgpgout", "total_pgpgout"},
3796 {"swap", "total_swap"},
3797 {"inactive_anon", "total_inactive_anon"},
3798 {"active_anon", "total_active_anon"},
3799 {"inactive_file", "total_inactive_file"},
3800 {"active_file", "total_active_file"},
3801 {"unevictable", "total_unevictable"}
3802};
3803
3804
3805static void
3806mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3807{
3808 s64 val;
3809
3810
3811 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3812 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3813 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3814 s->stat[MCS_RSS] += val * PAGE_SIZE;
3815 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3816 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3817 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3818 s->stat[MCS_PGPGIN] += val;
3819 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3820 s->stat[MCS_PGPGOUT] += val;
3821 if (do_swap_account) {
3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3823 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3824 }
3825
3826
3827 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3828 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3829 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3830 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3831 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3832 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3833 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3834 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3835 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3836 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3837}
3838
3839static void
3840mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3841{
3842 struct mem_cgroup *iter;
3843
3844 for_each_mem_cgroup_tree(iter, mem)
3845 mem_cgroup_get_local_stat(iter, s);
3846}
3847
3848static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3849 struct cgroup_map_cb *cb)
3850{
3851 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3852 struct mcs_total_stat mystat;
3853 int i;
3854
3855 memset(&mystat, 0, sizeof(mystat));
3856 mem_cgroup_get_local_stat(mem_cont, &mystat);
3857
3858 for (i = 0; i < NR_MCS_STAT; i++) {
3859 if (i == MCS_SWAP && !do_swap_account)
3860 continue;
3861 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3862 }
3863
3864
3865 {
3866 unsigned long long limit, memsw_limit;
3867 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3868 cb->fill(cb, "hierarchical_memory_limit", limit);
3869 if (do_swap_account)
3870 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3871 }
3872
3873 memset(&mystat, 0, sizeof(mystat));
3874 mem_cgroup_get_total_stat(mem_cont, &mystat);
3875 for (i = 0; i < NR_MCS_STAT; i++) {
3876 if (i == MCS_SWAP && !do_swap_account)
3877 continue;
3878 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3879 }
3880
3881#ifdef CONFIG_DEBUG_VM
3882 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3883
3884 {
3885 int nid, zid;
3886 struct mem_cgroup_per_zone *mz;
3887 unsigned long recent_rotated[2] = {0, 0};
3888 unsigned long recent_scanned[2] = {0, 0};
3889
3890 for_each_online_node(nid)
3891 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3892 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3893
3894 recent_rotated[0] +=
3895 mz->reclaim_stat.recent_rotated[0];
3896 recent_rotated[1] +=
3897 mz->reclaim_stat.recent_rotated[1];
3898 recent_scanned[0] +=
3899 mz->reclaim_stat.recent_scanned[0];
3900 recent_scanned[1] +=
3901 mz->reclaim_stat.recent_scanned[1];
3902 }
3903 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3904 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3905 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3906 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3907 }
3908#endif
3909
3910 return 0;
3911}
3912
3913static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3914{
3915 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3916
3917 return get_swappiness(memcg);
3918}
3919
3920static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3921 u64 val)
3922{
3923 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3924 struct mem_cgroup *parent;
3925
3926 if (val > 100)
3927 return -EINVAL;
3928
3929 if (cgrp->parent == NULL)
3930 return -EINVAL;
3931
3932 parent = mem_cgroup_from_cont(cgrp->parent);
3933
3934 cgroup_lock();
3935
3936
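 /* swappiness cannot change once the group is part of a hierarchy */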
3937 if ((parent->use_hierarchy) ||
3938 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3939 cgroup_unlock();
3940 return -EINVAL;
3941 }
3942
3943 memcg->swappiness = val;
3944
3945 cgroup_unlock();
3946
3947 return 0;
3948}
3949
3950static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3951{
3952 struct mem_cgroup_threshold_ary *t;
3953 u64 usage;
3954 int i;
3955
3956 rcu_read_lock();
3957 if (!swap)
3958 t = rcu_dereference(memcg->thresholds.primary);
3959 else
3960 t = rcu_dereference(memcg->memsw_thresholds.primary);
3961
3962 if (!t)
3963 goto unlock;
3964
3965 usage = mem_cgroup_usage(memcg, swap);
3966
3967
3968
3969
3970
3971
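 /*
 * current_threshold points at the last threshold that was not
 * above the usage seen by the previous check; walk backwards
 * and then forwards from there, signalling every threshold
 * the usage has crossed since.
 */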
3972 i = t->current_threshold;
3973
3974
3975
3976
3977
3978
3979
3980 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3981 eventfd_signal(t->entries[i].eventfd, 1);
3982
3983
3984 i++;
3985
3986
3987
3988
3989
3990
3991
3992 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3993 eventfd_signal(t->entries[i].eventfd, 1);
3994
3995
3996 t->current_threshold = i - 1;
3997unlock:
3998 rcu_read_unlock();
3999}
4000
4001static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4002{
4003 while (memcg) {
4004 __mem_cgroup_threshold(memcg, false);
4005 if (do_swap_account)
4006 __mem_cgroup_threshold(memcg, true);
4007
4008 memcg = parent_mem_cgroup(memcg);
4009 }
4010}
4011
static int compare_thresholds(const void *a, const void *b)
{
 const struct mem_cgroup_threshold *_a = a;
 const struct mem_cgroup_threshold *_b = b;

 /* avoid u64-to-int truncation: compare rather than subtract */
 if (_a->threshold > _b->threshold)
 return 1;
 if (_a->threshold < _b->threshold)
 return -1;
 return 0;
}
4019
4020static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4021{
4022 struct mem_cgroup_eventfd_list *ev;
4023
4024 list_for_each_entry(ev, &mem->oom_notify, list)
4025 eventfd_signal(ev->eventfd, 1);
4026 return 0;
4027}
4028
4029static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4030{
4031 struct mem_cgroup *iter;
4032
4033 for_each_mem_cgroup_tree(iter, mem)
4034 mem_cgroup_oom_notify_cb(iter);
4035}
4036
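/*
 * Arm a usage threshold: userspace writes "<event_fd> <fd of
 * memory[.memsw].usage_in_bytes> <threshold>" to cgroup.event_control
 * and the eventfd is signalled whenever usage crosses the threshold in
 * either direction.
 */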
4037static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4038 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4039{
4040 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4041 struct mem_cgroup_thresholds *thresholds;
4042 struct mem_cgroup_threshold_ary *new;
4043 int type = MEMFILE_TYPE(cft->private);
4044 u64 threshold, usage;
4045 int i, size, ret;
4046
4047 ret = res_counter_memparse_write_strategy(args, &threshold);
4048 if (ret)
4049 return ret;
4050
4051 mutex_lock(&memcg->thresholds_lock);
4052
4053 if (type == _MEM)
4054 thresholds = &memcg->thresholds;
4055 else if (type == _MEMSWAP)
4056 thresholds = &memcg->memsw_thresholds;
4057 else
4058 BUG();
4059
4060 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4061
4062
4063 if (thresholds->primary)
4064 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4065
4066 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4067
4068
4069 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4070 GFP_KERNEL);
4071 if (!new) {
4072 ret = -ENOMEM;
4073 goto unlock;
4074 }
4075 new->size = size;
4076
4077
4078 if (thresholds->primary) {
4079 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4080 sizeof(struct mem_cgroup_threshold));
4081 }
4082
4083
4084 new->entries[size - 1].eventfd = eventfd;
4085 new->entries[size - 1].threshold = threshold;
4086
4087
4088 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4089 compare_thresholds, NULL);
4090
4091
4092 new->current_threshold = -1;
4093 for (i = 0; i < size; i++) {
4094 if (new->entries[i].threshold < usage) {
4095
4096
4097
4098
4099
4100 ++new->current_threshold;
4101 }
4102 }
4103
4104
4105 kfree(thresholds->spare);
4106 thresholds->spare = thresholds->primary;
4107
4108 rcu_assign_pointer(thresholds->primary, new);
4109
4110
4111 synchronize_rcu();
4112
4113unlock:
4114 mutex_unlock(&memcg->thresholds_lock);
4115
4116 return ret;
4117}
4118
4119static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4120 struct cftype *cft, struct eventfd_ctx *eventfd)
4121{
4122 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4123 struct mem_cgroup_thresholds *thresholds;
4124 struct mem_cgroup_threshold_ary *new;
4125 int type = MEMFILE_TYPE(cft->private);
4126 u64 usage;
4127 int i, j, size;
4128
4129 mutex_lock(&memcg->thresholds_lock);
4130 if (type == _MEM)
4131 thresholds = &memcg->thresholds;
4132 else if (type == _MEMSWAP)
4133 thresholds = &memcg->memsw_thresholds;
4134 else
4135 BUG();
4136
4137
4138
4139
4140
4141 BUG_ON(!thresholds);
4142
4143 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4144
4145
4146 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4147
4148
4149 size = 0;
4150 for (i = 0; i < thresholds->primary->size; i++) {
4151 if (thresholds->primary->entries[i].eventfd != eventfd)
4152 size++;
4153 }
4154
4155 new = thresholds->spare;
4156
4157
4158 if (!size) {
4159 kfree(new);
4160 new = NULL;
4161 goto swap_buffers;
4162 }
4163
4164 new->size = size;
4165
4166
4167 new->current_threshold = -1;
4168 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4169 if (thresholds->primary->entries[i].eventfd == eventfd)
4170 continue;
4171
4172 new->entries[j] = thresholds->primary->entries[i];
4173 if (new->entries[j].threshold < usage) {
4174
4175
4176
4177
4178
4179 ++new->current_threshold;
4180 }
4181 j++;
4182 }
4183
4184swap_buffers:
4185
4186 thresholds->spare = thresholds->primary;
4187 rcu_assign_pointer(thresholds->primary, new);
4188
4189
4190 synchronize_rcu();
4191
4192 mutex_unlock(&memcg->thresholds_lock);
4193}
4194
4195static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4196 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4197{
4198 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4199 struct mem_cgroup_eventfd_list *event;
4200 int type = MEMFILE_TYPE(cft->private);
4201
4202 BUG_ON(type != _OOM_TYPE);
4203 event = kmalloc(sizeof(*event), GFP_KERNEL);
4204 if (!event)
4205 return -ENOMEM;
4206
4207 mutex_lock(&memcg_oom_mutex);
4208
4209 event->eventfd = eventfd;
4210 list_add(&event->list, &memcg->oom_notify);
4211
4212
4213 if (atomic_read(&memcg->oom_lock))
4214 eventfd_signal(eventfd, 1);
4215 mutex_unlock(&memcg_oom_mutex);
4216
4217 return 0;
4218}
4219
4220static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4221 struct cftype *cft, struct eventfd_ctx *eventfd)
4222{
4223 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4224 struct mem_cgroup_eventfd_list *ev, *tmp;
4225 int type = MEMFILE_TYPE(cft->private);
4226
4227 BUG_ON(type != _OOM_TYPE);
4228
4229 mutex_lock(&memcg_oom_mutex);
4230
4231 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4232 if (ev->eventfd == eventfd) {
4233 list_del(&ev->list);
4234 kfree(ev);
4235 }
4236 }
4237
4238 mutex_unlock(&memcg_oom_mutex);
4239}
4240
4241static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4242 struct cftype *cft, struct cgroup_map_cb *cb)
4243{
4244 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4245
4246 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4247
4248 if (atomic_read(&mem->oom_lock))
4249 cb->fill(cb, "under_oom", 1);
4250 else
4251 cb->fill(cb, "under_oom", 0);
4252 return 0;
4253}
4254
4255static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4256 struct cftype *cft, u64 val)
4257{
4258 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4259 struct mem_cgroup *parent;
4260
4261
4262 if (!cgrp->parent || !((val == 0) || (val == 1)))
4263 return -EINVAL;
4264
4265 parent = mem_cgroup_from_cont(cgrp->parent);
4266
4267 cgroup_lock();
4268
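 /* oom_kill_disable follows the same hierarchy rule as swappiness */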
4269 if ((parent->use_hierarchy) ||
4270 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4271 cgroup_unlock();
4272 return -EINVAL;
4273 }
4274 mem->oom_kill_disable = val;
4275 if (!val)
4276 memcg_oom_recover(mem);
4277 cgroup_unlock();
4278 return 0;
4279}
4280
4281static struct cftype mem_cgroup_files[] = {
4282 {
4283 .name = "usage_in_bytes",
4284 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4285 .read_u64 = mem_cgroup_read,
4286 .register_event = mem_cgroup_usage_register_event,
4287 .unregister_event = mem_cgroup_usage_unregister_event,
4288 },
4289 {
4290 .name = "max_usage_in_bytes",
4291 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4292 .trigger = mem_cgroup_reset,
4293 .read_u64 = mem_cgroup_read,
4294 },
4295 {
4296 .name = "limit_in_bytes",
4297 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4298 .write_string = mem_cgroup_write,
4299 .read_u64 = mem_cgroup_read,
4300 },
4301 {
4302 .name = "soft_limit_in_bytes",
4303 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4304 .write_string = mem_cgroup_write,
4305 .read_u64 = mem_cgroup_read,
4306 },
4307 {
4308 .name = "failcnt",
4309 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4310 .trigger = mem_cgroup_reset,
4311 .read_u64 = mem_cgroup_read,
4312 },
4313 {
4314 .name = "stat",
4315 .read_map = mem_control_stat_show,
4316 },
4317 {
4318 .name = "force_empty",
4319 .trigger = mem_cgroup_force_empty_write,
4320 },
4321 {
4322 .name = "use_hierarchy",
4323 .write_u64 = mem_cgroup_hierarchy_write,
4324 .read_u64 = mem_cgroup_hierarchy_read,
4325 },
4326 {
4327 .name = "swappiness",
4328 .read_u64 = mem_cgroup_swappiness_read,
4329 .write_u64 = mem_cgroup_swappiness_write,
4330 },
4331 {
4332 .name = "move_charge_at_immigrate",
4333 .read_u64 = mem_cgroup_move_charge_read,
4334 .write_u64 = mem_cgroup_move_charge_write,
4335 },
4336 {
4337 .name = "oom_control",
4338 .read_map = mem_cgroup_oom_control_read,
4339 .write_u64 = mem_cgroup_oom_control_write,
4340 .register_event = mem_cgroup_oom_register_event,
4341 .unregister_event = mem_cgroup_oom_unregister_event,
4342 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4343 },
4344};
4345
4346#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4347static struct cftype memsw_cgroup_files[] = {
4348 {
4349 .name = "memsw.usage_in_bytes",
4350 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4351 .read_u64 = mem_cgroup_read,
4352 .register_event = mem_cgroup_usage_register_event,
4353 .unregister_event = mem_cgroup_usage_unregister_event,
4354 },
4355 {
4356 .name = "memsw.max_usage_in_bytes",
4357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4358 .trigger = mem_cgroup_reset,
4359 .read_u64 = mem_cgroup_read,
4360 },
4361 {
4362 .name = "memsw.limit_in_bytes",
4363 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4364 .write_string = mem_cgroup_write,
4365 .read_u64 = mem_cgroup_read,
4366 },
4367 {
4368 .name = "memsw.failcnt",
4369 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4370 .trigger = mem_cgroup_reset,
4371 .read_u64 = mem_cgroup_read,
4372 },
4373};
4374
4375static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4376{
4377 if (!do_swap_account)
4378 return 0;
4379 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4380 ARRAY_SIZE(memsw_cgroup_files));
}
4382#else
4383static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4384{
4385 return 0;
4386}
4387#endif
4388
4389static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4390{
4391 struct mem_cgroup_per_node *pn;
4392 struct mem_cgroup_per_zone *mz;
4393 enum lru_list l;
4394 int zone, tmp = node;
4395
4396
4397
4398
4399
4400
4401
4402
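 /*
 * This runs against every possible node, but kzalloc_node()
 * must not be pointed at an offline or memoryless node; fall
 * back to "no node hint" (-1) in that case.
 */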
4403 if (!node_state(node, N_NORMAL_MEMORY))
4404 tmp = -1;
4405 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4406 if (!pn)
4407 return 1;
4408
4409 mem->info.nodeinfo[node] = pn;
4410 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4411 mz = &pn->zoneinfo[zone];
4412 for_each_lru(l)
4413 INIT_LIST_HEAD(&mz->lists[l]);
4414 mz->usage_in_excess = 0;
4415 mz->on_tree = false;
4416 mz->mem = mem;
4417 }
4418 return 0;
4419}
4420
4421static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4422{
4423 kfree(mem->info.nodeinfo[node]);
4424}
4425
4426static struct mem_cgroup *mem_cgroup_alloc(void)
4427{
4428 struct mem_cgroup *mem;
4429 int size = sizeof(struct mem_cgroup);
4430
4431
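 /* struct mem_cgroup can be huge when MAX_NUMNODES is large */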
4432 if (size < PAGE_SIZE)
4433 mem = kzalloc(size, GFP_KERNEL);
4434 else
4435 mem = vzalloc(size);
4436
4437 if (!mem)
4438 return NULL;
4439
4440 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4441 if (!mem->stat)
4442 goto out_free;
4443 spin_lock_init(&mem->pcp_counter_lock);
4444 return mem;
4445
4446out_free:
4447 if (size < PAGE_SIZE)
4448 kfree(mem);
4449 else
4450 vfree(mem);
4451 return NULL;
4452}
4453
4464
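/*
 * References from swap_cgroup can outlive the cgroup directory, and
 * scanning every swap entry at removal time would be too costly.
 * Instead the mem_cgroup is refcounted and only freed once the count
 * (including outstanding swap records) drops to zero; rmdir() itself
 * succeeds regardless of those references.
 */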
4465static void __mem_cgroup_free(struct mem_cgroup *mem)
4466{
4467 int node;
4468
4469 mem_cgroup_remove_from_trees(mem);
4470 free_css_id(&mem_cgroup_subsys, &mem->css);
4471
4472 for_each_node_state(node, N_POSSIBLE)
4473 free_mem_cgroup_per_zone_info(mem, node);
4474
4475 free_percpu(mem->stat);
4476 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4477 kfree(mem);
4478 else
4479 vfree(mem);
4480}
4481
4482static void mem_cgroup_get(struct mem_cgroup *mem)
4483{
4484 atomic_inc(&mem->refcnt);
4485}
4486
4487static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4488{
4489 if (atomic_sub_and_test(count, &mem->refcnt)) {
4490 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4491 __mem_cgroup_free(mem);
4492 if (parent)
4493 mem_cgroup_put(parent);
4494 }
4495}
4496
4497static void mem_cgroup_put(struct mem_cgroup *mem)
4498{
4499 __mem_cgroup_put(mem, 1);
4500}
4501
4502
4503
4504
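/* Returns the hierarchical parent, or NULL when hierarchy is not used */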
4505static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4506{
4507 if (!mem->res.parent)
4508 return NULL;
4509 return mem_cgroup_from_res_counter(mem->res.parent, res);
4510}
4511
4512#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4513static void __init enable_swap_cgroup(void)
4514{
4515 if (!mem_cgroup_disabled() && really_do_swap_account)
4516 do_swap_account = 1;
4517}
4518#else
4519static void __init enable_swap_cgroup(void)
4520{
4521}
4522#endif
4523
4524static int mem_cgroup_soft_limit_tree_init(void)
4525{
4526 struct mem_cgroup_tree_per_node *rtpn;
4527 struct mem_cgroup_tree_per_zone *rtpz;
4528 int tmp, node, zone;
4529
4530 for_each_node_state(node, N_POSSIBLE) {
4531 tmp = node;
4532 if (!node_state(node, N_NORMAL_MEMORY))
4533 tmp = -1;
4534 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4535 if (!rtpn)
4536 return 1;
4537
4538 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4539
4540 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4541 rtpz = &rtpn->rb_tree_per_zone[zone];
4542 rtpz->rb_root = RB_ROOT;
4543 spin_lock_init(&rtpz->lock);
4544 }
4545 }
4546 return 0;
4547}
4548
4549static struct cgroup_subsys_state * __ref
4550mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4551{
4552 struct mem_cgroup *mem, *parent;
4553 long error = -ENOMEM;
4554 int node;
4555
4556 mem = mem_cgroup_alloc();
4557 if (!mem)
4558 return ERR_PTR(error);
4559
4560 for_each_node_state(node, N_POSSIBLE)
4561 if (alloc_mem_cgroup_per_zone_info(mem, node))
4562 goto free_out;
4563
4564
4565 if (cont->parent == NULL) {
4566 int cpu;
4567 enable_swap_cgroup();
4568 parent = NULL;
4569 root_mem_cgroup = mem;
4570 if (mem_cgroup_soft_limit_tree_init())
4571 goto free_out;
4572 for_each_possible_cpu(cpu) {
4573 struct memcg_stock_pcp *stock =
4574 &per_cpu(memcg_stock, cpu);
4575 INIT_WORK(&stock->work, drain_local_stock);
4576 }
4577 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4578 } else {
4579 parent = mem_cgroup_from_cont(cont->parent);
4580 mem->use_hierarchy = parent->use_hierarchy;
4581 mem->oom_kill_disable = parent->oom_kill_disable;
4582 }
4583
4584 if (parent && parent->use_hierarchy) {
4585 res_counter_init(&mem->res, &parent->res);
4586 res_counter_init(&mem->memsw, &parent->memsw);
4587
4588
4589
4590
4591
4592
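 /*
 * Pin the parent: child res_counters chain up to it, so it
 * must outlive us. The reference is dropped when this group
 * is finally freed (see __mem_cgroup_put()).
 */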
4593 mem_cgroup_get(parent);
4594 } else {
4595 res_counter_init(&mem->res, NULL);
4596 res_counter_init(&mem->memsw, NULL);
4597 }
4598 mem->last_scanned_child = 0;
4599 INIT_LIST_HEAD(&mem->oom_notify);
4600
4601 if (parent)
4602 mem->swappiness = get_swappiness(parent);
4603 atomic_set(&mem->refcnt, 1);
4604 mem->move_charge_at_immigrate = 0;
4605 mutex_init(&mem->thresholds_lock);
4606 return &mem->css;
4607free_out:
4608 __mem_cgroup_free(mem);
4609 root_mem_cgroup = NULL;
4610 return ERR_PTR(error);
4611}
4612
4613static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4614 struct cgroup *cont)
4615{
4616 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4617
4618 return mem_cgroup_force_empty(mem, false);
4619}
4620
4621static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4622 struct cgroup *cont)
4623{
4624 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4625
4626 mem_cgroup_put(mem);
4627}
4628
4629static int mem_cgroup_populate(struct cgroup_subsys *ss,
4630 struct cgroup *cont)
4631{
4632 int ret;
4633
4634 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4635 ARRAY_SIZE(mem_cgroup_files));
4636
4637 if (!ret)
4638 ret = register_memsw_files(cont, ss);
4639 return ret;
4640}
4641
4642#ifdef CONFIG_MMU
4643
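/* Handlers for moving charges at task migration ("move charge"). */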
4644#define PRECHARGE_COUNT_AT_ONCE 256
4645static int mem_cgroup_do_precharge(unsigned long count)
4646{
4647 int ret = 0;
4648 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4649 struct mem_cgroup *mem = mc.to;
4650
4651 if (mem_cgroup_is_root(mem)) {
4652 mc.precharge += count;
4653
4654 return ret;
4655 }
4656
4657 if (count > 1) {
4658 struct res_counter *dummy;
4659
4660
4661
4662
4663
4664
4665 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4666 goto one_by_one;
4667 if (do_swap_account && res_counter_charge(&mem->memsw,
4668 PAGE_SIZE * count, &dummy)) {
4669 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4670 goto one_by_one;
4671 }
4672 mc.precharge += count;
4673 return ret;
4674 }
4675one_by_one:
4676
4677 while (count--) {
4678 if (signal_pending(current)) {
4679 ret = -EINTR;
4680 break;
4681 }
4682 if (!batch_count--) {
4683 batch_count = PRECHARGE_COUNT_AT_ONCE;
4684 cond_resched();
4685 }
4686 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4687 if (ret || !mem)
4688
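 /* mem_cgroup_clear_mc() will uncharge what was precharged */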
4689 return -ENOMEM;
4690 mc.precharge++;
4691 }
4692 return ret;
4693}
4694
4712
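/*
 * is_target_pte_for_mc - check whether a pte is a valid move-charge target
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (may be NULL)
 *
 * Returns
 *   MC_TARGET_NONE: the pte is not a target for move charge.
 *   MC_TARGET_PAGE: the page behind this pte is a target; if @target is
 *     not NULL it is stored in target->page with an extra refcount taken
 *     (callers must handle it).
 *   MC_TARGET_SWAP: the swap entry behind this pte is a target; if
 *     @target is not NULL it is stored in target->ent.
 *
 * Called with the pte lock held.
 */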
4713union mc_target {
4714 struct page *page;
4715 swp_entry_t ent;
4716};
4717
4718enum mc_target_type {
4719 MC_TARGET_NONE,
4720 MC_TARGET_PAGE,
4721 MC_TARGET_SWAP,
4722};
4723
4724static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4725 unsigned long addr, pte_t ptent)
4726{
4727 struct page *page = vm_normal_page(vma, addr, ptent);
4728
4729 if (!page || !page_mapped(page))
4730 return NULL;
4731 if (PageAnon(page)) {
4732
4733 if (!move_anon() || page_mapcount(page) > 2)
4734 return NULL;
4735 } else if (!move_file())
4736
4737 return NULL;
4738 if (!get_page_unless_zero(page))
4739 return NULL;
4740
4741 return page;
4742}
4743
4744static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4745 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4746{
4747 int usage_count;
4748 struct page *page = NULL;
4749 swp_entry_t ent = pte_to_swp_entry(ptent);
4750
4751 if (!move_anon() || non_swap_entry(ent))
4752 return NULL;
4753 usage_count = mem_cgroup_count_swap_user(ent, &page);
4754 if (usage_count > 1) {
4755 if (page)
4756 put_page(page);
4757 return NULL;
4758 }
4759 if (do_swap_account)
4760 entry->val = ent.val;
4761
4762 return page;
4763}
4764
4765static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4766 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4767{
4768 struct page *page = NULL;
4769 struct inode *inode;
4770 struct address_space *mapping;
4771 pgoff_t pgoff;
4772
4773 if (!vma->vm_file)
4774 return NULL;
4775 if (!move_file())
4776 return NULL;
4777
4778 inode = vma->vm_file->f_path.dentry->d_inode;
4779 mapping = vma->vm_file->f_mapping;
4780 if (pte_none(ptent))
4781 pgoff = linear_page_index(vma, addr);
4782 else
4783 pgoff = pte_to_pgoff(ptent);
4784
4785
4786 if (!mapping_cap_swap_backed(mapping)) {
4787 page = find_get_page(mapping, pgoff);
4788 } else {
4789 swp_entry_t ent;
4790 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4791 if (do_swap_account)
4792 entry->val = ent.val;
4793 }
4794
4795 return page;
4796}
4797
4798static int is_target_pte_for_mc(struct vm_area_struct *vma,
4799 unsigned long addr, pte_t ptent, union mc_target *target)
4800{
4801 struct page *page = NULL;
4802 struct page_cgroup *pc;
4803 int ret = 0;
4804 swp_entry_t ent = { .val = 0 };
4805
4806 if (pte_present(ptent))
4807 page = mc_handle_present_pte(vma, addr, ptent);
4808 else if (is_swap_pte(ptent))
4809 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4810 else if (pte_none(ptent) || pte_file(ptent))
4811 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4812
4813 if (!page && !ent.val)
4814 return 0;
4815 if (page) {
4816 pc = lookup_page_cgroup(page);
4817
4818
4819
4820
4821
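 /*
 * Only a loose check without the page_cgroup lock;
 * mem_cgroup_move_account() re-checks pc under the lock.
 */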
4822 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4823 ret = MC_TARGET_PAGE;
4824 if (target)
4825 target->page = page;
4826 }
4827 if (!ret || !target)
4828 put_page(page);
4829 }
4830
4831 if (ent.val && !ret &&
4832 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4833 ret = MC_TARGET_SWAP;
4834 if (target)
4835 target->ent = ent;
4836 }
4837 return ret;
4838}
4839
4840static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4841 unsigned long addr, unsigned long end,
4842 struct mm_walk *walk)
4843{
4844 struct vm_area_struct *vma = walk->private;
4845 pte_t *pte;
4846 spinlock_t *ptl;
4847
4848 split_huge_page_pmd(walk->mm, pmd);
4849
4850 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4851 for (; addr != end; pte++, addr += PAGE_SIZE)
4852 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4853 mc.precharge++;
4854 pte_unmap_unlock(pte - 1, ptl);
4855 cond_resched();
4856
4857 return 0;
4858}
4859
4860static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4861{
4862 unsigned long precharge;
4863 struct vm_area_struct *vma;
4864
4865 down_read(&mm->mmap_sem);
4866 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4867 struct mm_walk mem_cgroup_count_precharge_walk = {
4868 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4869 .mm = mm,
4870 .private = vma,
4871 };
4872 if (is_vm_hugetlb_page(vma))
4873 continue;
4874 walk_page_range(vma->vm_start, vma->vm_end,
4875 &mem_cgroup_count_precharge_walk);
4876 }
4877 up_read(&mm->mmap_sem);
4878
4879 precharge = mc.precharge;
4880 mc.precharge = 0;
4881
4882 return precharge;
4883}
4884
4885static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4886{
4887 unsigned long precharge = mem_cgroup_count_precharge(mm);
4888
4889 VM_BUG_ON(mc.moving_task);
4890 mc.moving_task = current;
4891 return mem_cgroup_do_precharge(precharge);
4892}
4893
4894
4895static void __mem_cgroup_clear_mc(void)
4896{
4897 struct mem_cgroup *from = mc.from;
4898 struct mem_cgroup *to = mc.to;
4899
4900
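 /* return any unused precharges to mc.to */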
4901 if (mc.precharge) {
4902 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4903 mc.precharge = 0;
4904 }
4905
4906
4907
4908
4909 if (mc.moved_charge) {
4910 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4911 mc.moved_charge = 0;
4912 }
4913
4914 if (mc.moved_swap) {
4915
4916 if (!mem_cgroup_is_root(mc.from))
4917 res_counter_uncharge(&mc.from->memsw,
4918 PAGE_SIZE * mc.moved_swap);
4919 __mem_cgroup_put(mc.from, mc.moved_swap);
4920
4921 if (!mem_cgroup_is_root(mc.to)) {
4922
4923
4924
4925
4926 res_counter_uncharge(&mc.to->res,
4927 PAGE_SIZE * mc.moved_swap);
4928 }
4929
4930 mc.moved_swap = 0;
4931 }
4932 memcg_oom_recover(from);
4933 memcg_oom_recover(to);
4934 wake_up_all(&mc.waitq);
4935}
4936
4937static void mem_cgroup_clear_mc(void)
4938{
4939 struct mem_cgroup *from = mc.from;
4940
4941
4942
4943
4944
4945 mc.moving_task = NULL;
4946 __mem_cgroup_clear_mc();
4947 spin_lock(&mc.lock);
4948 mc.from = NULL;
4949 mc.to = NULL;
4950 spin_unlock(&mc.lock);
4951 mem_cgroup_end_move(from);
4952}
4953
4954static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4955 struct cgroup *cgroup,
4956 struct task_struct *p,
4957 bool threadgroup)
4958{
4959 int ret = 0;
4960 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4961
4962 if (mem->move_charge_at_immigrate) {
4963 struct mm_struct *mm;
4964 struct mem_cgroup *from = mem_cgroup_from_task(p);
4965
4966 VM_BUG_ON(from == mem);
4967
4968 mm = get_task_mm(p);
4969 if (!mm)
4970 return 0;
4971
4972 if (mm->owner == p) {
4973 VM_BUG_ON(mc.from);
4974 VM_BUG_ON(mc.to);
4975 VM_BUG_ON(mc.precharge);
4976 VM_BUG_ON(mc.moved_charge);
4977 VM_BUG_ON(mc.moved_swap);
4978 mem_cgroup_start_move(from);
4979 spin_lock(&mc.lock);
4980 mc.from = from;
4981 mc.to = mem;
4982 spin_unlock(&mc.lock);
4983
4984
4985 ret = mem_cgroup_precharge_mc(mm);
4986 if (ret)
4987 mem_cgroup_clear_mc();
4988 }
4989 mmput(mm);
4990 }
4991 return ret;
4992}
4993
4994static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4995 struct cgroup *cgroup,
4996 struct task_struct *p,
4997 bool threadgroup)
4998{
4999 mem_cgroup_clear_mc();
5000}
5001
5002static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5003 unsigned long addr, unsigned long end,
5004 struct mm_walk *walk)
5005{
5006 int ret = 0;
5007 struct vm_area_struct *vma = walk->private;
5008 pte_t *pte;
5009 spinlock_t *ptl;
5010
5011 split_huge_page_pmd(walk->mm, pmd);
5012retry:
5013 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5014 for (; addr != end; addr += PAGE_SIZE) {
5015 pte_t ptent = *(pte++);
5016 union mc_target target;
5017 int type;
5018 struct page *page;
5019 struct page_cgroup *pc;
5020 swp_entry_t ent;
5021
5022 if (!mc.precharge)
5023 break;
5024
5025 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5026 switch (type) {
5027 case MC_TARGET_PAGE:
5028 page = target.page;
5029 if (isolate_lru_page(page))
5030 goto put;
5031 pc = lookup_page_cgroup(page);
5032 if (!mem_cgroup_move_account(page, 1, pc,
5033 mc.from, mc.to, false)) {
5034 mc.precharge--;
5035
5036 mc.moved_charge++;
5037 }
5038 putback_lru_page(page);
5039put:
5040 put_page(page);
5041 break;
5042 case MC_TARGET_SWAP:
5043 ent = target.ent;
5044 if (!mem_cgroup_move_swap_account(ent,
5045 mc.from, mc.to, false)) {
5046 mc.precharge--;
5047
5048 mc.moved_swap++;
5049 }
5050 break;
5051 default:
5052 break;
5053 }
5054 }
5055 pte_unmap_unlock(pte - 1, ptl);
5056 cond_resched();
5057
5058 if (addr != end) {
5059
5060
5061
5062
5063
5064
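 /*
 * All precharges are used up; try to precharge one more page
 * and resume the walk where it stopped.
 */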
5065 ret = mem_cgroup_do_precharge(1);
5066 if (!ret)
5067 goto retry;
5068 }
5069
5070 return ret;
5071}
5072
5073static void mem_cgroup_move_charge(struct mm_struct *mm)
5074{
5075 struct vm_area_struct *vma;
5076
5077 lru_add_drain_all();
5078retry:
5079 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5080
5081
5082
5083
5084
5085
5086
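 /*
 * Someone holding mmap_sem may be blocked on our waitq; cancel
 * the outstanding charges to wake every waiter, then retry the
 * trylock.
 */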
5087 __mem_cgroup_clear_mc();
5088 cond_resched();
5089 goto retry;
5090 }
5091 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5092 int ret;
5093 struct mm_walk mem_cgroup_move_charge_walk = {
5094 .pmd_entry = mem_cgroup_move_charge_pte_range,
5095 .mm = mm,
5096 .private = vma,
5097 };
5098 if (is_vm_hugetlb_page(vma))
5099 continue;
5100 ret = walk_page_range(vma->vm_start, vma->vm_end,
5101 &mem_cgroup_move_charge_walk);
5102 if (ret)
5103
5104
5105
5106
5107 break;
5108 }
5109 up_read(&mm->mmap_sem);
5110}
5111
5112static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5113 struct cgroup *cont,
5114 struct cgroup *old_cont,
5115 struct task_struct *p,
5116 bool threadgroup)
5117{
5118 struct mm_struct *mm;
5119
5120 if (!mc.to)
5121
5122 return;
5123
5124 mm = get_task_mm(p);
5125 if (mm) {
5126 mem_cgroup_move_charge(mm);
5127 mmput(mm);
5128 }
5129 mem_cgroup_clear_mc();
5130}
5131#else
5132static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5133 struct cgroup *cgroup,
5134 struct task_struct *p,
5135 bool threadgroup)
5136{
5137 return 0;
5138}
5139static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5140 struct cgroup *cgroup,
5141 struct task_struct *p,
5142 bool threadgroup)
5143{
5144}
5145static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5146 struct cgroup *cont,
5147 struct cgroup *old_cont,
5148 struct task_struct *p,
5149 bool threadgroup)
5150{
5151}
5152#endif
5153
5154struct cgroup_subsys mem_cgroup_subsys = {
5155 .name = "memory",
5156 .subsys_id = mem_cgroup_subsys_id,
5157 .create = mem_cgroup_create,
5158 .pre_destroy = mem_cgroup_pre_destroy,
5159 .destroy = mem_cgroup_destroy,
5160 .populate = mem_cgroup_populate,
5161 .can_attach = mem_cgroup_can_attach,
5162 .cancel_attach = mem_cgroup_cancel_attach,
5163 .attach = mem_cgroup_move_task,
5164 .early_init = 0,
5165 .use_id = 1,
5166};
5167
5168#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5169static int __init enable_swap_account(char *s)
5170{
5171
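 /* consider swap accounting enabled if no parameter or "=1" is given */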
5172 if (!(*s) || !strcmp(s, "=1"))
5173 really_do_swap_account = 1;
5174 else if (!strcmp(s, "=0"))
5175 really_do_swap_account = 0;
5176 return 1;
5177}
5178__setup("swapaccount", enable_swap_account);
5179
5180static int __init disable_swap_account(char *s)
5181{
5182 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5183 enable_swap_account("=0");
5184 return 1;
5185}
5186__setup("noswapaccount", disable_swap_account);
5187#endif
5188