1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/export.h>
37#include <linux/mutex.h>
38#include <linux/rbtree.h>
39#include <linux/slab.h>
40#include <linux/swap.h>
41#include <linux/swapops.h>
42#include <linux/spinlock.h>
43#include <linux/eventfd.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/seq_file.h>
47#include <linux/vmalloc.h>
48#include <linux/mm_inline.h>
49#include <linux/page_cgroup.h>
50#include <linux/cpu.h>
51#include <linux/oom.h>
52#include "internal.h"
53#include <net/sock.h>
54#include <net/ip.h>
55#include <net/tcp_memcontrol.h>
56
57#include <asm/uaccess.h>
58
59#include <trace/events/vmscan.h>
60
61struct cgroup_subsys mem_cgroup_subsys __read_mostly;
62#define MEM_CGROUP_RECLAIM_RETRIES 5
63static struct mem_cgroup *root_mem_cgroup __read_mostly;
64
65#ifdef CONFIG_MEMCG_SWAP
66
67int do_swap_account __read_mostly;
68
69
70#ifdef CONFIG_MEMCG_SWAP_ENABLED
71static int really_do_swap_account __initdata = 1;
72#else
73static int really_do_swap_account __initdata = 0;
74#endif
75
76#else
77#define do_swap_account 0
78#endif
79
80
81
82
83
84enum mem_cgroup_stat_index {
85
86
87
88 MEM_CGROUP_STAT_CACHE,
89 MEM_CGROUP_STAT_RSS,
90 MEM_CGROUP_STAT_FILE_MAPPED,
91 MEM_CGROUP_STAT_SWAP,
92 MEM_CGROUP_STAT_NSTATS,
93};
94
95static const char * const mem_cgroup_stat_names[] = {
96 "cache",
97 "rss",
98 "mapped_file",
99 "swap",
100};
101
102enum mem_cgroup_events_index {
103 MEM_CGROUP_EVENTS_PGPGIN,
104 MEM_CGROUP_EVENTS_PGPGOUT,
105 MEM_CGROUP_EVENTS_PGFAULT,
106 MEM_CGROUP_EVENTS_PGMAJFAULT,
107 MEM_CGROUP_EVENTS_NSTATS,
108};
109
110static const char * const mem_cgroup_events_names[] = {
111 "pgpgin",
112 "pgpgout",
113 "pgfault",
114 "pgmajfault",
115};
116
117
118
119
120
121
122
123enum mem_cgroup_events_target {
124 MEM_CGROUP_TARGET_THRESH,
125 MEM_CGROUP_TARGET_SOFTLIMIT,
126 MEM_CGROUP_TARGET_NUMAINFO,
127 MEM_CGROUP_NTARGETS,
128};
129#define THRESHOLDS_EVENTS_TARGET 128
130#define SOFTLIMIT_EVENTS_TARGET 1024
131#define NUMAINFO_EVENTS_TARGET 1024
132
133struct mem_cgroup_stat_cpu {
134 long count[MEM_CGROUP_STAT_NSTATS];
135 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
136 unsigned long nr_page_events;
137 unsigned long targets[MEM_CGROUP_NTARGETS];
138};
139
140struct mem_cgroup_reclaim_iter {
141
142 int position;
143
144 unsigned int generation;
145};
146
147
148
149
150struct mem_cgroup_per_zone {
151 struct lruvec lruvec;
152 unsigned long lru_size[NR_LRU_LISTS];
153
154 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
155
156 struct rb_node tree_node;
157 unsigned long long usage_in_excess;
158
159 bool on_tree;
160 struct mem_cgroup *memcg;
161
162};
163
164struct mem_cgroup_per_node {
165 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
166};
167
168struct mem_cgroup_lru_info {
169 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
170};
171
172
173
174
175
176
177struct mem_cgroup_tree_per_zone {
178 struct rb_root rb_root;
179 spinlock_t lock;
180};
181
182struct mem_cgroup_tree_per_node {
183 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
184};
185
186struct mem_cgroup_tree {
187 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
188};
189
190static struct mem_cgroup_tree soft_limit_tree __read_mostly;
191
192struct mem_cgroup_threshold {
193 struct eventfd_ctx *eventfd;
194 u64 threshold;
195};
196
197
198struct mem_cgroup_threshold_ary {
199
200 int current_threshold;
201
202 unsigned int size;
203
204 struct mem_cgroup_threshold entries[0];
205};
206
207struct mem_cgroup_thresholds {
208
209 struct mem_cgroup_threshold_ary *primary;
210
211
212
213
214
215 struct mem_cgroup_threshold_ary *spare;
216};
217
218
219struct mem_cgroup_eventfd_list {
220 struct list_head list;
221 struct eventfd_ctx *eventfd;
222};
223
224static void mem_cgroup_threshold(struct mem_cgroup *memcg);
225static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
226
227
228
229
230
231
232
233
234
235
236
237
238struct mem_cgroup {
239 struct cgroup_subsys_state css;
240
241
242
243 struct res_counter res;
244
245 union {
246
247
248
249 struct res_counter memsw;
250
251
252
253
254
255
256
257
258
259
260 struct rcu_head rcu_freeing;
261
262
263
264
265 struct work_struct work_freeing;
266 };
267
268
269
270
271
272 struct mem_cgroup_lru_info info;
273 int last_scanned_node;
274#if MAX_NUMNODES > 1
275 nodemask_t scan_nodes;
276 atomic_t numainfo_events;
277 atomic_t numainfo_updating;
278#endif
279
280
281
282 bool use_hierarchy;
283
284 bool oom_lock;
285 atomic_t under_oom;
286
287 atomic_t refcnt;
288
289 int swappiness;
290
291 int oom_kill_disable;
292
293
294 bool memsw_is_minimum;
295
296
297 struct mutex thresholds_lock;
298
299
300 struct mem_cgroup_thresholds thresholds;
301
302
303 struct mem_cgroup_thresholds memsw_thresholds;
304
305
306 struct list_head oom_notify;
307
308
309
310
311
312 unsigned long move_charge_at_immigrate;
313
314
315
316 atomic_t moving_account;
317
318 spinlock_t move_lock;
319
320
321
322 struct mem_cgroup_stat_cpu __percpu *stat;
323
324
325
326
327 struct mem_cgroup_stat_cpu nocpu_base;
328 spinlock_t pcp_counter_lock;
329
330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
331 struct tcp_memcontrol tcp_mem;
332#endif
333};
334
335
336
337
338
339
340enum move_type {
341 MOVE_CHARGE_TYPE_ANON,
342 MOVE_CHARGE_TYPE_FILE,
343 NR_MOVE_TYPE,
344};
345
346
347static struct move_charge_struct {
348 spinlock_t lock;
349 struct mem_cgroup *from;
350 struct mem_cgroup *to;
351 unsigned long precharge;
352 unsigned long moved_charge;
353 unsigned long moved_swap;
354 struct task_struct *moving_task;
355 wait_queue_head_t waitq;
356} mc = {
357 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
358 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
359};
360
361static bool move_anon(void)
362{
363 return test_bit(MOVE_CHARGE_TYPE_ANON,
364 &mc.to->move_charge_at_immigrate);
365}
366
367static bool move_file(void)
368{
369 return test_bit(MOVE_CHARGE_TYPE_FILE,
370 &mc.to->move_charge_at_immigrate);
371}
372
373
374
375
376
377#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
378#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
379
380enum charge_type {
381 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
382 MEM_CGROUP_CHARGE_TYPE_ANON,
383 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
384 MEM_CGROUP_CHARGE_TYPE_DROP,
385 NR_CHARGE_TYPE,
386};
387
388
389#define _MEM (0)
390#define _MEMSWAP (1)
391#define _OOM_TYPE (2)
392#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
393#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
394#define MEMFILE_ATTR(val) ((val) & 0xffff)
395
396#define OOM_CONTROL (0)
397
398
399
400
401#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
402#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
403#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
404#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
405
406static void mem_cgroup_get(struct mem_cgroup *memcg);
407static void mem_cgroup_put(struct mem_cgroup *memcg);
408
409static inline
410struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
411{
412 return container_of(s, struct mem_cgroup, css);
413}
414
415static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
416{
417 return (memcg == root_mem_cgroup);
418}
419
420
421#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
422
423void sock_update_memcg(struct sock *sk)
424{
425 if (mem_cgroup_sockets_enabled) {
426 struct mem_cgroup *memcg;
427 struct cg_proto *cg_proto;
428
429 BUG_ON(!sk->sk_prot->proto_cgroup);
430
431
432
433
434
435
436
437
438
439 if (sk->sk_cgrp) {
440 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
441 mem_cgroup_get(sk->sk_cgrp->memcg);
442 return;
443 }
444
445 rcu_read_lock();
446 memcg = mem_cgroup_from_task(current);
447 cg_proto = sk->sk_prot->proto_cgroup(memcg);
448 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
449 mem_cgroup_get(memcg);
450 sk->sk_cgrp = cg_proto;
451 }
452 rcu_read_unlock();
453 }
454}
455EXPORT_SYMBOL(sock_update_memcg);
456
457void sock_release_memcg(struct sock *sk)
458{
459 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
460 struct mem_cgroup *memcg;
461 WARN_ON(!sk->sk_cgrp->memcg);
462 memcg = sk->sk_cgrp->memcg;
463 mem_cgroup_put(memcg);
464 }
465}
466
467struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
468{
469 if (!memcg || mem_cgroup_is_root(memcg))
470 return NULL;
471
472 return &memcg->tcp_mem.cg_proto;
473}
474EXPORT_SYMBOL(tcp_proto_cgroup);
475
476static void disarm_sock_keys(struct mem_cgroup *memcg)
477{
478 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
479 return;
480 static_key_slow_dec(&memcg_socket_limit_enabled);
481}
482#else
483static void disarm_sock_keys(struct mem_cgroup *memcg)
484{
485}
486#endif
487
488static void drain_all_stock_async(struct mem_cgroup *memcg);
489
490static struct mem_cgroup_per_zone *
491mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
492{
493 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
494}
495
496struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
497{
498 return &memcg->css;
499}
500
501static struct mem_cgroup_per_zone *
502page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
503{
504 int nid = page_to_nid(page);
505 int zid = page_zonenum(page);
506
507 return mem_cgroup_zoneinfo(memcg, nid, zid);
508}
509
510static struct mem_cgroup_tree_per_zone *
511soft_limit_tree_node_zone(int nid, int zid)
512{
513 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
514}
515
516static struct mem_cgroup_tree_per_zone *
517soft_limit_tree_from_page(struct page *page)
518{
519 int nid = page_to_nid(page);
520 int zid = page_zonenum(page);
521
522 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
523}
524
525static void
526__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
527 struct mem_cgroup_per_zone *mz,
528 struct mem_cgroup_tree_per_zone *mctz,
529 unsigned long long new_usage_in_excess)
530{
531 struct rb_node **p = &mctz->rb_root.rb_node;
532 struct rb_node *parent = NULL;
533 struct mem_cgroup_per_zone *mz_node;
534
535 if (mz->on_tree)
536 return;
537
538 mz->usage_in_excess = new_usage_in_excess;
539 if (!mz->usage_in_excess)
540 return;
541 while (*p) {
542 parent = *p;
543 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
544 tree_node);
545 if (mz->usage_in_excess < mz_node->usage_in_excess)
546 p = &(*p)->rb_left;
547
548
549
550
551 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
552 p = &(*p)->rb_right;
553 }
554 rb_link_node(&mz->tree_node, parent, p);
555 rb_insert_color(&mz->tree_node, &mctz->rb_root);
556 mz->on_tree = true;
557}
558
559static void
560__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
561 struct mem_cgroup_per_zone *mz,
562 struct mem_cgroup_tree_per_zone *mctz)
563{
564 if (!mz->on_tree)
565 return;
566 rb_erase(&mz->tree_node, &mctz->rb_root);
567 mz->on_tree = false;
568}
569
570static void
571mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
572 struct mem_cgroup_per_zone *mz,
573 struct mem_cgroup_tree_per_zone *mctz)
574{
575 spin_lock(&mctz->lock);
576 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
577 spin_unlock(&mctz->lock);
578}
579
580
581static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
582{
583 unsigned long long excess;
584 struct mem_cgroup_per_zone *mz;
585 struct mem_cgroup_tree_per_zone *mctz;
586 int nid = page_to_nid(page);
587 int zid = page_zonenum(page);
588 mctz = soft_limit_tree_from_page(page);
589
590
591
592
593
594 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
595 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
596 excess = res_counter_soft_limit_excess(&memcg->res);
597
598
599
600
601 if (excess || mz->on_tree) {
602 spin_lock(&mctz->lock);
603
604 if (mz->on_tree)
605 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
606
607
608
609
610 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
611 spin_unlock(&mctz->lock);
612 }
613 }
614}
615
616static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
617{
618 int node, zone;
619 struct mem_cgroup_per_zone *mz;
620 struct mem_cgroup_tree_per_zone *mctz;
621
622 for_each_node(node) {
623 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
624 mz = mem_cgroup_zoneinfo(memcg, node, zone);
625 mctz = soft_limit_tree_node_zone(node, zone);
626 mem_cgroup_remove_exceeded(memcg, mz, mctz);
627 }
628 }
629}
630
631static struct mem_cgroup_per_zone *
632__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
633{
634 struct rb_node *rightmost = NULL;
635 struct mem_cgroup_per_zone *mz;
636
637retry:
638 mz = NULL;
639 rightmost = rb_last(&mctz->rb_root);
640 if (!rightmost)
641 goto done;
642
643 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
644
645
646
647
648
649 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
650 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
651 !css_tryget(&mz->memcg->css))
652 goto retry;
653done:
654 return mz;
655}
656
657static struct mem_cgroup_per_zone *
658mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
659{
660 struct mem_cgroup_per_zone *mz;
661
662 spin_lock(&mctz->lock);
663 mz = __mem_cgroup_largest_soft_limit_node(mctz);
664 spin_unlock(&mctz->lock);
665 return mz;
666}
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
688 enum mem_cgroup_stat_index idx)
689{
690 long val = 0;
691 int cpu;
692
693 get_online_cpus();
694 for_each_online_cpu(cpu)
695 val += per_cpu(memcg->stat->count[idx], cpu);
696#ifdef CONFIG_HOTPLUG_CPU
697 spin_lock(&memcg->pcp_counter_lock);
698 val += memcg->nocpu_base.count[idx];
699 spin_unlock(&memcg->pcp_counter_lock);
700#endif
701 put_online_cpus();
702 return val;
703}
704
705static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
706 bool charge)
707{
708 int val = (charge) ? 1 : -1;
709 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
710}
711
712static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
713 enum mem_cgroup_events_index idx)
714{
715 unsigned long val = 0;
716 int cpu;
717
718 for_each_online_cpu(cpu)
719 val += per_cpu(memcg->stat->events[idx], cpu);
720#ifdef CONFIG_HOTPLUG_CPU
721 spin_lock(&memcg->pcp_counter_lock);
722 val += memcg->nocpu_base.events[idx];
723 spin_unlock(&memcg->pcp_counter_lock);
724#endif
725 return val;
726}
727
728static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
729 bool anon, int nr_pages)
730{
731 preempt_disable();
732
733
734
735
736
737 if (anon)
738 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
739 nr_pages);
740 else
741 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
742 nr_pages);
743
744
745 if (nr_pages > 0)
746 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
747 else {
748 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
749 nr_pages = -nr_pages;
750 }
751
752 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
753
754 preempt_enable();
755}
756
757unsigned long
758mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
759{
760 struct mem_cgroup_per_zone *mz;
761
762 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
763 return mz->lru_size[lru];
764}
765
766static unsigned long
767mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
768 unsigned int lru_mask)
769{
770 struct mem_cgroup_per_zone *mz;
771 enum lru_list lru;
772 unsigned long ret = 0;
773
774 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
775
776 for_each_lru(lru) {
777 if (BIT(lru) & lru_mask)
778 ret += mz->lru_size[lru];
779 }
780 return ret;
781}
782
783static unsigned long
784mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
785 int nid, unsigned int lru_mask)
786{
787 u64 total = 0;
788 int zid;
789
790 for (zid = 0; zid < MAX_NR_ZONES; zid++)
791 total += mem_cgroup_zone_nr_lru_pages(memcg,
792 nid, zid, lru_mask);
793
794 return total;
795}
796
797static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
798 unsigned int lru_mask)
799{
800 int nid;
801 u64 total = 0;
802
803 for_each_node_state(nid, N_HIGH_MEMORY)
804 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
805 return total;
806}
807
808static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
809 enum mem_cgroup_events_target target)
810{
811 unsigned long val, next;
812
813 val = __this_cpu_read(memcg->stat->nr_page_events);
814 next = __this_cpu_read(memcg->stat->targets[target]);
815
816 if ((long)next - (long)val < 0) {
817 switch (target) {
818 case MEM_CGROUP_TARGET_THRESH:
819 next = val + THRESHOLDS_EVENTS_TARGET;
820 break;
821 case MEM_CGROUP_TARGET_SOFTLIMIT:
822 next = val + SOFTLIMIT_EVENTS_TARGET;
823 break;
824 case MEM_CGROUP_TARGET_NUMAINFO:
825 next = val + NUMAINFO_EVENTS_TARGET;
826 break;
827 default:
828 break;
829 }
830 __this_cpu_write(memcg->stat->targets[target], next);
831 return true;
832 }
833 return false;
834}
835
836
837
838
839
840static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
841{
842 preempt_disable();
843
844 if (unlikely(mem_cgroup_event_ratelimit(memcg,
845 MEM_CGROUP_TARGET_THRESH))) {
846 bool do_softlimit;
847 bool do_numainfo __maybe_unused;
848
849 do_softlimit = mem_cgroup_event_ratelimit(memcg,
850 MEM_CGROUP_TARGET_SOFTLIMIT);
851#if MAX_NUMNODES > 1
852 do_numainfo = mem_cgroup_event_ratelimit(memcg,
853 MEM_CGROUP_TARGET_NUMAINFO);
854#endif
855 preempt_enable();
856
857 mem_cgroup_threshold(memcg);
858 if (unlikely(do_softlimit))
859 mem_cgroup_update_tree(memcg, page);
860#if MAX_NUMNODES > 1
861 if (unlikely(do_numainfo))
862 atomic_inc(&memcg->numainfo_events);
863#endif
864 } else
865 preempt_enable();
866}
867
868struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
869{
870 return mem_cgroup_from_css(
871 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
872}
873
874struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
875{
876
877
878
879
880
881 if (unlikely(!p))
882 return NULL;
883
884 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
885}
886
887struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
888{
889 struct mem_cgroup *memcg = NULL;
890
891 if (!mm)
892 return NULL;
893
894
895
896
897
898 rcu_read_lock();
899 do {
900 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
901 if (unlikely(!memcg))
902 break;
903 } while (!css_tryget(&memcg->css));
904 rcu_read_unlock();
905 return memcg;
906}
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
926 struct mem_cgroup *prev,
927 struct mem_cgroup_reclaim_cookie *reclaim)
928{
929 struct mem_cgroup *memcg = NULL;
930 int id = 0;
931
932 if (mem_cgroup_disabled())
933 return NULL;
934
935 if (!root)
936 root = root_mem_cgroup;
937
938 if (prev && !reclaim)
939 id = css_id(&prev->css);
940
941 if (prev && prev != root)
942 css_put(&prev->css);
943
944 if (!root->use_hierarchy && root != root_mem_cgroup) {
945 if (prev)
946 return NULL;
947 return root;
948 }
949
950 while (!memcg) {
951 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
952 struct cgroup_subsys_state *css;
953
954 if (reclaim) {
955 int nid = zone_to_nid(reclaim->zone);
956 int zid = zone_idx(reclaim->zone);
957 struct mem_cgroup_per_zone *mz;
958
959 mz = mem_cgroup_zoneinfo(root, nid, zid);
960 iter = &mz->reclaim_iter[reclaim->priority];
961 if (prev && reclaim->generation != iter->generation)
962 return NULL;
963 id = iter->position;
964 }
965
966 rcu_read_lock();
967 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
968 if (css) {
969 if (css == &root->css || css_tryget(css))
970 memcg = mem_cgroup_from_css(css);
971 } else
972 id = 0;
973 rcu_read_unlock();
974
975 if (reclaim) {
976 iter->position = id;
977 if (!css)
978 iter->generation++;
979 else if (!prev && memcg)
980 reclaim->generation = iter->generation;
981 }
982
983 if (prev && !css)
984 return NULL;
985 }
986 return memcg;
987}
988
989
990
991
992
993
994void mem_cgroup_iter_break(struct mem_cgroup *root,
995 struct mem_cgroup *prev)
996{
997 if (!root)
998 root = root_mem_cgroup;
999 if (prev && prev != root)
1000 css_put(&prev->css);
1001}
1002
1003
1004
1005
1006
1007
1008#define for_each_mem_cgroup_tree(iter, root) \
1009 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1010 iter != NULL; \
1011 iter = mem_cgroup_iter(root, iter, NULL))
1012
1013#define for_each_mem_cgroup(iter) \
1014 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1015 iter != NULL; \
1016 iter = mem_cgroup_iter(NULL, iter, NULL))
1017
1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1019{
1020 struct mem_cgroup *memcg;
1021
1022 if (!mm)
1023 return;
1024
1025 rcu_read_lock();
1026 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1027 if (unlikely(!memcg))
1028 goto out;
1029
1030 switch (idx) {
1031 case PGFAULT:
1032 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1033 break;
1034 case PGMAJFAULT:
1035 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1036 break;
1037 default:
1038 BUG();
1039 }
1040out:
1041 rcu_read_unlock();
1042}
1043EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1055 struct mem_cgroup *memcg)
1056{
1057 struct mem_cgroup_per_zone *mz;
1058 struct lruvec *lruvec;
1059
1060 if (mem_cgroup_disabled()) {
1061 lruvec = &zone->lruvec;
1062 goto out;
1063 }
1064
1065 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1066 lruvec = &mz->lruvec;
1067out:
1068
1069
1070
1071
1072
1073 if (unlikely(lruvec->zone != zone))
1074 lruvec->zone = zone;
1075 return lruvec;
1076}
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1098{
1099 struct mem_cgroup_per_zone *mz;
1100 struct mem_cgroup *memcg;
1101 struct page_cgroup *pc;
1102 struct lruvec *lruvec;
1103
1104 if (mem_cgroup_disabled()) {
1105 lruvec = &zone->lruvec;
1106 goto out;
1107 }
1108
1109 pc = lookup_page_cgroup(page);
1110 memcg = pc->mem_cgroup;
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1122 pc->mem_cgroup = memcg = root_mem_cgroup;
1123
1124 mz = page_cgroup_zoneinfo(memcg, page);
1125 lruvec = &mz->lruvec;
1126out:
1127
1128
1129
1130
1131
1132 if (unlikely(lruvec->zone != zone))
1133 lruvec->zone = zone;
1134 return lruvec;
1135}
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1147 int nr_pages)
1148{
1149 struct mem_cgroup_per_zone *mz;
1150 unsigned long *lru_size;
1151
1152 if (mem_cgroup_disabled())
1153 return;
1154
1155 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1156 lru_size = mz->lru_size + lru;
1157 *lru_size += nr_pages;
1158 VM_BUG_ON((long)(*lru_size) < 0);
1159}
1160
1161
1162
1163
1164
1165bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1166 struct mem_cgroup *memcg)
1167{
1168 if (root_memcg == memcg)
1169 return true;
1170 if (!root_memcg->use_hierarchy || !memcg)
1171 return false;
1172 return css_is_ancestor(&memcg->css, &root_memcg->css);
1173}
1174
1175static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1176 struct mem_cgroup *memcg)
1177{
1178 bool ret;
1179
1180 rcu_read_lock();
1181 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1182 rcu_read_unlock();
1183 return ret;
1184}
1185
1186int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1187{
1188 int ret;
1189 struct mem_cgroup *curr = NULL;
1190 struct task_struct *p;
1191
1192 p = find_lock_task_mm(task);
1193 if (p) {
1194 curr = try_get_mem_cgroup_from_mm(p->mm);
1195 task_unlock(p);
1196 } else {
1197
1198
1199
1200
1201
1202 task_lock(task);
1203 curr = mem_cgroup_from_task(task);
1204 if (curr)
1205 css_get(&curr->css);
1206 task_unlock(task);
1207 }
1208 if (!curr)
1209 return 0;
1210
1211
1212
1213
1214
1215
1216 ret = mem_cgroup_same_or_subtree(memcg, curr);
1217 css_put(&curr->css);
1218 return ret;
1219}
1220
1221int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1222{
1223 unsigned long inactive_ratio;
1224 unsigned long inactive;
1225 unsigned long active;
1226 unsigned long gb;
1227
1228 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1229 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1230
1231 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1232 if (gb)
1233 inactive_ratio = int_sqrt(10 * gb);
1234 else
1235 inactive_ratio = 1;
1236
1237 return inactive * inactive_ratio < active;
1238}
1239
1240int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1241{
1242 unsigned long active;
1243 unsigned long inactive;
1244
1245 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1246 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1247
1248 return (active > inactive);
1249}
1250
1251#define mem_cgroup_from_res_counter(counter, member) \
1252 container_of(counter, struct mem_cgroup, member)
1253
1254
1255
1256
1257
1258
1259
1260
1261static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1262{
1263 unsigned long long margin;
1264
1265 margin = res_counter_margin(&memcg->res);
1266 if (do_swap_account)
1267 margin = min(margin, res_counter_margin(&memcg->memsw));
1268 return margin >> PAGE_SHIFT;
1269}
1270
1271int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1272{
1273 struct cgroup *cgrp = memcg->css.cgroup;
1274
1275
1276 if (cgrp->parent == NULL)
1277 return vm_swappiness;
1278
1279 return memcg->swappiness;
1280}
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298atomic_t memcg_moving __read_mostly;
1299
1300static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1301{
1302 atomic_inc(&memcg_moving);
1303 atomic_inc(&memcg->moving_account);
1304 synchronize_rcu();
1305}
1306
1307static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1308{
1309
1310
1311
1312
1313 if (memcg) {
1314 atomic_dec(&memcg_moving);
1315 atomic_dec(&memcg->moving_account);
1316 }
1317}
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1332{
1333 VM_BUG_ON(!rcu_read_lock_held());
1334 return atomic_read(&memcg->moving_account) > 0;
1335}
1336
1337static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1338{
1339 struct mem_cgroup *from;
1340 struct mem_cgroup *to;
1341 bool ret = false;
1342
1343
1344
1345
1346 spin_lock(&mc.lock);
1347 from = mc.from;
1348 to = mc.to;
1349 if (!from)
1350 goto unlock;
1351
1352 ret = mem_cgroup_same_or_subtree(memcg, from)
1353 || mem_cgroup_same_or_subtree(memcg, to);
1354unlock:
1355 spin_unlock(&mc.lock);
1356 return ret;
1357}
1358
1359static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1360{
1361 if (mc.moving_task && current != mc.moving_task) {
1362 if (mem_cgroup_under_move(memcg)) {
1363 DEFINE_WAIT(wait);
1364 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1365
1366 if (mc.moving_task)
1367 schedule();
1368 finish_wait(&mc.waitq, &wait);
1369 return true;
1370 }
1371 }
1372 return false;
1373}
1374
1375
1376
1377
1378
1379
1380
1381static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1382 unsigned long *flags)
1383{
1384 spin_lock_irqsave(&memcg->move_lock, *flags);
1385}
1386
1387static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1388 unsigned long *flags)
1389{
1390 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1402{
1403 struct cgroup *task_cgrp;
1404 struct cgroup *mem_cgrp;
1405
1406
1407
1408
1409
1410 static char memcg_name[PATH_MAX];
1411 int ret;
1412
1413 if (!memcg || !p)
1414 return;
1415
1416 rcu_read_lock();
1417
1418 mem_cgrp = memcg->css.cgroup;
1419 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1420
1421 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1422 if (ret < 0) {
1423
1424
1425
1426
1427 rcu_read_unlock();
1428 goto done;
1429 }
1430 rcu_read_unlock();
1431
1432 printk(KERN_INFO "Task in %s killed", memcg_name);
1433
1434 rcu_read_lock();
1435 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1436 if (ret < 0) {
1437 rcu_read_unlock();
1438 goto done;
1439 }
1440 rcu_read_unlock();
1441
1442
1443
1444
1445 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1446done:
1447
1448 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1449 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1450 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1451 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1452 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1453 "failcnt %llu\n",
1454 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1456 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1457}
1458
1459
1460
1461
1462
1463static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1464{
1465 int num = 0;
1466 struct mem_cgroup *iter;
1467
1468 for_each_mem_cgroup_tree(iter, memcg)
1469 num++;
1470 return num;
1471}
1472
1473
1474
1475
1476static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1477{
1478 u64 limit;
1479
1480 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1481
1482
1483
1484
1485 if (mem_cgroup_swappiness(memcg)) {
1486 u64 memsw;
1487
1488 limit += total_swap_pages << PAGE_SHIFT;
1489 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1490
1491
1492
1493
1494
1495 limit = min(limit, memsw);
1496 }
1497
1498 return limit;
1499}
1500
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order)
1503{
1504 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0;
1506 unsigned long totalpages;
1507 unsigned int points = 0;
1508 struct task_struct *chosen = NULL;
1509
1510
1511
1512
1513
1514
1515 if (fatal_signal_pending(current)) {
1516 set_thread_flag(TIF_MEMDIE);
1517 return;
1518 }
1519
1520 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1521 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1522 for_each_mem_cgroup_tree(iter, memcg) {
1523 struct cgroup *cgroup = iter->css.cgroup;
1524 struct cgroup_iter it;
1525 struct task_struct *task;
1526
1527 cgroup_iter_start(cgroup, &it);
1528 while ((task = cgroup_iter_next(cgroup, &it))) {
1529 switch (oom_scan_process_thread(task, totalpages, NULL,
1530 false)) {
1531 case OOM_SCAN_SELECT:
1532 if (chosen)
1533 put_task_struct(chosen);
1534 chosen = task;
1535 chosen_points = ULONG_MAX;
1536 get_task_struct(chosen);
1537
1538 case OOM_SCAN_CONTINUE:
1539 continue;
1540 case OOM_SCAN_ABORT:
1541 cgroup_iter_end(cgroup, &it);
1542 mem_cgroup_iter_break(memcg, iter);
1543 if (chosen)
1544 put_task_struct(chosen);
1545 return;
1546 case OOM_SCAN_OK:
1547 break;
1548 };
1549 points = oom_badness(task, memcg, NULL, totalpages);
1550 if (points > chosen_points) {
1551 if (chosen)
1552 put_task_struct(chosen);
1553 chosen = task;
1554 chosen_points = points;
1555 get_task_struct(chosen);
1556 }
1557 }
1558 cgroup_iter_end(cgroup, &it);
1559 }
1560
1561 if (!chosen)
1562 return;
1563 points = chosen_points * 1000 / totalpages;
1564 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1565 NULL, "Memory cgroup out of memory");
1566}
1567
1568static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1569 gfp_t gfp_mask,
1570 unsigned long flags)
1571{
1572 unsigned long total = 0;
1573 bool noswap = false;
1574 int loop;
1575
1576 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1577 noswap = true;
1578 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1579 noswap = true;
1580
1581 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1582 if (loop)
1583 drain_all_stock_async(memcg);
1584 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1585
1586
1587
1588
1589
1590 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1591 break;
1592 if (mem_cgroup_margin(memcg))
1593 break;
1594
1595
1596
1597
1598 if (loop && !total)
1599 break;
1600 }
1601 return total;
1602}
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1615 int nid, bool noswap)
1616{
1617 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1618 return true;
1619 if (noswap || !total_swap_pages)
1620 return false;
1621 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1622 return true;
1623 return false;
1624
1625}
1626#if MAX_NUMNODES > 1
1627
1628
1629
1630
1631
1632
1633
1634static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1635{
1636 int nid;
1637
1638
1639
1640
1641 if (!atomic_read(&memcg->numainfo_events))
1642 return;
1643 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1644 return;
1645
1646
1647 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1648
1649 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1650
1651 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1652 node_clear(nid, memcg->scan_nodes);
1653 }
1654
1655 atomic_set(&memcg->numainfo_events, 0);
1656 atomic_set(&memcg->numainfo_updating, 0);
1657}
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1672{
1673 int node;
1674
1675 mem_cgroup_may_update_nodemask(memcg);
1676 node = memcg->last_scanned_node;
1677
1678 node = next_node(node, memcg->scan_nodes);
1679 if (node == MAX_NUMNODES)
1680 node = first_node(memcg->scan_nodes);
1681
1682
1683
1684
1685
1686
1687 if (unlikely(node == MAX_NUMNODES))
1688 node = numa_node_id();
1689
1690 memcg->last_scanned_node = node;
1691 return node;
1692}
1693
1694
1695
1696
1697
1698
1699
1700static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1701{
1702 int nid;
1703
1704
1705
1706
1707
1708 if (!nodes_empty(memcg->scan_nodes)) {
1709 for (nid = first_node(memcg->scan_nodes);
1710 nid < MAX_NUMNODES;
1711 nid = next_node(nid, memcg->scan_nodes)) {
1712
1713 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1714 return true;
1715 }
1716 }
1717
1718
1719
1720 for_each_node_state(nid, N_HIGH_MEMORY) {
1721 if (node_isset(nid, memcg->scan_nodes))
1722 continue;
1723 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1724 return true;
1725 }
1726 return false;
1727}
1728
1729#else
1730int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1731{
1732 return 0;
1733}
1734
1735static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1736{
1737 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1738}
1739#endif
1740
1741static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1742 struct zone *zone,
1743 gfp_t gfp_mask,
1744 unsigned long *total_scanned)
1745{
1746 struct mem_cgroup *victim = NULL;
1747 int total = 0;
1748 int loop = 0;
1749 unsigned long excess;
1750 unsigned long nr_scanned;
1751 struct mem_cgroup_reclaim_cookie reclaim = {
1752 .zone = zone,
1753 .priority = 0,
1754 };
1755
1756 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1757
1758 while (1) {
1759 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1760 if (!victim) {
1761 loop++;
1762 if (loop >= 2) {
1763
1764
1765
1766
1767
1768 if (!total)
1769 break;
1770
1771
1772
1773
1774
1775
1776 if (total >= (excess >> 2) ||
1777 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1778 break;
1779 }
1780 continue;
1781 }
1782 if (!mem_cgroup_reclaimable(victim, false))
1783 continue;
1784 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1785 zone, &nr_scanned);
1786 *total_scanned += nr_scanned;
1787 if (!res_counter_soft_limit_excess(&root_memcg->res))
1788 break;
1789 }
1790 mem_cgroup_iter_break(root_memcg, victim);
1791 return total;
1792}
1793
1794
1795
1796
1797
1798
1799static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1800{
1801 struct mem_cgroup *iter, *failed = NULL;
1802
1803 for_each_mem_cgroup_tree(iter, memcg) {
1804 if (iter->oom_lock) {
1805
1806
1807
1808
1809 failed = iter;
1810 mem_cgroup_iter_break(memcg, iter);
1811 break;
1812 } else
1813 iter->oom_lock = true;
1814 }
1815
1816 if (!failed)
1817 return true;
1818
1819
1820
1821
1822
1823 for_each_mem_cgroup_tree(iter, memcg) {
1824 if (iter == failed) {
1825 mem_cgroup_iter_break(memcg, iter);
1826 break;
1827 }
1828 iter->oom_lock = false;
1829 }
1830 return false;
1831}
1832
1833
1834
1835
1836static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1837{
1838 struct mem_cgroup *iter;
1839
1840 for_each_mem_cgroup_tree(iter, memcg)
1841 iter->oom_lock = false;
1842 return 0;
1843}
1844
1845static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1846{
1847 struct mem_cgroup *iter;
1848
1849 for_each_mem_cgroup_tree(iter, memcg)
1850 atomic_inc(&iter->under_oom);
1851}
1852
1853static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1854{
1855 struct mem_cgroup *iter;
1856
1857
1858
1859
1860
1861
1862 for_each_mem_cgroup_tree(iter, memcg)
1863 atomic_add_unless(&iter->under_oom, -1, 0);
1864}
1865
1866static DEFINE_SPINLOCK(memcg_oom_lock);
1867static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1868
1869struct oom_wait_info {
1870 struct mem_cgroup *memcg;
1871 wait_queue_t wait;
1872};
1873
1874static int memcg_oom_wake_function(wait_queue_t *wait,
1875 unsigned mode, int sync, void *arg)
1876{
1877 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1878 struct mem_cgroup *oom_wait_memcg;
1879 struct oom_wait_info *oom_wait_info;
1880
1881 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1882 oom_wait_memcg = oom_wait_info->memcg;
1883
1884
1885
1886
1887
1888 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1889 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1890 return 0;
1891 return autoremove_wake_function(wait, mode, sync, arg);
1892}
1893
1894static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1895{
1896
1897 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1898}
1899
1900static void memcg_oom_recover(struct mem_cgroup *memcg)
1901{
1902 if (memcg && atomic_read(&memcg->under_oom))
1903 memcg_wakeup_oom(memcg);
1904}
1905
1906
1907
1908
1909static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1910 int order)
1911{
1912 struct oom_wait_info owait;
1913 bool locked, need_to_kill;
1914
1915 owait.memcg = memcg;
1916 owait.wait.flags = 0;
1917 owait.wait.func = memcg_oom_wake_function;
1918 owait.wait.private = current;
1919 INIT_LIST_HEAD(&owait.wait.task_list);
1920 need_to_kill = true;
1921 mem_cgroup_mark_under_oom(memcg);
1922
1923
1924 spin_lock(&memcg_oom_lock);
1925 locked = mem_cgroup_oom_lock(memcg);
1926
1927
1928
1929
1930
1931 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1932 if (!locked || memcg->oom_kill_disable)
1933 need_to_kill = false;
1934 if (locked)
1935 mem_cgroup_oom_notify(memcg);
1936 spin_unlock(&memcg_oom_lock);
1937
1938 if (need_to_kill) {
1939 finish_wait(&memcg_oom_waitq, &owait.wait);
1940 mem_cgroup_out_of_memory(memcg, mask, order);
1941 } else {
1942 schedule();
1943 finish_wait(&memcg_oom_waitq, &owait.wait);
1944 }
1945 spin_lock(&memcg_oom_lock);
1946 if (locked)
1947 mem_cgroup_oom_unlock(memcg);
1948 memcg_wakeup_oom(memcg);
1949 spin_unlock(&memcg_oom_lock);
1950
1951 mem_cgroup_unmark_under_oom(memcg);
1952
1953 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1954 return false;
1955
1956 schedule_timeout_uninterruptible(1);
1957 return true;
1958}
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984void __mem_cgroup_begin_update_page_stat(struct page *page,
1985 bool *locked, unsigned long *flags)
1986{
1987 struct mem_cgroup *memcg;
1988 struct page_cgroup *pc;
1989
1990 pc = lookup_page_cgroup(page);
1991again:
1992 memcg = pc->mem_cgroup;
1993 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1994 return;
1995
1996
1997
1998
1999
2000
2001 if (!mem_cgroup_stolen(memcg))
2002 return;
2003
2004 move_lock_mem_cgroup(memcg, flags);
2005 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2006 move_unlock_mem_cgroup(memcg, flags);
2007 goto again;
2008 }
2009 *locked = true;
2010}
2011
2012void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2013{
2014 struct page_cgroup *pc = lookup_page_cgroup(page);
2015
2016
2017
2018
2019
2020
2021 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2022}
2023
2024void mem_cgroup_update_page_stat(struct page *page,
2025 enum mem_cgroup_page_stat_item idx, int val)
2026{
2027 struct mem_cgroup *memcg;
2028 struct page_cgroup *pc = lookup_page_cgroup(page);
2029 unsigned long uninitialized_var(flags);
2030
2031 if (mem_cgroup_disabled())
2032 return;
2033
2034 memcg = pc->mem_cgroup;
2035 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2036 return;
2037
2038 switch (idx) {
2039 case MEMCG_NR_FILE_MAPPED:
2040 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2041 break;
2042 default:
2043 BUG();
2044 }
2045
2046 this_cpu_add(memcg->stat->count[idx], val);
2047}
2048
2049
2050
2051
2052
2053#define CHARGE_BATCH 32U
2054struct memcg_stock_pcp {
2055 struct mem_cgroup *cached;
2056 unsigned int nr_pages;
2057 struct work_struct work;
2058 unsigned long flags;
2059#define FLUSHING_CACHED_CHARGE 0
2060};
2061static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2062static DEFINE_MUTEX(percpu_charge_mutex);
2063
2064
2065
2066
2067
2068
2069
2070static bool consume_stock(struct mem_cgroup *memcg)
2071{
2072 struct memcg_stock_pcp *stock;
2073 bool ret = true;
2074
2075 stock = &get_cpu_var(memcg_stock);
2076 if (memcg == stock->cached && stock->nr_pages)
2077 stock->nr_pages--;
2078 else
2079 ret = false;
2080 put_cpu_var(memcg_stock);
2081 return ret;
2082}
2083
2084
2085
2086
2087static void drain_stock(struct memcg_stock_pcp *stock)
2088{
2089 struct mem_cgroup *old = stock->cached;
2090
2091 if (stock->nr_pages) {
2092 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2093
2094 res_counter_uncharge(&old->res, bytes);
2095 if (do_swap_account)
2096 res_counter_uncharge(&old->memsw, bytes);
2097 stock->nr_pages = 0;
2098 }
2099 stock->cached = NULL;
2100}
2101
2102
2103
2104
2105
2106static void drain_local_stock(struct work_struct *dummy)
2107{
2108 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2109 drain_stock(stock);
2110 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2111}
2112
2113
2114
2115
2116
2117static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2118{
2119 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2120
2121 if (stock->cached != memcg) {
2122 drain_stock(stock);
2123 stock->cached = memcg;
2124 }
2125 stock->nr_pages += nr_pages;
2126 put_cpu_var(memcg_stock);
2127}
2128
2129
2130
2131
2132
2133
2134static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2135{
2136 int cpu, curcpu;
2137
2138
2139 get_online_cpus();
2140 curcpu = get_cpu();
2141 for_each_online_cpu(cpu) {
2142 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2143 struct mem_cgroup *memcg;
2144
2145 memcg = stock->cached;
2146 if (!memcg || !stock->nr_pages)
2147 continue;
2148 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2149 continue;
2150 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2151 if (cpu == curcpu)
2152 drain_local_stock(&stock->work);
2153 else
2154 schedule_work_on(cpu, &stock->work);
2155 }
2156 }
2157 put_cpu();
2158
2159 if (!sync)
2160 goto out;
2161
2162 for_each_online_cpu(cpu) {
2163 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2164 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2165 flush_work(&stock->work);
2166 }
2167out:
2168 put_online_cpus();
2169}
2170
2171
2172
2173
2174
2175
2176
2177static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2178{
2179
2180
2181
2182 if (!mutex_trylock(&percpu_charge_mutex))
2183 return;
2184 drain_all_stock(root_memcg, false);
2185 mutex_unlock(&percpu_charge_mutex);
2186}
2187
2188
2189static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2190{
2191
2192 mutex_lock(&percpu_charge_mutex);
2193 drain_all_stock(root_memcg, true);
2194 mutex_unlock(&percpu_charge_mutex);
2195}
2196
2197
2198
2199
2200
2201static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2202{
2203 int i;
2204
2205 spin_lock(&memcg->pcp_counter_lock);
2206 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2207 long x = per_cpu(memcg->stat->count[i], cpu);
2208
2209 per_cpu(memcg->stat->count[i], cpu) = 0;
2210 memcg->nocpu_base.count[i] += x;
2211 }
2212 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2213 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2214
2215 per_cpu(memcg->stat->events[i], cpu) = 0;
2216 memcg->nocpu_base.events[i] += x;
2217 }
2218 spin_unlock(&memcg->pcp_counter_lock);
2219}
2220
2221static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2222 unsigned long action,
2223 void *hcpu)
2224{
2225 int cpu = (unsigned long)hcpu;
2226 struct memcg_stock_pcp *stock;
2227 struct mem_cgroup *iter;
2228
2229 if (action == CPU_ONLINE)
2230 return NOTIFY_OK;
2231
2232 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2233 return NOTIFY_OK;
2234
2235 for_each_mem_cgroup(iter)
2236 mem_cgroup_drain_pcp_counter(iter, cpu);
2237
2238 stock = &per_cpu(memcg_stock, cpu);
2239 drain_stock(stock);
2240 return NOTIFY_OK;
2241}
2242
2243
2244
2245enum {
2246 CHARGE_OK,
2247 CHARGE_RETRY,
2248 CHARGE_NOMEM,
2249 CHARGE_WOULDBLOCK,
2250 CHARGE_OOM_DIE,
2251};
2252
2253static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2254 unsigned int nr_pages, bool oom_check)
2255{
2256 unsigned long csize = nr_pages * PAGE_SIZE;
2257 struct mem_cgroup *mem_over_limit;
2258 struct res_counter *fail_res;
2259 unsigned long flags = 0;
2260 int ret;
2261
2262 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2263
2264 if (likely(!ret)) {
2265 if (!do_swap_account)
2266 return CHARGE_OK;
2267 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2268 if (likely(!ret))
2269 return CHARGE_OK;
2270
2271 res_counter_uncharge(&memcg->res, csize);
2272 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2273 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2274 } else
2275 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2276
2277
2278
2279
2280
2281
2282
2283 if (nr_pages == CHARGE_BATCH)
2284 return CHARGE_RETRY;
2285
2286 if (!(gfp_mask & __GFP_WAIT))
2287 return CHARGE_WOULDBLOCK;
2288
2289 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2290 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2291 return CHARGE_RETRY;
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301 if (nr_pages == 1 && ret)
2302 return CHARGE_RETRY;
2303
2304
2305
2306
2307
2308 if (mem_cgroup_wait_acct_move(mem_over_limit))
2309 return CHARGE_RETRY;
2310
2311
2312 if (!oom_check)
2313 return CHARGE_NOMEM;
2314
2315 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2316 return CHARGE_OOM_DIE;
2317
2318 return CHARGE_RETRY;
2319}
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342static int __mem_cgroup_try_charge(struct mm_struct *mm,
2343 gfp_t gfp_mask,
2344 unsigned int nr_pages,
2345 struct mem_cgroup **ptr,
2346 bool oom)
2347{
2348 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2349 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2350 struct mem_cgroup *memcg = NULL;
2351 int ret;
2352
2353
2354
2355
2356
2357
2358 if (unlikely(test_thread_flag(TIF_MEMDIE)
2359 || fatal_signal_pending(current)))
2360 goto bypass;
2361
2362
2363
2364
2365
2366
2367
2368 if (!*ptr && !mm)
2369 *ptr = root_mem_cgroup;
2370again:
2371 if (*ptr) {
2372 memcg = *ptr;
2373 VM_BUG_ON(css_is_removed(&memcg->css));
2374 if (mem_cgroup_is_root(memcg))
2375 goto done;
2376 if (nr_pages == 1 && consume_stock(memcg))
2377 goto done;
2378 css_get(&memcg->css);
2379 } else {
2380 struct task_struct *p;
2381
2382 rcu_read_lock();
2383 p = rcu_dereference(mm->owner);
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394 memcg = mem_cgroup_from_task(p);
2395 if (!memcg)
2396 memcg = root_mem_cgroup;
2397 if (mem_cgroup_is_root(memcg)) {
2398 rcu_read_unlock();
2399 goto done;
2400 }
2401 if (nr_pages == 1 && consume_stock(memcg)) {
2402
2403
2404
2405
2406
2407
2408
2409
2410 rcu_read_unlock();
2411 goto done;
2412 }
2413
2414 if (!css_tryget(&memcg->css)) {
2415 rcu_read_unlock();
2416 goto again;
2417 }
2418 rcu_read_unlock();
2419 }
2420
2421 do {
2422 bool oom_check;
2423
2424
2425 if (fatal_signal_pending(current)) {
2426 css_put(&memcg->css);
2427 goto bypass;
2428 }
2429
2430 oom_check = false;
2431 if (oom && !nr_oom_retries) {
2432 oom_check = true;
2433 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2434 }
2435
2436 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2437 switch (ret) {
2438 case CHARGE_OK:
2439 break;
2440 case CHARGE_RETRY:
2441 batch = nr_pages;
2442 css_put(&memcg->css);
2443 memcg = NULL;
2444 goto again;
2445 case CHARGE_WOULDBLOCK:
2446 css_put(&memcg->css);
2447 goto nomem;
2448 case CHARGE_NOMEM:
2449 if (!oom) {
2450 css_put(&memcg->css);
2451 goto nomem;
2452 }
2453
2454 nr_oom_retries--;
2455 break;
2456 case CHARGE_OOM_DIE:
2457 css_put(&memcg->css);
2458 goto bypass;
2459 }
2460 } while (ret != CHARGE_OK);
2461
2462 if (batch > nr_pages)
2463 refill_stock(memcg, batch - nr_pages);
2464 css_put(&memcg->css);
2465done:
2466 *ptr = memcg;
2467 return 0;
2468nomem:
2469 *ptr = NULL;
2470 return -ENOMEM;
2471bypass:
2472 *ptr = root_mem_cgroup;
2473 return -EINTR;
2474}
2475
2476
2477
2478
2479
2480
2481static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2482 unsigned int nr_pages)
2483{
2484 if (!mem_cgroup_is_root(memcg)) {
2485 unsigned long bytes = nr_pages * PAGE_SIZE;
2486
2487 res_counter_uncharge(&memcg->res, bytes);
2488 if (do_swap_account)
2489 res_counter_uncharge(&memcg->memsw, bytes);
2490 }
2491}
2492
2493
2494
2495
2496
2497static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2498 unsigned int nr_pages)
2499{
2500 unsigned long bytes = nr_pages * PAGE_SIZE;
2501
2502 if (mem_cgroup_is_root(memcg))
2503 return;
2504
2505 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2506 if (do_swap_account)
2507 res_counter_uncharge_until(&memcg->memsw,
2508 memcg->memsw.parent, bytes);
2509}
2510
2511
2512
2513
2514
2515
2516
2517static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2518{
2519 struct cgroup_subsys_state *css;
2520
2521
2522 if (!id)
2523 return NULL;
2524 css = css_lookup(&mem_cgroup_subsys, id);
2525 if (!css)
2526 return NULL;
2527 return mem_cgroup_from_css(css);
2528}
2529
2530struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2531{
2532 struct mem_cgroup *memcg = NULL;
2533 struct page_cgroup *pc;
2534 unsigned short id;
2535 swp_entry_t ent;
2536
2537 VM_BUG_ON(!PageLocked(page));
2538
2539 pc = lookup_page_cgroup(page);
2540 lock_page_cgroup(pc);
2541 if (PageCgroupUsed(pc)) {
2542 memcg = pc->mem_cgroup;
2543 if (memcg && !css_tryget(&memcg->css))
2544 memcg = NULL;
2545 } else if (PageSwapCache(page)) {
2546 ent.val = page_private(page);
2547 id = lookup_swap_cgroup_id(ent);
2548 rcu_read_lock();
2549 memcg = mem_cgroup_lookup(id);
2550 if (memcg && !css_tryget(&memcg->css))
2551 memcg = NULL;
2552 rcu_read_unlock();
2553 }
2554 unlock_page_cgroup(pc);
2555 return memcg;
2556}
2557
2558static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2559 struct page *page,
2560 unsigned int nr_pages,
2561 enum charge_type ctype,
2562 bool lrucare)
2563{
2564 struct page_cgroup *pc = lookup_page_cgroup(page);
2565 struct zone *uninitialized_var(zone);
2566 struct lruvec *lruvec;
2567 bool was_on_lru = false;
2568 bool anon;
2569
2570 lock_page_cgroup(pc);
2571 VM_BUG_ON(PageCgroupUsed(pc));
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581 if (lrucare) {
2582 zone = page_zone(page);
2583 spin_lock_irq(&zone->lru_lock);
2584 if (PageLRU(page)) {
2585 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2586 ClearPageLRU(page);
2587 del_page_from_lru_list(page, lruvec, page_lru(page));
2588 was_on_lru = true;
2589 }
2590 }
2591
2592 pc->mem_cgroup = memcg;
2593
2594
2595
2596
2597
2598
2599
2600 smp_wmb();
2601 SetPageCgroupUsed(pc);
2602
2603 if (lrucare) {
2604 if (was_on_lru) {
2605 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2606 VM_BUG_ON(PageLRU(page));
2607 SetPageLRU(page);
2608 add_page_to_lru_list(page, lruvec, page_lru(page));
2609 }
2610 spin_unlock_irq(&zone->lru_lock);
2611 }
2612
2613 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2614 anon = true;
2615 else
2616 anon = false;
2617
2618 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2619 unlock_page_cgroup(pc);
2620
2621
2622
2623
2624
2625
2626 memcg_check_events(memcg, page);
2627}
2628
2629#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2630
2631#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2632
2633
2634
2635
2636
2637
2638void mem_cgroup_split_huge_fixup(struct page *head)
2639{
2640 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2641 struct page_cgroup *pc;
2642 int i;
2643
2644 if (mem_cgroup_disabled())
2645 return;
2646 for (i = 1; i < HPAGE_PMD_NR; i++) {
2647 pc = head_pc + i;
2648 pc->mem_cgroup = head_pc->mem_cgroup;
2649 smp_wmb();
2650 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2651 }
2652}
2653#endif
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670static int mem_cgroup_move_account(struct page *page,
2671 unsigned int nr_pages,
2672 struct page_cgroup *pc,
2673 struct mem_cgroup *from,
2674 struct mem_cgroup *to)
2675{
2676 unsigned long flags;
2677 int ret;
2678 bool anon = PageAnon(page);
2679
2680 VM_BUG_ON(from == to);
2681 VM_BUG_ON(PageLRU(page));
2682
2683
2684
2685
2686
2687
2688 ret = -EBUSY;
2689 if (nr_pages > 1 && !PageTransHuge(page))
2690 goto out;
2691
2692 lock_page_cgroup(pc);
2693
2694 ret = -EINVAL;
2695 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2696 goto unlock;
2697
2698 move_lock_mem_cgroup(from, &flags);
2699
2700 if (!anon && page_mapped(page)) {
2701
2702 preempt_disable();
2703 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2704 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2705 preempt_enable();
2706 }
2707 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2708
2709
2710 pc->mem_cgroup = to;
2711 mem_cgroup_charge_statistics(to, anon, nr_pages);
2712
2713
2714
2715
2716
2717
2718
2719 move_unlock_mem_cgroup(from, &flags);
2720 ret = 0;
2721unlock:
2722 unlock_page_cgroup(pc);
2723
2724
2725
2726 memcg_check_events(to, page);
2727 memcg_check_events(from, page);
2728out:
2729 return ret;
2730}
2731
2732
2733
2734
2735
2736static int mem_cgroup_move_parent(struct page *page,
2737 struct page_cgroup *pc,
2738 struct mem_cgroup *child)
2739{
2740 struct mem_cgroup *parent;
2741 unsigned int nr_pages;
2742 unsigned long uninitialized_var(flags);
2743 int ret;
2744
2745
2746 if (mem_cgroup_is_root(child))
2747 return -EINVAL;
2748
2749 ret = -EBUSY;
2750 if (!get_page_unless_zero(page))
2751 goto out;
2752 if (isolate_lru_page(page))
2753 goto put;
2754
2755 nr_pages = hpage_nr_pages(page);
2756
2757 parent = parent_mem_cgroup(child);
2758
2759
2760
2761 if (!parent)
2762 parent = root_mem_cgroup;
2763
2764 if (nr_pages > 1)
2765 flags = compound_lock_irqsave(page);
2766
2767 ret = mem_cgroup_move_account(page, nr_pages,
2768 pc, child, parent);
2769 if (!ret)
2770 __mem_cgroup_cancel_local_charge(child, nr_pages);
2771
2772 if (nr_pages > 1)
2773 compound_unlock_irqrestore(page, flags);
2774 putback_lru_page(page);
2775put:
2776 put_page(page);
2777out:
2778 return ret;
2779}
2780
2781
2782
2783
2784
2785
2786
2787static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2788 gfp_t gfp_mask, enum charge_type ctype)
2789{
2790 struct mem_cgroup *memcg = NULL;
2791 unsigned int nr_pages = 1;
2792 bool oom = true;
2793 int ret;
2794
2795 if (PageTransHuge(page)) {
2796 nr_pages <<= compound_order(page);
2797 VM_BUG_ON(!PageTransHuge(page));
2798
2799
2800
2801
2802 oom = false;
2803 }
2804
2805 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2806 if (ret == -ENOMEM)
2807 return ret;
2808 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
2809 return 0;
2810}
2811
2812int mem_cgroup_newpage_charge(struct page *page,
2813 struct mm_struct *mm, gfp_t gfp_mask)
2814{
2815 if (mem_cgroup_disabled())
2816 return 0;
2817 VM_BUG_ON(page_mapped(page));
2818 VM_BUG_ON(page->mapping && !PageAnon(page));
2819 VM_BUG_ON(!mm);
2820 return mem_cgroup_charge_common(page, mm, gfp_mask,
2821 MEM_CGROUP_CHARGE_TYPE_ANON);
2822}
2823
2824
2825
2826
2827
2828
2829
2830static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2831 struct page *page,
2832 gfp_t mask,
2833 struct mem_cgroup **memcgp)
2834{
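 /*
  * Charge a page that is being brought in from swap.  Nothing to do if
  * the page is already accounted.  With swap accounting enabled, the
  * memcg returned by try_get_mem_cgroup_from_page() is charged while it
  * is still alive; otherwise the charge falls back to @mm's memcg.
  */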
2835 struct mem_cgroup *memcg;
2836 struct page_cgroup *pc;
2837 int ret;
2838
2839 pc = lookup_page_cgroup(page);
2840
2841
2842
2843
2844
2845
2846
2847 if (PageCgroupUsed(pc))
2848 return 0;
2849 if (!do_swap_account)
2850 goto charge_cur_mm;
2851 memcg = try_get_mem_cgroup_from_page(page);
2852 if (!memcg)
2853 goto charge_cur_mm;
2854 *memcgp = memcg;
2855 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2856 css_put(&memcg->css);
2857 if (ret == -EINTR)
2858 ret = 0;
2859 return ret;
2860charge_cur_mm:
2861 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2862 if (ret == -EINTR)
2863 ret = 0;
2864 return ret;
2865}
2866
2867int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2868 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2869{
2870 *memcgp = NULL;
2871 if (mem_cgroup_disabled())
2872 return 0;
2873
2874
2875
2876
2877
2878
2879 if (!PageSwapCache(page)) {
2880 int ret;
2881
2882 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2883 if (ret == -EINTR)
2884 ret = 0;
2885 return ret;
2886 }
2887 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2888}
2889
2890void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2891{
2892 if (mem_cgroup_disabled())
2893 return;
2894 if (!memcg)
2895 return;
2896 __mem_cgroup_cancel_charge(memcg, 1);
2897}
2898
2899static void
2900__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2901 enum charge_type ctype)
2902{
2903 if (mem_cgroup_disabled())
2904 return;
2905 if (!memcg)
2906 return;
2907 cgroup_exclude_rmdir(&memcg->css);
2908
2909 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2910
2911
2912
2913
2914
2915
2916
2917 if (do_swap_account && PageSwapCache(page)) {
2918 swp_entry_t ent = {.val = page_private(page)};
2919 mem_cgroup_uncharge_swap(ent);
2920 }
2921
2922
2923
2924
2925
2926 cgroup_release_and_wakeup_rmdir(&memcg->css);
2927}
2928
2929void mem_cgroup_commit_charge_swapin(struct page *page,
2930 struct mem_cgroup *memcg)
2931{
2932 __mem_cgroup_commit_charge_swapin(page, memcg,
2933 MEM_CGROUP_CHARGE_TYPE_ANON);
2934}
2935
2936int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2937 gfp_t gfp_mask)
2938{
2939 struct mem_cgroup *memcg = NULL;
2940 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2941 int ret;
2942
2943 if (mem_cgroup_disabled())
2944 return 0;
2945 if (PageCompound(page))
2946 return 0;
2947
2948 if (!PageSwapCache(page))
2949 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2950 else {
2951 ret = __mem_cgroup_try_charge_swapin(mm, page,
2952 gfp_mask, &memcg);
2953 if (!ret)
2954 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2955 }
2956 return ret;
2957}
2958
2959static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2960 unsigned int nr_pages,
2961 const enum charge_type ctype)
2962{
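 /*
  * Queue an uncharge of @nr_pages against @memcg.  Inside a
  * mem_cgroup_uncharge_start()/end() section the pages are merely
  * counted in current->memcg_batch and the res_counters are touched
  * once at mem_cgroup_uncharge_end().  Huge pages, OOM victims and a
  * change of memcg bypass the batch and uncharge directly.
  */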
2963 struct memcg_batch_info *batch = NULL;
2964 bool uncharge_memsw = true;
2965
2966
2967 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2968 uncharge_memsw = false;
2969
2970 batch = &current->memcg_batch;
2971
2972
2973
2974
2975
2976 if (!batch->memcg)
2977 batch->memcg = memcg;
2978
2979
2980
2981
2982
2983
2984
2985
2986 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2987 goto direct_uncharge;
2988
2989 if (nr_pages > 1)
2990 goto direct_uncharge;
2991
2992
2993
2994
2995
2996
2997 if (batch->memcg != memcg)
2998 goto direct_uncharge;
2999
3000 batch->nr_pages++;
3001 if (uncharge_memsw)
3002 batch->memsw_nr_pages++;
3003 return;
3004direct_uncharge:
3005 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3006 if (uncharge_memsw)
3007 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3008 if (unlikely(batch->memcg != memcg))
3009 memcg_oom_recover(memcg);
3010}
3011
3012
3013
3014
3015static struct mem_cgroup *
3016__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
3017 bool end_migration)
3018{
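 /*
  * Common uncharge path.  Returns the memcg the page was charged to, or
  * NULL when the page is not accounted or must not be uncharged yet
  * (still mapped, or flagged as under migration).  On swap-out the swap
  * statistics are bumped and a reference on the memcg is taken so it
  * survives until the swap entry itself is uncharged.
  */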
3019 struct mem_cgroup *memcg = NULL;
3020 unsigned int nr_pages = 1;
3021 struct page_cgroup *pc;
3022 bool anon;
3023
3024 if (mem_cgroup_disabled())
3025 return NULL;
3026
3027 VM_BUG_ON(PageSwapCache(page));
3028
3029 if (PageTransHuge(page)) {
3030 nr_pages <<= compound_order(page);
3031 VM_BUG_ON(!PageTransHuge(page));
3032 }
3033
3034
3035
3036 pc = lookup_page_cgroup(page);
3037 if (unlikely(!PageCgroupUsed(pc)))
3038 return NULL;
3039
3040 lock_page_cgroup(pc);
3041
3042 memcg = pc->mem_cgroup;
3043
3044 if (!PageCgroupUsed(pc))
3045 goto unlock_out;
3046
3047 anon = PageAnon(page);
3048
3049 switch (ctype) {
3050 case MEM_CGROUP_CHARGE_TYPE_ANON:
3051
3052
3053
3054
3055
3056 anon = true;
3057 /* fall through */
3058 case MEM_CGROUP_CHARGE_TYPE_DROP:
3059
3060 if (page_mapped(page))
3061 goto unlock_out;
3062
3063
3064
3065
3066
3067
3068
3069 if (!end_migration && PageCgroupMigration(pc))
3070 goto unlock_out;
3071 break;
3072 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3073 if (!PageAnon(page)) {
3074 if (page->mapping && !page_is_file_cache(page))
3075 goto unlock_out;
3076 } else if (page_mapped(page))
3077 goto unlock_out;
3078 break;
3079 default:
3080 break;
3081 }
3082
3083 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
3084
3085 ClearPageCgroupUsed(pc);
3086
3087
3088
3089
3090
3091
3092
3093 unlock_page_cgroup(pc);
3094
3095
3096
3097
3098 memcg_check_events(memcg, page);
3099 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3100 mem_cgroup_swap_statistics(memcg, true);
3101 mem_cgroup_get(memcg);
3102 }
3103
3104
3105
3106
3107
3108 if (!end_migration && !mem_cgroup_is_root(memcg))
3109 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3110
3111 return memcg;
3112
3113unlock_out:
3114 unlock_page_cgroup(pc);
3115 return NULL;
3116}
3117
3118void mem_cgroup_uncharge_page(struct page *page)
3119{
3120
3121 if (page_mapped(page))
3122 return;
3123 VM_BUG_ON(page->mapping && !PageAnon(page));
3124 if (PageSwapCache(page))
3125 return;
3126 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3127}
3128
3129void mem_cgroup_uncharge_cache_page(struct page *page)
3130{
3131 VM_BUG_ON(page_mapped(page));
3132 VM_BUG_ON(page->mapping);
3133 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3134}
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144void mem_cgroup_uncharge_start(void)
3145{
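 /*
  * Open a batched-uncharge section.  A sketch of how callers are
  * expected to pair these (the callers themselves live elsewhere in mm):
  *
  *	mem_cgroup_uncharge_start();
  *	for each page being released
  *		mem_cgroup_uncharge_page(page);	(or _cache_page())
  *	mem_cgroup_uncharge_end();
  *
  * Sections may nest; only the outermost mem_cgroup_uncharge_end()
  * flushes the accumulated res_counter uncharges.
  */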
3146 current->memcg_batch.do_batch++;
3147
3148 if (current->memcg_batch.do_batch == 1) {
3149 current->memcg_batch.memcg = NULL;
3150 current->memcg_batch.nr_pages = 0;
3151 current->memcg_batch.memsw_nr_pages = 0;
3152 }
3153}
3154
3155void mem_cgroup_uncharge_end(void)
3156{
3157 struct memcg_batch_info *batch = &current->memcg_batch;
3158
3159 if (!batch->do_batch)
3160 return;
3161
3162 batch->do_batch--;
3163 if (batch->do_batch)
3164 return;
3165
3166 if (!batch->memcg)
3167 return;
3168
3169
3170
3171
3172 if (batch->nr_pages)
3173 res_counter_uncharge(&batch->memcg->res,
3174 batch->nr_pages * PAGE_SIZE);
3175 if (batch->memsw_nr_pages)
3176 res_counter_uncharge(&batch->memcg->memsw,
3177 batch->memsw_nr_pages * PAGE_SIZE);
3178 memcg_oom_recover(batch->memcg);
3179
3180 batch->memcg = NULL;
3181}
3182
3183#ifdef CONFIG_SWAP
3184
3185
3186
3187
3188void
3189mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3190{
3191 struct mem_cgroup *memcg;
3192 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3193
3194 if (!swapout)
3195 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3196
3197 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3198
3199
3200
3201
3202
3203 if (do_swap_account && swapout && memcg)
3204 swap_cgroup_record(ent, css_id(&memcg->css));
3205}
3206#endif
3207
3208#ifdef CONFIG_MEMCG_SWAP
3209
3210
3211
3212
3213void mem_cgroup_uncharge_swap(swp_entry_t ent)
3214{
3215 struct mem_cgroup *memcg;
3216 unsigned short id;
3217
3218 if (!do_swap_account)
3219 return;
3220
3221 id = swap_cgroup_record(ent, 0);
3222 rcu_read_lock();
3223 memcg = mem_cgroup_lookup(id);
3224 if (memcg) {
3225
3226
3227
3228
3229 if (!mem_cgroup_is_root(memcg))
3230 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3231 mem_cgroup_swap_statistics(memcg, false);
3232 mem_cgroup_put(memcg);
3233 }
3234 rcu_read_unlock();
3235}
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251static int mem_cgroup_move_swap_account(swp_entry_t entry,
3252 struct mem_cgroup *from, struct mem_cgroup *to)
3253{
3254 unsigned short old_id, new_id;
3255
3256 old_id = css_id(&from->css);
3257 new_id = css_id(&to->css);
3258
3259 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3260 mem_cgroup_swap_statistics(from, false);
3261 mem_cgroup_swap_statistics(to, true);
3262
3263
3264
3265
3266
3267
3268
3269
3270 mem_cgroup_get(to);
3271 return 0;
3272 }
3273 return -EINVAL;
3274}
3275#else
3276static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3277 struct mem_cgroup *from, struct mem_cgroup *to)
3278{
3279 return -EINVAL;
3280}
3281#endif
3282
3283
3284
3285
3286
3287void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3288 struct mem_cgroup **memcgp)
3289{
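 /*
  * Called before migrating @page to @newpage.  If the old page is
  * charged, the same memcg is committed to @newpage up front and, for
  * anonymous pages, PCG_MIGRATION is set so a concurrent unmap does not
  * uncharge the page before mem_cgroup_end_migration() decides which of
  * the two pages keeps the charge.
  */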
3290 struct mem_cgroup *memcg = NULL;
3291 struct page_cgroup *pc;
3292 enum charge_type ctype;
3293
3294 *memcgp = NULL;
3295
3296 VM_BUG_ON(PageTransHuge(page));
3297 if (mem_cgroup_disabled())
3298 return;
3299
3300 pc = lookup_page_cgroup(page);
3301 lock_page_cgroup(pc);
3302 if (PageCgroupUsed(pc)) {
3303 memcg = pc->mem_cgroup;
3304 css_get(&memcg->css);
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334 if (PageAnon(page))
3335 SetPageCgroupMigration(pc);
3336 }
3337 unlock_page_cgroup(pc);
3338
3339
3340
3341
3342 if (!memcg)
3343 return;
3344
3345 *memcgp = memcg;
3346
3347
3348
3349
3350
3351
3352 if (PageAnon(page))
3353 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3354 else
3355 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3356
3357
3358
3359
3360
3361 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3362}
3363
3364
3365void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3366 struct page *oldpage, struct page *newpage, bool migration_ok)
3367{
3368 struct page *used, *unused;
3369 struct page_cgroup *pc;
3370 bool anon;
3371
3372 if (!memcg)
3373 return;
3374
3375 cgroup_exclude_rmdir(&memcg->css);
3376 if (!migration_ok) {
3377 used = oldpage;
3378 unused = newpage;
3379 } else {
3380 used = newpage;
3381 unused = oldpage;
3382 }
3383 anon = PageAnon(used);
3384 __mem_cgroup_uncharge_common(unused,
3385 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3386 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3387 true);
3388 css_put(&memcg->css);
3389
3390
3391
3392
3393
3394 pc = lookup_page_cgroup(oldpage);
3395 lock_page_cgroup(pc);
3396 ClearPageCgroupMigration(pc);
3397 unlock_page_cgroup(pc);
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407 if (anon)
3408 mem_cgroup_uncharge_page(used);
3409
3410
3411
3412
3413
3414
3415 cgroup_release_and_wakeup_rmdir(&memcg->css);
3416}
3417
3418
3419
3420
3421
3422
3423void mem_cgroup_replace_page_cache(struct page *oldpage,
3424 struct page *newpage)
3425{
3426 struct mem_cgroup *memcg = NULL;
3427 struct page_cgroup *pc;
3428 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3429
3430 if (mem_cgroup_disabled())
3431 return;
3432
3433 pc = lookup_page_cgroup(oldpage);
3434
3435 lock_page_cgroup(pc);
3436 if (PageCgroupUsed(pc)) {
3437 memcg = pc->mem_cgroup;
3438 mem_cgroup_charge_statistics(memcg, false, -1);
3439 ClearPageCgroupUsed(pc);
3440 }
3441 unlock_page_cgroup(pc);
3442
3443
3444
3445
3446
3447 if (!memcg)
3448 return;
3449
3450
3451
3452
3453
3454 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
3455}
3456
3457#ifdef CONFIG_DEBUG_VM
3458static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3459{
3460 struct page_cgroup *pc;
3461
3462 pc = lookup_page_cgroup(page);
3463
3464
3465
3466
3467
3468 if (likely(pc) && PageCgroupUsed(pc))
3469 return pc;
3470 return NULL;
3471}
3472
3473bool mem_cgroup_bad_page_check(struct page *page)
3474{
3475 if (mem_cgroup_disabled())
3476 return false;
3477
3478 return lookup_page_cgroup_used(page) != NULL;
3479}
3480
3481void mem_cgroup_print_bad_page(struct page *page)
3482{
3483 struct page_cgroup *pc;
3484
3485 pc = lookup_page_cgroup_used(page);
3486 if (pc) {
3487 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3488 pc, pc->flags, pc->mem_cgroup);
3489 }
3490}
3491#endif
3492
3493static DEFINE_MUTEX(set_limit_mutex);
3494
3495static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3496 unsigned long long val)
3497{
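 /*
  * Change memory.limit_in_bytes.  The new limit may not exceed the
  * current memsw limit.  When shrinking, reclaim is retried roughly
  * MEM_CGROUP_RECLAIM_RETRIES times per child cgroup, and a retry is
  * only consumed when usage failed to drop.
  */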
3498 int retry_count;
3499 u64 memswlimit, memlimit;
3500 int ret = 0;
3501 int children = mem_cgroup_count_children(memcg);
3502 u64 curusage, oldusage;
3503 int enlarge;
3504
3505
3506
3507
3508
3509
3510 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3511
3512 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3513
3514 enlarge = 0;
3515 while (retry_count) {
3516 if (signal_pending(current)) {
3517 ret = -EINTR;
3518 break;
3519 }
3520
3521
3522
3523
3524
3525 mutex_lock(&set_limit_mutex);
3526 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3527 if (memswlimit < val) {
3528 ret = -EINVAL;
3529 mutex_unlock(&set_limit_mutex);
3530 break;
3531 }
3532
3533 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3534 if (memlimit < val)
3535 enlarge = 1;
3536
3537 ret = res_counter_set_limit(&memcg->res, val);
3538 if (!ret) {
3539 if (memswlimit == val)
3540 memcg->memsw_is_minimum = true;
3541 else
3542 memcg->memsw_is_minimum = false;
3543 }
3544 mutex_unlock(&set_limit_mutex);
3545
3546 if (!ret)
3547 break;
3548
3549 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3550 MEM_CGROUP_RECLAIM_SHRINK);
3551 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3552
3553 if (curusage >= oldusage)
3554 retry_count--;
3555 else
3556 oldusage = curusage;
3557 }
3558 if (!ret && enlarge)
3559 memcg_oom_recover(memcg);
3560
3561 return ret;
3562}
3563
3564static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3565 unsigned long long val)
3566{
3567 int retry_count;
3568 u64 memlimit, memswlimit, oldusage, curusage;
3569 int children = mem_cgroup_count_children(memcg);
3570 int ret = -EBUSY;
3571 int enlarge = 0;
3572
3573
3574 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3575 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3576 while (retry_count) {
3577 if (signal_pending(current)) {
3578 ret = -EINTR;
3579 break;
3580 }
3581
3582
3583
3584
3585
3586 mutex_lock(&set_limit_mutex);
3587 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3588 if (memlimit > val) {
3589 ret = -EINVAL;
3590 mutex_unlock(&set_limit_mutex);
3591 break;
3592 }
3593 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3594 if (memswlimit < val)
3595 enlarge = 1;
3596 ret = res_counter_set_limit(&memcg->memsw, val);
3597 if (!ret) {
3598 if (memlimit == val)
3599 memcg->memsw_is_minimum = true;
3600 else
3601 memcg->memsw_is_minimum = false;
3602 }
3603 mutex_unlock(&set_limit_mutex);
3604
3605 if (!ret)
3606 break;
3607
3608 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3609 MEM_CGROUP_RECLAIM_NOSWAP |
3610 MEM_CGROUP_RECLAIM_SHRINK);
3611 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3612
3613 if (curusage >= oldusage)
3614 retry_count--;
3615 else
3616 oldusage = curusage;
3617 }
3618 if (!ret && enlarge)
3619 memcg_oom_recover(memcg);
3620 return ret;
3621}
3622
3623unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3624 gfp_t gfp_mask,
3625 unsigned long *total_scanned)
3626{
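 /*
  * Reclaim from the memcg that exceeds its soft limit the most in this
  * zone.  Only order-0 allocations take this path.  Victims are pulled
  * from the per-zone soft-limit rb-tree, reclaimed from, and reinserted
  * with their updated excess; the loop ends once something was
  * reclaimed or the tree has been scanned enough times.
  */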
3627 unsigned long nr_reclaimed = 0;
3628 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3629 unsigned long reclaimed;
3630 int loop = 0;
3631 struct mem_cgroup_tree_per_zone *mctz;
3632 unsigned long long excess;
3633 unsigned long nr_scanned;
3634
3635 if (order > 0)
3636 return 0;
3637
3638 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3639
3640
3641
3642
3643
3644 do {
3645 if (next_mz)
3646 mz = next_mz;
3647 else
3648 mz = mem_cgroup_largest_soft_limit_node(mctz);
3649 if (!mz)
3650 break;
3651
3652 nr_scanned = 0;
3653 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3654 gfp_mask, &nr_scanned);
3655 nr_reclaimed += reclaimed;
3656 *total_scanned += nr_scanned;
3657 spin_lock(&mctz->lock);
3658
3659
3660
3661
3662
3663 next_mz = NULL;
3664 if (!reclaimed) {
3665 do {
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677 next_mz =
3678 __mem_cgroup_largest_soft_limit_node(mctz);
3679 if (next_mz == mz)
3680 css_put(&next_mz->memcg->css);
3681 else
3682 break;
3683 } while (1);
3684 }
3685 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3686 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3697 spin_unlock(&mctz->lock);
3698 css_put(&mz->memcg->css);
3699 loop++;
3700
3701
3702
3703
3704
3705 if (!nr_reclaimed &&
3706 (next_mz == NULL ||
3707 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3708 break;
3709 } while (!nr_reclaimed);
3710 if (next_mz)
3711 css_put(&next_mz->memcg->css);
3712 return nr_reclaimed;
3713}
3714
3715
3716
3717
3718
3719
3720
3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3722 int node, int zid, enum lru_list lru)
3723{
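 /*
  * Drain one LRU list of @memcg in the given node/zone by moving every
  * page's charge to the parent cgroup.  A page that could not be moved
  * is remembered in @busy and rotated so the scan keeps making
  * progress; returns true when the list is still non-empty.
  */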
3724 struct lruvec *lruvec;
3725 unsigned long flags, loop;
3726 struct list_head *list;
3727 struct page *busy;
3728 struct zone *zone;
3729
3730 zone = &NODE_DATA(node)->node_zones[zid];
3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732 list = &lruvec->lists[lru];
3733
3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3735
3736 loop += 256;
3737 busy = NULL;
3738 while (loop--) {
3739 struct page_cgroup *pc;
3740 struct page *page;
3741
3742 spin_lock_irqsave(&zone->lru_lock, flags);
3743 if (list_empty(list)) {
3744 spin_unlock_irqrestore(&zone->lru_lock, flags);
3745 break;
3746 }
3747 page = list_entry(list->prev, struct page, lru);
3748 if (busy == page) {
3749 list_move(&page->lru, list);
3750 busy = NULL;
3751 spin_unlock_irqrestore(&zone->lru_lock, flags);
3752 continue;
3753 }
3754 spin_unlock_irqrestore(&zone->lru_lock, flags);
3755
3756 pc = lookup_page_cgroup(page);
3757
3758 if (mem_cgroup_move_parent(page, pc, memcg)) {
3759
3760 busy = page;
3761 cond_resched();
3762 } else
3763 busy = NULL;
3764 }
3765 return !list_empty(list);
3766}
3767
3768
3769
3770
3771
3772static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3773{
3774 int ret;
3775 int node, zid, shrink;
3776 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3777 struct cgroup *cgrp = memcg->css.cgroup;
3778
3779 css_get(&memcg->css);
3780
3781 shrink = 0;
3782
3783 if (free_all)
3784 goto try_to_free;
3785move_account:
3786 do {
3787 ret = -EBUSY;
3788 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3789 goto out;
3790
3791 lru_add_drain_all();
3792 drain_all_stock_sync(memcg);
3793 ret = 0;
3794 mem_cgroup_start_move(memcg);
3795 for_each_node_state(node, N_HIGH_MEMORY) {
3796 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3797 enum lru_list lru;
3798 for_each_lru(lru) {
3799 ret = mem_cgroup_force_empty_list(memcg,
3800 node, zid, lru);
3801 if (ret)
3802 break;
3803 }
3804 }
3805 if (ret)
3806 break;
3807 }
3808 mem_cgroup_end_move(memcg);
3809 memcg_oom_recover(memcg);
3810 cond_resched();
3811
3812 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3813out:
3814 css_put(&memcg->css);
3815 return ret;
3816
3817try_to_free:
3818
3819 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3820 ret = -EBUSY;
3821 goto out;
3822 }
3823
3824 lru_add_drain_all();
3825
3826 shrink = 1;
3827 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3828 int progress;
3829
3830 if (signal_pending(current)) {
3831 ret = -EINTR;
3832 goto out;
3833 }
3834 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3835 false);
3836 if (!progress) {
3837 nr_retries--;
3838
3839 congestion_wait(BLK_RW_ASYNC, HZ/10);
3840 }
3841
3842 }
3843 lru_add_drain();
3844
3845 goto move_account;
3846}
3847
3848static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3849{
3850 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3851}
3852
3853
3854static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3855{
3856 return mem_cgroup_from_cont(cont)->use_hierarchy;
3857}
3858
3859static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3860 u64 val)
3861{
3862 int retval = 0;
3863 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3864 struct cgroup *parent = cont->parent;
3865 struct mem_cgroup *parent_memcg = NULL;
3866
3867 if (parent)
3868 parent_memcg = mem_cgroup_from_cont(parent);
3869
3870 cgroup_lock();
3871
3872 if (memcg->use_hierarchy == val)
3873 goto out;
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3884 (val == 1 || val == 0)) {
3885 if (list_empty(&cont->children))
3886 memcg->use_hierarchy = val;
3887 else
3888 retval = -EBUSY;
3889 } else
3890 retval = -EINVAL;
3891
3892out:
3893 cgroup_unlock();
3894
3895 return retval;
3896}
3897
3898
3899static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3900 enum mem_cgroup_stat_index idx)
3901{
3902 struct mem_cgroup *iter;
3903 long val = 0;
3904
3905
3906 for_each_mem_cgroup_tree(iter, memcg)
3907 val += mem_cgroup_read_stat(iter, idx);
3908
3909 if (val < 0)
3910 val = 0;
3911 return val;
3912}
3913
3914static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3915{
3916 u64 val;
3917
3918 if (!mem_cgroup_is_root(memcg)) {
3919 if (!swap)
3920 return res_counter_read_u64(&memcg->res, RES_USAGE);
3921 else
3922 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3923 }
3924
3925 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3926 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3927
3928 if (swap)
3929 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3930
3931 return val << PAGE_SHIFT;
3932}
3933
3934static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3935 struct file *file, char __user *buf,
3936 size_t nbytes, loff_t *ppos)
3937{
3938 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3939 char str[64];
3940 u64 val;
3941 int type, name, len;
3942
3943 type = MEMFILE_TYPE(cft->private);
3944 name = MEMFILE_ATTR(cft->private);
3945
3946 if (!do_swap_account && type == _MEMSWAP)
3947 return -EOPNOTSUPP;
3948
3949 switch (type) {
3950 case _MEM:
3951 if (name == RES_USAGE)
3952 val = mem_cgroup_usage(memcg, false);
3953 else
3954 val = res_counter_read_u64(&memcg->res, name);
3955 break;
3956 case _MEMSWAP:
3957 if (name == RES_USAGE)
3958 val = mem_cgroup_usage(memcg, true);
3959 else
3960 val = res_counter_read_u64(&memcg->memsw, name);
3961 break;
3962 default:
3963 BUG();
3964 }
3965
3966 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3967 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3968}
3969
3970
3971
3972
3973static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3974 const char *buffer)
3975{
3976 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3977 int type, name;
3978 unsigned long long val;
3979 int ret;
3980
3981 type = MEMFILE_TYPE(cft->private);
3982 name = MEMFILE_ATTR(cft->private);
3983
3984 if (!do_swap_account && type == _MEMSWAP)
3985 return -EOPNOTSUPP;
3986
3987 switch (name) {
3988 case RES_LIMIT:
3989 if (mem_cgroup_is_root(memcg)) {
3990 ret = -EINVAL;
3991 break;
3992 }
3993
3994 ret = res_counter_memparse_write_strategy(buffer, &val);
3995 if (ret)
3996 break;
3997 if (type == _MEM)
3998 ret = mem_cgroup_resize_limit(memcg, val);
3999 else
4000 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4001 break;
4002 case RES_SOFT_LIMIT:
4003 ret = res_counter_memparse_write_strategy(buffer, &val);
4004 if (ret)
4005 break;
4006
4007
4008
4009
4010
4011 if (type == _MEM)
4012 ret = res_counter_set_soft_limit(&memcg->res, val);
4013 else
4014 ret = -EINVAL;
4015 break;
4016 default:
4017 ret = -EINVAL;
4018 break;
4019 }
4020 return ret;
4021}
4022
4023static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4024 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4025{
4026 struct cgroup *cgroup;
4027 unsigned long long min_limit, min_memsw_limit, tmp;
4028
4029 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4030 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4031 cgroup = memcg->css.cgroup;
4032 if (!memcg->use_hierarchy)
4033 goto out;
4034
4035 while (cgroup->parent) {
4036 cgroup = cgroup->parent;
4037 memcg = mem_cgroup_from_cont(cgroup);
4038 if (!memcg->use_hierarchy)
4039 break;
4040 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4041 min_limit = min(min_limit, tmp);
4042 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4043 min_memsw_limit = min(min_memsw_limit, tmp);
4044 }
4045out:
4046 *mem_limit = min_limit;
4047 *memsw_limit = min_memsw_limit;
4048}
4049
4050static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4051{
4052 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4053 int type, name;
4054
4055 type = MEMFILE_TYPE(event);
4056 name = MEMFILE_ATTR(event);
4057
4058 if (!do_swap_account && type == _MEMSWAP)
4059 return -EOPNOTSUPP;
4060
4061 switch (name) {
4062 case RES_MAX_USAGE:
4063 if (type == _MEM)
4064 res_counter_reset_max(&memcg->res);
4065 else
4066 res_counter_reset_max(&memcg->memsw);
4067 break;
4068 case RES_FAILCNT:
4069 if (type == _MEM)
4070 res_counter_reset_failcnt(&memcg->res);
4071 else
4072 res_counter_reset_failcnt(&memcg->memsw);
4073 break;
4074 }
4075
4076 return 0;
4077}
4078
4079static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4080 struct cftype *cft)
4081{
4082 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4083}
4084
4085#ifdef CONFIG_MMU
4086static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4087 struct cftype *cft, u64 val)
4088{
4089 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4090
4091 if (val >= (1 << NR_MOVE_TYPE))
4092 return -EINVAL;
4093
4094
4095
4096
4097
4098 cgroup_lock();
4099 memcg->move_charge_at_immigrate = val;
4100 cgroup_unlock();
4101
4102 return 0;
4103}
4104#else
4105static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4106 struct cftype *cft, u64 val)
4107{
4108 return -ENOSYS;
4109}
4110#endif
4111
4112#ifdef CONFIG_NUMA
4113static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4114 struct seq_file *m)
4115{
4116 int nid;
4117 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4118 unsigned long node_nr;
4119 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4120
4121 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4122 seq_printf(m, "total=%lu", total_nr);
4123 for_each_node_state(nid, N_HIGH_MEMORY) {
4124 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4125 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 }
4127 seq_putc(m, '\n');
4128
4129 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4130 seq_printf(m, "file=%lu", file_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 LRU_ALL_FILE);
4134 seq_printf(m, " N%d=%lu", nid, node_nr);
4135 }
4136 seq_putc(m, '\n');
4137
4138 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4139 seq_printf(m, "anon=%lu", anon_nr);
4140 for_each_node_state(nid, N_HIGH_MEMORY) {
4141 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4142 LRU_ALL_ANON);
4143 seq_printf(m, " N%d=%lu", nid, node_nr);
4144 }
4145 seq_putc(m, '\n');
4146
4147 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4148 seq_printf(m, "unevictable=%lu", unevictable_nr);
4149 for_each_node_state(nid, N_HIGH_MEMORY) {
4150 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4151 BIT(LRU_UNEVICTABLE));
4152 seq_printf(m, " N%d=%lu", nid, node_nr);
4153 }
4154 seq_putc(m, '\n');
4155 return 0;
4156}
4157#endif
4158
4159static const char * const mem_cgroup_lru_names[] = {
4160 "inactive_anon",
4161 "active_anon",
4162 "inactive_file",
4163 "active_file",
4164 "unevictable",
4165};
4166
4167static inline void mem_cgroup_lru_names_not_uptodate(void)
4168{
4169 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4170}
4171
4172static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4173 struct seq_file *m)
4174{
4175 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4176 struct mem_cgroup *mi;
4177 unsigned int i;
4178
4179 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4181 continue;
4182 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4183 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4184 }
4185
4186 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4187 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4188 mem_cgroup_read_events(memcg, i));
4189
4190 for (i = 0; i < NR_LRU_LISTS; i++)
4191 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4192 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4193
4194
4195 {
4196 unsigned long long limit, memsw_limit;
4197 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4198 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4199 if (do_swap_account)
4200 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4201 memsw_limit);
4202 }
4203
4204 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4205 long long val = 0;
4206
4207 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4208 continue;
4209 for_each_mem_cgroup_tree(mi, memcg)
4210 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4211 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4212 }
4213
4214 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4215 unsigned long long val = 0;
4216
4217 for_each_mem_cgroup_tree(mi, memcg)
4218 val += mem_cgroup_read_events(mi, i);
4219 seq_printf(m, "total_%s %llu\n",
4220 mem_cgroup_events_names[i], val);
4221 }
4222
4223 for (i = 0; i < NR_LRU_LISTS; i++) {
4224 unsigned long long val = 0;
4225
4226 for_each_mem_cgroup_tree(mi, memcg)
4227 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4228 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4229 }
4230
4231#ifdef CONFIG_DEBUG_VM
4232 {
4233 int nid, zid;
4234 struct mem_cgroup_per_zone *mz;
4235 struct zone_reclaim_stat *rstat;
4236 unsigned long recent_rotated[2] = {0, 0};
4237 unsigned long recent_scanned[2] = {0, 0};
4238
4239 for_each_online_node(nid)
4240 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4241 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4242 rstat = &mz->lruvec.reclaim_stat;
4243
4244 recent_rotated[0] += rstat->recent_rotated[0];
4245 recent_rotated[1] += rstat->recent_rotated[1];
4246 recent_scanned[0] += rstat->recent_scanned[0];
4247 recent_scanned[1] += rstat->recent_scanned[1];
4248 }
4249 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4250 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4251 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4252 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4253 }
4254#endif
4255
4256 return 0;
4257}
4258
4259static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4260{
4261 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4262
4263 return mem_cgroup_swappiness(memcg);
4264}
4265
4266static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4267 u64 val)
4268{
4269 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4270 struct mem_cgroup *parent;
4271
4272 if (val > 100)
4273 return -EINVAL;
4274
4275 if (cgrp->parent == NULL)
4276 return -EINVAL;
4277
4278 parent = mem_cgroup_from_cont(cgrp->parent);
4279
4280 cgroup_lock();
4281
4282
4283 if ((parent->use_hierarchy) ||
4284 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4285 cgroup_unlock();
4286 return -EINVAL;
4287 }
4288
4289 memcg->swappiness = val;
4290
4291 cgroup_unlock();
4292
4293 return 0;
4294}
4295
4296static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4297{
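 /*
  * Signal the eventfds of all thresholds that current usage has crossed
  * since the last check: walk downwards past thresholds now above the
  * usage, then upwards past thresholds now at or below it, and remember
  * the new position in current_threshold.
  */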
4298 struct mem_cgroup_threshold_ary *t;
4299 u64 usage;
4300 int i;
4301
4302 rcu_read_lock();
4303 if (!swap)
4304 t = rcu_dereference(memcg->thresholds.primary);
4305 else
4306 t = rcu_dereference(memcg->memsw_thresholds.primary);
4307
4308 if (!t)
4309 goto unlock;
4310
4311 usage = mem_cgroup_usage(memcg, swap);
4312
4313
4314
4315
4316
4317
4318 i = t->current_threshold;
4319
4320
4321
4322
4323
4324
4325
4326 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4327 eventfd_signal(t->entries[i].eventfd, 1);
4328
4329
4330 i++;
4331
4332
4333
4334
4335
4336
4337
4338 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4339 eventfd_signal(t->entries[i].eventfd, 1);
4340
4341
4342 t->current_threshold = i - 1;
4343unlock:
4344 rcu_read_unlock();
4345}
4346
4347static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4348{
4349 while (memcg) {
4350 __mem_cgroup_threshold(memcg, false);
4351 if (do_swap_account)
4352 __mem_cgroup_threshold(memcg, true);
4353
4354 memcg = parent_mem_cgroup(memcg);
4355 }
4356}
4357
4358static int compare_thresholds(const void *a, const void *b)
4359{
4360 const struct mem_cgroup_threshold *_a = a;
4361 const struct mem_cgroup_threshold *_b = b;
4362 /* thresholds are u64; a plain subtraction can truncate when cast to int */
4363 return (_a->threshold > _b->threshold) - (_a->threshold < _b->threshold);
4364}
4365
4366static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4367{
4368 struct mem_cgroup_eventfd_list *ev;
4369
4370 list_for_each_entry(ev, &memcg->oom_notify, list)
4371 eventfd_signal(ev->eventfd, 1);
4372 return 0;
4373}
4374
4375static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4376{
4377 struct mem_cgroup *iter;
4378
4379 for_each_mem_cgroup_tree(iter, memcg)
4380 mem_cgroup_oom_notify_cb(iter);
4381}
4382
4383static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4384 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4385{
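 /*
  * Add a usage-threshold notification.  The published threshold array
  * is never modified in place: a new array including the extra entry is
  * built, sorted and installed with rcu_assign_pointer(), and the old
  * one is kept as ->spare so that unregistering can reuse it without
  * allocating.
  */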
4386 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4387 struct mem_cgroup_thresholds *thresholds;
4388 struct mem_cgroup_threshold_ary *new;
4389 int type = MEMFILE_TYPE(cft->private);
4390 u64 threshold, usage;
4391 int i, size, ret;
4392
4393 ret = res_counter_memparse_write_strategy(args, &threshold);
4394 if (ret)
4395 return ret;
4396
4397 mutex_lock(&memcg->thresholds_lock);
4398
4399 if (type == _MEM)
4400 thresholds = &memcg->thresholds;
4401 else if (type == _MEMSWAP)
4402 thresholds = &memcg->memsw_thresholds;
4403 else
4404 BUG();
4405
4406 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4407
4408
4409 if (thresholds->primary)
4410 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4411
4412 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4413
4414
4415 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4416 GFP_KERNEL);
4417 if (!new) {
4418 ret = -ENOMEM;
4419 goto unlock;
4420 }
4421 new->size = size;
4422
4423
4424 if (thresholds->primary) {
4425 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4426 sizeof(struct mem_cgroup_threshold));
4427 }
4428
4429
4430 new->entries[size - 1].eventfd = eventfd;
4431 new->entries[size - 1].threshold = threshold;
4432
4433
4434 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4435 compare_thresholds, NULL);
4436
4437
4438 new->current_threshold = -1;
4439 for (i = 0; i < size; i++) {
4440 if (new->entries[i].threshold <= usage) {
4441
4442
4443
4444
4445
4446 ++new->current_threshold;
4447 } else
4448 break;
4449 }
4450
4451
4452 kfree(thresholds->spare);
4453 thresholds->spare = thresholds->primary;
4454
4455 rcu_assign_pointer(thresholds->primary, new);
4456
4457
4458 synchronize_rcu();
4459
4460unlock:
4461 mutex_unlock(&memcg->thresholds_lock);
4462
4463 return ret;
4464}
4465
4466static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4467 struct cftype *cft, struct eventfd_ctx *eventfd)
4468{
4469 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4470 struct mem_cgroup_thresholds *thresholds;
4471 struct mem_cgroup_threshold_ary *new;
4472 int type = MEMFILE_TYPE(cft->private);
4473 u64 usage;
4474 int i, j, size;
4475
4476 mutex_lock(&memcg->thresholds_lock);
4477 if (type == _MEM)
4478 thresholds = &memcg->thresholds;
4479 else if (type == _MEMSWAP)
4480 thresholds = &memcg->memsw_thresholds;
4481 else
4482 BUG();
4483
4484 if (!thresholds->primary)
4485 goto unlock;
4486
4487 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4488
4489
4490 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4491
4492
4493 size = 0;
4494 for (i = 0; i < thresholds->primary->size; i++) {
4495 if (thresholds->primary->entries[i].eventfd != eventfd)
4496 size++;
4497 }
4498
4499 new = thresholds->spare;
4500
4501
4502 if (!size) {
4503 kfree(new);
4504 new = NULL;
4505 goto swap_buffers;
4506 }
4507
4508 new->size = size;
4509
4510
4511 new->current_threshold = -1;
4512 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4513 if (thresholds->primary->entries[i].eventfd == eventfd)
4514 continue;
4515
4516 new->entries[j] = thresholds->primary->entries[i];
4517 if (new->entries[j].threshold <= usage) {
4518
4519
4520
4521
4522
4523 ++new->current_threshold;
4524 }
4525 j++;
4526 }
4527
4528swap_buffers:
4529
4530 thresholds->spare = thresholds->primary;
4531
4532 if (!new) {
4533 kfree(thresholds->spare);
4534 thresholds->spare = NULL;
4535 }
4536
4537 rcu_assign_pointer(thresholds->primary, new);
4538
4539
4540 synchronize_rcu();
4541unlock:
4542 mutex_unlock(&memcg->thresholds_lock);
4543}
4544
4545static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4546 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4547{
4548 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4549 struct mem_cgroup_eventfd_list *event;
4550 int type = MEMFILE_TYPE(cft->private);
4551
4552 BUG_ON(type != _OOM_TYPE);
4553 event = kmalloc(sizeof(*event), GFP_KERNEL);
4554 if (!event)
4555 return -ENOMEM;
4556
4557 spin_lock(&memcg_oom_lock);
4558
4559 event->eventfd = eventfd;
4560 list_add(&event->list, &memcg->oom_notify);
4561
4562
4563 if (atomic_read(&memcg->under_oom))
4564 eventfd_signal(eventfd, 1);
4565 spin_unlock(&memcg_oom_lock);
4566
4567 return 0;
4568}
4569
4570static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4571 struct cftype *cft, struct eventfd_ctx *eventfd)
4572{
4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4574 struct mem_cgroup_eventfd_list *ev, *tmp;
4575 int type = MEMFILE_TYPE(cft->private);
4576
4577 BUG_ON(type != _OOM_TYPE);
4578
4579 spin_lock(&memcg_oom_lock);
4580
4581 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4582 if (ev->eventfd == eventfd) {
4583 list_del(&ev->list);
4584 kfree(ev);
4585 }
4586 }
4587
4588 spin_unlock(&memcg_oom_lock);
4589}
4590
4591static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4592 struct cftype *cft, struct cgroup_map_cb *cb)
4593{
4594 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4595
4596 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4597
4598 if (atomic_read(&memcg->under_oom))
4599 cb->fill(cb, "under_oom", 1);
4600 else
4601 cb->fill(cb, "under_oom", 0);
4602 return 0;
4603}
4604
4605static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4606 struct cftype *cft, u64 val)
4607{
4608 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4609 struct mem_cgroup *parent;
4610
4611
4612 if (!cgrp->parent || !((val == 0) || (val == 1)))
4613 return -EINVAL;
4614
4615 parent = mem_cgroup_from_cont(cgrp->parent);
4616
4617 cgroup_lock();
4618
4619 if ((parent->use_hierarchy) ||
4620 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4621 cgroup_unlock();
4622 return -EINVAL;
4623 }
4624 memcg->oom_kill_disable = val;
4625 if (!val)
4626 memcg_oom_recover(memcg);
4627 cgroup_unlock();
4628 return 0;
4629}
4630
4631#ifdef CONFIG_MEMCG_KMEM
4632static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4633{
4634 return mem_cgroup_sockets_init(memcg, ss);
4635};
4636
4637static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4638{
4639 mem_cgroup_sockets_destroy(memcg);
4640}
4641#else
4642static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4643{
4644 return 0;
4645}
4646
4647static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4648{
4649}
4650#endif
4651
4652static struct cftype mem_cgroup_files[] = {
4653 {
4654 .name = "usage_in_bytes",
4655 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4656 .read = mem_cgroup_read,
4657 .register_event = mem_cgroup_usage_register_event,
4658 .unregister_event = mem_cgroup_usage_unregister_event,
4659 },
4660 {
4661 .name = "max_usage_in_bytes",
4662 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4663 .trigger = mem_cgroup_reset,
4664 .read = mem_cgroup_read,
4665 },
4666 {
4667 .name = "limit_in_bytes",
4668 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4669 .write_string = mem_cgroup_write,
4670 .read = mem_cgroup_read,
4671 },
4672 {
4673 .name = "soft_limit_in_bytes",
4674 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4675 .write_string = mem_cgroup_write,
4676 .read = mem_cgroup_read,
4677 },
4678 {
4679 .name = "failcnt",
4680 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4681 .trigger = mem_cgroup_reset,
4682 .read = mem_cgroup_read,
4683 },
4684 {
4685 .name = "stat",
4686 .read_seq_string = memcg_stat_show,
4687 },
4688 {
4689 .name = "force_empty",
4690 .trigger = mem_cgroup_force_empty_write,
4691 },
4692 {
4693 .name = "use_hierarchy",
4694 .write_u64 = mem_cgroup_hierarchy_write,
4695 .read_u64 = mem_cgroup_hierarchy_read,
4696 },
4697 {
4698 .name = "swappiness",
4699 .read_u64 = mem_cgroup_swappiness_read,
4700 .write_u64 = mem_cgroup_swappiness_write,
4701 },
4702 {
4703 .name = "move_charge_at_immigrate",
4704 .read_u64 = mem_cgroup_move_charge_read,
4705 .write_u64 = mem_cgroup_move_charge_write,
4706 },
4707 {
4708 .name = "oom_control",
4709 .read_map = mem_cgroup_oom_control_read,
4710 .write_u64 = mem_cgroup_oom_control_write,
4711 .register_event = mem_cgroup_oom_register_event,
4712 .unregister_event = mem_cgroup_oom_unregister_event,
4713 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4714 },
4715#ifdef CONFIG_NUMA
4716 {
4717 .name = "numa_stat",
4718 .read_seq_string = memcg_numa_stat_show,
4719 },
4720#endif
4721#ifdef CONFIG_MEMCG_SWAP
4722 {
4723 .name = "memsw.usage_in_bytes",
4724 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4725 .read = mem_cgroup_read,
4726 .register_event = mem_cgroup_usage_register_event,
4727 .unregister_event = mem_cgroup_usage_unregister_event,
4728 },
4729 {
4730 .name = "memsw.max_usage_in_bytes",
4731 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4732 .trigger = mem_cgroup_reset,
4733 .read = mem_cgroup_read,
4734 },
4735 {
4736 .name = "memsw.limit_in_bytes",
4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4738 .write_string = mem_cgroup_write,
4739 .read = mem_cgroup_read,
4740 },
4741 {
4742 .name = "memsw.failcnt",
4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4744 .trigger = mem_cgroup_reset,
4745 .read = mem_cgroup_read,
4746 },
4747#endif
4748 { },
4749};
4750
4751static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4752{
4753 struct mem_cgroup_per_node *pn;
4754 struct mem_cgroup_per_zone *mz;
4755 int zone, tmp = node;
4756
4757
4758
4759
4760
4761
4762
4763
4764 if (!node_state(node, N_NORMAL_MEMORY))
4765 tmp = -1;
4766 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4767 if (!pn)
4768 return 1;
4769
4770 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4771 mz = &pn->zoneinfo[zone];
4772 lruvec_init(&mz->lruvec);
4773 mz->usage_in_excess = 0;
4774 mz->on_tree = false;
4775 mz->memcg = memcg;
4776 }
4777 memcg->info.nodeinfo[node] = pn;
4778 return 0;
4779}
4780
4781static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4782{
4783 kfree(memcg->info.nodeinfo[node]);
4784}
4785
4786static struct mem_cgroup *mem_cgroup_alloc(void)
4787{
4788 struct mem_cgroup *memcg;
4789 int size = sizeof(struct mem_cgroup);
4790
4791
4792 if (size < PAGE_SIZE)
4793 memcg = kzalloc(size, GFP_KERNEL);
4794 else
4795 memcg = vzalloc(size);
4796
4797 if (!memcg)
4798 return NULL;
4799
4800 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4801 if (!memcg->stat)
4802 goto out_free;
4803 spin_lock_init(&memcg->pcp_counter_lock);
4804 return memcg;
4805
4806out_free:
4807 if (size < PAGE_SIZE)
4808 kfree(memcg);
4809 else
4810 vfree(memcg);
4811 return NULL;
4812}
4813
4814
4815
4816
4817
4818
4819static void free_work(struct work_struct *work)
4820{
4821 struct mem_cgroup *memcg;
4822 int size = sizeof(struct mem_cgroup);
4823
4824 memcg = container_of(work, struct mem_cgroup, work_freeing);
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836 disarm_sock_keys(memcg);
4837 if (size < PAGE_SIZE)
4838 kfree(memcg);
4839 else
4840 vfree(memcg);
4841}
4842
4843static void free_rcu(struct rcu_head *rcu_head)
4844{
4845 struct mem_cgroup *memcg;
4846
4847 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4848 INIT_WORK(&memcg->work_freeing, free_work);
4849 schedule_work(&memcg->work_freeing);
4850}
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863static void __mem_cgroup_free(struct mem_cgroup *memcg)
4864{
4865 int node;
4866
4867 mem_cgroup_remove_from_trees(memcg);
4868 free_css_id(&mem_cgroup_subsys, &memcg->css);
4869
4870 for_each_node(node)
4871 free_mem_cgroup_per_zone_info(memcg, node);
4872
4873 free_percpu(memcg->stat);
4874 call_rcu(&memcg->rcu_freeing, free_rcu);
4875}
4876
4877static void mem_cgroup_get(struct mem_cgroup *memcg)
4878{
4879 atomic_inc(&memcg->refcnt);
4880}
4881
4882static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4883{
4884 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4885 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4886 __mem_cgroup_free(memcg);
4887 if (parent)
4888 mem_cgroup_put(parent);
4889 }
4890}
4891
4892static void mem_cgroup_put(struct mem_cgroup *memcg)
4893{
4894 __mem_cgroup_put(memcg, 1);
4895}
4896
4897
4898
4899
4900struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4901{
4902 if (!memcg->res.parent)
4903 return NULL;
4904 return mem_cgroup_from_res_counter(memcg->res.parent, res);
4905}
4906EXPORT_SYMBOL(parent_mem_cgroup);
4907
4908#ifdef CONFIG_MEMCG_SWAP
4909static void __init enable_swap_cgroup(void)
4910{
4911 if (!mem_cgroup_disabled() && really_do_swap_account)
4912 do_swap_account = 1;
4913}
4914#else
4915static void __init enable_swap_cgroup(void)
4916{
4917}
4918#endif
4919
4920static int mem_cgroup_soft_limit_tree_init(void)
4921{
4922 struct mem_cgroup_tree_per_node *rtpn;
4923 struct mem_cgroup_tree_per_zone *rtpz;
4924 int tmp, node, zone;
4925
4926 for_each_node(node) {
4927 tmp = node;
4928 if (!node_state(node, N_NORMAL_MEMORY))
4929 tmp = -1;
4930 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4931 if (!rtpn)
4932 goto err_cleanup;
4933
4934 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4935
4936 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4937 rtpz = &rtpn->rb_tree_per_zone[zone];
4938 rtpz->rb_root = RB_ROOT;
4939 spin_lock_init(&rtpz->lock);
4940 }
4941 }
4942 return 0;
4943
4944err_cleanup:
4945 for_each_node(node) {
4946 if (!soft_limit_tree.rb_tree_per_node[node])
4947 break;
4948 kfree(soft_limit_tree.rb_tree_per_node[node]);
4949 soft_limit_tree.rb_tree_per_node[node] = NULL;
4950 }
4951 return 1;
4952
4953}
4954
4955static struct cgroup_subsys_state * __ref
4956mem_cgroup_create(struct cgroup *cont)
4957{
4958 struct mem_cgroup *memcg, *parent;
4959 long error = -ENOMEM;
4960 int node;
4961
4962 memcg = mem_cgroup_alloc();
4963 if (!memcg)
4964 return ERR_PTR(error);
4965
4966 for_each_node(node)
4967 if (alloc_mem_cgroup_per_zone_info(memcg, node))
4968 goto free_out;
4969
4970
4971 if (cont->parent == NULL) {
4972 int cpu;
4973 enable_swap_cgroup();
4974 parent = NULL;
4975 if (mem_cgroup_soft_limit_tree_init())
4976 goto free_out;
4977 root_mem_cgroup = memcg;
4978 for_each_possible_cpu(cpu) {
4979 struct memcg_stock_pcp *stock =
4980 &per_cpu(memcg_stock, cpu);
4981 INIT_WORK(&stock->work, drain_local_stock);
4982 }
4983 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4984 } else {
4985 parent = mem_cgroup_from_cont(cont->parent);
4986 memcg->use_hierarchy = parent->use_hierarchy;
4987 memcg->oom_kill_disable = parent->oom_kill_disable;
4988 }
4989
4990 if (parent && parent->use_hierarchy) {
4991 res_counter_init(&memcg->res, &parent->res);
4992 res_counter_init(&memcg->memsw, &parent->memsw);
4993
4994
4995
4996
4997
4998
4999 mem_cgroup_get(parent);
5000 } else {
5001 res_counter_init(&memcg->res, NULL);
5002 res_counter_init(&memcg->memsw, NULL);
5003
5004
5005
5006
5007
5008 if (parent && parent != root_mem_cgroup)
5009 mem_cgroup_subsys.broken_hierarchy = true;
5010 }
5011 memcg->last_scanned_node = MAX_NUMNODES;
5012 INIT_LIST_HEAD(&memcg->oom_notify);
5013
5014 if (parent)
5015 memcg->swappiness = mem_cgroup_swappiness(parent);
5016 atomic_set(&memcg->refcnt, 1);
5017 memcg->move_charge_at_immigrate = 0;
5018 mutex_init(&memcg->thresholds_lock);
5019 spin_lock_init(&memcg->move_lock);
5020
5021 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
5022 if (error) {
5023
5024
5025
5026
5027
5028 mem_cgroup_put(memcg);
5029 return ERR_PTR(error);
5030 }
5031 return &memcg->css;
5032free_out:
5033 __mem_cgroup_free(memcg);
5034 return ERR_PTR(error);
5035}
5036
5037static int mem_cgroup_pre_destroy(struct cgroup *cont)
5038{
5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5040
5041 return mem_cgroup_force_empty(memcg, false);
5042}
5043
5044static void mem_cgroup_destroy(struct cgroup *cont)
5045{
5046 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047
5048 kmem_cgroup_destroy(memcg);
5049
5050 mem_cgroup_put(memcg);
5051}
5052
5053#ifdef CONFIG_MMU
5054
5055#define PRECHARGE_COUNT_AT_ONCE 256
5056static int mem_cgroup_do_precharge(unsigned long count)
5057{
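 /*
  * Pre-charge @count pages to mc.to before charges are moved at task
  * attach.  A single bulk res_counter charge is tried first; if that
  * fails the pages are charged one by one with reclaim allowed,
  * rescheduling after every PRECHARGE_COUNT_AT_ONCE charges.
  */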
5058 int ret = 0;
5059 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5060 struct mem_cgroup *memcg = mc.to;
5061
5062 if (mem_cgroup_is_root(memcg)) {
5063 mc.precharge += count;
5064
5065 return ret;
5066 }
5067
5068 if (count > 1) {
5069 struct res_counter *dummy;
5070
5071
5072
5073
5074
5075
5076 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5077 goto one_by_one;
5078 if (do_swap_account && res_counter_charge(&memcg->memsw,
5079 PAGE_SIZE * count, &dummy)) {
5080 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5081 goto one_by_one;
5082 }
5083 mc.precharge += count;
5084 return ret;
5085 }
5086one_by_one:
5087
5088 while (count--) {
5089 if (signal_pending(current)) {
5090 ret = -EINTR;
5091 break;
5092 }
5093 if (!batch_count--) {
5094 batch_count = PRECHARGE_COUNT_AT_ONCE;
5095 cond_resched();
5096 }
5097 ret = __mem_cgroup_try_charge(NULL,
5098 GFP_KERNEL, 1, &memcg, false);
5099 if (ret)
5100
5101 return ret;
5102 mc.precharge++;
5103 }
5104 return ret;
5105}
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125union mc_target {
5126 struct page *page;
5127 swp_entry_t ent;
5128};
5129
5130enum mc_target_type {
5131 MC_TARGET_NONE = 0,
5132 MC_TARGET_PAGE,
5133 MC_TARGET_SWAP,
5134};
5135
5136static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5137 unsigned long addr, pte_t ptent)
5138{
5139 struct page *page = vm_normal_page(vma, addr, ptent);
5140
5141 if (!page || !page_mapped(page))
5142 return NULL;
5143 if (PageAnon(page)) {
5144
5145 if (!move_anon())
5146 return NULL;
5147 } else if (!move_file())
5148
5149 return NULL;
5150 if (!get_page_unless_zero(page))
5151 return NULL;
5152
5153 return page;
5154}
5155
5156#ifdef CONFIG_SWAP
5157static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5158 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5159{
5160 struct page *page = NULL;
5161 swp_entry_t ent = pte_to_swp_entry(ptent);
5162
5163 if (!move_anon() || non_swap_entry(ent))
5164 return NULL;
5165
5166
5167
5168
5169 page = find_get_page(&swapper_space, ent.val);
5170 if (do_swap_account)
5171 entry->val = ent.val;
5172
5173 return page;
5174}
5175#else
5176static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5177 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5178{
5179 return NULL;
5180}
5181#endif
5182
5183static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5184 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5185{
5186 struct page *page = NULL;
5187 struct address_space *mapping;
5188 pgoff_t pgoff;
5189
5190 if (!vma->vm_file)
5191 return NULL;
5192 if (!move_file())
5193 return NULL;
5194
5195 mapping = vma->vm_file->f_mapping;
5196 if (pte_none(ptent))
5197 pgoff = linear_page_index(vma, addr);
5198 else
5199 pgoff = pte_to_pgoff(ptent);
5200
5201
5202 page = find_get_page(mapping, pgoff);
5203
5204#ifdef CONFIG_SWAP
5205
5206 if (radix_tree_exceptional_entry(page)) {
5207 swp_entry_t swap = radix_to_swp_entry(page);
5208 if (do_swap_account)
5209 *entry = swap;
5210 page = find_get_page(&swapper_space, swap.val);
5211 }
5212#endif
5213 return page;
5214}
5215
5216static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5217 unsigned long addr, pte_t ptent, union mc_target *target)
5218{
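 /*
  * Classify the pte at @addr for the charge-move path: a present page,
  * a swap entry, or a page-cache page.  MC_TARGET_PAGE is returned only
  * for pages currently charged to mc.from; MC_TARGET_SWAP requires the
  * swap entry to be owned by mc.from as well.
  */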
5219 struct page *page = NULL;
5220 struct page_cgroup *pc;
5221 enum mc_target_type ret = MC_TARGET_NONE;
5222 swp_entry_t ent = { .val = 0 };
5223
5224 if (pte_present(ptent))
5225 page = mc_handle_present_pte(vma, addr, ptent);
5226 else if (is_swap_pte(ptent))
5227 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5228 else if (pte_none(ptent) || pte_file(ptent))
5229 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5230
5231 if (!page && !ent.val)
5232 return ret;
5233 if (page) {
5234 pc = lookup_page_cgroup(page);
5235
5236
5237
5238
5239
5240 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5241 ret = MC_TARGET_PAGE;
5242 if (target)
5243 target->page = page;
5244 }
5245 if (!ret || !target)
5246 put_page(page);
5247 }
5248
5249 if (ent.val && !ret &&
5250 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
5251 ret = MC_TARGET_SWAP;
5252 if (target)
5253 target->ent = ent;
5254 }
5255 return ret;
5256}
5257
5258#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5259
5260
5261
5262
5263
5264static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5265 unsigned long addr, pmd_t pmd, union mc_target *target)
5266{
5267 struct page *page = NULL;
5268 struct page_cgroup *pc;
5269 enum mc_target_type ret = MC_TARGET_NONE;
5270
5271 page = pmd_page(pmd);
5272 VM_BUG_ON(!page || !PageHead(page));
5273 if (!move_anon())
5274 return ret;
5275 pc = lookup_page_cgroup(page);
5276 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5277 ret = MC_TARGET_PAGE;
5278 if (target) {
5279 get_page(page);
5280 target->page = page;
5281 }
5282 }
5283 return ret;
5284}
5285#else
5286static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5287 unsigned long addr, pmd_t pmd, union mc_target *target)
5288{
5289 return MC_TARGET_NONE;
5290}
5291#endif
5292
5293static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5294 unsigned long addr, unsigned long end,
5295 struct mm_walk *walk)
5296{
5297 struct vm_area_struct *vma = walk->private;
5298 pte_t *pte;
5299 spinlock_t *ptl;
5300
5301 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5302 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5303 mc.precharge += HPAGE_PMD_NR;
5304 spin_unlock(&vma->vm_mm->page_table_lock);
5305 return 0;
5306 }
5307
5308 if (pmd_trans_unstable(pmd))
5309 return 0;
5310 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5311 for (; addr != end; pte++, addr += PAGE_SIZE)
5312 if (get_mctgt_type(vma, addr, *pte, NULL))
5313 mc.precharge++;
5314 pte_unmap_unlock(pte - 1, ptl);
5315 cond_resched();
5316
5317 return 0;
5318}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		struct mm_walk mem_cgroup_count_precharge_walk = {
			.pmd_entry = mem_cgroup_count_precharge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		walk_page_range(vma->vm_start, vma->vm_end,
					&mem_cgroup_count_precharge_walk);
	}
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

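/* cancels all extra charges on mc.to and mc.from */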
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fix up refcnts and charges for moved swap entries */
	if (mc.moved_swap) {
		/* uncharge the swap account of the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
						PAGE_SIZE * mc.moved_swap);
		__mem_cgroup_put(mc.from, mc.moved_swap);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res here.
			 */
			res_counter_uncharge(&mc.to->res,
						PAGE_SIZE * mc.moved_swap);
		}
		/* we've already done mem_cgroup_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
	mem_cgroup_end_move(from);
}

static int mem_cgroup_can_attach(struct cgroup *cgroup,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	int ret = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);

	if (memcg->move_charge_at_immigrate) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == memcg);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move the owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);
			mem_cgroup_start_move(from);
			spin_lock(&mc.lock);
			mc.from = from;
			mc.to = memcg;
			spin_unlock(&mc.lock);
			/* mc.moving_task is set later, in mem_cgroup_precharge_mc() */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}

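/*
 * pmd_entry callback for the move pass: actually move each target page or
 * swap entry found in this range from mc.from to mc.to, consuming one unit
 * of mc.precharge per base page moved.
 */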
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;
	struct page_cgroup *pc;

	/*
	 * We don't take compound_lock() here, but there is no race with a
	 * splitting thp because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, it must wait for the page table lock to
	 *    be unlocked in __split_huge_page_splitting(), where the main
	 *    part of the thp split is not executed yet.
	 */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(&vma->vm_mm->page_table_lock);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				pc = lookup_page_cgroup(page);
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							pc, mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		}
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(page, 1, pc,
						     mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* get_mctgt_type() took a reference on the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fix up refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charging one by one, but don't do any additional
		 * charges to mc.to if we have already failed a charge during
		 * the attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

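/*
 * Walk the whole address space of @mm and move every charge found from
 * mc.from to mc.to. Called from the .attach callback (mem_cgroup_move_task)
 * once the precharge pass has succeeded.
 */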
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone holding the mmap_sem may be waiting on our waitq.
		 * Cancel all extra charges, wake up all waiters, and retry.
		 * Because precharges are cancelled, we might not be able to
		 * move all charges, but moving charge is a best-effort
		 * feature anyway, so that's not a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * We have consumed all precharges and failed to do an
			 * additional charge. Just abandon here.
			 */
			break;
	}
	up_read(&mm->mmap_sem);
}

static void mem_cgroup_move_task(struct cgroup *cont,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	struct mm_struct *mm = get_task_mm(p);

	if (mm) {
		if (mc.to)
			mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	if (mc.to)
		mem_cgroup_clear_mc();
}
#else
static int mem_cgroup_can_attach(struct cgroup *cgroup,
				 struct cgroup_taskset *tset)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(struct cgroup *cont,
				 struct cgroup_taskset *tset)
{
}
#endif

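/*
 * cgroup subsystem definition for the memory controller. The can_attach,
 * cancel_attach and attach callbacks above implement charge moving when
 * move_charge_at_immigrate is enabled.
 */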
struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.attach = mem_cgroup_move_task,
	.base_cftypes = mem_cgroup_files,
	.early_init = 0,
	.use_id = 1,
	.__DEPRECATED_clear_css_refs = true,
};

#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{
	/* enable ("1") or disable ("0") memsw accounting via swapaccount= */
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

#endif
