1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include <linux/page_counter.h>
26#include <linux/memcontrol.h>
27#include <linux/cgroup.h>
28#include <linux/mm.h>
29#include <linux/sched/mm.h>
30#include <linux/shmem_fs.h>
31#include <linux/hugetlb.h>
32#include <linux/pagemap.h>
33#include <linux/vm_event_item.h>
34#include <linux/smp.h>
35#include <linux/page-flags.h>
36#include <linux/backing-dev.h>
37#include <linux/bit_spinlock.h>
38#include <linux/rcupdate.h>
39#include <linux/limits.h>
40#include <linux/export.h>
41#include <linux/mutex.h>
42#include <linux/rbtree.h>
43#include <linux/slab.h>
44#include <linux/swap.h>
45#include <linux/swapops.h>
46#include <linux/spinlock.h>
47#include <linux/eventfd.h>
48#include <linux/poll.h>
49#include <linux/sort.h>
50#include <linux/fs.h>
51#include <linux/seq_file.h>
52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h>
54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h>
56#include <linux/oom.h>
57#include <linux/lockdep.h>
58#include <linux/file.h>
59#include <linux/tracehook.h>
60#include "internal.h"
61#include <net/sock.h>
62#include <net/ip.h>
63#include "slab.h"
64
65#include <linux/uaccess.h>
66
67#include <trace/events/vmscan.h>
68
69struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70EXPORT_SYMBOL(memory_cgrp_subsys);
71
72struct mem_cgroup *root_mem_cgroup __read_mostly;
73
74#define MEM_CGROUP_RECLAIM_RETRIES 5
75
76
77static bool cgroup_memory_nosocket;
78
79
80static bool cgroup_memory_nokmem;
81
82
83#ifdef CONFIG_MEMCG_SWAP
84int do_swap_account __read_mostly;
85#else
86#define do_swap_account 0
87#endif
88
89
90static bool do_memsw_account(void)
91{
92 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
93}
94
95static const char *const mem_cgroup_lru_names[] = {
96 "inactive_anon",
97 "active_anon",
98 "inactive_file",
99 "active_file",
100 "unevictable",
101};
102
103#define THRESHOLDS_EVENTS_TARGET 128
104#define SOFTLIMIT_EVENTS_TARGET 1024
105#define NUMAINFO_EVENTS_TARGET 1024
106
107
108
109
110
111
112struct mem_cgroup_tree_per_node {
113 struct rb_root rb_root;
114 struct rb_node *rb_rightmost;
115 spinlock_t lock;
116};
117
118struct mem_cgroup_tree {
119 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
120};
121
122static struct mem_cgroup_tree soft_limit_tree __read_mostly;
123
124
125struct mem_cgroup_eventfd_list {
126 struct list_head list;
127 struct eventfd_ctx *eventfd;
128};
129
130
131
132
133struct mem_cgroup_event {
134
135
136
137 struct mem_cgroup *memcg;
138
139
140
141 struct eventfd_ctx *eventfd;
142
143
144
145 struct list_head list;
146
147
148
149
150
151 int (*register_event)(struct mem_cgroup *memcg,
152 struct eventfd_ctx *eventfd, const char *args);
153
154
155
156
157
158 void (*unregister_event)(struct mem_cgroup *memcg,
159 struct eventfd_ctx *eventfd);
160
161
162
163
164 poll_table pt;
165 wait_queue_head_t *wqh;
166 wait_queue_entry_t wait;
167 struct work_struct remove;
168};
169
170static void mem_cgroup_threshold(struct mem_cgroup *memcg);
171static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
172
173
174
175
176
177#define MOVE_ANON 0x1U
178#define MOVE_FILE 0x2U
179#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
180
181
182static struct move_charge_struct {
183 spinlock_t lock;
184 struct mm_struct *mm;
185 struct mem_cgroup *from;
186 struct mem_cgroup *to;
187 unsigned long flags;
188 unsigned long precharge;
189 unsigned long moved_charge;
190 unsigned long moved_swap;
191 struct task_struct *moving_task;
192 wait_queue_head_t waitq;
193} mc = {
194 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
195 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
196};
197
198
199
200
201
202#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
203#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
204
205enum charge_type {
206 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
207 MEM_CGROUP_CHARGE_TYPE_ANON,
208 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
209 MEM_CGROUP_CHARGE_TYPE_DROP,
210 NR_CHARGE_TYPE,
211};
212
213
214enum res_type {
215 _MEM,
216 _MEMSWAP,
217 _OOM_TYPE,
218 _KMEM,
219 _TCP,
220};
221
222#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
223#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
224#define MEMFILE_ATTR(val) ((val) & 0xffff)
225
226#define OOM_CONTROL (0)
227
228
229
230
231
232
233#define for_each_mem_cgroup_tree(iter, root) \
234 for (iter = mem_cgroup_iter(root, NULL, NULL); \
235 iter != NULL; \
236 iter = mem_cgroup_iter(root, iter, NULL))
237
238#define for_each_mem_cgroup(iter) \
239 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
240 iter != NULL; \
241 iter = mem_cgroup_iter(NULL, iter, NULL))
242
243static inline bool should_force_charge(void)
244{
245 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
246 (current->flags & PF_EXITING);
247}
248
249
250struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
251{
252 if (!memcg)
253 memcg = root_mem_cgroup;
254 return &memcg->vmpressure;
255}
256
257struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
258{
259 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
260}
261
262#ifdef CONFIG_MEMCG_KMEM
263
264
265
266
267
268
269
270
271
272
273
274static DEFINE_IDA(memcg_cache_ida);
275int memcg_nr_cache_ids;
276
277
278static DECLARE_RWSEM(memcg_cache_ids_sem);
279
280void memcg_get_cache_ids(void)
281{
282 down_read(&memcg_cache_ids_sem);
283}
284
285void memcg_put_cache_ids(void)
286{
287 up_read(&memcg_cache_ids_sem);
288}
289
290
291
292
293
294
295
296
297
298
299
300
301
302#define MEMCG_CACHES_MIN_SIZE 4
303#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
304
305
306
307
308
309
310
311DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
312EXPORT_SYMBOL(memcg_kmem_enabled_key);
313
314struct workqueue_struct *memcg_kmem_cache_wq;
315
316static int memcg_shrinker_map_size;
317static DEFINE_MUTEX(memcg_shrinker_map_mutex);
318
319static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
320{
321 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
322}
323
324static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
325 int size, int old_size)
326{
327 struct memcg_shrinker_map *new, *old;
328 int nid;
329
330 lockdep_assert_held(&memcg_shrinker_map_mutex);
331
332 for_each_node(nid) {
333 old = rcu_dereference_protected(
334 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
335
336 if (!old)
337 return 0;
338
339 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
340 if (!new)
341 return -ENOMEM;
342
343
344 memset(new->map, (int)0xff, old_size);
345 memset((void *)new->map + old_size, 0, size - old_size);
346
347 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
348 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
349 }
350
351 return 0;
352}
353
354static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
355{
356 struct mem_cgroup_per_node *pn;
357 struct memcg_shrinker_map *map;
358 int nid;
359
360 if (mem_cgroup_is_root(memcg))
361 return;
362
363 for_each_node(nid) {
364 pn = mem_cgroup_nodeinfo(memcg, nid);
365 map = rcu_dereference_protected(pn->shrinker_map, true);
366 if (map)
367 kvfree(map);
368 rcu_assign_pointer(pn->shrinker_map, NULL);
369 }
370}
371
372static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
373{
374 struct memcg_shrinker_map *map;
375 int nid, size, ret = 0;
376
377 if (mem_cgroup_is_root(memcg))
378 return 0;
379
380 mutex_lock(&memcg_shrinker_map_mutex);
381 size = memcg_shrinker_map_size;
382 for_each_node(nid) {
383 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
384 if (!map) {
385 memcg_free_shrinker_maps(memcg);
386 ret = -ENOMEM;
387 break;
388 }
389 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
390 }
391 mutex_unlock(&memcg_shrinker_map_mutex);
392
393 return ret;
394}
395
396int memcg_expand_shrinker_maps(int new_id)
397{
398 int size, old_size, ret = 0;
399 struct mem_cgroup *memcg;
400
401 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
402 old_size = memcg_shrinker_map_size;
403 if (size <= old_size)
404 return 0;
405
406 mutex_lock(&memcg_shrinker_map_mutex);
407 if (!root_mem_cgroup)
408 goto unlock;
409
410 for_each_mem_cgroup(memcg) {
411 if (mem_cgroup_is_root(memcg))
412 continue;
413 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
414 if (ret)
415 goto unlock;
416 }
417unlock:
418 if (!ret)
419 memcg_shrinker_map_size = size;
420 mutex_unlock(&memcg_shrinker_map_mutex);
421 return ret;
422}
423
424void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
425{
426 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
427 struct memcg_shrinker_map *map;
428
429 rcu_read_lock();
430 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
431
432 smp_mb__before_atomic();
433 set_bit(shrinker_id, map->map);
434 rcu_read_unlock();
435 }
436}
437
438#else
439static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
440{
441 return 0;
442}
443static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
444#endif
445
446
447
448
449
450
451
452
453
454
455
456
457struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
458{
459 struct mem_cgroup *memcg;
460
461 memcg = page->mem_cgroup;
462
463 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
464 memcg = root_mem_cgroup;
465
466 return &memcg->css;
467}
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482ino_t page_cgroup_ino(struct page *page)
483{
484 struct mem_cgroup *memcg;
485 unsigned long ino = 0;
486
487 rcu_read_lock();
488 memcg = READ_ONCE(page->mem_cgroup);
489 while (memcg && !(memcg->css.flags & CSS_ONLINE))
490 memcg = parent_mem_cgroup(memcg);
491 if (memcg)
492 ino = cgroup_ino(memcg->css.cgroup);
493 rcu_read_unlock();
494 return ino;
495}
496
497static struct mem_cgroup_per_node *
498mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
499{
500 int nid = page_to_nid(page);
501
502 return memcg->nodeinfo[nid];
503}
504
505static struct mem_cgroup_tree_per_node *
506soft_limit_tree_node(int nid)
507{
508 return soft_limit_tree.rb_tree_per_node[nid];
509}
510
511static struct mem_cgroup_tree_per_node *
512soft_limit_tree_from_page(struct page *page)
513{
514 int nid = page_to_nid(page);
515
516 return soft_limit_tree.rb_tree_per_node[nid];
517}
518
519static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
520 struct mem_cgroup_tree_per_node *mctz,
521 unsigned long new_usage_in_excess)
522{
523 struct rb_node **p = &mctz->rb_root.rb_node;
524 struct rb_node *parent = NULL;
525 struct mem_cgroup_per_node *mz_node;
526 bool rightmost = true;
527
528 if (mz->on_tree)
529 return;
530
531 mz->usage_in_excess = new_usage_in_excess;
532 if (!mz->usage_in_excess)
533 return;
534 while (*p) {
535 parent = *p;
536 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
537 tree_node);
538 if (mz->usage_in_excess < mz_node->usage_in_excess) {
539 p = &(*p)->rb_left;
540 rightmost = false;
541 }
542
543
544
545
546
547 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
548 p = &(*p)->rb_right;
549 }
550
551 if (rightmost)
552 mctz->rb_rightmost = &mz->tree_node;
553
554 rb_link_node(&mz->tree_node, parent, p);
555 rb_insert_color(&mz->tree_node, &mctz->rb_root);
556 mz->on_tree = true;
557}
558
559static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
560 struct mem_cgroup_tree_per_node *mctz)
561{
562 if (!mz->on_tree)
563 return;
564
565 if (&mz->tree_node == mctz->rb_rightmost)
566 mctz->rb_rightmost = rb_prev(&mz->tree_node);
567
568 rb_erase(&mz->tree_node, &mctz->rb_root);
569 mz->on_tree = false;
570}
571
572static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
573 struct mem_cgroup_tree_per_node *mctz)
574{
575 unsigned long flags;
576
577 spin_lock_irqsave(&mctz->lock, flags);
578 __mem_cgroup_remove_exceeded(mz, mctz);
579 spin_unlock_irqrestore(&mctz->lock, flags);
580}
581
582static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
583{
584 unsigned long nr_pages = page_counter_read(&memcg->memory);
585 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
586 unsigned long excess = 0;
587
588 if (nr_pages > soft_limit)
589 excess = nr_pages - soft_limit;
590
591 return excess;
592}
593
594static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
595{
596 unsigned long excess;
597 struct mem_cgroup_per_node *mz;
598 struct mem_cgroup_tree_per_node *mctz;
599
600 mctz = soft_limit_tree_from_page(page);
601 if (!mctz)
602 return;
603
604
605
606
607 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
608 mz = mem_cgroup_page_nodeinfo(memcg, page);
609 excess = soft_limit_excess(memcg);
610
611
612
613
614 if (excess || mz->on_tree) {
615 unsigned long flags;
616
617 spin_lock_irqsave(&mctz->lock, flags);
618
619 if (mz->on_tree)
620 __mem_cgroup_remove_exceeded(mz, mctz);
621
622
623
624
625 __mem_cgroup_insert_exceeded(mz, mctz, excess);
626 spin_unlock_irqrestore(&mctz->lock, flags);
627 }
628 }
629}
630
631static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
632{
633 struct mem_cgroup_tree_per_node *mctz;
634 struct mem_cgroup_per_node *mz;
635 int nid;
636
637 for_each_node(nid) {
638 mz = mem_cgroup_nodeinfo(memcg, nid);
639 mctz = soft_limit_tree_node(nid);
640 if (mctz)
641 mem_cgroup_remove_exceeded(mz, mctz);
642 }
643}
644
645static struct mem_cgroup_per_node *
646__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
647{
648 struct mem_cgroup_per_node *mz;
649
650retry:
651 mz = NULL;
652 if (!mctz->rb_rightmost)
653 goto done;
654
655 mz = rb_entry(mctz->rb_rightmost,
656 struct mem_cgroup_per_node, tree_node);
657
658
659
660
661
662 __mem_cgroup_remove_exceeded(mz, mctz);
663 if (!soft_limit_excess(mz->memcg) ||
664 !css_tryget_online(&mz->memcg->css))
665 goto retry;
666done:
667 return mz;
668}
669
670static struct mem_cgroup_per_node *
671mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
672{
673 struct mem_cgroup_per_node *mz;
674
675 spin_lock_irq(&mctz->lock);
676 mz = __mem_cgroup_largest_soft_limit_node(mctz);
677 spin_unlock_irq(&mctz->lock);
678 return mz;
679}
680
681
682
683
684
685
686
687void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
688{
689 long x;
690
691 if (mem_cgroup_disabled())
692 return;
693
694 __this_cpu_add(memcg->vmstats_local->stat[idx], val);
695
696 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
697 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
698 struct mem_cgroup *mi;
699
700 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
701 atomic_long_add(x, &mi->vmstats[idx]);
702 x = 0;
703 }
704 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
705}
706
707static struct mem_cgroup_per_node *
708parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
709{
710 struct mem_cgroup *parent;
711
712 parent = parent_mem_cgroup(pn->memcg);
713 if (!parent)
714 return NULL;
715 return mem_cgroup_nodeinfo(parent, nid);
716}
717
718
719
720
721
722
723
724
725
726
727
728void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
729 int val)
730{
731 pg_data_t *pgdat = lruvec_pgdat(lruvec);
732 struct mem_cgroup_per_node *pn;
733 struct mem_cgroup *memcg;
734 long x;
735
736
737 __mod_node_page_state(pgdat, idx, val);
738
739 if (mem_cgroup_disabled())
740 return;
741
742 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
743 memcg = pn->memcg;
744
745
746 __mod_memcg_state(memcg, idx, val);
747
748
749 __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
750
751 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
752 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
753 struct mem_cgroup_per_node *pi;
754
755 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
756 atomic_long_add(x, &pi->lruvec_stat[idx]);
757 x = 0;
758 }
759 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
760}
761
762
763
764
765
766
767
768void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
769 unsigned long count)
770{
771 unsigned long x;
772
773 if (mem_cgroup_disabled())
774 return;
775
776 __this_cpu_add(memcg->vmstats_local->events[idx], count);
777
778 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
779 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
780 struct mem_cgroup *mi;
781
782 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
783 atomic_long_add(x, &mi->vmevents[idx]);
784 x = 0;
785 }
786 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
787}
788
789static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
790{
791 return atomic_long_read(&memcg->vmevents[event]);
792}
793
794static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
795{
796 long x = 0;
797 int cpu;
798
799 for_each_possible_cpu(cpu)
800 x += per_cpu(memcg->vmstats_local->events[event], cpu);
801 return x;
802}
803
804static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
805 struct page *page,
806 bool compound, int nr_pages)
807{
808
809
810
811
812 if (PageAnon(page))
813 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
814 else {
815 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
816 if (PageSwapBacked(page))
817 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
818 }
819
820 if (compound) {
821 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
822 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
823 }
824
825
826 if (nr_pages > 0)
827 __count_memcg_events(memcg, PGPGIN, 1);
828 else {
829 __count_memcg_events(memcg, PGPGOUT, 1);
830 nr_pages = -nr_pages;
831 }
832
833 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
834}
835
836static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
837 enum mem_cgroup_events_target target)
838{
839 unsigned long val, next;
840
841 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
842 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
843
844 if ((long)(next - val) < 0) {
845 switch (target) {
846 case MEM_CGROUP_TARGET_THRESH:
847 next = val + THRESHOLDS_EVENTS_TARGET;
848 break;
849 case MEM_CGROUP_TARGET_SOFTLIMIT:
850 next = val + SOFTLIMIT_EVENTS_TARGET;
851 break;
852 case MEM_CGROUP_TARGET_NUMAINFO:
853 next = val + NUMAINFO_EVENTS_TARGET;
854 break;
855 default:
856 break;
857 }
858 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
859 return true;
860 }
861 return false;
862}
863
864
865
866
867
868static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
869{
870
871 if (unlikely(mem_cgroup_event_ratelimit(memcg,
872 MEM_CGROUP_TARGET_THRESH))) {
873 bool do_softlimit;
874 bool do_numainfo __maybe_unused;
875
876 do_softlimit = mem_cgroup_event_ratelimit(memcg,
877 MEM_CGROUP_TARGET_SOFTLIMIT);
878#if MAX_NUMNODES > 1
879 do_numainfo = mem_cgroup_event_ratelimit(memcg,
880 MEM_CGROUP_TARGET_NUMAINFO);
881#endif
882 mem_cgroup_threshold(memcg);
883 if (unlikely(do_softlimit))
884 mem_cgroup_update_tree(memcg, page);
885#if MAX_NUMNODES > 1
886 if (unlikely(do_numainfo))
887 atomic_inc(&memcg->numainfo_events);
888#endif
889 }
890}
891
892struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
893{
894
895
896
897
898
899 if (unlikely(!p))
900 return NULL;
901
902 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
903}
904EXPORT_SYMBOL(mem_cgroup_from_task);
905
906
907
908
909
910
911
912
913
914struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
915{
916 struct mem_cgroup *memcg;
917
918 if (mem_cgroup_disabled())
919 return NULL;
920
921 rcu_read_lock();
922 do {
923
924
925
926
927
928 if (unlikely(!mm))
929 memcg = root_mem_cgroup;
930 else {
931 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
932 if (unlikely(!memcg))
933 memcg = root_mem_cgroup;
934 }
935 } while (!css_tryget_online(&memcg->css));
936 rcu_read_unlock();
937 return memcg;
938}
939EXPORT_SYMBOL(get_mem_cgroup_from_mm);
940
941
942
943
944
945
946
947
948struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
949{
950 struct mem_cgroup *memcg = page->mem_cgroup;
951
952 if (mem_cgroup_disabled())
953 return NULL;
954
955 rcu_read_lock();
956 if (!memcg || !css_tryget_online(&memcg->css))
957 memcg = root_mem_cgroup;
958 rcu_read_unlock();
959 return memcg;
960}
961EXPORT_SYMBOL(get_mem_cgroup_from_page);
962
963
964
965
966static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
967{
968 if (unlikely(current->active_memcg)) {
969 struct mem_cgroup *memcg = root_mem_cgroup;
970
971 rcu_read_lock();
972 if (css_tryget_online(¤t->active_memcg->css))
973 memcg = current->active_memcg;
974 rcu_read_unlock();
975 return memcg;
976 }
977 return get_mem_cgroup_from_mm(current->mm);
978}
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
998 struct mem_cgroup *prev,
999 struct mem_cgroup_reclaim_cookie *reclaim)
1000{
1001 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1002 struct cgroup_subsys_state *css = NULL;
1003 struct mem_cgroup *memcg = NULL;
1004 struct mem_cgroup *pos = NULL;
1005
1006 if (mem_cgroup_disabled())
1007 return NULL;
1008
1009 if (!root)
1010 root = root_mem_cgroup;
1011
1012 if (prev && !reclaim)
1013 pos = prev;
1014
1015 if (!root->use_hierarchy && root != root_mem_cgroup) {
1016 if (prev)
1017 goto out;
1018 return root;
1019 }
1020
1021 rcu_read_lock();
1022
1023 if (reclaim) {
1024 struct mem_cgroup_per_node *mz;
1025
1026 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1027 iter = &mz->iter[reclaim->priority];
1028
1029 if (prev && reclaim->generation != iter->generation)
1030 goto out_unlock;
1031
1032 while (1) {
1033 pos = READ_ONCE(iter->position);
1034 if (!pos || css_tryget(&pos->css))
1035 break;
1036
1037
1038
1039
1040
1041
1042
1043
1044 (void)cmpxchg(&iter->position, pos, NULL);
1045 }
1046 }
1047
1048 if (pos)
1049 css = &pos->css;
1050
1051 for (;;) {
1052 css = css_next_descendant_pre(css, &root->css);
1053 if (!css) {
1054
1055
1056
1057
1058
1059
1060 if (!prev)
1061 continue;
1062 break;
1063 }
1064
1065
1066
1067
1068
1069
1070 memcg = mem_cgroup_from_css(css);
1071
1072 if (css == &root->css)
1073 break;
1074
1075 if (css_tryget(css))
1076 break;
1077
1078 memcg = NULL;
1079 }
1080
1081 if (reclaim) {
1082
1083
1084
1085
1086
1087 (void)cmpxchg(&iter->position, pos, memcg);
1088
1089 if (pos)
1090 css_put(&pos->css);
1091
1092 if (!memcg)
1093 iter->generation++;
1094 else if (!prev)
1095 reclaim->generation = iter->generation;
1096 }
1097
1098out_unlock:
1099 rcu_read_unlock();
1100out:
1101 if (prev && prev != root)
1102 css_put(&prev->css);
1103
1104 return memcg;
1105}
1106
1107
1108
1109
1110
1111
1112void mem_cgroup_iter_break(struct mem_cgroup *root,
1113 struct mem_cgroup *prev)
1114{
1115 if (!root)
1116 root = root_mem_cgroup;
1117 if (prev && prev != root)
1118 css_put(&prev->css);
1119}
1120
1121static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1122{
1123 struct mem_cgroup *memcg = dead_memcg;
1124 struct mem_cgroup_reclaim_iter *iter;
1125 struct mem_cgroup_per_node *mz;
1126 int nid;
1127 int i;
1128
1129 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1130 for_each_node(nid) {
1131 mz = mem_cgroup_nodeinfo(memcg, nid);
1132 for (i = 0; i <= DEF_PRIORITY; i++) {
1133 iter = &mz->iter[i];
1134 cmpxchg(&iter->position,
1135 dead_memcg, NULL);
1136 }
1137 }
1138 }
1139}
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1155 int (*fn)(struct task_struct *, void *), void *arg)
1156{
1157 struct mem_cgroup *iter;
1158 int ret = 0;
1159
1160 BUG_ON(memcg == root_mem_cgroup);
1161
1162 for_each_mem_cgroup_tree(iter, memcg) {
1163 struct css_task_iter it;
1164 struct task_struct *task;
1165
1166 css_task_iter_start(&iter->css, 0, &it);
1167 while (!ret && (task = css_task_iter_next(&it)))
1168 ret = fn(task, arg);
1169 css_task_iter_end(&it);
1170 if (ret) {
1171 mem_cgroup_iter_break(memcg, iter);
1172 break;
1173 }
1174 }
1175 return ret;
1176}
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1188{
1189 struct mem_cgroup_per_node *mz;
1190 struct mem_cgroup *memcg;
1191 struct lruvec *lruvec;
1192
1193 if (mem_cgroup_disabled()) {
1194 lruvec = &pgdat->lruvec;
1195 goto out;
1196 }
1197
1198 memcg = page->mem_cgroup;
1199
1200
1201
1202
1203 if (!memcg)
1204 memcg = root_mem_cgroup;
1205
1206 mz = mem_cgroup_page_nodeinfo(memcg, page);
1207 lruvec = &mz->lruvec;
1208out:
1209
1210
1211
1212
1213
1214 if (unlikely(lruvec->pgdat != pgdat))
1215 lruvec->pgdat = pgdat;
1216 return lruvec;
1217}
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1231 int zid, int nr_pages)
1232{
1233 struct mem_cgroup_per_node *mz;
1234 unsigned long *lru_size;
1235 long size;
1236
1237 if (mem_cgroup_disabled())
1238 return;
1239
1240 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1241 lru_size = &mz->lru_zone_size[zid][lru];
1242
1243 if (nr_pages < 0)
1244 *lru_size += nr_pages;
1245
1246 size = *lru_size;
1247 if (WARN_ONCE(size < 0,
1248 "%s(%p, %d, %d): lru_size %ld\n",
1249 __func__, lruvec, lru, nr_pages, size)) {
1250 VM_BUG_ON(1);
1251 *lru_size = 0;
1252 }
1253
1254 if (nr_pages > 0)
1255 *lru_size += nr_pages;
1256}
1257
1258bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1259{
1260 struct mem_cgroup *task_memcg;
1261 struct task_struct *p;
1262 bool ret;
1263
1264 p = find_lock_task_mm(task);
1265 if (p) {
1266 task_memcg = get_mem_cgroup_from_mm(p->mm);
1267 task_unlock(p);
1268 } else {
1269
1270
1271
1272
1273
1274 rcu_read_lock();
1275 task_memcg = mem_cgroup_from_task(task);
1276 css_get(&task_memcg->css);
1277 rcu_read_unlock();
1278 }
1279 ret = mem_cgroup_is_descendant(task_memcg, memcg);
1280 css_put(&task_memcg->css);
1281 return ret;
1282}
1283
1284
1285
1286
1287
1288
1289
1290
1291static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1292{
1293 unsigned long margin = 0;
1294 unsigned long count;
1295 unsigned long limit;
1296
1297 count = page_counter_read(&memcg->memory);
1298 limit = READ_ONCE(memcg->memory.max);
1299 if (count < limit)
1300 margin = limit - count;
1301
1302 if (do_memsw_account()) {
1303 count = page_counter_read(&memcg->memsw);
1304 limit = READ_ONCE(memcg->memsw.max);
1305 if (count <= limit)
1306 margin = min(margin, limit - count);
1307 else
1308 margin = 0;
1309 }
1310
1311 return margin;
1312}
1313
1314
1315
1316
1317
1318
1319
1320
1321static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1322{
1323 struct mem_cgroup *from;
1324 struct mem_cgroup *to;
1325 bool ret = false;
1326
1327
1328
1329
1330 spin_lock(&mc.lock);
1331 from = mc.from;
1332 to = mc.to;
1333 if (!from)
1334 goto unlock;
1335
1336 ret = mem_cgroup_is_descendant(from, memcg) ||
1337 mem_cgroup_is_descendant(to, memcg);
1338unlock:
1339 spin_unlock(&mc.lock);
1340 return ret;
1341}
1342
1343static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1344{
1345 if (mc.moving_task && current != mc.moving_task) {
1346 if (mem_cgroup_under_move(memcg)) {
1347 DEFINE_WAIT(wait);
1348 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1349
1350 if (mc.moving_task)
1351 schedule();
1352 finish_wait(&mc.waitq, &wait);
1353 return true;
1354 }
1355 }
1356 return false;
1357}
1358
1359static const unsigned int memcg1_stats[] = {
1360 MEMCG_CACHE,
1361 MEMCG_RSS,
1362 MEMCG_RSS_HUGE,
1363 NR_SHMEM,
1364 NR_FILE_MAPPED,
1365 NR_FILE_DIRTY,
1366 NR_WRITEBACK,
1367 MEMCG_SWAP,
1368};
1369
1370static const char *const memcg1_stat_names[] = {
1371 "cache",
1372 "rss",
1373 "rss_huge",
1374 "shmem",
1375 "mapped_file",
1376 "dirty",
1377 "writeback",
1378 "swap",
1379};
1380
1381#define K(x) ((x) << (PAGE_SHIFT-10))
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1392{
1393 rcu_read_lock();
1394
1395 if (memcg) {
1396 pr_cont(",oom_memcg=");
1397 pr_cont_cgroup_path(memcg->css.cgroup);
1398 } else
1399 pr_cont(",global_oom");
1400 if (p) {
1401 pr_cont(",task_memcg=");
1402 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1403 }
1404 rcu_read_unlock();
1405}
1406
1407
1408
1409
1410
1411
1412void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1413{
1414 struct mem_cgroup *iter;
1415 unsigned int i;
1416
1417 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1418 K((u64)page_counter_read(&memcg->memory)),
1419 K((u64)memcg->memory.max), memcg->memory.failcnt);
1420 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1421 K((u64)page_counter_read(&memcg->memsw)),
1422 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1423 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1424 K((u64)page_counter_read(&memcg->kmem)),
1425 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1426
1427 for_each_mem_cgroup_tree(iter, memcg) {
1428 pr_info("Memory cgroup stats for ");
1429 pr_cont_cgroup_path(iter->css.cgroup);
1430 pr_cont(":");
1431
1432 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1433 if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1434 continue;
1435 pr_cont(" %s:%luKB", memcg1_stat_names[i],
1436 K(memcg_page_state_local(iter,
1437 memcg1_stats[i])));
1438 }
1439
1440 for (i = 0; i < NR_LRU_LISTS; i++)
1441 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1442 K(memcg_page_state_local(iter,
1443 NR_LRU_BASE + i)));
1444
1445 pr_cont("\n");
1446 }
1447}
1448
1449
1450
1451
1452unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1453{
1454 unsigned long max;
1455
1456 max = memcg->memory.max;
1457 if (mem_cgroup_swappiness(memcg)) {
1458 unsigned long memsw_max;
1459 unsigned long swap_max;
1460
1461 memsw_max = memcg->memsw.max;
1462 swap_max = memcg->swap.max;
1463 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1464 max = min(max + swap_max, memsw_max);
1465 }
1466 return max;
1467}
1468
1469static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1470 int order)
1471{
1472 struct oom_control oc = {
1473 .zonelist = NULL,
1474 .nodemask = NULL,
1475 .memcg = memcg,
1476 .gfp_mask = gfp_mask,
1477 .order = order,
1478 };
1479 bool ret;
1480
1481 if (mutex_lock_killable(&oom_lock))
1482 return true;
1483
1484
1485
1486
1487 ret = should_force_charge() || out_of_memory(&oc);
1488 mutex_unlock(&oom_lock);
1489 return ret;
1490}
1491
1492#if MAX_NUMNODES > 1
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1505 int nid, bool noswap)
1506{
1507 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
1508
1509 if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
1510 lruvec_page_state(lruvec, NR_ACTIVE_FILE))
1511 return true;
1512 if (noswap || !total_swap_pages)
1513 return false;
1514 if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
1515 lruvec_page_state(lruvec, NR_ACTIVE_ANON))
1516 return true;
1517 return false;
1518
1519}
1520
1521
1522
1523
1524
1525
1526
1527static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1528{
1529 int nid;
1530
1531
1532
1533
1534 if (!atomic_read(&memcg->numainfo_events))
1535 return;
1536 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1537 return;
1538
1539
1540 memcg->scan_nodes = node_states[N_MEMORY];
1541
1542 for_each_node_mask(nid, node_states[N_MEMORY]) {
1543
1544 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1545 node_clear(nid, memcg->scan_nodes);
1546 }
1547
1548 atomic_set(&memcg->numainfo_events, 0);
1549 atomic_set(&memcg->numainfo_updating, 0);
1550}
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1565{
1566 int node;
1567
1568 mem_cgroup_may_update_nodemask(memcg);
1569 node = memcg->last_scanned_node;
1570
1571 node = next_node_in(node, memcg->scan_nodes);
1572
1573
1574
1575
1576
1577 if (unlikely(node == MAX_NUMNODES))
1578 node = numa_node_id();
1579
1580 memcg->last_scanned_node = node;
1581 return node;
1582}
1583#else
1584int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1585{
1586 return 0;
1587}
1588#endif
1589
1590static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1591 pg_data_t *pgdat,
1592 gfp_t gfp_mask,
1593 unsigned long *total_scanned)
1594{
1595 struct mem_cgroup *victim = NULL;
1596 int total = 0;
1597 int loop = 0;
1598 unsigned long excess;
1599 unsigned long nr_scanned;
1600 struct mem_cgroup_reclaim_cookie reclaim = {
1601 .pgdat = pgdat,
1602 .priority = 0,
1603 };
1604
1605 excess = soft_limit_excess(root_memcg);
1606
1607 while (1) {
1608 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1609 if (!victim) {
1610 loop++;
1611 if (loop >= 2) {
1612
1613
1614
1615
1616
1617 if (!total)
1618 break;
1619
1620
1621
1622
1623
1624
1625 if (total >= (excess >> 2) ||
1626 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1627 break;
1628 }
1629 continue;
1630 }
1631 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1632 pgdat, &nr_scanned);
1633 *total_scanned += nr_scanned;
1634 if (!soft_limit_excess(root_memcg))
1635 break;
1636 }
1637 mem_cgroup_iter_break(root_memcg, victim);
1638 return total;
1639}
1640
1641#ifdef CONFIG_LOCKDEP
1642static struct lockdep_map memcg_oom_lock_dep_map = {
1643 .name = "memcg_oom_lock",
1644};
1645#endif
1646
1647static DEFINE_SPINLOCK(memcg_oom_lock);
1648
1649
1650
1651
1652
1653static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1654{
1655 struct mem_cgroup *iter, *failed = NULL;
1656
1657 spin_lock(&memcg_oom_lock);
1658
1659 for_each_mem_cgroup_tree(iter, memcg) {
1660 if (iter->oom_lock) {
1661
1662
1663
1664
1665 failed = iter;
1666 mem_cgroup_iter_break(memcg, iter);
1667 break;
1668 } else
1669 iter->oom_lock = true;
1670 }
1671
1672 if (failed) {
1673
1674
1675
1676
1677 for_each_mem_cgroup_tree(iter, memcg) {
1678 if (iter == failed) {
1679 mem_cgroup_iter_break(memcg, iter);
1680 break;
1681 }
1682 iter->oom_lock = false;
1683 }
1684 } else
1685 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1686
1687 spin_unlock(&memcg_oom_lock);
1688
1689 return !failed;
1690}
1691
1692static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1693{
1694 struct mem_cgroup *iter;
1695
1696 spin_lock(&memcg_oom_lock);
1697 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1698 for_each_mem_cgroup_tree(iter, memcg)
1699 iter->oom_lock = false;
1700 spin_unlock(&memcg_oom_lock);
1701}
1702
1703static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1704{
1705 struct mem_cgroup *iter;
1706
1707 spin_lock(&memcg_oom_lock);
1708 for_each_mem_cgroup_tree(iter, memcg)
1709 iter->under_oom++;
1710 spin_unlock(&memcg_oom_lock);
1711}
1712
1713static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1714{
1715 struct mem_cgroup *iter;
1716
1717
1718
1719
1720
1721 spin_lock(&memcg_oom_lock);
1722 for_each_mem_cgroup_tree(iter, memcg)
1723 if (iter->under_oom > 0)
1724 iter->under_oom--;
1725 spin_unlock(&memcg_oom_lock);
1726}
1727
1728static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1729
1730struct oom_wait_info {
1731 struct mem_cgroup *memcg;
1732 wait_queue_entry_t wait;
1733};
1734
1735static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1736 unsigned mode, int sync, void *arg)
1737{
1738 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1739 struct mem_cgroup *oom_wait_memcg;
1740 struct oom_wait_info *oom_wait_info;
1741
1742 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1743 oom_wait_memcg = oom_wait_info->memcg;
1744
1745 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1746 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1747 return 0;
1748 return autoremove_wake_function(wait, mode, sync, arg);
1749}
1750
1751static void memcg_oom_recover(struct mem_cgroup *memcg)
1752{
1753
1754
1755
1756
1757
1758
1759
1760
1761 if (memcg && memcg->under_oom)
1762 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1763}
1764
1765enum oom_status {
1766 OOM_SUCCESS,
1767 OOM_FAILED,
1768 OOM_ASYNC,
1769 OOM_SKIPPED
1770};
1771
1772static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1773{
1774 enum oom_status ret;
1775 bool locked;
1776
1777 if (order > PAGE_ALLOC_COSTLY_ORDER)
1778 return OOM_SKIPPED;
1779
1780 memcg_memory_event(memcg, MEMCG_OOM);
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800 if (memcg->oom_kill_disable) {
1801 if (!current->in_user_fault)
1802 return OOM_SKIPPED;
1803 css_get(&memcg->css);
1804 current->memcg_in_oom = memcg;
1805 current->memcg_oom_gfp_mask = mask;
1806 current->memcg_oom_order = order;
1807
1808 return OOM_ASYNC;
1809 }
1810
1811 mem_cgroup_mark_under_oom(memcg);
1812
1813 locked = mem_cgroup_oom_trylock(memcg);
1814
1815 if (locked)
1816 mem_cgroup_oom_notify(memcg);
1817
1818 mem_cgroup_unmark_under_oom(memcg);
1819 if (mem_cgroup_out_of_memory(memcg, mask, order))
1820 ret = OOM_SUCCESS;
1821 else
1822 ret = OOM_FAILED;
1823
1824 if (locked)
1825 mem_cgroup_oom_unlock(memcg);
1826
1827 return ret;
1828}
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847bool mem_cgroup_oom_synchronize(bool handle)
1848{
1849 struct mem_cgroup *memcg = current->memcg_in_oom;
1850 struct oom_wait_info owait;
1851 bool locked;
1852
1853
1854 if (!memcg)
1855 return false;
1856
1857 if (!handle)
1858 goto cleanup;
1859
1860 owait.memcg = memcg;
1861 owait.wait.flags = 0;
1862 owait.wait.func = memcg_oom_wake_function;
1863 owait.wait.private = current;
1864 INIT_LIST_HEAD(&owait.wait.entry);
1865
1866 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1867 mem_cgroup_mark_under_oom(memcg);
1868
1869 locked = mem_cgroup_oom_trylock(memcg);
1870
1871 if (locked)
1872 mem_cgroup_oom_notify(memcg);
1873
1874 if (locked && !memcg->oom_kill_disable) {
1875 mem_cgroup_unmark_under_oom(memcg);
1876 finish_wait(&memcg_oom_waitq, &owait.wait);
1877 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1878 current->memcg_oom_order);
1879 } else {
1880 schedule();
1881 mem_cgroup_unmark_under_oom(memcg);
1882 finish_wait(&memcg_oom_waitq, &owait.wait);
1883 }
1884
1885 if (locked) {
1886 mem_cgroup_oom_unlock(memcg);
1887
1888
1889
1890
1891
1892 memcg_oom_recover(memcg);
1893 }
1894cleanup:
1895 current->memcg_in_oom = NULL;
1896 css_put(&memcg->css);
1897 return true;
1898}
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1911 struct mem_cgroup *oom_domain)
1912{
1913 struct mem_cgroup *oom_group = NULL;
1914 struct mem_cgroup *memcg;
1915
1916 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1917 return NULL;
1918
1919 if (!oom_domain)
1920 oom_domain = root_mem_cgroup;
1921
1922 rcu_read_lock();
1923
1924 memcg = mem_cgroup_from_task(victim);
1925 if (memcg == root_mem_cgroup)
1926 goto out;
1927
1928
1929
1930
1931
1932
1933 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1934 if (memcg->oom_group)
1935 oom_group = memcg;
1936
1937 if (memcg == oom_domain)
1938 break;
1939 }
1940
1941 if (oom_group)
1942 css_get(&oom_group->css);
1943out:
1944 rcu_read_unlock();
1945
1946 return oom_group;
1947}
1948
1949void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1950{
1951 pr_info("Tasks in ");
1952 pr_cont_cgroup_path(memcg->css.cgroup);
1953 pr_cont(" are going to be killed due to memory.oom.group set\n");
1954}
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967struct mem_cgroup *lock_page_memcg(struct page *page)
1968{
1969 struct mem_cgroup *memcg;
1970 unsigned long flags;
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983 rcu_read_lock();
1984
1985 if (mem_cgroup_disabled())
1986 return NULL;
1987again:
1988 memcg = page->mem_cgroup;
1989 if (unlikely(!memcg))
1990 return NULL;
1991
1992 if (atomic_read(&memcg->moving_account) <= 0)
1993 return memcg;
1994
1995 spin_lock_irqsave(&memcg->move_lock, flags);
1996 if (memcg != page->mem_cgroup) {
1997 spin_unlock_irqrestore(&memcg->move_lock, flags);
1998 goto again;
1999 }
2000
2001
2002
2003
2004
2005
2006 memcg->move_lock_task = current;
2007 memcg->move_lock_flags = flags;
2008
2009 return memcg;
2010}
2011EXPORT_SYMBOL(lock_page_memcg);
2012
2013
2014
2015
2016
2017
2018
2019void __unlock_page_memcg(struct mem_cgroup *memcg)
2020{
2021 if (memcg && memcg->move_lock_task == current) {
2022 unsigned long flags = memcg->move_lock_flags;
2023
2024 memcg->move_lock_task = NULL;
2025 memcg->move_lock_flags = 0;
2026
2027 spin_unlock_irqrestore(&memcg->move_lock, flags);
2028 }
2029
2030 rcu_read_unlock();
2031}
2032
2033
2034
2035
2036
2037void unlock_page_memcg(struct page *page)
2038{
2039 __unlock_page_memcg(page->mem_cgroup);
2040}
2041EXPORT_SYMBOL(unlock_page_memcg);
2042
2043struct memcg_stock_pcp {
2044 struct mem_cgroup *cached;
2045 unsigned int nr_pages;
2046 struct work_struct work;
2047 unsigned long flags;
2048#define FLUSHING_CACHED_CHARGE 0
2049};
2050static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2051static DEFINE_MUTEX(percpu_charge_mutex);
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2065{
2066 struct memcg_stock_pcp *stock;
2067 unsigned long flags;
2068 bool ret = false;
2069
2070 if (nr_pages > MEMCG_CHARGE_BATCH)
2071 return ret;
2072
2073 local_irq_save(flags);
2074
2075 stock = this_cpu_ptr(&memcg_stock);
2076 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2077 stock->nr_pages -= nr_pages;
2078 ret = true;
2079 }
2080
2081 local_irq_restore(flags);
2082
2083 return ret;
2084}
2085
2086
2087
2088
2089static void drain_stock(struct memcg_stock_pcp *stock)
2090{
2091 struct mem_cgroup *old = stock->cached;
2092
2093 if (stock->nr_pages) {
2094 page_counter_uncharge(&old->memory, stock->nr_pages);
2095 if (do_memsw_account())
2096 page_counter_uncharge(&old->memsw, stock->nr_pages);
2097 css_put_many(&old->css, stock->nr_pages);
2098 stock->nr_pages = 0;
2099 }
2100 stock->cached = NULL;
2101}
2102
2103static void drain_local_stock(struct work_struct *dummy)
2104{
2105 struct memcg_stock_pcp *stock;
2106 unsigned long flags;
2107
2108
2109
2110
2111
2112 local_irq_save(flags);
2113
2114 stock = this_cpu_ptr(&memcg_stock);
2115 drain_stock(stock);
2116 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2117
2118 local_irq_restore(flags);
2119}
2120
2121
2122
2123
2124
2125static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2126{
2127 struct memcg_stock_pcp *stock;
2128 unsigned long flags;
2129
2130 local_irq_save(flags);
2131
2132 stock = this_cpu_ptr(&memcg_stock);
2133 if (stock->cached != memcg) {
2134 drain_stock(stock);
2135 stock->cached = memcg;
2136 }
2137 stock->nr_pages += nr_pages;
2138
2139 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2140 drain_stock(stock);
2141
2142 local_irq_restore(flags);
2143}
2144
2145
2146
2147
2148
2149static void drain_all_stock(struct mem_cgroup *root_memcg)
2150{
2151 int cpu, curcpu;
2152
2153
2154 if (!mutex_trylock(&percpu_charge_mutex))
2155 return;
2156
2157
2158
2159
2160
2161
2162 curcpu = get_cpu();
2163 for_each_online_cpu(cpu) {
2164 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2165 struct mem_cgroup *memcg;
2166
2167 memcg = stock->cached;
2168 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2169 continue;
2170 if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2171 css_put(&memcg->css);
2172 continue;
2173 }
2174 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2175 if (cpu == curcpu)
2176 drain_local_stock(&stock->work);
2177 else
2178 schedule_work_on(cpu, &stock->work);
2179 }
2180 css_put(&memcg->css);
2181 }
2182 put_cpu();
2183 mutex_unlock(&percpu_charge_mutex);
2184}
2185
2186static int memcg_hotplug_cpu_dead(unsigned int cpu)
2187{
2188 struct memcg_stock_pcp *stock;
2189 struct mem_cgroup *memcg, *mi;
2190
2191 stock = &per_cpu(memcg_stock, cpu);
2192 drain_stock(stock);
2193
2194 for_each_mem_cgroup(memcg) {
2195 int i;
2196
2197 for (i = 0; i < MEMCG_NR_STAT; i++) {
2198 int nid;
2199 long x;
2200
2201 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2202 if (x)
2203 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2204 atomic_long_add(x, &memcg->vmstats[i]);
2205
2206 if (i >= NR_VM_NODE_STAT_ITEMS)
2207 continue;
2208
2209 for_each_node(nid) {
2210 struct mem_cgroup_per_node *pn;
2211
2212 pn = mem_cgroup_nodeinfo(memcg, nid);
2213 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2214 if (x)
2215 do {
2216 atomic_long_add(x, &pn->lruvec_stat[i]);
2217 } while ((pn = parent_nodeinfo(pn, nid)));
2218 }
2219 }
2220
2221 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2222 long x;
2223
2224 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2225 if (x)
2226 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2227 atomic_long_add(x, &memcg->vmevents[i]);
2228 }
2229 }
2230
2231 return 0;
2232}
2233
2234static void reclaim_high(struct mem_cgroup *memcg,
2235 unsigned int nr_pages,
2236 gfp_t gfp_mask)
2237{
2238 do {
2239 if (page_counter_read(&memcg->memory) <= memcg->high)
2240 continue;
2241 memcg_memory_event(memcg, MEMCG_HIGH);
2242 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2243 } while ((memcg = parent_mem_cgroup(memcg)));
2244}
2245
2246static void high_work_func(struct work_struct *work)
2247{
2248 struct mem_cgroup *memcg;
2249
2250 memcg = container_of(work, struct mem_cgroup, high_work);
2251 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2252}
2253
2254
2255
2256
2257
2258void mem_cgroup_handle_over_high(void)
2259{
2260 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2261 struct mem_cgroup *memcg;
2262
2263 if (likely(!nr_pages))
2264 return;
2265
2266 memcg = get_mem_cgroup_from_mm(current->mm);
2267 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2268 css_put(&memcg->css);
2269 current->memcg_nr_pages_over_high = 0;
2270}
2271
2272static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2273 unsigned int nr_pages)
2274{
2275 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2276 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2277 struct mem_cgroup *mem_over_limit;
2278 struct page_counter *counter;
2279 unsigned long nr_reclaimed;
2280 bool may_swap = true;
2281 bool drained = false;
2282 bool oomed = false;
2283 enum oom_status oom_status;
2284
2285 if (mem_cgroup_is_root(memcg))
2286 return 0;
2287retry:
2288 if (consume_stock(memcg, nr_pages))
2289 return 0;
2290
2291 if (!do_memsw_account() ||
2292 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2293 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2294 goto done_restock;
2295 if (do_memsw_account())
2296 page_counter_uncharge(&memcg->memsw, batch);
2297 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2298 } else {
2299 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2300 may_swap = false;
2301 }
2302
2303 if (batch > nr_pages) {
2304 batch = nr_pages;
2305 goto retry;
2306 }
2307
2308
2309
2310
2311
2312
2313
2314 if (unlikely(should_force_charge()))
2315 goto force;
2316
2317
2318
2319
2320
2321
2322
2323 if (unlikely(current->flags & PF_MEMALLOC))
2324 goto force;
2325
2326 if (unlikely(task_in_memcg_oom(current)))
2327 goto nomem;
2328
2329 if (!gfpflags_allow_blocking(gfp_mask))
2330 goto nomem;
2331
2332 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2333
2334 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2335 gfp_mask, may_swap);
2336
2337 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2338 goto retry;
2339
2340 if (!drained) {
2341 drain_all_stock(mem_over_limit);
2342 drained = true;
2343 goto retry;
2344 }
2345
2346 if (gfp_mask & __GFP_NORETRY)
2347 goto nomem;
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2358 goto retry;
2359
2360
2361
2362
2363 if (mem_cgroup_wait_acct_move(mem_over_limit))
2364 goto retry;
2365
2366 if (nr_retries--)
2367 goto retry;
2368
2369 if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2370 goto nomem;
2371
2372 if (gfp_mask & __GFP_NOFAIL)
2373 goto force;
2374
2375 if (fatal_signal_pending(current))
2376 goto force;
2377
2378
2379
2380
2381
2382
2383 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2384 get_order(nr_pages * PAGE_SIZE));
2385 switch (oom_status) {
2386 case OOM_SUCCESS:
2387 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2388 oomed = true;
2389 goto retry;
2390 case OOM_FAILED:
2391 goto force;
2392 default:
2393 goto nomem;
2394 }
2395nomem:
2396 if (!(gfp_mask & __GFP_NOFAIL))
2397 return -ENOMEM;
2398force:
2399
2400
2401
2402
2403
2404 page_counter_charge(&memcg->memory, nr_pages);
2405 if (do_memsw_account())
2406 page_counter_charge(&memcg->memsw, nr_pages);
2407 css_get_many(&memcg->css, nr_pages);
2408
2409 return 0;
2410
2411done_restock:
2412 css_get_many(&memcg->css, batch);
2413 if (batch > nr_pages)
2414 refill_stock(memcg, batch - nr_pages);
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425 do {
2426 if (page_counter_read(&memcg->memory) > memcg->high) {
2427
2428 if (in_interrupt()) {
2429 schedule_work(&memcg->high_work);
2430 break;
2431 }
2432 current->memcg_nr_pages_over_high += batch;
2433 set_notify_resume(current);
2434 break;
2435 }
2436 } while ((memcg = parent_mem_cgroup(memcg)));
2437
2438 return 0;
2439}
2440
2441static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2442{
2443 if (mem_cgroup_is_root(memcg))
2444 return;
2445
2446 page_counter_uncharge(&memcg->memory, nr_pages);
2447 if (do_memsw_account())
2448 page_counter_uncharge(&memcg->memsw, nr_pages);
2449
2450 css_put_many(&memcg->css, nr_pages);
2451}
2452
2453static void lock_page_lru(struct page *page, int *isolated)
2454{
2455 pg_data_t *pgdat = page_pgdat(page);
2456
2457 spin_lock_irq(&pgdat->lru_lock);
2458 if (PageLRU(page)) {
2459 struct lruvec *lruvec;
2460
2461 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2462 ClearPageLRU(page);
2463 del_page_from_lru_list(page, lruvec, page_lru(page));
2464 *isolated = 1;
2465 } else
2466 *isolated = 0;
2467}
2468
2469static void unlock_page_lru(struct page *page, int isolated)
2470{
2471 pg_data_t *pgdat = page_pgdat(page);
2472
2473 if (isolated) {
2474 struct lruvec *lruvec;
2475
2476 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2477 VM_BUG_ON_PAGE(PageLRU(page), page);
2478 SetPageLRU(page);
2479 add_page_to_lru_list(page, lruvec, page_lru(page));
2480 }
2481 spin_unlock_irq(&pgdat->lru_lock);
2482}
2483
2484static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2485 bool lrucare)
2486{
2487 int isolated;
2488
2489 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2490
2491
2492
2493
2494
2495 if (lrucare)
2496 lock_page_lru(page, &isolated);
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512 page->mem_cgroup = memcg;
2513
2514 if (lrucare)
2515 unlock_page_lru(page, isolated);
2516}
2517
2518#ifdef CONFIG_MEMCG_KMEM
2519static int memcg_alloc_cache_id(void)
2520{
2521 int id, size;
2522 int err;
2523
2524 id = ida_simple_get(&memcg_cache_ida,
2525 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2526 if (id < 0)
2527 return id;
2528
2529 if (id < memcg_nr_cache_ids)
2530 return id;
2531
2532
2533
2534
2535
2536 down_write(&memcg_cache_ids_sem);
2537
2538 size = 2 * (id + 1);
2539 if (size < MEMCG_CACHES_MIN_SIZE)
2540 size = MEMCG_CACHES_MIN_SIZE;
2541 else if (size > MEMCG_CACHES_MAX_SIZE)
2542 size = MEMCG_CACHES_MAX_SIZE;
2543
2544 err = memcg_update_all_caches(size);
2545 if (!err)
2546 err = memcg_update_all_list_lrus(size);
2547 if (!err)
2548 memcg_nr_cache_ids = size;
2549
2550 up_write(&memcg_cache_ids_sem);
2551
2552 if (err) {
2553 ida_simple_remove(&memcg_cache_ida, id);
2554 return err;
2555 }
2556 return id;
2557}
2558
2559static void memcg_free_cache_id(int id)
2560{
2561 ida_simple_remove(&memcg_cache_ida, id);
2562}
2563
2564struct memcg_kmem_cache_create_work {
2565 struct mem_cgroup *memcg;
2566 struct kmem_cache *cachep;
2567 struct work_struct work;
2568};
2569
2570static void memcg_kmem_cache_create_func(struct work_struct *w)
2571{
2572 struct memcg_kmem_cache_create_work *cw =
2573 container_of(w, struct memcg_kmem_cache_create_work, work);
2574 struct mem_cgroup *memcg = cw->memcg;
2575 struct kmem_cache *cachep = cw->cachep;
2576
2577 memcg_create_kmem_cache(memcg, cachep);
2578
2579 css_put(&memcg->css);
2580 kfree(cw);
2581}
2582
2583
2584
2585
2586static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2587 struct kmem_cache *cachep)
2588{
2589 struct memcg_kmem_cache_create_work *cw;
2590
2591 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2592 if (!cw)
2593 return;
2594
2595 css_get(&memcg->css);
2596
2597 cw->memcg = memcg;
2598 cw->cachep = cachep;
2599 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2600
2601 queue_work(memcg_kmem_cache_wq, &cw->work);
2602}
2603
2604static inline bool memcg_kmem_bypass(void)
2605{
2606 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2607 return true;
2608 return false;
2609}
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2628{
2629 struct mem_cgroup *memcg;
2630 struct kmem_cache *memcg_cachep;
2631 int kmemcg_id;
2632
2633 VM_BUG_ON(!is_root_cache(cachep));
2634
2635 if (memcg_kmem_bypass())
2636 return cachep;
2637
2638 memcg = get_mem_cgroup_from_current();
2639 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2640 if (kmemcg_id < 0)
2641 goto out;
2642
2643 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2644 if (likely(memcg_cachep))
2645 return memcg_cachep;
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659 memcg_schedule_kmem_cache_create(memcg, cachep);
2660out:
2661 css_put(&memcg->css);
2662 return cachep;
2663}
2664
2665
2666
2667
2668
2669void memcg_kmem_put_cache(struct kmem_cache *cachep)
2670{
2671 if (!is_root_cache(cachep))
2672 css_put(&cachep->memcg_params.memcg->css);
2673}
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2685 struct mem_cgroup *memcg)
2686{
2687 unsigned int nr_pages = 1 << order;
2688 struct page_counter *counter;
2689 int ret;
2690
2691 ret = try_charge(memcg, gfp, nr_pages);
2692 if (ret)
2693 return ret;
2694
2695 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2696 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2697 cancel_charge(memcg, nr_pages);
2698 return -ENOMEM;
2699 }
2700
2701 page->mem_cgroup = memcg;
2702
2703 return 0;
2704}
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2715{
2716 struct mem_cgroup *memcg;
2717 int ret = 0;
2718
2719 if (memcg_kmem_bypass())
2720 return 0;
2721
2722 memcg = get_mem_cgroup_from_current();
2723 if (!mem_cgroup_is_root(memcg)) {
2724 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2725 if (!ret)
2726 __SetPageKmemcg(page);
2727 }
2728 css_put(&memcg->css);
2729 return ret;
2730}
2731
2732
2733
2734
2735
2736void __memcg_kmem_uncharge(struct page *page, int order)
2737{
2738 struct mem_cgroup *memcg = page->mem_cgroup;
2739 unsigned int nr_pages = 1 << order;
2740
2741 if (!memcg)
2742 return;
2743
2744 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2745
2746 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2747 page_counter_uncharge(&memcg->kmem, nr_pages);
2748
2749 page_counter_uncharge(&memcg->memory, nr_pages);
2750 if (do_memsw_account())
2751 page_counter_uncharge(&memcg->memsw, nr_pages);
2752
2753 page->mem_cgroup = NULL;
2754
2755
2756 if (PageKmemcg(page))
2757 __ClearPageKmemcg(page);
2758
2759 css_put_many(&memcg->css, nr_pages);
2760}
2761#endif
2762
2763#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2764
2765
2766
2767
2768
2769void mem_cgroup_split_huge_fixup(struct page *head)
2770{
2771 int i;
2772
2773 if (mem_cgroup_disabled())
2774 return;
2775
2776 for (i = 1; i < HPAGE_PMD_NR; i++)
2777 head[i].mem_cgroup = head->mem_cgroup;
2778
2779 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2780}
2781#endif
2782
2783#ifdef CONFIG_MEMCG_SWAP
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798static int mem_cgroup_move_swap_account(swp_entry_t entry,
2799 struct mem_cgroup *from, struct mem_cgroup *to)
2800{
2801 unsigned short old_id, new_id;
2802
2803 old_id = mem_cgroup_id(from);
2804 new_id = mem_cgroup_id(to);
2805
2806 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2807 mod_memcg_state(from, MEMCG_SWAP, -1);
2808 mod_memcg_state(to, MEMCG_SWAP, 1);
2809 return 0;
2810 }
2811 return -EINVAL;
2812}
2813#else
2814static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2815 struct mem_cgroup *from, struct mem_cgroup *to)
2816{
2817 return -EINVAL;
2818}
2819#endif
2820
2821static DEFINE_MUTEX(memcg_max_mutex);
2822
2823static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2824 unsigned long max, bool memsw)
2825{
2826 bool enlarge = false;
2827 bool drained = false;
2828 int ret;
2829 bool limits_invariant;
2830 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2831
2832 do {
2833 if (signal_pending(current)) {
2834 ret = -EINTR;
2835 break;
2836 }
2837
2838 mutex_lock(&memcg_max_mutex);

		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
2843 limits_invariant = memsw ? max >= memcg->memory.max :
2844 max <= memcg->memsw.max;
2845 if (!limits_invariant) {
2846 mutex_unlock(&memcg_max_mutex);
2847 ret = -EINVAL;
2848 break;
2849 }
2850 if (max > counter->max)
2851 enlarge = true;
2852 ret = page_counter_set_max(counter, max);
2853 mutex_unlock(&memcg_max_mutex);
2854
2855 if (!ret)
2856 break;
2857
2858 if (!drained) {
2859 drain_all_stock(memcg);
2860 drained = true;
2861 continue;
2862 }
2863
2864 if (!try_to_free_mem_cgroup_pages(memcg, 1,
2865 GFP_KERNEL, !memsw)) {
2866 ret = -EBUSY;
2867 break;
2868 }
2869 } while (true);
2870
2871 if (!ret && enlarge)
2872 memcg_oom_recover(memcg);
2873
2874 return ret;
2875}
2876
2877unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2878 gfp_t gfp_mask,
2879 unsigned long *total_scanned)
2880{
2881 unsigned long nr_reclaimed = 0;
2882 struct mem_cgroup_per_node *mz, *next_mz = NULL;
2883 unsigned long reclaimed;
2884 int loop = 0;
2885 struct mem_cgroup_tree_per_node *mctz;
2886 unsigned long excess;
2887 unsigned long nr_scanned;
2888
2889 if (order > 0)
2890 return 0;
2891
2892 mctz = soft_limit_tree_node(pgdat->node_id);

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it locklessly to prevent lock bouncing; the
	 * check is racy, but races here are harmless.
	 */
2899 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
2900 return 0;

	/*
	 * This loop can run for a while, especially if memcgs continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure.
	 */
2907 do {
2908 if (next_mz)
2909 mz = next_mz;
2910 else
2911 mz = mem_cgroup_largest_soft_limit_node(mctz);
2912 if (!mz)
2913 break;
2914
2915 nr_scanned = 0;
2916 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2917 gfp_mask, &nr_scanned);
2918 nr_reclaimed += reclaimed;
2919 *total_scanned += nr_scanned;
2920 spin_lock_irq(&mctz->lock);
2921 __mem_cgroup_remove_exceeded(mz, mctz);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup.
		 */
2927 next_mz = NULL;
2928 if (!reclaimed)
2929 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2930
2931 excess = soft_limit_excess(mz->memcg);

		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
2941 __mem_cgroup_insert_exceeded(mz, mctz, excess);
2942 spin_unlock_irq(&mctz->lock);
2943 css_put(&mz->memcg->css);
2944 loop++;

		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
2950 if (!nr_reclaimed &&
2951 (next_mz == NULL ||
2952 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2953 break;
2954 } while (!nr_reclaimed);
2955 if (next_mz)
2956 css_put(&next_mz->memcg->css);
2957 return nr_reclaimed;
2958}

/*
 * Test whether @memcg has children, dead or alive.  Note that this
 * function doesn't care whether @memcg has use_hierarchy enabled and
 * returns %true if there are child csses according to the cgroup
 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
 */
2966static inline bool memcg_has_children(struct mem_cgroup *memcg)
2967{
2968 bool ret;
2969
2970 rcu_read_lock();
2971 ret = css_next_child(NULL, &memcg->css);
2972 rcu_read_unlock();
2973 return ret;
2974}

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
2981static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2982{
2983 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
2991 while (nr_retries && page_counter_read(&memcg->memory)) {
2992 int progress;
2993
2994 if (signal_pending(current))
2995 return -EINTR;
2996
2997 progress = try_to_free_mem_cgroup_pages(memcg, 1,
2998 GFP_KERNEL, true);
2999 if (!progress) {
3000 nr_retries--;

			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
3003 }
3004
3005 }
3006
3007 return 0;
3008}
3009
3010static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3011 char *buf, size_t nbytes,
3012 loff_t off)
3013{
3014 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3015
3016 if (mem_cgroup_is_root(memcg))
3017 return -EINVAL;
3018 return mem_cgroup_force_empty(memcg) ?: nbytes;
3019}
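
/*
 * Usage note (editorial): on cgroup v1 this backs memory.force_empty, e.g.
 *
 *	echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty
 *
 * Any write triggers reclaim of every page charged to <group>; the
 * written value itself is ignored.
 */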
3020
3021static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3022 struct cftype *cft)
3023{
3024 return mem_cgroup_from_css(css)->use_hierarchy;
3025}
3026
3027static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3028 struct cftype *cft, u64 val)
3029{
3030 int retval = 0;
3031 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3032 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3033
3034 if (memcg->use_hierarchy == val)
3035 return 0;

	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, we allow value to be
	 * set if there are no children.
	 */
3045 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3046 (val == 1 || val == 0)) {
3047 if (!memcg_has_children(memcg))
3048 memcg->use_hierarchy = val;
3049 else
3050 retval = -EBUSY;
3051 } else
3052 retval = -EINVAL;
3053
3054 return retval;
3055}
3056
3057static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3058{
3059 unsigned long val;
3060
3061 if (mem_cgroup_is_root(memcg)) {
3062 val = memcg_page_state(memcg, MEMCG_CACHE) +
3063 memcg_page_state(memcg, MEMCG_RSS);
3064 if (swap)
3065 val += memcg_page_state(memcg, MEMCG_SWAP);
3066 } else {
3067 if (!swap)
3068 val = page_counter_read(&memcg->memory);
3069 else
3070 val = page_counter_read(&memcg->memsw);
3071 }
3072 return val;
3073}
3074
3075enum {
3076 RES_USAGE,
3077 RES_LIMIT,
3078 RES_MAX_USAGE,
3079 RES_FAILCNT,
3080 RES_SOFT_LIMIT,
3081};
3082
3083static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3084 struct cftype *cft)
3085{
3086 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3087 struct page_counter *counter;
3088
3089 switch (MEMFILE_TYPE(cft->private)) {
3090 case _MEM:
3091 counter = &memcg->memory;
3092 break;
3093 case _MEMSWAP:
3094 counter = &memcg->memsw;
3095 break;
3096 case _KMEM:
3097 counter = &memcg->kmem;
3098 break;
3099 case _TCP:
3100 counter = &memcg->tcpmem;
3101 break;
3102 default:
3103 BUG();
3104 }
3105
3106 switch (MEMFILE_ATTR(cft->private)) {
3107 case RES_USAGE:
3108 if (counter == &memcg->memory)
3109 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3110 if (counter == &memcg->memsw)
3111 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3112 return (u64)page_counter_read(counter) * PAGE_SIZE;
3113 case RES_LIMIT:
3114 return (u64)counter->max * PAGE_SIZE;
3115 case RES_MAX_USAGE:
3116 return (u64)counter->watermark * PAGE_SIZE;
3117 case RES_FAILCNT:
3118 return counter->failcnt;
3119 case RES_SOFT_LIMIT:
3120 return (u64)memcg->soft_limit * PAGE_SIZE;
3121 default:
3122 BUG();
3123 }
3124}
3125
3126#ifdef CONFIG_MEMCG_KMEM
3127static int memcg_online_kmem(struct mem_cgroup *memcg)
3128{
3129 int memcg_id;
3130
3131 if (cgroup_memory_nokmem)
3132 return 0;
3133
3134 BUG_ON(memcg->kmemcg_id >= 0);
3135 BUG_ON(memcg->kmem_state);
3136
3137 memcg_id = memcg_alloc_cache_id();
3138 if (memcg_id < 0)
3139 return memcg_id;
3140
3141 static_branch_inc(&memcg_kmem_enabled_key);

	/*
	 * A memory cgroup is considered kmem-online as soon as it gets
	 * kmemcg_id. Setting the id after enabling static branching will
	 * guarantee no one starts accounting before all call sites are
	 * patched.
	 */
3148 memcg->kmemcg_id = memcg_id;
3149 memcg->kmem_state = KMEM_ONLINE;
3150 INIT_LIST_HEAD(&memcg->kmem_caches);
3151
3152 return 0;
3153}
3154
3155static void memcg_offline_kmem(struct mem_cgroup *memcg)
3156{
3157 struct cgroup_subsys_state *css;
3158 struct mem_cgroup *parent, *child;
3159 int kmemcg_id;
3160
3161 if (memcg->kmem_state != KMEM_ONLINE)
3162 return;

	/*
	 * Clear the online state before clearing memcg_caches array
	 * entries. The slab_mutex in memcg_deactivate_kmem_caches()
	 * guarantees that no cache will be created for this cgroup
	 * after we are done (see memcg_create_kmem_cache()).
	 */
3169 memcg->kmem_state = KMEM_ALLOCATED;
3170
3171 memcg_deactivate_kmem_caches(memcg);
3172
3173 kmemcg_id = memcg->kmemcg_id;
3174 BUG_ON(kmemcg_id < 0);
3175
3176 parent = parent_mem_cgroup(memcg);
3177 if (!parent)
3178 parent = root_mem_cgroup;

	/*
	 * Change kmemcg_id of this cgroup and all its descendants to the
	 * parent's id, and then move all entries from this cgroup's list_lrus
	 * to ones of the parent. After we have finished, all list_lrus
	 * corresponding to this cgroup are guaranteed to remain empty. The
	 * ordering is imposed by list_lru_node->lock taken by
	 * memcg_drain_all_list_lrus().
	 */
3188 rcu_read_lock();
3189 css_for_each_descendant_pre(css, &memcg->css) {
3190 child = mem_cgroup_from_css(css);
3191 BUG_ON(child->kmemcg_id != kmemcg_id);
3192 child->kmemcg_id = parent->kmemcg_id;
3193 if (!memcg->use_hierarchy)
3194 break;
3195 }
3196 rcu_read_unlock();
3197
3198 memcg_drain_all_list_lrus(kmemcg_id, parent);
3199
3200 memcg_free_cache_id(kmemcg_id);
3201}
3202
3203static void memcg_free_kmem(struct mem_cgroup *memcg)
3204{
	/* css_alloc() failed, offlining didn't happen */
3206 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3207 memcg_offline_kmem(memcg);
3208
3209 if (memcg->kmem_state == KMEM_ALLOCATED) {
3210 memcg_destroy_kmem_caches(memcg);
3211 static_branch_dec(&memcg_kmem_enabled_key);
3212 WARN_ON(page_counter_read(&memcg->kmem));
3213 }
3214}
3215#else
3216static int memcg_online_kmem(struct mem_cgroup *memcg)
3217{
3218 return 0;
3219}
3220static void memcg_offline_kmem(struct mem_cgroup *memcg)
3221{
3222}
3223static void memcg_free_kmem(struct mem_cgroup *memcg)
3224{
3225}
3226#endif
3227
3228static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3229 unsigned long max)
3230{
3231 int ret;
3232
3233 mutex_lock(&memcg_max_mutex);
3234 ret = page_counter_set_max(&memcg->kmem, max);
3235 mutex_unlock(&memcg_max_mutex);
3236 return ret;
3237}
3238
3239static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3240{
3241 int ret;
3242
3243 mutex_lock(&memcg_max_mutex);
3244
3245 ret = page_counter_set_max(&memcg->tcpmem, max);
3246 if (ret)
3247 goto out;
3248
3249 if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update. This is what guarantees that the socket activation
		 * function is the last one to run. See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order. If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value changes, the code to process it is
		 * not patched in yet.
		 */
3266 static_branch_inc(&memcg_sockets_enabled_key);
3267 memcg->tcpmem_active = true;
3268 }
3269out:
3270 mutex_unlock(&memcg_max_mutex);
3271 return ret;
3272}

/*
 * Userspace write handler for memory.limit_in_bytes and friends: the
 * hard limits (memory, memsw, kmem, tcp) and the soft limit.
 */
3278static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3279 char *buf, size_t nbytes, loff_t off)
3280{
3281 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3282 unsigned long nr_pages;
3283 int ret;
3284
3285 buf = strstrip(buf);
3286 ret = page_counter_memparse(buf, "-1", &nr_pages);
3287 if (ret)
3288 return ret;
3289
3290 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3291 case RES_LIMIT:
3292 if (mem_cgroup_is_root(memcg)) {
3293 ret = -EINVAL;
3294 break;
3295 }
3296 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3297 case _MEM:
3298 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3299 break;
3300 case _MEMSWAP:
3301 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3302 break;
3303 case _KMEM:
3304 ret = memcg_update_kmem_max(memcg, nr_pages);
3305 break;
3306 case _TCP:
3307 ret = memcg_update_tcp_max(memcg, nr_pages);
3308 break;
3309 }
3310 break;
3311 case RES_SOFT_LIMIT:
3312 memcg->soft_limit = nr_pages;
3313 ret = 0;
3314 break;
3315 }
3316 return ret ?: nbytes;
3317}
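
/*
 * Usage note (editorial): page_counter_memparse() accepts human-readable
 * suffixes and treats "-1" as "unlimited", so on cgroup v1 both of these
 * are valid:
 *
 *	echo 512M > memory.limit_in_bytes
 *	echo -1 > memory.limit_in_bytes
 */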
3318
3319static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3320 size_t nbytes, loff_t off)
3321{
3322 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3323 struct page_counter *counter;
3324
3325 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3326 case _MEM:
3327 counter = &memcg->memory;
3328 break;
3329 case _MEMSWAP:
3330 counter = &memcg->memsw;
3331 break;
3332 case _KMEM:
3333 counter = &memcg->kmem;
3334 break;
3335 case _TCP:
3336 counter = &memcg->tcpmem;
3337 break;
3338 default:
3339 BUG();
3340 }
3341
3342 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3343 case RES_MAX_USAGE:
3344 page_counter_reset_watermark(counter);
3345 break;
3346 case RES_FAILCNT:
3347 counter->failcnt = 0;
3348 break;
3349 default:
3350 BUG();
3351 }
3352
3353 return nbytes;
3354}
3355
3356static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3357 struct cftype *cft)
3358{
3359 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3360}
3361
3362#ifdef CONFIG_MMU
3363static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3364 struct cftype *cft, u64 val)
3365{
3366 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3367
3368 if (val & ~MOVE_MASK)
3369 return -EINVAL;

	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once in the beginning of the process, and then
	 * carry on with stale data. This means that changes to this value
	 * will only affect task migrations starting after the change.
	 */
3377 memcg->move_charge_at_immigrate = val;
3378 return 0;
3379}
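
/*
 * Usage note (editorial): the value written is a bitmask of MOVE_ANON
 * (0x1) and MOVE_FILE (0x2), so on cgroup v1
 *
 *	echo 3 > memory.move_charge_at_immigrate
 *
 * moves both anonymous and file charges along with a migrating task.
 */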
3380#else
3381static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3382 struct cftype *cft, u64 val)
3383{
3384 return -ENOSYS;
3385}
3386#endif
3387
3388#ifdef CONFIG_NUMA
3389
3390#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3391#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3392#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3393
3394static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3395 int nid, unsigned int lru_mask)
3396{
3397 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
3398 unsigned long nr = 0;
3399 enum lru_list lru;
3400
3401 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3402
3403 for_each_lru(lru) {
3404 if (!(BIT(lru) & lru_mask))
3405 continue;
3406 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3407 }
3408 return nr;
3409}
3410
3411static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3412 unsigned int lru_mask)
3413{
3414 unsigned long nr = 0;
3415 enum lru_list lru;
3416
3417 for_each_lru(lru) {
3418 if (!(BIT(lru) & lru_mask))
3419 continue;
3420 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3421 }
3422 return nr;
3423}
3424
3425static int memcg_numa_stat_show(struct seq_file *m, void *v)
3426{
3427 struct numa_stat {
3428 const char *name;
3429 unsigned int lru_mask;
3430 };
3431
3432 static const struct numa_stat stats[] = {
3433 { "total", LRU_ALL },
3434 { "file", LRU_ALL_FILE },
3435 { "anon", LRU_ALL_ANON },
3436 { "unevictable", BIT(LRU_UNEVICTABLE) },
3437 };
3438 const struct numa_stat *stat;
3439 int nid;
3440 unsigned long nr;
3441 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3442
3443 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3444 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3445 seq_printf(m, "%s=%lu", stat->name, nr);
3446 for_each_node_state(nid, N_MEMORY) {
3447 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3448 stat->lru_mask);
3449 seq_printf(m, " N%d=%lu", nid, nr);
3450 }
3451 seq_putc(m, '\n');
3452 }
3453
3454 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3455 struct mem_cgroup *iter;
3456
3457 nr = 0;
3458 for_each_mem_cgroup_tree(iter, memcg)
3459 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3460 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3461 for_each_node_state(nid, N_MEMORY) {
3462 nr = 0;
3463 for_each_mem_cgroup_tree(iter, memcg)
3464 nr += mem_cgroup_node_nr_lru_pages(
3465 iter, nid, stat->lru_mask);
3466 seq_printf(m, " N%d=%lu", nid, nr);
3467 }
3468 seq_putc(m, '\n');
3469 }
3470
3471 return 0;
3472}
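
/*
 * Output format (editorial note): one line per category, a cgroup-wide
 * total followed by per-node counts, then the same with a hierarchical_
 * prefix for the subtree totals, e.g.
 *
 *	total=365 N0=256 N1=109
 *	file=205 N0=123 N1=82
 *	...
 *	hierarchical_total=420 N0=280 N1=140
 */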
3473#endif

/* Universal VM events cgroup1 shows, original sort order */
3476static const unsigned int memcg1_events[] = {
3477 PGPGIN,
3478 PGPGOUT,
3479 PGFAULT,
3480 PGMAJFAULT,
3481};
3482
3483static const char *const memcg1_event_names[] = {
3484 "pgpgin",
3485 "pgpgout",
3486 "pgfault",
3487 "pgmajfault",
3488};
3489
3490static int memcg_stat_show(struct seq_file *m, void *v)
3491{
3492 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3493 unsigned long memory, memsw;
3494 struct mem_cgroup *mi;
3495 unsigned int i;
3496
3497 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3498 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3499
3500 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3501 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3502 continue;
3503 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3504 memcg_page_state_local(memcg, memcg1_stats[i]) *
3505 PAGE_SIZE);
3506 }
3507
3508 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3509 seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3510 memcg_events_local(memcg, memcg1_events[i]));
3511
3512 for (i = 0; i < NR_LRU_LISTS; i++)
3513 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3514 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3515 PAGE_SIZE);

	/* Hierarchical information */
3518 memory = memsw = PAGE_COUNTER_MAX;
3519 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3520 memory = min(memory, mi->memory.max);
3521 memsw = min(memsw, mi->memsw.max);
3522 }
3523 seq_printf(m, "hierarchical_memory_limit %llu\n",
3524 (u64)memory * PAGE_SIZE);
3525 if (do_memsw_account())
3526 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3527 (u64)memsw * PAGE_SIZE);
3528
3529 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3530 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3531 continue;
3532 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
			   (u64)memcg_page_state(memcg, memcg1_stats[i]) *
			   PAGE_SIZE);
3534 }
3535
3536 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3537 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
			   (u64)memcg_events(memcg, memcg1_events[i]));
3539
3540 for (i = 0; i < NR_LRU_LISTS; i++)
3541 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3542 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3543 PAGE_SIZE);
3544
3545#ifdef CONFIG_DEBUG_VM
3546 {
3547 pg_data_t *pgdat;
3548 struct mem_cgroup_per_node *mz;
3549 struct zone_reclaim_stat *rstat;
3550 unsigned long recent_rotated[2] = {0, 0};
3551 unsigned long recent_scanned[2] = {0, 0};
3552
3553 for_each_online_pgdat(pgdat) {
3554 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3555 rstat = &mz->lruvec.reclaim_stat;
3556
3557 recent_rotated[0] += rstat->recent_rotated[0];
3558 recent_rotated[1] += rstat->recent_rotated[1];
3559 recent_scanned[0] += rstat->recent_scanned[0];
3560 recent_scanned[1] += rstat->recent_scanned[1];
3561 }
3562 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3563 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3564 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3565 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3566 }
3567#endif
3568
3569 return 0;
3570}
3571
3572static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3573 struct cftype *cft)
3574{
3575 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3576
3577 return mem_cgroup_swappiness(memcg);
3578}
3579
3580static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3581 struct cftype *cft, u64 val)
3582{
3583 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3584
3585 if (val > 100)
3586 return -EINVAL;
3587
3588 if (css->parent)
3589 memcg->swappiness = val;
3590 else
3591 vm_swappiness = val;
3592
3593 return 0;
3594}
3595
3596static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3597{
3598 struct mem_cgroup_threshold_ary *t;
3599 unsigned long usage;
3600 int i;
3601
3602 rcu_read_lock();
3603 if (!swap)
3604 t = rcu_dereference(memcg->thresholds.primary);
3605 else
3606 t = rcu_dereference(memcg->memsw_thresholds.primary);
3607
3608 if (!t)
3609 goto unlock;
3610
3611 usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to threshold just below or equal to usage.
	 * If it's not true, a threshold was crossed after last
	 * call of __mem_cgroup_threshold().
	 */
3618 i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
3626 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3627 eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
3630 i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
3638 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3639 eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
3642 t->current_threshold = i - 1;
3643unlock:
3644 rcu_read_unlock();
3645}
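
/*
 * Worked example (editorial): with thresholds {4M, 8M, 16M} and
 * current_threshold at 8M, usage growing past 16M signals only the 16M
 * eventfd (forward scan) and leaves current_threshold at 16M; usage
 * falling below 4M signals 8M and 4M (backward scan) and leaves
 * current_threshold at -1, i.e. no threshold at or below usage.
 */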
3646
3647static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3648{
3649 while (memcg) {
3650 __mem_cgroup_threshold(memcg, false);
3651 if (do_memsw_account())
3652 __mem_cgroup_threshold(memcg, true);
3653
3654 memcg = parent_mem_cgroup(memcg);
3655 }
3656}
3657
3658static int compare_thresholds(const void *a, const void *b)
3659{
3660 const struct mem_cgroup_threshold *_a = a;
3661 const struct mem_cgroup_threshold *_b = b;
3662
3663 if (_a->threshold > _b->threshold)
3664 return 1;
3665
3666 if (_a->threshold < _b->threshold)
3667 return -1;
3668
3669 return 0;
3670}
3671
3672static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3673{
3674 struct mem_cgroup_eventfd_list *ev;
3675
3676 spin_lock(&memcg_oom_lock);
3677
3678 list_for_each_entry(ev, &memcg->oom_notify, list)
3679 eventfd_signal(ev->eventfd, 1);
3680
3681 spin_unlock(&memcg_oom_lock);
3682 return 0;
3683}
3684
3685static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3686{
3687 struct mem_cgroup *iter;
3688
3689 for_each_mem_cgroup_tree(iter, memcg)
3690 mem_cgroup_oom_notify_cb(iter);
3691}
3692
3693static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3694 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3695{
3696 struct mem_cgroup_thresholds *thresholds;
3697 struct mem_cgroup_threshold_ary *new;
3698 unsigned long threshold;
3699 unsigned long usage;
3700 int i, size, ret;
3701
3702 ret = page_counter_memparse(args, "-1", &threshold);
3703 if (ret)
3704 return ret;
3705
3706 mutex_lock(&memcg->thresholds_lock);
3707
3708 if (type == _MEM) {
3709 thresholds = &memcg->thresholds;
3710 usage = mem_cgroup_usage(memcg, false);
3711 } else if (type == _MEMSWAP) {
3712 thresholds = &memcg->memsw_thresholds;
3713 usage = mem_cgroup_usage(memcg, true);
3714 } else
3715 BUG();

	/* Check if a threshold crossed before adding a new one */
3718 if (thresholds->primary)
3719 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3720
3721 size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
3724 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
3725 if (!new) {
3726 ret = -ENOMEM;
3727 goto unlock;
3728 }
3729 new->size = size;

	/* Copy thresholds (if any) to new array */
3732 if (thresholds->primary) {
3733 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3734 sizeof(struct mem_cgroup_threshold));
3735 }

	/* Add new threshold */
3738 new->entries[size - 1].eventfd = eventfd;
3739 new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
3742 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3743 compare_thresholds, NULL);

	/* Find current threshold */
3746 new->current_threshold = -1;
3747 for (i = 0; i < size; i++) {
3748 if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
3754 ++new->current_threshold;
3755 } else
3756 break;
3757 }

	/* Free old spare buffer and save old primary buffer as spare */
3760 kfree(thresholds->spare);
3761 thresholds->spare = thresholds->primary;
3762
3763 rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
3766 synchronize_rcu();
3767
3768unlock:
3769 mutex_unlock(&memcg->thresholds_lock);
3770
3771 return ret;
3772}
3773
3774static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3775 struct eventfd_ctx *eventfd, const char *args)
3776{
3777 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3778}
3779
3780static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3781 struct eventfd_ctx *eventfd, const char *args)
3782{
3783 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3784}
3785
3786static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3787 struct eventfd_ctx *eventfd, enum res_type type)
3788{
3789 struct mem_cgroup_thresholds *thresholds;
3790 struct mem_cgroup_threshold_ary *new;
3791 unsigned long usage;
3792 int i, j, size;
3793
3794 mutex_lock(&memcg->thresholds_lock);
3795
3796 if (type == _MEM) {
3797 thresholds = &memcg->thresholds;
3798 usage = mem_cgroup_usage(memcg, false);
3799 } else if (type == _MEMSWAP) {
3800 thresholds = &memcg->memsw_thresholds;
3801 usage = mem_cgroup_usage(memcg, true);
3802 } else
3803 BUG();
3804
3805 if (!thresholds->primary)
3806 goto unlock;

	/* Check if a threshold crossed before removing */
3809 __mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of threshold */
3812 size = 0;
3813 for (i = 0; i < thresholds->primary->size; i++) {
3814 if (thresholds->primary->entries[i].eventfd != eventfd)
3815 size++;
3816 }
3817
3818 new = thresholds->spare;

	/* Set thresholds array to NULL if we don't have thresholds */
3821 if (!size) {
3822 kfree(new);
3823 new = NULL;
3824 goto swap_buffers;
3825 }
3826
3827 new->size = size;

	/* Copy thresholds and find current threshold */
3830 new->current_threshold = -1;
3831 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3832 if (thresholds->primary->entries[i].eventfd == eventfd)
3833 continue;
3834
3835 new->entries[j] = thresholds->primary->entries[i];
3836 if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
3842 ++new->current_threshold;
3843 }
3844 j++;
3845 }
3846
3847swap_buffers:
	/* Swap primary and spare array */
3849 thresholds->spare = thresholds->primary;
3850
3851 rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
3854 synchronize_rcu();

	/* If all events are unregistered, free the spare array */
3857 if (!new) {
3858 kfree(thresholds->spare);
3859 thresholds->spare = NULL;
3860 }
3861unlock:
3862 mutex_unlock(&memcg->thresholds_lock);
3863}
3864
3865static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3866 struct eventfd_ctx *eventfd)
3867{
3868 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3869}
3870
3871static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3872 struct eventfd_ctx *eventfd)
3873{
3874 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3875}
3876
3877static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3878 struct eventfd_ctx *eventfd, const char *args)
3879{
3880 struct mem_cgroup_eventfd_list *event;
3881
3882 event = kmalloc(sizeof(*event), GFP_KERNEL);
3883 if (!event)
3884 return -ENOMEM;
3885
3886 spin_lock(&memcg_oom_lock);
3887
3888 event->eventfd = eventfd;
3889 list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
3892 if (memcg->under_oom)
3893 eventfd_signal(eventfd, 1);
3894 spin_unlock(&memcg_oom_lock);
3895
3896 return 0;
3897}
3898
3899static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3900 struct eventfd_ctx *eventfd)
3901{
3902 struct mem_cgroup_eventfd_list *ev, *tmp;
3903
3904 spin_lock(&memcg_oom_lock);
3905
3906 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3907 if (ev->eventfd == eventfd) {
3908 list_del(&ev->list);
3909 kfree(ev);
3910 }
3911 }
3912
3913 spin_unlock(&memcg_oom_lock);
3914}
3915
3916static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3917{
3918 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
3919
3920 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3921 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3922 seq_printf(sf, "oom_kill %lu\n",
3923 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
3924 return 0;
3925}
3926
3927static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3928 struct cftype *cft, u64 val)
3929{
3930 struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/* cannot set to root cgroup and only 0 and 1 are allowed */
3933 if (!css->parent || !((val == 0) || (val == 1)))
3934 return -EINVAL;
3935
3936 memcg->oom_kill_disable = val;
3937 if (!val)
3938 memcg_oom_recover(memcg);
3939
3940 return 0;
3941}
3942
3943#ifdef CONFIG_CGROUP_WRITEBACK
3944
3945static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3946{
3947 return wb_domain_init(&memcg->cgwb_domain, gfp);
3948}
3949
3950static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3951{
3952 wb_domain_exit(&memcg->cgwb_domain);
3953}
3954
3955static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3956{
3957 wb_domain_size_changed(&memcg->cgwb_domain);
3958}
3959
3960struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3961{
3962 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3963
3964 if (!memcg->css.parent)
3965 return NULL;
3966
3967 return &memcg->cgwb_domain;
3968}

/*
 * Return the exact value of a memcg stat: the atomic counter plus the
 * not-yet-flushed per-cpu deltas, clamped at zero.  idx can be of type
 * enum memcg_stat_item or node_stat_item.
 */
3974static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
3975{
3976 long x = atomic_long_read(&memcg->vmstats[idx]);
3977 int cpu;
3978
3979 for_each_online_cpu(cpu)
3980 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
3981 if (x < 0)
3982 x = 0;
3983 return x;
3984}

/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors.  Note that this doesn't consider the actual amount of
 * available memory in the system.  The caller should further cap
 * *@pheadroom accordingly.
 */
4004void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4005 unsigned long *pheadroom, unsigned long *pdirty,
4006 unsigned long *pwriteback)
4007{
4008 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4009 struct mem_cgroup *parent;
4010
4011 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);

	/* this should eventually include NR_UNSTABLE_NFS */
4014 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4015 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4016 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4017 *pheadroom = PAGE_COUNTER_MAX;
4018
4019 while ((parent = parent_mem_cgroup(memcg))) {
4020 unsigned long ceiling = min(memcg->memory.max, memcg->high);
4021 unsigned long used = page_counter_read(&memcg->memory);
4022
4023 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4024 memcg = parent;
4025 }
4026}
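
/*
 * Worked example (editorial): a memcg with max = 1G, high = 512M and
 * 300M used contributes min(1G, 512M) - 300M = 212M of headroom; if an
 * ancestor only has 100M of headroom, *pheadroom ends up as 100M, since
 * the loop takes the minimum over the whole ancestry.
 */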
4027
4028#else
4029
4030static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4031{
4032 return 0;
4033}
4034
4035static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4036{
4037}
4038
4039static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4040{
4041}
4042
4043#endif

/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler.  Scheduling
 * for file deletion.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
4063static void memcg_event_remove(struct work_struct *work)
4064{
4065 struct mem_cgroup_event *event =
4066 container_of(work, struct mem_cgroup_event, remove);
4067 struct mem_cgroup *memcg = event->memcg;
4068
4069 remove_wait_queue(event->wqh, &event->wait);
4070
4071 event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
4074 eventfd_signal(event->eventfd, 1);
4075
4076 eventfd_ctx_put(event->eventfd);
4077 kfree(event);
4078 css_put(&memcg->css);
4079}

/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
4086static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4087 int sync, void *key)
4088{
4089 struct mem_cgroup_event *event =
4090 container_of(wait, struct mem_cgroup_event, wait);
4091 struct mem_cgroup *memcg = event->memcg;
4092 __poll_t flags = key_to_poll(key);
4093
4094 if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
4104 spin_lock(&memcg->event_list_lock);
4105 if (!list_empty(&event->list)) {
4106 list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
4111 schedule_work(&event->remove);
4112 }
4113 spin_unlock(&memcg->event_list_lock);
4114 }
4115
4116 return 0;
4117}
4118
4119static void memcg_event_ptable_queue_proc(struct file *file,
4120 wait_queue_head_t *wqh, poll_table *pt)
4121{
4122 struct mem_cgroup_event *event =
4123 container_of(pt, struct mem_cgroup_event, pt);
4124
4125 event->wqh = wqh;
4126 add_wait_queue(wqh, &event->wait);
4127}

/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
4137static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4138 char *buf, size_t nbytes, loff_t off)
4139{
4140 struct cgroup_subsys_state *css = of_css(of);
4141 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4142 struct mem_cgroup_event *event;
4143 struct cgroup_subsys_state *cfile_css;
4144 unsigned int efd, cfd;
4145 struct fd efile;
4146 struct fd cfile;
4147 const char *name;
4148 char *endp;
4149 int ret;
4150
4151 buf = strstrip(buf);
4152
4153 efd = simple_strtoul(buf, &endp, 10);
4154 if (*endp != ' ')
4155 return -EINVAL;
4156 buf = endp + 1;
4157
4158 cfd = simple_strtoul(buf, &endp, 10);
4159 if ((*endp != ' ') && (*endp != '\0'))
4160 return -EINVAL;
4161 buf = endp + 1;
4162
4163 event = kzalloc(sizeof(*event), GFP_KERNEL);
4164 if (!event)
4165 return -ENOMEM;
4166
4167 event->memcg = memcg;
4168 INIT_LIST_HEAD(&event->list);
4169 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4170 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4171 INIT_WORK(&event->remove, memcg_event_remove);
4172
4173 efile = fdget(efd);
4174 if (!efile.file) {
4175 ret = -EBADF;
4176 goto out_kfree;
4177 }
4178
4179 event->eventfd = eventfd_ctx_fileget(efile.file);
4180 if (IS_ERR(event->eventfd)) {
4181 ret = PTR_ERR(event->eventfd);
4182 goto out_put_efile;
4183 }
4184
4185 cfile = fdget(cfd);
4186 if (!cfile.file) {
4187 ret = -EBADF;
4188 goto out_put_eventfd;
4189 }

	/* the process needs read permission on the control file */
4193 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4194 if (ret < 0)
4195 goto out_put_cfile;

	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
4205 name = cfile.file->f_path.dentry->d_name.name;
4206
4207 if (!strcmp(name, "memory.usage_in_bytes")) {
4208 event->register_event = mem_cgroup_usage_register_event;
4209 event->unregister_event = mem_cgroup_usage_unregister_event;
4210 } else if (!strcmp(name, "memory.oom_control")) {
4211 event->register_event = mem_cgroup_oom_register_event;
4212 event->unregister_event = mem_cgroup_oom_unregister_event;
4213 } else if (!strcmp(name, "memory.pressure_level")) {
4214 event->register_event = vmpressure_register_event;
4215 event->unregister_event = vmpressure_unregister_event;
4216 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4217 event->register_event = memsw_cgroup_usage_register_event;
4218 event->unregister_event = memsw_cgroup_usage_unregister_event;
4219 } else {
4220 ret = -EINVAL;
4221 goto out_put_cfile;
4222 }

	/*
	 * Verify @cfile should belong to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
4229 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4230 &memory_cgrp_subsys);
4231 ret = -EINVAL;
4232 if (IS_ERR(cfile_css))
4233 goto out_put_cfile;
4234 if (cfile_css != css) {
4235 css_put(cfile_css);
4236 goto out_put_cfile;
4237 }
4238
4239 ret = event->register_event(memcg, event->eventfd, buf);
4240 if (ret)
4241 goto out_put_css;
4242
4243 vfs_poll(efile.file, &event->pt);
4244
4245 spin_lock(&memcg->event_list_lock);
4246 list_add(&event->list, &memcg->event_list);
4247 spin_unlock(&memcg->event_list_lock);
4248
4249 fdput(cfile);
4250 fdput(efile);
4251
4252 return nbytes;
4253
4254out_put_css:
4255 css_put(css);
4256out_put_cfile:
4257 fdput(cfile);
4258out_put_eventfd:
4259 eventfd_ctx_put(event->eventfd);
4260out_put_efile:
4261 fdput(efile);
4262out_kfree:
4263 kfree(event);
4264
4265 return ret;
4266}
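
/*
 * Usage note (editorial sketch, error handling omitted): a cgroup v1
 * memory threshold is armed from userspace roughly like this:
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("<group>/memory.usage_in_bytes", O_RDONLY);
 *	int ctl = open("<group>/cgroup.event_control", O_WRONLY);
 *	dprintf(ctl, "%d %d 64M", efd, cfd);
 *	read(efd, &counter, 8);
 *
 * The final read() blocks until usage crosses the 64M threshold.
 */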
4267
4268static struct cftype mem_cgroup_legacy_files[] = {
4269 {
4270 .name = "usage_in_bytes",
4271 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4272 .read_u64 = mem_cgroup_read_u64,
4273 },
4274 {
4275 .name = "max_usage_in_bytes",
4276 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4277 .write = mem_cgroup_reset,
4278 .read_u64 = mem_cgroup_read_u64,
4279 },
4280 {
4281 .name = "limit_in_bytes",
4282 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4283 .write = mem_cgroup_write,
4284 .read_u64 = mem_cgroup_read_u64,
4285 },
4286 {
4287 .name = "soft_limit_in_bytes",
4288 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4289 .write = mem_cgroup_write,
4290 .read_u64 = mem_cgroup_read_u64,
4291 },
4292 {
4293 .name = "failcnt",
4294 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4295 .write = mem_cgroup_reset,
4296 .read_u64 = mem_cgroup_read_u64,
4297 },
4298 {
4299 .name = "stat",
4300 .seq_show = memcg_stat_show,
4301 },
4302 {
4303 .name = "force_empty",
4304 .write = mem_cgroup_force_empty_write,
4305 },
4306 {
4307 .name = "use_hierarchy",
4308 .write_u64 = mem_cgroup_hierarchy_write,
4309 .read_u64 = mem_cgroup_hierarchy_read,
4310 },
4311 {
4312 .name = "cgroup.event_control",
4313 .write = memcg_write_event_control,
4314 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4315 },
4316 {
4317 .name = "swappiness",
4318 .read_u64 = mem_cgroup_swappiness_read,
4319 .write_u64 = mem_cgroup_swappiness_write,
4320 },
4321 {
4322 .name = "move_charge_at_immigrate",
4323 .read_u64 = mem_cgroup_move_charge_read,
4324 .write_u64 = mem_cgroup_move_charge_write,
4325 },
4326 {
4327 .name = "oom_control",
4328 .seq_show = mem_cgroup_oom_control_read,
4329 .write_u64 = mem_cgroup_oom_control_write,
4330 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4331 },
4332 {
4333 .name = "pressure_level",
4334 },
4335#ifdef CONFIG_NUMA
4336 {
4337 .name = "numa_stat",
4338 .seq_show = memcg_numa_stat_show,
4339 },
4340#endif
4341 {
4342 .name = "kmem.limit_in_bytes",
4343 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4344 .write = mem_cgroup_write,
4345 .read_u64 = mem_cgroup_read_u64,
4346 },
4347 {
4348 .name = "kmem.usage_in_bytes",
4349 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4350 .read_u64 = mem_cgroup_read_u64,
4351 },
4352 {
4353 .name = "kmem.failcnt",
4354 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4355 .write = mem_cgroup_reset,
4356 .read_u64 = mem_cgroup_read_u64,
4357 },
4358 {
4359 .name = "kmem.max_usage_in_bytes",
4360 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4361 .write = mem_cgroup_reset,
4362 .read_u64 = mem_cgroup_read_u64,
4363 },
4364#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4365 {
4366 .name = "kmem.slabinfo",
4367 .seq_start = memcg_slab_start,
4368 .seq_next = memcg_slab_next,
4369 .seq_stop = memcg_slab_stop,
4370 .seq_show = memcg_slab_show,
4371 },
4372#endif
4373 {
4374 .name = "kmem.tcp.limit_in_bytes",
4375 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4376 .write = mem_cgroup_write,
4377 .read_u64 = mem_cgroup_read_u64,
4378 },
4379 {
4380 .name = "kmem.tcp.usage_in_bytes",
4381 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4382 .read_u64 = mem_cgroup_read_u64,
4383 },
4384 {
4385 .name = "kmem.tcp.failcnt",
4386 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4387 .write = mem_cgroup_reset,
4388 .read_u64 = mem_cgroup_read_u64,
4389 },
4390 {
4391 .name = "kmem.tcp.max_usage_in_bytes",
4392 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4393 .write = mem_cgroup_reset,
4394 .read_u64 = mem_cgroup_read_u64,
4395 },
4396 { },
4397};

/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bit (MEM_CGROUP_ID_SHIFT), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID. We want to keep
 * those dead CSS from occupying IDs, or we might quickly exhaust the
 * limited ID space and prevent successful charging of new cgroups.
 *
 * Instead of hanging on to the ID until the CSS is released, we keep
 * our own lightweight reference count on the ID, which is dropped on
 * cgroup offlining. Outstanding swap records and shadow entries take
 * additional references on the ID, which are dropped as those users
 * go away; only then is the ID recycled.
 */
4423static DEFINE_IDR(mem_cgroup_idr);
4424
4425static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4426{
4427 if (memcg->id.id > 0) {
4428 idr_remove(&mem_cgroup_idr, memcg->id.id);
4429 memcg->id.id = 0;
4430 }
4431}
4432
4433static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4434{
4435 refcount_add(n, &memcg->id.ref);
4436}
4437
4438static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4439{
4440 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4441 mem_cgroup_id_remove(memcg);

		/* Memcg ID pins CSS */
4444 css_put(&memcg->css);
4445 }
4446}
4447
4448static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4449{
4450 mem_cgroup_id_get_many(memcg, 1);
4451}
4452
4453static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4454{
4455 mem_cgroup_id_put_many(memcg, 1);
4456}

/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock().
 */
4464struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4465{
4466 WARN_ON_ONCE(!rcu_read_lock_held());
4467 return idr_find(&mem_cgroup_idr, id);
4468}
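
/*
 * Editorial example (a minimal sketch): callers bracket the lookup in an
 * RCU read-side section and, if the memcg is used past it, must pin the
 * css themselves:
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();
 */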
4469
4470static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4471{
4472 struct mem_cgroup_per_node *pn;
4473 int tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
4482 if (!node_state(node, N_NORMAL_MEMORY))
4483 tmp = -1;
4484 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4485 if (!pn)
4486 return 1;
4487
4488 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4489 if (!pn->lruvec_stat_local) {
4490 kfree(pn);
4491 return 1;
4492 }
4493
4494 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4495 if (!pn->lruvec_stat_cpu) {
4496 free_percpu(pn->lruvec_stat_local);
4497 kfree(pn);
4498 return 1;
4499 }
4500
4501 lruvec_init(&pn->lruvec);
4502 pn->usage_in_excess = 0;
4503 pn->on_tree = false;
4504 pn->memcg = memcg;
4505
4506 memcg->nodeinfo[node] = pn;
4507 return 0;
4508}
4509
4510static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4511{
4512 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4513
4514 if (!pn)
4515 return;
4516
4517 free_percpu(pn->lruvec_stat_cpu);
4518 free_percpu(pn->lruvec_stat_local);
4519 kfree(pn);
4520}
4521
4522static void __mem_cgroup_free(struct mem_cgroup *memcg)
4523{
4524 int node;
4525
4526 for_each_node(node)
4527 free_mem_cgroup_per_node_info(memcg, node);
4528 free_percpu(memcg->vmstats_percpu);
4529 free_percpu(memcg->vmstats_local);
4530 kfree(memcg);
4531}
4532
4533static void mem_cgroup_free(struct mem_cgroup *memcg)
4534{
4535 memcg_wb_domain_exit(memcg);
4536 __mem_cgroup_free(memcg);
4537}
4538
4539static struct mem_cgroup *mem_cgroup_alloc(void)
4540{
4541 struct mem_cgroup *memcg;
4542 unsigned int size;
4543 int node;
4544
4545 size = sizeof(struct mem_cgroup);
4546 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4547
4548 memcg = kzalloc(size, GFP_KERNEL);
4549 if (!memcg)
4550 return NULL;
4551
4552 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4553 1, MEM_CGROUP_ID_MAX,
4554 GFP_KERNEL);
4555 if (memcg->id.id < 0)
4556 goto fail;
4557
4558 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
4559 if (!memcg->vmstats_local)
4560 goto fail;
4561
4562 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
4563 if (!memcg->vmstats_percpu)
4564 goto fail;
4565
4566 for_each_node(node)
4567 if (alloc_mem_cgroup_per_node_info(memcg, node))
4568 goto fail;
4569
4570 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4571 goto fail;
4572
4573 INIT_WORK(&memcg->high_work, high_work_func);
4574 memcg->last_scanned_node = MAX_NUMNODES;
4575 INIT_LIST_HEAD(&memcg->oom_notify);
4576 mutex_init(&memcg->thresholds_lock);
4577 spin_lock_init(&memcg->move_lock);
4578 vmpressure_init(&memcg->vmpressure);
4579 INIT_LIST_HEAD(&memcg->event_list);
4580 spin_lock_init(&memcg->event_list_lock);
4581 memcg->socket_pressure = jiffies;
4582#ifdef CONFIG_MEMCG_KMEM
4583 memcg->kmemcg_id = -1;
4584#endif
4585#ifdef CONFIG_CGROUP_WRITEBACK
4586 INIT_LIST_HEAD(&memcg->cgwb_list);
4587#endif
4588 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4589 return memcg;
4590fail:
4591 mem_cgroup_id_remove(memcg);
4592 __mem_cgroup_free(memcg);
4593 return NULL;
4594}
4595
4596static struct cgroup_subsys_state * __ref
4597mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4598{
4599 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4600 struct mem_cgroup *memcg;
4601 long error = -ENOMEM;
4602
4603 memcg = mem_cgroup_alloc();
4604 if (!memcg)
4605 return ERR_PTR(error);
4606
4607 memcg->high = PAGE_COUNTER_MAX;
4608 memcg->soft_limit = PAGE_COUNTER_MAX;
4609 if (parent) {
4610 memcg->swappiness = mem_cgroup_swappiness(parent);
4611 memcg->oom_kill_disable = parent->oom_kill_disable;
4612 }
4613 if (parent && parent->use_hierarchy) {
4614 memcg->use_hierarchy = true;
4615 page_counter_init(&memcg->memory, &parent->memory);
4616 page_counter_init(&memcg->swap, &parent->swap);
4617 page_counter_init(&memcg->memsw, &parent->memsw);
4618 page_counter_init(&memcg->kmem, &parent->kmem);
4619 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4620 } else {
4621 page_counter_init(&memcg->memory, NULL);
4622 page_counter_init(&memcg->swap, NULL);
4623 page_counter_init(&memcg->memsw, NULL);
4624 page_counter_init(&memcg->kmem, NULL);
4625 page_counter_init(&memcg->tcpmem, NULL);

		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense so let cgroup subsystem know about this
		 * unexpected state.
		 */
4631 if (parent != root_mem_cgroup)
4632 memory_cgrp_subsys.broken_hierarchy = true;
4633 }

	/* The following stuff does not apply to the root */
4636 if (!parent) {
4637 root_mem_cgroup = memcg;
4638 return &memcg->css;
4639 }
4640
4641 error = memcg_online_kmem(memcg);
4642 if (error)
4643 goto fail;
4644
4645 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4646 static_branch_inc(&memcg_sockets_enabled_key);
4647
4648 return &memcg->css;
4649fail:
4650 mem_cgroup_id_remove(memcg);
4651 mem_cgroup_free(memcg);
4652 return ERR_PTR(-ENOMEM);
4653}
4654
4655static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4656{
4657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/*
	 * The shrinker maps are allocated only now, when the memcg is
	 * already visible in the memcg idr, so that concurrent map
	 * expansion can see it.
	 */
4664 if (memcg_alloc_shrinker_maps(memcg)) {
4665 mem_cgroup_id_remove(memcg);
4666 return -ENOMEM;
4667 }

	/* Online state pins memcg ID, memcg ID pins CSS */
4670 refcount_set(&memcg->id.ref, 1);
4671 css_get(css);
4672 return 0;
4673}
4674
4675static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4676{
4677 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4678 struct mem_cgroup_event *event, *tmp;

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
4685 spin_lock(&memcg->event_list_lock);
4686 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4687 list_del_init(&event->list);
4688 schedule_work(&event->remove);
4689 }
4690 spin_unlock(&memcg->event_list_lock);
4691
4692 page_counter_set_min(&memcg->memory, 0);
4693 page_counter_set_low(&memcg->memory, 0);
4694
4695 memcg_offline_kmem(memcg);
4696 wb_memcg_offline(memcg);
4697
4698 drain_all_stock(memcg);
4699
4700 mem_cgroup_id_put(memcg);
4701}
4702
4703static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4704{
4705 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4706
4707 invalidate_reclaim_iterators(memcg);
4708}
4709
4710static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4711{
4712 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4713
4714 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4715 static_branch_dec(&memcg_sockets_enabled_key);
4716
4717 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4718 static_branch_dec(&memcg_sockets_enabled_key);
4719
4720 vmpressure_cleanup(&memcg->vmpressure);
4721 cancel_work_sync(&memcg->high_work);
4722 mem_cgroup_remove_from_trees(memcg);
4723 memcg_free_shrinker_maps(memcg);
4724 memcg_free_kmem(memcg);
4725 mem_cgroup_free(memcg);
4726}

/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
4741static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4742{
4743 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4744
4745 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
4746 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4747 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
4748 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
4749 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
4750 page_counter_set_min(&memcg->memory, 0);
4751 page_counter_set_low(&memcg->memory, 0);
4752 memcg->high = PAGE_COUNTER_MAX;
4753 memcg->soft_limit = PAGE_COUNTER_MAX;
4754 memcg_wb_domain_size_changed(memcg);
4755}
4756
4757#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
4759static int mem_cgroup_do_precharge(unsigned long count)
4760{
4761 int ret;

	/* Try a single bulk charge without reclaim first, kswapd may wake */
4764 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4765 if (!ret) {
4766 mc.precharge += count;
4767 return ret;
4768 }

	/* Try charges one by one with reclaim, but do not retry */
4771 while (count--) {
4772 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
4773 if (ret)
4774 return ret;
4775 mc.precharge++;
4776 cond_resched();
4777 }
4778 return 0;
4779}
4780
4781union mc_target {
4782 struct page *page;
4783 swp_entry_t ent;
4784};
4785
4786enum mc_target_type {
4787 MC_TARGET_NONE = 0,
4788 MC_TARGET_PAGE,
4789 MC_TARGET_SWAP,
4790 MC_TARGET_DEVICE,
4791};
4792
4793static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4794 unsigned long addr, pte_t ptent)
4795{
4796 struct page *page = _vm_normal_page(vma, addr, ptent, true);
4797
4798 if (!page || !page_mapped(page))
4799 return NULL;
4800 if (PageAnon(page)) {
4801 if (!(mc.flags & MOVE_ANON))
4802 return NULL;
4803 } else {
4804 if (!(mc.flags & MOVE_FILE))
4805 return NULL;
4806 }
4807 if (!get_page_unless_zero(page))
4808 return NULL;
4809
4810 return page;
4811}
4812
4813#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
4814static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4815 pte_t ptent, swp_entry_t *entry)
4816{
4817 struct page *page = NULL;
4818 swp_entry_t ent = pte_to_swp_entry(ptent);
4819
4820 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))

	/*
	 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE pages belonging
	 * to a device and not accessible to the CPU; they are stored as
	 * special swap entries in the pte.
	 */
4828 if (is_device_private_entry(ent)) {
4829 page = device_private_entry_to_page(ent);
		/*
		 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
		 * a refcount of 1 when free (unlike a normal page).
		 */
4834 if (!page_ref_add_unless(page, 1, 1))
4835 return NULL;
4836 return page;
4837 }

	/*
	 * Because lookup_swap_cache() updates some statistics counter,
	 * we call find_get_page() with swapper_space directly.
	 */
4843 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4844 if (do_memsw_account())
4845 entry->val = ent.val;
4846
4847 return page;
4848}
4849#else
4850static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4851 pte_t ptent, swp_entry_t *entry)
4852{
4853 return NULL;
4854}
4855#endif
4856
4857static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4858 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4859{
4860 struct page *page = NULL;
4861 struct address_space *mapping;
4862 pgoff_t pgoff;
4863
4864 if (!vma->vm_file)
4865 return NULL;
4866 if (!(mc.flags & MOVE_FILE))
4867 return NULL;
4868
4869 mapping = vma->vm_file->f_mapping;
4870 pgoff = linear_page_index(vma, addr);

	/* page is moved even if it's not RSS of this task(page-faulted). */
4873#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
4875 if (shmem_mapping(mapping)) {
4876 page = find_get_entry(mapping, pgoff);
4877 if (xa_is_value(page)) {
4878 swp_entry_t swp = radix_to_swp_entry(page);
4879 if (do_memsw_account())
4880 *entry = swp;
4881 page = find_get_page(swap_address_space(swp),
4882 swp_offset(swp));
4883 }
4884 } else
4885 page = find_get_page(mapping, pgoff);
4886#else
4887 page = find_get_page(mapping, pgoff);
4888#endif
4889 return page;
4890}

/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on LRU (isolate_lru_page()
 * is useful for that) and must hold a reference on it.
 *
 * This function doesn't do "charge" to the new cgroup and doesn't modify
 * "charge" from the old cgroup.
 */
4904static int mem_cgroup_move_account(struct page *page,
4905 bool compound,
4906 struct mem_cgroup *from,
4907 struct mem_cgroup *to)
4908{
4909 unsigned long flags;
4910 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4911 int ret;
4912 bool anon;
4913
4914 VM_BUG_ON(from == to);
4915 VM_BUG_ON_PAGE(PageLRU(page), page);
4916 VM_BUG_ON(compound && !PageTransHuge(page));

	/*
	 * Prevent mem_cgroup_migrate() from looking at
	 * page->mem_cgroup of its source page while we change it.
	 */
4922 ret = -EBUSY;
4923 if (!trylock_page(page))
4924 goto out;
4925
4926 ret = -EINVAL;
4927 if (page->mem_cgroup != from)
4928 goto out_unlock;
4929
4930 anon = PageAnon(page);
4931
4932 spin_lock_irqsave(&from->move_lock, flags);
4933
4934 if (!anon && page_mapped(page)) {
4935 __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4936 __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4937 }

	/*
	 * move_lock grabbed above and caller set from->moving_account, so
	 * mod_memcg_page_state will serialize updates to PageDirty.
	 * So mapping should be stable for dirty pages.
	 */
4944 if (!anon && PageDirty(page)) {
4945 struct address_space *mapping = page_mapping(page);
4946
4947 if (mapping_cap_account_dirty(mapping)) {
4948 __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4949 __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
4950 }
4951 }
4952
4953 if (PageWriteback(page)) {
4954 __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4955 __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
4956 }

	/*
	 * It is safe to change page->mem_cgroup here because the page
	 * is referenced, charged, and isolated - we can't race with
	 * uncharging, charging, migration, or LRU putback.
	 */

	/* caller should have done css_get */
4965 page->mem_cgroup = to;
4966 spin_unlock_irqrestore(&from->move_lock, flags);
4967
4968 ret = 0;
4969
4970 local_irq_disable();
4971 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4972 memcg_check_events(to, page);
4973 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4974 memcg_check_events(from, page);
4975 local_irq_enable();
4976out_unlock:
4977 unlock_page(page);
4978out:
4979 return ret;
4980}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored(can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. if @target is not NULL, the page is stored in target->page
 *     with extra refcnt got(Callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. if @target is not NULL, the entry is
 *     stored in target->ent.
 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
 *     or MEMORY_DEVICE_PUBLIC (so ZONE_DEVICE page and thus not on the lru).
 *     For now such a page is charged like a regular page would be, as for
 *     all intents and purposes it is just special memory taking the place
 *     of a regular page.
 *
 *     See Documentation/vm/hmm.rst and include/linux/hmm.h
 *
 * Called with pte lock held.
 */
5008static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5009 unsigned long addr, pte_t ptent, union mc_target *target)
5010{
5011 struct page *page = NULL;
5012 enum mc_target_type ret = MC_TARGET_NONE;
5013 swp_entry_t ent = { .val = 0 };
5014
5015 if (pte_present(ptent))
5016 page = mc_handle_present_pte(vma, addr, ptent);
5017 else if (is_swap_pte(ptent))
5018 page = mc_handle_swap_pte(vma, ptent, &ent);
5019 else if (pte_none(ptent))
5020 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5021
5022 if (!page && !ent.val)
5023 return ret;
5024 if (page) {
		/*
		 * Do only loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
5030 if (page->mem_cgroup == mc.from) {
5031 ret = MC_TARGET_PAGE;
5032 if (is_device_private_page(page) ||
5033 is_device_public_page(page))
5034 ret = MC_TARGET_DEVICE;
5035 if (target)
5036 target->page = page;
5037 }
5038 if (!ret || !target)
5039 put_page(page);
5040 }

	/*
	 * There is a swap entry and a page doesn't exist or isn't charged.
	 * But we cannot move a tail-page in a THP.
	 */
5045 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5046 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5047 ret = MC_TARGET_SWAP;
5048 if (target)
5049 target->ent = ent;
5050 }
5051 return ret;
5052}
5053
5054#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
5060static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5061 unsigned long addr, pmd_t pmd, union mc_target *target)
5062{
5063 struct page *page = NULL;
5064 enum mc_target_type ret = MC_TARGET_NONE;
5065
5066 if (unlikely(is_swap_pmd(pmd))) {
5067 VM_BUG_ON(thp_migration_supported() &&
5068 !is_pmd_migration_entry(pmd));
5069 return ret;
5070 }
5071 page = pmd_page(pmd);
5072 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5073 if (!(mc.flags & MOVE_ANON))
5074 return ret;
5075 if (page->mem_cgroup == mc.from) {
5076 ret = MC_TARGET_PAGE;
5077 if (target) {
5078 get_page(page);
5079 target->page = page;
5080 }
5081 }
5082 return ret;
5083}
5084#else
5085static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5086 unsigned long addr, pmd_t pmd, union mc_target *target)
5087{
5088 return MC_TARGET_NONE;
5089}
5090#endif
5091
5092static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5093 unsigned long addr, unsigned long end,
5094 struct mm_walk *walk)
5095{
5096 struct vm_area_struct *vma = walk->vma;
5097 pte_t *pte;
5098 spinlock_t *ptl;
5099
5100 ptl = pmd_trans_huge_lock(pmd, vma);
5101 if (ptl) {
		/*
		 * Note there can not be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge page with MEMORY_DEVICE_PUBLIC or
		 * MEMORY_DEVICE_PRIVATE but this might change.
		 */
5107 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5108 mc.precharge += HPAGE_PMD_NR;
5109 spin_unlock(ptl);
5110 return 0;
5111 }
5112
5113 if (pmd_trans_unstable(pmd))
5114 return 0;
5115 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5116 for (; addr != end; pte++, addr += PAGE_SIZE)
5117 if (get_mctgt_type(vma, addr, *pte, NULL))
5118 mc.precharge++;
5119 pte_unmap_unlock(pte - 1, ptl);
5120 cond_resched();
5121
5122 return 0;
5123}
5124
5125static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5126{
5127 unsigned long precharge;
5128
5129 struct mm_walk mem_cgroup_count_precharge_walk = {
5130 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5131 .mm = mm,
5132 };
5133 down_read(&mm->mmap_sem);
5134 walk_page_range(0, mm->highest_vm_end,
5135 &mem_cgroup_count_precharge_walk);
5136 up_read(&mm->mmap_sem);
5137
5138 precharge = mc.precharge;
5139 mc.precharge = 0;
5140
5141 return precharge;
5142}
5143
5144static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5145{
5146 unsigned long precharge = mem_cgroup_count_precharge(mm);
5147
5148 VM_BUG_ON(mc.moving_task);
5149 mc.moving_task = current;
5150 return mem_cgroup_do_precharge(precharge);
5151}
5152
5153
5154static void __mem_cgroup_clear_mc(void)
5155{
5156 struct mem_cgroup *from = mc.from;
5157 struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
5160 if (mc.precharge) {
5161 cancel_charge(mc.to, mc.precharge);
5162 mc.precharge = 0;
5163 }

	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
5168 if (mc.moved_charge) {
5169 cancel_charge(mc.from, mc.moved_charge);
5170 mc.moved_charge = 0;
5171 }
5172
5173 if (mc.moved_swap) {
		/* we must fixup refcnts and charges */
5175 if (!mem_cgroup_is_root(mc.from))
5176 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5177
5178 mem_cgroup_id_put_many(mc.from, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
5184 if (!mem_cgroup_is_root(mc.to))
5185 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5186
5187 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5188 css_put_many(&mc.to->css, mc.moved_swap);
5189
5190 mc.moved_swap = 0;
5191 }
5192 memcg_oom_recover(from);
5193 memcg_oom_recover(to);
5194 wake_up_all(&mc.waitq);
5195}
5196
5197static void mem_cgroup_clear_mc(void)
5198{
5199 struct mm_struct *mm = mc.mm;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
5205 mc.moving_task = NULL;
5206 __mem_cgroup_clear_mc();
5207 spin_lock(&mc.lock);
5208 mc.from = NULL;
5209 mc.to = NULL;
5210 mc.mm = NULL;
5211 spin_unlock(&mc.lock);
5212
5213 mmput(mm);
5214}
5215
5216static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5217{
5218 struct cgroup_subsys_state *css;
5219 struct mem_cgroup *memcg = NULL;
5220 struct mem_cgroup *from;
5221 struct task_struct *leader, *p;
5222 struct mm_struct *mm;
5223 unsigned long move_flags;
5224 int ret = 0;

	/* charge immigration isn't supported on the default hierarchy */
5227 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5228 return 0;

	/*
	 * Multi-process migrations only happen on the default hierarchy,
	 * where charge immigration is not used.  There should thus be at
	 * most one leader in @tset here; warn and use the last one if
	 * that assumption is ever violated.
	 */
5236 p = NULL;
5237 cgroup_taskset_for_each_leader(leader, css, tset) {
5238 WARN_ON_ONCE(p);
5239 p = leader;
5240 memcg = mem_cgroup_from_css(css);
5241 }
5242 if (!p)
5243 return 0;

	/*
	 * The flags are read once here and used for the whole migration;
	 * per the comment in mem_cgroup_move_charge_write(), concurrent
	 * changes only affect migrations that start afterwards.
	 */
5250 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5251 if (!move_flags)
5252 return 0;
5253
5254 from = mem_cgroup_from_task(p);
5255
5256 VM_BUG_ON(from == memcg);
5257
5258 mm = get_task_mm(p);
5259 if (!mm)
5260 return 0;
5261
5262 if (mm->owner == p) {
5263 VM_BUG_ON(mc.from);
5264 VM_BUG_ON(mc.to);
5265 VM_BUG_ON(mc.precharge);
5266 VM_BUG_ON(mc.moved_charge);
5267 VM_BUG_ON(mc.moved_swap);
5268
5269 spin_lock(&mc.lock);
5270 mc.mm = mm;
5271 mc.from = from;
5272 mc.to = memcg;
5273 mc.flags = move_flags;
5274 spin_unlock(&mc.lock);
5275
5276
5277 ret = mem_cgroup_precharge_mc(mm);
5278 if (ret)
5279 mem_cgroup_clear_mc();
5280 } else {
5281 mmput(mm);
5282 }
5283 return ret;
5284}
5285
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	if (mc.to)
		mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				if (!mem_cgroup_move_account(page, true,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		} else if (target_type == MC_TARGET_DEVICE) {
			page = target.page;
			if (!mem_cgroup_move_account(page, true,
						     mc.from, mc.to)) {
				mc.precharge -= HPAGE_PMD_NR;
				mc.moved_charge += HPAGE_PMD_NR;
			}
			put_page(page);
		}
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		bool device = false;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_DEVICE:
			device = true;
			/* fall through */
		case MC_TARGET_PAGE:
			page = target.page;
			/*
			 * We can have a part of the split pmd here. Moving it
			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in original
			 * memcg. There should be somebody mapping the head.
			 */
			if (PageTransCompound(page))
				goto put;
			if (!device && isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in
		 * attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(void)
{
	struct mm_walk mem_cgroup_move_charge_walk = {
		.pmd_entry = mem_cgroup_move_charge_pte_range,
		.mm = mc.mm,
	};

	lru_add_drain_all();
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);

	up_read(&mc.mm->mmap_sem);
	atomic_dec(&mc.from->moving_account);
}

static void mem_cgroup_move_task(void)
{
	if (mc.to) {
		mem_cgroup_move_charge();
		mem_cgroup_clear_mc();
	}
}
#else
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(void)
{
}
#endif

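/*
 * Called when the memory controller is [re]bound to a hierarchy; keep the
 * root memcg's hierarchy mode in sync with the hierarchy it is attached to.
 */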
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		root_mem_cgroup->use_hierarchy = true;
	else
		root_mem_cgroup->use_hierarchy = false;
}

static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
	if (value == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);

	return 0;
}

static u64 memory_current_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}

static int memory_min_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}

static ssize_t memory_min_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long min;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &min);
	if (err)
		return err;

	page_counter_set_min(&memcg->memory, min);

	return nbytes;
}

static int memory_low_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}

static ssize_t memory_low_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long low;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &low);
	if (err)
		return err;

	page_counter_set_low(&memcg->memory, low);

	return nbytes;
}

static int memory_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}

static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	memcg->high = high;

	nr_pages = page_counter_read(&memcg->memory);
	if (nr_pages > high)
		try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
					     GFP_KERNEL, true);

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}

static int memory_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}

static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->memory.max, max);

	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);

		if (nr_pages <= max)
			break;

		if (signal_pending(current)) {
			err = -EINTR;
			break;
		}

		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (nr_reclaims) {
			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
							  GFP_KERNEL, true))
				nr_reclaims--;
			continue;
		}

		memcg_memory_event(memcg, MEMCG_OOM);
		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
			break;
	}

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}

static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "low %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
	seq_printf(m, "high %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
	seq_printf(m, "oom %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
	seq_printf(m, "oom_kill %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));

	return 0;
}

static int memory_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	int i;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */

	seq_printf(m, "anon %llu\n",
		   (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
	seq_printf(m, "file %llu\n",
		   (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
	seq_printf(m, "kernel_stack %llu\n",
		   (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
	seq_printf(m, "slab %llu\n",
		   (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
			 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
		   PAGE_SIZE);
	seq_printf(m, "sock %llu\n",
		   (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);

	seq_printf(m, "shmem %llu\n",
		   (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
	seq_printf(m, "file_mapped %llu\n",
		   (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
	seq_printf(m, "file_dirty %llu\n",
		   (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
	seq_printf(m, "file_writeback %llu\n",
		   (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);

	/* anon_thp is tracked with the memcg-private MEMCG_RSS_HUGE counter */
	seq_printf(m, "anon_thp %llu\n",
		   (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			   PAGE_SIZE);

	seq_printf(m, "slab_reclaimable %llu\n",
		   (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
		   PAGE_SIZE);
	seq_printf(m, "slab_unreclaimable %llu\n",
		   (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
		   PAGE_SIZE);

	/* Accumulated memory events */

	seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
	seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));

	seq_printf(m, "workingset_refault %lu\n",
		   memcg_page_state(memcg, WORKINGSET_REFAULT));
	seq_printf(m, "workingset_activate %lu\n",
		   memcg_page_state(memcg, WORKINGSET_ACTIVATE));
	seq_printf(m, "workingset_nodereclaim %lu\n",
		   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));

	seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
	seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
		   memcg_events(memcg, PGSCAN_DIRECT));
	seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
		   memcg_events(memcg, PGSTEAL_DIRECT));
	seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
	seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
	seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
	seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	seq_printf(m, "thp_fault_alloc %lu\n",
		   memcg_events(memcg, THP_FAULT_ALLOC));
	seq_printf(m, "thp_collapse_alloc %lu\n",
		   memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif

	return 0;
}

static int memory_oom_group_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "%d\n", memcg->oom_group);

	return 0;
}

static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ret, oom_group;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	ret = kstrtoint(buf, 0, &oom_group);
	if (ret)
		return ret;

	if (oom_group != 0 && oom_group != 1)
		return -EINVAL;

	memcg->oom_group = oom_group;

	return nbytes;
}

static struct cftype memory_files[] = {
	{
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		.name = "min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_min_show,
		.write = memory_min_write,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_stat_show,
	},
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
		.seq_show = memory_oom_group_show,
		.write = memory_oom_group_write,
	},
	{ }	/* terminate */
};

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

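/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
 *     an unprotected supply of reclaimable memory from other cgroups.
 *   MEMCG_PROT_MIN: cgroup memory is protected
 *
 * The effective protection of @memcg is capped by its own memory.min/low
 * settings and scaled against the parent's effective protection: when the
 * children of a parent collectively claim more protected usage than the
 * parent has to give, each child's share is reduced proportionally, e.g.
 * for the low limit:
 *
 *     low_usage = min(usage, memory.low)
 *     elow = min(memory.low,
 *                parent_elow * low_usage / siblings_low_usage)
 *
 * and analogously for memory.min.
 */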
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
						struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;
	unsigned long emin, parent_emin;
	unsigned long elow, parent_elow;
	unsigned long usage;

	if (mem_cgroup_disabled())
		return MEMCG_PROT_NONE;

	if (!root)
		root = root_mem_cgroup;
	if (memcg == root)
		return MEMCG_PROT_NONE;

	usage = page_counter_read(&memcg->memory);
	if (!usage)
		return MEMCG_PROT_NONE;

	emin = memcg->memory.min;
	elow = memcg->memory.low;

	parent = parent_mem_cgroup(memcg);
	/* No parent means a non-hierarchical mode on v1 memcg */
	if (!parent)
		return MEMCG_PROT_NONE;

	if (parent == root)
		goto exit;

	parent_emin = READ_ONCE(parent->memory.emin);
	emin = min(emin, parent_emin);
	if (emin && parent_emin) {
		unsigned long min_usage, siblings_min_usage;

		min_usage = min(usage, memcg->memory.min);
		siblings_min_usage = atomic_long_read(
			&parent->memory.children_min_usage);

		if (min_usage && siblings_min_usage)
			emin = min(emin, parent_emin * min_usage /
				   siblings_min_usage);
	}

	parent_elow = READ_ONCE(parent->memory.elow);
	elow = min(elow, parent_elow);
	if (elow && parent_elow) {
		unsigned long low_usage, siblings_low_usage;

		low_usage = min(usage, memcg->memory.low);
		siblings_low_usage = atomic_long_read(
			&parent->memory.children_low_usage);

		if (low_usage && siblings_low_usage)
			elow = min(elow, parent_elow * low_usage /
				   siblings_low_usage);
	}

exit:
	memcg->memory.emin = emin;
	memcg->memory.elow = elow;

	if (usage <= emin)
		return MEMCG_PROT_MIN;
	else if (usage <= elow)
		return MEMCG_PROT_LOW;
	else
		return MEMCG_PROT_NONE;
}

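/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge().  Or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */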
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
			  bool compound)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The page lock serializes swap
		 * cache removal, which in turn serializes uncharging.
		 */
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		if (compound_head(page)->mem_cgroup)
			goto out;

		if (do_swap_account) {
			swp_entry_t ent = { .val = page_private(page), };
			unsigned short id = lookup_swap_cgroup_id(ent);

			rcu_read_lock();
			memcg = mem_cgroup_from_id(id);
			if (memcg && !css_tryget_online(&memcg->css))
				memcg = NULL;
			rcu_read_unlock();
		}
	}

	if (!memcg)
		memcg = get_mem_cgroup_from_mm(mm);

	ret = try_charge(memcg, gfp_mask, nr_pages);

	css_put(&memcg->css);
out:
	*memcgp = memcg;
	return ret;
}

int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, struct mem_cgroup **memcgp,
				bool compound)
{
	struct mem_cgroup *memcg;
	int ret;

	ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
	memcg = *memcgp;
	mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
	return ret;
}

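/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 * @compound: charge the page as compound or small page
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */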
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare, bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_PAGE(!page->mapping, page);
	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	commit_charge(page, memcg, lrucare);

	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
	memcg_check_events(memcg, page);
	local_irq_enable();

	if (do_memsw_account() && PageSwapCache(page)) {
		swp_entry_t entry = { .val = page_private(page) };
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry, nr_pages);
	}
}

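/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @compound: charge the page as compound or small page
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */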
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
			      bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	cancel_charge(memcg, nr_pages);
}

struct uncharge_gather {
	struct mem_cgroup *memcg;
	unsigned long pgpgout;
	unsigned long nr_anon;
	unsigned long nr_file;
	unsigned long nr_kmem;
	unsigned long nr_huge;
	unsigned long nr_shmem;
	struct page *dummy_page;
};

static inline void uncharge_gather_clear(struct uncharge_gather *ug)
{
	memset(ug, 0, sizeof(*ug));
}

static void uncharge_batch(const struct uncharge_gather *ug)
{
	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
	unsigned long flags;

	if (!mem_cgroup_is_root(ug->memcg)) {
		page_counter_uncharge(&ug->memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
		memcg_oom_recover(ug->memcg);
	}

	local_irq_save(flags);
	__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
	__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
	memcg_check_events(ug->memcg, ug->dummy_page);
	local_irq_restore(flags);

	if (!mem_cgroup_is_root(ug->memcg))
		css_put_many(&ug->memcg->css, nr_pages);
}

static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
			!PageHWPoison(page), page);

	if (!page->mem_cgroup)
		return;

	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point, we have fully
	 * exclusive access to the page.
	 */
	if (ug->memcg != page->mem_cgroup) {
		if (ug->memcg) {
			uncharge_batch(ug);
			uncharge_gather_clear(ug);
		}
		ug->memcg = page->mem_cgroup;
	}

	if (!PageKmemcg(page)) {
		unsigned int nr_pages = 1;

		if (PageTransHuge(page)) {
			nr_pages <<= compound_order(page);
			ug->nr_huge += nr_pages;
		}
		if (PageAnon(page))
			ug->nr_anon += nr_pages;
		else {
			ug->nr_file += nr_pages;
			if (PageSwapBacked(page))
				ug->nr_shmem += nr_pages;
		}
		ug->pgpgout++;
	} else {
		ug->nr_kmem += 1 << compound_order(page);
		__ClearPageKmemcg(page);
	}

	ug->dummy_page = page;
	page->mem_cgroup = NULL;
}

static void uncharge_list(struct list_head *page_list)
{
	struct uncharge_gather ug;
	struct list_head *next;

	uncharge_gather_clear(&ug);

	/*
	 * Note that the list can be a single page->lru; hence the
	 * do-while loop instead of a simple list_for_each_entry().
	 */
	next = page_list->next;
	do {
		struct page *page;

		page = list_entry(next, struct page, lru);
		next = page->lru.next;

		uncharge_page(page, &ug);
	} while (next != page_list);

	if (ug.memcg)
		uncharge_batch(&ug);
}

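/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */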
void mem_cgroup_uncharge(struct page *page)
{
	struct uncharge_gather ug;

	if (mem_cgroup_disabled())
		return;

	/* Don't touch page->lru of any random page, pre-check: */
	if (!page->mem_cgroup)
		return;

	uncharge_gather_clear(&ug);
	uncharge_page(page, &ug);
	uncharge_batch(&ug);
}

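/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */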
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled())
		return;

	if (!list_empty(page_list))
		uncharge_list(page_list);
}

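/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */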
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
	struct mem_cgroup *memcg;
	unsigned int nr_pages;
	bool compound;
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
		       newpage);

	if (mem_cgroup_disabled())
		return;

	/* Page cache replacement: new page already charged? */
	if (newpage->mem_cgroup)
		return;

	/* Swapcache readahead pages can get replaced before being charged */
	memcg = oldpage->mem_cgroup;
	if (!memcg)
		return;

	/* Force-charge the new page. The old one will be freed soon */
	compound = PageTransHuge(newpage);
	nr_pages = compound ? hpage_nr_pages(newpage) : 1;

	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);
	css_get_many(&memcg->css, nr_pages);

	commit_charge(newpage, memcg, false);

	local_irq_save(flags);
	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
	memcg_check_events(memcg, newpage);
	local_irq_restore(flags);
}

DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);

void mem_cgroup_sk_alloc(struct sock *sk)
{
	struct mem_cgroup *memcg;

	if (!mem_cgroup_sockets_enabled)
		return;

	/*
	 * Socket cloning can throw us here with sk_memcg already
	 * filled. It won't, however, necessarily happen from
	 * process context. So the test for root memcg given
	 * the current task's memcg won't help us in this case.
	 *
	 * Respecting the original socket's memcg is a better
	 * decision in this case.
	 */
	if (sk->sk_memcg) {
		css_get(&sk->sk_memcg->css);
		return;
	}

	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (memcg == root_mem_cgroup)
		goto out;
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
		goto out;
	if (css_tryget_online(&memcg->css))
		sk->sk_memcg = memcg;
out:
	rcu_read_unlock();
}

void mem_cgroup_sk_free(struct sock *sk)
{
	if (sk->sk_memcg)
		css_put(&sk->sk_memcg->css);
}

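/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if it doesn't.
 */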
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	gfp_t gfp_mask = GFP_KERNEL;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		struct page_counter *fail;

		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
			memcg->tcpmem_pressure = 0;
			return true;
		}
		/* Charge anyway, but raise the pressure flag */
		page_counter_charge(&memcg->tcpmem, nr_pages);
		memcg->tcpmem_pressure = 1;
		return false;
	}

	/* Don't block in the packet receive path */
	if (in_softirq())
		gfp_mask = GFP_NOWAIT;

	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);

	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
		return true;

	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
	return false;
}

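/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */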
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		page_counter_uncharge(&memcg->tcpmem, nr_pages);
		return;
	}

	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);

	refill_stock(memcg, nr_pages);
}

static int __init cgroup_memory(char *s)
{
	char *token;

	while ((token = strsep(&s, ",")) != NULL) {
		if (!*token)
			continue;
		if (!strcmp(token, "nosocket"))
			cgroup_memory_nosocket = true;
		if (!strcmp(token, "nokmem"))
			cgroup_memory_nokmem = true;
	}
	return 0;
}
__setup("cgroup.memory=", cgroup_memory);

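/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */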
static int __init mem_cgroup_init(void)
{
	int cpu, node;

#ifdef CONFIG_MEMCG_KMEM
	/*
	 * Kmem cache creation is mostly done with the slab_mutex held,
	 * so use a workqueue with limited concurrency to avoid stalling
	 * all worker threads in case lots of cgroups are created and
	 * destroyed simultaneously.
	 */
	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
	BUG_ON(!memcg_kmem_cache_wq);
#endif

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
				    node_online(node) ? node : NUMA_NO_NODE);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(mem_cgroup_init);

#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}

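/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */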
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned int nr_entries;
	unsigned short oldid;

	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);

	if (!do_memsw_account())
		return;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	nr_entries = hpage_nr_pages(page);
	/* Get references for the tail pages, too */
	if (nr_entries > 1)
		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
				   nr_entries);
	VM_BUG_ON_PAGE(oldid, page);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	page->mem_cgroup = NULL;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	VM_BUG_ON(!irqs_disabled());
	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
				     -nr_entries);
	memcg_check_events(memcg, page);

	if (!mem_cgroup_is_root(memcg))
		css_put_many(&memcg->css, nr_entries);
}

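/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge the swap space used by @page to the memcg that owns it.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */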
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
	unsigned int nr_pages = hpage_nr_pages(page);
	struct page_counter *counter;
	struct mem_cgroup *memcg;
	unsigned short oldid;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
		return 0;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return 0;

	if (!entry.val) {
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		return 0;
	}

	memcg = mem_cgroup_id_get_online(memcg);

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		mem_cgroup_id_put(memcg);
		return -ENOMEM;
	}

	/* Get references for the tail pages, too */
	if (nr_pages > 1)
		mem_cgroup_id_get_many(memcg, nr_pages - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
	VM_BUG_ON_PAGE(oldid, page);
	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);

	return 0;
}

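/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */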
void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(entry, 0, nr_pages);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg)) {
			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
				page_counter_uncharge(&memcg->swap, nr_pages);
			else
				page_counter_uncharge(&memcg->memsw, nr_pages);
		}
		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
		mem_cgroup_id_put_many(memcg, nr_pages);
	}
	rcu_read_unlock();
}

long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return nr_swap_pages;
	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.max) -
				      page_counter_read(&memcg->swap));
	return nr_swap_pages;
}

bool mem_cgroup_swap_full(struct page *page)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (vm_swap_full())
		return true;
	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return false;

	memcg = page->mem_cgroup;
	if (!memcg)
		return false;

	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
			return true;

	return false;
}

/* remember the swapaccount= boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}

static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}

static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static int __init mem_cgroup_swap_init(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account) {
		do_swap_account = 1;
		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
					       swap_files));
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
						  memsw_cgroup_files));
	}
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */