34#include <linux/page_counter.h>
35#include <linux/memcontrol.h>
36#include <linux/cgroup.h>
37#include <linux/mm.h>
38#include <linux/sched/mm.h>
39#include <linux/shmem_fs.h>
40#include <linux/hugetlb.h>
41#include <linux/pagemap.h>
42#include <linux/smp.h>
43#include <linux/page-flags.h>
44#include <linux/backing-dev.h>
45#include <linux/bit_spinlock.h>
46#include <linux/rcupdate.h>
47#include <linux/limits.h>
48#include <linux/export.h>
49#include <linux/mutex.h>
50#include <linux/rbtree.h>
51#include <linux/slab.h>
52#include <linux/swap.h>
53#include <linux/swapops.h>
54#include <linux/spinlock.h>
55#include <linux/eventfd.h>
56#include <linux/poll.h>
57#include <linux/sort.h>
58#include <linux/fs.h>
59#include <linux/seq_file.h>
60#include <linux/vmpressure.h>
61#include <linux/mm_inline.h>
62#include <linux/swap_cgroup.h>
63#include <linux/cpu.h>
64#include <linux/oom.h>
65#include <linux/lockdep.h>
66#include <linux/file.h>
67#include <linux/tracehook.h>
68#include "internal.h"
69#include <net/sock.h>
70#include <net/ip.h>
71#include "slab.h"
72
73#include <linux/uaccess.h>
74
75#include <trace/events/vmscan.h>
76
77struct cgroup_subsys memory_cgrp_subsys __read_mostly;
78EXPORT_SYMBOL(memory_cgrp_subsys);
79
80struct mem_cgroup *root_mem_cgroup __read_mostly;
81
82#define MEM_CGROUP_RECLAIM_RETRIES 5
83
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif
96
97
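/*
 * Combined memory+swap ("memsw") accounting is only used on the legacy
 * cgroup v1 hierarchy, and only when swap accounting is enabled there.
 */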
98static bool do_memsw_account(void)
99{
100 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
101}
102
103static const char *const mem_cgroup_lru_names[] = {
104 "inactive_anon",
105 "active_anon",
106 "inactive_file",
107 "active_file",
108 "unevictable",
109};
110
111#define THRESHOLDS_EVENTS_TARGET 128
112#define SOFTLIMIT_EVENTS_TARGET 1024
113#define NUMAINFO_EVENTS_TARGET 1024
114
115
116
117
118
119
120struct mem_cgroup_tree_per_node {
121 struct rb_root rb_root;
122 struct rb_node *rb_rightmost;
123 spinlock_t lock;
124};
125
126struct mem_cgroup_tree {
127 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
128};
129
130static struct mem_cgroup_tree soft_limit_tree __read_mostly;
131
132
133struct mem_cgroup_eventfd_list {
134 struct list_head list;
135 struct eventfd_ctx *eventfd;
136};
137
138
139
140
141struct mem_cgroup_event {
142
143
144
145 struct mem_cgroup *memcg;
146
147
148
149 struct eventfd_ctx *eventfd;
150
151
152
153 struct list_head list;
154
155
156
157
158
159 int (*register_event)(struct mem_cgroup *memcg,
160 struct eventfd_ctx *eventfd, const char *args);
161
162
163
164
165
166 void (*unregister_event)(struct mem_cgroup *memcg,
167 struct eventfd_ctx *eventfd);
168
169
170
171
172 poll_table pt;
173 wait_queue_head_t *wqh;
174 wait_queue_entry_t wait;
175 struct work_struct remove;
176};
177
178static void mem_cgroup_threshold(struct mem_cgroup *memcg);
179static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
180
181
182
183
184
185#define MOVE_ANON 0x1U
186#define MOVE_FILE 0x2U
187#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
188
189
190static struct move_charge_struct {
191 spinlock_t lock;
192 struct mm_struct *mm;
193 struct mem_cgroup *from;
194 struct mem_cgroup *to;
195 unsigned long flags;
196 unsigned long precharge;
197 unsigned long moved_charge;
198 unsigned long moved_swap;
199 struct task_struct *moving_task;
200 wait_queue_head_t waitq;
201} mc = {
202 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
203 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
204};
205
206
207
208
209
210#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
211#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
212
213enum charge_type {
214 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
215 MEM_CGROUP_CHARGE_TYPE_ANON,
216 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
217 MEM_CGROUP_CHARGE_TYPE_DROP,
218 NR_CHARGE_TYPE,
219};
220
221
222enum res_type {
223 _MEM,
224 _MEMSWAP,
225 _OOM_TYPE,
226 _KMEM,
227 _TCP,
228};
229
230#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
231#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
232#define MEMFILE_ATTR(val) ((val) & 0xffff)
233
234#define OOM_CONTROL (0)
235
236
237
238
239
240
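/*
 * Iterate over @root's subtree (or over every memory cgroup when @root is
 * NULL). The iterator takes a css reference on each position it returns,
 * so a walk that is abandoned early must call mem_cgroup_iter_break().
 * Illustrative use only (do_something() is a placeholder):
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, memcg)
 *		do_something(iter);
 */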
241#define for_each_mem_cgroup_tree(iter, root) \
242 for (iter = mem_cgroup_iter(root, NULL, NULL); \
243 iter != NULL; \
244 iter = mem_cgroup_iter(root, iter, NULL))
245
246#define for_each_mem_cgroup(iter) \
247 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
248 iter != NULL; \
249 iter = mem_cgroup_iter(NULL, iter, NULL))
250
251
252struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
253{
254 if (!memcg)
255 memcg = root_mem_cgroup;
256 return &memcg->vmpressure;
257}
258
259struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
260{
261 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
262}
263
264#ifdef CONFIG_MEMCG_KMEM
265
266
267
268
269
270
271
272
273
274
275
276static DEFINE_IDA(memcg_cache_ida);
277int memcg_nr_cache_ids;
278
279
280static DECLARE_RWSEM(memcg_cache_ids_sem);
281
282void memcg_get_cache_ids(void)
283{
284 down_read(&memcg_cache_ids_sem);
285}
286
287void memcg_put_cache_ids(void)
288{
289 up_read(&memcg_cache_ids_sem);
290}
291
292
293
294
295
296
297
298
299
300
301
302
303
304#define MEMCG_CACHES_MIN_SIZE 4
305#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
306
307
308
309
310
311
312
313DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
314EXPORT_SYMBOL(memcg_kmem_enabled_key);
315
316struct workqueue_struct *memcg_kmem_cache_wq;
317
318static int memcg_shrinker_map_size;
319static DEFINE_MUTEX(memcg_shrinker_map_mutex);
320
321static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
322{
323 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
324}
325
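/*
 * Grow the per-node shrinker bitmaps of @memcg from @old_size to @size
 * bytes: allocate the larger map, copy the old bits, zero the new tail,
 * publish it with rcu_assign_pointer() and free the old map after a grace
 * period.
 */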
326static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
327 int size, int old_size)
328{
329 struct memcg_shrinker_map *new, *old;
330 int nid;
331
332 lockdep_assert_held(&memcg_shrinker_map_mutex);
333
334 for_each_node(nid) {
335 old = rcu_dereference_protected(
336 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
337
338 if (!old)
339 return 0;
340
341 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
342 if (!new)
343 return -ENOMEM;
344
345
346 memset(new->map, (int)0xff, old_size);
347 memset((void *)new->map + old_size, 0, size - old_size);
348
349 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
350 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
351 }
352
353 return 0;
354}
355
356static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
357{
358 struct mem_cgroup_per_node *pn;
359 struct memcg_shrinker_map *map;
360 int nid;
361
362 if (mem_cgroup_is_root(memcg))
363 return;
364
365 for_each_node(nid) {
366 pn = mem_cgroup_nodeinfo(memcg, nid);
367 map = rcu_dereference_protected(pn->shrinker_map, true);
368 if (map)
369 kvfree(map);
370 rcu_assign_pointer(pn->shrinker_map, NULL);
371 }
372}
373
374static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
375{
376 struct memcg_shrinker_map *map;
377 int nid, size, ret = 0;
378
379 if (mem_cgroup_is_root(memcg))
380 return 0;
381
382 mutex_lock(&memcg_shrinker_map_mutex);
383 size = memcg_shrinker_map_size;
384 for_each_node(nid) {
385 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
386 if (!map) {
387 memcg_free_shrinker_maps(memcg);
388 ret = -ENOMEM;
389 break;
390 }
391 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
392 }
393 mutex_unlock(&memcg_shrinker_map_mutex);
394
395 return ret;
396}
397
398int memcg_expand_shrinker_maps(int new_id)
399{
400 int size, old_size, ret = 0;
401 struct mem_cgroup *memcg;
402
403 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
404 old_size = memcg_shrinker_map_size;
405 if (size <= old_size)
406 return 0;
407
408 mutex_lock(&memcg_shrinker_map_mutex);
409 if (!root_mem_cgroup)
410 goto unlock;
411
412 for_each_mem_cgroup(memcg) {
413 if (mem_cgroup_is_root(memcg))
414 continue;
415 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
416 if (ret)
417 goto unlock;
418 }
419unlock:
420 if (!ret)
421 memcg_shrinker_map_size = size;
422 mutex_unlock(&memcg_shrinker_map_mutex);
423 return ret;
424}
425
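/*
 * Record that shrinker @shrinker_id has freeable objects for @memcg on node
 * @nid, so that memcg-aware shrinking will call it. The barrier is intended
 * to pair with the one in shrink_slab() before the bit is tested.
 */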
426void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
427{
428 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
429 struct memcg_shrinker_map *map;
430
431 rcu_read_lock();
432 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
433
434 smp_mb__before_atomic();
435 set_bit(shrinker_id, map->map);
436 rcu_read_unlock();
437 }
438}
439
440#else
441static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
442{
443 return 0;
444}
445static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
446#endif
447
448
449
450
451
452
453
454
455
456
457
458
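/*
 * Return the cgroup css associated with @page's memcg. Uncharged pages,
 * and all pages on the legacy (v1) hierarchy, map to the root memcg's css.
 */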
459struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
460{
461 struct mem_cgroup *memcg;
462
463 memcg = page->mem_cgroup;
464
465 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
466 memcg = root_mem_cgroup;
467
468 return &memcg->css;
469}
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484ino_t page_cgroup_ino(struct page *page)
485{
486 struct mem_cgroup *memcg;
487 unsigned long ino = 0;
488
489 rcu_read_lock();
490 memcg = READ_ONCE(page->mem_cgroup);
491 while (memcg && !(memcg->css.flags & CSS_ONLINE))
492 memcg = parent_mem_cgroup(memcg);
493 if (memcg)
494 ino = cgroup_ino(memcg->css.cgroup);
495 rcu_read_unlock();
496 return ino;
497}
498
499static struct mem_cgroup_per_node *
500mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
501{
502 int nid = page_to_nid(page);
503
504 return memcg->nodeinfo[nid];
505}
506
507static struct mem_cgroup_tree_per_node *
508soft_limit_tree_node(int nid)
509{
510 return soft_limit_tree.rb_tree_per_node[nid];
511}
512
513static struct mem_cgroup_tree_per_node *
514soft_limit_tree_from_page(struct page *page)
515{
516 int nid = page_to_nid(page);
517
518 return soft_limit_tree.rb_tree_per_node[nid];
519}
520
521static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
522 struct mem_cgroup_tree_per_node *mctz,
523 unsigned long new_usage_in_excess)
524{
525 struct rb_node **p = &mctz->rb_root.rb_node;
526 struct rb_node *parent = NULL;
527 struct mem_cgroup_per_node *mz_node;
528 bool rightmost = true;
529
530 if (mz->on_tree)
531 return;
532
533 mz->usage_in_excess = new_usage_in_excess;
534 if (!mz->usage_in_excess)
535 return;
536 while (*p) {
537 parent = *p;
538 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
539 tree_node);
540 if (mz->usage_in_excess < mz_node->usage_in_excess) {
541 p = &(*p)->rb_left;
542 rightmost = false;
543 }
544
545
546
547
548
549 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
550 p = &(*p)->rb_right;
551 }
552
553 if (rightmost)
554 mctz->rb_rightmost = &mz->tree_node;
555
556 rb_link_node(&mz->tree_node, parent, p);
557 rb_insert_color(&mz->tree_node, &mctz->rb_root);
558 mz->on_tree = true;
559}
560
561static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
562 struct mem_cgroup_tree_per_node *mctz)
563{
564 if (!mz->on_tree)
565 return;
566
567 if (&mz->tree_node == mctz->rb_rightmost)
568 mctz->rb_rightmost = rb_prev(&mz->tree_node);
569
570 rb_erase(&mz->tree_node, &mctz->rb_root);
571 mz->on_tree = false;
572}
573
574static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
575 struct mem_cgroup_tree_per_node *mctz)
576{
577 unsigned long flags;
578
579 spin_lock_irqsave(&mctz->lock, flags);
580 __mem_cgroup_remove_exceeded(mz, mctz);
581 spin_unlock_irqrestore(&mctz->lock, flags);
582}
583
584static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
585{
586 unsigned long nr_pages = page_counter_read(&memcg->memory);
587 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
588 unsigned long excess = 0;
589
590 if (nr_pages > soft_limit)
591 excess = nr_pages - soft_limit;
592
593 return excess;
594}
595
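/*
 * (Re)position @memcg and each of its ancestors in the soft-limit tree of
 * @page's node, keyed by how far the group currently exceeds its soft
 * limit. Groups already on the tree are removed and re-inserted so that
 * their key stays up to date.
 */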
596static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
597{
598 unsigned long excess;
599 struct mem_cgroup_per_node *mz;
600 struct mem_cgroup_tree_per_node *mctz;
601
602 mctz = soft_limit_tree_from_page(page);
603 if (!mctz)
604 return;
605
606
607
608
609 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
610 mz = mem_cgroup_page_nodeinfo(memcg, page);
611 excess = soft_limit_excess(memcg);
612
613
614
615
616 if (excess || mz->on_tree) {
617 unsigned long flags;
618
619 spin_lock_irqsave(&mctz->lock, flags);
620
621 if (mz->on_tree)
622 __mem_cgroup_remove_exceeded(mz, mctz);
623
624
625
626
627 __mem_cgroup_insert_exceeded(mz, mctz, excess);
628 spin_unlock_irqrestore(&mctz->lock, flags);
629 }
630 }
631}
632
633static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
634{
635 struct mem_cgroup_tree_per_node *mctz;
636 struct mem_cgroup_per_node *mz;
637 int nid;
638
639 for_each_node(nid) {
640 mz = mem_cgroup_nodeinfo(memcg, nid);
641 mctz = soft_limit_tree_node(nid);
642 if (mctz)
643 mem_cgroup_remove_exceeded(mz, mctz);
644 }
645}
646
647static struct mem_cgroup_per_node *
648__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
649{
650 struct mem_cgroup_per_node *mz;
651
652retry:
653 mz = NULL;
654 if (!mctz->rb_rightmost)
655 goto done;
656
657 mz = rb_entry(mctz->rb_rightmost,
658 struct mem_cgroup_per_node, tree_node);
659
660
661
662
663
664 __mem_cgroup_remove_exceeded(mz, mctz);
665 if (!soft_limit_excess(mz->memcg) ||
666 !css_tryget_online(&mz->memcg->css))
667 goto retry;
668done:
669 return mz;
670}
671
672static struct mem_cgroup_per_node *
673mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
674{
675 struct mem_cgroup_per_node *mz;
676
677 spin_lock_irq(&mctz->lock);
678 mz = __mem_cgroup_largest_soft_limit_node(mctz);
679 spin_unlock_irq(&mctz->lock);
680 return mz;
681}
682
683static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
684 int event)
685{
686 return atomic_long_read(&memcg->events[event]);
687}
688
689static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
690 struct page *page,
691 bool compound, int nr_pages)
692{
693
694
695
696
697 if (PageAnon(page))
698 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
699 else {
700 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
701 if (PageSwapBacked(page))
702 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
703 }
704
705 if (compound) {
706 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
707 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
708 }
709
710
711 if (nr_pages > 0)
712 __count_memcg_events(memcg, PGPGIN, 1);
713 else {
714 __count_memcg_events(memcg, PGPGOUT, 1);
715 nr_pages = -nr_pages;
716 }
717
718 __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
719}
720
721unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
722 int nid, unsigned int lru_mask)
723{
724 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
725 unsigned long nr = 0;
726 enum lru_list lru;
727
728 VM_BUG_ON((unsigned)nid >= nr_node_ids);
729
730 for_each_lru(lru) {
731 if (!(BIT(lru) & lru_mask))
732 continue;
733 nr += mem_cgroup_get_lru_size(lruvec, lru);
734 }
735 return nr;
736}
737
738static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
739 unsigned int lru_mask)
740{
741 unsigned long nr = 0;
742 int nid;
743
744 for_each_node_state(nid, N_MEMORY)
745 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
746 return nr;
747}
748
749static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
750 enum mem_cgroup_events_target target)
751{
752 unsigned long val, next;
753
754 val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
755 next = __this_cpu_read(memcg->stat_cpu->targets[target]);
756
757 if ((long)(next - val) < 0) {
758 switch (target) {
759 case MEM_CGROUP_TARGET_THRESH:
760 next = val + THRESHOLDS_EVENTS_TARGET;
761 break;
762 case MEM_CGROUP_TARGET_SOFTLIMIT:
763 next = val + SOFTLIMIT_EVENTS_TARGET;
764 break;
765 case MEM_CGROUP_TARGET_NUMAINFO:
766 next = val + NUMAINFO_EVENTS_TARGET;
767 break;
768 default:
769 break;
770 }
771 __this_cpu_write(memcg->stat_cpu->targets[target], next);
772 return true;
773 }
774 return false;
775}
776
777
778
779
780
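/*
 * Check events in order. Rate-limited by the per-cpu page event counter:
 * threshold notifications, soft-limit tree updates and NUMA-info refreshes
 * each run once their respective event target has been reached.
 */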
781static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
782{
783
784 if (unlikely(mem_cgroup_event_ratelimit(memcg,
785 MEM_CGROUP_TARGET_THRESH))) {
786 bool do_softlimit;
787 bool do_numainfo __maybe_unused;
788
789 do_softlimit = mem_cgroup_event_ratelimit(memcg,
790 MEM_CGROUP_TARGET_SOFTLIMIT);
791#if MAX_NUMNODES > 1
792 do_numainfo = mem_cgroup_event_ratelimit(memcg,
793 MEM_CGROUP_TARGET_NUMAINFO);
794#endif
795 mem_cgroup_threshold(memcg);
796 if (unlikely(do_softlimit))
797 mem_cgroup_update_tree(memcg, page);
798#if MAX_NUMNODES > 1
799 if (unlikely(do_numainfo))
800 atomic_inc(&memcg->numainfo_events);
801#endif
802 }
803}
804
805struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
806{
807
808
809
810
811
812 if (unlikely(!p))
813 return NULL;
814
815 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
816}
817EXPORT_SYMBOL(mem_cgroup_from_task);
818
819
820
821
822
823
824
825
826
827struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
828{
829 struct mem_cgroup *memcg;
830
831 if (mem_cgroup_disabled())
832 return NULL;
833
834 rcu_read_lock();
835 do {
836
837
838
839
840
841 if (unlikely(!mm))
842 memcg = root_mem_cgroup;
843 else {
844 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
845 if (unlikely(!memcg))
846 memcg = root_mem_cgroup;
847 }
848 } while (!css_tryget_online(&memcg->css));
849 rcu_read_unlock();
850 return memcg;
851}
852EXPORT_SYMBOL(get_mem_cgroup_from_mm);
853
854
855
856
857
858
859
860
861struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
862{
863 struct mem_cgroup *memcg = page->mem_cgroup;
864
865 if (mem_cgroup_disabled())
866 return NULL;
867
868 rcu_read_lock();
869 if (!memcg || !css_tryget_online(&memcg->css))
870 memcg = root_mem_cgroup;
871 rcu_read_unlock();
872 return memcg;
873}
874EXPORT_SYMBOL(get_mem_cgroup_from_page);
875
/*
 * If current->active_memcg is set (a remote memcg is being charged on
 * behalf of this task), charge against it; otherwise fall back to the
 * memcg of current->mm.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	if (unlikely(current->active_memcg)) {
		struct mem_cgroup *memcg = root_mem_cgroup;

		rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
			memcg = current->active_memcg;
		rcu_read_unlock();
		return memcg;
	}
	return get_mem_cgroup_from_mm(current->mm);
}
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
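/*
 * mem_cgroup_iter - iterate over a memory cgroup hierarchy
 * @root: hierarchy root (NULL means root_mem_cgroup)
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Performs a pre-order walk of @root's subtree and returns a reference to
 * the next memcg, @root itself, or NULL after a complete round trip.
 * Callers must either walk the hierarchy to completion or break out early
 * with mem_cgroup_iter_break() so the reference on @prev is dropped.
 */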
910struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
911 struct mem_cgroup *prev,
912 struct mem_cgroup_reclaim_cookie *reclaim)
913{
914 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
915 struct cgroup_subsys_state *css = NULL;
916 struct mem_cgroup *memcg = NULL;
917 struct mem_cgroup *pos = NULL;
918
919 if (mem_cgroup_disabled())
920 return NULL;
921
922 if (!root)
923 root = root_mem_cgroup;
924
925 if (prev && !reclaim)
926 pos = prev;
927
928 if (!root->use_hierarchy && root != root_mem_cgroup) {
929 if (prev)
930 goto out;
931 return root;
932 }
933
934 rcu_read_lock();
935
936 if (reclaim) {
937 struct mem_cgroup_per_node *mz;
938
939 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
940 iter = &mz->iter[reclaim->priority];
941
942 if (prev && reclaim->generation != iter->generation)
943 goto out_unlock;
944
945 while (1) {
946 pos = READ_ONCE(iter->position);
947 if (!pos || css_tryget(&pos->css))
948 break;
949
950
951
952
953
954
955
956
957 (void)cmpxchg(&iter->position, pos, NULL);
958 }
959 }
960
961 if (pos)
962 css = &pos->css;
963
964 for (;;) {
965 css = css_next_descendant_pre(css, &root->css);
966 if (!css) {
967
968
969
970
971
972
973 if (!prev)
974 continue;
975 break;
976 }
977
978
979
980
981
982
983 memcg = mem_cgroup_from_css(css);
984
985 if (css == &root->css)
986 break;
987
988 if (css_tryget(css))
989 break;
990
991 memcg = NULL;
992 }
993
994 if (reclaim) {
995
996
997
998
999
1000 (void)cmpxchg(&iter->position, pos, memcg);
1001
1002 if (pos)
1003 css_put(&pos->css);
1004
1005 if (!memcg)
1006 iter->generation++;
1007 else if (!prev)
1008 reclaim->generation = iter->generation;
1009 }
1010
1011out_unlock:
1012 rcu_read_unlock();
1013out:
1014 if (prev && prev != root)
1015 css_put(&prev->css);
1016
1017 return memcg;
1018}
1019
1020
1021
1022
1023
1024
1025void mem_cgroup_iter_break(struct mem_cgroup *root,
1026 struct mem_cgroup *prev)
1027{
1028 if (!root)
1029 root = root_mem_cgroup;
1030 if (prev && prev != root)
1031 css_put(&prev->css);
1032}
1033
1034static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1035{
1036 struct mem_cgroup *memcg = dead_memcg;
1037 struct mem_cgroup_reclaim_iter *iter;
1038 struct mem_cgroup_per_node *mz;
1039 int nid;
1040 int i;
1041
1042 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1043 for_each_node(nid) {
1044 mz = mem_cgroup_nodeinfo(memcg, nid);
1045 for (i = 0; i <= DEF_PRIORITY; i++) {
1046 iter = &mz->iter[i];
1047 cmpxchg(&iter->position,
1048 dead_memcg, NULL);
1049 }
1050 }
1051 }
1052}
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
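/*
 * mem_cgroup_scan_tasks - invoke @fn on every task in @memcg's hierarchy
 * @memcg: hierarchy root (must not be root_mem_cgroup)
 * @fn: callback; a non-zero return value stops the walk
 * @arg: argument passed to @fn
 *
 * Returns the first non-zero value returned by @fn, or 0 if all tasks
 * were visited.
 */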
1067int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1068 int (*fn)(struct task_struct *, void *), void *arg)
1069{
1070 struct mem_cgroup *iter;
1071 int ret = 0;
1072
1073 BUG_ON(memcg == root_mem_cgroup);
1074
1075 for_each_mem_cgroup_tree(iter, memcg) {
1076 struct css_task_iter it;
1077 struct task_struct *task;
1078
1079 css_task_iter_start(&iter->css, 0, &it);
1080 while (!ret && (task = css_task_iter_next(&it)))
1081 ret = fn(task, arg);
1082 css_task_iter_end(&it);
1083 if (ret) {
1084 mem_cgroup_iter_break(memcg, iter);
1085 break;
1086 }
1087 }
1088 return ret;
1089}
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1101{
1102 struct mem_cgroup_per_node *mz;
1103 struct mem_cgroup *memcg;
1104 struct lruvec *lruvec;
1105
1106 if (mem_cgroup_disabled()) {
1107 lruvec = &pgdat->lruvec;
1108 goto out;
1109 }
1110
1111 memcg = page->mem_cgroup;
1112
1113
1114
1115
1116 if (!memcg)
1117 memcg = root_mem_cgroup;
1118
1119 mz = mem_cgroup_page_nodeinfo(memcg, page);
1120 lruvec = &mz->lruvec;
1121out:
1122
1123
1124
1125
1126
1127 if (unlikely(lruvec->pgdat != pgdat))
1128 lruvec->pgdat = pgdat;
1129 return lruvec;
1130}
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1144 int zid, int nr_pages)
1145{
1146 struct mem_cgroup_per_node *mz;
1147 unsigned long *lru_size;
1148 long size;
1149
1150 if (mem_cgroup_disabled())
1151 return;
1152
1153 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1154 lru_size = &mz->lru_zone_size[zid][lru];
1155
1156 if (nr_pages < 0)
1157 *lru_size += nr_pages;
1158
1159 size = *lru_size;
1160 if (WARN_ONCE(size < 0,
1161 "%s(%p, %d, %d): lru_size %ld\n",
1162 __func__, lruvec, lru, nr_pages, size)) {
1163 VM_BUG_ON(1);
1164 *lru_size = 0;
1165 }
1166
1167 if (nr_pages > 0)
1168 *lru_size += nr_pages;
1169}
1170
1171bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1172{
1173 struct mem_cgroup *task_memcg;
1174 struct task_struct *p;
1175 bool ret;
1176
1177 p = find_lock_task_mm(task);
1178 if (p) {
1179 task_memcg = get_mem_cgroup_from_mm(p->mm);
1180 task_unlock(p);
1181 } else {
1182
1183
1184
1185
1186
1187 rcu_read_lock();
1188 task_memcg = mem_cgroup_from_task(task);
1189 css_get(&task_memcg->css);
1190 rcu_read_unlock();
1191 }
1192 ret = mem_cgroup_is_descendant(task_memcg, memcg);
1193 css_put(&task_memcg->css);
1194 return ret;
1195}
1196
1197
1198
1199
1200
1201
1202
1203
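/*
 * mem_cgroup_margin - how many pages @memcg may still be charged with
 *
 * Returns the headroom below the memory limit and, when memsw accounting
 * is active, below the memory+swap limit as well, whichever is smaller.
 */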
1204static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1205{
1206 unsigned long margin = 0;
1207 unsigned long count;
1208 unsigned long limit;
1209
1210 count = page_counter_read(&memcg->memory);
1211 limit = READ_ONCE(memcg->memory.max);
1212 if (count < limit)
1213 margin = limit - count;
1214
1215 if (do_memsw_account()) {
1216 count = page_counter_read(&memcg->memsw);
1217 limit = READ_ONCE(memcg->memsw.max);
1218 if (count <= limit)
1219 margin = min(margin, limit - count);
1220 else
1221 margin = 0;
1222 }
1223
1224 return margin;
1225}
1226
1227
1228
1229
1230
1231
1232
1233
1234static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1235{
1236 struct mem_cgroup *from;
1237 struct mem_cgroup *to;
1238 bool ret = false;
1239
1240
1241
1242
1243 spin_lock(&mc.lock);
1244 from = mc.from;
1245 to = mc.to;
1246 if (!from)
1247 goto unlock;
1248
1249 ret = mem_cgroup_is_descendant(from, memcg) ||
1250 mem_cgroup_is_descendant(to, memcg);
1251unlock:
1252 spin_unlock(&mc.lock);
1253 return ret;
1254}
1255
1256static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1257{
1258 if (mc.moving_task && current != mc.moving_task) {
1259 if (mem_cgroup_under_move(memcg)) {
1260 DEFINE_WAIT(wait);
1261 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1262
1263 if (mc.moving_task)
1264 schedule();
1265 finish_wait(&mc.waitq, &wait);
1266 return true;
1267 }
1268 }
1269 return false;
1270}
1271
1272static const unsigned int memcg1_stats[] = {
1273 MEMCG_CACHE,
1274 MEMCG_RSS,
1275 MEMCG_RSS_HUGE,
1276 NR_SHMEM,
1277 NR_FILE_MAPPED,
1278 NR_FILE_DIRTY,
1279 NR_WRITEBACK,
1280 MEMCG_SWAP,
1281};
1282
1283static const char *const memcg1_stat_names[] = {
1284 "cache",
1285 "rss",
1286 "rss_huge",
1287 "shmem",
1288 "mapped_file",
1289 "dirty",
1290 "writeback",
1291 "swap",
1292};
1293
1294#define K(x) ((x) << (PAGE_SHIFT-10))
1295
1296
1297
1298
1299
1300
1301
1302
1303void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1304{
1305 struct mem_cgroup *iter;
1306 unsigned int i;
1307
1308 rcu_read_lock();
1309
1310 if (p) {
1311 pr_info("Task in ");
1312 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1313 pr_cont(" killed as a result of limit of ");
1314 } else {
1315 pr_info("Memory limit reached of cgroup ");
1316 }
1317
1318 pr_cont_cgroup_path(memcg->css.cgroup);
1319 pr_cont("\n");
1320
1321 rcu_read_unlock();
1322
1323 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1324 K((u64)page_counter_read(&memcg->memory)),
1325 K((u64)memcg->memory.max), memcg->memory.failcnt);
1326 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1327 K((u64)page_counter_read(&memcg->memsw)),
1328 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1329 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1330 K((u64)page_counter_read(&memcg->kmem)),
1331 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1332
1333 for_each_mem_cgroup_tree(iter, memcg) {
1334 pr_info("Memory cgroup stats for ");
1335 pr_cont_cgroup_path(iter->css.cgroup);
1336 pr_cont(":");
1337
1338 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1339 if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1340 continue;
1341 pr_cont(" %s:%luKB", memcg1_stat_names[i],
1342 K(memcg_page_state(iter, memcg1_stats[i])));
1343 }
1344
1345 for (i = 0; i < NR_LRU_LISTS; i++)
1346 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1347 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1348
1349 pr_cont("\n");
1350 }
1351}
1352
1353
1354
1355
1356unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1357{
1358 unsigned long max;
1359
1360 max = memcg->memory.max;
1361 if (mem_cgroup_swappiness(memcg)) {
1362 unsigned long memsw_max;
1363 unsigned long swap_max;
1364
1365 memsw_max = memcg->memsw.max;
1366 swap_max = memcg->swap.max;
1367 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1368 max = min(max + swap_max, memsw_max);
1369 }
1370 return max;
1371}
1372
1373static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1374 int order)
1375{
1376 struct oom_control oc = {
1377 .zonelist = NULL,
1378 .nodemask = NULL,
1379 .memcg = memcg,
1380 .gfp_mask = gfp_mask,
1381 .order = order,
1382 };
1383 bool ret;
1384
1385 mutex_lock(&oom_lock);
1386 ret = out_of_memory(&oc);
1387 mutex_unlock(&oom_lock);
1388 return ret;
1389}
1390
1391#if MAX_NUMNODES > 1
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1404 int nid, bool noswap)
1405{
1406 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1407 return true;
1408 if (noswap || !total_swap_pages)
1409 return false;
1410 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1411 return true;
1412 return false;
1413
1414}
1415
1416
1417
1418
1419
1420
1421
1422static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1423{
1424 int nid;
1425
1426
1427
1428
1429 if (!atomic_read(&memcg->numainfo_events))
1430 return;
1431 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1432 return;
1433
1434
1435 memcg->scan_nodes = node_states[N_MEMORY];
1436
1437 for_each_node_mask(nid, node_states[N_MEMORY]) {
1438
1439 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1440 node_clear(nid, memcg->scan_nodes);
1441 }
1442
1443 atomic_set(&memcg->numainfo_events, 0);
1444 atomic_set(&memcg->numainfo_updating, 0);
1445}
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1460{
1461 int node;
1462
1463 mem_cgroup_may_update_nodemask(memcg);
1464 node = memcg->last_scanned_node;
1465
1466 node = next_node_in(node, memcg->scan_nodes);
1467
1468
1469
1470
1471
1472 if (unlikely(node == MAX_NUMNODES))
1473 node = numa_node_id();
1474
1475 memcg->last_scanned_node = node;
1476 return node;
1477}
1478#else
1479int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1480{
1481 return 0;
1482}
1483#endif
1484
1485static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1486 pg_data_t *pgdat,
1487 gfp_t gfp_mask,
1488 unsigned long *total_scanned)
1489{
1490 struct mem_cgroup *victim = NULL;
1491 int total = 0;
1492 int loop = 0;
1493 unsigned long excess;
1494 unsigned long nr_scanned;
1495 struct mem_cgroup_reclaim_cookie reclaim = {
1496 .pgdat = pgdat,
1497 .priority = 0,
1498 };
1499
1500 excess = soft_limit_excess(root_memcg);
1501
1502 while (1) {
1503 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1504 if (!victim) {
1505 loop++;
1506 if (loop >= 2) {
1507
1508
1509
1510
1511
1512 if (!total)
1513 break;
1514
1515
1516
1517
1518
1519
1520 if (total >= (excess >> 2) ||
1521 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1522 break;
1523 }
1524 continue;
1525 }
1526 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1527 pgdat, &nr_scanned);
1528 *total_scanned += nr_scanned;
1529 if (!soft_limit_excess(root_memcg))
1530 break;
1531 }
1532 mem_cgroup_iter_break(root_memcg, victim);
1533 return total;
1534}
1535
1536#ifdef CONFIG_LOCKDEP
1537static struct lockdep_map memcg_oom_lock_dep_map = {
1538 .name = "memcg_oom_lock",
1539};
1540#endif
1541
1542static DEFINE_SPINLOCK(memcg_oom_lock);
1543
1544
1545
1546
1547
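/*
 * Try to take the hierarchy-wide OOM lock: fails if any memcg in @memcg's
 * subtree already holds it, in which case the locks taken so far are rolled
 * back. Serialized by the memcg_oom_lock spinlock.
 */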
1548static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1549{
1550 struct mem_cgroup *iter, *failed = NULL;
1551
1552 spin_lock(&memcg_oom_lock);
1553
1554 for_each_mem_cgroup_tree(iter, memcg) {
1555 if (iter->oom_lock) {
1556
1557
1558
1559
1560 failed = iter;
1561 mem_cgroup_iter_break(memcg, iter);
1562 break;
1563 } else
1564 iter->oom_lock = true;
1565 }
1566
1567 if (failed) {
1568
1569
1570
1571
1572 for_each_mem_cgroup_tree(iter, memcg) {
1573 if (iter == failed) {
1574 mem_cgroup_iter_break(memcg, iter);
1575 break;
1576 }
1577 iter->oom_lock = false;
1578 }
1579 } else
1580 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1581
1582 spin_unlock(&memcg_oom_lock);
1583
1584 return !failed;
1585}
1586
1587static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1588{
1589 struct mem_cgroup *iter;
1590
1591 spin_lock(&memcg_oom_lock);
1592 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1593 for_each_mem_cgroup_tree(iter, memcg)
1594 iter->oom_lock = false;
1595 spin_unlock(&memcg_oom_lock);
1596}
1597
1598static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1599{
1600 struct mem_cgroup *iter;
1601
1602 spin_lock(&memcg_oom_lock);
1603 for_each_mem_cgroup_tree(iter, memcg)
1604 iter->under_oom++;
1605 spin_unlock(&memcg_oom_lock);
1606}
1607
1608static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1609{
1610 struct mem_cgroup *iter;
1611
1612
1613
1614
1615
1616 spin_lock(&memcg_oom_lock);
1617 for_each_mem_cgroup_tree(iter, memcg)
1618 if (iter->under_oom > 0)
1619 iter->under_oom--;
1620 spin_unlock(&memcg_oom_lock);
1621}
1622
1623static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1624
1625struct oom_wait_info {
1626 struct mem_cgroup *memcg;
1627 wait_queue_entry_t wait;
1628};
1629
1630static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1631 unsigned mode, int sync, void *arg)
1632{
1633 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1634 struct mem_cgroup *oom_wait_memcg;
1635 struct oom_wait_info *oom_wait_info;
1636
1637 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1638 oom_wait_memcg = oom_wait_info->memcg;
1639
1640 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1641 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1642 return 0;
1643 return autoremove_wake_function(wait, mode, sync, arg);
1644}
1645
1646static void memcg_oom_recover(struct mem_cgroup *memcg)
1647{
1648
1649
1650
1651
1652
1653
1654
1655
1656 if (memcg && memcg->under_oom)
1657 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1658}
1659
1660enum oom_status {
1661 OOM_SUCCESS,
1662 OOM_FAILED,
1663 OOM_ASYNC,
1664 OOM_SKIPPED
1665};
1666
1667static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1668{
1669 if (order > PAGE_ALLOC_COSTLY_ORDER)
1670 return OOM_SKIPPED;
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690 if (memcg->oom_kill_disable) {
1691 if (!current->in_user_fault)
1692 return OOM_SKIPPED;
1693 css_get(&memcg->css);
1694 current->memcg_in_oom = memcg;
1695 current->memcg_oom_gfp_mask = mask;
1696 current->memcg_oom_order = order;
1697
1698 return OOM_ASYNC;
1699 }
1700
1701 if (mem_cgroup_out_of_memory(memcg, mask, order))
1702 return OOM_SUCCESS;
1703
1704 return OOM_FAILED;
1705}
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
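/*
 * mem_cgroup_oom_synchronize - finish a memcg OOM started by mem_cgroup_oom()
 * @handle: actually handle the OOM (wait for userspace) rather than just
 * clean up the per-task OOM state
 *
 * Meant to be called at the end of the page fault when the fault could not
 * be serviced because of a memcg OOM with oom_kill_disable set. Returns
 * true if an OOM state was pending for the current task, false otherwise.
 */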
1724bool mem_cgroup_oom_synchronize(bool handle)
1725{
1726 struct mem_cgroup *memcg = current->memcg_in_oom;
1727 struct oom_wait_info owait;
1728 bool locked;
1729
1730
1731 if (!memcg)
1732 return false;
1733
1734 if (!handle)
1735 goto cleanup;
1736
1737 owait.memcg = memcg;
1738 owait.wait.flags = 0;
1739 owait.wait.func = memcg_oom_wake_function;
1740 owait.wait.private = current;
1741 INIT_LIST_HEAD(&owait.wait.entry);
1742
1743 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1744 mem_cgroup_mark_under_oom(memcg);
1745
1746 locked = mem_cgroup_oom_trylock(memcg);
1747
1748 if (locked)
1749 mem_cgroup_oom_notify(memcg);
1750
1751 if (locked && !memcg->oom_kill_disable) {
1752 mem_cgroup_unmark_under_oom(memcg);
1753 finish_wait(&memcg_oom_waitq, &owait.wait);
1754 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1755 current->memcg_oom_order);
1756 } else {
1757 schedule();
1758 mem_cgroup_unmark_under_oom(memcg);
1759 finish_wait(&memcg_oom_waitq, &owait.wait);
1760 }
1761
1762 if (locked) {
1763 mem_cgroup_oom_unlock(memcg);
1764
1765
1766
1767
1768
1769 memcg_oom_recover(memcg);
1770 }
1771cleanup:
1772 current->memcg_in_oom = NULL;
1773 css_put(&memcg->css);
1774 return true;
1775}
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1788 struct mem_cgroup *oom_domain)
1789{
1790 struct mem_cgroup *oom_group = NULL;
1791 struct mem_cgroup *memcg;
1792
1793 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1794 return NULL;
1795
1796 if (!oom_domain)
1797 oom_domain = root_mem_cgroup;
1798
1799 rcu_read_lock();
1800
1801 memcg = mem_cgroup_from_task(victim);
1802 if (memcg == root_mem_cgroup)
1803 goto out;
1804
1805
1806
1807
1808
1809
1810 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1811 if (memcg->oom_group)
1812 oom_group = memcg;
1813
1814 if (memcg == oom_domain)
1815 break;
1816 }
1817
1818 if (oom_group)
1819 css_get(&oom_group->css);
1820out:
1821 rcu_read_unlock();
1822
1823 return oom_group;
1824}
1825
1826void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1827{
1828 pr_info("Tasks in ");
1829 pr_cont_cgroup_path(memcg->css.cgroup);
1830 pr_cont(" are going to be killed due to memory.oom.group set\n");
1831}
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
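/*
 * lock_page_memcg - stabilize @page's page->mem_cgroup binding
 *
 * Prevents the page from being moved to another cgroup while the caller
 * updates page state counters. Returns the memcg, or NULL when memcg is
 * disabled or the page is uncharged. Pair with unlock_page_memcg() or
 * __unlock_page_memcg(), which also end the RCU read-side section entered
 * here:
 *
 *	memcg = lock_page_memcg(page);
 *	... update page state counters ...
 *	unlock_page_memcg(page);
 */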
1844struct mem_cgroup *lock_page_memcg(struct page *page)
1845{
1846 struct mem_cgroup *memcg;
1847 unsigned long flags;
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860 rcu_read_lock();
1861
1862 if (mem_cgroup_disabled())
1863 return NULL;
1864again:
1865 memcg = page->mem_cgroup;
1866 if (unlikely(!memcg))
1867 return NULL;
1868
1869 if (atomic_read(&memcg->moving_account) <= 0)
1870 return memcg;
1871
1872 spin_lock_irqsave(&memcg->move_lock, flags);
1873 if (memcg != page->mem_cgroup) {
1874 spin_unlock_irqrestore(&memcg->move_lock, flags);
1875 goto again;
1876 }
1877
1878
1879
1880
1881
1882
1883 memcg->move_lock_task = current;
1884 memcg->move_lock_flags = flags;
1885
1886 return memcg;
1887}
1888EXPORT_SYMBOL(lock_page_memcg);
1889
1890
1891
1892
1893
1894
1895
1896void __unlock_page_memcg(struct mem_cgroup *memcg)
1897{
1898 if (memcg && memcg->move_lock_task == current) {
1899 unsigned long flags = memcg->move_lock_flags;
1900
1901 memcg->move_lock_task = NULL;
1902 memcg->move_lock_flags = 0;
1903
1904 spin_unlock_irqrestore(&memcg->move_lock, flags);
1905 }
1906
1907 rcu_read_unlock();
1908}
1909
1910
1911
1912
1913
1914void unlock_page_memcg(struct page *page)
1915{
1916 __unlock_page_memcg(page->mem_cgroup);
1917}
1918EXPORT_SYMBOL(unlock_page_memcg);
1919
1920struct memcg_stock_pcp {
1921 struct mem_cgroup *cached;
1922 unsigned int nr_pages;
1923 struct work_struct work;
1924 unsigned long flags;
1925#define FLUSHING_CACHED_CHARGE 0
1926};
1927static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1928static DEFINE_MUTEX(percpu_charge_mutex);
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
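/*
 * consume_stock: try to satisfy a charge from the local per-cpu stock.
 * @memcg: memcg to charge
 * @nr_pages: number of pages, must not exceed MEMCG_CHARGE_BATCH
 *
 * Returns true if @nr_pages could be taken from the cached precharge,
 * in which case no page counters need to be touched.
 */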
1941static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1942{
1943 struct memcg_stock_pcp *stock;
1944 unsigned long flags;
1945 bool ret = false;
1946
1947 if (nr_pages > MEMCG_CHARGE_BATCH)
1948 return ret;
1949
1950 local_irq_save(flags);
1951
1952 stock = this_cpu_ptr(&memcg_stock);
1953 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1954 stock->nr_pages -= nr_pages;
1955 ret = true;
1956 }
1957
1958 local_irq_restore(flags);
1959
1960 return ret;
1961}
1962
1963
1964
1965
1966static void drain_stock(struct memcg_stock_pcp *stock)
1967{
1968 struct mem_cgroup *old = stock->cached;
1969
1970 if (stock->nr_pages) {
1971 page_counter_uncharge(&old->memory, stock->nr_pages);
1972 if (do_memsw_account())
1973 page_counter_uncharge(&old->memsw, stock->nr_pages);
1974 css_put_many(&old->css, stock->nr_pages);
1975 stock->nr_pages = 0;
1976 }
1977 stock->cached = NULL;
1978}
1979
1980static void drain_local_stock(struct work_struct *dummy)
1981{
1982 struct memcg_stock_pcp *stock;
1983 unsigned long flags;
1984
1985
1986
1987
1988
1989 local_irq_save(flags);
1990
1991 stock = this_cpu_ptr(&memcg_stock);
1992 drain_stock(stock);
1993 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1994
1995 local_irq_restore(flags);
1996}
1997
1998
1999
2000
2001
2002static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2003{
2004 struct memcg_stock_pcp *stock;
2005 unsigned long flags;
2006
2007 local_irq_save(flags);
2008
2009 stock = this_cpu_ptr(&memcg_stock);
2010 if (stock->cached != memcg) {
2011 drain_stock(stock);
2012 stock->cached = memcg;
2013 }
2014 stock->nr_pages += nr_pages;
2015
2016 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2017 drain_stock(stock);
2018
2019 local_irq_restore(flags);
2020}
2021
2022
2023
2024
2025
2026static void drain_all_stock(struct mem_cgroup *root_memcg)
2027{
2028 int cpu, curcpu;
2029
2030
2031 if (!mutex_trylock(&percpu_charge_mutex))
2032 return;
2033
2034
2035
2036
2037
2038
2039 curcpu = get_cpu();
2040 for_each_online_cpu(cpu) {
2041 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2042 struct mem_cgroup *memcg;
2043
2044 memcg = stock->cached;
2045 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2046 continue;
2047 if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2048 css_put(&memcg->css);
2049 continue;
2050 }
2051 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2052 if (cpu == curcpu)
2053 drain_local_stock(&stock->work);
2054 else
2055 schedule_work_on(cpu, &stock->work);
2056 }
2057 css_put(&memcg->css);
2058 }
2059 put_cpu();
2060 mutex_unlock(&percpu_charge_mutex);
2061}
2062
2063static int memcg_hotplug_cpu_dead(unsigned int cpu)
2064{
2065 struct memcg_stock_pcp *stock;
2066 struct mem_cgroup *memcg;
2067
2068 stock = &per_cpu(memcg_stock, cpu);
2069 drain_stock(stock);
2070
2071 for_each_mem_cgroup(memcg) {
2072 int i;
2073
2074 for (i = 0; i < MEMCG_NR_STAT; i++) {
2075 int nid;
2076 long x;
2077
2078 x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2079 if (x)
2080 atomic_long_add(x, &memcg->stat[i]);
2081
2082 if (i >= NR_VM_NODE_STAT_ITEMS)
2083 continue;
2084
2085 for_each_node(nid) {
2086 struct mem_cgroup_per_node *pn;
2087
2088 pn = mem_cgroup_nodeinfo(memcg, nid);
2089 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2090 if (x)
2091 atomic_long_add(x, &pn->lruvec_stat[i]);
2092 }
2093 }
2094
2095 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2096 long x;
2097
2098 x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2099 if (x)
2100 atomic_long_add(x, &memcg->events[i]);
2101 }
2102 }
2103
2104 return 0;
2105}
2106
2107static void reclaim_high(struct mem_cgroup *memcg,
2108 unsigned int nr_pages,
2109 gfp_t gfp_mask)
2110{
2111 do {
2112 if (page_counter_read(&memcg->memory) <= memcg->high)
2113 continue;
2114 memcg_memory_event(memcg, MEMCG_HIGH);
2115 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2116 } while ((memcg = parent_mem_cgroup(memcg)));
2117}
2118
2119static void high_work_func(struct work_struct *work)
2120{
2121 struct mem_cgroup *memcg;
2122
2123 memcg = container_of(work, struct mem_cgroup, high_work);
2124 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2125}
2126
2127
2128
2129
2130
2131void mem_cgroup_handle_over_high(void)
2132{
2133 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2134 struct mem_cgroup *memcg;
2135
2136 if (likely(!nr_pages))
2137 return;
2138
2139 memcg = get_mem_cgroup_from_mm(current->mm);
2140 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2141 css_put(&memcg->css);
2142 current->memcg_nr_pages_over_high = 0;
2143}
2144
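/*
 * try_charge - charge @nr_pages to @memcg, reclaiming and retrying as
 * needed. Charges are made in batches of MEMCG_CHARGE_BATCH pages and the
 * surplus is parked in the per-cpu stock; if the high limit is exceeded,
 * reclaim is deferred to the return-to-userspace path. Returns 0 on
 * success, -ENOMEM if the charge fails and __GFP_NOFAIL is not set.
 */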
2145static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2146 unsigned int nr_pages)
2147{
2148 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2149 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2150 struct mem_cgroup *mem_over_limit;
2151 struct page_counter *counter;
2152 unsigned long nr_reclaimed;
2153 bool may_swap = true;
2154 bool drained = false;
2155 bool oomed = false;
2156 enum oom_status oom_status;
2157
2158 if (mem_cgroup_is_root(memcg))
2159 return 0;
2160retry:
2161 if (consume_stock(memcg, nr_pages))
2162 return 0;
2163
2164 if (!do_memsw_account() ||
2165 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2166 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2167 goto done_restock;
2168 if (do_memsw_account())
2169 page_counter_uncharge(&memcg->memsw, batch);
2170 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2171 } else {
2172 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2173 may_swap = false;
2174 }
2175
2176 if (batch > nr_pages) {
2177 batch = nr_pages;
2178 goto retry;
2179 }
2180
2181
2182
2183
2184
2185
2186
2187 if (unlikely(tsk_is_oom_victim(current) ||
2188 fatal_signal_pending(current) ||
2189 current->flags & PF_EXITING))
2190 goto force;
2191
2192
2193
2194
2195
2196
2197
2198 if (unlikely(current->flags & PF_MEMALLOC))
2199 goto force;
2200
2201 if (unlikely(task_in_memcg_oom(current)))
2202 goto nomem;
2203
2204 if (!gfpflags_allow_blocking(gfp_mask))
2205 goto nomem;
2206
2207 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2208
2209 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2210 gfp_mask, may_swap);
2211
2212 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2213 goto retry;
2214
2215 if (!drained) {
2216 drain_all_stock(mem_over_limit);
2217 drained = true;
2218 goto retry;
2219 }
2220
2221 if (gfp_mask & __GFP_NORETRY)
2222 goto nomem;
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2233 goto retry;
2234
2235
2236
2237
2238 if (mem_cgroup_wait_acct_move(mem_over_limit))
2239 goto retry;
2240
2241 if (nr_retries--)
2242 goto retry;
2243
2244 if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2245 goto nomem;
2246
2247 if (gfp_mask & __GFP_NOFAIL)
2248 goto force;
2249
2250 if (fatal_signal_pending(current))
2251 goto force;
2252
2253 memcg_memory_event(mem_over_limit, MEMCG_OOM);
2254
2255
2256
2257
2258
2259
2260 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2261 get_order(nr_pages * PAGE_SIZE));
2262 switch (oom_status) {
2263 case OOM_SUCCESS:
2264 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2265 oomed = true;
2266 goto retry;
2267 case OOM_FAILED:
2268 goto force;
2269 default:
2270 goto nomem;
2271 }
2272nomem:
2273 if (!(gfp_mask & __GFP_NOFAIL))
2274 return -ENOMEM;
2275force:
2276
2277
2278
2279
2280
2281 page_counter_charge(&memcg->memory, nr_pages);
2282 if (do_memsw_account())
2283 page_counter_charge(&memcg->memsw, nr_pages);
2284 css_get_many(&memcg->css, nr_pages);
2285
2286 return 0;
2287
2288done_restock:
2289 css_get_many(&memcg->css, batch);
2290 if (batch > nr_pages)
2291 refill_stock(memcg, batch - nr_pages);
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302 do {
2303 if (page_counter_read(&memcg->memory) > memcg->high) {
2304
2305 if (in_interrupt()) {
2306 schedule_work(&memcg->high_work);
2307 break;
2308 }
2309 current->memcg_nr_pages_over_high += batch;
2310 set_notify_resume(current);
2311 break;
2312 }
2313 } while ((memcg = parent_mem_cgroup(memcg)));
2314
2315 return 0;
2316}
2317
2318static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2319{
2320 if (mem_cgroup_is_root(memcg))
2321 return;
2322
2323 page_counter_uncharge(&memcg->memory, nr_pages);
2324 if (do_memsw_account())
2325 page_counter_uncharge(&memcg->memsw, nr_pages);
2326
2327 css_put_many(&memcg->css, nr_pages);
2328}
2329
2330static void lock_page_lru(struct page *page, int *isolated)
2331{
2332 struct zone *zone = page_zone(page);
2333
2334 spin_lock_irq(zone_lru_lock(zone));
2335 if (PageLRU(page)) {
2336 struct lruvec *lruvec;
2337
2338 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2339 ClearPageLRU(page);
2340 del_page_from_lru_list(page, lruvec, page_lru(page));
2341 *isolated = 1;
2342 } else
2343 *isolated = 0;
2344}
2345
2346static void unlock_page_lru(struct page *page, int isolated)
2347{
2348 struct zone *zone = page_zone(page);
2349
2350 if (isolated) {
2351 struct lruvec *lruvec;
2352
2353 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2354 VM_BUG_ON_PAGE(PageLRU(page), page);
2355 SetPageLRU(page);
2356 add_page_to_lru_list(page, lruvec, page_lru(page));
2357 }
2358 spin_unlock_irq(zone_lru_lock(zone));
2359}
2360
2361static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2362 bool lrucare)
2363{
2364 int isolated;
2365
2366 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2367
2368
2369
2370
2371
2372 if (lrucare)
2373 lock_page_lru(page, &isolated);
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389 page->mem_cgroup = memcg;
2390
2391 if (lrucare)
2392 unlock_page_lru(page, isolated);
2393}
2394
2395#ifdef CONFIG_MEMCG_KMEM
2396static int memcg_alloc_cache_id(void)
2397{
2398 int id, size;
2399 int err;
2400
2401 id = ida_simple_get(&memcg_cache_ida,
2402 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2403 if (id < 0)
2404 return id;
2405
2406 if (id < memcg_nr_cache_ids)
2407 return id;
2408
2409
2410
2411
2412
2413 down_write(&memcg_cache_ids_sem);
2414
2415 size = 2 * (id + 1);
2416 if (size < MEMCG_CACHES_MIN_SIZE)
2417 size = MEMCG_CACHES_MIN_SIZE;
2418 else if (size > MEMCG_CACHES_MAX_SIZE)
2419 size = MEMCG_CACHES_MAX_SIZE;
2420
2421 err = memcg_update_all_caches(size);
2422 if (!err)
2423 err = memcg_update_all_list_lrus(size);
2424 if (!err)
2425 memcg_nr_cache_ids = size;
2426
2427 up_write(&memcg_cache_ids_sem);
2428
2429 if (err) {
2430 ida_simple_remove(&memcg_cache_ida, id);
2431 return err;
2432 }
2433 return id;
2434}
2435
2436static void memcg_free_cache_id(int id)
2437{
2438 ida_simple_remove(&memcg_cache_ida, id);
2439}
2440
2441struct memcg_kmem_cache_create_work {
2442 struct mem_cgroup *memcg;
2443 struct kmem_cache *cachep;
2444 struct work_struct work;
2445};
2446
2447static void memcg_kmem_cache_create_func(struct work_struct *w)
2448{
2449 struct memcg_kmem_cache_create_work *cw =
2450 container_of(w, struct memcg_kmem_cache_create_work, work);
2451 struct mem_cgroup *memcg = cw->memcg;
2452 struct kmem_cache *cachep = cw->cachep;
2453
2454 memcg_create_kmem_cache(memcg, cachep);
2455
2456 css_put(&memcg->css);
2457 kfree(cw);
2458}
2459
2460
2461
2462
2463static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2464 struct kmem_cache *cachep)
2465{
2466 struct memcg_kmem_cache_create_work *cw;
2467
2468 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2469 if (!cw)
2470 return;
2471
2472 css_get(&memcg->css);
2473
2474 cw->memcg = memcg;
2475 cw->cachep = cachep;
2476 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2477
2478 queue_work(memcg_kmem_cache_wq, &cw->work);
2479}
2480
2481static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2482 struct kmem_cache *cachep)
2483{
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495 current->memcg_kmem_skip_account = 1;
2496 __memcg_schedule_kmem_cache_create(memcg, cachep);
2497 current->memcg_kmem_skip_account = 0;
2498}
2499
2500static inline bool memcg_kmem_bypass(void)
2501{
2502 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2503 return true;
2504 return false;
2505}
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
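/*
 * memcg_kmem_get_cache - pick the memcg-specific variant of a kmem cache
 * @cachep: the root (global) cache
 *
 * Returns the current memcg's child cache if it already exists; otherwise
 * its creation is scheduled asynchronously and the root cache is used for
 * this allocation. When a per-memcg cache is returned, a css reference is
 * held on its memcg; callers release it with memcg_kmem_put_cache().
 */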
2523struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2524{
2525 struct mem_cgroup *memcg;
2526 struct kmem_cache *memcg_cachep;
2527 int kmemcg_id;
2528
2529 VM_BUG_ON(!is_root_cache(cachep));
2530
2531 if (memcg_kmem_bypass())
2532 return cachep;
2533
2534 if (current->memcg_kmem_skip_account)
2535 return cachep;
2536
2537 memcg = get_mem_cgroup_from_current();
2538 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2539 if (kmemcg_id < 0)
2540 goto out;
2541
2542 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2543 if (likely(memcg_cachep))
2544 return memcg_cachep;
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558 memcg_schedule_kmem_cache_create(memcg, cachep);
2559out:
2560 css_put(&memcg->css);
2561 return cachep;
2562}
2563
2564
2565
2566
2567
2568void memcg_kmem_put_cache(struct kmem_cache *cachep)
2569{
2570 if (!is_root_cache(cachep))
2571 css_put(&cachep->memcg_params.memcg->css);
2572}
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2584 struct mem_cgroup *memcg)
2585{
2586 unsigned int nr_pages = 1 << order;
2587 struct page_counter *counter;
2588 int ret;
2589
2590 ret = try_charge(memcg, gfp, nr_pages);
2591 if (ret)
2592 return ret;
2593
2594 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2595 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2596 cancel_charge(memcg, nr_pages);
2597 return -ENOMEM;
2598 }
2599
2600 page->mem_cgroup = memcg;
2601
2602 return 0;
2603}
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2614{
2615 struct mem_cgroup *memcg;
2616 int ret = 0;
2617
2618 if (memcg_kmem_bypass())
2619 return 0;
2620
2621 memcg = get_mem_cgroup_from_current();
2622 if (!mem_cgroup_is_root(memcg)) {
2623 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2624 if (!ret)
2625 __SetPageKmemcg(page);
2626 }
2627 css_put(&memcg->css);
2628 return ret;
2629}
2630
2631
2632
2633
2634
2635void memcg_kmem_uncharge(struct page *page, int order)
2636{
2637 struct mem_cgroup *memcg = page->mem_cgroup;
2638 unsigned int nr_pages = 1 << order;
2639
2640 if (!memcg)
2641 return;
2642
2643 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2644
2645 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2646 page_counter_uncharge(&memcg->kmem, nr_pages);
2647
2648 page_counter_uncharge(&memcg->memory, nr_pages);
2649 if (do_memsw_account())
2650 page_counter_uncharge(&memcg->memsw, nr_pages);
2651
2652 page->mem_cgroup = NULL;
2653
2654
2655 if (PageKmemcg(page))
2656 __ClearPageKmemcg(page);
2657
2658 css_put_many(&memcg->css, nr_pages);
2659}
2660#endif
2661
2662#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2663
2664
2665
2666
2667
2668void mem_cgroup_split_huge_fixup(struct page *head)
2669{
2670 int i;
2671
2672 if (mem_cgroup_disabled())
2673 return;
2674
2675 for (i = 1; i < HPAGE_PMD_NR; i++)
2676 head[i].mem_cgroup = head->mem_cgroup;
2677
2678 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2679}
2680#endif
2681
2682#ifdef CONFIG_MEMCG_SWAP
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697static int mem_cgroup_move_swap_account(swp_entry_t entry,
2698 struct mem_cgroup *from, struct mem_cgroup *to)
2699{
2700 unsigned short old_id, new_id;
2701
2702 old_id = mem_cgroup_id(from);
2703 new_id = mem_cgroup_id(to);
2704
2705 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2706 mod_memcg_state(from, MEMCG_SWAP, -1);
2707 mod_memcg_state(to, MEMCG_SWAP, 1);
2708 return 0;
2709 }
2710 return -EINVAL;
2711}
2712#else
2713static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2714 struct mem_cgroup *from, struct mem_cgroup *to)
2715{
2716 return -EINVAL;
2717}
2718#endif
2719
2720static DEFINE_MUTEX(memcg_max_mutex);
2721
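/*
 * Update the memory limit (or, with @memsw, the memory+swap limit) to @max
 * while preserving memsw.max >= memory.max. Reclaims from @memcg until
 * usage fits below the new limit; gives up with -EBUSY when reclaim makes
 * no progress, or -EINTR on a pending signal.
 */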
2722static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2723 unsigned long max, bool memsw)
2724{
2725 bool enlarge = false;
2726 bool drained = false;
2727 int ret;
2728 bool limits_invariant;
2729 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2730
2731 do {
2732 if (signal_pending(current)) {
2733 ret = -EINTR;
2734 break;
2735 }
2736
2737 mutex_lock(&memcg_max_mutex);
2738
2739
2740
2741
2742 limits_invariant = memsw ? max >= memcg->memory.max :
2743 max <= memcg->memsw.max;
2744 if (!limits_invariant) {
2745 mutex_unlock(&memcg_max_mutex);
2746 ret = -EINVAL;
2747 break;
2748 }
2749 if (max > counter->max)
2750 enlarge = true;
2751 ret = page_counter_set_max(counter, max);
2752 mutex_unlock(&memcg_max_mutex);
2753
2754 if (!ret)
2755 break;
2756
2757 if (!drained) {
2758 drain_all_stock(memcg);
2759 drained = true;
2760 continue;
2761 }
2762
2763 if (!try_to_free_mem_cgroup_pages(memcg, 1,
2764 GFP_KERNEL, !memsw)) {
2765 ret = -EBUSY;
2766 break;
2767 }
2768 } while (true);
2769
2770 if (!ret && enlarge)
2771 memcg_oom_recover(memcg);
2772
2773 return ret;
2774}
2775
2776unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2777 gfp_t gfp_mask,
2778 unsigned long *total_scanned)
2779{
2780 unsigned long nr_reclaimed = 0;
2781 struct mem_cgroup_per_node *mz, *next_mz = NULL;
2782 unsigned long reclaimed;
2783 int loop = 0;
2784 struct mem_cgroup_tree_per_node *mctz;
2785 unsigned long excess;
2786 unsigned long nr_scanned;
2787
2788 if (order > 0)
2789 return 0;
2790
2791 mctz = soft_limit_tree_node(pgdat->node_id);
2792
2793
2794
2795
2796
2797
2798 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
2799 return 0;
2800
2801
2802
2803
2804
2805
2806 do {
2807 if (next_mz)
2808 mz = next_mz;
2809 else
2810 mz = mem_cgroup_largest_soft_limit_node(mctz);
2811 if (!mz)
2812 break;
2813
2814 nr_scanned = 0;
2815 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2816 gfp_mask, &nr_scanned);
2817 nr_reclaimed += reclaimed;
2818 *total_scanned += nr_scanned;
2819 spin_lock_irq(&mctz->lock);
2820 __mem_cgroup_remove_exceeded(mz, mctz);
2821
2822
2823
2824
2825
2826 next_mz = NULL;
2827 if (!reclaimed)
2828 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2829
2830 excess = soft_limit_excess(mz->memcg);
2831
2832
2833
2834
2835
2836
2837
2838
2839
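		/*
		 * Put the cgroup back on the tree with its current excess
		 * (no tree operation happens when the excess is zero); a
		 * reclaim result of zero does not necessarily mean it is
		 * back under its soft limit.
		 */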
2840 __mem_cgroup_insert_exceeded(mz, mctz, excess);
2841 spin_unlock_irq(&mctz->lock);
2842 css_put(&mz->memcg->css);
2843 loop++;
2844
2845
2846
2847
2848
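		/*
		 * Give up if nothing was reclaimed and either the tree has
		 * no further candidates or we have looped long enough.
		 */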
2849 if (!nr_reclaimed &&
2850 (next_mz == NULL ||
2851 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2852 break;
2853 } while (!nr_reclaimed);
2854 if (next_mz)
2855 css_put(&next_mz->memcg->css);
2856 return nr_reclaimed;
2857}
2858
/*
 * Test whether @memcg has any children in the cgroup hierarchy.  This
 * walks the css tree directly and does not consider whether
 * use_hierarchy is enabled; that is the caller's concern.
 */
2865static inline bool memcg_has_children(struct mem_cgroup *memcg)
2866{
2867 bool ret;
2868
2869 rcu_read_lock();
2870 ret = css_next_child(NULL, &memcg->css);
2871 rcu_read_unlock();
2872 return ret;
2873}
2874
/*
 * Reclaim as many pages charged to @memcg as possible.  Gives up after
 * MEM_CGROUP_RECLAIM_RETRIES rounds without progress, or earlier if the
 * caller has a signal pending.
 */
2880static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2881{
2882 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2883
2884
2885 lru_add_drain_all();
2886
2887 drain_all_stock(memcg);
2888
2889
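	/* Try to free every page charged to this cgroup. */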
2890 while (nr_retries && page_counter_read(&memcg->memory)) {
2891 int progress;
2892
2893 if (signal_pending(current))
2894 return -EINTR;
2895
2896 progress = try_to_free_mem_cgroup_pages(memcg, 1,
2897 GFP_KERNEL, true);
2898 if (!progress) {
2899 nr_retries--;
2900
2901 congestion_wait(BLK_RW_ASYNC, HZ/10);
2902 }
2903
2904 }
2905
2906 return 0;
2907}
2908
2909static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2910 char *buf, size_t nbytes,
2911 loff_t off)
2912{
2913 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2914
2915 if (mem_cgroup_is_root(memcg))
2916 return -EINVAL;
2917 return mem_cgroup_force_empty(memcg) ?: nbytes;
2918}
2919
2920static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2921 struct cftype *cft)
2922{
2923 return mem_cgroup_from_css(css)->use_hierarchy;
2924}
2925
2926static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2927 struct cftype *cft, u64 val)
2928{
2929 int retval = 0;
2930 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2931 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2932
2933 if (memcg->use_hierarchy == val)
2934 return 0;
2935
2936
2937
2938
2939
2940
2941
2942
2943
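	/*
	 * use_hierarchy can only be changed if the parent does not force
	 * hierarchy on us and we have no children yet; otherwise existing
	 * accounting in the subtree would become inconsistent.
	 */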
2944 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2945 (val == 1 || val == 0)) {
2946 if (!memcg_has_children(memcg))
2947 memcg->use_hierarchy = val;
2948 else
2949 retval = -EBUSY;
2950 } else
2951 retval = -EINVAL;
2952
2953 return retval;
2954}
2955
2956struct accumulated_stats {
2957 unsigned long stat[MEMCG_NR_STAT];
2958 unsigned long events[NR_VM_EVENT_ITEMS];
2959 unsigned long lru_pages[NR_LRU_LISTS];
2960 const unsigned int *stats_array;
2961 const unsigned int *events_array;
2962 int stats_size;
2963 int events_size;
2964};
2965
2966static void accumulate_memcg_tree(struct mem_cgroup *memcg,
2967 struct accumulated_stats *acc)
2968{
2969 struct mem_cgroup *mi;
2970 int i;
2971
2972 for_each_mem_cgroup_tree(mi, memcg) {
2973 for (i = 0; i < acc->stats_size; i++)
2974 acc->stat[i] += memcg_page_state(mi,
2975 acc->stats_array ? acc->stats_array[i] : i);
2976
2977 for (i = 0; i < acc->events_size; i++)
2978 acc->events[i] += memcg_sum_events(mi,
2979 acc->events_array ? acc->events_array[i] : i);
2980
2981 for (i = 0; i < NR_LRU_LISTS; i++)
2982 acc->lru_pages[i] +=
2983 mem_cgroup_nr_lru_pages(mi, BIT(i));
2984 }
2985}
2986
2987static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2988{
2989 unsigned long val = 0;
2990
2991 if (mem_cgroup_is_root(memcg)) {
2992 struct mem_cgroup *iter;
2993
2994 for_each_mem_cgroup_tree(iter, memcg) {
2995 val += memcg_page_state(iter, MEMCG_CACHE);
2996 val += memcg_page_state(iter, MEMCG_RSS);
2997 if (swap)
2998 val += memcg_page_state(iter, MEMCG_SWAP);
2999 }
3000 } else {
3001 if (!swap)
3002 val = page_counter_read(&memcg->memory);
3003 else
3004 val = page_counter_read(&memcg->memsw);
3005 }
3006 return val;
3007}
3008
3009enum {
3010 RES_USAGE,
3011 RES_LIMIT,
3012 RES_MAX_USAGE,
3013 RES_FAILCNT,
3014 RES_SOFT_LIMIT,
3015};
3016
3017static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3018 struct cftype *cft)
3019{
3020 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3021 struct page_counter *counter;
3022
3023 switch (MEMFILE_TYPE(cft->private)) {
3024 case _MEM:
3025 counter = &memcg->memory;
3026 break;
3027 case _MEMSWAP:
3028 counter = &memcg->memsw;
3029 break;
3030 case _KMEM:
3031 counter = &memcg->kmem;
3032 break;
3033 case _TCP:
3034 counter = &memcg->tcpmem;
3035 break;
3036 default:
3037 BUG();
3038 }
3039
3040 switch (MEMFILE_ATTR(cft->private)) {
3041 case RES_USAGE:
3042 if (counter == &memcg->memory)
3043 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3044 if (counter == &memcg->memsw)
3045 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3046 return (u64)page_counter_read(counter) * PAGE_SIZE;
3047 case RES_LIMIT:
3048 return (u64)counter->max * PAGE_SIZE;
3049 case RES_MAX_USAGE:
3050 return (u64)counter->watermark * PAGE_SIZE;
3051 case RES_FAILCNT:
3052 return counter->failcnt;
3053 case RES_SOFT_LIMIT:
3054 return (u64)memcg->soft_limit * PAGE_SIZE;
3055 default:
3056 BUG();
3057 }
3058}
3059
3060#ifdef CONFIG_MEMCG_KMEM
3061static int memcg_online_kmem(struct mem_cgroup *memcg)
3062{
3063 int memcg_id;
3064
3065 if (cgroup_memory_nokmem)
3066 return 0;
3067
3068 BUG_ON(memcg->kmemcg_id >= 0);
3069 BUG_ON(memcg->kmem_state);
3070
3071 memcg_id = memcg_alloc_cache_id();
3072 if (memcg_id < 0)
3073 return memcg_id;
3074
3075 static_branch_inc(&memcg_kmem_enabled_key);
3076
3077
3078
3079
3080
3081
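	/*
	 * Publish kmemcg_id only after the static branch above has been
	 * enabled, so kmem accounting for this cgroup cannot start before
	 * all call sites are patched.
	 */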
3082 memcg->kmemcg_id = memcg_id;
3083 memcg->kmem_state = KMEM_ONLINE;
3084 INIT_LIST_HEAD(&memcg->kmem_caches);
3085
3086 return 0;
3087}
3088
3089static void memcg_offline_kmem(struct mem_cgroup *memcg)
3090{
3091 struct cgroup_subsys_state *css;
3092 struct mem_cgroup *parent, *child;
3093 int kmemcg_id;
3094
3095 if (memcg->kmem_state != KMEM_ONLINE)
3096 return;
3097
3098
3099
3100
3101
3102
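	/*
	 * Mark the cgroup offline for kmem before deactivating its caches
	 * so that no new per-memcg cache is created for it afterwards.
	 */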
3103 memcg->kmem_state = KMEM_ALLOCATED;
3104
3105 memcg_deactivate_kmem_caches(memcg);
3106
3107 kmemcg_id = memcg->kmemcg_id;
3108 BUG_ON(kmemcg_id < 0);
3109
3110 parent = parent_mem_cgroup(memcg);
3111 if (!parent)
3112 parent = root_mem_cgroup;
3113
3114
3115
3116
3117
3118
3119
3120
3121
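	/*
	 * Repoint this cgroup's (and, with use_hierarchy, its descendants')
	 * kmemcg_id at the parent and then drain the corresponding
	 * list_lrus into the parent, so the old id can be safely recycled.
	 */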
3122 rcu_read_lock();
3123 css_for_each_descendant_pre(css, &memcg->css) {
3124 child = mem_cgroup_from_css(css);
3125 BUG_ON(child->kmemcg_id != kmemcg_id);
3126 child->kmemcg_id = parent->kmemcg_id;
3127 if (!memcg->use_hierarchy)
3128 break;
3129 }
3130 rcu_read_unlock();
3131
3132 memcg_drain_all_list_lrus(kmemcg_id, parent);
3133
3134 memcg_free_cache_id(kmemcg_id);
3135}
3136
3137static void memcg_free_kmem(struct mem_cgroup *memcg)
3138{
3139
3140 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3141 memcg_offline_kmem(memcg);
3142
3143 if (memcg->kmem_state == KMEM_ALLOCATED) {
3144 memcg_destroy_kmem_caches(memcg);
3145 static_branch_dec(&memcg_kmem_enabled_key);
3146 WARN_ON(page_counter_read(&memcg->kmem));
3147 }
3148}
3149#else
3150static int memcg_online_kmem(struct mem_cgroup *memcg)
3151{
3152 return 0;
3153}
3154static void memcg_offline_kmem(struct mem_cgroup *memcg)
3155{
3156}
3157static void memcg_free_kmem(struct mem_cgroup *memcg)
3158{
3159}
3160#endif
3161
3162static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3163 unsigned long max)
3164{
3165 int ret;
3166
3167 mutex_lock(&memcg_max_mutex);
3168 ret = page_counter_set_max(&memcg->kmem, max);
3169 mutex_unlock(&memcg_max_mutex);
3170 return ret;
3171}
3172
3173static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3174{
3175 int ret;
3176
3177 mutex_lock(&memcg_max_mutex);
3178
3179 ret = page_counter_set_max(&memcg->tcpmem, max);
3180 if (ret)
3181 goto out;
3182
3183 if (!memcg->tcpmem_active) {
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
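		/*
		 * Setting a limit for the first time activates TCP memory
		 * accounting for this cgroup; the static key is enabled
		 * before tcpmem_active is published so that anyone who
		 * observes the active flag also sees the key enabled.
		 */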
3200 static_branch_inc(&memcg_sockets_enabled_key);
3201 memcg->tcpmem_active = true;
3202 }
3203out:
3204 mutex_unlock(&memcg_max_mutex);
3205 return ret;
3206}
3207
3208
3209
3210
3211
3212static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3213 char *buf, size_t nbytes, loff_t off)
3214{
3215 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3216 unsigned long nr_pages;
3217 int ret;
3218
3219 buf = strstrip(buf);
3220 ret = page_counter_memparse(buf, "-1", &nr_pages);
3221 if (ret)
3222 return ret;
3223
3224 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3225 case RES_LIMIT:
3226 if (mem_cgroup_is_root(memcg)) {
3227 ret = -EINVAL;
3228 break;
3229 }
3230 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3231 case _MEM:
3232 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3233 break;
3234 case _MEMSWAP:
3235 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3236 break;
3237 case _KMEM:
3238 ret = memcg_update_kmem_max(memcg, nr_pages);
3239 break;
3240 case _TCP:
3241 ret = memcg_update_tcp_max(memcg, nr_pages);
3242 break;
3243 }
3244 break;
3245 case RES_SOFT_LIMIT:
3246 memcg->soft_limit = nr_pages;
3247 ret = 0;
3248 break;
3249 }
3250 return ret ?: nbytes;
3251}
3252
3253static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3254 size_t nbytes, loff_t off)
3255{
3256 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3257 struct page_counter *counter;
3258
3259 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3260 case _MEM:
3261 counter = &memcg->memory;
3262 break;
3263 case _MEMSWAP:
3264 counter = &memcg->memsw;
3265 break;
3266 case _KMEM:
3267 counter = &memcg->kmem;
3268 break;
3269 case _TCP:
3270 counter = &memcg->tcpmem;
3271 break;
3272 default:
3273 BUG();
3274 }
3275
3276 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3277 case RES_MAX_USAGE:
3278 page_counter_reset_watermark(counter);
3279 break;
3280 case RES_FAILCNT:
3281 counter->failcnt = 0;
3282 break;
3283 default:
3284 BUG();
3285 }
3286
3287 return nbytes;
3288}
3289
3290static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3291 struct cftype *cft)
3292{
3293 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3294}
3295
3296#ifdef CONFIG_MMU
3297static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3298 struct cftype *cft, u64 val)
3299{
3300 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3301
3302 if (val & ~MOVE_MASK)
3303 return -EINVAL;
3304
3305
3306
3307
3308
3309
3310
3311 memcg->move_charge_at_immigrate = val;
3312 return 0;
3313}
3314#else
3315static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3316 struct cftype *cft, u64 val)
3317{
3318 return -ENOSYS;
3319}
3320#endif
3321
3322#ifdef CONFIG_NUMA
3323static int memcg_numa_stat_show(struct seq_file *m, void *v)
3324{
3325 struct numa_stat {
3326 const char *name;
3327 unsigned int lru_mask;
3328 };
3329
3330 static const struct numa_stat stats[] = {
3331 { "total", LRU_ALL },
3332 { "file", LRU_ALL_FILE },
3333 { "anon", LRU_ALL_ANON },
3334 { "unevictable", BIT(LRU_UNEVICTABLE) },
3335 };
3336 const struct numa_stat *stat;
3337 int nid;
3338 unsigned long nr;
3339 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3340
3341 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3342 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3343 seq_printf(m, "%s=%lu", stat->name, nr);
3344 for_each_node_state(nid, N_MEMORY) {
3345 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3346 stat->lru_mask);
3347 seq_printf(m, " N%d=%lu", nid, nr);
3348 }
3349 seq_putc(m, '\n');
3350 }
3351
3352 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3353 struct mem_cgroup *iter;
3354
3355 nr = 0;
3356 for_each_mem_cgroup_tree(iter, memcg)
3357 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3358 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3359 for_each_node_state(nid, N_MEMORY) {
3360 nr = 0;
3361 for_each_mem_cgroup_tree(iter, memcg)
3362 nr += mem_cgroup_node_nr_lru_pages(
3363 iter, nid, stat->lru_mask);
3364 seq_printf(m, " N%d=%lu", nid, nr);
3365 }
3366 seq_putc(m, '\n');
3367 }
3368
3369 return 0;
3370}
3371#endif
3372
3373
3374static const unsigned int memcg1_events[] = {
3375 PGPGIN,
3376 PGPGOUT,
3377 PGFAULT,
3378 PGMAJFAULT,
3379};
3380
3381static const char *const memcg1_event_names[] = {
3382 "pgpgin",
3383 "pgpgout",
3384 "pgfault",
3385 "pgmajfault",
3386};
3387
3388static int memcg_stat_show(struct seq_file *m, void *v)
3389{
3390 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3391 unsigned long memory, memsw;
3392 struct mem_cgroup *mi;
3393 unsigned int i;
3394 struct accumulated_stats acc;
3395
3396 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3397 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3398
3399 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3400 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3401 continue;
3402 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3403 memcg_page_state(memcg, memcg1_stats[i]) *
3404 PAGE_SIZE);
3405 }
3406
3407 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3408 seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3409 memcg_sum_events(memcg, memcg1_events[i]));
3410
3411 for (i = 0; i < NR_LRU_LISTS; i++)
3412 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3413 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
3414
3415
3416 memory = memsw = PAGE_COUNTER_MAX;
3417 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3418 memory = min(memory, mi->memory.max);
3419 memsw = min(memsw, mi->memsw.max);
3420 }
3421 seq_printf(m, "hierarchical_memory_limit %llu\n",
3422 (u64)memory * PAGE_SIZE);
3423 if (do_memsw_account())
3424 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3425 (u64)memsw * PAGE_SIZE);
3426
3427 memset(&acc, 0, sizeof(acc));
3428 acc.stats_size = ARRAY_SIZE(memcg1_stats);
3429 acc.stats_array = memcg1_stats;
3430 acc.events_size = ARRAY_SIZE(memcg1_events);
3431 acc.events_array = memcg1_events;
3432 accumulate_memcg_tree(memcg, &acc);
3433
3434 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3435 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3436 continue;
3437 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3438 (u64)acc.stat[i] * PAGE_SIZE);
3439 }
3440
3441 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3442 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3443 (u64)acc.events[i]);
3444
3445 for (i = 0; i < NR_LRU_LISTS; i++)
3446 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3447 (u64)acc.lru_pages[i] * PAGE_SIZE);
3448
3449#ifdef CONFIG_DEBUG_VM
3450 {
3451 pg_data_t *pgdat;
3452 struct mem_cgroup_per_node *mz;
3453 struct zone_reclaim_stat *rstat;
3454 unsigned long recent_rotated[2] = {0, 0};
3455 unsigned long recent_scanned[2] = {0, 0};
3456
3457 for_each_online_pgdat(pgdat) {
3458 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3459 rstat = &mz->lruvec.reclaim_stat;
3460
3461 recent_rotated[0] += rstat->recent_rotated[0];
3462 recent_rotated[1] += rstat->recent_rotated[1];
3463 recent_scanned[0] += rstat->recent_scanned[0];
3464 recent_scanned[1] += rstat->recent_scanned[1];
3465 }
3466 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3467 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3468 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3469 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3470 }
3471#endif
3472
3473 return 0;
3474}
3475
3476static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3477 struct cftype *cft)
3478{
3479 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3480
3481 return mem_cgroup_swappiness(memcg);
3482}
3483
3484static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3485 struct cftype *cft, u64 val)
3486{
3487 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3488
3489 if (val > 100)
3490 return -EINVAL;
3491
3492 if (css->parent)
3493 memcg->swappiness = val;
3494 else
3495 vm_swappiness = val;
3496
3497 return 0;
3498}
3499
3500static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3501{
3502 struct mem_cgroup_threshold_ary *t;
3503 unsigned long usage;
3504 int i;
3505
3506 rcu_read_lock();
3507 if (!swap)
3508 t = rcu_dereference(memcg->thresholds.primary);
3509 else
3510 t = rcu_dereference(memcg->memsw_thresholds.primary);
3511
3512 if (!t)
3513 goto unlock;
3514
3515 usage = mem_cgroup_usage(memcg, swap);
3516
3517
3518
3519
3520
3521
3522 i = t->current_threshold;
3523
3524
3525
3526
3527
3528
3529
3530 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3531 eventfd_signal(t->entries[i].eventfd, 1);
3532
3533
3534 i++;
3535
3536
3537
3538
3539
3540
3541
3542 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3543 eventfd_signal(t->entries[i].eventfd, 1);
3544
3545
3546 t->current_threshold = i - 1;
3547unlock:
3548 rcu_read_unlock();
3549}
3550
3551static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3552{
3553 while (memcg) {
3554 __mem_cgroup_threshold(memcg, false);
3555 if (do_memsw_account())
3556 __mem_cgroup_threshold(memcg, true);
3557
3558 memcg = parent_mem_cgroup(memcg);
3559 }
3560}
3561
3562static int compare_thresholds(const void *a, const void *b)
3563{
3564 const struct mem_cgroup_threshold *_a = a;
3565 const struct mem_cgroup_threshold *_b = b;
3566
3567 if (_a->threshold > _b->threshold)
3568 return 1;
3569
3570 if (_a->threshold < _b->threshold)
3571 return -1;
3572
3573 return 0;
3574}
3575
3576static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3577{
3578 struct mem_cgroup_eventfd_list *ev;
3579
3580 spin_lock(&memcg_oom_lock);
3581
3582 list_for_each_entry(ev, &memcg->oom_notify, list)
3583 eventfd_signal(ev->eventfd, 1);
3584
3585 spin_unlock(&memcg_oom_lock);
3586 return 0;
3587}
3588
3589static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3590{
3591 struct mem_cgroup *iter;
3592
3593 for_each_mem_cgroup_tree(iter, memcg)
3594 mem_cgroup_oom_notify_cb(iter);
3595}
3596
3597static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3598 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3599{
3600 struct mem_cgroup_thresholds *thresholds;
3601 struct mem_cgroup_threshold_ary *new;
3602 unsigned long threshold;
3603 unsigned long usage;
3604 int i, size, ret;
3605
3606 ret = page_counter_memparse(args, "-1", &threshold);
3607 if (ret)
3608 return ret;
3609
3610 mutex_lock(&memcg->thresholds_lock);
3611
3612 if (type == _MEM) {
3613 thresholds = &memcg->thresholds;
3614 usage = mem_cgroup_usage(memcg, false);
3615 } else if (type == _MEMSWAP) {
3616 thresholds = &memcg->memsw_thresholds;
3617 usage = mem_cgroup_usage(memcg, true);
3618 } else
3619 BUG();
3620
3621
3622 if (thresholds->primary)
3623 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3624
3625 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3626
3627
3628 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3629 GFP_KERNEL);
3630 if (!new) {
3631 ret = -ENOMEM;
3632 goto unlock;
3633 }
3634 new->size = size;
3635
3636
3637 if (thresholds->primary) {
3638 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3639 sizeof(struct mem_cgroup_threshold));
3640 }
3641
3642
3643 new->entries[size - 1].eventfd = eventfd;
3644 new->entries[size - 1].threshold = threshold;
3645
3646
3647 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3648 compare_thresholds, NULL);
3649
3650
3651 new->current_threshold = -1;
3652 for (i = 0; i < size; i++) {
3653 if (new->entries[i].threshold <= usage) {
3654
3655
3656
3657
3658
3659 ++new->current_threshold;
3660 } else
3661 break;
3662 }
3663
3664
3665 kfree(thresholds->spare);
3666 thresholds->spare = thresholds->primary;
3667
3668 rcu_assign_pointer(thresholds->primary, new);
3669
3670
3671 synchronize_rcu();
3672
3673unlock:
3674 mutex_unlock(&memcg->thresholds_lock);
3675
3676 return ret;
3677}
3678
3679static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3680 struct eventfd_ctx *eventfd, const char *args)
3681{
3682 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3683}
3684
3685static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3686 struct eventfd_ctx *eventfd, const char *args)
3687{
3688 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3689}
3690
3691static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3692 struct eventfd_ctx *eventfd, enum res_type type)
3693{
3694 struct mem_cgroup_thresholds *thresholds;
3695 struct mem_cgroup_threshold_ary *new;
3696 unsigned long usage;
3697 int i, j, size;
3698
3699 mutex_lock(&memcg->thresholds_lock);
3700
3701 if (type == _MEM) {
3702 thresholds = &memcg->thresholds;
3703 usage = mem_cgroup_usage(memcg, false);
3704 } else if (type == _MEMSWAP) {
3705 thresholds = &memcg->memsw_thresholds;
3706 usage = mem_cgroup_usage(memcg, true);
3707 } else
3708 BUG();
3709
3710 if (!thresholds->primary)
3711 goto unlock;
3712
3713
3714 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3715
3716
3717 size = 0;
3718 for (i = 0; i < thresholds->primary->size; i++) {
3719 if (thresholds->primary->entries[i].eventfd != eventfd)
3720 size++;
3721 }
3722
3723 new = thresholds->spare;
3724
3725
3726 if (!size) {
3727 kfree(new);
3728 new = NULL;
3729 goto swap_buffers;
3730 }
3731
3732 new->size = size;
3733
3734
3735 new->current_threshold = -1;
3736 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3737 if (thresholds->primary->entries[i].eventfd == eventfd)
3738 continue;
3739
3740 new->entries[j] = thresholds->primary->entries[i];
3741 if (new->entries[j].threshold <= usage) {
3742
3743
3744
3745
3746
3747 ++new->current_threshold;
3748 }
3749 j++;
3750 }
3751
3752swap_buffers:
3753
3754 thresholds->spare = thresholds->primary;
3755
3756 rcu_assign_pointer(thresholds->primary, new);
3757
3758
3759 synchronize_rcu();
3760
3761
3762 if (!new) {
3763 kfree(thresholds->spare);
3764 thresholds->spare = NULL;
3765 }
3766unlock:
3767 mutex_unlock(&memcg->thresholds_lock);
3768}
3769
3770static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3771 struct eventfd_ctx *eventfd)
3772{
3773 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3774}
3775
3776static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3777 struct eventfd_ctx *eventfd)
3778{
3779 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3780}
3781
3782static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3783 struct eventfd_ctx *eventfd, const char *args)
3784{
3785 struct mem_cgroup_eventfd_list *event;
3786
3787 event = kmalloc(sizeof(*event), GFP_KERNEL);
3788 if (!event)
3789 return -ENOMEM;
3790
3791 spin_lock(&memcg_oom_lock);
3792
3793 event->eventfd = eventfd;
3794 list_add(&event->list, &memcg->oom_notify);
3795
3796
3797 if (memcg->under_oom)
3798 eventfd_signal(eventfd, 1);
3799 spin_unlock(&memcg_oom_lock);
3800
3801 return 0;
3802}
3803
3804static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3805 struct eventfd_ctx *eventfd)
3806{
3807 struct mem_cgroup_eventfd_list *ev, *tmp;
3808
3809 spin_lock(&memcg_oom_lock);
3810
3811 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3812 if (ev->eventfd == eventfd) {
3813 list_del(&ev->list);
3814 kfree(ev);
3815 }
3816 }
3817
3818 spin_unlock(&memcg_oom_lock);
3819}
3820
3821static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3822{
3823 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3824
3825 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3826 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3827 seq_printf(sf, "oom_kill %lu\n",
3828 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
3829 return 0;
3830}
3831
3832static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3833 struct cftype *cft, u64 val)
3834{
3835 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3836
3837
3838 if (!css->parent || !((val == 0) || (val == 1)))
3839 return -EINVAL;
3840
3841 memcg->oom_kill_disable = val;
3842 if (!val)
3843 memcg_oom_recover(memcg);
3844
3845 return 0;
3846}
3847
3848#ifdef CONFIG_CGROUP_WRITEBACK
3849
3850static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3851{
3852 return wb_domain_init(&memcg->cgwb_domain, gfp);
3853}
3854
3855static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3856{
3857 wb_domain_exit(&memcg->cgwb_domain);
3858}
3859
3860static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3861{
3862 wb_domain_size_changed(&memcg->cgwb_domain);
3863}
3864
3865struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3866{
3867 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3868
3869 if (!memcg->css.parent)
3870 return NULL;
3871
3872 return &memcg->cgwb_domain;
3873}
3874
/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from a wb's memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty and writeback pages in
 * @wb's memcg.  Headroom is computed as "min(max, high) - used" for the
 * memcg and is then clamped to the smallest such value found while
 * walking up the hierarchy, so an over-committed ancestor limits the
 * reported headroom.  The actual amount of free memory in the system is
 * not considered here; callers cap the result themselves.
 */
3893void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3894 unsigned long *pheadroom, unsigned long *pdirty,
3895 unsigned long *pwriteback)
3896{
3897 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3898 struct mem_cgroup *parent;
3899
3900 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3901
3902
3903 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3904 *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3905 (1 << LRU_ACTIVE_FILE));
3906 *pheadroom = PAGE_COUNTER_MAX;
3907
3908 while ((parent = parent_mem_cgroup(memcg))) {
3909 unsigned long ceiling = min(memcg->memory.max, memcg->high);
3910 unsigned long used = page_counter_read(&memcg->memory);
3911
3912 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3913 memcg = parent;
3914 }
3915}
3916
3917#else
3918
3919static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3920{
3921 return 0;
3922}
3923
3924static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3925{
3926}
3927
3928static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3929{
3930}
3931
3932#endif
3933
/*
 * Userspace event notification for the cgroup v1 "cgroup.event_control"
 * interface.
 *
 * Userspace registers an eventfd together with one of the memory control
 * files; a struct mem_cgroup_event ties the two together, hooks into the
 * eventfd's wait queue and carries the register/unregister callbacks for
 * the chosen control file.  The helpers below implement tear-down, wakeup
 * handling and registration for these events.
 */
3952static void memcg_event_remove(struct work_struct *work)
3953{
3954 struct mem_cgroup_event *event =
3955 container_of(work, struct mem_cgroup_event, remove);
3956 struct mem_cgroup *memcg = event->memcg;
3957
3958 remove_wait_queue(event->wqh, &event->wait);
3959
3960 event->unregister_event(memcg, event->eventfd);
3961
3962
3963 eventfd_signal(event->eventfd, 1);
3964
3965 eventfd_ctx_put(event->eventfd);
3966 kfree(event);
3967 css_put(&memcg->css);
3968}
3969
3970
3971
3972
3973
3974
3975static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
3976 int sync, void *key)
3977{
3978 struct mem_cgroup_event *event =
3979 container_of(wait, struct mem_cgroup_event, wait);
3980 struct mem_cgroup *memcg = event->memcg;
3981 __poll_t flags = key_to_poll(key);
3982
3983 if (flags & EPOLLHUP) {
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993 spin_lock(&memcg->event_list_lock);
3994 if (!list_empty(&event->list)) {
3995 list_del_init(&event->list);
3996
3997
3998
3999
4000 schedule_work(&event->remove);
4001 }
4002 spin_unlock(&memcg->event_list_lock);
4003 }
4004
4005 return 0;
4006}
4007
4008static void memcg_event_ptable_queue_proc(struct file *file,
4009 wait_queue_head_t *wqh, poll_table *pt)
4010{
4011 struct mem_cgroup_event *event =
4012 container_of(pt, struct mem_cgroup_event, pt);
4013
4014 event->wqh = wqh;
4015 add_wait_queue(wqh, &event->wait);
4016}
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4027 char *buf, size_t nbytes, loff_t off)
4028{
4029 struct cgroup_subsys_state *css = of_css(of);
4030 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4031 struct mem_cgroup_event *event;
4032 struct cgroup_subsys_state *cfile_css;
4033 unsigned int efd, cfd;
4034 struct fd efile;
4035 struct fd cfile;
4036 const char *name;
4037 char *endp;
4038 int ret;
4039
4040 buf = strstrip(buf);
4041
4042 efd = simple_strtoul(buf, &endp, 10);
4043 if (*endp != ' ')
4044 return -EINVAL;
4045 buf = endp + 1;
4046
4047 cfd = simple_strtoul(buf, &endp, 10);
4048 if ((*endp != ' ') && (*endp != '\0'))
4049 return -EINVAL;
4050 buf = endp + 1;
4051
4052 event = kzalloc(sizeof(*event), GFP_KERNEL);
4053 if (!event)
4054 return -ENOMEM;
4055
4056 event->memcg = memcg;
4057 INIT_LIST_HEAD(&event->list);
4058 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4059 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4060 INIT_WORK(&event->remove, memcg_event_remove);
4061
4062 efile = fdget(efd);
4063 if (!efile.file) {
4064 ret = -EBADF;
4065 goto out_kfree;
4066 }
4067
4068 event->eventfd = eventfd_ctx_fileget(efile.file);
4069 if (IS_ERR(event->eventfd)) {
4070 ret = PTR_ERR(event->eventfd);
4071 goto out_put_efile;
4072 }
4073
4074 cfile = fdget(cfd);
4075 if (!cfile.file) {
4076 ret = -EBADF;
4077 goto out_put_eventfd;
4078 }
4079
4080
4081
4082 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4083 if (ret < 0)
4084 goto out_put_cfile;
4085
4086
4087
4088
4089
4090
4091
4092
4093
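	/*
	 * Pick the register/unregister callbacks from the control file's
	 * name; only the legacy v1 event sources below are supported.
	 */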
4094 name = cfile.file->f_path.dentry->d_name.name;
4095
4096 if (!strcmp(name, "memory.usage_in_bytes")) {
4097 event->register_event = mem_cgroup_usage_register_event;
4098 event->unregister_event = mem_cgroup_usage_unregister_event;
4099 } else if (!strcmp(name, "memory.oom_control")) {
4100 event->register_event = mem_cgroup_oom_register_event;
4101 event->unregister_event = mem_cgroup_oom_unregister_event;
4102 } else if (!strcmp(name, "memory.pressure_level")) {
4103 event->register_event = vmpressure_register_event;
4104 event->unregister_event = vmpressure_unregister_event;
4105 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4106 event->register_event = memsw_cgroup_usage_register_event;
4107 event->unregister_event = memsw_cgroup_usage_unregister_event;
4108 } else {
4109 ret = -EINVAL;
4110 goto out_put_cfile;
4111 }
4112
4113
4114
4115
4116
4117
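	/*
	 * Verify that @cfile really belongs to this cgroup's memory
	 * controller before wiring the event up.
	 */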
4118 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4119 &memory_cgrp_subsys);
4120 ret = -EINVAL;
4121 if (IS_ERR(cfile_css))
4122 goto out_put_cfile;
4123 if (cfile_css != css) {
4124 css_put(cfile_css);
4125 goto out_put_cfile;
4126 }
4127
4128 ret = event->register_event(memcg, event->eventfd, buf);
4129 if (ret)
4130 goto out_put_css;
4131
4132 vfs_poll(efile.file, &event->pt);
4133
4134 spin_lock(&memcg->event_list_lock);
4135 list_add(&event->list, &memcg->event_list);
4136 spin_unlock(&memcg->event_list_lock);
4137
4138 fdput(cfile);
4139 fdput(efile);
4140
4141 return nbytes;
4142
4143out_put_css:
4144 css_put(css);
4145out_put_cfile:
4146 fdput(cfile);
4147out_put_eventfd:
4148 eventfd_ctx_put(event->eventfd);
4149out_put_efile:
4150 fdput(efile);
4151out_kfree:
4152 kfree(event);
4153
4154 return ret;
4155}
4156
4157static struct cftype mem_cgroup_legacy_files[] = {
4158 {
4159 .name = "usage_in_bytes",
4160 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4161 .read_u64 = mem_cgroup_read_u64,
4162 },
4163 {
4164 .name = "max_usage_in_bytes",
4165 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4166 .write = mem_cgroup_reset,
4167 .read_u64 = mem_cgroup_read_u64,
4168 },
4169 {
4170 .name = "limit_in_bytes",
4171 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4172 .write = mem_cgroup_write,
4173 .read_u64 = mem_cgroup_read_u64,
4174 },
4175 {
4176 .name = "soft_limit_in_bytes",
4177 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4178 .write = mem_cgroup_write,
4179 .read_u64 = mem_cgroup_read_u64,
4180 },
4181 {
4182 .name = "failcnt",
4183 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4184 .write = mem_cgroup_reset,
4185 .read_u64 = mem_cgroup_read_u64,
4186 },
4187 {
4188 .name = "stat",
4189 .seq_show = memcg_stat_show,
4190 },
4191 {
4192 .name = "force_empty",
4193 .write = mem_cgroup_force_empty_write,
4194 },
4195 {
4196 .name = "use_hierarchy",
4197 .write_u64 = mem_cgroup_hierarchy_write,
4198 .read_u64 = mem_cgroup_hierarchy_read,
4199 },
4200 {
4201 .name = "cgroup.event_control",
4202 .write = memcg_write_event_control,
4203 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4204 },
4205 {
4206 .name = "swappiness",
4207 .read_u64 = mem_cgroup_swappiness_read,
4208 .write_u64 = mem_cgroup_swappiness_write,
4209 },
4210 {
4211 .name = "move_charge_at_immigrate",
4212 .read_u64 = mem_cgroup_move_charge_read,
4213 .write_u64 = mem_cgroup_move_charge_write,
4214 },
4215 {
4216 .name = "oom_control",
4217 .seq_show = mem_cgroup_oom_control_read,
4218 .write_u64 = mem_cgroup_oom_control_write,
4219 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4220 },
4221 {
4222 .name = "pressure_level",
4223 },
4224#ifdef CONFIG_NUMA
4225 {
4226 .name = "numa_stat",
4227 .seq_show = memcg_numa_stat_show,
4228 },
4229#endif
4230 {
4231 .name = "kmem.limit_in_bytes",
4232 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4233 .write = mem_cgroup_write,
4234 .read_u64 = mem_cgroup_read_u64,
4235 },
4236 {
4237 .name = "kmem.usage_in_bytes",
4238 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4239 .read_u64 = mem_cgroup_read_u64,
4240 },
4241 {
4242 .name = "kmem.failcnt",
4243 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4244 .write = mem_cgroup_reset,
4245 .read_u64 = mem_cgroup_read_u64,
4246 },
4247 {
4248 .name = "kmem.max_usage_in_bytes",
4249 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4250 .write = mem_cgroup_reset,
4251 .read_u64 = mem_cgroup_read_u64,
4252 },
4253#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4254 {
4255 .name = "kmem.slabinfo",
4256 .seq_start = memcg_slab_start,
4257 .seq_next = memcg_slab_next,
4258 .seq_stop = memcg_slab_stop,
4259 .seq_show = memcg_slab_show,
4260 },
4261#endif
4262 {
4263 .name = "kmem.tcp.limit_in_bytes",
4264 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4265 .write = mem_cgroup_write,
4266 .read_u64 = mem_cgroup_read_u64,
4267 },
4268 {
4269 .name = "kmem.tcp.usage_in_bytes",
4270 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4271 .read_u64 = mem_cgroup_read_u64,
4272 },
4273 {
4274 .name = "kmem.tcp.failcnt",
4275 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4276 .write = mem_cgroup_reset,
4277 .read_u64 = mem_cgroup_read_u64,
4278 },
4279 {
4280 .name = "kmem.tcp.max_usage_in_bytes",
4281 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4282 .write = mem_cgroup_reset,
4283 .read_u64 = mem_cgroup_read_u64,
4284 },
4285 { },
4286};
4287
/*
 * Private ID space for memory cgroups.
 *
 * Memcg IDs are referenced from places that can only store a small
 * integer, such as swap-out records, so they are allocated from a
 * bounded range (1..MEM_CGROUP_ID_MAX) in mem_cgroup_alloc() and must be
 * recycled promptly.  The ID therefore has its own reference count,
 * separate from the css refcount: mem_cgroup_id_get()/_put() pin the ID,
 * and dropping the last ID reference releases the ID and the css
 * reference it holds, even if the css itself lives on for a while.
 */
4312static DEFINE_IDR(mem_cgroup_idr);
4313
4314static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4315{
4316 if (memcg->id.id > 0) {
4317 idr_remove(&mem_cgroup_idr, memcg->id.id);
4318 memcg->id.id = 0;
4319 }
4320}
4321
4322static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4323{
4324 VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4325 atomic_add(n, &memcg->id.ref);
4326}
4327
4328static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4329{
4330 VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4331 if (atomic_sub_and_test(n, &memcg->id.ref)) {
4332 mem_cgroup_id_remove(memcg);
4333
4334
4335 css_put(&memcg->css);
4336 }
4337}
4338
4339static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4340{
4341 mem_cgroup_id_get_many(memcg, 1);
4342}
4343
4344static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4345{
4346 mem_cgroup_id_put_many(memcg, 1);
4347}
4348
4349
/*
 * mem_cgroup_from_id - look up a memcg from its private id
 * @id: the memcg id to look up
 *
 * The caller must hold rcu_read_lock().
 */
4355struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4356{
4357 WARN_ON_ONCE(!rcu_read_lock_held());
4358 return idr_find(&mem_cgroup_idr, id);
4359}
4360
4361static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4362{
4363 struct mem_cgroup_per_node *pn;
4364 int tmp = node;
4365
4366
4367
4368
4369
4370
4371
4372
4373 if (!node_state(node, N_NORMAL_MEMORY))
4374 tmp = -1;
4375 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4376 if (!pn)
4377 return 1;
4378
4379 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4380 if (!pn->lruvec_stat_cpu) {
4381 kfree(pn);
4382 return 1;
4383 }
4384
4385 lruvec_init(&pn->lruvec);
4386 pn->usage_in_excess = 0;
4387 pn->on_tree = false;
4388 pn->memcg = memcg;
4389
4390 memcg->nodeinfo[node] = pn;
4391 return 0;
4392}
4393
4394static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4395{
4396 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4397
4398 if (!pn)
4399 return;
4400
4401 free_percpu(pn->lruvec_stat_cpu);
4402 kfree(pn);
4403}
4404
4405static void __mem_cgroup_free(struct mem_cgroup *memcg)
4406{
4407 int node;
4408
4409 for_each_node(node)
4410 free_mem_cgroup_per_node_info(memcg, node);
4411 free_percpu(memcg->stat_cpu);
4412 kfree(memcg);
4413}
4414
4415static void mem_cgroup_free(struct mem_cgroup *memcg)
4416{
4417 memcg_wb_domain_exit(memcg);
4418 __mem_cgroup_free(memcg);
4419}
4420
4421static struct mem_cgroup *mem_cgroup_alloc(void)
4422{
4423 struct mem_cgroup *memcg;
4424 size_t size;
4425 int node;
4426
4427 size = sizeof(struct mem_cgroup);
4428 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4429
4430 memcg = kzalloc(size, GFP_KERNEL);
4431 if (!memcg)
4432 return NULL;
4433
4434 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4435 1, MEM_CGROUP_ID_MAX,
4436 GFP_KERNEL);
4437 if (memcg->id.id < 0)
4438 goto fail;
4439
4440 memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4441 if (!memcg->stat_cpu)
4442 goto fail;
4443
4444 for_each_node(node)
4445 if (alloc_mem_cgroup_per_node_info(memcg, node))
4446 goto fail;
4447
4448 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4449 goto fail;
4450
4451 INIT_WORK(&memcg->high_work, high_work_func);
4452 memcg->last_scanned_node = MAX_NUMNODES;
4453 INIT_LIST_HEAD(&memcg->oom_notify);
4454 mutex_init(&memcg->thresholds_lock);
4455 spin_lock_init(&memcg->move_lock);
4456 vmpressure_init(&memcg->vmpressure);
4457 INIT_LIST_HEAD(&memcg->event_list);
4458 spin_lock_init(&memcg->event_list_lock);
4459 memcg->socket_pressure = jiffies;
4460#ifdef CONFIG_MEMCG_KMEM
4461 memcg->kmemcg_id = -1;
4462#endif
4463#ifdef CONFIG_CGROUP_WRITEBACK
4464 INIT_LIST_HEAD(&memcg->cgwb_list);
4465#endif
4466 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4467 return memcg;
4468fail:
4469 mem_cgroup_id_remove(memcg);
4470 __mem_cgroup_free(memcg);
4471 return NULL;
4472}
4473
4474static struct cgroup_subsys_state * __ref
4475mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4476{
4477 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4478 struct mem_cgroup *memcg;
4479 long error = -ENOMEM;
4480
4481 memcg = mem_cgroup_alloc();
4482 if (!memcg)
4483 return ERR_PTR(error);
4484
4485 memcg->high = PAGE_COUNTER_MAX;
4486 memcg->soft_limit = PAGE_COUNTER_MAX;
4487 if (parent) {
4488 memcg->swappiness = mem_cgroup_swappiness(parent);
4489 memcg->oom_kill_disable = parent->oom_kill_disable;
4490 }
4491 if (parent && parent->use_hierarchy) {
4492 memcg->use_hierarchy = true;
4493 page_counter_init(&memcg->memory, &parent->memory);
4494 page_counter_init(&memcg->swap, &parent->swap);
4495 page_counter_init(&memcg->memsw, &parent->memsw);
4496 page_counter_init(&memcg->kmem, &parent->kmem);
4497 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4498 } else {
4499 page_counter_init(&memcg->memory, NULL);
4500 page_counter_init(&memcg->swap, NULL);
4501 page_counter_init(&memcg->memsw, NULL);
4502 page_counter_init(&memcg->kmem, NULL);
4503 page_counter_init(&memcg->tcpmem, NULL);
4504
4505
4506
4507
4508
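		/*
		 * A deeper hierarchy with use_hierarchy disabled is not
		 * really supported; flag it so the cgroup core can warn
		 * about the broken hierarchy.
		 */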
4509 if (parent != root_mem_cgroup)
4510 memory_cgrp_subsys.broken_hierarchy = true;
4511 }
4512
4513
4514 if (!parent) {
4515 root_mem_cgroup = memcg;
4516 return &memcg->css;
4517 }
4518
4519 error = memcg_online_kmem(memcg);
4520 if (error)
4521 goto fail;
4522
4523 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4524 static_branch_inc(&memcg_sockets_enabled_key);
4525
4526 return &memcg->css;
4527fail:
4528 mem_cgroup_id_remove(memcg);
4529 mem_cgroup_free(memcg);
4530 return ERR_PTR(-ENOMEM);
4531}
4532
4533static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4534{
4535 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4536
4537
4538
4539
4540
4541
4542 if (memcg_alloc_shrinker_maps(memcg)) {
4543 mem_cgroup_id_remove(memcg);
4544 return -ENOMEM;
4545 }
4546
4547
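	/*
	 * The online cgroup holds the initial ID reference, and the ID in
	 * turn holds a css reference that is dropped from
	 * mem_cgroup_id_put_many() once the last ID user goes away.
	 */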
4548 atomic_set(&memcg->id.ref, 1);
4549 css_get(css);
4550 return 0;
4551}
4552
4553static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4554{
4555 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4556 struct mem_cgroup_event *event, *tmp;
4557
4558
4559
4560
4561
4562
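	/*
	 * Tear down all user-registered events; each removal work also
	 * signals the eventfd so userspace can see the cgroup going away.
	 */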
4563 spin_lock(&memcg->event_list_lock);
4564 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4565 list_del_init(&event->list);
4566 schedule_work(&event->remove);
4567 }
4568 spin_unlock(&memcg->event_list_lock);
4569
4570 page_counter_set_min(&memcg->memory, 0);
4571 page_counter_set_low(&memcg->memory, 0);
4572
4573 memcg_offline_kmem(memcg);
4574 wb_memcg_offline(memcg);
4575
4576 mem_cgroup_id_put(memcg);
4577}
4578
4579static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4580{
4581 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4582
4583 invalidate_reclaim_iterators(memcg);
4584}
4585
4586static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4587{
4588 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4589
4590 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4591 static_branch_dec(&memcg_sockets_enabled_key);
4592
4593 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4594 static_branch_dec(&memcg_sockets_enabled_key);
4595
4596 vmpressure_cleanup(&memcg->vmpressure);
4597 cancel_work_sync(&memcg->high_work);
4598 mem_cgroup_remove_from_trees(memcg);
4599 memcg_free_shrinker_maps(memcg);
4600 memcg_free_kmem(memcg);
4601 mem_cgroup_free(memcg);
4602}
4603
/*
 * mem_cgroup_css_reset - reset a memcg's configuration to its defaults
 * @css: the target css
 *
 * Lift all limits (memory, swap, memsw, kmem, tcpmem) back to
 * PAGE_COUNTER_MAX, clear the memory.min/memory.low protections, reset
 * the high boundary and the soft limit, and tell the writeback domain
 * that the configuration changed.  Invoked by the cgroup core when the
 * controller is being disabled for this cgroup.
 */
4617static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4618{
4619 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4620
4621 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
4622 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4623 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
4624 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
4625 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
4626 page_counter_set_min(&memcg->memory, 0);
4627 page_counter_set_low(&memcg->memory, 0);
4628 memcg->high = PAGE_COUNTER_MAX;
4629 memcg->soft_limit = PAGE_COUNTER_MAX;
4630 memcg_wb_domain_size_changed(memcg);
4631}
4632
4633#ifdef CONFIG_MMU
4634
4635static int mem_cgroup_do_precharge(unsigned long count)
4636{
4637 int ret;
4638
4639
4640 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4641 if (!ret) {
4642 mc.precharge += count;
4643 return ret;
4644 }
4645
4646
4647 while (count--) {
4648 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
4649 if (ret)
4650 return ret;
4651 mc.precharge++;
4652 cond_resched();
4653 }
4654 return 0;
4655}
4656
4657union mc_target {
4658 struct page *page;
4659 swp_entry_t ent;
4660};
4661
4662enum mc_target_type {
4663 MC_TARGET_NONE = 0,
4664 MC_TARGET_PAGE,
4665 MC_TARGET_SWAP,
4666 MC_TARGET_DEVICE,
4667};
4668
4669static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4670 unsigned long addr, pte_t ptent)
4671{
4672 struct page *page = _vm_normal_page(vma, addr, ptent, true);
4673
4674 if (!page || !page_mapped(page))
4675 return NULL;
4676 if (PageAnon(page)) {
4677 if (!(mc.flags & MOVE_ANON))
4678 return NULL;
4679 } else {
4680 if (!(mc.flags & MOVE_FILE))
4681 return NULL;
4682 }
4683 if (!get_page_unless_zero(page))
4684 return NULL;
4685
4686 return page;
4687}
4688
4689#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
4690static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4691 pte_t ptent, swp_entry_t *entry)
4692{
4693 struct page *page = NULL;
4694 swp_entry_t ent = pte_to_swp_entry(ptent);
4695
4696 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4697 return NULL;
4698
4699
4700
4701
4702
4703
4704 if (is_device_private_entry(ent)) {
4705 page = device_private_entry_to_page(ent);
4706
4707
4708
4709
4710 if (!page_ref_add_unless(page, 1, 1))
4711 return NULL;
4712 return page;
4713 }
4714
4715
4716
4717
4718
4719 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4720 if (do_memsw_account())
4721 entry->val = ent.val;
4722
4723 return page;
4724}
4725#else
4726static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4727 pte_t ptent, swp_entry_t *entry)
4728{
4729 return NULL;
4730}
4731#endif
4732
4733static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4734 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4735{
4736 struct page *page = NULL;
4737 struct address_space *mapping;
4738 pgoff_t pgoff;
4739
4740 if (!vma->vm_file)
4741 return NULL;
4742 if (!(mc.flags & MOVE_FILE))
4743 return NULL;
4744
4745 mapping = vma->vm_file->f_mapping;
4746 pgoff = linear_page_index(vma, addr);
4747
4748
4749#ifdef CONFIG_SWAP
4750
4751 if (shmem_mapping(mapping)) {
4752 page = find_get_entry(mapping, pgoff);
4753 if (radix_tree_exceptional_entry(page)) {
4754 swp_entry_t swp = radix_to_swp_entry(page);
4755 if (do_memsw_account())
4756 *entry = swp;
4757 page = find_get_page(swap_address_space(swp),
4758 swp_offset(swp));
4759 }
4760 } else
4761 page = find_get_page(mapping, pgoff);
4762#else
4763 page = find_get_page(mapping, pgoff);
4764#endif
4765 return page;
4766}
4767
/*
 * mem_cgroup_move_account - move accounting of a page to another cgroup
 * @page: the page
 * @compound: account the page as a huge page rather than a base page
 * @from: mem_cgroup the page is currently charged to
 * @to: mem_cgroup the page should be charged to; @from != @to
 *
 * Moves the page's memcg pointer and the associated per-memcg statistics
 * (file mapped, dirty, writeback) from @from to @to.  The page counters
 * themselves are not touched here; the caller handles (un)charging.  The
 * caller must also have removed the page from the LRU beforehand.
 */
4780static int mem_cgroup_move_account(struct page *page,
4781 bool compound,
4782 struct mem_cgroup *from,
4783 struct mem_cgroup *to)
4784{
4785 unsigned long flags;
4786 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4787 int ret;
4788 bool anon;
4789
4790 VM_BUG_ON(from == to);
4791 VM_BUG_ON_PAGE(PageLRU(page), page);
4792 VM_BUG_ON(compound && !PageTransHuge(page));
4793
4794
4795
4796
4797
4798 ret = -EBUSY;
4799 if (!trylock_page(page))
4800 goto out;
4801
4802 ret = -EINVAL;
4803 if (page->mem_cgroup != from)
4804 goto out_unlock;
4805
4806 anon = PageAnon(page);
4807
4808 spin_lock_irqsave(&from->move_lock, flags);
4809
4810 if (!anon && page_mapped(page)) {
4811 __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4812 __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4813 }
4814
4815
4816
4817
4818
4819
4820 if (!anon && PageDirty(page)) {
4821 struct address_space *mapping = page_mapping(page);
4822
4823 if (mapping_cap_account_dirty(mapping)) {
4824 __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4825 __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
4826 }
4827 }
4828
4829 if (PageWriteback(page)) {
4830 __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4831 __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
4832 }
4833
4834
4835
4836
4837
4838
4839
4840
4841 page->mem_cgroup = to;
4842 spin_unlock_irqrestore(&from->move_lock, flags);
4843
4844 ret = 0;
4845
4846 local_irq_disable();
4847 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4848 memcg_check_events(to, page);
4849 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4850 memcg_check_events(from, page);
4851 local_irq_enable();
4852out_unlock:
4853 unlock_page(page);
4854out:
4855 return ret;
4856}
4857
/*
 * get_mctgt_type - classify a pte for move-charge
 * @vma: the vma the pte belongs to
 * @addr: the address corresponding to the pte
 * @ptent: the pte to be checked
 * @target: where to store the target page or swap entry (may be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE   - the pte is not a candidate for charge moving.
 *   MC_TARGET_PAGE   - the page behind this pte is charged to mc.from and
 *                      should be moved; if @target is not NULL the page is
 *                      stored in target->page with an extra reference that
 *                      the caller must drop.
 *   MC_TARGET_SWAP   - the swap entry behind this pte is charged to
 *                      mc.from; if @target is not NULL the entry is stored
 *                      in target->ent.
 *   MC_TARGET_DEVICE - like MC_TARGET_PAGE, but the page is a ZONE_DEVICE
 *                      (device private or public) page.
 *
 * Called with the pte lock held.
 */
4884static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4885 unsigned long addr, pte_t ptent, union mc_target *target)
4886{
4887 struct page *page = NULL;
4888 enum mc_target_type ret = MC_TARGET_NONE;
4889 swp_entry_t ent = { .val = 0 };
4890
4891 if (pte_present(ptent))
4892 page = mc_handle_present_pte(vma, addr, ptent);
4893 else if (is_swap_pte(ptent))
4894 page = mc_handle_swap_pte(vma, ptent, &ent);
4895 else if (pte_none(ptent))
4896 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4897
4898 if (!page && !ent.val)
4899 return ret;
4900 if (page) {
4901
4902
4903
4904
4905
4906 if (page->mem_cgroup == mc.from) {
4907 ret = MC_TARGET_PAGE;
4908 if (is_device_private_page(page) ||
4909 is_device_public_page(page))
4910 ret = MC_TARGET_DEVICE;
4911 if (target)
4912 target->page = page;
4913 }
4914 if (!ret || !target)
4915 put_page(page);
4916 }
4917
4918
4919
4920
4921 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
4922 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4923 ret = MC_TARGET_SWAP;
4924 if (target)
4925 target->ent = ent;
4926 }
4927 return ret;
4928}
4929
4930#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4931
/*
 * PMD-mapped swap and file THPs are not considered here because move
 * charge does not support them; the caller must ensure that the pmd is a
 * trans-huge mapping.
 */
4936static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4937 unsigned long addr, pmd_t pmd, union mc_target *target)
4938{
4939 struct page *page = NULL;
4940 enum mc_target_type ret = MC_TARGET_NONE;
4941
4942 if (unlikely(is_swap_pmd(pmd))) {
4943 VM_BUG_ON(thp_migration_supported() &&
4944 !is_pmd_migration_entry(pmd));
4945 return ret;
4946 }
4947 page = pmd_page(pmd);
4948 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4949 if (!(mc.flags & MOVE_ANON))
4950 return ret;
4951 if (page->mem_cgroup == mc.from) {
4952 ret = MC_TARGET_PAGE;
4953 if (target) {
4954 get_page(page);
4955 target->page = page;
4956 }
4957 }
4958 return ret;
4959}
4960#else
4961static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4962 unsigned long addr, pmd_t pmd, union mc_target *target)
4963{
4964 return MC_TARGET_NONE;
4965}
4966#endif
4967
4968static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4969 unsigned long addr, unsigned long end,
4970 struct mm_walk *walk)
4971{
4972 struct vm_area_struct *vma = walk->vma;
4973 pte_t *pte;
4974 spinlock_t *ptl;
4975
4976 ptl = pmd_trans_huge_lock(pmd, vma);
4977 if (ptl) {
4978
4979
4980
4981
4982
4983 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4984 mc.precharge += HPAGE_PMD_NR;
4985 spin_unlock(ptl);
4986 return 0;
4987 }
4988
4989 if (pmd_trans_unstable(pmd))
4990 return 0;
4991 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4992 for (; addr != end; pte++, addr += PAGE_SIZE)
4993 if (get_mctgt_type(vma, addr, *pte, NULL))
4994 mc.precharge++;
4995 pte_unmap_unlock(pte - 1, ptl);
4996 cond_resched();
4997
4998 return 0;
4999}
5000
5001static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5002{
5003 unsigned long precharge;
5004
5005 struct mm_walk mem_cgroup_count_precharge_walk = {
5006 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5007 .mm = mm,
5008 };
5009 down_read(&mm->mmap_sem);
5010 walk_page_range(0, mm->highest_vm_end,
5011 &mem_cgroup_count_precharge_walk);
5012 up_read(&mm->mmap_sem);
5013
5014 precharge = mc.precharge;
5015 mc.precharge = 0;
5016
5017 return precharge;
5018}
5019
5020static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5021{
5022 unsigned long precharge = mem_cgroup_count_precharge(mm);
5023
5024 VM_BUG_ON(mc.moving_task);
5025 mc.moving_task = current;
5026 return mem_cgroup_do_precharge(precharge);
5027}
5028
5029
5030static void __mem_cgroup_clear_mc(void)
5031{
5032 struct mem_cgroup *from = mc.from;
5033 struct mem_cgroup *to = mc.to;
5034
5035
5036 if (mc.precharge) {
5037 cancel_charge(mc.to, mc.precharge);
5038 mc.precharge = 0;
5039 }
5040
5041
5042
5043
5044 if (mc.moved_charge) {
5045 cancel_charge(mc.from, mc.moved_charge);
5046 mc.moved_charge = 0;
5047 }
5048
5049 if (mc.moved_swap) {
5050
5051 if (!mem_cgroup_is_root(mc.from))
5052 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5053
5054 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5055
5056
5057
5058
5059
5060 if (!mem_cgroup_is_root(mc.to))
5061 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5062
5063 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5064 css_put_many(&mc.to->css, mc.moved_swap);
5065
5066 mc.moved_swap = 0;
5067 }
5068 memcg_oom_recover(from);
5069 memcg_oom_recover(to);
5070 wake_up_all(&mc.waitq);
5071}
5072
5073static void mem_cgroup_clear_mc(void)
5074{
5075 struct mm_struct *mm = mc.mm;
5076
5077
5078
5079
5080
5081 mc.moving_task = NULL;
5082 __mem_cgroup_clear_mc();
5083 spin_lock(&mc.lock);
5084 mc.from = NULL;
5085 mc.to = NULL;
5086 mc.mm = NULL;
5087 spin_unlock(&mc.lock);
5088
5089 mmput(mm);
5090}
5091
5092static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5093{
5094 struct cgroup_subsys_state *css;
5095 struct mem_cgroup *memcg = NULL;
5096 struct mem_cgroup *from;
5097 struct task_struct *leader, *p;
5098 struct mm_struct *mm;
5099 unsigned long move_flags;
5100 int ret = 0;
5101
5102
5103 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5104 return 0;
5105
5106
5107
5108
5109
5110
5111
5112 p = NULL;
5113 cgroup_taskset_for_each_leader(leader, css, tset) {
5114 WARN_ON_ONCE(p);
5115 p = leader;
5116 memcg = mem_cgroup_from_css(css);
5117 }
5118 if (!p)
5119 return 0;
5120
5121
5122
5123
5124
5125
5126 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5127 if (!move_flags)
5128 return 0;
5129
5130 from = mem_cgroup_from_task(p);
5131
5132 VM_BUG_ON(from == memcg);
5133
5134 mm = get_task_mm(p);
5135 if (!mm)
5136 return 0;
5137
5138 if (mm->owner == p) {
5139 VM_BUG_ON(mc.from);
5140 VM_BUG_ON(mc.to);
5141 VM_BUG_ON(mc.precharge);
5142 VM_BUG_ON(mc.moved_charge);
5143 VM_BUG_ON(mc.moved_swap);
5144
5145 spin_lock(&mc.lock);
5146 mc.mm = mm;
5147 mc.from = from;
5148 mc.to = memcg;
5149 mc.flags = move_flags;
5150 spin_unlock(&mc.lock);
5151
5152
5153 ret = mem_cgroup_precharge_mc(mm);
5154 if (ret)
5155 mem_cgroup_clear_mc();
5156 } else {
5157 mmput(mm);
5158 }
5159 return ret;
5160}
5161
5162static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5163{
5164 if (mc.to)
5165 mem_cgroup_clear_mc();
5166}
5167
5168static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5169 unsigned long addr, unsigned long end,
5170 struct mm_walk *walk)
5171{
5172 int ret = 0;
5173 struct vm_area_struct *vma = walk->vma;
5174 pte_t *pte;
5175 spinlock_t *ptl;
5176 enum mc_target_type target_type;
5177 union mc_target target;
5178 struct page *page;
5179
5180 ptl = pmd_trans_huge_lock(pmd, vma);
5181 if (ptl) {
5182 if (mc.precharge < HPAGE_PMD_NR) {
5183 spin_unlock(ptl);
5184 return 0;
5185 }
5186 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5187 if (target_type == MC_TARGET_PAGE) {
5188 page = target.page;
5189 if (!isolate_lru_page(page)) {
5190 if (!mem_cgroup_move_account(page, true,
5191 mc.from, mc.to)) {
5192 mc.precharge -= HPAGE_PMD_NR;
5193 mc.moved_charge += HPAGE_PMD_NR;
5194 }
5195 putback_lru_page(page);
5196 }
5197 put_page(page);
5198 } else if (target_type == MC_TARGET_DEVICE) {
5199 page = target.page;
5200 if (!mem_cgroup_move_account(page, true,
5201 mc.from, mc.to)) {
5202 mc.precharge -= HPAGE_PMD_NR;
5203 mc.moved_charge += HPAGE_PMD_NR;
5204 }
5205 put_page(page);
5206 }
5207 spin_unlock(ptl);
5208 return 0;
5209 }
5210
5211 if (pmd_trans_unstable(pmd))
5212 return 0;
5213retry:
5214 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5215 for (; addr != end; addr += PAGE_SIZE) {
5216 pte_t ptent = *(pte++);
5217 bool device = false;
5218 swp_entry_t ent;
5219
5220 if (!mc.precharge)
5221 break;
5222
5223 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5224 case MC_TARGET_DEVICE:
5225 device = true;
5226
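			/* fall through */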
5227 case MC_TARGET_PAGE:
5228 page = target.page;
5229
5230
5231
5232
5233
5234
5235 if (PageTransCompound(page))
5236 goto put;
5237 if (!device && isolate_lru_page(page))
5238 goto put;
5239 if (!mem_cgroup_move_account(page, false,
5240 mc.from, mc.to)) {
5241 mc.precharge--;
5242
5243 mc.moved_charge++;
5244 }
5245 if (!device)
5246 putback_lru_page(page);
5247put:
5248 put_page(page);
5249 break;
5250 case MC_TARGET_SWAP:
5251 ent = target.ent;
5252 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5253 mc.precharge--;
5254
5255 mc.moved_swap++;
5256 }
5257 break;
5258 default:
5259 break;
5260 }
5261 }
5262 pte_unmap_unlock(pte - 1, ptl);
5263 cond_resched();
5264
5265 if (addr != end) {
5266
5267
5268
5269
5270
5271
5272 ret = mem_cgroup_do_precharge(1);
5273 if (!ret)
5274 goto retry;
5275 }
5276
5277 return ret;
5278}
5279
5280static void mem_cgroup_move_charge(void)
5281{
5282 struct mm_walk mem_cgroup_move_charge_walk = {
5283 .pmd_entry = mem_cgroup_move_charge_pte_range,
5284 .mm = mc.mm,
5285 };
5286
5287 lru_add_drain_all();
5288
5289
5290
5291
5292
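	/*
	 * Raise mc.from->moving_account so that concurrent page-state
	 * updaters serialize against the move, and wait for already
	 * started RCU read sides to finish before walking the page tables.
	 */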
5293 atomic_inc(&mc.from->moving_account);
5294 synchronize_rcu();
5295retry:
5296 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
5297
5298
5299
5300
5301
5302
5303
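		/*
		 * Whoever holds the mmap_sem may be sleeping on mc.waitq;
		 * drop the extra precharges, wake all waiters and retry
		 * with whatever charges are left.
		 */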
5304 __mem_cgroup_clear_mc();
5305 cond_resched();
5306 goto retry;
5307 }
5308
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
5312 walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
5313
5314 up_read(&mc.mm->mmap_sem);
5315 atomic_dec(&mc.from->moving_account);
5316}
5317
5318static void mem_cgroup_move_task(void)
5319{
5320 if (mc.to) {
5321 mem_cgroup_move_charge();
5322 mem_cgroup_clear_mc();
5323 }
5324}
5325#else
5326static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5327{
5328 return 0;
5329}
5330static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5331{
5332}
5333static void mem_cgroup_move_task(void)
5334{
5335}
5336#endif
5337
/*
 * The root cgroup is retained across [un]mount cycles, so the hierarchy
 * mode has to be re-evaluated whenever the memory controller is bound
 * to a hierarchy.
 */
5343static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5344{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
5350 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5351 root_mem_cgroup->use_hierarchy = true;
5352 else
5353 root_mem_cgroup->use_hierarchy = false;
5354}
5355
5356static u64 memory_current_read(struct cgroup_subsys_state *css,
5357 struct cftype *cft)
5358{
5359 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5360
5361 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5362}
5363
5364static int memory_min_show(struct seq_file *m, void *v)
5365{
5366 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5367 unsigned long min = READ_ONCE(memcg->memory.min);
5368
5369 if (min == PAGE_COUNTER_MAX)
5370 seq_puts(m, "max\n");
5371 else
5372 seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5373
5374 return 0;
5375}
5376
5377static ssize_t memory_min_write(struct kernfs_open_file *of,
5378 char *buf, size_t nbytes, loff_t off)
5379{
5380 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5381 unsigned long min;
5382 int err;
5383
5384 buf = strstrip(buf);
5385 err = page_counter_memparse(buf, "max", &min);
5386 if (err)
5387 return err;
5388
5389 page_counter_set_min(&memcg->memory, min);
5390
5391 return nbytes;
5392}
5393
5394static int memory_low_show(struct seq_file *m, void *v)
5395{
5396 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5397 unsigned long low = READ_ONCE(memcg->memory.low);
5398
5399 if (low == PAGE_COUNTER_MAX)
5400 seq_puts(m, "max\n");
5401 else
5402 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5403
5404 return 0;
5405}
5406
5407static ssize_t memory_low_write(struct kernfs_open_file *of,
5408 char *buf, size_t nbytes, loff_t off)
5409{
5410 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5411 unsigned long low;
5412 int err;
5413
5414 buf = strstrip(buf);
5415 err = page_counter_memparse(buf, "max", &low);
5416 if (err)
5417 return err;
5418
5419 page_counter_set_low(&memcg->memory, low);
5420
5421 return nbytes;
5422}
5423
5424static int memory_high_show(struct seq_file *m, void *v)
5425{
5426 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5427 unsigned long high = READ_ONCE(memcg->high);
5428
5429 if (high == PAGE_COUNTER_MAX)
5430 seq_puts(m, "max\n");
5431 else
5432 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5433
5434 return 0;
5435}
5436
5437static ssize_t memory_high_write(struct kernfs_open_file *of,
5438 char *buf, size_t nbytes, loff_t off)
5439{
5440 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5441 unsigned long nr_pages;
5442 unsigned long high;
5443 int err;
5444
5445 buf = strstrip(buf);
5446 err = page_counter_memparse(buf, "max", &high);
5447 if (err)
5448 return err;
5449
5450 memcg->high = high;
5451
5452 nr_pages = page_counter_read(&memcg->memory);
5453 if (nr_pages > high)
5454 try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5455 GFP_KERNEL, true);
5456
5457 memcg_wb_domain_size_changed(memcg);
5458 return nbytes;
5459}
5460
5461static int memory_max_show(struct seq_file *m, void *v)
5462{
5463 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5464 unsigned long max = READ_ONCE(memcg->memory.max);
5465
5466 if (max == PAGE_COUNTER_MAX)
5467 seq_puts(m, "max\n");
5468 else
5469 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5470
5471 return 0;
5472}
5473
5474static ssize_t memory_max_write(struct kernfs_open_file *of,
5475 char *buf, size_t nbytes, loff_t off)
5476{
5477 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5478 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5479 bool drained = false;
5480 unsigned long max;
5481 int err;
5482
5483 buf = strstrip(buf);
5484 err = page_counter_memparse(buf, "max", &max);
5485 if (err)
5486 return err;
5487
5488 xchg(&memcg->memory.max, max);
5489
5490 for (;;) {
5491 unsigned long nr_pages = page_counter_read(&memcg->memory);
5492
5493 if (nr_pages <= max)
5494 break;
5495
5496 if (signal_pending(current)) {
5497 err = -EINTR;
5498 break;
5499 }
5500
5501 if (!drained) {
5502 drain_all_stock(memcg);
5503 drained = true;
5504 continue;
5505 }
5506
5507 if (nr_reclaims) {
5508 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5509 GFP_KERNEL, true))
5510 nr_reclaims--;
5511 continue;
5512 }
5513
5514 memcg_memory_event(memcg, MEMCG_OOM);
5515 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5516 break;
5517 }
5518
5519 memcg_wb_domain_size_changed(memcg);
5520 return nbytes;
5521}
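/*
 * Illustrative userspace usage of the memory.max interface above (a sketch
 * only, not part of the kernel build; the mount point and the "foo" cgroup
 * name are assumptions made for the example):
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int set_memory_max(const char *val)	// e.g. "104857600" or "max"
 *	{
 *		int fd = open("/sys/fs/cgroup/foo/memory.max", O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, val, strlen(val)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 *
 * Unlike memory.high, which only triggers reclaim, lowering memory.max can
 * end in an OOM kill if the reclaim retries above cannot bring usage below
 * the new limit.
 */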
5522
5523static int memory_events_show(struct seq_file *m, void *v)
5524{
5525 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5526
5527 seq_printf(m, "low %lu\n",
5528 atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5529 seq_printf(m, "high %lu\n",
5530 atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5531 seq_printf(m, "max %lu\n",
5532 atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5533 seq_printf(m, "oom %lu\n",
5534 atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5535 seq_printf(m, "oom_kill %lu\n",
5536 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
5537
5538 return 0;
5539}
5540
5541static int memory_stat_show(struct seq_file *m, void *v)
5542{
5543 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5544 struct accumulated_stats acc;
5545 int i;
5546
	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
5558 memset(&acc, 0, sizeof(acc));
5559 acc.stats_size = MEMCG_NR_STAT;
5560 acc.events_size = NR_VM_EVENT_ITEMS;
5561 accumulate_memcg_tree(memcg, &acc);
5562
5563 seq_printf(m, "anon %llu\n",
5564 (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5565 seq_printf(m, "file %llu\n",
5566 (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5567 seq_printf(m, "kernel_stack %llu\n",
5568 (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5569 seq_printf(m, "slab %llu\n",
5570 (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5571 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5572 seq_printf(m, "sock %llu\n",
5573 (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5574
5575 seq_printf(m, "shmem %llu\n",
5576 (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5577 seq_printf(m, "file_mapped %llu\n",
5578 (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5579 seq_printf(m, "file_dirty %llu\n",
5580 (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5581 seq_printf(m, "file_writeback %llu\n",
5582 (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5583
5584 for (i = 0; i < NR_LRU_LISTS; i++)
5585 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5586 (u64)acc.lru_pages[i] * PAGE_SIZE);
5587
5588 seq_printf(m, "slab_reclaimable %llu\n",
5589 (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5590 seq_printf(m, "slab_unreclaimable %llu\n",
5591 (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5592
	/* Accumulated memory events */
5594
5595 seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5596 seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5597
5598 seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5599 seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5600 acc.events[PGSCAN_DIRECT]);
5601 seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5602 acc.events[PGSTEAL_DIRECT]);
5603 seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5604 seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5605 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5606 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5607
5608 seq_printf(m, "workingset_refault %lu\n",
5609 acc.stat[WORKINGSET_REFAULT]);
5610 seq_printf(m, "workingset_activate %lu\n",
5611 acc.stat[WORKINGSET_ACTIVATE]);
5612 seq_printf(m, "workingset_nodereclaim %lu\n",
5613 acc.stat[WORKINGSET_NODERECLAIM]);
5614
5615 return 0;
5616}
5617
5618static int memory_oom_group_show(struct seq_file *m, void *v)
5619{
5620 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5621
5622 seq_printf(m, "%d\n", memcg->oom_group);
5623
5624 return 0;
5625}
5626
5627static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
5628 char *buf, size_t nbytes, loff_t off)
5629{
5630 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5631 int ret, oom_group;
5632
5633 buf = strstrip(buf);
5634 if (!buf)
5635 return -EINVAL;
5636
5637 ret = kstrtoint(buf, 0, &oom_group);
5638 if (ret)
5639 return ret;
5640
5641 if (oom_group != 0 && oom_group != 1)
5642 return -EINVAL;
5643
5644 memcg->oom_group = oom_group;
5645
5646 return nbytes;
5647}
5648
5649static struct cftype memory_files[] = {
5650 {
5651 .name = "current",
5652 .flags = CFTYPE_NOT_ON_ROOT,
5653 .read_u64 = memory_current_read,
5654 },
5655 {
5656 .name = "min",
5657 .flags = CFTYPE_NOT_ON_ROOT,
5658 .seq_show = memory_min_show,
5659 .write = memory_min_write,
5660 },
5661 {
5662 .name = "low",
5663 .flags = CFTYPE_NOT_ON_ROOT,
5664 .seq_show = memory_low_show,
5665 .write = memory_low_write,
5666 },
5667 {
5668 .name = "high",
5669 .flags = CFTYPE_NOT_ON_ROOT,
5670 .seq_show = memory_high_show,
5671 .write = memory_high_write,
5672 },
5673 {
5674 .name = "max",
5675 .flags = CFTYPE_NOT_ON_ROOT,
5676 .seq_show = memory_max_show,
5677 .write = memory_max_write,
5678 },
5679 {
5680 .name = "events",
5681 .flags = CFTYPE_NOT_ON_ROOT,
5682 .file_offset = offsetof(struct mem_cgroup, events_file),
5683 .seq_show = memory_events_show,
5684 },
5685 {
5686 .name = "stat",
5687 .flags = CFTYPE_NOT_ON_ROOT,
5688 .seq_show = memory_stat_show,
5689 },
5690 {
5691 .name = "oom.group",
5692 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
5693 .seq_show = memory_oom_group_show,
5694 .write = memory_oom_group_write,
5695 },
5696 { }
5697};
5698
5699struct cgroup_subsys memory_cgrp_subsys = {
5700 .css_alloc = mem_cgroup_css_alloc,
5701 .css_online = mem_cgroup_css_online,
5702 .css_offline = mem_cgroup_css_offline,
5703 .css_released = mem_cgroup_css_released,
5704 .css_free = mem_cgroup_css_free,
5705 .css_reset = mem_cgroup_css_reset,
5706 .can_attach = mem_cgroup_can_attach,
5707 .cancel_attach = mem_cgroup_cancel_attach,
5708 .post_attach = mem_cgroup_move_task,
5709 .bind = mem_cgroup_bind,
5710 .dfl_cftypes = memory_files,
5711 .legacy_cftypes = mem_cgroup_legacy_files,
5712 .early_init = 0,
5713};
5714
/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is an
 *     unprotected supply of reclaimable memory from other cgroups.
 *   MEMCG_PROT_MIN: cgroup memory is protected
 *
 * @root is exclusive; it is never protected when looked at directly.
 *
 * To provide proper hierarchical behavior, effective memory.min/low values
 * are used.  The effective value is never larger than the configured one;
 * if the siblings overcommit the parent's protection, the parent's
 * effective protection is distributed in proportion to how much protected
 * memory each child actually uses:
 *
 *	low_usage = min(memory.current, memory.low)
 *
 *	                                          low_usage
 *	elow = min( memory.low, parent->elow * ------------------ )
 *	                                       siblings_low_usage
 *
 * The effective memory.min (emin) is calculated in the same way from
 * memory.min and the siblings' min usage.
 */
5785enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5786 struct mem_cgroup *memcg)
5787{
5788 struct mem_cgroup *parent;
5789 unsigned long emin, parent_emin;
5790 unsigned long elow, parent_elow;
5791 unsigned long usage;
5792
5793 if (mem_cgroup_disabled())
5794 return MEMCG_PROT_NONE;
5795
5796 if (!root)
5797 root = root_mem_cgroup;
5798 if (memcg == root)
5799 return MEMCG_PROT_NONE;
5800
5801 usage = page_counter_read(&memcg->memory);
5802 if (!usage)
5803 return MEMCG_PROT_NONE;
5804
5805 emin = memcg->memory.min;
5806 elow = memcg->memory.low;
5807
5808 parent = parent_mem_cgroup(memcg);
5809
5810 if (!parent)
5811 return MEMCG_PROT_NONE;
5812
5813 if (parent == root)
5814 goto exit;
5815
5816 parent_emin = READ_ONCE(parent->memory.emin);
5817 emin = min(emin, parent_emin);
5818 if (emin && parent_emin) {
5819 unsigned long min_usage, siblings_min_usage;
5820
5821 min_usage = min(usage, memcg->memory.min);
5822 siblings_min_usage = atomic_long_read(
5823 &parent->memory.children_min_usage);
5824
5825 if (min_usage && siblings_min_usage)
5826 emin = min(emin, parent_emin * min_usage /
5827 siblings_min_usage);
5828 }
5829
5830 parent_elow = READ_ONCE(parent->memory.elow);
5831 elow = min(elow, parent_elow);
5832 if (elow && parent_elow) {
5833 unsigned long low_usage, siblings_low_usage;
5834
5835 low_usage = min(usage, memcg->memory.low);
5836 siblings_low_usage = atomic_long_read(
5837 &parent->memory.children_low_usage);
5838
5839 if (low_usage && siblings_low_usage)
5840 elow = min(elow, parent_elow * low_usage /
5841 siblings_low_usage);
5842 }
5843
5844exit:
5845 memcg->memory.emin = emin;
5846 memcg->memory.elow = elow;
5847
5848 if (usage <= emin)
5849 return MEMCG_PROT_MIN;
5850 else if (usage <= elow)
5851 return MEMCG_PROT_LOW;
5852 else
5853 return MEMCG_PROT_NONE;
5854}
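/*
 * A worked example of the effective-protection calculation above (all
 * numbers are hypothetical): parent P has elow = 2G and two children A and
 * B, each with memory.low = 3G.  If A currently uses 2G and B uses 1G:
 *
 *	low_usage(A) = min(2G, 3G) = 2G
 *	low_usage(B) = min(1G, 3G) = 1G
 *	siblings_low_usage = 2G + 1G = 3G
 *
 *	elow(A) = min(3G, 2G, 2G * 2G / 3G) ~= 1.33G
 *	elow(B) = min(3G, 2G, 2G * 1G / 3G) ~= 0.67G
 *
 * The parent's protection is split between the children in proportion to
 * the protected memory they actually use, and a child's effective low never
 * exceeds its own memory.low.  memory.min/emin follows the same scheme.
 */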
5855
/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge().  Or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
5874int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5875 gfp_t gfp_mask, struct mem_cgroup **memcgp,
5876 bool compound)
5877{
5878 struct mem_cgroup *memcg = NULL;
5879 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5880 int ret = 0;
5881
5882 if (mem_cgroup_disabled())
5883 goto out;
5884
5885 if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
5893 VM_BUG_ON_PAGE(!PageLocked(page), page);
5894 if (compound_head(page)->mem_cgroup)
5895 goto out;
5896
5897 if (do_swap_account) {
5898 swp_entry_t ent = { .val = page_private(page), };
5899 unsigned short id = lookup_swap_cgroup_id(ent);
5900
5901 rcu_read_lock();
5902 memcg = mem_cgroup_from_id(id);
5903 if (memcg && !css_tryget_online(&memcg->css))
5904 memcg = NULL;
5905 rcu_read_unlock();
5906 }
5907 }
5908
5909 if (!memcg)
5910 memcg = get_mem_cgroup_from_mm(mm);
5911
5912 ret = try_charge(memcg, gfp_mask, nr_pages);
5913
5914 css_put(&memcg->css);
5915out:
5916 *memcgp = memcg;
5917 return ret;
5918}
5919
5920int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
5921 gfp_t gfp_mask, struct mem_cgroup **memcgp,
5922 bool compound)
5923{
5924 struct mem_cgroup *memcg;
5925 int ret;
5926
5927 ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
5928 memcg = *memcgp;
5929 mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
5930 return ret;
5931}
5932
/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 * @compound: charge the page as compound or small page
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
5950void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5951 bool lrucare, bool compound)
5952{
5953 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5954
5955 VM_BUG_ON_PAGE(!page->mapping, page);
5956 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5957
5958 if (mem_cgroup_disabled())
5959 return;
5960
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
5965 if (!memcg)
5966 return;
5967
5968 commit_charge(page, memcg, lrucare);
5969
5970 local_irq_disable();
5971 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5972 memcg_check_events(memcg, page);
5973 local_irq_enable();
5974
5975 if (do_memsw_account() && PageSwapCache(page)) {
5976 swp_entry_t entry = { .val = page_private(page) };
5977
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
5982 mem_cgroup_uncharge_swap(entry, nr_pages);
5983 }
5984}
5985
/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @compound: charge the page as compound or small page
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
5994void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5995 bool compound)
5996{
5997 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5998
5999 if (mem_cgroup_disabled())
6000 return;
6001
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
6006 if (!memcg)
6007 return;
6008
6009 cancel_charge(memcg, nr_pages);
6010}
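/*
 * The three functions above form a two-phase charging protocol.  A typical
 * caller looks roughly like the sketch below (locals, labels and the
 * "everything_ok" condition are illustrative only):
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
 *		goto oom;
 *
 *	// ... install page->mapping / page table entries ...
 *
 *	if (everything_ok)
 *		mem_cgroup_commit_charge(page, memcg, false, false);
 *	else
 *		mem_cgroup_cancel_charge(page, memcg, false);
 *
 * try_charge reserves the pages against the memcg's limits, commit binds
 * the page to the memcg and updates statistics, and cancel simply returns
 * the reservation to the counters.
 */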
6011
6012struct uncharge_gather {
6013 struct mem_cgroup *memcg;
6014 unsigned long pgpgout;
6015 unsigned long nr_anon;
6016 unsigned long nr_file;
6017 unsigned long nr_kmem;
6018 unsigned long nr_huge;
6019 unsigned long nr_shmem;
6020 struct page *dummy_page;
6021};
6022
6023static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6024{
6025 memset(ug, 0, sizeof(*ug));
6026}
6027
6028static void uncharge_batch(const struct uncharge_gather *ug)
6029{
6030 unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
6031 unsigned long flags;
6032
6033 if (!mem_cgroup_is_root(ug->memcg)) {
6034 page_counter_uncharge(&ug->memcg->memory, nr_pages);
6035 if (do_memsw_account())
6036 page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6037 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6038 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6039 memcg_oom_recover(ug->memcg);
6040 }
6041
6042 local_irq_save(flags);
6043 __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6044 __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6045 __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6046 __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6047 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6048 __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6049 memcg_check_events(ug->memcg, ug->dummy_page);
6050 local_irq_restore(flags);
6051
6052 if (!mem_cgroup_is_root(ug->memcg))
6053 css_put_many(&ug->memcg->css, nr_pages);
6054}
6055
6056static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6057{
6058 VM_BUG_ON_PAGE(PageLRU(page), page);
6059 VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6060 !PageHWPoison(page) , page);
6061
6062 if (!page->mem_cgroup)
6063 return;
6064
	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point, we have fully
	 * exclusive access to the page.
	 */

6071 if (ug->memcg != page->mem_cgroup) {
6072 if (ug->memcg) {
6073 uncharge_batch(ug);
6074 uncharge_gather_clear(ug);
6075 }
6076 ug->memcg = page->mem_cgroup;
6077 }
6078
6079 if (!PageKmemcg(page)) {
6080 unsigned int nr_pages = 1;
6081
6082 if (PageTransHuge(page)) {
6083 nr_pages <<= compound_order(page);
6084 ug->nr_huge += nr_pages;
6085 }
6086 if (PageAnon(page))
6087 ug->nr_anon += nr_pages;
6088 else {
6089 ug->nr_file += nr_pages;
6090 if (PageSwapBacked(page))
6091 ug->nr_shmem += nr_pages;
6092 }
6093 ug->pgpgout++;
6094 } else {
6095 ug->nr_kmem += 1 << compound_order(page);
6096 __ClearPageKmemcg(page);
6097 }
6098
6099 ug->dummy_page = page;
6100 page->mem_cgroup = NULL;
6101}
6102
6103static void uncharge_list(struct list_head *page_list)
6104{
6105 struct uncharge_gather ug;
6106 struct list_head *next;
6107
6108 uncharge_gather_clear(&ug);
6109
	/*
	 * Note that the list can be a single page->lru; hence the
	 * do-while loop instead of a simple list_for_each_entry().
	 */
6114 next = page_list->next;
6115 do {
6116 struct page *page;
6117
6118 page = list_entry(next, struct page, lru);
6119 next = page->lru.next;
6120
6121 uncharge_page(page, &ug);
6122 } while (next != page_list);
6123
6124 if (ug.memcg)
6125 uncharge_batch(&ug);
6126}
6127
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
6135void mem_cgroup_uncharge(struct page *page)
6136{
6137 struct uncharge_gather ug;
6138
6139 if (mem_cgroup_disabled())
6140 return;
6141
	/* Don't touch page->lru of any random page, pre-check: */
6143 if (!page->mem_cgroup)
6144 return;
6145
6146 uncharge_gather_clear(&ug);
6147 uncharge_page(page, &ug);
6148 uncharge_batch(&ug);
6149}
6150
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
6158void mem_cgroup_uncharge_list(struct list_head *page_list)
6159{
6160 if (mem_cgroup_disabled())
6161 return;
6162
6163 if (!list_empty(page_list))
6164 uncharge_list(page_list);
6165}
6166
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6177void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6178{
6179 struct mem_cgroup *memcg;
6180 unsigned int nr_pages;
6181 bool compound;
6182 unsigned long flags;
6183
6184 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6185 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6186 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6187 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6188 newpage);
6189
6190 if (mem_cgroup_disabled())
6191 return;
6192
	/* Page cache replacement: new page already charged? */
6194 if (newpage->mem_cgroup)
6195 return;
6196
	/* Swapcache readahead pages can get replaced before being charged */
6198 memcg = oldpage->mem_cgroup;
6199 if (!memcg)
6200 return;
6201
	/* Force-charge the new page. The old one will be freed soon */
6203 compound = PageTransHuge(newpage);
6204 nr_pages = compound ? hpage_nr_pages(newpage) : 1;
6205
6206 page_counter_charge(&memcg->memory, nr_pages);
6207 if (do_memsw_account())
6208 page_counter_charge(&memcg->memsw, nr_pages);
6209 css_get_many(&memcg->css, nr_pages);
6210
6211 commit_charge(newpage, memcg, false);
6212
6213 local_irq_save(flags);
6214 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
6215 memcg_check_events(memcg, newpage);
6216 local_irq_restore(flags);
6217}
6218
6219DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6220EXPORT_SYMBOL(memcg_sockets_enabled_key);
6221
6222void mem_cgroup_sk_alloc(struct sock *sk)
6223{
6224 struct mem_cgroup *memcg;
6225
6226 if (!mem_cgroup_sockets_enabled)
6227 return;
6228
	/*
	 * Socket cloning can throw us here with sk_memcg already
	 * filled. It won't however, necessarily happen from
	 * process context. So the test for root memcg given
	 * the current task's memcg won't help us in this case.
	 *
	 * Respecting the original socket's memcg is a better
	 * decision in this case.
	 */
6238 if (sk->sk_memcg) {
6239 css_get(&sk->sk_memcg->css);
6240 return;
6241 }
6242
6243 rcu_read_lock();
6244 memcg = mem_cgroup_from_task(current);
6245 if (memcg == root_mem_cgroup)
6246 goto out;
6247 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6248 goto out;
6249 if (css_tryget_online(&memcg->css))
6250 sk->sk_memcg = memcg;
6251out:
6252 rcu_read_unlock();
6253}
6254
6255void mem_cgroup_sk_free(struct sock *sk)
6256{
6257 if (sk->sk_memcg)
6258 css_put(&sk->sk_memcg->css);
6259}
6260
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if the charge had to be forced.
 */
6269bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6270{
6271 gfp_t gfp_mask = GFP_KERNEL;
6272
6273 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6274 struct page_counter *fail;
6275
6276 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6277 memcg->tcpmem_pressure = 0;
6278 return true;
6279 }
6280 page_counter_charge(&memcg->tcpmem, nr_pages);
6281 memcg->tcpmem_pressure = 1;
6282 return false;
6283 }
6284
	/* Don't block in the packet receive path */
6286 if (in_softirq())
6287 gfp_mask = GFP_NOWAIT;
6288
6289 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6290
6291 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6292 return true;
6293
6294 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6295 return false;
6296}
6297
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
6303void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6304{
6305 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6306 page_counter_uncharge(&memcg->tcpmem, nr_pages);
6307 return;
6308 }
6309
6310 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6311
6312 refill_stock(memcg, nr_pages);
6313}
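/*
 * Sketch of how the two helpers above pair up in a (hypothetical) socket
 * buffer accounting path; "sk" and "nr_pages" are illustrative:
 *
 *	if (sk->sk_memcg && !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages)) {
 *		// Over limit: the charge was forced anyway, so the caller
 *		// should apply backpressure (enter memory pressure, shrink
 *		// buffers, etc.).
 *	}
 *
 *	// ... later, when the buffers are released:
 *	if (sk->sk_memcg)
 *		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
 *
 * Note that mem_cgroup_charge_skmem() never fails outright: on the default
 * hierarchy it force-charges with __GFP_NOFAIL and returns false, and on
 * the legacy hierarchy it force-charges the tcpmem counter, so the network
 * layer can throttle instead of losing accounting.
 */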
6314
6315static int __init cgroup_memory(char *s)
6316{
6317 char *token;
6318
6319 while ((token = strsep(&s, ",")) != NULL) {
6320 if (!*token)
6321 continue;
6322 if (!strcmp(token, "nosocket"))
6323 cgroup_memory_nosocket = true;
6324 if (!strcmp(token, "nokmem"))
6325 cgroup_memory_nokmem = true;
6326 }
6327 return 0;
6328}
6329__setup("cgroup.memory=", cgroup_memory);
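/*
 * Example (kernel command line):
 *
 *	cgroup.memory=nosocket,nokmem
 *
 * disables both socket memory accounting and kernel memory accounting for
 * the memory controller; either token can also be passed on its own.
 */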
6330
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
6339static int __init mem_cgroup_init(void)
6340{
6341 int cpu, node;
6342
6343#ifdef CONFIG_MEMCG_KMEM
	/*
	 * Kmem cache creation is mostly done with the slab_mutex held,
	 * so use a workqueue with limited concurrency to avoid stalling
	 * all worker threads in case lots of cgroups are created and
	 * destroyed simultaneously.
	 */
6350 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6351 BUG_ON(!memcg_kmem_cache_wq);
6352#endif
6353
6354 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6355 memcg_hotplug_cpu_dead);
6356
6357 for_each_possible_cpu(cpu)
6358 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6359 drain_local_stock);
6360
6361 for_each_node(node) {
6362 struct mem_cgroup_tree_per_node *rtpn;
6363
6364 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6365 node_online(node) ? node : NUMA_NO_NODE);
6366
6367 rtpn->rb_root = RB_ROOT;
6368 rtpn->rb_rightmost = NULL;
6369 spin_lock_init(&rtpn->lock);
6370 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6371 }
6372
6373 return 0;
6374}
6375subsys_initcall(mem_cgroup_init);
6376
6377#ifdef CONFIG_MEMCG_SWAP
6378static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6379{
6380 while (!atomic_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
6385 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
6386 VM_BUG_ON(1);
6387 break;
6388 }
6389 memcg = parent_mem_cgroup(memcg);
6390 if (!memcg)
6391 memcg = root_mem_cgroup;
6392 }
6393 return memcg;
6394}
6395
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
6403void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6404{
6405 struct mem_cgroup *memcg, *swap_memcg;
6406 unsigned int nr_entries;
6407 unsigned short oldid;
6408
6409 VM_BUG_ON_PAGE(PageLRU(page), page);
6410 VM_BUG_ON_PAGE(page_count(page), page);
6411
6412 if (!do_memsw_account())
6413 return;
6414
6415 memcg = page->mem_cgroup;
6416
	/* Readahead page, never charged */
6418 if (!memcg)
6419 return;
6420
	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor of the original memcg.
	 */
6426 swap_memcg = mem_cgroup_id_get_online(memcg);
6427 nr_entries = hpage_nr_pages(page);
	/* Get references for the tail pages, too */
6429 if (nr_entries > 1)
6430 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
6431 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
6432 nr_entries);
6433 VM_BUG_ON_PAGE(oldid, page);
6434 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
6435
6436 page->mem_cgroup = NULL;
6437
6438 if (!mem_cgroup_is_root(memcg))
6439 page_counter_uncharge(&memcg->memory, nr_entries);
6440
6441 if (memcg != swap_memcg) {
6442 if (!mem_cgroup_is_root(swap_memcg))
6443 page_counter_charge(&swap_memcg->memsw, nr_entries);
6444 page_counter_uncharge(&memcg->memsw, nr_entries);
6445 }
6446
	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock, which is taken with interrupts off.  That makes it
	 * safe to update the per-cpu statistics and check events without
	 * further irq protection; the VM_BUG_ON below documents the
	 * expectation.
	 */
6453 VM_BUG_ON(!irqs_disabled());
6454 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6455 -nr_entries);
6456 memcg_check_events(memcg, page);
6457
6458 if (!mem_cgroup_is_root(memcg))
6459 css_put_many(&memcg->css, nr_entries);
6460}
6461
/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
6471int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
6472{
6473 unsigned int nr_pages = hpage_nr_pages(page);
6474 struct page_counter *counter;
6475 struct mem_cgroup *memcg;
6476 unsigned short oldid;
6477
6478 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
6479 return 0;
6480
6481 memcg = page->mem_cgroup;
6482
	/* Readahead page, never charged */
6484 if (!memcg)
6485 return 0;
6486
6487 if (!entry.val) {
6488 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6489 return 0;
6490 }
6491
6492 memcg = mem_cgroup_id_get_online(memcg);
6493
6494 if (!mem_cgroup_is_root(memcg) &&
6495 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
6496 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
6497 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6498 mem_cgroup_id_put(memcg);
6499 return -ENOMEM;
6500 }
6501
	/* Get references for the tail pages, too */
6503 if (nr_pages > 1)
6504 mem_cgroup_id_get_many(memcg, nr_pages - 1);
6505 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
6506 VM_BUG_ON_PAGE(oldid, page);
6507 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
6508
6509 return 0;
6510}
6511
/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
6517void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
6518{
6519 struct mem_cgroup *memcg;
6520 unsigned short id;
6521
6522 if (!do_swap_account)
6523 return;
6524
6525 id = swap_cgroup_record(entry, 0, nr_pages);
6526 rcu_read_lock();
6527 memcg = mem_cgroup_from_id(id);
6528 if (memcg) {
6529 if (!mem_cgroup_is_root(memcg)) {
6530 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6531 page_counter_uncharge(&memcg->swap, nr_pages);
6532 else
6533 page_counter_uncharge(&memcg->memsw, nr_pages);
6534 }
6535 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
6536 mem_cgroup_id_put_many(memcg, nr_pages);
6537 }
6538 rcu_read_unlock();
6539}
6540
6541long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
6542{
6543 long nr_swap_pages = get_nr_swap_pages();
6544
6545 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
6546 return nr_swap_pages;
6547 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6548 nr_swap_pages = min_t(long, nr_swap_pages,
6549 READ_ONCE(memcg->swap.max) -
6550 page_counter_read(&memcg->swap));
6551 return nr_swap_pages;
6552}
6553
6554bool mem_cgroup_swap_full(struct page *page)
6555{
6556 struct mem_cgroup *memcg;
6557
6558 VM_BUG_ON_PAGE(!PageLocked(page), page);
6559
6560 if (vm_swap_full())
6561 return true;
6562 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
6563 return false;
6564
6565 memcg = page->mem_cgroup;
6566 if (!memcg)
6567 return false;
6568
6569 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6570 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
6571 return true;
6572
6573 return false;
6574}
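/*
 * Worked example for mem_cgroup_swap_full() above: with memory.swap.max set
 * to 1G (a hypothetical value), the page is reported as "swap full" once the
 * owning cgroup, or any of its ancestors, has at least 512M of swap charged,
 * i.e. at 50% utilization, mirroring the global vm_swap_full() heuristic.
 */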
6575
/* remembers the "swapaccount=" boot option */
6577#ifdef CONFIG_MEMCG_SWAP_ENABLED
6578static int really_do_swap_account __initdata = 1;
6579#else
6580static int really_do_swap_account __initdata;
6581#endif
6582
6583static int __init enable_swap_account(char *s)
6584{
6585 if (!strcmp(s, "1"))
6586 really_do_swap_account = 1;
6587 else if (!strcmp(s, "0"))
6588 really_do_swap_account = 0;
6589 return 1;
6590}
6591__setup("swapaccount=", enable_swap_account);
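/*
 * Example (kernel command line): "swapaccount=0" disables swap accounting
 * even when CONFIG_MEMCG_SWAP_ENABLED builds it in by default, while
 * "swapaccount=1" enables it on configurations where it defaults to off.
 */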
6592
6593static u64 swap_current_read(struct cgroup_subsys_state *css,
6594 struct cftype *cft)
6595{
6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597
6598 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
6599}
6600
6601static int swap_max_show(struct seq_file *m, void *v)
6602{
6603 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6604 unsigned long max = READ_ONCE(memcg->swap.max);
6605
6606 if (max == PAGE_COUNTER_MAX)
6607 seq_puts(m, "max\n");
6608 else
6609 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6610
6611 return 0;
6612}
6613
6614static ssize_t swap_max_write(struct kernfs_open_file *of,
6615 char *buf, size_t nbytes, loff_t off)
6616{
6617 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6618 unsigned long max;
6619 int err;
6620
6621 buf = strstrip(buf);
6622 err = page_counter_memparse(buf, "max", &max);
6623 if (err)
6624 return err;
6625
6626 xchg(&memcg->swap.max, max);
6627
6628 return nbytes;
6629}
6630
6631static int swap_events_show(struct seq_file *m, void *v)
6632{
6633 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6634
6635 seq_printf(m, "max %lu\n",
6636 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
6637 seq_printf(m, "fail %lu\n",
6638 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
6639
6640 return 0;
6641}
6642
6643static struct cftype swap_files[] = {
6644 {
6645 .name = "swap.current",
6646 .flags = CFTYPE_NOT_ON_ROOT,
6647 .read_u64 = swap_current_read,
6648 },
6649 {
6650 .name = "swap.max",
6651 .flags = CFTYPE_NOT_ON_ROOT,
6652 .seq_show = swap_max_show,
6653 .write = swap_max_write,
6654 },
6655 {
6656 .name = "swap.events",
6657 .flags = CFTYPE_NOT_ON_ROOT,
6658 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
6659 .seq_show = swap_events_show,
6660 },
6661 { }
6662};
6663
6664static struct cftype memsw_cgroup_files[] = {
6665 {
6666 .name = "memsw.usage_in_bytes",
6667 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6668 .read_u64 = mem_cgroup_read_u64,
6669 },
6670 {
6671 .name = "memsw.max_usage_in_bytes",
6672 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6673 .write = mem_cgroup_reset,
6674 .read_u64 = mem_cgroup_read_u64,
6675 },
6676 {
6677 .name = "memsw.limit_in_bytes",
6678 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6679 .write = mem_cgroup_write,
6680 .read_u64 = mem_cgroup_read_u64,
6681 },
6682 {
6683 .name = "memsw.failcnt",
6684 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6685 .write = mem_cgroup_reset,
6686 .read_u64 = mem_cgroup_read_u64,
6687 },
6688 { },
6689};
6690
6691static int __init mem_cgroup_swap_init(void)
6692{
6693 if (!mem_cgroup_disabled() && really_do_swap_account) {
6694 do_swap_account = 1;
6695 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6696 swap_files));
6697 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6698 memsw_cgroup_files));
6699 }
6700 return 0;
6701}
6702subsys_initcall(mem_cgroup_swap_init);
6703
6704#endif
6705