/*
 * memcontrol.c - Memory Controller
 *
 * Charges and limits the memory used by a cgroup of tasks: page cache and
 * anonymous pages, kernel memory (slab, stacks, percpu) and socket buffers.
 * This file implements charging and uncharging, per-memcg statistics, soft
 * limit reclaim, memory.high throttling, OOM handling and the cgroup
 * filesystem interface.
 */
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>
73
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap 1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 */
struct mem_cgroup_tree_per_node {
        struct rb_root rb_root;
        struct rb_node *rb_rightmost;
        spinlock_t lock;
};

struct mem_cgroup_tree {
        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
        struct list_head list;
        struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents an event which userspace wants to receive.
 */
struct mem_cgroup_event {
        /*
         * memcg which the event belongs to.
         */
        struct mem_cgroup *memcg;
        /*
         * eventfd to signal userspace about the event.
         */
        struct eventfd_ctx *eventfd;
        /*
         * Each of these is stored in a list by the cgroup.
         */
        struct list_head list;
        /*
         * register_event() callback will be used to add a new userspace
         * waiter for changes related to this event.  Use eventfd_signal()
         * on eventfd to send a notification to userspace.
         */
        int (*register_event)(struct mem_cgroup *memcg,
                              struct eventfd_ctx *eventfd, const char *args);
        /*
         * unregister_event() callback will be called when userspace closes
         * the eventfd or on cgroup removal.  This callback must be set if
         * you want to provide notification functionality.
         */
        void (*unregister_event)(struct mem_cgroup *memcg,
                                 struct eventfd_ctx *eventfd);
        /*
         * All fields below are needed to unregister the event when
         * userspace closes the eventfd.
         */
        poll_table pt;
        wait_queue_head_t *wqh;
        wait_queue_entry_t wait;
        struct work_struct remove;
};
170
171static void mem_cgroup_threshold(struct mem_cgroup *memcg);
172static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
        spinlock_t	  lock; /* for from, to */
        struct mm_struct  *mm;
        struct mem_cgroup *from;
        struct mem_cgroup *to;
        unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
        struct task_struct *moving_task;	/* a task moving charges */
        wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Upper bounds on reclaim iterations, so that soft limit reclaim cannot
 * loop forever if it never makes progress.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
        _MEM,
        _MEMSWAP,
        _OOM_TYPE,
        _KMEM,
        _TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

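/*
 * Charges are forced for tasks that are already dying (OOM victims, fatal
 * signal pending, or exiting), so they can make progress and release memory.
 */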
236static inline bool should_force_charge(void)
237{
238 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
239 (current->flags & PF_EXITING);
240}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
        if (!memcg)
                memcg = root_mem_cgroup;
        return &memcg->vmpressure;
}
249
250struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
251{
252 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
253}
254
255#ifdef CONFIG_MEMCG_KMEM
256extern spinlock_t css_set_lock;
257
258static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
259 unsigned int nr_pages);
260static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
261 unsigned int nr_pages);
262
263static void obj_cgroup_release(struct percpu_ref *ref)
264{
265 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
266 struct mem_cgroup *memcg;
267 unsigned int nr_bytes;
268 unsigned int nr_pages;
269 unsigned long flags;

        /*
         * At this point all allocated objects have been freed, so any charge
         * left on the objcg consists of sub-page remainders flushed back from
         * different CPUs' object stocks, which by now must add up to a whole
         * number of pages.  Return them to the memcg page counter below.
         */
        nr_bytes = atomic_read(&objcg->nr_charged_bytes);
        WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
        nr_pages = nr_bytes >> PAGE_SHIFT;
294
295 spin_lock_irqsave(&css_set_lock, flags);
296 memcg = obj_cgroup_memcg(objcg);
297 if (nr_pages)
298 __memcg_kmem_uncharge(memcg, nr_pages);
299 list_del(&objcg->list);
300 mem_cgroup_put(memcg);
301 spin_unlock_irqrestore(&css_set_lock, flags);
302
303 percpu_ref_exit(ref);
304 kfree_rcu(objcg, rcu);
305}
306
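/* Allocate an obj_cgroup and initialise its percpu reference counter. */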
307static struct obj_cgroup *obj_cgroup_alloc(void)
308{
309 struct obj_cgroup *objcg;
310 int ret;
311
312 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
313 if (!objcg)
314 return NULL;
315
316 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
317 GFP_KERNEL);
318 if (ret) {
319 kfree(objcg);
320 return NULL;
321 }
322 INIT_LIST_HEAD(&objcg->list);
323 return objcg;
324}
325
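/*
 * Reparent all obj_cgroups of a dying memcg to @parent, so that outstanding
 * object charges keep pointing at a live memory cgroup.
 */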
326static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
327 struct mem_cgroup *parent)
328{
329 struct obj_cgroup *objcg, *iter;
330
331 objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
332
333 spin_lock_irq(&css_set_lock);

        /* Move the active objcg to the parent. */
        xchg(&objcg->memcg, parent);
        css_get(&parent->css);
        list_add(&objcg->list, &parent->objcg_list);

        /* Move any objcgs that were already reparented to this memcg. */
        list_for_each_entry(iter, &memcg->objcg_list, list) {
342 css_get(&parent->css);
343 xchg(&iter->memcg, parent);
344 css_put(&memcg->css);
345 }
346 list_splice(&memcg->objcg_list, &parent->objcg_list);
347
348 spin_unlock_irq(&css_set_lock);
349
350 percpu_ref_kill(&objcg->refcnt);
351}
352

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using the cgroup id for this is that the cache id
 * space works better in sparse environments, where we have a lot of memcgs
 * but only a few of them are kmem-limited.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);
369
370void memcg_get_cache_ids(void)
371{
372 down_read(&memcg_cache_ids_sem);
373}
374
375void memcg_put_cache_ids(void)
376{
377 up_read(&memcg_cache_ids_sem);
378}
379

/*
 * MIN_SIZE is different from 1 because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. MAX_SIZE is bounded by the memcg ID space.
 */
#define MEMCG_CACHES_MIN_SIZE	4
#define MEMCG_CACHES_MAX_SIZE	MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional on this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif
404
405static int memcg_shrinker_map_size;
406static DEFINE_MUTEX(memcg_shrinker_map_mutex);
407
408static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
409{
410 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
411}
412
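/* Replace each node's shrinker bitmap with a larger one of @size bytes. */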
413static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
414 int size, int old_size)
415{
416 struct memcg_shrinker_map *new, *old;
417 int nid;
418
419 lockdep_assert_held(&memcg_shrinker_map_mutex);
420
421 for_each_node(nid) {
422 old = rcu_dereference_protected(
423 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
424
425 if (!old)
426 return 0;
427
428 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
429 if (!new)
430 return -ENOMEM;
431
                /* Set all old bits, clear all new bits */
                memset(new->map, (int)0xff, old_size);
                memset((void *)new->map + old_size, 0, size - old_size);
435
436 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
437 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
438 }
439
440 return 0;
441}
442
443static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
444{
445 struct mem_cgroup_per_node *pn;
446 struct memcg_shrinker_map *map;
447 int nid;
448
449 if (mem_cgroup_is_root(memcg))
450 return;
451
452 for_each_node(nid) {
453 pn = mem_cgroup_nodeinfo(memcg, nid);
454 map = rcu_dereference_protected(pn->shrinker_map, true);
455 kvfree(map);
456 rcu_assign_pointer(pn->shrinker_map, NULL);
457 }
458}
459
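/* Allocate per-node shrinker bitmaps for a newly created memcg. */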
460static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
461{
462 struct memcg_shrinker_map *map;
463 int nid, size, ret = 0;
464
465 if (mem_cgroup_is_root(memcg))
466 return 0;
467
468 mutex_lock(&memcg_shrinker_map_mutex);
469 size = memcg_shrinker_map_size;
470 for_each_node(nid) {
471 map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
472 if (!map) {
473 memcg_free_shrinker_maps(memcg);
474 ret = -ENOMEM;
475 break;
476 }
477 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
478 }
479 mutex_unlock(&memcg_shrinker_map_mutex);
480
481 return ret;
482}
483
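/* Grow every memcg's shrinker bitmaps so that they can hold bit @new_id. */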
484int memcg_expand_shrinker_maps(int new_id)
485{
486 int size, old_size, ret = 0;
487 struct mem_cgroup *memcg;
488
489 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
490 old_size = memcg_shrinker_map_size;
491 if (size <= old_size)
492 return 0;
493
494 mutex_lock(&memcg_shrinker_map_mutex);
495 if (!root_mem_cgroup)
496 goto unlock;
497
498 for_each_mem_cgroup(memcg) {
499 if (mem_cgroup_is_root(memcg))
500 continue;
501 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
502 if (ret) {
503 mem_cgroup_iter_break(NULL, memcg);
504 goto unlock;
505 }
506 }
507unlock:
508 if (!ret)
509 memcg_shrinker_map_size = size;
510 mutex_unlock(&memcg_shrinker_map_mutex);
511 return ret;
512}
513
514void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
515{
516 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
517 struct memcg_shrinker_map *map;
518
519 rcu_read_lock();
520 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
                /* Pairs with smp_mb() in shrink_slab() */
                smp_mb__before_atomic();
                set_bit(shrinker_id, map->map);
524 rcu_read_unlock();
525 }
526}
527
/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, the css of the memcg
 * associated with @page is returned.  The returned css remains associated
 * with @page until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
540{
541 struct mem_cgroup *memcg;
542
543 memcg = page_memcg(page);
544
545 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
546 memcg = root_mem_cgroup;
547
548 return &memcg->css;
549}
550
/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged
 * to and return its inode number, or 0 if @page is not charged to any cgroup.
 * It is safe to call this function without holding a reference to @page.
 *
 * Note that this function is inherently racy: nothing prevents the cgroup
 * from being offlined or released concurrently, so the returned inode number
 * may already be stale by the time the caller uses it.
 */
ino_t page_cgroup_ino(struct page *page)
565{
566 struct mem_cgroup *memcg;
567 unsigned long ino = 0;
568
569 rcu_read_lock();
570 memcg = page_memcg_check(page);
571
572 while (memcg && !(memcg->css.flags & CSS_ONLINE))
573 memcg = parent_mem_cgroup(memcg);
574 if (memcg)
575 ino = cgroup_ino(memcg->css.cgroup);
576 rcu_read_unlock();
577 return ino;
578}
579
580static struct mem_cgroup_per_node *
581mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
582{
583 int nid = page_to_nid(page);
584
585 return memcg->nodeinfo[nid];
586}
587
588static struct mem_cgroup_tree_per_node *
589soft_limit_tree_node(int nid)
590{
591 return soft_limit_tree.rb_tree_per_node[nid];
592}
593
594static struct mem_cgroup_tree_per_node *
595soft_limit_tree_from_page(struct page *page)
596{
597 int nid = page_to_nid(page);
598
599 return soft_limit_tree.rb_tree_per_node[nid];
600}
601
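/*
 * Insert @mz into the per-node soft-limit RB-tree, ordered by how far the
 * memcg's usage exceeds its soft limit.
 */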
602static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
603 struct mem_cgroup_tree_per_node *mctz,
604 unsigned long new_usage_in_excess)
605{
606 struct rb_node **p = &mctz->rb_root.rb_node;
607 struct rb_node *parent = NULL;
608 struct mem_cgroup_per_node *mz_node;
609 bool rightmost = true;
610
611 if (mz->on_tree)
612 return;
613
614 mz->usage_in_excess = new_usage_in_excess;
615 if (!mz->usage_in_excess)
616 return;
617 while (*p) {
618 parent = *p;
619 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
620 tree_node);
621 if (mz->usage_in_excess < mz_node->usage_in_excess) {
622 p = &(*p)->rb_left;
623 rightmost = false;
624 } else {
625 p = &(*p)->rb_right;
626 }
627 }
628
629 if (rightmost)
630 mctz->rb_rightmost = &mz->tree_node;
631
632 rb_link_node(&mz->tree_node, parent, p);
633 rb_insert_color(&mz->tree_node, &mctz->rb_root);
634 mz->on_tree = true;
635}
636
637static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
638 struct mem_cgroup_tree_per_node *mctz)
639{
640 if (!mz->on_tree)
641 return;
642
643 if (&mz->tree_node == mctz->rb_rightmost)
644 mctz->rb_rightmost = rb_prev(&mz->tree_node);
645
646 rb_erase(&mz->tree_node, &mctz->rb_root);
647 mz->on_tree = false;
648}
649
650static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
651 struct mem_cgroup_tree_per_node *mctz)
652{
653 unsigned long flags;
654
655 spin_lock_irqsave(&mctz->lock, flags);
656 __mem_cgroup_remove_exceeded(mz, mctz);
657 spin_unlock_irqrestore(&mctz->lock, flags);
658}
659
660static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
661{
662 unsigned long nr_pages = page_counter_read(&memcg->memory);
663 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
664 unsigned long excess = 0;
665
666 if (nr_pages > soft_limit)
667 excess = nr_pages - soft_limit;
668
669 return excess;
670}
671
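/*
 * Called when a memcg's usage may have changed: reposition @memcg and each
 * of its ancestors in the soft-limit tree of @page's node.
 */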
672static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
673{
674 unsigned long excess;
675 struct mem_cgroup_per_node *mz;
676 struct mem_cgroup_tree_per_node *mctz;
677
678 mctz = soft_limit_tree_from_page(page);
679 if (!mctz)
680 return;
681
682
683
684
685 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
686 mz = mem_cgroup_page_nodeinfo(memcg, page);
687 excess = soft_limit_excess(memcg);
688
689
690
691
692 if (excess || mz->on_tree) {
693 unsigned long flags;
694
695 spin_lock_irqsave(&mctz->lock, flags);
696
697 if (mz->on_tree)
698 __mem_cgroup_remove_exceeded(mz, mctz);
699
700
701
702
703 __mem_cgroup_insert_exceeded(mz, mctz, excess);
704 spin_unlock_irqrestore(&mctz->lock, flags);
705 }
706 }
707}
708
709static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
710{
711 struct mem_cgroup_tree_per_node *mctz;
712 struct mem_cgroup_per_node *mz;
713 int nid;
714
715 for_each_node(nid) {
716 mz = mem_cgroup_nodeinfo(memcg, nid);
717 mctz = soft_limit_tree_node(nid);
718 if (mctz)
719 mem_cgroup_remove_exceeded(mz, mctz);
720 }
721}
722
723static struct mem_cgroup_per_node *
724__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
725{
726 struct mem_cgroup_per_node *mz;
727
728retry:
729 mz = NULL;
730 if (!mctz->rb_rightmost)
731 goto done;
732
733 mz = rb_entry(mctz->rb_rightmost,
734 struct mem_cgroup_per_node, tree_node);
735
736
737
738
739
740 __mem_cgroup_remove_exceeded(mz, mctz);
741 if (!soft_limit_excess(mz->memcg) ||
742 !css_tryget(&mz->memcg->css))
743 goto retry;
744done:
745 return mz;
746}
747
748static struct mem_cgroup_per_node *
749mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
750{
751 struct mem_cgroup_per_node *mz;
752
753 spin_lock_irq(&mctz->lock);
754 mz = __mem_cgroup_largest_soft_limit_node(mctz);
755 spin_unlock_irq(&mctz->lock);
756 return mz;
757}
758
/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
766{
767 long x, threshold = MEMCG_CHARGE_BATCH;
768
769 if (mem_cgroup_disabled())
770 return;
771
772 if (memcg_stat_item_in_bytes(idx))
773 threshold <<= PAGE_SHIFT;
774
775 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
776 if (unlikely(abs(x) > threshold)) {
777 struct mem_cgroup *mi;
778
779
780
781
782
783 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
784 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
785 atomic_long_add(x, &mi->vmstats[idx]);
786 x = 0;
787 }
788 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
789}
790
791static struct mem_cgroup_per_node *
792parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
793{
794 struct mem_cgroup *parent;
795
796 parent = parent_mem_cgroup(pn->memcg);
797 if (!parent)
798 return NULL;
799 return mem_cgroup_nodeinfo(parent, nid);
800}
801
802void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
803 int val)
804{
805 struct mem_cgroup_per_node *pn;
806 struct mem_cgroup *memcg;
807 long x, threshold = MEMCG_CHARGE_BATCH;
808
809 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
810 memcg = pn->memcg;

        /* Update memcg */
        __mod_memcg_state(memcg, idx, val);

        /* Update lruvec */
        __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
817
818 if (vmstat_item_in_bytes(idx))
819 threshold <<= PAGE_SHIFT;
820
821 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
822 if (unlikely(abs(x) > threshold)) {
823 pg_data_t *pgdat = lruvec_pgdat(lruvec);
824 struct mem_cgroup_per_node *pi;
825
826 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
827 atomic_long_add(x, &pi->lruvec_stat[idx]);
828 x = 0;
829 }
830 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
831}
832
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three of the counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
        /* Update node */
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

        /* Update memcg and lruvec */
        if (!mem_cgroup_disabled())
                __mod_memcg_lruvec_state(lruvec, idx, val);
}
853
854void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
855 int val)
856{
857 struct page *head = compound_head(page);
858 struct mem_cgroup *memcg = page_memcg(head);
859 pg_data_t *pgdat = page_pgdat(page);
860 struct lruvec *lruvec;
861
862
863 if (!memcg) {
864 __mod_node_page_state(pgdat, idx, val);
865 return;
866 }
867
868 lruvec = mem_cgroup_lruvec(memcg, pgdat);
869 __mod_lruvec_state(lruvec, idx, val);
870}
871EXPORT_SYMBOL(__mod_lruvec_page_state);
872
873void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
874{
875 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
876 struct mem_cgroup *memcg;
877 struct lruvec *lruvec;
878
879 rcu_read_lock();
880 memcg = mem_cgroup_from_obj(p);
881
882
883
884
885
886
887
888 if (!memcg) {
889 __mod_node_page_state(pgdat, idx, val);
890 } else {
891 lruvec = mem_cgroup_lruvec(memcg, pgdat);
892 __mod_lruvec_state(lruvec, idx, val);
893 }
894 rcu_read_unlock();
895}
896
/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
904 unsigned long count)
905{
906 unsigned long x;
907
908 if (mem_cgroup_disabled())
909 return;
910
911 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
912 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
913 struct mem_cgroup *mi;
914
915
916
917
918
919 __this_cpu_add(memcg->vmstats_local->events[idx], x);
920 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
921 atomic_long_add(x, &mi->vmevents[idx]);
922 x = 0;
923 }
924 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
925}
926
927static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
928{
929 return atomic_long_read(&memcg->vmevents[event]);
930}
931
932static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
933{
934 long x = 0;
935 int cpu;
936
937 for_each_possible_cpu(cpu)
938 x += per_cpu(memcg->vmstats_local->events[event], cpu);
939 return x;
940}
941
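/*
 * Account a charge (@nr_pages > 0) or uncharge (@nr_pages < 0) in the
 * pgpgin/pgpgout event counters and the per-cpu page event counter.
 */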
942static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
943 struct page *page,
944 int nr_pages)
945{
946
947 if (nr_pages > 0)
948 __count_memcg_events(memcg, PGPGIN, 1);
949 else {
950 __count_memcg_events(memcg, PGPGOUT, 1);
951 nr_pages = -nr_pages;
952 }
953
954 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
955}
956
957static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
958 enum mem_cgroup_events_target target)
959{
960 unsigned long val, next;
961
962 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
963 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
964
965 if ((long)(next - val) < 0) {
966 switch (target) {
967 case MEM_CGROUP_TARGET_THRESH:
968 next = val + THRESHOLDS_EVENTS_TARGET;
969 break;
970 case MEM_CGROUP_TARGET_SOFTLIMIT:
971 next = val + SOFTLIMIT_EVENTS_TARGET;
972 break;
973 default:
974 break;
975 }
976 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
977 return true;
978 }
979 return false;
980}
981
/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_THRESH))) {
991 bool do_softlimit;
992
993 do_softlimit = mem_cgroup_event_ratelimit(memcg,
994 MEM_CGROUP_TARGET_SOFTLIMIT);
995 mem_cgroup_threshold(memcg);
996 if (unlikely(do_softlimit))
997 mem_cgroup_update_tree(memcg, page);
998 }
999}
1000
1001struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1002{
1003
1004
1005
1006
1007
1008 if (unlikely(!p))
1009 return NULL;
1010
1011 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1012}
1013EXPORT_SYMBOL(mem_cgroup_from_task);
1014
/**
 * get_mem_cgroup_from_mm: Obtain a reference on a given mm_struct's memcg.
 * @mm: mm from which the memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1024{
1025 struct mem_cgroup *memcg;
1026
1027 if (mem_cgroup_disabled())
1028 return NULL;
1029
1030 rcu_read_lock();
1031 do {
1032
1033
1034
1035
1036
1037 if (unlikely(!mm))
1038 memcg = root_mem_cgroup;
1039 else {
1040 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1041 if (unlikely(!memcg))
1042 memcg = root_mem_cgroup;
1043 }
1044 } while (!css_tryget(&memcg->css));
1045 rcu_read_unlock();
1046 return memcg;
1047}
1048EXPORT_SYMBOL(get_mem_cgroup_from_mm);
1049
1050static __always_inline struct mem_cgroup *active_memcg(void)
1051{
1052 if (in_interrupt())
1053 return this_cpu_read(int_active_memcg);
1054 else
1055 return current->active_memcg;
1056}
1057
1058static __always_inline struct mem_cgroup *get_active_memcg(void)
1059{
1060 struct mem_cgroup *memcg;
1061
1062 rcu_read_lock();
1063 memcg = active_memcg();
1064
1065 if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1066 memcg = root_mem_cgroup;
1067 rcu_read_unlock();
1068
1069 return memcg;
1070}
1071
1072static __always_inline bool memcg_kmem_bypass(void)
1073{
1074
1075 if (unlikely(active_memcg()))
1076 return false;
1077
1078
1079 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1080 return true;
1081
1082 return false;
1083}
1084
1085
1086
1087
1088static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
1089{
1090 if (memcg_kmem_bypass())
1091 return NULL;
1092
1093 if (unlikely(active_memcg()))
1094 return get_active_memcg();
1095
1096 return get_mem_cgroup_from_mm(current->mm);
1097}
1098
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1117 struct mem_cgroup *prev,
1118 struct mem_cgroup_reclaim_cookie *reclaim)
1119{
1120 struct mem_cgroup_reclaim_iter *iter;
1121 struct cgroup_subsys_state *css = NULL;
1122 struct mem_cgroup *memcg = NULL;
1123 struct mem_cgroup *pos = NULL;
1124
1125 if (mem_cgroup_disabled())
1126 return NULL;
1127
1128 if (!root)
1129 root = root_mem_cgroup;
1130
1131 if (prev && !reclaim)
1132 pos = prev;
1133
1134 rcu_read_lock();
1135
1136 if (reclaim) {
1137 struct mem_cgroup_per_node *mz;
1138
1139 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1140 iter = &mz->iter;
1141
1142 if (prev && reclaim->generation != iter->generation)
1143 goto out_unlock;
1144
1145 while (1) {
1146 pos = READ_ONCE(iter->position);
1147 if (!pos || css_tryget(&pos->css))
1148 break;
1149
1150
1151
1152
1153
1154
1155
1156
1157 (void)cmpxchg(&iter->position, pos, NULL);
1158 }
1159 }
1160
1161 if (pos)
1162 css = &pos->css;
1163
1164 for (;;) {
1165 css = css_next_descendant_pre(css, &root->css);
1166 if (!css) {
1167
1168
1169
1170
1171
1172
1173 if (!prev)
1174 continue;
1175 break;
1176 }
1177
1178
1179
1180
1181
1182
1183 memcg = mem_cgroup_from_css(css);
1184
1185 if (css == &root->css)
1186 break;
1187
1188 if (css_tryget(css))
1189 break;
1190
1191 memcg = NULL;
1192 }
1193
1194 if (reclaim) {
1195
1196
1197
1198
1199
1200 (void)cmpxchg(&iter->position, pos, memcg);
1201
1202 if (pos)
1203 css_put(&pos->css);
1204
1205 if (!memcg)
1206 iter->generation++;
1207 else if (!prev)
1208 reclaim->generation = iter->generation;
1209 }
1210
1211out_unlock:
1212 rcu_read_unlock();
1213 if (prev && prev != root)
1214 css_put(&prev->css);
1215
1216 return memcg;
1217}
1218
/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
1225 struct mem_cgroup *prev)
1226{
1227 if (!root)
1228 root = root_mem_cgroup;
1229 if (prev && prev != root)
1230 css_put(&prev->css);
1231}
1232
1233static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1234 struct mem_cgroup *dead_memcg)
1235{
1236 struct mem_cgroup_reclaim_iter *iter;
1237 struct mem_cgroup_per_node *mz;
1238 int nid;
1239
1240 for_each_node(nid) {
1241 mz = mem_cgroup_nodeinfo(from, nid);
1242 iter = &mz->iter;
1243 cmpxchg(&iter->position, dead_memcg, NULL);
1244 }
1245}
1246
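/*
 * Clear any cached reclaim-iterator positions that still point at
 * @dead_memcg, in all of its ancestors and, if necessary, the root.
 */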
1247static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1248{
1249 struct mem_cgroup *memcg = dead_memcg;
1250 struct mem_cgroup *last;
1251
1252 do {
1253 __invalidate_reclaim_iterators(memcg, dead_memcg);
1254 last = memcg;
1255 } while ((memcg = parent_mem_cgroup(memcg)));
1256
1257
1258
1259
1260
1261
1262
1263 if (last != root_mem_cgroup)
1264 __invalidate_reclaim_iterators(root_mem_cgroup,
1265 dead_memcg);
1266}
1267
/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1282 int (*fn)(struct task_struct *, void *), void *arg)
1283{
1284 struct mem_cgroup *iter;
1285 int ret = 0;
1286
1287 BUG_ON(memcg == root_mem_cgroup);
1288
1289 for_each_mem_cgroup_tree(iter, memcg) {
1290 struct css_task_iter it;
1291 struct task_struct *task;
1292
1293 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1294 while (!ret && (task = css_task_iter_next(&it)))
1295 ret = fn(task, arg);
1296 css_task_iter_end(&it);
1297 if (ret) {
1298 mem_cgroup_iter_break(memcg, iter);
1299 break;
1300 }
1301 }
1302 return ret;
1303}
1304
1305#ifdef CONFIG_DEBUG_VM
1306void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
1307{
1308 struct mem_cgroup *memcg;
1309
1310 if (mem_cgroup_disabled())
1311 return;
1312
1313 memcg = page_memcg(page);
1314
1315 if (!memcg)
1316 VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
1317 else
1318 VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
1319}
1320#endif
1321
/**
 * lock_page_lruvec - lock and return the lruvec for a given page.
 * @page: the page
 *
 * These functions are safe to use under any of the following conditions:
 * - page locked
 * - PageLRU cleared
 * - lock_page_memcg()
 * - page->_refcount is zero
 */
struct lruvec *lock_page_lruvec(struct page *page)
1333{
1334 struct lruvec *lruvec;
1335 struct pglist_data *pgdat = page_pgdat(page);
1336
1337 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1338 spin_lock(&lruvec->lru_lock);
1339
1340 lruvec_memcg_debug(lruvec, page);
1341
1342 return lruvec;
1343}
1344
1345struct lruvec *lock_page_lruvec_irq(struct page *page)
1346{
1347 struct lruvec *lruvec;
1348 struct pglist_data *pgdat = page_pgdat(page);
1349
1350 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1351 spin_lock_irq(&lruvec->lru_lock);
1352
1353 lruvec_memcg_debug(lruvec, page);
1354
1355 return lruvec;
1356}
1357
1358struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
1359{
1360 struct lruvec *lruvec;
1361 struct pglist_data *pgdat = page_pgdat(page);
1362
1363 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1364 spin_lock_irqsave(&lruvec->lru_lock, *flags);
1365
1366 lruvec_memcg_debug(lruvec, page);
1367
1368 return lruvec;
1369}
1370
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1383 int zid, int nr_pages)
1384{
1385 struct mem_cgroup_per_node *mz;
1386 unsigned long *lru_size;
1387 long size;
1388
1389 if (mem_cgroup_disabled())
1390 return;
1391
1392 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1393 lru_size = &mz->lru_zone_size[zid][lru];
1394
1395 if (nr_pages < 0)
1396 *lru_size += nr_pages;
1397
1398 size = *lru_size;
1399 if (WARN_ONCE(size < 0,
1400 "%s(%p, %d, %d): lru_size %ld\n",
1401 __func__, lruvec, lru, nr_pages, size)) {
1402 VM_BUG_ON(1);
1403 *lru_size = 0;
1404 }
1405
1406 if (nr_pages > 0)
1407 *lru_size += nr_pages;
1408}
1409
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1418{
1419 unsigned long margin = 0;
1420 unsigned long count;
1421 unsigned long limit;
1422
1423 count = page_counter_read(&memcg->memory);
1424 limit = READ_ONCE(memcg->memory.max);
1425 if (count < limit)
1426 margin = limit - count;
1427
1428 if (do_memsw_account()) {
1429 count = page_counter_read(&memcg->memsw);
1430 limit = READ_ONCE(memcg->memsw.max);
1431 if (count < limit)
1432 margin = min(margin, limit - count);
1433 else
1434 margin = 0;
1435 }
1436
1437 return margin;
1438}
1439
/*
 * Check whether @memcg is involved in an ongoing charge move, i.e. whether
 * it is mc.from, mc.to, or a descendant of either. Used for waiting out
 * the high memory pressure caused by a charge move.
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1448{
1449 struct mem_cgroup *from;
1450 struct mem_cgroup *to;
1451 bool ret = false;
1452
1453
1454
1455
1456 spin_lock(&mc.lock);
1457 from = mc.from;
1458 to = mc.to;
1459 if (!from)
1460 goto unlock;
1461
1462 ret = mem_cgroup_is_descendant(from, memcg) ||
1463 mem_cgroup_is_descendant(to, memcg);
1464unlock:
1465 spin_unlock(&mc.lock);
1466 return ret;
1467}
1468
1469static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1470{
1471 if (mc.moving_task && current != mc.moving_task) {
1472 if (mem_cgroup_under_move(memcg)) {
1473 DEFINE_WAIT(wait);
1474 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1475
1476 if (mc.moving_task)
1477 schedule();
1478 finish_wait(&mc.waitq, &wait);
1479 return true;
1480 }
1481 }
1482 return false;
1483}
1484
1485struct memory_stat {
1486 const char *name;
1487 unsigned int idx;
1488};
1489
1490static const struct memory_stat memory_stats[] = {
1491 { "anon", NR_ANON_MAPPED },
1492 { "file", NR_FILE_PAGES },
1493 { "kernel_stack", NR_KERNEL_STACK_KB },
1494 { "pagetables", NR_PAGETABLE },
1495 { "percpu", MEMCG_PERCPU_B },
1496 { "sock", MEMCG_SOCK },
1497 { "shmem", NR_SHMEM },
1498 { "file_mapped", NR_FILE_MAPPED },
1499 { "file_dirty", NR_FILE_DIRTY },
1500 { "file_writeback", NR_WRITEBACK },
1501#ifdef CONFIG_SWAP
1502 { "swapcached", NR_SWAPCACHE },
1503#endif
1504#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1505 { "anon_thp", NR_ANON_THPS },
1506 { "file_thp", NR_FILE_THPS },
1507 { "shmem_thp", NR_SHMEM_THPS },
1508#endif
1509 { "inactive_anon", NR_INACTIVE_ANON },
1510 { "active_anon", NR_ACTIVE_ANON },
1511 { "inactive_file", NR_INACTIVE_FILE },
1512 { "active_file", NR_ACTIVE_FILE },
1513 { "unevictable", NR_UNEVICTABLE },
1514 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
1515 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
1516
1517
1518 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
1519 { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
1520 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
1521 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
1522 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
1523 { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
1524 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
1525};

/* Translate the stat item into the correct unit for memory.stat output */
static int memcg_page_state_unit(int item)
1529{
1530 switch (item) {
1531 case MEMCG_PERCPU_B:
1532 case NR_SLAB_RECLAIMABLE_B:
1533 case NR_SLAB_UNRECLAIMABLE_B:
1534 case WORKINGSET_REFAULT_ANON:
1535 case WORKINGSET_REFAULT_FILE:
1536 case WORKINGSET_ACTIVATE_ANON:
1537 case WORKINGSET_ACTIVATE_FILE:
1538 case WORKINGSET_RESTORE_ANON:
1539 case WORKINGSET_RESTORE_FILE:
1540 case WORKINGSET_NODERECLAIM:
1541 return 1;
1542 case NR_KERNEL_STACK_KB:
1543 return SZ_1K;
1544 default:
1545 return PAGE_SIZE;
1546 }
1547}
1548
1549static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
1550 int item)
1551{
1552 return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
1553}
1554
1555static char *memory_stat_format(struct mem_cgroup *memcg)
1556{
1557 struct seq_buf s;
1558 int i;
1559
1560 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1561 if (!s.buffer)
1562 return NULL;
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1576 u64 size;
1577
1578 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1579 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1580
1581 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1582 size += memcg_page_state_output(memcg,
1583 NR_SLAB_RECLAIMABLE_B);
1584 seq_buf_printf(&s, "slab %llu\n", size);
1585 }
1586 }
1587
1588
1589
1590 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1591 memcg_events(memcg, PGFAULT));
1592 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1593 memcg_events(memcg, PGMAJFAULT));
1594 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1595 memcg_events(memcg, PGREFILL));
1596 seq_buf_printf(&s, "pgscan %lu\n",
1597 memcg_events(memcg, PGSCAN_KSWAPD) +
1598 memcg_events(memcg, PGSCAN_DIRECT));
1599 seq_buf_printf(&s, "pgsteal %lu\n",
1600 memcg_events(memcg, PGSTEAL_KSWAPD) +
1601 memcg_events(memcg, PGSTEAL_DIRECT));
1602 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1603 memcg_events(memcg, PGACTIVATE));
1604 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1605 memcg_events(memcg, PGDEACTIVATE));
1606 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1607 memcg_events(memcg, PGLAZYFREE));
1608 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1609 memcg_events(memcg, PGLAZYFREED));
1610
1611#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1612 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1613 memcg_events(memcg, THP_FAULT_ALLOC));
1614 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1615 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1616#endif

        /* The above should easily fit into one page */
        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1620
1621 return s.buffer;
1622}
1623
1624#define K(x) ((x) << (PAGE_SHIFT-10))
1625
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled.
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1635{
1636 rcu_read_lock();
1637
1638 if (memcg) {
1639 pr_cont(",oom_memcg=");
1640 pr_cont_cgroup_path(memcg->css.cgroup);
1641 } else
1642 pr_cont(",global_oom");
1643 if (p) {
1644 pr_cont(",task_memcg=");
1645 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1646 }
1647 rcu_read_unlock();
1648}
1649
/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1656{
1657 char *buf;
1658
1659 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1660 K((u64)page_counter_read(&memcg->memory)),
1661 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1662 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1663 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1664 K((u64)page_counter_read(&memcg->swap)),
1665 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1666 else {
1667 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1668 K((u64)page_counter_read(&memcg->memsw)),
1669 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1670 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1671 K((u64)page_counter_read(&memcg->kmem)),
1672 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1673 }
1674
1675 pr_info("Memory cgroup stats for ");
1676 pr_cont_cgroup_path(memcg->css.cgroup);
1677 pr_cont(":");
1678 buf = memory_stat_format(memcg);
1679 if (!buf)
1680 return;
1681 pr_info("%s", buf);
1682 kfree(buf);
1683}
1684
/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1689{
1690 unsigned long max = READ_ONCE(memcg->memory.max);
1691
1692 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1693 if (mem_cgroup_swappiness(memcg))
1694 max += min(READ_ONCE(memcg->swap.max),
1695 (unsigned long)total_swap_pages);
1696 } else {
1697 if (mem_cgroup_swappiness(memcg)) {
1698
1699 unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1700
1701 max += min(swap, (unsigned long)total_swap_pages);
1702 }
1703 }
1704 return max;
1705}
1706
1707unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1708{
1709 return page_counter_read(&memcg->memory);
1710}
1711
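/*
 * Invoke the OOM killer for @memcg. Returns true if a kill was attempted,
 * the charging task is already dying, or enough margin has been freed in
 * the meantime; false otherwise.
 */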
1712static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1713 int order)
1714{
1715 struct oom_control oc = {
1716 .zonelist = NULL,
1717 .nodemask = NULL,
1718 .memcg = memcg,
1719 .gfp_mask = gfp_mask,
1720 .order = order,
1721 };
1722 bool ret = true;
1723
1724 if (mutex_lock_killable(&oom_lock))
1725 return true;
1726
1727 if (mem_cgroup_margin(memcg) >= (1 << order))
1728 goto unlock;
1729
1730
1731
1732
1733
1734 ret = should_force_charge() || out_of_memory(&oc);
1735
1736unlock:
1737 mutex_unlock(&oom_lock);
1738 return ret;
1739}
1740
1741static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1742 pg_data_t *pgdat,
1743 gfp_t gfp_mask,
1744 unsigned long *total_scanned)
1745{
1746 struct mem_cgroup *victim = NULL;
1747 int total = 0;
1748 int loop = 0;
1749 unsigned long excess;
1750 unsigned long nr_scanned;
1751 struct mem_cgroup_reclaim_cookie reclaim = {
1752 .pgdat = pgdat,
1753 };
1754
1755 excess = soft_limit_excess(root_memcg);
1756
1757 while (1) {
1758 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1759 if (!victim) {
1760 loop++;
1761 if (loop >= 2) {
1762
1763
1764
1765
1766
1767 if (!total)
1768 break;
1769
1770
1771
1772
1773
1774
1775 if (total >= (excess >> 2) ||
1776 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1777 break;
1778 }
1779 continue;
1780 }
1781 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1782 pgdat, &nr_scanned);
1783 *total_scanned += nr_scanned;
1784 if (!soft_limit_excess(root_memcg))
1785 break;
1786 }
1787 mem_cgroup_iter_break(root_memcg, victim);
1788 return total;
1789}
1790
1791#ifdef CONFIG_LOCKDEP
1792static struct lockdep_map memcg_oom_lock_dep_map = {
1793 .name = "memcg_oom_lock",
1794};
1795#endif
1796
1797static DEFINE_SPINLOCK(memcg_oom_lock);
1798
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1804{
1805 struct mem_cgroup *iter, *failed = NULL;
1806
1807 spin_lock(&memcg_oom_lock);
1808
1809 for_each_mem_cgroup_tree(iter, memcg) {
1810 if (iter->oom_lock) {
1811
1812
1813
1814
1815 failed = iter;
1816 mem_cgroup_iter_break(memcg, iter);
1817 break;
1818 } else
1819 iter->oom_lock = true;
1820 }
1821
1822 if (failed) {
1823
1824
1825
1826
1827 for_each_mem_cgroup_tree(iter, memcg) {
1828 if (iter == failed) {
1829 mem_cgroup_iter_break(memcg, iter);
1830 break;
1831 }
1832 iter->oom_lock = false;
1833 }
1834 } else
1835 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1836
1837 spin_unlock(&memcg_oom_lock);
1838
1839 return !failed;
1840}
1841
1842static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1843{
1844 struct mem_cgroup *iter;
1845
1846 spin_lock(&memcg_oom_lock);
1847 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1848 for_each_mem_cgroup_tree(iter, memcg)
1849 iter->oom_lock = false;
1850 spin_unlock(&memcg_oom_lock);
1851}
1852
1853static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1854{
1855 struct mem_cgroup *iter;
1856
1857 spin_lock(&memcg_oom_lock);
1858 for_each_mem_cgroup_tree(iter, memcg)
1859 iter->under_oom++;
1860 spin_unlock(&memcg_oom_lock);
1861}
1862
1863static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1864{
1865 struct mem_cgroup *iter;
1866
1867
1868
1869
1870
1871 spin_lock(&memcg_oom_lock);
1872 for_each_mem_cgroup_tree(iter, memcg)
1873 if (iter->under_oom > 0)
1874 iter->under_oom--;
1875 spin_unlock(&memcg_oom_lock);
1876}
1877
1878static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1879
1880struct oom_wait_info {
1881 struct mem_cgroup *memcg;
1882 wait_queue_entry_t wait;
1883};
1884
1885static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1886 unsigned mode, int sync, void *arg)
1887{
1888 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1889 struct mem_cgroup *oom_wait_memcg;
1890 struct oom_wait_info *oom_wait_info;
1891
1892 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1893 oom_wait_memcg = oom_wait_info->memcg;
1894
1895 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1896 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1897 return 0;
1898 return autoremove_wake_function(wait, mode, sync, arg);
1899}
1900
1901static void memcg_oom_recover(struct mem_cgroup *memcg)
1902{
1903
1904
1905
1906
1907
1908
1909
1910
1911 if (memcg && memcg->under_oom)
1912 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1913}
1914
1915enum oom_status {
1916 OOM_SUCCESS,
1917 OOM_FAILED,
1918 OOM_ASYNC,
1919 OOM_SKIPPED
1920};
1921
1922static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1923{
1924 enum oom_status ret;
1925 bool locked;
1926
1927 if (order > PAGE_ALLOC_COSTLY_ORDER)
1928 return OOM_SKIPPED;
1929
1930 memcg_memory_event(memcg, MEMCG_OOM);
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950 if (memcg->oom_kill_disable) {
1951 if (!current->in_user_fault)
1952 return OOM_SKIPPED;
1953 css_get(&memcg->css);
1954 current->memcg_in_oom = memcg;
1955 current->memcg_oom_gfp_mask = mask;
1956 current->memcg_oom_order = order;
1957
1958 return OOM_ASYNC;
1959 }
1960
1961 mem_cgroup_mark_under_oom(memcg);
1962
1963 locked = mem_cgroup_oom_trylock(memcg);
1964
1965 if (locked)
1966 mem_cgroup_oom_notify(memcg);
1967
1968 mem_cgroup_unmark_under_oom(memcg);
1969 if (mem_cgroup_out_of_memory(memcg, mask, order))
1970 ret = OOM_SUCCESS;
1971 else
1972 ret = OOM_FAILED;
1973
1974 if (locked)
1975 mem_cgroup_oom_unlock(memcg);
1976
1977 return ret;
1978}
1979
/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea; instead we remember the OOM state
 * in the task, and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
1998{
1999 struct mem_cgroup *memcg = current->memcg_in_oom;
2000 struct oom_wait_info owait;
2001 bool locked;
2002
2003
2004 if (!memcg)
2005 return false;
2006
2007 if (!handle)
2008 goto cleanup;
2009
2010 owait.memcg = memcg;
2011 owait.wait.flags = 0;
2012 owait.wait.func = memcg_oom_wake_function;
2013 owait.wait.private = current;
2014 INIT_LIST_HEAD(&owait.wait.entry);
2015
2016 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2017 mem_cgroup_mark_under_oom(memcg);
2018
2019 locked = mem_cgroup_oom_trylock(memcg);
2020
2021 if (locked)
2022 mem_cgroup_oom_notify(memcg);
2023
2024 if (locked && !memcg->oom_kill_disable) {
2025 mem_cgroup_unmark_under_oom(memcg);
2026 finish_wait(&memcg_oom_waitq, &owait.wait);
2027 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
2028 current->memcg_oom_order);
2029 } else {
2030 schedule();
2031 mem_cgroup_unmark_under_oom(memcg);
2032 finish_wait(&memcg_oom_waitq, &owait.wait);
2033 }
2034
2035 if (locked) {
2036 mem_cgroup_oom_unlock(memcg);
2037
2038
2039
2040
2041
2042 memcg_oom_recover(memcg);
2043 }
2044cleanup:
2045 current->memcg_in_oom = NULL;
2046 css_put(&memcg->css);
2047 return true;
2048}
2049
/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2061 struct mem_cgroup *oom_domain)
2062{
2063 struct mem_cgroup *oom_group = NULL;
2064 struct mem_cgroup *memcg;
2065
2066 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2067 return NULL;
2068
2069 if (!oom_domain)
2070 oom_domain = root_mem_cgroup;
2071
2072 rcu_read_lock();
2073
2074 memcg = mem_cgroup_from_task(victim);
2075 if (memcg == root_mem_cgroup)
2076 goto out;
2077
2078
2079
2080
2081
2082
2083 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2084 goto out;
2085
2086
2087
2088
2089
2090
2091 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2092 if (memcg->oom_group)
2093 oom_group = memcg;
2094
2095 if (memcg == oom_domain)
2096 break;
2097 }
2098
2099 if (oom_group)
2100 css_get(&oom_group->css);
2101out:
2102 rcu_read_unlock();
2103
2104 return oom_group;
2105}
2106
2107void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2108{
2109 pr_info("Tasks in ");
2110 pr_cont_cgroup_path(memcg->css.cgroup);
2111 pr_cont(" are going to be killed due to memory.oom.group set\n");
2112}
2113
/**
 * lock_page_memcg - lock a page and memcg binding
 * @page: the page
 *
 * This function protects unlocked LRU pages from being moved to
 * another cgroup.
 *
 * It ensures the lifetime of the returned memcg. The caller is responsible
 * for the lifetime of the page; __unlock_page_memcg() is available
 * when @page might get freed inside the locked section.
 */
struct mem_cgroup *lock_page_memcg(struct page *page)
2126{
2127 struct page *head = compound_head(page);
2128 struct mem_cgroup *memcg;
2129 unsigned long flags;
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142 rcu_read_lock();
2143
2144 if (mem_cgroup_disabled())
2145 return NULL;
2146again:
2147 memcg = page_memcg(head);
2148 if (unlikely(!memcg))
2149 return NULL;
2150
2151#ifdef CONFIG_PROVE_LOCKING
2152 local_irq_save(flags);
2153 might_lock(&memcg->move_lock);
2154 local_irq_restore(flags);
2155#endif
2156
2157 if (atomic_read(&memcg->moving_account) <= 0)
2158 return memcg;
2159
2160 spin_lock_irqsave(&memcg->move_lock, flags);
2161 if (memcg != page_memcg(head)) {
2162 spin_unlock_irqrestore(&memcg->move_lock, flags);
2163 goto again;
2164 }
2165
2166
2167
2168
2169
2170
2171 memcg->move_lock_task = current;
2172 memcg->move_lock_flags = flags;
2173
2174 return memcg;
2175}
2176EXPORT_SYMBOL(lock_page_memcg);
2177
/**
 * __unlock_page_memcg - unlock and unpin a memcg
 * @memcg: the memcg
 *
 * Unlock and unpin a memcg returned by lock_page_memcg().
 */
void __unlock_page_memcg(struct mem_cgroup *memcg)
2185{
2186 if (memcg && memcg->move_lock_task == current) {
2187 unsigned long flags = memcg->move_lock_flags;
2188
2189 memcg->move_lock_task = NULL;
2190 memcg->move_lock_flags = 0;
2191
2192 spin_unlock_irqrestore(&memcg->move_lock, flags);
2193 }
2194
2195 rcu_read_unlock();
2196}
2197
/**
 * unlock_page_memcg - unlock a page and memcg binding
 * @page: the page
 */
void unlock_page_memcg(struct page *page)
2203{
2204 struct page *head = compound_head(page);
2205
2206 __unlock_page_memcg(page_memcg(head));
2207}
2208EXPORT_SYMBOL(unlock_page_memcg);
2209
2210struct memcg_stock_pcp {
2211 struct mem_cgroup *cached;
2212 unsigned int nr_pages;
2213
2214#ifdef CONFIG_MEMCG_KMEM
2215 struct obj_cgroup *cached_objcg;
2216 unsigned int nr_bytes;
2217#endif
2218
2219 struct work_struct work;
2220 unsigned long flags;
2221#define FLUSHING_CACHED_CHARGE 0
2222};
2223static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2224static DEFINE_MUTEX(percpu_charge_mutex);
2225
2226#ifdef CONFIG_MEMCG_KMEM
2227static void drain_obj_stock(struct memcg_stock_pcp *stock);
2228static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2229 struct mem_cgroup *root_memcg);
2230
2231#else
2232static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2233{
2234}
2235static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2236 struct mem_cgroup *root_memcg)
2237{
2238 return false;
2239}
2240#endif
2241
/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * Returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2254{
2255 struct memcg_stock_pcp *stock;
2256 unsigned long flags;
2257 bool ret = false;
2258
2259 if (nr_pages > MEMCG_CHARGE_BATCH)
2260 return ret;
2261
2262 local_irq_save(flags);
2263
2264 stock = this_cpu_ptr(&memcg_stock);
2265 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2266 stock->nr_pages -= nr_pages;
2267 ret = true;
2268 }
2269
2270 local_irq_restore(flags);
2271
2272 return ret;
2273}
2274
/*
 * Return stocked charges to the memcg's page counters and reset the
 * cached memcg.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
2279{
2280 struct mem_cgroup *old = stock->cached;
2281
2282 if (!old)
2283 return;
2284
2285 if (stock->nr_pages) {
2286 page_counter_uncharge(&old->memory, stock->nr_pages);
2287 if (do_memsw_account())
2288 page_counter_uncharge(&old->memsw, stock->nr_pages);
2289 stock->nr_pages = 0;
2290 }
2291
2292 css_put(&old->css);
2293 stock->cached = NULL;
2294}
2295
2296static void drain_local_stock(struct work_struct *dummy)
2297{
2298 struct memcg_stock_pcp *stock;
2299 unsigned long flags;
2300
2301
2302
2303
2304
2305 local_irq_save(flags);
2306
2307 stock = this_cpu_ptr(&memcg_stock);
2308 drain_obj_stock(stock);
2309 drain_stock(stock);
2310 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2311
2312 local_irq_restore(flags);
2313}
2314
/*
 * Cache @nr_pages worth of charge in the local per-cpu stock, to be
 * consumed by a later consume_stock() call.
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2320{
2321 struct memcg_stock_pcp *stock;
2322 unsigned long flags;
2323
2324 local_irq_save(flags);
2325
2326 stock = this_cpu_ptr(&memcg_stock);
2327 if (stock->cached != memcg) {
2328 drain_stock(stock);
2329 css_get(&memcg->css);
2330 stock->cached = memcg;
2331 }
2332 stock->nr_pages += nr_pages;
2333
2334 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2335 drain_stock(stock);
2336
2337 local_irq_restore(flags);
2338}
2339
/*
 * Drain all per-CPU charge caches for the given root_memcg and the
 * subtree of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
2345{
2346 int cpu, curcpu;
2347
2348
2349 if (!mutex_trylock(&percpu_charge_mutex))
2350 return;
2351
2352
2353
2354
2355
2356
2357 curcpu = get_cpu();
2358 for_each_online_cpu(cpu) {
2359 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2360 struct mem_cgroup *memcg;
2361 bool flush = false;
2362
2363 rcu_read_lock();
2364 memcg = stock->cached;
2365 if (memcg && stock->nr_pages &&
2366 mem_cgroup_is_descendant(memcg, root_memcg))
2367 flush = true;
2368 if (obj_stock_flush_required(stock, root_memcg))
2369 flush = true;
2370 rcu_read_unlock();
2371
2372 if (flush &&
2373 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2374 if (cpu == curcpu)
2375 drain_local_stock(&stock->work);
2376 else
2377 schedule_work_on(cpu, &stock->work);
2378 }
2379 }
2380 put_cpu();
2381 mutex_unlock(&percpu_charge_mutex);
2382}
2383
2384static int memcg_hotplug_cpu_dead(unsigned int cpu)
2385{
2386 struct memcg_stock_pcp *stock;
2387 struct mem_cgroup *memcg, *mi;
2388
2389 stock = &per_cpu(memcg_stock, cpu);
2390 drain_stock(stock);
2391
2392 for_each_mem_cgroup(memcg) {
2393 int i;
2394
2395 for (i = 0; i < MEMCG_NR_STAT; i++) {
2396 int nid;
2397 long x;
2398
2399 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2400 if (x)
2401 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2402 atomic_long_add(x, &memcg->vmstats[i]);
2403
2404 if (i >= NR_VM_NODE_STAT_ITEMS)
2405 continue;
2406
2407 for_each_node(nid) {
2408 struct mem_cgroup_per_node *pn;
2409
2410 pn = mem_cgroup_nodeinfo(memcg, nid);
2411 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2412 if (x)
2413 do {
2414 atomic_long_add(x, &pn->lruvec_stat[i]);
2415 } while ((pn = parent_nodeinfo(pn, nid)));
2416 }
2417 }
2418
2419 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2420 long x;
2421
2422 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2423 if (x)
2424 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2425 atomic_long_add(x, &memcg->vmevents[i]);
2426 }
2427 }
2428
2429 return 0;
2430}
2431
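/*
 * Reclaim @nr_pages from @memcg and from each ancestor that is above its
 * memory.high limit. Returns the number of pages reclaimed.
 */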
2432static unsigned long reclaim_high(struct mem_cgroup *memcg,
2433 unsigned int nr_pages,
2434 gfp_t gfp_mask)
2435{
2436 unsigned long nr_reclaimed = 0;
2437
2438 do {
2439 unsigned long pflags;
2440
2441 if (page_counter_read(&memcg->memory) <=
2442 READ_ONCE(memcg->memory.high))
2443 continue;
2444
2445 memcg_memory_event(memcg, MEMCG_HIGH);
2446
2447 psi_memstall_enter(&pflags);
2448 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2449 gfp_mask, true);
2450 psi_memstall_leave(&pflags);
2451 } while ((memcg = parent_mem_cgroup(memcg)) &&
2452 !mem_cgroup_is_root(memcg));
2453
2454 return nr_reclaimed;
2455}
2456
2457static void high_work_func(struct work_struct *work)
2458{
2459 struct mem_cgroup *memcg;
2460
2461 memcg = container_of(work, struct mem_cgroup, high_work);
2462 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2463}
2464
/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2471

/*
 * When calculating the delay, we use these either side of the exponentiation
 * to maintain precision and scale to a reasonable number of jiffies:
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the proposed
 *   penalty in order to reduce it to a reasonable number of jiffies, and to
 *   produce a reasonable delay curve.
 *
 * The resulting curve penalises cgroups only mildly when they first exceed
 * their high boundary, but slows them down increasingly steeply the further
 * they overrun it.
 */
#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14
2517
2518static u64 calculate_overage(unsigned long usage, unsigned long high)
2519{
2520 u64 overage;
2521
2522 if (usage <= high)
2523 return 0;
2524
2525
2526
2527
2528
2529 high = max(high, 1UL);
2530
2531 overage = usage - high;
2532 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2533 return div64_u64(overage, high);
2534}
2535
2536static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2537{
2538 u64 overage, max_overage = 0;
2539
2540 do {
2541 overage = calculate_overage(page_counter_read(&memcg->memory),
2542 READ_ONCE(memcg->memory.high));
2543 max_overage = max(overage, max_overage);
2544 } while ((memcg = parent_mem_cgroup(memcg)) &&
2545 !mem_cgroup_is_root(memcg));
2546
2547 return max_overage;
2548}
2549
2550static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2551{
2552 u64 overage, max_overage = 0;
2553
2554 do {
2555 overage = calculate_overage(page_counter_read(&memcg->swap),
2556 READ_ONCE(memcg->swap.high));
2557 if (overage)
2558 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2559 max_overage = max(overage, max_overage);
2560 } while ((memcg = parent_mem_cgroup(memcg)) &&
2561 !mem_cgroup_is_root(memcg));
2562
2563 return max_overage;
2564}
2565
/*
 * Get the number of jiffies that we should penalise a mischievous cgroup
 * which is exceeding its memory.high by checking both it and its ancestors.
 */
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2571 unsigned int nr_pages,
2572 u64 max_overage)
2573{
2574 unsigned long penalty_jiffies;
2575
2576 if (!max_overage)
2577 return 0;
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587 penalty_jiffies = max_overage * max_overage * HZ;
2588 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2589 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2600}
2601
/*
 * Reclaim memory over the high limit; scheduled by try_charge() to be
 * executed from the userland return path where it is safe to block.
 */
void mem_cgroup_handle_over_high(void)
2607{
2608 unsigned long penalty_jiffies;
2609 unsigned long pflags;
2610 unsigned long nr_reclaimed;
2611 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2612 int nr_retries = MAX_RECLAIM_RETRIES;
2613 struct mem_cgroup *memcg;
2614 bool in_retry = false;
2615
2616 if (likely(!nr_pages))
2617 return;
2618
2619 memcg = get_mem_cgroup_from_mm(current->mm);
2620 current->memcg_nr_pages_over_high = 0;
2621
2622retry_reclaim:
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632 nr_reclaimed = reclaim_high(memcg,
2633 in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2634 GFP_KERNEL);
2635
2636
2637
2638
2639
2640 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2641 mem_find_max_overage(memcg));
2642
2643 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2644 swap_find_max_overage(memcg));
2645
2646
2647
2648
2649
2650
2651 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2652
2653
2654
2655
2656
2657
2658
2659 if (penalty_jiffies <= HZ / 100)
2660 goto out;
2661
2662
2663
2664
2665
2666
2667 if (nr_reclaimed || nr_retries--) {
2668 in_retry = true;
2669 goto retry_reclaim;
2670 }
2671
2672
2673
2674
2675
2676
2677 psi_memstall_enter(&pflags);
2678 schedule_timeout_killable(penalty_jiffies);
2679 psi_memstall_leave(&pflags);
2680
2681out:
2682 css_put(&memcg->css);
2683}
2684
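/*
 * Charge @nr_pages to @memcg. May reclaim, drain per-cpu stocks, schedule
 * memory.high throttling and finally invoke the OOM killer. Returns 0 on
 * success or -ENOMEM.
 */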
2685static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2686 unsigned int nr_pages)
2687{
2688 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2689 int nr_retries = MAX_RECLAIM_RETRIES;
2690 struct mem_cgroup *mem_over_limit;
2691 struct page_counter *counter;
2692 enum oom_status oom_status;
2693 unsigned long nr_reclaimed;
2694 bool may_swap = true;
2695 bool drained = false;
2696 unsigned long pflags;
2697
2698 if (mem_cgroup_is_root(memcg))
2699 return 0;
2700retry:
2701 if (consume_stock(memcg, nr_pages))
2702 return 0;
2703
2704 if (!do_memsw_account() ||
2705 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2706 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2707 goto done_restock;
2708 if (do_memsw_account())
2709 page_counter_uncharge(&memcg->memsw, batch);
2710 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2711 } else {
2712 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2713 may_swap = false;
2714 }
2715
2716 if (batch > nr_pages) {
2717 batch = nr_pages;
2718 goto retry;
2719 }
2720
2721
2722
2723
2724
2725
2726
2727 if (gfp_mask & __GFP_ATOMIC)
2728 goto force;
2729
	/*
	 * Unlike the global OOM case, a memcg breach does not mean the
	 * machine is out of memory.  Let tasks that are dying or were
	 * selected by the OOM killer bypass the limit so they can exit
	 * quickly and release memory.
	 */
2736 if (unlikely(should_force_charge()))
2737 goto force;
2738
	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory themselves: let PF_MEMALLOC tasks exceed the
	 * limit temporarily instead of re-entering reclaim.
	 */
2745 if (unlikely(current->flags & PF_MEMALLOC))
2746 goto force;
2747
2748 if (unlikely(task_in_memcg_oom(current)))
2749 goto nomem;
2750
2751 if (!gfpflags_allow_blocking(gfp_mask))
2752 goto nomem;
2753
2754 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2755
2756 psi_memstall_enter(&pflags);
2757 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2758 gfp_mask, may_swap);
2759 psi_memstall_leave(&pflags);
2760
2761 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2762 goto retry;
2763
2764 if (!drained) {
2765 drain_all_stock(mem_over_limit);
2766 drained = true;
2767 goto retry;
2768 }
2769
2770 if (gfp_mask & __GFP_NORETRY)
2771 goto nomem;
2772
	/*
	 * Even though we are still over the limit at this point, reclaim
	 * did free some pages; retry the charge for order-0 and other
	 * cheap (<= PAGE_ALLOC_COSTLY_ORDER) requests before giving up.
	 */
2781 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2782 goto retry;
2783
2784
2785
2786
2787 if (mem_cgroup_wait_acct_move(mem_over_limit))
2788 goto retry;
2789
2790 if (nr_retries--)
2791 goto retry;
2792
2793 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2794 goto nomem;
2795
2796 if (gfp_mask & __GFP_NOFAIL)
2797 goto force;
2798
2799 if (fatal_signal_pending(current))
2800 goto force;
2801
	/*
	 * As a last resort, invoke the memcg OOM killer and keep retrying
	 * for as long as it is able to make progress.
	 */
2807 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2808 get_order(nr_pages * PAGE_SIZE));
2809 switch (oom_status) {
2810 case OOM_SUCCESS:
2811 nr_retries = MAX_RECLAIM_RETRIES;
2812 goto retry;
2813 case OOM_FAILED:
2814 goto force;
2815 default:
2816 goto nomem;
2817 }
2818nomem:
2819 if (!(gfp_mask & __GFP_NOFAIL))
2820 return -ENOMEM;
2821force:
	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed soon.  Charge past the limit instead of failing.
	 */
2827 page_counter_charge(&memcg->memory, nr_pages);
2828 if (do_memsw_account())
2829 page_counter_charge(&memcg->memsw, nr_pages);
2830
2831 return 0;
2832
2833done_restock:
2834 if (batch > nr_pages)
2835 refill_stock(memcg, batch - nr_pages);
2836
	/*
	 * If the hierarchy is now above its high boundary, schedule
	 * reclaim: from a workqueue if we are in interrupt context, or on
	 * the way back to userspace otherwise, where the task can also be
	 * throttled (see mem_cgroup_handle_over_high()).
	 */
2846 do {
2847 bool mem_high, swap_high;
2848
2849 mem_high = page_counter_read(&memcg->memory) >
2850 READ_ONCE(memcg->memory.high);
2851 swap_high = page_counter_read(&memcg->swap) >
2852 READ_ONCE(memcg->swap.high);
2853
2854
2855 if (in_interrupt()) {
2856 if (mem_high) {
2857 schedule_work(&memcg->high_work);
2858 break;
2859 }
2860 continue;
2861 }
2862
2863 if (mem_high || swap_high) {
			/*
			 * Record how far over the high boundary this charge
			 * took us and have the task reclaim (and possibly get
			 * throttled) on its way back to userspace.
			 */
2873 current->memcg_nr_pages_over_high += batch;
2874 set_notify_resume(current);
2875 break;
2876 }
2877 } while ((memcg = parent_mem_cgroup(memcg)));
2878
2879 return 0;
2880}
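
/*
 * A minimal sketch (illustrative only, not a real caller in this file) of
 * how the primitives above are typically paired by a page charge path:
 * try_charge() reserves the pages, commit_charge() binds the page to the
 * memcg, and cancel_charge() below backs out a reservation that could not
 * be committed.  "example_charge" is a hypothetical name.
 *
 *	static int example_charge(struct page *page, struct mem_cgroup *memcg,
 *				  gfp_t gfp, unsigned int nr_pages)
 *	{
 *		int ret = try_charge(memcg, gfp, nr_pages);
 *
 *		if (ret)
 *			return ret;
 *
 *		css_get(&memcg->css);		(the page now pins a css ref)
 *		commit_charge(page, memcg);	(sets page->memcg_data)
 *		return 0;
 *	}
 */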
2881
2882#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2883static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2884{
2885 if (mem_cgroup_is_root(memcg))
2886 return;
2887
2888 page_counter_uncharge(&memcg->memory, nr_pages);
2889 if (do_memsw_account())
2890 page_counter_uncharge(&memcg->memsw, nr_pages);
2891}
2892#endif
2893
2894static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2895{
2896 VM_BUG_ON_PAGE(page_memcg(page), page);
2897
	/*
	 * page->memcg_data is only stable while one of the following holds:
	 * the page lock, LRU isolation, lock_page_memcg(), or an exclusive
	 * reference to a freshly allocated page -- which is the case here,
	 * so a plain store is sufficient.
	 */
2905 page->memcg_data = (unsigned long)memcg;
2906}
2907
2908#ifdef CONFIG_MEMCG_KMEM
2909int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2910 gfp_t gfp, bool new_page)
2911{
2912 unsigned int objects = objs_per_slab_page(s, page);
2913 unsigned long memcg_data;
2914 void *vec;
2915
2916 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2917 page_to_nid(page));
2918 if (!vec)
2919 return -ENOMEM;
2920
2921 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2922 if (new_page) {
2923
2924
2925
2926
2927
2928 page->memcg_data = memcg_data;
2929 } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
2930
2931
2932
2933
2934
2935 kfree(vec);
2936 return 0;
2937 }
2938
2939 kmemleak_not_leak(vec);
2940 return 0;
2941}
2942
/*
 * Returns the memory cgroup a kernel object is charged to, or NULL if the
 * object is not charged.  The object must be either a slab object or a
 * generic (non-slab) kernel page.
 *
 * No reference is taken on the returned memcg: the caller is responsible
 * for keeping it alive, e.g. by holding rcu_read_lock().
 */
2955struct mem_cgroup *mem_cgroup_from_obj(void *p)
2956{
2957 struct page *page;
2958
2959 if (mem_cgroup_disabled())
2960 return NULL;
2961
2962 page = virt_to_head_page(p);

	/*
	 * Slab objects are accounted individually, not per page: the
	 * obj_cgroup vector attached to the slab page records which objcg
	 * each object belongs to.
	 */
2969 if (page_objcgs_check(page)) {
2970 struct obj_cgroup *objcg;
2971 unsigned int off;
2972
2973 off = obj_to_index(page->slab_cache, page, p);
2974 objcg = page_objcgs(page)[off];
2975 if (objcg)
2976 return obj_cgroup_memcg(objcg);
2977
2978 return NULL;
2979 }
2980
	/*
	 * Not a slab page with an objcg vector, so this is a kernel page
	 * charged to a memcg directly; page_memcg_check() copes with the
	 * various MEMCG_DATA_* encodings in page->memcg_data.
	 */
2988 return page_memcg_check(page);
2989}
2990
2991__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2992{
2993 struct obj_cgroup *objcg = NULL;
2994 struct mem_cgroup *memcg;
2995
2996 if (memcg_kmem_bypass())
2997 return NULL;
2998
2999 rcu_read_lock();
3000 if (unlikely(active_memcg()))
3001 memcg = active_memcg();
3002 else
3003 memcg = mem_cgroup_from_task(current);
3004
3005 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3006 objcg = rcu_dereference(memcg->objcg);
3007 if (objcg && obj_cgroup_tryget(objcg))
3008 break;
3009 objcg = NULL;
3010 }
3011 rcu_read_unlock();
3012
3013 return objcg;
3014}
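
/*
 * A hedged usage sketch (not part of the original file): a byte-sized
 * kernel allocation would typically pair the helpers in this block as
 * follows; any fallback policy on charge failure is the caller's choice.
 *
 *	objcg = get_obj_cgroup_from_current();
 *	if (objcg && obj_cgroup_charge(objcg, gfp, size)) {
 *		obj_cgroup_put(objcg);
 *		objcg = NULL;			(charge failed)
 *	}
 *	...
 *	if (objcg) {
 *		obj_cgroup_uncharge(objcg, size);
 *		obj_cgroup_put(objcg);
 *	}
 */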
3015
3016static int memcg_alloc_cache_id(void)
3017{
3018 int id, size;
3019 int err;
3020
3021 id = ida_simple_get(&memcg_cache_ida,
3022 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3023 if (id < 0)
3024 return id;
3025
3026 if (id < memcg_nr_cache_ids)
3027 return id;
3028
3029
3030
3031
3032
3033 down_write(&memcg_cache_ids_sem);
3034
3035 size = 2 * (id + 1);
3036 if (size < MEMCG_CACHES_MIN_SIZE)
3037 size = MEMCG_CACHES_MIN_SIZE;
3038 else if (size > MEMCG_CACHES_MAX_SIZE)
3039 size = MEMCG_CACHES_MAX_SIZE;
3040
3041 err = memcg_update_all_list_lrus(size);
3042 if (!err)
3043 memcg_nr_cache_ids = size;
3044
3045 up_write(&memcg_cache_ids_sem);
3046
3047 if (err) {
3048 ida_simple_remove(&memcg_cache_ida, id);
3049 return err;
3050 }
3051 return id;
3052}
3053
3054static void memcg_free_cache_id(int id)
3055{
3056 ida_simple_remove(&memcg_cache_ida, id);
3057}
3058
/*
 * __memcg_kmem_charge: charge a number of kernel pages to a memcg
 * @memcg: memory cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */
3067static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3068 unsigned int nr_pages)
3069{
3070 struct page_counter *counter;
3071 int ret;
3072
3073 ret = try_charge(memcg, gfp, nr_pages);
3074 if (ret)
3075 return ret;
3076
3077 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
3078 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
		/*
		 * Enforce __GFP_NOFAIL allocation because callers are not
		 * prepared to see failures and likely do not have any
		 * fallback in place.
		 */
3085 if (gfp & __GFP_NOFAIL) {
3086 page_counter_charge(&memcg->kmem, nr_pages);
3087 return 0;
3088 }
3089 cancel_charge(memcg, nr_pages);
3090 return -ENOMEM;
3091 }
3092 return 0;
3093}
3094
/*
 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
3100static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3101{
3102 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3103 page_counter_uncharge(&memcg->kmem, nr_pages);
3104
3105 refill_stock(memcg, nr_pages);
3106}
3107
/*
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
3116int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3117{
3118 struct mem_cgroup *memcg;
3119 int ret = 0;
3120
3121 memcg = get_mem_cgroup_from_current();
3122 if (memcg && !mem_cgroup_is_root(memcg)) {
3123 ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3124 if (!ret) {
3125 page->memcg_data = (unsigned long)memcg |
3126 MEMCG_DATA_KMEM;
3127 return 0;
3128 }
3129 css_put(&memcg->css);
3130 }
3131 return ret;
3132}
3133
/*
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
3139void __memcg_kmem_uncharge_page(struct page *page, int order)
3140{
3141 struct mem_cgroup *memcg = page_memcg(page);
3142 unsigned int nr_pages = 1 << order;
3143
3144 if (!memcg)
3145 return;
3146
3147 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3148 __memcg_kmem_uncharge(memcg, nr_pages);
3149 page->memcg_data = 0;
3150 css_put(&memcg->css);
3151}
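
/*
 * Illustrative pairing (sketch only, not from the original file): the page
 * allocator's __GFP_ACCOUNT handling effectively does
 *
 *	if (__memcg_kmem_charge_page(page, gfp, order))
 *		(free the page and fail the allocation)
 *
 * at allocation time and
 *
 *	__memcg_kmem_uncharge_page(page, order);
 *
 * at free time; the page carries the memcg (with MEMCG_DATA_KMEM set) in
 * page->memcg_data in between.
 */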
3152
3153static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3154{
3155 struct memcg_stock_pcp *stock;
3156 unsigned long flags;
3157 bool ret = false;
3158
3159 local_irq_save(flags);
3160
3161 stock = this_cpu_ptr(&memcg_stock);
3162 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3163 stock->nr_bytes -= nr_bytes;
3164 ret = true;
3165 }
3166
3167 local_irq_restore(flags);
3168
3169 return ret;
3170}
3171
3172static void drain_obj_stock(struct memcg_stock_pcp *stock)
3173{
3174 struct obj_cgroup *old = stock->cached_objcg;
3175
3176 if (!old)
3177 return;
3178
3179 if (stock->nr_bytes) {
3180 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3181 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3182
3183 if (nr_pages) {
3184 rcu_read_lock();
3185 __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
3186 rcu_read_unlock();
3187 }

		/*
		 * The leftover sub-page remainder cannot be uncharged as
		 * whole pages, so flush it to the atomic objcg counter where
		 * a later refill_obj_stock() can pick it up again.
		 */
3199 atomic_add(nr_bytes, &old->nr_charged_bytes);
3200 stock->nr_bytes = 0;
3201 }
3202
3203 obj_cgroup_put(old);
3204 stock->cached_objcg = NULL;
3205}
3206
3207static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3208 struct mem_cgroup *root_memcg)
3209{
3210 struct mem_cgroup *memcg;
3211
3212 if (stock->cached_objcg) {
3213 memcg = obj_cgroup_memcg(stock->cached_objcg);
3214 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3215 return true;
3216 }
3217
3218 return false;
3219}
3220
3221static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3222{
3223 struct memcg_stock_pcp *stock;
3224 unsigned long flags;
3225
3226 local_irq_save(flags);
3227
3228 stock = this_cpu_ptr(&memcg_stock);
3229 if (stock->cached_objcg != objcg) {
3230 drain_obj_stock(stock);
3231 obj_cgroup_get(objcg);
3232 stock->cached_objcg = objcg;
3233 stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3234 }
3235 stock->nr_bytes += nr_bytes;
3236
3237 if (stock->nr_bytes > PAGE_SIZE)
3238 drain_obj_stock(stock);
3239
3240 local_irq_restore(flags);
3241}
3242
3243int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3244{
3245 struct mem_cgroup *memcg;
3246 unsigned int nr_pages, nr_bytes;
3247 int ret;
3248
3249 if (consume_obj_stock(objcg, size))
3250 return 0;
3251
	/*
	 * objcg->memcg can change under us due to reparenting, so pin the
	 * current memcg with css_tryget() under RCU (retrying if the
	 * tryget races with an offline) before charging it.
	 */
3262 rcu_read_lock();
3263retry:
3264 memcg = obj_cgroup_memcg(objcg);
3265 if (unlikely(!css_tryget(&memcg->css)))
3266 goto retry;
3267 rcu_read_unlock();
3268
3269 nr_pages = size >> PAGE_SHIFT;
3270 nr_bytes = size & (PAGE_SIZE - 1);
3271
3272 if (nr_bytes)
3273 nr_pages += 1;
3274
3275 ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3276 if (!ret && nr_bytes)
3277 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3278
3279 css_put(&memcg->css);
3280 return ret;
3281}
3282
3283void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3284{
3285 refill_obj_stock(objcg, size);
3286}
3287
3288#endif
3289
/*
 * Because page_memcg(head) is not set on compound tails, set it now and
 * take the corresponding css references.
 */
3293void split_page_memcg(struct page *head, unsigned int nr)
3294{
3295 struct mem_cgroup *memcg = page_memcg(head);
3296 int i;
3297
3298 if (mem_cgroup_disabled() || !memcg)
3299 return;
3300
3301 for (i = 1; i < nr; i++)
3302 head[i].memcg_data = head->memcg_data;
3303 css_get_many(&memcg->css, nr - 1);
3304}
3305
3306#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the
 * same as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, i.e. called page_counter_charge()
 * for both memory and memsw, and taken a css reference.
 */
3321static int mem_cgroup_move_swap_account(swp_entry_t entry,
3322 struct mem_cgroup *from, struct mem_cgroup *to)
3323{
3324 unsigned short old_id, new_id;
3325
3326 old_id = mem_cgroup_id(from);
3327 new_id = mem_cgroup_id(to);
3328
3329 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3330 mod_memcg_state(from, MEMCG_SWAP, -1);
3331 mod_memcg_state(to, MEMCG_SWAP, 1);
3332 return 0;
3333 }
3334 return -EINVAL;
3335}
3336#else
3337static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3338 struct mem_cgroup *from, struct mem_cgroup *to)
3339{
3340 return -EINVAL;
3341}
3342#endif
3343
3344static DEFINE_MUTEX(memcg_max_mutex);
3345
3346static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3347 unsigned long max, bool memsw)
3348{
3349 bool enlarge = false;
3350 bool drained = false;
3351 int ret;
3352 bool limits_invariant;
3353 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3354
3355 do {
3356 if (signal_pending(current)) {
3357 ret = -EINTR;
3358 break;
3359 }
3360
3361 mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
3366 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3367 max <= memcg->memsw.max;
3368 if (!limits_invariant) {
3369 mutex_unlock(&memcg_max_mutex);
3370 ret = -EINVAL;
3371 break;
3372 }
3373 if (max > counter->max)
3374 enlarge = true;
3375 ret = page_counter_set_max(counter, max);
3376 mutex_unlock(&memcg_max_mutex);
3377
3378 if (!ret)
3379 break;
3380
3381 if (!drained) {
3382 drain_all_stock(memcg);
3383 drained = true;
3384 continue;
3385 }
3386
3387 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3388 GFP_KERNEL, !memsw)) {
3389 ret = -EBUSY;
3390 break;
3391 }
3392 } while (true);
3393
3394 if (!ret && enlarge)
3395 memcg_oom_recover(memcg);
3396
3397 return ret;
3398}
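
/*
 * Usage note (added, hedged): this is the backend for writes to the
 * cgroup1 limit files -- memory.limit_in_bytes (memsw == false) and, on
 * kernels providing it, memory.memsw.limit_in_bytes (memsw == true) --
 * reached via mem_cgroup_write() further below.  It keeps reclaiming until
 * usage fits under the new limit or gives up with -EBUSY.
 */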
3399
3400unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3401 gfp_t gfp_mask,
3402 unsigned long *total_scanned)
3403{
3404 unsigned long nr_reclaimed = 0;
3405 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3406 unsigned long reclaimed;
3407 int loop = 0;
3408 struct mem_cgroup_tree_per_node *mctz;
3409 unsigned long excess;
3410 unsigned long nr_scanned;
3411
3412 if (order > 0)
3413 return 0;
3414
3415 mctz = soft_limit_tree_node(pgdat->node_id);
3416
3417
3418
3419
3420
3421
3422 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3423 return 0;
3424
3425
3426
3427
3428
3429
3430 do {
3431 if (next_mz)
3432 mz = next_mz;
3433 else
3434 mz = mem_cgroup_largest_soft_limit_node(mctz);
3435 if (!mz)
3436 break;
3437
3438 nr_scanned = 0;
3439 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3440 gfp_mask, &nr_scanned);
3441 nr_reclaimed += reclaimed;
3442 *total_scanned += nr_scanned;
3443 spin_lock_irq(&mctz->lock);
3444 __mem_cgroup_remove_exceeded(mz, mctz);
3445
3446
3447
3448
3449
3450 next_mz = NULL;
3451 if (!reclaimed)
3452 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3453
3454 excess = soft_limit_excess(mz->memcg);
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3465 spin_unlock_irq(&mctz->lock);
3466 css_put(&mz->memcg->css);
3467 loop++;
3468
3469
3470
3471
3472
3473 if (!nr_reclaimed &&
3474 (next_mz == NULL ||
3475 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3476 break;
3477 } while (!nr_reclaimed);
3478 if (next_mz)
3479 css_put(&next_mz->memcg->css);
3480 return nr_reclaimed;
3481}
3482
/*
 * Reclaim as many pages from the given memcg as possible.
 *
 * The caller is responsible for holding a css reference on the memcg.
 */
3488static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3489{
3490 int nr_retries = MAX_RECLAIM_RETRIES;
3491
3492
3493 lru_add_drain_all();
3494
3495 drain_all_stock(memcg);
3496
3497
3498 while (nr_retries && page_counter_read(&memcg->memory)) {
3499 int progress;
3500
3501 if (signal_pending(current))
3502 return -EINTR;
3503
3504 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3505 GFP_KERNEL, true);
3506 if (!progress) {
3507 nr_retries--;
3508
3509 congestion_wait(BLK_RW_ASYNC, HZ/10);
3510 }
3511
3512 }
3513
3514 return 0;
3515}
3516
3517static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3518 char *buf, size_t nbytes,
3519 loff_t off)
3520{
3521 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3522
3523 if (mem_cgroup_is_root(memcg))
3524 return -EINVAL;
3525 return mem_cgroup_force_empty(memcg) ?: nbytes;
3526}
3527
3528static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3529 struct cftype *cft)
3530{
3531 return 1;
3532}
3533
3534static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3535 struct cftype *cft, u64 val)
3536{
3537 if (val == 1)
3538 return 0;
3539
3540 pr_warn_once("Non-hierarchical mode is deprecated. "
3541 "Please report your usecase to linux-mm@kvack.org if you "
3542 "depend on this functionality.\n");
3543
3544 return -EINVAL;
3545}
3546
3547static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3548{
3549 unsigned long val;
3550
3551 if (mem_cgroup_is_root(memcg)) {
3552 val = memcg_page_state(memcg, NR_FILE_PAGES) +
3553 memcg_page_state(memcg, NR_ANON_MAPPED);
3554 if (swap)
3555 val += memcg_page_state(memcg, MEMCG_SWAP);
3556 } else {
3557 if (!swap)
3558 val = page_counter_read(&memcg->memory);
3559 else
3560 val = page_counter_read(&memcg->memsw);
3561 }
3562 return val;
3563}
3564
3565enum {
3566 RES_USAGE,
3567 RES_LIMIT,
3568 RES_MAX_USAGE,
3569 RES_FAILCNT,
3570 RES_SOFT_LIMIT,
3571};
3572
3573static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3574 struct cftype *cft)
3575{
3576 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3577 struct page_counter *counter;
3578
3579 switch (MEMFILE_TYPE(cft->private)) {
3580 case _MEM:
3581 counter = &memcg->memory;
3582 break;
3583 case _MEMSWAP:
3584 counter = &memcg->memsw;
3585 break;
3586 case _KMEM:
3587 counter = &memcg->kmem;
3588 break;
3589 case _TCP:
3590 counter = &memcg->tcpmem;
3591 break;
3592 default:
3593 BUG();
3594 }
3595
3596 switch (MEMFILE_ATTR(cft->private)) {
3597 case RES_USAGE:
3598 if (counter == &memcg->memory)
3599 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3600 if (counter == &memcg->memsw)
3601 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3602 return (u64)page_counter_read(counter) * PAGE_SIZE;
3603 case RES_LIMIT:
3604 return (u64)counter->max * PAGE_SIZE;
3605 case RES_MAX_USAGE:
3606 return (u64)counter->watermark * PAGE_SIZE;
3607 case RES_FAILCNT:
3608 return counter->failcnt;
3609 case RES_SOFT_LIMIT:
3610 return (u64)memcg->soft_limit * PAGE_SIZE;
3611 default:
3612 BUG();
3613 }
3614}
3615
3616static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3617{
3618 unsigned long stat[MEMCG_NR_STAT] = {0};
3619 struct mem_cgroup *mi;
3620 int node, cpu, i;
3621
3622 for_each_online_cpu(cpu)
3623 for (i = 0; i < MEMCG_NR_STAT; i++)
3624 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3625
3626 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3627 for (i = 0; i < MEMCG_NR_STAT; i++)
3628 atomic_long_add(stat[i], &mi->vmstats[i]);
3629
3630 for_each_node(node) {
3631 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3632 struct mem_cgroup_per_node *pi;
3633
3634 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3635 stat[i] = 0;
3636
3637 for_each_online_cpu(cpu)
3638 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3639 stat[i] += per_cpu(
3640 pn->lruvec_stat_cpu->count[i], cpu);
3641
3642 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3643 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3644 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3645 }
3646}
3647
3648static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3649{
3650 unsigned long events[NR_VM_EVENT_ITEMS];
3651 struct mem_cgroup *mi;
3652 int cpu, i;
3653
3654 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3655 events[i] = 0;
3656
3657 for_each_online_cpu(cpu)
3658 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3659 events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3660 cpu);
3661
3662 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3663 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3664 atomic_long_add(events[i], &mi->vmevents[i]);
3665}
3666
3667#ifdef CONFIG_MEMCG_KMEM
3668static int memcg_online_kmem(struct mem_cgroup *memcg)
3669{
3670 struct obj_cgroup *objcg;
3671 int memcg_id;
3672
3673 if (cgroup_memory_nokmem)
3674 return 0;
3675
3676 BUG_ON(memcg->kmemcg_id >= 0);
3677 BUG_ON(memcg->kmem_state);
3678
3679 memcg_id = memcg_alloc_cache_id();
3680 if (memcg_id < 0)
3681 return memcg_id;
3682
3683 objcg = obj_cgroup_alloc();
3684 if (!objcg) {
3685 memcg_free_cache_id(memcg_id);
3686 return -ENOMEM;
3687 }
3688 objcg->memcg = memcg;
3689 rcu_assign_pointer(memcg->objcg, objcg);
3690
3691 static_branch_enable(&memcg_kmem_enabled_key);
3692
3693 memcg->kmemcg_id = memcg_id;
3694 memcg->kmem_state = KMEM_ONLINE;
3695
3696 return 0;
3697}
3698
3699static void memcg_offline_kmem(struct mem_cgroup *memcg)
3700{
3701 struct cgroup_subsys_state *css;
3702 struct mem_cgroup *parent, *child;
3703 int kmemcg_id;
3704
3705 if (memcg->kmem_state != KMEM_ONLINE)
3706 return;
3707
3708 memcg->kmem_state = KMEM_ALLOCATED;
3709
3710 parent = parent_mem_cgroup(memcg);
3711 if (!parent)
3712 parent = root_mem_cgroup;
3713
3714 memcg_reparent_objcgs(memcg, parent);
3715
3716 kmemcg_id = memcg->kmemcg_id;
3717 BUG_ON(kmemcg_id < 0);
3718
	/*
	 * Change kmemcg_id of this cgroup and all its descendants to the
	 * parent's id, and then move all entries from this cgroup's
	 * list_lrus to ones of the parent.  After we have finished, all
	 * list_lrus corresponding to this cgroup are guaranteed to remain
	 * empty.
	 */
3727 rcu_read_lock();
3728 css_for_each_descendant_pre(css, &memcg->css) {
3729 child = mem_cgroup_from_css(css);
3730 BUG_ON(child->kmemcg_id != kmemcg_id);
3731 child->kmemcg_id = parent->kmemcg_id;
3732 }
3733 rcu_read_unlock();
3734
3735 memcg_drain_all_list_lrus(kmemcg_id, parent);
3736
3737 memcg_free_cache_id(kmemcg_id);
3738}
3739
3740static void memcg_free_kmem(struct mem_cgroup *memcg)
3741{
3742
3743 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3744 memcg_offline_kmem(memcg);
3745}
3746#else
3747static int memcg_online_kmem(struct mem_cgroup *memcg)
3748{
3749 return 0;
3750}
3751static void memcg_offline_kmem(struct mem_cgroup *memcg)
3752{
3753}
3754static void memcg_free_kmem(struct mem_cgroup *memcg)
3755{
3756}
3757#endif
3758
3759static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3760 unsigned long max)
3761{
3762 int ret;
3763
3764 mutex_lock(&memcg_max_mutex);
3765 ret = page_counter_set_max(&memcg->kmem, max);
3766 mutex_unlock(&memcg_max_mutex);
3767 return ret;
3768}
3769
3770static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3771{
3772 int ret;
3773
3774 mutex_lock(&memcg_max_mutex);
3775
3776 ret = page_counter_set_max(&memcg->tcpmem, max);
3777 if (ret)
3778 goto out;
3779
3780 if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update.  This is what guarantees that the socket
		 * activation function is the last one to run: no socket is
		 * marked as belonging to this memcg until the flag is set,
		 * so a socket is never accounted before the accounting
		 * functions have been patched in.
		 */
3797 static_branch_inc(&memcg_sockets_enabled_key);
3798 memcg->tcpmem_active = true;
3799 }
3800out:
3801 mutex_unlock(&memcg_max_mutex);
3802 return ret;
3803}
3804
3805
3806
3807
3808
3809static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3810 char *buf, size_t nbytes, loff_t off)
3811{
3812 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3813 unsigned long nr_pages;
3814 int ret;
3815
3816 buf = strstrip(buf);
3817 ret = page_counter_memparse(buf, "-1", &nr_pages);
3818 if (ret)
3819 return ret;
3820
3821 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3822 case RES_LIMIT:
3823 if (mem_cgroup_is_root(memcg)) {
3824 ret = -EINVAL;
3825 break;
3826 }
3827 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3828 case _MEM:
3829 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3830 break;
3831 case _MEMSWAP:
3832 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3833 break;
3834 case _KMEM:
3835 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3836 "Please report your usecase to linux-mm@kvack.org if you "
3837 "depend on this functionality.\n");
3838 ret = memcg_update_kmem_max(memcg, nr_pages);
3839 break;
3840 case _TCP:
3841 ret = memcg_update_tcp_max(memcg, nr_pages);
3842 break;
3843 }
3844 break;
3845 case RES_SOFT_LIMIT:
3846 memcg->soft_limit = nr_pages;
3847 ret = 0;
3848 break;
3849 }
3850 return ret ?: nbytes;
3851}
3852
3853static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3854 size_t nbytes, loff_t off)
3855{
3856 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3857 struct page_counter *counter;
3858
3859 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3860 case _MEM:
3861 counter = &memcg->memory;
3862 break;
3863 case _MEMSWAP:
3864 counter = &memcg->memsw;
3865 break;
3866 case _KMEM:
3867 counter = &memcg->kmem;
3868 break;
3869 case _TCP:
3870 counter = &memcg->tcpmem;
3871 break;
3872 default:
3873 BUG();
3874 }
3875
3876 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3877 case RES_MAX_USAGE:
3878 page_counter_reset_watermark(counter);
3879 break;
3880 case RES_FAILCNT:
3881 counter->failcnt = 0;
3882 break;
3883 default:
3884 BUG();
3885 }
3886
3887 return nbytes;
3888}
3889
3890static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3891 struct cftype *cft)
3892{
3893 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3894}
3895
3896#ifdef CONFIG_MMU
3897static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3898 struct cftype *cft, u64 val)
3899{
3900 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3901
3902 if (val & ~MOVE_MASK)
3903 return -EINVAL;
3904
3905
3906
3907
3908
3909
3910
3911 memcg->move_charge_at_immigrate = val;
3912 return 0;
3913}
3914#else
3915static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3916 struct cftype *cft, u64 val)
3917{
3918 return -ENOSYS;
3919}
3920#endif
3921
3922#ifdef CONFIG_NUMA
3923
3924#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3925#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3926#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3927
3928static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3929 int nid, unsigned int lru_mask, bool tree)
3930{
3931 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3932 unsigned long nr = 0;
3933 enum lru_list lru;
3934
3935 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3936
3937 for_each_lru(lru) {
3938 if (!(BIT(lru) & lru_mask))
3939 continue;
3940 if (tree)
3941 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3942 else
3943 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3944 }
3945 return nr;
3946}
3947
3948static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3949 unsigned int lru_mask,
3950 bool tree)
3951{
3952 unsigned long nr = 0;
3953 enum lru_list lru;
3954
3955 for_each_lru(lru) {
3956 if (!(BIT(lru) & lru_mask))
3957 continue;
3958 if (tree)
3959 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3960 else
3961 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3962 }
3963 return nr;
3964}
3965
3966static int memcg_numa_stat_show(struct seq_file *m, void *v)
3967{
3968 struct numa_stat {
3969 const char *name;
3970 unsigned int lru_mask;
3971 };
3972
3973 static const struct numa_stat stats[] = {
3974 { "total", LRU_ALL },
3975 { "file", LRU_ALL_FILE },
3976 { "anon", LRU_ALL_ANON },
3977 { "unevictable", BIT(LRU_UNEVICTABLE) },
3978 };
3979 const struct numa_stat *stat;
3980 int nid;
3981 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3982
3983 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3984 seq_printf(m, "%s=%lu", stat->name,
3985 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3986 false));
3987 for_each_node_state(nid, N_MEMORY)
3988 seq_printf(m, " N%d=%lu", nid,
3989 mem_cgroup_node_nr_lru_pages(memcg, nid,
3990 stat->lru_mask, false));
3991 seq_putc(m, '\n');
3992 }
3993
3994 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3995
3996 seq_printf(m, "hierarchical_%s=%lu", stat->name,
3997 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3998 true));
3999 for_each_node_state(nid, N_MEMORY)
4000 seq_printf(m, " N%d=%lu", nid,
4001 mem_cgroup_node_nr_lru_pages(memcg, nid,
4002 stat->lru_mask, true));
4003 seq_putc(m, '\n');
4004 }
4005
4006 return 0;
4007}
4008#endif
4009
4010static const unsigned int memcg1_stats[] = {
4011 NR_FILE_PAGES,
4012 NR_ANON_MAPPED,
4013#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4014 NR_ANON_THPS,
4015#endif
4016 NR_SHMEM,
4017 NR_FILE_MAPPED,
4018 NR_FILE_DIRTY,
4019 NR_WRITEBACK,
4020 MEMCG_SWAP,
4021};
4022
4023static const char *const memcg1_stat_names[] = {
4024 "cache",
4025 "rss",
4026#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4027 "rss_huge",
4028#endif
4029 "shmem",
4030 "mapped_file",
4031 "dirty",
4032 "writeback",
4033 "swap",
4034};
4035
4036
4037static const unsigned int memcg1_events[] = {
4038 PGPGIN,
4039 PGPGOUT,
4040 PGFAULT,
4041 PGMAJFAULT,
4042};
4043
4044static int memcg_stat_show(struct seq_file *m, void *v)
4045{
4046 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4047 unsigned long memory, memsw;
4048 struct mem_cgroup *mi;
4049 unsigned int i;
4050
4051 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4052
4053 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4054 unsigned long nr;
4055
4056 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4057 continue;
4058 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4059 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
4060 }
4061
4062 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4063 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4064 memcg_events_local(memcg, memcg1_events[i]));
4065
4066 for (i = 0; i < NR_LRU_LISTS; i++)
4067 seq_printf(m, "%s %lu\n", lru_list_name(i),
4068 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4069 PAGE_SIZE);
4070
4071
4072 memory = memsw = PAGE_COUNTER_MAX;
4073 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4074 memory = min(memory, READ_ONCE(mi->memory.max));
4075 memsw = min(memsw, READ_ONCE(mi->memsw.max));
4076 }
4077 seq_printf(m, "hierarchical_memory_limit %llu\n",
4078 (u64)memory * PAGE_SIZE);
4079 if (do_memsw_account())
4080 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4081 (u64)memsw * PAGE_SIZE);
4082
4083 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4084 unsigned long nr;
4085
4086 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4087 continue;
4088 nr = memcg_page_state(memcg, memcg1_stats[i]);
4089 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4090 (u64)nr * PAGE_SIZE);
4091 }
4092
4093 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4094 seq_printf(m, "total_%s %llu\n",
4095 vm_event_name(memcg1_events[i]),
4096 (u64)memcg_events(memcg, memcg1_events[i]));
4097
4098 for (i = 0; i < NR_LRU_LISTS; i++)
4099 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4100 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4101 PAGE_SIZE);
4102
4103#ifdef CONFIG_DEBUG_VM
4104 {
4105 pg_data_t *pgdat;
4106 struct mem_cgroup_per_node *mz;
4107 unsigned long anon_cost = 0;
4108 unsigned long file_cost = 0;
4109
4110 for_each_online_pgdat(pgdat) {
4111 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
4112
4113 anon_cost += mz->lruvec.anon_cost;
4114 file_cost += mz->lruvec.file_cost;
4115 }
4116 seq_printf(m, "anon_cost %lu\n", anon_cost);
4117 seq_printf(m, "file_cost %lu\n", file_cost);
4118 }
4119#endif
4120
4121 return 0;
4122}
4123
4124static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4125 struct cftype *cft)
4126{
4127 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4128
4129 return mem_cgroup_swappiness(memcg);
4130}
4131
4132static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4133 struct cftype *cft, u64 val)
4134{
4135 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4136
4137 if (val > 100)
4138 return -EINVAL;
4139
4140 if (css->parent)
4141 memcg->swappiness = val;
4142 else
4143 vm_swappiness = val;
4144
4145 return 0;
4146}
4147
4148static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4149{
4150 struct mem_cgroup_threshold_ary *t;
4151 unsigned long usage;
4152 int i;
4153
4154 rcu_read_lock();
4155 if (!swap)
4156 t = rcu_dereference(memcg->thresholds.primary);
4157 else
4158 t = rcu_dereference(memcg->memsw_thresholds.primary);
4159
4160 if (!t)
4161 goto unlock;
4162
4163 usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below or equal to
	 * the usage seen by the previous call; walk from there in both
	 * directions and signal every threshold that has been crossed.
	 */
4170 i = t->current_threshold;
4171
4172
4173
4174
4175
4176
4177
4178 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4179 eventfd_signal(t->entries[i].eventfd, 1);
4180
4181
4182 i++;
4183
4184
4185
4186
4187
4188
4189
4190 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4191 eventfd_signal(t->entries[i].eventfd, 1);
4192
4193
4194 t->current_threshold = i - 1;
4195unlock:
4196 rcu_read_unlock();
4197}
4198
4199static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4200{
4201 while (memcg) {
4202 __mem_cgroup_threshold(memcg, false);
4203 if (do_memsw_account())
4204 __mem_cgroup_threshold(memcg, true);
4205
4206 memcg = parent_mem_cgroup(memcg);
4207 }
4208}
4209
4210static int compare_thresholds(const void *a, const void *b)
4211{
4212 const struct mem_cgroup_threshold *_a = a;
4213 const struct mem_cgroup_threshold *_b = b;
4214
4215 if (_a->threshold > _b->threshold)
4216 return 1;
4217
4218 if (_a->threshold < _b->threshold)
4219 return -1;
4220
4221 return 0;
4222}
4223
4224static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4225{
4226 struct mem_cgroup_eventfd_list *ev;
4227
4228 spin_lock(&memcg_oom_lock);
4229
4230 list_for_each_entry(ev, &memcg->oom_notify, list)
4231 eventfd_signal(ev->eventfd, 1);
4232
4233 spin_unlock(&memcg_oom_lock);
4234 return 0;
4235}
4236
4237static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4238{
4239 struct mem_cgroup *iter;
4240
4241 for_each_mem_cgroup_tree(iter, memcg)
4242 mem_cgroup_oom_notify_cb(iter);
4243}
4244
4245static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4246 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4247{
4248 struct mem_cgroup_thresholds *thresholds;
4249 struct mem_cgroup_threshold_ary *new;
4250 unsigned long threshold;
4251 unsigned long usage;
4252 int i, size, ret;
4253
4254 ret = page_counter_memparse(args, "-1", &threshold);
4255 if (ret)
4256 return ret;
4257
4258 mutex_lock(&memcg->thresholds_lock);
4259
4260 if (type == _MEM) {
4261 thresholds = &memcg->thresholds;
4262 usage = mem_cgroup_usage(memcg, false);
4263 } else if (type == _MEMSWAP) {
4264 thresholds = &memcg->memsw_thresholds;
4265 usage = mem_cgroup_usage(memcg, true);
4266 } else
4267 BUG();
4268
4269
4270 if (thresholds->primary)
4271 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4272
4273 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4274
4275
4276 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4277 if (!new) {
4278 ret = -ENOMEM;
4279 goto unlock;
4280 }
4281 new->size = size;
4282
4283
4284 if (thresholds->primary)
4285 memcpy(new->entries, thresholds->primary->entries,
4286 flex_array_size(new, entries, size - 1));
4287
4288
4289 new->entries[size - 1].eventfd = eventfd;
4290 new->entries[size - 1].threshold = threshold;
4291
4292
4293 sort(new->entries, size, sizeof(*new->entries),
4294 compare_thresholds, NULL);
4295
4296
4297 new->current_threshold = -1;
4298 for (i = 0; i < size; i++) {
4299 if (new->entries[i].threshold <= usage) {
4300
4301
4302
4303
4304
4305 ++new->current_threshold;
4306 } else
4307 break;
4308 }
4309
4310
4311 kfree(thresholds->spare);
4312 thresholds->spare = thresholds->primary;
4313
4314 rcu_assign_pointer(thresholds->primary, new);
4315
4316
4317 synchronize_rcu();
4318
4319unlock:
4320 mutex_unlock(&memcg->thresholds_lock);
4321
4322 return ret;
4323}
4324
4325static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4326 struct eventfd_ctx *eventfd, const char *args)
4327{
4328 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4329}
4330
4331static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4332 struct eventfd_ctx *eventfd, const char *args)
4333{
4334 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4335}
4336
4337static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4338 struct eventfd_ctx *eventfd, enum res_type type)
4339{
4340 struct mem_cgroup_thresholds *thresholds;
4341 struct mem_cgroup_threshold_ary *new;
4342 unsigned long usage;
4343 int i, j, size, entries;
4344
4345 mutex_lock(&memcg->thresholds_lock);
4346
4347 if (type == _MEM) {
4348 thresholds = &memcg->thresholds;
4349 usage = mem_cgroup_usage(memcg, false);
4350 } else if (type == _MEMSWAP) {
4351 thresholds = &memcg->memsw_thresholds;
4352 usage = mem_cgroup_usage(memcg, true);
4353 } else
4354 BUG();
4355
4356 if (!thresholds->primary)
4357 goto unlock;
4358
4359
4360 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4361
4362
4363 size = entries = 0;
4364 for (i = 0; i < thresholds->primary->size; i++) {
4365 if (thresholds->primary->entries[i].eventfd != eventfd)
4366 size++;
4367 else
4368 entries++;
4369 }
4370
4371 new = thresholds->spare;
4372
4373
4374 if (!entries)
4375 goto unlock;
4376
4377
4378 if (!size) {
4379 kfree(new);
4380 new = NULL;
4381 goto swap_buffers;
4382 }
4383
4384 new->size = size;
4385
4386
4387 new->current_threshold = -1;
4388 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4389 if (thresholds->primary->entries[i].eventfd == eventfd)
4390 continue;
4391
4392 new->entries[j] = thresholds->primary->entries[i];
4393 if (new->entries[j].threshold <= usage) {
4394
4395
4396
4397
4398
4399 ++new->current_threshold;
4400 }
4401 j++;
4402 }
4403
4404swap_buffers:
4405
4406 thresholds->spare = thresholds->primary;
4407
4408 rcu_assign_pointer(thresholds->primary, new);
4409
4410
4411 synchronize_rcu();
4412
4413
4414 if (!new) {
4415 kfree(thresholds->spare);
4416 thresholds->spare = NULL;
4417 }
4418unlock:
4419 mutex_unlock(&memcg->thresholds_lock);
4420}
4421
4422static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4423 struct eventfd_ctx *eventfd)
4424{
4425 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4426}
4427
4428static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4429 struct eventfd_ctx *eventfd)
4430{
4431 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4432}
4433
4434static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4435 struct eventfd_ctx *eventfd, const char *args)
4436{
4437 struct mem_cgroup_eventfd_list *event;
4438
4439 event = kmalloc(sizeof(*event), GFP_KERNEL);
4440 if (!event)
4441 return -ENOMEM;
4442
4443 spin_lock(&memcg_oom_lock);
4444
4445 event->eventfd = eventfd;
4446 list_add(&event->list, &memcg->oom_notify);
4447
4448
4449 if (memcg->under_oom)
4450 eventfd_signal(eventfd, 1);
4451 spin_unlock(&memcg_oom_lock);
4452
4453 return 0;
4454}
4455
4456static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4457 struct eventfd_ctx *eventfd)
4458{
4459 struct mem_cgroup_eventfd_list *ev, *tmp;
4460
4461 spin_lock(&memcg_oom_lock);
4462
4463 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4464 if (ev->eventfd == eventfd) {
4465 list_del(&ev->list);
4466 kfree(ev);
4467 }
4468 }
4469
4470 spin_unlock(&memcg_oom_lock);
4471}
4472
4473static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4474{
4475 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4476
4477 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4478 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4479 seq_printf(sf, "oom_kill %lu\n",
4480 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4481 return 0;
4482}
4483
4484static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4485 struct cftype *cft, u64 val)
4486{
4487 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4488
4489
4490 if (!css->parent || !((val == 0) || (val == 1)))
4491 return -EINVAL;
4492
4493 memcg->oom_kill_disable = val;
4494 if (!val)
4495 memcg_oom_recover(memcg);
4496
4497 return 0;
4498}
4499
4500#ifdef CONFIG_CGROUP_WRITEBACK
4501
4502#include <trace/events/writeback.h>
4503
4504static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4505{
4506 return wb_domain_init(&memcg->cgwb_domain, gfp);
4507}
4508
4509static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4510{
4511 wb_domain_exit(&memcg->cgwb_domain);
4512}
4513
4514static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4515{
4516 wb_domain_size_changed(&memcg->cgwb_domain);
4517}
4518
4519struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4520{
4521 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4522
4523 if (!memcg->css.parent)
4524 return NULL;
4525
4526 return &memcg->cgwb_domain;
4527}
4528
4529
4530
4531
4532
4533static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4534{
4535 long x = atomic_long_read(&memcg->vmstats[idx]);
4536 int cpu;
4537
4538 for_each_online_cpu(cpu)
4539 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4540 if (x < 0)
4541 x = 0;
4542 return x;
4543}
4544
/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  A memcg's headroom is "min(max, high) - used", and in a
 * hierarchy it is the lowest headroom of the memcg and its ancestors.
 * This does not consider the actual amount of available memory in the
 * system; the caller should further cap *@pheadroom accordingly.
 */
4563void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4564 unsigned long *pheadroom, unsigned long *pdirty,
4565 unsigned long *pwriteback)
4566{
4567 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4568 struct mem_cgroup *parent;
4569
4570 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4571
4572 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4573 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4574 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4575 *pheadroom = PAGE_COUNTER_MAX;
4576
4577 while ((parent = parent_mem_cgroup(memcg))) {
4578 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4579 READ_ONCE(memcg->memory.high));
4580 unsigned long used = page_counter_read(&memcg->memory);
4581
4582 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4583 memcg = parent;
4584 }
4585}
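
/*
 * Worked example (illustrative, not from the original file): if a memcg
 * has memory.max = 200MB, memory.high = 150MB and 100MB in use, its own
 * ceiling is min(200MB, 150MB) = 150MB and its headroom 50MB; if an
 * ancestor only has 30MB of headroom left, *pheadroom ends up as 30MB,
 * the minimum along the hierarchy.
 */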
4586
/*
 * Foreign dirty flushing.
 *
 * There is an inherent mismatch between memcg and writeback: the former
 * tracks ownership per page while the latter attributes an inode's
 * writeback to a single bdi_writeback.  A page can therefore be dirtied
 * against a wb belonging to a different memcg ("foreign" dirtying), and
 * such pages may linger if that foreign wb is rarely flushed.
 *
 * To mitigate this, the most recent foreign dirtying events are remembered
 * in a small array of (bdi_id, memcg_id, timestamp) records on the page's
 * memcg (mem_cgroup_track_foreign_dirty_slowpath() below), and
 * mem_cgroup_flush_foreign() later triggers writeback against those
 * foreign bdi_writebacks so the pages get cleaned and inode ownership can
 * switch over.
 */
4631void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4632 struct bdi_writeback *wb)
4633{
4634 struct mem_cgroup *memcg = page_memcg(page);
4635 struct memcg_cgwb_frn *frn;
4636 u64 now = get_jiffies_64();
4637 u64 oldest_at = now;
4638 int oldest = -1;
4639 int i;
4640
4641 trace_track_foreign_dirty(page, wb);
4642
4643
4644
4645
4646
4647
4648 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4649 frn = &memcg->cgwb_frn[i];
4650 if (frn->bdi_id == wb->bdi->id &&
4651 frn->memcg_id == wb->memcg_css->id)
4652 break;
4653 if (time_before64(frn->at, oldest_at) &&
4654 atomic_read(&frn->done.cnt) == 1) {
4655 oldest = i;
4656 oldest_at = frn->at;
4657 }
4658 }
4659
4660 if (i < MEMCG_CGWB_FRN_CNT) {
4661
4662
4663
4664
4665
4666
4667
4668 unsigned long update_intv =
4669 min_t(unsigned long, HZ,
4670 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4671
4672 if (time_before64(frn->at, now - update_intv))
4673 frn->at = now;
4674 } else if (oldest >= 0) {
4675
4676 frn = &memcg->cgwb_frn[oldest];
4677 frn->bdi_id = wb->bdi->id;
4678 frn->memcg_id = wb->memcg_css->id;
4679 frn->at = now;
4680 }
4681}
4682
4683
4684void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4685{
4686 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4687 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4688 u64 now = jiffies_64;
4689 int i;
4690
4691 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4692 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4693
4694
4695
4696
4697
4698
4699
4700 if (time_after64(frn->at, now - intv) &&
4701 atomic_read(&frn->done.cnt) == 1) {
4702 frn->at = 0;
4703 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4704 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4705 WB_REASON_FOREIGN_FLUSH,
4706 &frn->done);
4707 }
4708 }
4709}
4710
4711#else
4712
4713static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4714{
4715 return 0;
4716}
4717
4718static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4719{
4720}
4721
4722static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4723{
4724}
4725
4726#endif
4727
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation for cgroup1: it lets userspace
 * register eventfd-based notifications against a fixed set of legacy
 * control files.  Kept only for backward compatibility.
 */

/*
 * Unregister an event and free its resources.  Runs from a workqueue.
 */
4746static void memcg_event_remove(struct work_struct *work)
4747{
4748 struct mem_cgroup_event *event =
4749 container_of(work, struct mem_cgroup_event, remove);
4750 struct mem_cgroup *memcg = event->memcg;
4751
4752 remove_wait_queue(event->wqh, &event->wait);
4753
4754 event->unregister_event(memcg, event->eventfd);
4755
4756
4757 eventfd_signal(event->eventfd, 1);
4758
4759 eventfd_ctx_put(event->eventfd);
4760 kfree(event);
4761 css_put(&memcg->css);
4762}
4763
4764
4765
4766
4767
4768
4769static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4770 int sync, void *key)
4771{
4772 struct mem_cgroup_event *event =
4773 container_of(wait, struct mem_cgroup_event, wait);
4774 struct mem_cgroup *memcg = event->memcg;
4775 __poll_t flags = key_to_poll(key);
4776
4777 if (flags & EPOLLHUP) {
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787 spin_lock(&memcg->event_list_lock);
4788 if (!list_empty(&event->list)) {
4789 list_del_init(&event->list);
4790
4791
4792
4793
4794 schedule_work(&event->remove);
4795 }
4796 spin_unlock(&memcg->event_list_lock);
4797 }
4798
4799 return 0;
4800}
4801
4802static void memcg_event_ptable_queue_proc(struct file *file,
4803 wait_queue_head_t *wqh, poll_table *pt)
4804{
4805 struct mem_cgroup_event *event =
4806 container_of(pt, struct mem_cgroup_event, pt);
4807
4808 event->wqh = wqh;
4809 add_wait_queue(wqh, &event->wait);
4810}
4811
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register a new cgroup event handler.
 *
 * Input must be in the format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by the control file implementation.
 */
4820static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4821 char *buf, size_t nbytes, loff_t off)
4822{
4823 struct cgroup_subsys_state *css = of_css(of);
4824 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4825 struct mem_cgroup_event *event;
4826 struct cgroup_subsys_state *cfile_css;
4827 unsigned int efd, cfd;
4828 struct fd efile;
4829 struct fd cfile;
4830 const char *name;
4831 char *endp;
4832 int ret;
4833
4834 buf = strstrip(buf);
4835
4836 efd = simple_strtoul(buf, &endp, 10);
4837 if (*endp != ' ')
4838 return -EINVAL;
4839 buf = endp + 1;
4840
4841 cfd = simple_strtoul(buf, &endp, 10);
4842 if ((*endp != ' ') && (*endp != '\0'))
4843 return -EINVAL;
4844 buf = endp + 1;
4845
4846 event = kzalloc(sizeof(*event), GFP_KERNEL);
4847 if (!event)
4848 return -ENOMEM;
4849
4850 event->memcg = memcg;
4851 INIT_LIST_HEAD(&event->list);
4852 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4853 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4854 INIT_WORK(&event->remove, memcg_event_remove);
4855
4856 efile = fdget(efd);
4857 if (!efile.file) {
4858 ret = -EBADF;
4859 goto out_kfree;
4860 }
4861
4862 event->eventfd = eventfd_ctx_fileget(efile.file);
4863 if (IS_ERR(event->eventfd)) {
4864 ret = PTR_ERR(event->eventfd);
4865 goto out_put_efile;
4866 }
4867
4868 cfile = fdget(cfd);
4869 if (!cfile.file) {
4870 ret = -EBADF;
4871 goto out_put_eventfd;
4872 }
4873
4874
4875
4876 ret = file_permission(cfile.file, MAY_READ);
4877 if (ret < 0)
4878 goto out_put_cfile;
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888 name = cfile.file->f_path.dentry->d_name.name;
4889
4890 if (!strcmp(name, "memory.usage_in_bytes")) {
4891 event->register_event = mem_cgroup_usage_register_event;
4892 event->unregister_event = mem_cgroup_usage_unregister_event;
4893 } else if (!strcmp(name, "memory.oom_control")) {
4894 event->register_event = mem_cgroup_oom_register_event;
4895 event->unregister_event = mem_cgroup_oom_unregister_event;
4896 } else if (!strcmp(name, "memory.pressure_level")) {
4897 event->register_event = vmpressure_register_event;
4898 event->unregister_event = vmpressure_unregister_event;
4899 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4900 event->register_event = memsw_cgroup_usage_register_event;
4901 event->unregister_event = memsw_cgroup_usage_unregister_event;
4902 } else {
4903 ret = -EINVAL;
4904 goto out_put_cfile;
4905 }
4906
4907
4908
4909
4910
4911
4912 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4913 &memory_cgrp_subsys);
4914 ret = -EINVAL;
4915 if (IS_ERR(cfile_css))
4916 goto out_put_cfile;
4917 if (cfile_css != css) {
4918 css_put(cfile_css);
4919 goto out_put_cfile;
4920 }
4921
4922 ret = event->register_event(memcg, event->eventfd, buf);
4923 if (ret)
4924 goto out_put_css;
4925
4926 vfs_poll(efile.file, &event->pt);
4927
4928 spin_lock(&memcg->event_list_lock);
4929 list_add(&event->list, &memcg->event_list);
4930 spin_unlock(&memcg->event_list_lock);
4931
4932 fdput(cfile);
4933 fdput(efile);
4934
4935 return nbytes;
4936
4937out_put_css:
4938 css_put(css);
4939out_put_cfile:
4940 fdput(cfile);
4941out_put_eventfd:
4942 eventfd_ctx_put(event->eventfd);
4943out_put_efile:
4944 fdput(efile);
4945out_kfree:
4946 kfree(event);
4947
4948 return ret;
4949}
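
/*
 * Example (hedged, for illustration only): registering a usage threshold
 * from userspace amounts to writing
 *
 *	"<event_fd> <fd of memory.usage_in_bytes> 10485760"
 *
 * to cgroup.event_control; the eventfd then fires whenever usage crosses
 * the 10MB threshold (see __mem_cgroup_usage_register_event() above).
 */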
4950
4951static struct cftype mem_cgroup_legacy_files[] = {
4952 {
4953 .name = "usage_in_bytes",
4954 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4955 .read_u64 = mem_cgroup_read_u64,
4956 },
4957 {
4958 .name = "max_usage_in_bytes",
4959 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4960 .write = mem_cgroup_reset,
4961 .read_u64 = mem_cgroup_read_u64,
4962 },
4963 {
4964 .name = "limit_in_bytes",
4965 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4966 .write = mem_cgroup_write,
4967 .read_u64 = mem_cgroup_read_u64,
4968 },
4969 {
4970 .name = "soft_limit_in_bytes",
4971 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4972 .write = mem_cgroup_write,
4973 .read_u64 = mem_cgroup_read_u64,
4974 },
4975 {
4976 .name = "failcnt",
4977 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4978 .write = mem_cgroup_reset,
4979 .read_u64 = mem_cgroup_read_u64,
4980 },
4981 {
4982 .name = "stat",
4983 .seq_show = memcg_stat_show,
4984 },
4985 {
4986 .name = "force_empty",
4987 .write = mem_cgroup_force_empty_write,
4988 },
4989 {
4990 .name = "use_hierarchy",
4991 .write_u64 = mem_cgroup_hierarchy_write,
4992 .read_u64 = mem_cgroup_hierarchy_read,
4993 },
4994 {
4995 .name = "cgroup.event_control",
4996 .write = memcg_write_event_control,
4997 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4998 },
4999 {
5000 .name = "swappiness",
5001 .read_u64 = mem_cgroup_swappiness_read,
5002 .write_u64 = mem_cgroup_swappiness_write,
5003 },
5004 {
5005 .name = "move_charge_at_immigrate",
5006 .read_u64 = mem_cgroup_move_charge_read,
5007 .write_u64 = mem_cgroup_move_charge_write,
5008 },
5009 {
5010 .name = "oom_control",
5011 .seq_show = mem_cgroup_oom_control_read,
5012 .write_u64 = mem_cgroup_oom_control_write,
5013 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5014 },
5015 {
5016 .name = "pressure_level",
5017 },
5018#ifdef CONFIG_NUMA
5019 {
5020 .name = "numa_stat",
5021 .seq_show = memcg_numa_stat_show,
5022 },
5023#endif
5024 {
5025 .name = "kmem.limit_in_bytes",
5026 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5027 .write = mem_cgroup_write,
5028 .read_u64 = mem_cgroup_read_u64,
5029 },
5030 {
5031 .name = "kmem.usage_in_bytes",
5032 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5033 .read_u64 = mem_cgroup_read_u64,
5034 },
5035 {
5036 .name = "kmem.failcnt",
5037 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5038 .write = mem_cgroup_reset,
5039 .read_u64 = mem_cgroup_read_u64,
5040 },
5041 {
5042 .name = "kmem.max_usage_in_bytes",
5043 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5044 .write = mem_cgroup_reset,
5045 .read_u64 = mem_cgroup_read_u64,
5046 },
5047#if defined(CONFIG_MEMCG_KMEM) && \
5048 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5049 {
5050 .name = "kmem.slabinfo",
5051 .seq_show = memcg_slab_show,
5052 },
5053#endif
5054 {
5055 .name = "kmem.tcp.limit_in_bytes",
5056 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5057 .write = mem_cgroup_write,
5058 .read_u64 = mem_cgroup_read_u64,
5059 },
5060 {
5061 .name = "kmem.tcp.usage_in_bytes",
5062 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5063 .read_u64 = mem_cgroup_read_u64,
5064 },
5065 {
5066 .name = "kmem.tcp.failcnt",
5067 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5068 .write = mem_cgroup_reset,
5069 .read_u64 = mem_cgroup_read_u64,
5070 },
5071 {
5072 .name = "kmem.tcp.max_usage_in_bytes",
5073 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5074 .write = mem_cgroup_reset,
5075 .read_u64 = mem_cgroup_read_u64,
5076 },
5077 { },
5078};
5079
/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to remember which
 * memcg a page belonged to, but they have far less space available than a
 * pointer.  Every memcg therefore gets a small id, allocated from this
 * IDR, which those records store and later resolve back to the memcg via
 * mem_cgroup_from_id().
 *
 * The id stays allocated (and pins a css reference) until the last
 * outstanding user drops it with mem_cgroup_id_put().  Lookups are done
 * under rcu_read_lock().
 */
5104static DEFINE_IDR(mem_cgroup_idr);
5105
5106static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5107{
5108 if (memcg->id.id > 0) {
5109 idr_remove(&mem_cgroup_idr, memcg->id.id);
5110 memcg->id.id = 0;
5111 }
5112}
5113
5114static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5115 unsigned int n)
5116{
5117 refcount_add(n, &memcg->id.ref);
5118}
5119
5120static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5121{
5122 if (refcount_sub_and_test(n, &memcg->id.ref)) {
5123 mem_cgroup_id_remove(memcg);
5124
5125
5126 css_put(&memcg->css);
5127 }
5128}
5129
5130static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5131{
5132 mem_cgroup_id_put_many(memcg, 1);
5133}
5134
/*
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock().
 */
5141struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5142{
5143 WARN_ON_ONCE(!rcu_read_lock_held());
5144 return idr_find(&mem_cgroup_idr, id);
5145}
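
/*
 * A minimal lookup sketch (illustrative; "id" would come from a swap or
 * shadow record): the caller supplies the RCU protection that keeps the
 * returned memcg from being freed while it is inspected, and takes a real
 * reference before using it outside the RCU section.
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && css_tryget_online(&memcg->css))
 *		(got a full reference, usable after rcu_read_unlock())
 *	rcu_read_unlock();
 */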
5146
5147static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5148{
5149 struct mem_cgroup_per_node *pn;
5150 int tmp = node;
5151
5152
5153
5154
5155
5156
5157
5158
5159 if (!node_state(node, N_NORMAL_MEMORY))
5160 tmp = -1;
5161 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5162 if (!pn)
5163 return 1;
5164
5165 pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5166 GFP_KERNEL_ACCOUNT);
5167 if (!pn->lruvec_stat_local) {
5168 kfree(pn);
5169 return 1;
5170 }
5171
5172 pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
5173 GFP_KERNEL_ACCOUNT);
5174 if (!pn->lruvec_stat_cpu) {
5175 free_percpu(pn->lruvec_stat_local);
5176 kfree(pn);
5177 return 1;
5178 }
5179
5180 lruvec_init(&pn->lruvec);
5181 pn->usage_in_excess = 0;
5182 pn->on_tree = false;
5183 pn->memcg = memcg;
5184
5185 memcg->nodeinfo[node] = pn;
5186 return 0;
5187}
5188
5189static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5190{
5191 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5192
5193 if (!pn)
5194 return;
5195
5196 free_percpu(pn->lruvec_stat_cpu);
5197 free_percpu(pn->lruvec_stat_local);
5198 kfree(pn);
5199}
5200
5201static void __mem_cgroup_free(struct mem_cgroup *memcg)
5202{
5203 int node;
5204
5205 for_each_node(node)
5206 free_mem_cgroup_per_node_info(memcg, node);
5207 free_percpu(memcg->vmstats_percpu);
5208 free_percpu(memcg->vmstats_local);
5209 kfree(memcg);
5210}
5211
5212static void mem_cgroup_free(struct mem_cgroup *memcg)
5213{
5214 memcg_wb_domain_exit(memcg);
5215
5216
5217
5218
5219 memcg_flush_percpu_vmstats(memcg);
5220 memcg_flush_percpu_vmevents(memcg);
5221 __mem_cgroup_free(memcg);
5222}
5223
5224static struct mem_cgroup *mem_cgroup_alloc(void)
5225{
5226 struct mem_cgroup *memcg;
5227 unsigned int size;
5228 int node;
5229 int __maybe_unused i;
5230 long error = -ENOMEM;
5231
5232 size = sizeof(struct mem_cgroup);
5233 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5234
5235 memcg = kzalloc(size, GFP_KERNEL);
5236 if (!memcg)
5237 return ERR_PTR(error);
5238
5239 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5240 1, MEM_CGROUP_ID_MAX,
5241 GFP_KERNEL);
5242 if (memcg->id.id < 0) {
5243 error = memcg->id.id;
5244 goto fail;
5245 }
5246
5247 memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5248 GFP_KERNEL_ACCOUNT);
5249 if (!memcg->vmstats_local)
5250 goto fail;
5251
5252 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5253 GFP_KERNEL_ACCOUNT);
5254 if (!memcg->vmstats_percpu)
5255 goto fail;
5256
5257 for_each_node(node)
5258 if (alloc_mem_cgroup_per_node_info(memcg, node))
5259 goto fail;
5260
5261 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5262 goto fail;
5263
5264 INIT_WORK(&memcg->high_work, high_work_func);
5265 INIT_LIST_HEAD(&memcg->oom_notify);
5266 mutex_init(&memcg->thresholds_lock);
5267 spin_lock_init(&memcg->move_lock);
5268 vmpressure_init(&memcg->vmpressure);
5269 INIT_LIST_HEAD(&memcg->event_list);
5270 spin_lock_init(&memcg->event_list_lock);
5271 memcg->socket_pressure = jiffies;
5272#ifdef CONFIG_MEMCG_KMEM
5273 memcg->kmemcg_id = -1;
5274 INIT_LIST_HEAD(&memcg->objcg_list);
5275#endif
5276#ifdef CONFIG_CGROUP_WRITEBACK
5277 INIT_LIST_HEAD(&memcg->cgwb_list);
5278 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5279 memcg->cgwb_frn[i].done =
5280 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5281#endif
5282#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5283 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5284 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5285 memcg->deferred_split_queue.split_queue_len = 0;
5286#endif
5287 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5288 return memcg;
5289fail:
5290 mem_cgroup_id_remove(memcg);
5291 __mem_cgroup_free(memcg);
5292 return ERR_PTR(error);
5293}
5294
5295static struct cgroup_subsys_state * __ref
5296mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5297{
5298 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5299 struct mem_cgroup *memcg, *old_memcg;
5300 long error = -ENOMEM;
5301
5302 old_memcg = set_active_memcg(parent);
5303 memcg = mem_cgroup_alloc();
5304 set_active_memcg(old_memcg);
5305 if (IS_ERR(memcg))
5306 return ERR_CAST(memcg);
5307
5308 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5309 memcg->soft_limit = PAGE_COUNTER_MAX;
5310 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5311 if (parent) {
5312 memcg->swappiness = mem_cgroup_swappiness(parent);
5313 memcg->oom_kill_disable = parent->oom_kill_disable;
5314
5315 page_counter_init(&memcg->memory, &parent->memory);
5316 page_counter_init(&memcg->swap, &parent->swap);
5317 page_counter_init(&memcg->kmem, &parent->kmem);
5318 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5319 } else {
5320 page_counter_init(&memcg->memory, NULL);
5321 page_counter_init(&memcg->swap, NULL);
5322 page_counter_init(&memcg->kmem, NULL);
5323 page_counter_init(&memcg->tcpmem, NULL);
5324
5325 root_mem_cgroup = memcg;
5326 return &memcg->css;
5327 }
5328
5329
5330 error = memcg_online_kmem(memcg);
5331 if (error)
5332 goto fail;
5333
5334 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5335 static_branch_inc(&memcg_sockets_enabled_key);
5336
5337 return &memcg->css;
5338fail:
5339 mem_cgroup_id_remove(memcg);
5340 mem_cgroup_free(memcg);
5341 return ERR_PTR(error);
5342}
5343
5344static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5345{
5346 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5347
	/*
	 * Allocate the per-memcg shrinker maps now that the css is going
	 * online; if that fails, drop the ID again so this half-initialized
	 * memcg can no longer be looked up.
	 */
5353 if (memcg_alloc_shrinker_maps(memcg)) {
5354 mem_cgroup_id_remove(memcg);
5355 return -ENOMEM;
5356 }
5357
	/* Online state pins the memcg ID, the memcg ID pins the CSS */
5359 refcount_set(&memcg->id.ref, 1);
5360 css_get(css);
5361 return 0;
5362}
5363
5364static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5365{
5366 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5367 struct mem_cgroup_event *event, *tmp;
5368
	/*
	 * Unregister events and notify userspace: any events still
	 * registered on this memcg are torn down from a work item while the
	 * cgroup is going offline.
	 */
5374 spin_lock(&memcg->event_list_lock);
5375 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5376 list_del_init(&event->list);
5377 schedule_work(&event->remove);
5378 }
5379 spin_unlock(&memcg->event_list_lock);
5380
5381 page_counter_set_min(&memcg->memory, 0);
5382 page_counter_set_low(&memcg->memory, 0);
5383
5384 memcg_offline_kmem(memcg);
5385 wb_memcg_offline(memcg);
5386
5387 drain_all_stock(memcg);
5388
5389 mem_cgroup_id_put(memcg);
5390}
5391
5392static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5393{
5394 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5395
5396 invalidate_reclaim_iterators(memcg);
5397}
5398
5399static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5400{
5401 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5402 int __maybe_unused i;
5403
5404#ifdef CONFIG_CGROUP_WRITEBACK
5405 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5406 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5407#endif
5408 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5409 static_branch_dec(&memcg_sockets_enabled_key);
5410
5411 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5412 static_branch_dec(&memcg_sockets_enabled_key);
5413
5414 vmpressure_cleanup(&memcg->vmpressure);
5415 cancel_work_sync(&memcg->high_work);
5416 mem_cgroup_remove_from_trees(memcg);
5417 memcg_free_shrinker_maps(memcg);
5418 memcg_free_kmem(memcg);
5419 mem_cgroup_free(memcg);
5420}
5421
/*
 * Reset this memory cgroup's configuration to the defaults.
 *
 * Invoked by the cgroup core (e.g. when the memory controller is disabled
 * for a subtree via cgroup.subtree_control): all hard limits are lifted,
 * min/low protections are cleared, the high limits and the soft limit are
 * reset to "max", and the writeback domain is notified of the new size.
 */
5435static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5436{
5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5438
5439 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5440 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5441 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5442 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5443 page_counter_set_min(&memcg->memory, 0);
5444 page_counter_set_low(&memcg->memory, 0);
5445 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5446 memcg->soft_limit = PAGE_COUNTER_MAX;
5447 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5448 memcg_wb_domain_size_changed(memcg);
5449}
5450
5451#ifdef CONFIG_MMU
5452
5453static int mem_cgroup_do_precharge(unsigned long count)
5454{
5455 int ret;
5456
	/* Try a single bulk charge without reclaim first. */
5458 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5459 if (!ret) {
5460 mc.precharge += count;
5461 return ret;
5462 }
5463
	/* Fall back to one page at a time, with reclaim but without retrying. */
5465 while (count--) {
5466 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5467 if (ret)
5468 return ret;
5469 mc.precharge++;
5470 cond_resched();
5471 }
5472 return 0;
5473}
5474
5475union mc_target {
5476 struct page *page;
5477 swp_entry_t ent;
5478};
5479
5480enum mc_target_type {
5481 MC_TARGET_NONE = 0,
5482 MC_TARGET_PAGE,
5483 MC_TARGET_SWAP,
5484 MC_TARGET_DEVICE,
5485};
5486
5487static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5488 unsigned long addr, pte_t ptent)
5489{
5490 struct page *page = vm_normal_page(vma, addr, ptent);
5491
5492 if (!page || !page_mapped(page))
5493 return NULL;
5494 if (PageAnon(page)) {
5495 if (!(mc.flags & MOVE_ANON))
5496 return NULL;
5497 } else {
5498 if (!(mc.flags & MOVE_FILE))
5499 return NULL;
5500 }
5501 if (!get_page_unless_zero(page))
5502 return NULL;
5503
5504 return page;
5505}
5506
5507#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5508static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5509 pte_t ptent, swp_entry_t *entry)
5510{
5511 struct page *page = NULL;
5512 swp_entry_t ent = pte_to_swp_entry(ptent);
5513
5514 if (!(mc.flags & MOVE_ANON))
5515 return NULL;
5516
5517
5518
5519
5520
5521
5522 if (is_device_private_entry(ent)) {
5523 page = device_private_entry_to_page(ent);
5524
5525
5526
5527
5528 if (!page_ref_add_unless(page, 1, 1))
5529 return NULL;
5530 return page;
5531 }
5532
5533 if (non_swap_entry(ent))
5534 return NULL;
5535
5536
5537
5538
5539
5540 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5541 entry->val = ent.val;
5542
5543 return page;
5544}
5545#else
5546static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5547 pte_t ptent, swp_entry_t *entry)
5548{
5549 return NULL;
5550}
5551#endif
5552
5553static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5554 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5555{
5556 if (!vma->vm_file)
5557 return NULL;
5558 if (!(mc.flags & MOVE_FILE))
5559 return NULL;
5560
5561
5562
5563 return find_get_incore_page(vma->vm_file->f_mapping,
5564 linear_page_index(vma, addr));
5565}
5566
/**
 * mem_cgroup_move_account - move accounting of a page between cgroups
 * @page: the page
 * @compound: charge the page as a compound (THP) page or a small page
 * @from: mem_cgroup which the page is moved from
 * @to: mem_cgroup which the page is moved to, @from != @to
 *
 * The caller must make sure the page is not on the LRU (isolate_lru_page()
 * is useful when the page is not on the LRU yet, e.g. a newly allocated one).
 *
 * This function does not charge @to and does not uncharge @from; it only
 * moves the existing statistics and memcg binding of the page.
 */
5579static int mem_cgroup_move_account(struct page *page,
5580 bool compound,
5581 struct mem_cgroup *from,
5582 struct mem_cgroup *to)
5583{
5584 struct lruvec *from_vec, *to_vec;
5585 struct pglist_data *pgdat;
5586 unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
5587 int ret;
5588
5589 VM_BUG_ON(from == to);
5590 VM_BUG_ON_PAGE(PageLRU(page), page);
5591 VM_BUG_ON(compound && !PageTransHuge(page));
5592
	/*
	 * Prevent mem_cgroup_migrate() from looking at the page's memcg
	 * while we change it: the page lock serializes against migration.
	 */
5597 ret = -EBUSY;
5598 if (!trylock_page(page))
5599 goto out;
5600
5601 ret = -EINVAL;
5602 if (page_memcg(page) != from)
5603 goto out_unlock;
5604
5605 pgdat = page_pgdat(page);
5606 from_vec = mem_cgroup_lruvec(from, pgdat);
5607 to_vec = mem_cgroup_lruvec(to, pgdat);
5608
5609 lock_page_memcg(page);
5610
5611 if (PageAnon(page)) {
5612 if (page_mapped(page)) {
5613 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5614 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5615 if (PageTransHuge(page)) {
5616 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5617 -nr_pages);
5618 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5619 nr_pages);
5620 }
5621 }
5622 } else {
5623 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5624 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5625
5626 if (PageSwapBacked(page)) {
5627 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5628 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5629 }
5630
5631 if (page_mapped(page)) {
5632 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5633 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5634 }
5635
5636 if (PageDirty(page)) {
5637 struct address_space *mapping = page_mapping(page);
5638
5639 if (mapping_can_writeback(mapping)) {
5640 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5641 -nr_pages);
5642 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5643 nr_pages);
5644 }
5645 }
5646 }
5647
5648 if (PageWriteback(page)) {
5649 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5650 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5651 }
5652
	/*
	 * All state has been migrated, switch the page to the new memcg.
	 *
	 * It is safe to change page->memcg_data here because the page is
	 * referenced, charged, isolated, and locked: we cannot race with
	 * (un)charging, migration, LRU putback, or anything else that would
	 * rely on a stable memcg binding.
	 *
	 * Note that lock_page_memcg() is a memcg lock, not a page lock: as
	 * soon as the page is switched to a memcg that is not locked, the
	 * state above can change again, so make sure all of the updates are
	 * ordered before the switch below.
	 */
5666 smp_mb();
5667
5668 css_get(&to->css);
5669 css_put(&from->css);
5670
5671 page->memcg_data = (unsigned long)to;
5672
5673 __unlock_page_memcg(from);
5674
5675 ret = 0;
5676
5677 local_irq_disable();
5678 mem_cgroup_charge_statistics(to, page, nr_pages);
5679 memcg_check_events(to, page);
5680 mem_cgroup_charge_statistics(from, page, -nr_pages);
5681 memcg_check_events(from, page);
5682 local_irq_enable();
5683out_unlock:
5684 unlock_page(page);
5685out:
5686 return ret;
5687}
5688
/**
 * get_mctgt_type - check whether a pte is a target for move charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (can be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE   - the pte is not a target for move charge.
 *   MC_TARGET_PAGE   - the page corresponding to the pte is a target for
 *                      move charge; if @target is not NULL, the page is
 *                      stored in target->page with an extra reference taken
 *                      (callers must handle it).
 *   MC_TARGET_SWAP   - the swap entry corresponding to the pte is a target
 *                      for move charge; if @target is not NULL, the entry is
 *                      stored in target->ent.
 *   MC_TARGET_DEVICE - like MC_TARGET_PAGE, but the page is a
 *                      MEMORY_DEVICE_PRIVATE (ZONE_DEVICE) page and thus is
 *                      not on the LRU.
 *
 * Called with the pte lock held.
 */
5715static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5716 unsigned long addr, pte_t ptent, union mc_target *target)
5717{
5718 struct page *page = NULL;
5719 enum mc_target_type ret = MC_TARGET_NONE;
5720 swp_entry_t ent = { .val = 0 };
5721
5722 if (pte_present(ptent))
5723 page = mc_handle_present_pte(vma, addr, ptent);
5724 else if (is_swap_pte(ptent))
5725 page = mc_handle_swap_pte(vma, ptent, &ent);
5726 else if (pte_none(ptent))
5727 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5728
5729 if (!page && !ent.val)
5730 return ret;
5731 if (page) {
		/*
		 * Do only a lockless check here; mem_cgroup_move_account()
		 * re-checks the page's memcg under the page lock before
		 * actually moving the charge.
		 */
5737 if (page_memcg(page) == mc.from) {
5738 ret = MC_TARGET_PAGE;
5739 if (is_device_private_page(page))
5740 ret = MC_TARGET_DEVICE;
5741 if (target)
5742 target->page = page;
5743 }
5744 if (!ret || !target)
5745 put_page(page);
5746 }
5747
	/*
	 * There is a swap entry and the page either doesn't exist or isn't
	 * charged to mc.from; the swap charge can be moved unless the page
	 * is part of a transparent huge page.
	 */
5751 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5752 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5753 ret = MC_TARGET_SWAP;
5754 if (target)
5755 target->ent = ent;
5756 }
5757 return ret;
5758}
5759
5760#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5761
5762
5763
5764
5765
5766static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5767 unsigned long addr, pmd_t pmd, union mc_target *target)
5768{
5769 struct page *page = NULL;
5770 enum mc_target_type ret = MC_TARGET_NONE;
5771
5772 if (unlikely(is_swap_pmd(pmd))) {
5773 VM_BUG_ON(thp_migration_supported() &&
5774 !is_pmd_migration_entry(pmd));
5775 return ret;
5776 }
5777 page = pmd_page(pmd);
5778 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5779 if (!(mc.flags & MOVE_ANON))
5780 return ret;
5781 if (page_memcg(page) == mc.from) {
5782 ret = MC_TARGET_PAGE;
5783 if (target) {
5784 get_page(page);
5785 target->page = page;
5786 }
5787 }
5788 return ret;
5789}
5790#else
5791static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5792 unsigned long addr, pmd_t pmd, union mc_target *target)
5793{
5794 return MC_TARGET_NONE;
5795}
5796#endif
5797
5798static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5799 unsigned long addr, unsigned long end,
5800 struct mm_walk *walk)
5801{
5802 struct vm_area_struct *vma = walk->vma;
5803 pte_t *pte;
5804 spinlock_t *ptl;
5805
5806 ptl = pmd_trans_huge_lock(pmd, vma);
5807 if (ptl) {
		/*
		 * Note there cannot be MC_TARGET_DEVICE here for now, as
		 * transparent huge pages backed by MEMORY_DEVICE_PRIVATE are
		 * not supported (this might change).
		 */
5813 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5814 mc.precharge += HPAGE_PMD_NR;
5815 spin_unlock(ptl);
5816 return 0;
5817 }
5818
5819 if (pmd_trans_unstable(pmd))
5820 return 0;
5821 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5822 for (; addr != end; pte++, addr += PAGE_SIZE)
5823 if (get_mctgt_type(vma, addr, *pte, NULL))
5824 mc.precharge++;
5825 pte_unmap_unlock(pte - 1, ptl);
5826 cond_resched();
5827
5828 return 0;
5829}
5830
5831static const struct mm_walk_ops precharge_walk_ops = {
5832 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5833};
5834
5835static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5836{
5837 unsigned long precharge;
5838
5839 mmap_read_lock(mm);
5840 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5841 mmap_read_unlock(mm);
5842
5843 precharge = mc.precharge;
5844 mc.precharge = 0;
5845
5846 return precharge;
5847}
5848
5849static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5850{
5851 unsigned long precharge = mem_cgroup_count_precharge(mm);
5852
5853 VM_BUG_ON(mc.moving_task);
5854 mc.moving_task = current;
5855 return mem_cgroup_do_precharge(precharge);
5856}
5857
5858
5859static void __mem_cgroup_clear_mc(void)
5860{
5861 struct mem_cgroup *from = mc.from;
5862 struct mem_cgroup *to = mc.to;
5863
5864
5865 if (mc.precharge) {
5866 cancel_charge(mc.to, mc.precharge);
5867 mc.precharge = 0;
5868 }
5869
5870
5871
5872
5873 if (mc.moved_charge) {
5874 cancel_charge(mc.from, mc.moved_charge);
5875 mc.moved_charge = 0;
5876 }
5877
5878 if (mc.moved_swap) {
5879
5880 if (!mem_cgroup_is_root(mc.from))
5881 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5882
5883 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5884
5885
5886
5887
5888
5889 if (!mem_cgroup_is_root(mc.to))
5890 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5891
5892 mc.moved_swap = 0;
5893 }
5894 memcg_oom_recover(from);
5895 memcg_oom_recover(to);
5896 wake_up_all(&mc.waitq);
5897}
5898
5899static void mem_cgroup_clear_mc(void)
5900{
5901 struct mm_struct *mm = mc.mm;
5902
	/*
	 * We must clear mc.moving_task before waking up the waiters at the
	 * end of the charge move, in __mem_cgroup_clear_mc().
	 */
5907 mc.moving_task = NULL;
5908 __mem_cgroup_clear_mc();
5909 spin_lock(&mc.lock);
5910 mc.from = NULL;
5911 mc.to = NULL;
5912 mc.mm = NULL;
5913 spin_unlock(&mc.lock);
5914
5915 mmput(mm);
5916}
5917
5918static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5919{
5920 struct cgroup_subsys_state *css;
5921 struct mem_cgroup *memcg = NULL;
5922 struct mem_cgroup *from;
5923 struct task_struct *leader, *p;
5924 struct mm_struct *mm;
5925 unsigned long move_flags;
5926 int ret = 0;
5927
5928
5929 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5930 return 0;
5931
	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used. Perform charge immigration
	 * if @tset contains a leader and warn if there are multiple.
	 */
5938 p = NULL;
5939 cgroup_taskset_for_each_leader(leader, css, tset) {
5940 WARN_ON_ONCE(p);
5941 p = leader;
5942 memcg = mem_cgroup_from_css(css);
5943 }
5944 if (!p)
5945 return 0;
5946
	/*
	 * We are committed to this value however it changes later: updates
	 * to move_charge_at_immigrate only affect upcoming migrations, not
	 * the current one, so take a snapshot here.
	 */
5952 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5953 if (!move_flags)
5954 return 0;
5955
5956 from = mem_cgroup_from_task(p);
5957
5958 VM_BUG_ON(from == memcg);
5959
5960 mm = get_task_mm(p);
5961 if (!mm)
5962 return 0;
5963
5964 if (mm->owner == p) {
5965 VM_BUG_ON(mc.from);
5966 VM_BUG_ON(mc.to);
5967 VM_BUG_ON(mc.precharge);
5968 VM_BUG_ON(mc.moved_charge);
5969 VM_BUG_ON(mc.moved_swap);
5970
5971 spin_lock(&mc.lock);
5972 mc.mm = mm;
5973 mc.from = from;
5974 mc.to = memcg;
5975 mc.flags = move_flags;
5976 spin_unlock(&mc.lock);
5977
5978
5979 ret = mem_cgroup_precharge_mc(mm);
5980 if (ret)
5981 mem_cgroup_clear_mc();
5982 } else {
5983 mmput(mm);
5984 }
5985 return ret;
5986}
5987
5988static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5989{
5990 if (mc.to)
5991 mem_cgroup_clear_mc();
5992}
5993
5994static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5995 unsigned long addr, unsigned long end,
5996 struct mm_walk *walk)
5997{
5998 int ret = 0;
5999 struct vm_area_struct *vma = walk->vma;
6000 pte_t *pte;
6001 spinlock_t *ptl;
6002 enum mc_target_type target_type;
6003 union mc_target target;
6004 struct page *page;
6005
6006 ptl = pmd_trans_huge_lock(pmd, vma);
6007 if (ptl) {
6008 if (mc.precharge < HPAGE_PMD_NR) {
6009 spin_unlock(ptl);
6010 return 0;
6011 }
6012 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6013 if (target_type == MC_TARGET_PAGE) {
6014 page = target.page;
6015 if (!isolate_lru_page(page)) {
6016 if (!mem_cgroup_move_account(page, true,
6017 mc.from, mc.to)) {
6018 mc.precharge -= HPAGE_PMD_NR;
6019 mc.moved_charge += HPAGE_PMD_NR;
6020 }
6021 putback_lru_page(page);
6022 }
6023 put_page(page);
6024 } else if (target_type == MC_TARGET_DEVICE) {
6025 page = target.page;
6026 if (!mem_cgroup_move_account(page, true,
6027 mc.from, mc.to)) {
6028 mc.precharge -= HPAGE_PMD_NR;
6029 mc.moved_charge += HPAGE_PMD_NR;
6030 }
6031 put_page(page);
6032 }
6033 spin_unlock(ptl);
6034 return 0;
6035 }
6036
6037 if (pmd_trans_unstable(pmd))
6038 return 0;
6039retry:
6040 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6041 for (; addr != end; addr += PAGE_SIZE) {
6042 pte_t ptent = *(pte++);
6043 bool device = false;
6044 swp_entry_t ent;
6045
6046 if (!mc.precharge)
6047 break;
6048
6049 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6050 case MC_TARGET_DEVICE:
6051 device = true;
6052 fallthrough;
6053 case MC_TARGET_PAGE:
6054 page = target.page;
			/*
			 * We can see part of a split pmd here. Moving its
			 * pages could be done but would be too convoluted, so
			 * simply skip such a partial THP and leave it in the
			 * original memcg; somebody else is mapping the head
			 * page.
			 */
6061 if (PageTransCompound(page))
6062 goto put;
6063 if (!device && isolate_lru_page(page))
6064 goto put;
6065 if (!mem_cgroup_move_account(page, false,
6066 mc.from, mc.to)) {
6067 mc.precharge--;
6068
6069 mc.moved_charge++;
6070 }
6071 if (!device)
6072 putback_lru_page(page);
6073put:
6074 put_page(page);
6075 break;
6076 case MC_TARGET_SWAP:
6077 ent = target.ent;
6078 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6079 mc.precharge--;
6080 mem_cgroup_id_get_many(mc.to, 1);
6081
6082 mc.moved_swap++;
6083 }
6084 break;
6085 default:
6086 break;
6087 }
6088 }
6089 pte_unmap_unlock(pte - 1, ptl);
6090 cond_resched();
6091
6092 if (addr != end) {
		/*
		 * We have consumed all the precharges we got in can_attach();
		 * try to charge a single page at a time. If even that fails,
		 * the walk is aborted and no further charges are attempted.
		 */
6099 ret = mem_cgroup_do_precharge(1);
6100 if (!ret)
6101 goto retry;
6102 }
6103
6104 return ret;
6105}
6106
6107static const struct mm_walk_ops charge_walk_ops = {
6108 .pmd_entry = mem_cgroup_move_charge_pte_range,
6109};
6110
6111static void mem_cgroup_move_charge(void)
6112{
6113 lru_add_drain_all();
6114
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock while we're
	 * moving its pages to another memcg, then wait for already started
	 * RCU-only updates to finish.
	 */
6119 atomic_inc(&mc.from->moving_account);
6120 synchronize_rcu();
6121retry:
6122 if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone holding the mmap_lock may be waiting on our waitq,
		 * so cancel all the extra charges, wake up the waiters, and
		 * retry. Because precharges get cancelled we might not be
		 * able to move all charges, but charge moving is a
		 * best-effort feature anyway.
		 */
6130 __mem_cgroup_clear_mc();
6131 cond_resched();
6132 goto retry;
6133 }
6134
	/*
	 * When all precharges are consumed and an additional charge fails,
	 * the page walk simply aborts.
	 */
6138 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6139 NULL);
6140
6141 mmap_read_unlock(mc.mm);
6142 atomic_dec(&mc.from->moving_account);
6143}
6144
6145static void mem_cgroup_move_task(void)
6146{
6147 if (mc.to) {
6148 mem_cgroup_move_charge();
6149 mem_cgroup_clear_mc();
6150 }
6151}
6152#else
6153static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6154{
6155 return 0;
6156}
6157static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6158{
6159}
6160static void mem_cgroup_move_task(void)
6161{
6162}
6163#endif
6164
6165static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6166{
6167 if (value == PAGE_COUNTER_MAX)
6168 seq_puts(m, "max\n");
6169 else
6170 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6171
6172 return 0;
6173}
6174
6175static u64 memory_current_read(struct cgroup_subsys_state *css,
6176 struct cftype *cft)
6177{
6178 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6179
6180 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6181}
6182
6183static int memory_min_show(struct seq_file *m, void *v)
6184{
6185 return seq_puts_memcg_tunable(m,
6186 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6187}
6188
6189static ssize_t memory_min_write(struct kernfs_open_file *of,
6190 char *buf, size_t nbytes, loff_t off)
6191{
6192 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6193 unsigned long min;
6194 int err;
6195
6196 buf = strstrip(buf);
6197 err = page_counter_memparse(buf, "max", &min);
6198 if (err)
6199 return err;
6200
6201 page_counter_set_min(&memcg->memory, min);
6202
6203 return nbytes;
6204}
6205
6206static int memory_low_show(struct seq_file *m, void *v)
6207{
6208 return seq_puts_memcg_tunable(m,
6209 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6210}
6211
6212static ssize_t memory_low_write(struct kernfs_open_file *of,
6213 char *buf, size_t nbytes, loff_t off)
6214{
6215 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6216 unsigned long low;
6217 int err;
6218
6219 buf = strstrip(buf);
6220 err = page_counter_memparse(buf, "max", &low);
6221 if (err)
6222 return err;
6223
6224 page_counter_set_low(&memcg->memory, low);
6225
6226 return nbytes;
6227}
6228
6229static int memory_high_show(struct seq_file *m, void *v)
6230{
6231 return seq_puts_memcg_tunable(m,
6232 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6233}
6234
6235static ssize_t memory_high_write(struct kernfs_open_file *of,
6236 char *buf, size_t nbytes, loff_t off)
6237{
6238 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6239 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6240 bool drained = false;
6241 unsigned long high;
6242 int err;
6243
6244 buf = strstrip(buf);
6245 err = page_counter_memparse(buf, "max", &high);
6246 if (err)
6247 return err;
6248
6249 page_counter_set_high(&memcg->memory, high);
6250
6251 for (;;) {
6252 unsigned long nr_pages = page_counter_read(&memcg->memory);
6253 unsigned long reclaimed;
6254
6255 if (nr_pages <= high)
6256 break;
6257
6258 if (signal_pending(current))
6259 break;
6260
6261 if (!drained) {
6262 drain_all_stock(memcg);
6263 drained = true;
6264 continue;
6265 }
6266
6267 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6268 GFP_KERNEL, true);
6269
6270 if (!reclaimed && !nr_retries--)
6271 break;
6272 }
6273
6274 memcg_wb_domain_size_changed(memcg);
6275 return nbytes;
6276}
6277
6278static int memory_max_show(struct seq_file *m, void *v)
6279{
6280 return seq_puts_memcg_tunable(m,
6281 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6282}
6283
6284static ssize_t memory_max_write(struct kernfs_open_file *of,
6285 char *buf, size_t nbytes, loff_t off)
6286{
6287 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6288 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6289 bool drained = false;
6290 unsigned long max;
6291 int err;
6292
6293 buf = strstrip(buf);
6294 err = page_counter_memparse(buf, "max", &max);
6295 if (err)
6296 return err;
6297
6298 xchg(&memcg->memory.max, max);
6299
6300 for (;;) {
6301 unsigned long nr_pages = page_counter_read(&memcg->memory);
6302
6303 if (nr_pages <= max)
6304 break;
6305
6306 if (signal_pending(current))
6307 break;
6308
6309 if (!drained) {
6310 drain_all_stock(memcg);
6311 drained = true;
6312 continue;
6313 }
6314
6315 if (nr_reclaims) {
6316 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6317 GFP_KERNEL, true))
6318 nr_reclaims--;
6319 continue;
6320 }
6321
6322 memcg_memory_event(memcg, MEMCG_OOM);
6323 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6324 break;
6325 }
6326
6327 memcg_wb_domain_size_changed(memcg);
6328 return nbytes;
6329}
6330
6331static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6332{
6333 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6334 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6335 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6336 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6337 seq_printf(m, "oom_kill %lu\n",
6338 atomic_long_read(&events[MEMCG_OOM_KILL]));
6339}
6340
6341static int memory_events_show(struct seq_file *m, void *v)
6342{
6343 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6344
6345 __memory_events_show(m, memcg->memory_events);
6346 return 0;
6347}
6348
6349static int memory_events_local_show(struct seq_file *m, void *v)
6350{
6351 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6352
6353 __memory_events_show(m, memcg->memory_events_local);
6354 return 0;
6355}
6356
6357static int memory_stat_show(struct seq_file *m, void *v)
6358{
6359 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6360 char *buf;
6361
6362 buf = memory_stat_format(memcg);
6363 if (!buf)
6364 return -ENOMEM;
6365 seq_puts(m, buf);
6366 kfree(buf);
6367 return 0;
6368}
6369
6370#ifdef CONFIG_NUMA
6371static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
6372 int item)
6373{
6374 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
6375}
6376
6377static int memory_numa_stat_show(struct seq_file *m, void *v)
6378{
6379 int i;
6380 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6381
6382 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6383 int nid;
6384
6385 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6386 continue;
6387
6388 seq_printf(m, "%s", memory_stats[i].name);
6389 for_each_node_state(nid, N_MEMORY) {
6390 u64 size;
6391 struct lruvec *lruvec;
6392
6393 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6394 size = lruvec_page_state_output(lruvec,
6395 memory_stats[i].idx);
6396 seq_printf(m, " N%d=%llu", nid, size);
6397 }
6398 seq_putc(m, '\n');
6399 }
6400
6401 return 0;
6402}
6403#endif
6404
6405static int memory_oom_group_show(struct seq_file *m, void *v)
6406{
6407 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6408
6409 seq_printf(m, "%d\n", memcg->oom_group);
6410
6411 return 0;
6412}
6413
6414static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6415 char *buf, size_t nbytes, loff_t off)
6416{
6417 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6418 int ret, oom_group;
6419
6420 buf = strstrip(buf);
6421 if (!buf)
6422 return -EINVAL;
6423
6424 ret = kstrtoint(buf, 0, &oom_group);
6425 if (ret)
6426 return ret;
6427
6428 if (oom_group != 0 && oom_group != 1)
6429 return -EINVAL;
6430
6431 memcg->oom_group = oom_group;
6432
6433 return nbytes;
6434}
6435
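/*
 * The cftypes below back the cgroup v2 memory interface. For example
 * (assuming the default cgroup2 mount point):
 *
 *   echo 512M > /sys/fs/cgroup/<group>/memory.high
 *   echo max  > /sys/fs/cgroup/<group>/memory.max
 *   cat /sys/fs/cgroup/<group>/memory.events
 *
 * Limit values are parsed by page_counter_memparse(), which accepts plain
 * byte counts, K/M/G-style suffixes, or the literal string "max".
 */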
6436static struct cftype memory_files[] = {
6437 {
6438 .name = "current",
6439 .flags = CFTYPE_NOT_ON_ROOT,
6440 .read_u64 = memory_current_read,
6441 },
6442 {
6443 .name = "min",
6444 .flags = CFTYPE_NOT_ON_ROOT,
6445 .seq_show = memory_min_show,
6446 .write = memory_min_write,
6447 },
6448 {
6449 .name = "low",
6450 .flags = CFTYPE_NOT_ON_ROOT,
6451 .seq_show = memory_low_show,
6452 .write = memory_low_write,
6453 },
6454 {
6455 .name = "high",
6456 .flags = CFTYPE_NOT_ON_ROOT,
6457 .seq_show = memory_high_show,
6458 .write = memory_high_write,
6459 },
6460 {
6461 .name = "max",
6462 .flags = CFTYPE_NOT_ON_ROOT,
6463 .seq_show = memory_max_show,
6464 .write = memory_max_write,
6465 },
6466 {
6467 .name = "events",
6468 .flags = CFTYPE_NOT_ON_ROOT,
6469 .file_offset = offsetof(struct mem_cgroup, events_file),
6470 .seq_show = memory_events_show,
6471 },
6472 {
6473 .name = "events.local",
6474 .flags = CFTYPE_NOT_ON_ROOT,
6475 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6476 .seq_show = memory_events_local_show,
6477 },
6478 {
6479 .name = "stat",
6480 .seq_show = memory_stat_show,
6481 },
6482#ifdef CONFIG_NUMA
6483 {
6484 .name = "numa_stat",
6485 .seq_show = memory_numa_stat_show,
6486 },
6487#endif
6488 {
6489 .name = "oom.group",
6490 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6491 .seq_show = memory_oom_group_show,
6492 .write = memory_oom_group_write,
6493 },
6494 { }
6495};
6496
6497struct cgroup_subsys memory_cgrp_subsys = {
6498 .css_alloc = mem_cgroup_css_alloc,
6499 .css_online = mem_cgroup_css_online,
6500 .css_offline = mem_cgroup_css_offline,
6501 .css_released = mem_cgroup_css_released,
6502 .css_free = mem_cgroup_css_free,
6503 .css_reset = mem_cgroup_css_reset,
6504 .can_attach = mem_cgroup_can_attach,
6505 .cancel_attach = mem_cgroup_cancel_attach,
6506 .post_attach = mem_cgroup_move_task,
6507 .dfl_cftypes = memory_files,
6508 .legacy_cftypes = mem_cgroup_legacy_files,
6509 .early_init = 0,
6510};
6511
/*
 * Calculate an individual cgroup's effective memory protection, derived from
 * its own memory.min/low settings, its parent's and siblings' settings, and
 * the actual memory distribution in the subtree.
 *
 * The following rules apply:
 *
 * 1. At the first level of reclaim, effective protection is equal to the
 *    declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the parent's
 *    effective protection.
 *
 * 3. The user is allowed to overcommit the declared protection at a given
 *    level. In that case the parent's effective protection is distributed
 *    to the children in proportion to how much protection they have
 *    declared and actually use.
 *
 * 4. With the memory_recursive_prot mount option, protection the parent
 *    enjoys but that the children have not explicitly claimed is
 *    additionally distributed among the children in proportion to their
 *    unprotected usage.
 */
6555static unsigned long effective_protection(unsigned long usage,
6556 unsigned long parent_usage,
6557 unsigned long setting,
6558 unsigned long parent_effective,
6559 unsigned long siblings_protected)
6560{
6561 unsigned long protected;
6562 unsigned long ep;
6563
6564 protected = min(usage, setting);
6565
	/*
	 * If the children at this level together claim (and use) more
	 * protection than the parent can effectively pass down, hand out the
	 * parent's effective protection proportionally to each child's own
	 * utilized claim. This keeps the distribution both proportional and
	 * work-conserving: protection a sibling does not use is available to
	 * the others.
	 */
6575 if (siblings_protected > parent_effective)
6576 return protected * parent_effective / siblings_protected;
6577
	/*
	 * Otherwise the parent's effective protection covers what all the
	 * children claim together, and each child is protected up to its own
	 * utilized setting (which here can never exceed the siblings' total,
	 * which in turn does not exceed the parent's effective protection).
	 */
6593 ep = protected;
6594
	/*
	 * If the children are not claiming all of the protection the parent
	 * affords them, distribute the remainder in proportion to each
	 * child's unprotected usage. That way cgroups that are not
	 * explicitly prioritized against each other still benefit from an
	 * ancestor's protection. Because this can protect pages a cgroup did
	 * not ask protection for, it is only done when the
	 * memory_recursive_prot mount option is enabled (checked below).
	 */
6611 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6612 return ep;
6613 if (parent_effective > siblings_protected &&
6614 parent_usage > siblings_protected &&
6615 usage > protected) {
6616 unsigned long unclaimed;
6617
6618 unclaimed = parent_effective - siblings_protected;
6619 unclaimed *= usage - protected;
6620 unclaimed /= parent_usage - siblings_protected;
6621
6622 ep += unclaimed;
6623 }
6624
6625 return ep;
6626}
6627
/**
 * mem_cgroup_calculate_protection - compute a memcg's effective min/low
 * @root: the top ancestor of the sub-tree being reclaimed
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
6636void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6637 struct mem_cgroup *memcg)
6638{
6639 unsigned long usage, parent_usage;
6640 struct mem_cgroup *parent;
6641
6642 if (mem_cgroup_disabled())
6643 return;
6644
6645 if (!root)
6646 root = root_mem_cgroup;
6647
6648
6649
6650
6651
6652
6653
6654
6655 if (memcg == root)
6656 return;
6657
6658 usage = page_counter_read(&memcg->memory);
6659 if (!usage)
6660 return;
6661
6662 parent = parent_mem_cgroup(memcg);
6663
6664 if (!parent)
6665 return;
6666
6667 if (parent == root) {
6668 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6669 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6670 return;
6671 }
6672
6673 parent_usage = page_counter_read(&parent->memory);
6674
6675 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6676 READ_ONCE(memcg->memory.min),
6677 READ_ONCE(parent->memory.emin),
6678 atomic_long_read(&parent->memory.children_min_usage)));
6679
6680 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6681 READ_ONCE(memcg->memory.low),
6682 READ_ONCE(parent->memory.elow),
6683 atomic_long_read(&parent->memory.children_low_usage)));
6684}
6685
/**
 * mem_cgroup_charge - charge a newly allocated page to a cgroup
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming pages
 * according to @gfp_mask if necessary. If @page is a swapcache page, the
 * charge is taken against the memcg recorded for the swap entry instead,
 * provided that memcg is still online.
 *
 * Returns 0 on success, an error code otherwise.
 */
6697int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
6698{
6699 unsigned int nr_pages = thp_nr_pages(page);
6700 struct mem_cgroup *memcg = NULL;
6701 int ret = 0;
6702
6703 if (mem_cgroup_disabled())
6704 goto out;
6705
6706 if (PageSwapCache(page)) {
6707 swp_entry_t ent = { .val = page_private(page), };
6708 unsigned short id;
6709
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, so bail out as early as possible; shmem_unuse() can
		 * hit already-charged pages, too. The page/memcg binding is
		 * protected by the page lock, which serializes swap cache
		 * removal, which in turn serializes uncharging.
		 */
6717 VM_BUG_ON_PAGE(!PageLocked(page), page);
6718 if (page_memcg(compound_head(page)))
6719 goto out;
6720
6721 id = lookup_swap_cgroup_id(ent);
6722 rcu_read_lock();
6723 memcg = mem_cgroup_from_id(id);
6724 if (memcg && !css_tryget_online(&memcg->css))
6725 memcg = NULL;
6726 rcu_read_unlock();
6727 }
6728
6729 if (!memcg)
6730 memcg = get_mem_cgroup_from_mm(mm);
6731
6732 ret = try_charge(memcg, gfp_mask, nr_pages);
6733 if (ret)
6734 goto out_put;
6735
6736 css_get(&memcg->css);
6737 commit_charge(page, memcg);
6738
6739 local_irq_disable();
6740 mem_cgroup_charge_statistics(memcg, page, nr_pages);
6741 memcg_check_events(memcg, page);
6742 local_irq_enable();
6743
	/*
	 * Cgroup1's unified memory+swap counter has been charged with the
	 * new swapcache page, so finish the transfer by uncharging the swap
	 * slot. The slot would also get uncharged when it dies, but it can
	 * stick around indefinitely and we'd count the page twice in the
	 * meantime.
	 *
	 * Cgroup2 has separate resource counters for memory and swap, so
	 * this is a non-issue there: swapcache readahead pages are charged
	 * before being swapped in, and the swap slot holds the charge for
	 * the whole swapped-out period.
	 */
6756 if (do_memsw_account() && PageSwapCache(page)) {
6757 swp_entry_t entry = { .val = page_private(page) };
6758
		/*
		 * The swap entry might not get freed for a long time, so do
		 * not wait for it: the page already received a memory+swap
		 * charge, just drop the duplicate swap entry charge here.
		 */
6763 mem_cgroup_uncharge_swap(entry, nr_pages);
6764 }
6765
6766out_put:
6767 css_put(&memcg->css);
6768out:
6769 return ret;
6770}
6771
6772struct uncharge_gather {
6773 struct mem_cgroup *memcg;
6774 unsigned long nr_pages;
6775 unsigned long pgpgout;
6776 unsigned long nr_kmem;
6777 struct page *dummy_page;
6778};
6779
6780static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6781{
6782 memset(ug, 0, sizeof(*ug));
6783}
6784
6785static void uncharge_batch(const struct uncharge_gather *ug)
6786{
6787 unsigned long flags;
6788
6789 if (!mem_cgroup_is_root(ug->memcg)) {
6790 page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
6791 if (do_memsw_account())
6792 page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
6793 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6794 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6795 memcg_oom_recover(ug->memcg);
6796 }
6797
6798 local_irq_save(flags);
6799 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6800 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
6801 memcg_check_events(ug->memcg, ug->dummy_page);
6802 local_irq_restore(flags);
6803
6804
6805 css_put(&ug->memcg->css);
6806}
6807
6808static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6809{
6810 unsigned long nr_pages;
6811
6812 VM_BUG_ON_PAGE(PageLRU(page), page);
6813
6814 if (!page_memcg(page))
6815 return;
6816
	/*
	 * Nobody should be changing or seriously looking at
	 * page_memcg(page) at this point: we have fully exclusive access to
	 * the page being uncharged.
	 */
6823 if (ug->memcg != page_memcg(page)) {
6824 if (ug->memcg) {
6825 uncharge_batch(ug);
6826 uncharge_gather_clear(ug);
6827 }
6828 ug->memcg = page_memcg(page);
6829
6830
6831 css_get(&ug->memcg->css);
6832 }
6833
6834 nr_pages = compound_nr(page);
6835 ug->nr_pages += nr_pages;
6836
6837 if (PageMemcgKmem(page))
6838 ug->nr_kmem += nr_pages;
6839 else
6840 ug->pgpgout++;
6841
6842 ug->dummy_page = page;
6843 page->memcg_data = 0;
6844 css_put(&ug->memcg->css);
6845}
6846
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_charge().
 */
6853void mem_cgroup_uncharge(struct page *page)
6854{
6855 struct uncharge_gather ug;
6856
6857 if (mem_cgroup_disabled())
6858 return;
6859
6860
6861 if (!page_memcg(page))
6862 return;
6863
6864 uncharge_gather_clear(&ug);
6865 uncharge_page(page, &ug);
6866 uncharge_batch(&ug);
6867}
6868
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with mem_cgroup_charge(),
 * batching uncharges of pages that belong to the same memcg.
 */
6876void mem_cgroup_uncharge_list(struct list_head *page_list)
6877{
6878 struct uncharge_gather ug;
6879 struct page *page;
6880
6881 if (mem_cgroup_disabled())
6882 return;
6883
6884 uncharge_gather_clear(&ug);
6885 list_for_each_entry(page, page_list, lru)
6886 uncharge_page(page, &ug);
6887 if (ug.memcg)
6888 uncharge_batch(&ug);
6889}
6890
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6901void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6902{
6903 struct mem_cgroup *memcg;
6904 unsigned int nr_pages;
6905 unsigned long flags;
6906
6907 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6908 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6909 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6910 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6911 newpage);
6912
6913 if (mem_cgroup_disabled())
6914 return;
6915
6916
6917 if (page_memcg(newpage))
6918 return;
6919
6920 memcg = page_memcg(oldpage);
6921 VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
6922 if (!memcg)
6923 return;
6924
6925
6926 nr_pages = thp_nr_pages(newpage);
6927
6928 page_counter_charge(&memcg->memory, nr_pages);
6929 if (do_memsw_account())
6930 page_counter_charge(&memcg->memsw, nr_pages);
6931
6932 css_get(&memcg->css);
6933 commit_charge(newpage, memcg);
6934
6935 local_irq_save(flags);
6936 mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
6937 memcg_check_events(memcg, newpage);
6938 local_irq_restore(flags);
6939}
6940
6941DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6942EXPORT_SYMBOL(memcg_sockets_enabled_key);
6943
6944void mem_cgroup_sk_alloc(struct sock *sk)
6945{
6946 struct mem_cgroup *memcg;
6947
6948 if (!mem_cgroup_sockets_enabled)
6949 return;
6950
6951
6952 if (in_interrupt())
6953 return;
6954
6955 rcu_read_lock();
6956 memcg = mem_cgroup_from_task(current);
6957 if (memcg == root_mem_cgroup)
6958 goto out;
6959 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6960 goto out;
6961 if (css_tryget(&memcg->css))
6962 sk->sk_memcg = memcg;
6963out:
6964 rcu_read_unlock();
6965}
6966
6967void mem_cgroup_sk_free(struct sock *sk)
6968{
6969 if (sk->sk_memcg)
6970 css_put(&sk->sk_memcg->css);
6971}
6972
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if it had to be forced.
 */
6981bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6982{
6983 gfp_t gfp_mask = GFP_KERNEL;
6984
6985 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6986 struct page_counter *fail;
6987
6988 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6989 memcg->tcpmem_pressure = 0;
6990 return true;
6991 }
6992 page_counter_charge(&memcg->tcpmem, nr_pages);
6993 memcg->tcpmem_pressure = 1;
6994 return false;
6995 }
6996
6997
6998 if (in_softirq())
6999 gfp_mask = GFP_NOWAIT;
7000
7001 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7002
7003 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
7004 return true;
7005
7006 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
7007 return false;
7008}
7009
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
7015void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7016{
7017 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7018 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7019 return;
7020 }
7021
7022 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7023
7024 refill_stock(memcg, nr_pages);
7025}
7026
7027static int __init cgroup_memory(char *s)
7028{
7029 char *token;
7030
7031 while ((token = strsep(&s, ",")) != NULL) {
7032 if (!*token)
7033 continue;
7034 if (!strcmp(token, "nosocket"))
7035 cgroup_memory_nosocket = true;
7036 if (!strcmp(token, "nokmem"))
7037 cgroup_memory_nokmem = true;
7038 }
7039 return 0;
7040}
7041__setup("cgroup.memory=", cgroup_memory);
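/*
 * For example, booting with "cgroup.memory=nosocket,nokmem" disables socket
 * memory accounting and kernel memory accounting; tokens not recognized by
 * the parser above are silently ignored.
 */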
7042
/*
 * subsys_initcall() for the memory controller.
 *
 * Everything that doesn't depend on a specific mem_cgroup structure is set
 * up here: the CPU hotplug dead callback, the per-CPU stock draining work,
 * and the per-node soft limit trees.
 */
7051static int __init mem_cgroup_init(void)
7052{
7053 int cpu, node;
7054
	/*
	 * An s32 (see struct batched_lruvec_stat) is used for the per-memcg,
	 * per-cpu caching of per-node statistics; make sure the overfill
	 * threshold of those caches (MEMCG_CHARGE_BATCH pages) cannot exceed
	 * S32_MAX / PAGE_SIZE.
	 */
7061 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7062
7063 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7064 memcg_hotplug_cpu_dead);
7065
7066 for_each_possible_cpu(cpu)
7067 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7068 drain_local_stock);
7069
7070 for_each_node(node) {
7071 struct mem_cgroup_tree_per_node *rtpn;
7072
7073 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7074 node_online(node) ? node : NUMA_NO_NODE);
7075
7076 rtpn->rb_root = RB_ROOT;
7077 rtpn->rb_rightmost = NULL;
7078 spin_lock_init(&rtpn->lock);
7079 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7080 }
7081
7082 return 0;
7083}
7084subsys_initcall(mem_cgroup_init);
7085
7086#ifdef CONFIG_MEMCG_SWAP
7087static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7088{
7089 while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
7094 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7095 VM_BUG_ON(1);
7096 break;
7097 }
7098 memcg = parent_mem_cgroup(memcg);
7099 if (!memcg)
7100 memcg = root_mem_cgroup;
7101 }
7102 return memcg;
7103}
7104
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
7112void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
7113{
7114 struct mem_cgroup *memcg, *swap_memcg;
7115 unsigned int nr_entries;
7116 unsigned short oldid;
7117
7118 VM_BUG_ON_PAGE(PageLRU(page), page);
7119 VM_BUG_ON_PAGE(page_count(page), page);
7120
7121 if (mem_cgroup_disabled())
7122 return;
7123
7124 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7125 return;
7126
7127 memcg = page_memcg(page);
7128
7129 VM_WARN_ON_ONCE_PAGE(!memcg, page);
7130 if (!memcg)
7131 return;
7132
	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
7138 swap_memcg = mem_cgroup_id_get_online(memcg);
7139 nr_entries = thp_nr_pages(page);
7140
7141 if (nr_entries > 1)
7142 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7143 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7144 nr_entries);
7145 VM_BUG_ON_PAGE(oldid, page);
7146 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7147
7148 page->memcg_data = 0;
7149
7150 if (!mem_cgroup_is_root(memcg))
7151 page_counter_uncharge(&memcg->memory, nr_entries);
7152
7153 if (!cgroup_memory_noswap && memcg != swap_memcg) {
7154 if (!mem_cgroup_is_root(swap_memcg))
7155 page_counter_charge(&swap_memcg->memsw, nr_entries);
7156 page_counter_uncharge(&memcg->memsw, nr_entries);
7157 }
7158
	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock, which is taken with interrupts off. Keeping
	 * interrupts disabled is important because it is the only
	 * synchronisation we have for updating the per-CPU variables.
	 */
7165 VM_BUG_ON(!irqs_disabled());
7166 mem_cgroup_charge_statistics(memcg, page, -nr_entries);
7167 memcg_check_events(memcg, page);
7168
7169 css_put(&memcg->css);
7170}
7171
/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
7181int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7182{
7183 unsigned int nr_pages = thp_nr_pages(page);
7184 struct page_counter *counter;
7185 struct mem_cgroup *memcg;
7186 unsigned short oldid;
7187
7188 if (mem_cgroup_disabled())
7189 return 0;
7190
7191 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7192 return 0;
7193
7194 memcg = page_memcg(page);
7195
7196 VM_WARN_ON_ONCE_PAGE(!memcg, page);
7197 if (!memcg)
7198 return 0;
7199
7200 if (!entry.val) {
7201 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7202 return 0;
7203 }
7204
7205 memcg = mem_cgroup_id_get_online(memcg);
7206
7207 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7208 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7209 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7210 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7211 mem_cgroup_id_put(memcg);
7212 return -ENOMEM;
7213 }
7214
7215
7216 if (nr_pages > 1)
7217 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7218 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7219 VM_BUG_ON_PAGE(oldid, page);
7220 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7221
7222 return 0;
7223}
7224
/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
7230void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7231{
7232 struct mem_cgroup *memcg;
7233 unsigned short id;
7234
7235 id = swap_cgroup_record(entry, 0, nr_pages);
7236 rcu_read_lock();
7237 memcg = mem_cgroup_from_id(id);
7238 if (memcg) {
7239 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7240 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7241 page_counter_uncharge(&memcg->swap, nr_pages);
7242 else
7243 page_counter_uncharge(&memcg->memsw, nr_pages);
7244 }
7245 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7246 mem_cgroup_id_put_many(memcg, nr_pages);
7247 }
7248 rcu_read_unlock();
7249}
7250
7251long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7252{
7253 long nr_swap_pages = get_nr_swap_pages();
7254
7255 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7256 return nr_swap_pages;
7257 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7258 nr_swap_pages = min_t(long, nr_swap_pages,
7259 READ_ONCE(memcg->swap.max) -
7260 page_counter_read(&memcg->swap));
7261 return nr_swap_pages;
7262}
7263
7264bool mem_cgroup_swap_full(struct page *page)
7265{
7266 struct mem_cgroup *memcg;
7267
7268 VM_BUG_ON_PAGE(!PageLocked(page), page);
7269
7270 if (vm_swap_full())
7271 return true;
7272 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7273 return false;
7274
7275 memcg = page_memcg(page);
7276 if (!memcg)
7277 return false;
7278
7279 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7280 unsigned long usage = page_counter_read(&memcg->swap);
7281
7282 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7283 usage * 2 >= READ_ONCE(memcg->swap.max))
7284 return true;
7285 }
7286
7287 return false;
7288}
7289
7290static int __init setup_swap_account(char *s)
7291{
7292 if (!strcmp(s, "1"))
7293 cgroup_memory_noswap = false;
7294 else if (!strcmp(s, "0"))
7295 cgroup_memory_noswap = true;
7296 return 1;
7297}
7298__setup("swapaccount=", setup_swap_account);
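/*
 * For example, booting with "swapaccount=0" disables cgroup swap accounting,
 * while "swapaccount=1" explicitly enables it; see setup_swap_account()
 * above.
 */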
7299
7300static u64 swap_current_read(struct cgroup_subsys_state *css,
7301 struct cftype *cft)
7302{
7303 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7304
7305 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7306}
7307
7308static int swap_high_show(struct seq_file *m, void *v)
7309{
7310 return seq_puts_memcg_tunable(m,
7311 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7312}
7313
7314static ssize_t swap_high_write(struct kernfs_open_file *of,
7315 char *buf, size_t nbytes, loff_t off)
7316{
7317 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7318 unsigned long high;
7319 int err;
7320
7321 buf = strstrip(buf);
7322 err = page_counter_memparse(buf, "max", &high);
7323 if (err)
7324 return err;
7325
7326 page_counter_set_high(&memcg->swap, high);
7327
7328 return nbytes;
7329}
7330
7331static int swap_max_show(struct seq_file *m, void *v)
7332{
7333 return seq_puts_memcg_tunable(m,
7334 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7335}
7336
7337static ssize_t swap_max_write(struct kernfs_open_file *of,
7338 char *buf, size_t nbytes, loff_t off)
7339{
7340 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7341 unsigned long max;
7342 int err;
7343
7344 buf = strstrip(buf);
7345 err = page_counter_memparse(buf, "max", &max);
7346 if (err)
7347 return err;
7348
7349 xchg(&memcg->swap.max, max);
7350
7351 return nbytes;
7352}
7353
7354static int swap_events_show(struct seq_file *m, void *v)
7355{
7356 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7357
7358 seq_printf(m, "high %lu\n",
7359 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7360 seq_printf(m, "max %lu\n",
7361 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7362 seq_printf(m, "fail %lu\n",
7363 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7364
7365 return 0;
7366}
7367
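/*
 * The cftypes below implement the cgroup v2 swap interface. With the
 * "memory." prefix added by the cgroup core they appear as
 * memory.swap.current, memory.swap.high, memory.swap.max and
 * memory.swap.events, e.g. (assuming the default cgroup2 mount point):
 *
 *   echo 1G > /sys/fs/cgroup/<group>/memory.swap.max
 */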
7368static struct cftype swap_files[] = {
7369 {
7370 .name = "swap.current",
7371 .flags = CFTYPE_NOT_ON_ROOT,
7372 .read_u64 = swap_current_read,
7373 },
7374 {
7375 .name = "swap.high",
7376 .flags = CFTYPE_NOT_ON_ROOT,
7377 .seq_show = swap_high_show,
7378 .write = swap_high_write,
7379 },
7380 {
7381 .name = "swap.max",
7382 .flags = CFTYPE_NOT_ON_ROOT,
7383 .seq_show = swap_max_show,
7384 .write = swap_max_write,
7385 },
7386 {
7387 .name = "swap.events",
7388 .flags = CFTYPE_NOT_ON_ROOT,
7389 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7390 .seq_show = swap_events_show,
7391 },
7392 { }
7393};
7394
7395static struct cftype memsw_files[] = {
7396 {
7397 .name = "memsw.usage_in_bytes",
7398 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7399 .read_u64 = mem_cgroup_read_u64,
7400 },
7401 {
7402 .name = "memsw.max_usage_in_bytes",
7403 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7404 .write = mem_cgroup_reset,
7405 .read_u64 = mem_cgroup_read_u64,
7406 },
7407 {
7408 .name = "memsw.limit_in_bytes",
7409 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7410 .write = mem_cgroup_write,
7411 .read_u64 = mem_cgroup_read_u64,
7412 },
7413 {
7414 .name = "memsw.failcnt",
7415 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7416 .write = mem_cgroup_reset,
7417 .read_u64 = mem_cgroup_read_u64,
7418 },
7419 { },
7420};
7421
/*
 * Register the swap control files. By the time this core_initcall() runs,
 * the "cgroup_disable=memory" and "swapaccount=" boot parameters have been
 * parsed, so cgroup_memory_noswap reflects the final configuration.
 */
7429static int __init mem_cgroup_swap_init(void)
7430{
	/* No memory control -> no swap control */
7432 if (mem_cgroup_disabled())
7433 cgroup_memory_noswap = true;
7434
7435 if (cgroup_memory_noswap)
7436 return 0;
7437
7438 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7439 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7440
7441 return 0;
7442}
7443core_initcall(mem_cgroup_swap_init);
7444
7445#endif
7446