1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28#include <linux/page_counter.h>
29#include <linux/memcontrol.h>
30#include <linux/cgroup.h>
31#include <linux/pagewalk.h>
32#include <linux/sched/mm.h>
33#include <linux/shmem_fs.h>
34#include <linux/hugetlb.h>
35#include <linux/pagemap.h>
36#include <linux/vm_event_item.h>
37#include <linux/smp.h>
38#include <linux/page-flags.h>
39#include <linux/backing-dev.h>
40#include <linux/bit_spinlock.h>
41#include <linux/rcupdate.h>
42#include <linux/limits.h>
43#include <linux/export.h>
44#include <linux/mutex.h>
45#include <linux/rbtree.h>
46#include <linux/slab.h>
47#include <linux/swap.h>
48#include <linux/swapops.h>
49#include <linux/spinlock.h>
50#include <linux/eventfd.h>
51#include <linux/poll.h>
52#include <linux/sort.h>
53#include <linux/fs.h>
54#include <linux/seq_file.h>
55#include <linux/vmpressure.h>
56#include <linux/mm_inline.h>
57#include <linux/swap_cgroup.h>
58#include <linux/cpu.h>
59#include <linux/oom.h>
60#include <linux/lockdep.h>
61#include <linux/file.h>
62#include <linux/tracehook.h>
63#include <linux/psi.h>
64#include <linux/seq_buf.h>
65#include "internal.h"
66#include <net/sock.h>
67#include <net/ip.h>
68#include "slab.h"
69
70#include <linux/uaccess.h>
71
72#include <trace/events/vmscan.h>
73
74struct cgroup_subsys memory_cgrp_subsys __read_mostly;
75EXPORT_SYMBOL(memory_cgrp_subsys);
76
77struct mem_cgroup *root_mem_cgroup __read_mostly;
78
79
80DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
81EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
82
83
84static bool cgroup_memory_nosocket __ro_after_init;
85
86
87bool cgroup_memory_nokmem __ro_after_init;
88
89
90#ifdef CONFIG_MEMCG_SWAP
91bool cgroup_memory_noswap __ro_after_init;
92#else
93#define cgroup_memory_noswap 1
94#endif
95
96#ifdef CONFIG_CGROUP_WRITEBACK
97static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
98#endif
99
100
101static bool do_memsw_account(void)
102{
103 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
104}
105
106
107static void flush_memcg_stats_dwork(struct work_struct *w);
108static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
109static DEFINE_SPINLOCK(stats_flush_lock);
110
111#define THRESHOLDS_EVENTS_TARGET 128
112#define SOFTLIMIT_EVENTS_TARGET 1024
113
114
115
116
117
118
119struct mem_cgroup_tree_per_node {
120 struct rb_root rb_root;
121 struct rb_node *rb_rightmost;
122 spinlock_t lock;
123};
124
125struct mem_cgroup_tree {
126 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
127};
128
129static struct mem_cgroup_tree soft_limit_tree __read_mostly;
130
131
132struct mem_cgroup_eventfd_list {
133 struct list_head list;
134 struct eventfd_ctx *eventfd;
135};
136
137
138
139
140struct mem_cgroup_event {
141
142
143
144 struct mem_cgroup *memcg;
145
146
147
148 struct eventfd_ctx *eventfd;
149
150
151
152 struct list_head list;
153
154
155
156
157
158 int (*register_event)(struct mem_cgroup *memcg,
159 struct eventfd_ctx *eventfd, const char *args);
160
161
162
163
164
165 void (*unregister_event)(struct mem_cgroup *memcg,
166 struct eventfd_ctx *eventfd);
167
168
169
170
171 poll_table pt;
172 wait_queue_head_t *wqh;
173 wait_queue_entry_t wait;
174 struct work_struct remove;
175};
176
177static void mem_cgroup_threshold(struct mem_cgroup *memcg);
178static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
179
180
181
182
183
184#define MOVE_ANON 0x1U
185#define MOVE_FILE 0x2U
186#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
187
188
189static struct move_charge_struct {
190 spinlock_t lock;
191 struct mm_struct *mm;
192 struct mem_cgroup *from;
193 struct mem_cgroup *to;
194 unsigned long flags;
195 unsigned long precharge;
196 unsigned long moved_charge;
197 unsigned long moved_swap;
198 struct task_struct *moving_task;
199 wait_queue_head_t waitq;
200} mc = {
201 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
202 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
203};
204
205
206
207
208
209#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
210#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
211
212
213enum res_type {
214 _MEM,
215 _MEMSWAP,
216 _OOM_TYPE,
217 _KMEM,
218 _TCP,
219};
220
221#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
222#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
223#define MEMFILE_ATTR(val) ((val) & 0xffff)
224
225#define OOM_CONTROL (0)
226
227
228
229
230
231
232#define for_each_mem_cgroup_tree(iter, root) \
233 for (iter = mem_cgroup_iter(root, NULL, NULL); \
234 iter != NULL; \
235 iter = mem_cgroup_iter(root, iter, NULL))
236
237#define for_each_mem_cgroup(iter) \
238 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
239 iter != NULL; \
240 iter = mem_cgroup_iter(NULL, iter, NULL))
241
242static inline bool should_force_charge(void)
243{
244 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
245 (current->flags & PF_EXITING);
246}
247
248
249struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
250{
251 if (!memcg)
252 memcg = root_mem_cgroup;
253 return &memcg->vmpressure;
254}
255
256struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
257{
258 return container_of(vmpr, struct mem_cgroup, vmpressure);
259}
260
261#ifdef CONFIG_MEMCG_KMEM
262extern spinlock_t css_set_lock;
263
264bool mem_cgroup_kmem_disabled(void)
265{
266 return cgroup_memory_nokmem;
267}
268
269static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
270 unsigned int nr_pages);
271
272static void obj_cgroup_release(struct percpu_ref *ref)
273{
274 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
275 unsigned int nr_bytes;
276 unsigned int nr_pages;
277 unsigned long flags;
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299 nr_bytes = atomic_read(&objcg->nr_charged_bytes);
300 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
301 nr_pages = nr_bytes >> PAGE_SHIFT;
302
303 if (nr_pages)
304 obj_cgroup_uncharge_pages(objcg, nr_pages);
305
306 spin_lock_irqsave(&css_set_lock, flags);
307 list_del(&objcg->list);
308 spin_unlock_irqrestore(&css_set_lock, flags);
309
310 percpu_ref_exit(ref);
311 kfree_rcu(objcg, rcu);
312}
313
314static struct obj_cgroup *obj_cgroup_alloc(void)
315{
316 struct obj_cgroup *objcg;
317 int ret;
318
319 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
320 if (!objcg)
321 return NULL;
322
323 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
324 GFP_KERNEL);
325 if (ret) {
326 kfree(objcg);
327 return NULL;
328 }
329 INIT_LIST_HEAD(&objcg->list);
330 return objcg;
331}
332
333static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
334 struct mem_cgroup *parent)
335{
336 struct obj_cgroup *objcg, *iter;
337
338 objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
339
340 spin_lock_irq(&css_set_lock);
341
342
343 list_add(&objcg->list, &memcg->objcg_list);
344
345 list_for_each_entry(iter, &memcg->objcg_list, list)
346 WRITE_ONCE(iter->memcg, parent);
347
348 list_splice(&memcg->objcg_list, &parent->objcg_list);
349
350 spin_unlock_irq(&css_set_lock);
351
352 percpu_ref_kill(&objcg->refcnt);
353}
354
355
356
357
358
359
360
361
362
363
364
365
366static DEFINE_IDA(memcg_cache_ida);
367int memcg_nr_cache_ids;
368
369
370static DECLARE_RWSEM(memcg_cache_ids_sem);
371
372void memcg_get_cache_ids(void)
373{
374 down_read(&memcg_cache_ids_sem);
375}
376
377void memcg_put_cache_ids(void)
378{
379 up_read(&memcg_cache_ids_sem);
380}
381
382
383
384
385
386
387
388
389
390
391
392
393
394#define MEMCG_CACHES_MIN_SIZE 4
395#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
396
397
398
399
400
401
402
403DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
404EXPORT_SYMBOL(memcg_kmem_enabled_key);
405#endif
406
407
408
409
410
411
412
413
414
415
416
417
418struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
419{
420 struct mem_cgroup *memcg;
421
422 memcg = page_memcg(page);
423
424 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
425 memcg = root_mem_cgroup;
426
427 return &memcg->css;
428}
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443ino_t page_cgroup_ino(struct page *page)
444{
445 struct mem_cgroup *memcg;
446 unsigned long ino = 0;
447
448 rcu_read_lock();
449 memcg = page_memcg_check(page);
450
451 while (memcg && !(memcg->css.flags & CSS_ONLINE))
452 memcg = parent_mem_cgroup(memcg);
453 if (memcg)
454 ino = cgroup_ino(memcg->css.cgroup);
455 rcu_read_unlock();
456 return ino;
457}
458
459static struct mem_cgroup_per_node *
460mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
461{
462 int nid = page_to_nid(page);
463
464 return memcg->nodeinfo[nid];
465}
466
467static struct mem_cgroup_tree_per_node *
468soft_limit_tree_node(int nid)
469{
470 return soft_limit_tree.rb_tree_per_node[nid];
471}
472
473static struct mem_cgroup_tree_per_node *
474soft_limit_tree_from_page(struct page *page)
475{
476 int nid = page_to_nid(page);
477
478 return soft_limit_tree.rb_tree_per_node[nid];
479}
480
481static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
482 struct mem_cgroup_tree_per_node *mctz,
483 unsigned long new_usage_in_excess)
484{
485 struct rb_node **p = &mctz->rb_root.rb_node;
486 struct rb_node *parent = NULL;
487 struct mem_cgroup_per_node *mz_node;
488 bool rightmost = true;
489
490 if (mz->on_tree)
491 return;
492
493 mz->usage_in_excess = new_usage_in_excess;
494 if (!mz->usage_in_excess)
495 return;
496 while (*p) {
497 parent = *p;
498 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
499 tree_node);
500 if (mz->usage_in_excess < mz_node->usage_in_excess) {
501 p = &(*p)->rb_left;
502 rightmost = false;
503 } else {
504 p = &(*p)->rb_right;
505 }
506 }
507
508 if (rightmost)
509 mctz->rb_rightmost = &mz->tree_node;
510
511 rb_link_node(&mz->tree_node, parent, p);
512 rb_insert_color(&mz->tree_node, &mctz->rb_root);
513 mz->on_tree = true;
514}
515
516static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
517 struct mem_cgroup_tree_per_node *mctz)
518{
519 if (!mz->on_tree)
520 return;
521
522 if (&mz->tree_node == mctz->rb_rightmost)
523 mctz->rb_rightmost = rb_prev(&mz->tree_node);
524
525 rb_erase(&mz->tree_node, &mctz->rb_root);
526 mz->on_tree = false;
527}
528
529static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
530 struct mem_cgroup_tree_per_node *mctz)
531{
532 unsigned long flags;
533
534 spin_lock_irqsave(&mctz->lock, flags);
535 __mem_cgroup_remove_exceeded(mz, mctz);
536 spin_unlock_irqrestore(&mctz->lock, flags);
537}
538
539static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
540{
541 unsigned long nr_pages = page_counter_read(&memcg->memory);
542 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
543 unsigned long excess = 0;
544
545 if (nr_pages > soft_limit)
546 excess = nr_pages - soft_limit;
547
548 return excess;
549}
550
551static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
552{
553 unsigned long excess;
554 struct mem_cgroup_per_node *mz;
555 struct mem_cgroup_tree_per_node *mctz;
556
557 mctz = soft_limit_tree_from_page(page);
558 if (!mctz)
559 return;
560
561
562
563
564 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
565 mz = mem_cgroup_page_nodeinfo(memcg, page);
566 excess = soft_limit_excess(memcg);
567
568
569
570
571 if (excess || mz->on_tree) {
572 unsigned long flags;
573
574 spin_lock_irqsave(&mctz->lock, flags);
575
576 if (mz->on_tree)
577 __mem_cgroup_remove_exceeded(mz, mctz);
578
579
580
581
582 __mem_cgroup_insert_exceeded(mz, mctz, excess);
583 spin_unlock_irqrestore(&mctz->lock, flags);
584 }
585 }
586}
587
588static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
589{
590 struct mem_cgroup_tree_per_node *mctz;
591 struct mem_cgroup_per_node *mz;
592 int nid;
593
594 for_each_node(nid) {
595 mz = memcg->nodeinfo[nid];
596 mctz = soft_limit_tree_node(nid);
597 if (mctz)
598 mem_cgroup_remove_exceeded(mz, mctz);
599 }
600}
601
602static struct mem_cgroup_per_node *
603__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
604{
605 struct mem_cgroup_per_node *mz;
606
607retry:
608 mz = NULL;
609 if (!mctz->rb_rightmost)
610 goto done;
611
612 mz = rb_entry(mctz->rb_rightmost,
613 struct mem_cgroup_per_node, tree_node);
614
615
616
617
618
619 __mem_cgroup_remove_exceeded(mz, mctz);
620 if (!soft_limit_excess(mz->memcg) ||
621 !css_tryget(&mz->memcg->css))
622 goto retry;
623done:
624 return mz;
625}
626
627static struct mem_cgroup_per_node *
628mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
629{
630 struct mem_cgroup_per_node *mz;
631
632 spin_lock_irq(&mctz->lock);
633 mz = __mem_cgroup_largest_soft_limit_node(mctz);
634 spin_unlock_irq(&mctz->lock);
635 return mz;
636}
637
638
639
640
641
642
643
644void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
645{
646 if (mem_cgroup_disabled())
647 return;
648
649 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
650 cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
651}
652
653
654static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
655{
656 long x = 0;
657 int cpu;
658
659 for_each_possible_cpu(cpu)
660 x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
661#ifdef CONFIG_SMP
662 if (x < 0)
663 x = 0;
664#endif
665 return x;
666}
667
668void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
669 int val)
670{
671 struct mem_cgroup_per_node *pn;
672 struct mem_cgroup *memcg;
673
674 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
675 memcg = pn->memcg;
676
677
678 __mod_memcg_state(memcg, idx, val);
679
680
681 __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
682}
683
684
685
686
687
688
689
690
691
692
693
694void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
695 int val)
696{
697
698 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
699
700
701 if (!mem_cgroup_disabled())
702 __mod_memcg_lruvec_state(lruvec, idx, val);
703}
704
705void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
706 int val)
707{
708 struct page *head = compound_head(page);
709 struct mem_cgroup *memcg;
710 pg_data_t *pgdat = page_pgdat(page);
711 struct lruvec *lruvec;
712
713 rcu_read_lock();
714 memcg = page_memcg(head);
715
716 if (!memcg) {
717 rcu_read_unlock();
718 __mod_node_page_state(pgdat, idx, val);
719 return;
720 }
721
722 lruvec = mem_cgroup_lruvec(memcg, pgdat);
723 __mod_lruvec_state(lruvec, idx, val);
724 rcu_read_unlock();
725}
726EXPORT_SYMBOL(__mod_lruvec_page_state);
727
728void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
729{
730 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
731 struct mem_cgroup *memcg;
732 struct lruvec *lruvec;
733
734 rcu_read_lock();
735 memcg = mem_cgroup_from_obj(p);
736
737
738
739
740
741
742
743 if (!memcg) {
744 __mod_node_page_state(pgdat, idx, val);
745 } else {
746 lruvec = mem_cgroup_lruvec(memcg, pgdat);
747 __mod_lruvec_state(lruvec, idx, val);
748 }
749 rcu_read_unlock();
750}
751
752
753
754
755
756static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
757 struct pglist_data *pgdat,
758 enum node_stat_item idx, int nr)
759{
760 struct mem_cgroup *memcg;
761 struct lruvec *lruvec;
762
763 rcu_read_lock();
764 memcg = obj_cgroup_memcg(objcg);
765 lruvec = mem_cgroup_lruvec(memcg, pgdat);
766 mod_memcg_lruvec_state(lruvec, idx, nr);
767 rcu_read_unlock();
768}
769
770
771
772
773
774
775
776void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
777 unsigned long count)
778{
779 if (mem_cgroup_disabled())
780 return;
781
782 __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
783 cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
784}
785
786static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
787{
788 return READ_ONCE(memcg->vmstats.events[event]);
789}
790
791static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
792{
793 long x = 0;
794 int cpu;
795
796 for_each_possible_cpu(cpu)
797 x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
798 return x;
799}
800
801static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
802 struct page *page,
803 int nr_pages)
804{
805
806 if (nr_pages > 0)
807 __count_memcg_events(memcg, PGPGIN, 1);
808 else {
809 __count_memcg_events(memcg, PGPGOUT, 1);
810 nr_pages = -nr_pages;
811 }
812
813 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
814}
815
816static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
817 enum mem_cgroup_events_target target)
818{
819 unsigned long val, next;
820
821 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
822 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
823
824 if ((long)(next - val) < 0) {
825 switch (target) {
826 case MEM_CGROUP_TARGET_THRESH:
827 next = val + THRESHOLDS_EVENTS_TARGET;
828 break;
829 case MEM_CGROUP_TARGET_SOFTLIMIT:
830 next = val + SOFTLIMIT_EVENTS_TARGET;
831 break;
832 default:
833 break;
834 }
835 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
836 return true;
837 }
838 return false;
839}
840
841
842
843
844
845static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
846{
847
848 if (unlikely(mem_cgroup_event_ratelimit(memcg,
849 MEM_CGROUP_TARGET_THRESH))) {
850 bool do_softlimit;
851
852 do_softlimit = mem_cgroup_event_ratelimit(memcg,
853 MEM_CGROUP_TARGET_SOFTLIMIT);
854 mem_cgroup_threshold(memcg);
855 if (unlikely(do_softlimit))
856 mem_cgroup_update_tree(memcg, page);
857 }
858}
859
860struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
861{
862
863
864
865
866
867 if (unlikely(!p))
868 return NULL;
869
870 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
871}
872EXPORT_SYMBOL(mem_cgroup_from_task);
873
874static __always_inline struct mem_cgroup *active_memcg(void)
875{
876 if (!in_task())
877 return this_cpu_read(int_active_memcg);
878 else
879 return current->active_memcg;
880}
881
882
883
884
885
886
887
888
889
890
891
892
893struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
894{
895 struct mem_cgroup *memcg;
896
897 if (mem_cgroup_disabled())
898 return NULL;
899
900
901
902
903
904
905
906
907
908
909 if (unlikely(!mm)) {
910 memcg = active_memcg();
911 if (unlikely(memcg)) {
912
913 css_get(&memcg->css);
914 return memcg;
915 }
916 mm = current->mm;
917 if (unlikely(!mm))
918 return root_mem_cgroup;
919 }
920
921 rcu_read_lock();
922 do {
923 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
924 if (unlikely(!memcg))
925 memcg = root_mem_cgroup;
926 } while (!css_tryget(&memcg->css));
927 rcu_read_unlock();
928 return memcg;
929}
930EXPORT_SYMBOL(get_mem_cgroup_from_mm);
931
932static __always_inline bool memcg_kmem_bypass(void)
933{
934
935 if (unlikely(active_memcg()))
936 return false;
937
938
939 if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
940 return true;
941
942 return false;
943}
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
963 struct mem_cgroup *prev,
964 struct mem_cgroup_reclaim_cookie *reclaim)
965{
966 struct mem_cgroup_reclaim_iter *iter;
967 struct cgroup_subsys_state *css = NULL;
968 struct mem_cgroup *memcg = NULL;
969 struct mem_cgroup *pos = NULL;
970
971 if (mem_cgroup_disabled())
972 return NULL;
973
974 if (!root)
975 root = root_mem_cgroup;
976
977 if (prev && !reclaim)
978 pos = prev;
979
980 rcu_read_lock();
981
982 if (reclaim) {
983 struct mem_cgroup_per_node *mz;
984
985 mz = root->nodeinfo[reclaim->pgdat->node_id];
986 iter = &mz->iter;
987
988 if (prev && reclaim->generation != iter->generation)
989 goto out_unlock;
990
991 while (1) {
992 pos = READ_ONCE(iter->position);
993 if (!pos || css_tryget(&pos->css))
994 break;
995
996
997
998
999
1000
1001
1002
1003 (void)cmpxchg(&iter->position, pos, NULL);
1004 }
1005 }
1006
1007 if (pos)
1008 css = &pos->css;
1009
1010 for (;;) {
1011 css = css_next_descendant_pre(css, &root->css);
1012 if (!css) {
1013
1014
1015
1016
1017
1018
1019 if (!prev)
1020 continue;
1021 break;
1022 }
1023
1024
1025
1026
1027
1028
1029 memcg = mem_cgroup_from_css(css);
1030
1031 if (css == &root->css)
1032 break;
1033
1034 if (css_tryget(css))
1035 break;
1036
1037 memcg = NULL;
1038 }
1039
1040 if (reclaim) {
1041
1042
1043
1044
1045
1046 (void)cmpxchg(&iter->position, pos, memcg);
1047
1048 if (pos)
1049 css_put(&pos->css);
1050
1051 if (!memcg)
1052 iter->generation++;
1053 else if (!prev)
1054 reclaim->generation = iter->generation;
1055 }
1056
1057out_unlock:
1058 rcu_read_unlock();
1059 if (prev && prev != root)
1060 css_put(&prev->css);
1061
1062 return memcg;
1063}
1064
1065
1066
1067
1068
1069
1070void mem_cgroup_iter_break(struct mem_cgroup *root,
1071 struct mem_cgroup *prev)
1072{
1073 if (!root)
1074 root = root_mem_cgroup;
1075 if (prev && prev != root)
1076 css_put(&prev->css);
1077}
1078
1079static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1080 struct mem_cgroup *dead_memcg)
1081{
1082 struct mem_cgroup_reclaim_iter *iter;
1083 struct mem_cgroup_per_node *mz;
1084 int nid;
1085
1086 for_each_node(nid) {
1087 mz = from->nodeinfo[nid];
1088 iter = &mz->iter;
1089 cmpxchg(&iter->position, dead_memcg, NULL);
1090 }
1091}
1092
1093static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1094{
1095 struct mem_cgroup *memcg = dead_memcg;
1096 struct mem_cgroup *last;
1097
1098 do {
1099 __invalidate_reclaim_iterators(memcg, dead_memcg);
1100 last = memcg;
1101 } while ((memcg = parent_mem_cgroup(memcg)));
1102
1103
1104
1105
1106
1107
1108
1109 if (last != root_mem_cgroup)
1110 __invalidate_reclaim_iterators(root_mem_cgroup,
1111 dead_memcg);
1112}
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1128 int (*fn)(struct task_struct *, void *), void *arg)
1129{
1130 struct mem_cgroup *iter;
1131 int ret = 0;
1132
1133 BUG_ON(memcg == root_mem_cgroup);
1134
1135 for_each_mem_cgroup_tree(iter, memcg) {
1136 struct css_task_iter it;
1137 struct task_struct *task;
1138
1139 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1140 while (!ret && (task = css_task_iter_next(&it)))
1141 ret = fn(task, arg);
1142 css_task_iter_end(&it);
1143 if (ret) {
1144 mem_cgroup_iter_break(memcg, iter);
1145 break;
1146 }
1147 }
1148 return ret;
1149}
1150
1151#ifdef CONFIG_DEBUG_VM
1152void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
1153{
1154 struct mem_cgroup *memcg;
1155
1156 if (mem_cgroup_disabled())
1157 return;
1158
1159 memcg = page_memcg(page);
1160
1161 if (!memcg)
1162 VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
1163 else
1164 VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
1165}
1166#endif
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178struct lruvec *lock_page_lruvec(struct page *page)
1179{
1180 struct lruvec *lruvec;
1181
1182 lruvec = mem_cgroup_page_lruvec(page);
1183 spin_lock(&lruvec->lru_lock);
1184
1185 lruvec_memcg_debug(lruvec, page);
1186
1187 return lruvec;
1188}
1189
1190struct lruvec *lock_page_lruvec_irq(struct page *page)
1191{
1192 struct lruvec *lruvec;
1193
1194 lruvec = mem_cgroup_page_lruvec(page);
1195 spin_lock_irq(&lruvec->lru_lock);
1196
1197 lruvec_memcg_debug(lruvec, page);
1198
1199 return lruvec;
1200}
1201
1202struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
1203{
1204 struct lruvec *lruvec;
1205
1206 lruvec = mem_cgroup_page_lruvec(page);
1207 spin_lock_irqsave(&lruvec->lru_lock, *flags);
1208
1209 lruvec_memcg_debug(lruvec, page);
1210
1211 return lruvec;
1212}
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1226 int zid, int nr_pages)
1227{
1228 struct mem_cgroup_per_node *mz;
1229 unsigned long *lru_size;
1230 long size;
1231
1232 if (mem_cgroup_disabled())
1233 return;
1234
1235 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1236 lru_size = &mz->lru_zone_size[zid][lru];
1237
1238 if (nr_pages < 0)
1239 *lru_size += nr_pages;
1240
1241 size = *lru_size;
1242 if (WARN_ONCE(size < 0,
1243 "%s(%p, %d, %d): lru_size %ld\n",
1244 __func__, lruvec, lru, nr_pages, size)) {
1245 VM_BUG_ON(1);
1246 *lru_size = 0;
1247 }
1248
1249 if (nr_pages > 0)
1250 *lru_size += nr_pages;
1251}
1252
1253
1254
1255
1256
1257
1258
1259
1260static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1261{
1262 unsigned long margin = 0;
1263 unsigned long count;
1264 unsigned long limit;
1265
1266 count = page_counter_read(&memcg->memory);
1267 limit = READ_ONCE(memcg->memory.max);
1268 if (count < limit)
1269 margin = limit - count;
1270
1271 if (do_memsw_account()) {
1272 count = page_counter_read(&memcg->memsw);
1273 limit = READ_ONCE(memcg->memsw.max);
1274 if (count < limit)
1275 margin = min(margin, limit - count);
1276 else
1277 margin = 0;
1278 }
1279
1280 return margin;
1281}
1282
1283
1284
1285
1286
1287
1288
1289
1290static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1291{
1292 struct mem_cgroup *from;
1293 struct mem_cgroup *to;
1294 bool ret = false;
1295
1296
1297
1298
1299 spin_lock(&mc.lock);
1300 from = mc.from;
1301 to = mc.to;
1302 if (!from)
1303 goto unlock;
1304
1305 ret = mem_cgroup_is_descendant(from, memcg) ||
1306 mem_cgroup_is_descendant(to, memcg);
1307unlock:
1308 spin_unlock(&mc.lock);
1309 return ret;
1310}
1311
1312static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1313{
1314 if (mc.moving_task && current != mc.moving_task) {
1315 if (mem_cgroup_under_move(memcg)) {
1316 DEFINE_WAIT(wait);
1317 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1318
1319 if (mc.moving_task)
1320 schedule();
1321 finish_wait(&mc.waitq, &wait);
1322 return true;
1323 }
1324 }
1325 return false;
1326}
1327
1328struct memory_stat {
1329 const char *name;
1330 unsigned int idx;
1331};
1332
1333static const struct memory_stat memory_stats[] = {
1334 { "anon", NR_ANON_MAPPED },
1335 { "file", NR_FILE_PAGES },
1336 { "kernel_stack", NR_KERNEL_STACK_KB },
1337 { "pagetables", NR_PAGETABLE },
1338 { "percpu", MEMCG_PERCPU_B },
1339 { "sock", MEMCG_SOCK },
1340 { "shmem", NR_SHMEM },
1341 { "file_mapped", NR_FILE_MAPPED },
1342 { "file_dirty", NR_FILE_DIRTY },
1343 { "file_writeback", NR_WRITEBACK },
1344#ifdef CONFIG_SWAP
1345 { "swapcached", NR_SWAPCACHE },
1346#endif
1347#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1348 { "anon_thp", NR_ANON_THPS },
1349 { "file_thp", NR_FILE_THPS },
1350 { "shmem_thp", NR_SHMEM_THPS },
1351#endif
1352 { "inactive_anon", NR_INACTIVE_ANON },
1353 { "active_anon", NR_ACTIVE_ANON },
1354 { "inactive_file", NR_INACTIVE_FILE },
1355 { "active_file", NR_ACTIVE_FILE },
1356 { "unevictable", NR_UNEVICTABLE },
1357 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
1358 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
1359
1360
1361 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
1362 { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
1363 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
1364 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
1365 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
1366 { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
1367 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
1368};
1369
1370
1371static int memcg_page_state_unit(int item)
1372{
1373 switch (item) {
1374 case MEMCG_PERCPU_B:
1375 case NR_SLAB_RECLAIMABLE_B:
1376 case NR_SLAB_UNRECLAIMABLE_B:
1377 case WORKINGSET_REFAULT_ANON:
1378 case WORKINGSET_REFAULT_FILE:
1379 case WORKINGSET_ACTIVATE_ANON:
1380 case WORKINGSET_ACTIVATE_FILE:
1381 case WORKINGSET_RESTORE_ANON:
1382 case WORKINGSET_RESTORE_FILE:
1383 case WORKINGSET_NODERECLAIM:
1384 return 1;
1385 case NR_KERNEL_STACK_KB:
1386 return SZ_1K;
1387 default:
1388 return PAGE_SIZE;
1389 }
1390}
1391
1392static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
1393 int item)
1394{
1395 return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
1396}
1397
1398static char *memory_stat_format(struct mem_cgroup *memcg)
1399{
1400 struct seq_buf s;
1401 int i;
1402
1403 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1404 if (!s.buffer)
1405 return NULL;
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417 cgroup_rstat_flush(memcg->css.cgroup);
1418
1419 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1420 u64 size;
1421
1422 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1423 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1424
1425 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1426 size += memcg_page_state_output(memcg,
1427 NR_SLAB_RECLAIMABLE_B);
1428 seq_buf_printf(&s, "slab %llu\n", size);
1429 }
1430 }
1431
1432
1433
1434 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1435 memcg_events(memcg, PGFAULT));
1436 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1437 memcg_events(memcg, PGMAJFAULT));
1438 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1439 memcg_events(memcg, PGREFILL));
1440 seq_buf_printf(&s, "pgscan %lu\n",
1441 memcg_events(memcg, PGSCAN_KSWAPD) +
1442 memcg_events(memcg, PGSCAN_DIRECT));
1443 seq_buf_printf(&s, "pgsteal %lu\n",
1444 memcg_events(memcg, PGSTEAL_KSWAPD) +
1445 memcg_events(memcg, PGSTEAL_DIRECT));
1446 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1447 memcg_events(memcg, PGACTIVATE));
1448 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1449 memcg_events(memcg, PGDEACTIVATE));
1450 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1451 memcg_events(memcg, PGLAZYFREE));
1452 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1453 memcg_events(memcg, PGLAZYFREED));
1454
1455#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1456 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1457 memcg_events(memcg, THP_FAULT_ALLOC));
1458 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1459 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1460#endif
1461
1462
1463 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1464
1465 return s.buffer;
1466}
1467
1468#define K(x) ((x) << (PAGE_SHIFT-10))
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1479{
1480 rcu_read_lock();
1481
1482 if (memcg) {
1483 pr_cont(",oom_memcg=");
1484 pr_cont_cgroup_path(memcg->css.cgroup);
1485 } else
1486 pr_cont(",global_oom");
1487 if (p) {
1488 pr_cont(",task_memcg=");
1489 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1490 }
1491 rcu_read_unlock();
1492}
1493
1494
1495
1496
1497
1498
1499void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1500{
1501 char *buf;
1502
1503 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1504 K((u64)page_counter_read(&memcg->memory)),
1505 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1506 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1507 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1508 K((u64)page_counter_read(&memcg->swap)),
1509 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1510 else {
1511 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1512 K((u64)page_counter_read(&memcg->memsw)),
1513 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1514 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1515 K((u64)page_counter_read(&memcg->kmem)),
1516 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1517 }
1518
1519 pr_info("Memory cgroup stats for ");
1520 pr_cont_cgroup_path(memcg->css.cgroup);
1521 pr_cont(":");
1522 buf = memory_stat_format(memcg);
1523 if (!buf)
1524 return;
1525 pr_info("%s", buf);
1526 kfree(buf);
1527}
1528
1529
1530
1531
1532unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1533{
1534 unsigned long max = READ_ONCE(memcg->memory.max);
1535
1536 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1537 if (mem_cgroup_swappiness(memcg))
1538 max += min(READ_ONCE(memcg->swap.max),
1539 (unsigned long)total_swap_pages);
1540 } else {
1541 if (mem_cgroup_swappiness(memcg)) {
1542
1543 unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1544
1545 max += min(swap, (unsigned long)total_swap_pages);
1546 }
1547 }
1548 return max;
1549}
1550
1551unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1552{
1553 return page_counter_read(&memcg->memory);
1554}
1555
1556static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1557 int order)
1558{
1559 struct oom_control oc = {
1560 .zonelist = NULL,
1561 .nodemask = NULL,
1562 .memcg = memcg,
1563 .gfp_mask = gfp_mask,
1564 .order = order,
1565 };
1566 bool ret = true;
1567
1568 if (mutex_lock_killable(&oom_lock))
1569 return true;
1570
1571 if (mem_cgroup_margin(memcg) >= (1 << order))
1572 goto unlock;
1573
1574
1575
1576
1577
1578 ret = should_force_charge() || out_of_memory(&oc);
1579
1580unlock:
1581 mutex_unlock(&oom_lock);
1582 return ret;
1583}
1584
1585static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1586 pg_data_t *pgdat,
1587 gfp_t gfp_mask,
1588 unsigned long *total_scanned)
1589{
1590 struct mem_cgroup *victim = NULL;
1591 int total = 0;
1592 int loop = 0;
1593 unsigned long excess;
1594 unsigned long nr_scanned;
1595 struct mem_cgroup_reclaim_cookie reclaim = {
1596 .pgdat = pgdat,
1597 };
1598
1599 excess = soft_limit_excess(root_memcg);
1600
1601 while (1) {
1602 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1603 if (!victim) {
1604 loop++;
1605 if (loop >= 2) {
1606
1607
1608
1609
1610
1611 if (!total)
1612 break;
1613
1614
1615
1616
1617
1618
1619 if (total >= (excess >> 2) ||
1620 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1621 break;
1622 }
1623 continue;
1624 }
1625 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1626 pgdat, &nr_scanned);
1627 *total_scanned += nr_scanned;
1628 if (!soft_limit_excess(root_memcg))
1629 break;
1630 }
1631 mem_cgroup_iter_break(root_memcg, victim);
1632 return total;
1633}
1634
1635#ifdef CONFIG_LOCKDEP
1636static struct lockdep_map memcg_oom_lock_dep_map = {
1637 .name = "memcg_oom_lock",
1638};
1639#endif
1640
1641static DEFINE_SPINLOCK(memcg_oom_lock);
1642
1643
1644
1645
1646
1647static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1648{
1649 struct mem_cgroup *iter, *failed = NULL;
1650
1651 spin_lock(&memcg_oom_lock);
1652
1653 for_each_mem_cgroup_tree(iter, memcg) {
1654 if (iter->oom_lock) {
1655
1656
1657
1658
1659 failed = iter;
1660 mem_cgroup_iter_break(memcg, iter);
1661 break;
1662 } else
1663 iter->oom_lock = true;
1664 }
1665
1666 if (failed) {
1667
1668
1669
1670
1671 for_each_mem_cgroup_tree(iter, memcg) {
1672 if (iter == failed) {
1673 mem_cgroup_iter_break(memcg, iter);
1674 break;
1675 }
1676 iter->oom_lock = false;
1677 }
1678 } else
1679 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1680
1681 spin_unlock(&memcg_oom_lock);
1682
1683 return !failed;
1684}
1685
1686static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1687{
1688 struct mem_cgroup *iter;
1689
1690 spin_lock(&memcg_oom_lock);
1691 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1692 for_each_mem_cgroup_tree(iter, memcg)
1693 iter->oom_lock = false;
1694 spin_unlock(&memcg_oom_lock);
1695}
1696
1697static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1698{
1699 struct mem_cgroup *iter;
1700
1701 spin_lock(&memcg_oom_lock);
1702 for_each_mem_cgroup_tree(iter, memcg)
1703 iter->under_oom++;
1704 spin_unlock(&memcg_oom_lock);
1705}
1706
1707static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1708{
1709 struct mem_cgroup *iter;
1710
1711
1712
1713
1714
1715 spin_lock(&memcg_oom_lock);
1716 for_each_mem_cgroup_tree(iter, memcg)
1717 if (iter->under_oom > 0)
1718 iter->under_oom--;
1719 spin_unlock(&memcg_oom_lock);
1720}
1721
1722static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1723
1724struct oom_wait_info {
1725 struct mem_cgroup *memcg;
1726 wait_queue_entry_t wait;
1727};
1728
1729static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1730 unsigned mode, int sync, void *arg)
1731{
1732 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1733 struct mem_cgroup *oom_wait_memcg;
1734 struct oom_wait_info *oom_wait_info;
1735
1736 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1737 oom_wait_memcg = oom_wait_info->memcg;
1738
1739 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1740 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1741 return 0;
1742 return autoremove_wake_function(wait, mode, sync, arg);
1743}
1744
1745static void memcg_oom_recover(struct mem_cgroup *memcg)
1746{
1747
1748
1749
1750
1751
1752
1753
1754
1755 if (memcg && memcg->under_oom)
1756 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1757}
1758
1759enum oom_status {
1760 OOM_SUCCESS,
1761 OOM_FAILED,
1762 OOM_ASYNC,
1763 OOM_SKIPPED
1764};
1765
1766static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1767{
1768 enum oom_status ret;
1769 bool locked;
1770
1771 if (order > PAGE_ALLOC_COSTLY_ORDER)
1772 return OOM_SKIPPED;
1773
1774 memcg_memory_event(memcg, MEMCG_OOM);
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794 if (memcg->oom_kill_disable) {
1795 if (!current->in_user_fault)
1796 return OOM_SKIPPED;
1797 css_get(&memcg->css);
1798 current->memcg_in_oom = memcg;
1799 current->memcg_oom_gfp_mask = mask;
1800 current->memcg_oom_order = order;
1801
1802 return OOM_ASYNC;
1803 }
1804
1805 mem_cgroup_mark_under_oom(memcg);
1806
1807 locked = mem_cgroup_oom_trylock(memcg);
1808
1809 if (locked)
1810 mem_cgroup_oom_notify(memcg);
1811
1812 mem_cgroup_unmark_under_oom(memcg);
1813 if (mem_cgroup_out_of_memory(memcg, mask, order))
1814 ret = OOM_SUCCESS;
1815 else
1816 ret = OOM_FAILED;
1817
1818 if (locked)
1819 mem_cgroup_oom_unlock(memcg);
1820
1821 return ret;
1822}
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841bool mem_cgroup_oom_synchronize(bool handle)
1842{
1843 struct mem_cgroup *memcg = current->memcg_in_oom;
1844 struct oom_wait_info owait;
1845 bool locked;
1846
1847
1848 if (!memcg)
1849 return false;
1850
1851 if (!handle)
1852 goto cleanup;
1853
1854 owait.memcg = memcg;
1855 owait.wait.flags = 0;
1856 owait.wait.func = memcg_oom_wake_function;
1857 owait.wait.private = current;
1858 INIT_LIST_HEAD(&owait.wait.entry);
1859
1860 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1861 mem_cgroup_mark_under_oom(memcg);
1862
1863 locked = mem_cgroup_oom_trylock(memcg);
1864
1865 if (locked)
1866 mem_cgroup_oom_notify(memcg);
1867
1868 if (locked && !memcg->oom_kill_disable) {
1869 mem_cgroup_unmark_under_oom(memcg);
1870 finish_wait(&memcg_oom_waitq, &owait.wait);
1871 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1872 current->memcg_oom_order);
1873 } else {
1874 schedule();
1875 mem_cgroup_unmark_under_oom(memcg);
1876 finish_wait(&memcg_oom_waitq, &owait.wait);
1877 }
1878
1879 if (locked) {
1880 mem_cgroup_oom_unlock(memcg);
1881
1882
1883
1884
1885
1886 memcg_oom_recover(memcg);
1887 }
1888cleanup:
1889 current->memcg_in_oom = NULL;
1890 css_put(&memcg->css);
1891 return true;
1892}
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1905 struct mem_cgroup *oom_domain)
1906{
1907 struct mem_cgroup *oom_group = NULL;
1908 struct mem_cgroup *memcg;
1909
1910 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1911 return NULL;
1912
1913 if (!oom_domain)
1914 oom_domain = root_mem_cgroup;
1915
1916 rcu_read_lock();
1917
1918 memcg = mem_cgroup_from_task(victim);
1919 if (memcg == root_mem_cgroup)
1920 goto out;
1921
1922
1923
1924
1925
1926
1927 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1928 goto out;
1929
1930
1931
1932
1933
1934
1935 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1936 if (memcg->oom_group)
1937 oom_group = memcg;
1938
1939 if (memcg == oom_domain)
1940 break;
1941 }
1942
1943 if (oom_group)
1944 css_get(&oom_group->css);
1945out:
1946 rcu_read_unlock();
1947
1948 return oom_group;
1949}
1950
1951void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1952{
1953 pr_info("Tasks in ");
1954 pr_cont_cgroup_path(memcg->css.cgroup);
1955 pr_cont(" are going to be killed due to memory.oom.group set\n");
1956}
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968void lock_page_memcg(struct page *page)
1969{
1970 struct page *head = compound_head(page);
1971 struct mem_cgroup *memcg;
1972 unsigned long flags;
1973
1974
1975
1976
1977
1978
1979 rcu_read_lock();
1980
1981 if (mem_cgroup_disabled())
1982 return;
1983again:
1984 memcg = page_memcg(head);
1985 if (unlikely(!memcg))
1986 return;
1987
1988#ifdef CONFIG_PROVE_LOCKING
1989 local_irq_save(flags);
1990 might_lock(&memcg->move_lock);
1991 local_irq_restore(flags);
1992#endif
1993
1994 if (atomic_read(&memcg->moving_account) <= 0)
1995 return;
1996
1997 spin_lock_irqsave(&memcg->move_lock, flags);
1998 if (memcg != page_memcg(head)) {
1999 spin_unlock_irqrestore(&memcg->move_lock, flags);
2000 goto again;
2001 }
2002
2003
2004
2005
2006
2007
2008
2009 memcg->move_lock_task = current;
2010 memcg->move_lock_flags = flags;
2011}
2012EXPORT_SYMBOL(lock_page_memcg);
2013
2014static void __unlock_page_memcg(struct mem_cgroup *memcg)
2015{
2016 if (memcg && memcg->move_lock_task == current) {
2017 unsigned long flags = memcg->move_lock_flags;
2018
2019 memcg->move_lock_task = NULL;
2020 memcg->move_lock_flags = 0;
2021
2022 spin_unlock_irqrestore(&memcg->move_lock, flags);
2023 }
2024
2025 rcu_read_unlock();
2026}
2027
2028
2029
2030
2031
2032void unlock_page_memcg(struct page *page)
2033{
2034 struct page *head = compound_head(page);
2035
2036 __unlock_page_memcg(page_memcg(head));
2037}
2038EXPORT_SYMBOL(unlock_page_memcg);
2039
2040struct obj_stock {
2041#ifdef CONFIG_MEMCG_KMEM
2042 struct obj_cgroup *cached_objcg;
2043 struct pglist_data *cached_pgdat;
2044 unsigned int nr_bytes;
2045 int nr_slab_reclaimable_b;
2046 int nr_slab_unreclaimable_b;
2047#else
2048 int dummy[0];
2049#endif
2050};
2051
2052struct memcg_stock_pcp {
2053 struct mem_cgroup *cached;
2054 unsigned int nr_pages;
2055 struct obj_stock task_obj;
2056 struct obj_stock irq_obj;
2057
2058 struct work_struct work;
2059 unsigned long flags;
2060#define FLUSHING_CACHED_CHARGE 0
2061};
2062static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2063static DEFINE_MUTEX(percpu_charge_mutex);
2064
2065#ifdef CONFIG_MEMCG_KMEM
2066static void drain_obj_stock(struct obj_stock *stock);
2067static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2068 struct mem_cgroup *root_memcg);
2069
2070#else
2071static inline void drain_obj_stock(struct obj_stock *stock)
2072{
2073}
2074static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2075 struct mem_cgroup *root_memcg)
2076{
2077 return false;
2078}
2079#endif
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
2093{
2094 struct memcg_stock_pcp *stock;
2095
2096 if (likely(in_task())) {
2097 *pflags = 0UL;
2098 preempt_disable();
2099 stock = this_cpu_ptr(&memcg_stock);
2100 return &stock->task_obj;
2101 }
2102
2103 local_irq_save(*pflags);
2104 stock = this_cpu_ptr(&memcg_stock);
2105 return &stock->irq_obj;
2106}
2107
2108static inline void put_obj_stock(unsigned long flags)
2109{
2110 if (likely(in_task()))
2111 preempt_enable();
2112 else
2113 local_irq_restore(flags);
2114}
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2128{
2129 struct memcg_stock_pcp *stock;
2130 unsigned long flags;
2131 bool ret = false;
2132
2133 if (nr_pages > MEMCG_CHARGE_BATCH)
2134 return ret;
2135
2136 local_irq_save(flags);
2137
2138 stock = this_cpu_ptr(&memcg_stock);
2139 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2140 stock->nr_pages -= nr_pages;
2141 ret = true;
2142 }
2143
2144 local_irq_restore(flags);
2145
2146 return ret;
2147}
2148
2149
2150
2151
2152static void drain_stock(struct memcg_stock_pcp *stock)
2153{
2154 struct mem_cgroup *old = stock->cached;
2155
2156 if (!old)
2157 return;
2158
2159 if (stock->nr_pages) {
2160 page_counter_uncharge(&old->memory, stock->nr_pages);
2161 if (do_memsw_account())
2162 page_counter_uncharge(&old->memsw, stock->nr_pages);
2163 stock->nr_pages = 0;
2164 }
2165
2166 css_put(&old->css);
2167 stock->cached = NULL;
2168}
2169
2170static void drain_local_stock(struct work_struct *dummy)
2171{
2172 struct memcg_stock_pcp *stock;
2173 unsigned long flags;
2174
2175
2176
2177
2178
2179
2180 local_irq_save(flags);
2181
2182 stock = this_cpu_ptr(&memcg_stock);
2183 drain_obj_stock(&stock->irq_obj);
2184 if (in_task())
2185 drain_obj_stock(&stock->task_obj);
2186 drain_stock(stock);
2187 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2188
2189 local_irq_restore(flags);
2190}
2191
2192
2193
2194
2195
2196static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2197{
2198 struct memcg_stock_pcp *stock;
2199 unsigned long flags;
2200
2201 local_irq_save(flags);
2202
2203 stock = this_cpu_ptr(&memcg_stock);
2204 if (stock->cached != memcg) {
2205 drain_stock(stock);
2206 css_get(&memcg->css);
2207 stock->cached = memcg;
2208 }
2209 stock->nr_pages += nr_pages;
2210
2211 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2212 drain_stock(stock);
2213
2214 local_irq_restore(flags);
2215}
2216
2217
2218
2219
2220
2221static void drain_all_stock(struct mem_cgroup *root_memcg)
2222{
2223 int cpu, curcpu;
2224
2225
2226 if (!mutex_trylock(&percpu_charge_mutex))
2227 return;
2228
2229
2230
2231
2232
2233
2234 curcpu = get_cpu();
2235 for_each_online_cpu(cpu) {
2236 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2237 struct mem_cgroup *memcg;
2238 bool flush = false;
2239
2240 rcu_read_lock();
2241 memcg = stock->cached;
2242 if (memcg && stock->nr_pages &&
2243 mem_cgroup_is_descendant(memcg, root_memcg))
2244 flush = true;
2245 else if (obj_stock_flush_required(stock, root_memcg))
2246 flush = true;
2247 rcu_read_unlock();
2248
2249 if (flush &&
2250 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2251 if (cpu == curcpu)
2252 drain_local_stock(&stock->work);
2253 else
2254 schedule_work_on(cpu, &stock->work);
2255 }
2256 }
2257 put_cpu();
2258 mutex_unlock(&percpu_charge_mutex);
2259}
2260
2261static int memcg_hotplug_cpu_dead(unsigned int cpu)
2262{
2263 struct memcg_stock_pcp *stock;
2264
2265 stock = &per_cpu(memcg_stock, cpu);
2266 drain_stock(stock);
2267
2268 return 0;
2269}
2270
2271static unsigned long reclaim_high(struct mem_cgroup *memcg,
2272 unsigned int nr_pages,
2273 gfp_t gfp_mask)
2274{
2275 unsigned long nr_reclaimed = 0;
2276
2277 do {
2278 unsigned long pflags;
2279
2280 if (page_counter_read(&memcg->memory) <=
2281 READ_ONCE(memcg->memory.high))
2282 continue;
2283
2284 memcg_memory_event(memcg, MEMCG_HIGH);
2285
2286 psi_memstall_enter(&pflags);
2287 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2288 gfp_mask, true);
2289 psi_memstall_leave(&pflags);
2290 } while ((memcg = parent_mem_cgroup(memcg)) &&
2291 !mem_cgroup_is_root(memcg));
2292
2293 return nr_reclaimed;
2294}
2295
2296static void high_work_func(struct work_struct *work)
2297{
2298 struct mem_cgroup *memcg;
2299
2300 memcg = container_of(work, struct mem_cgroup, high_work);
2301 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2302}
2303
2304
2305
2306
2307
2308
2309#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354 #define MEMCG_DELAY_PRECISION_SHIFT 20
2355 #define MEMCG_DELAY_SCALING_SHIFT 14
2356
2357static u64 calculate_overage(unsigned long usage, unsigned long high)
2358{
2359 u64 overage;
2360
2361 if (usage <= high)
2362 return 0;
2363
2364
2365
2366
2367
2368 high = max(high, 1UL);
2369
2370 overage = usage - high;
2371 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2372 return div64_u64(overage, high);
2373}
2374
2375static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2376{
2377 u64 overage, max_overage = 0;
2378
2379 do {
2380 overage = calculate_overage(page_counter_read(&memcg->memory),
2381 READ_ONCE(memcg->memory.high));
2382 max_overage = max(overage, max_overage);
2383 } while ((memcg = parent_mem_cgroup(memcg)) &&
2384 !mem_cgroup_is_root(memcg));
2385
2386 return max_overage;
2387}
2388
2389static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2390{
2391 u64 overage, max_overage = 0;
2392
2393 do {
2394 overage = calculate_overage(page_counter_read(&memcg->swap),
2395 READ_ONCE(memcg->swap.high));
2396 if (overage)
2397 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2398 max_overage = max(overage, max_overage);
2399 } while ((memcg = parent_mem_cgroup(memcg)) &&
2400 !mem_cgroup_is_root(memcg));
2401
2402 return max_overage;
2403}
2404
2405
2406
2407
2408
2409static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2410 unsigned int nr_pages,
2411 u64 max_overage)
2412{
2413 unsigned long penalty_jiffies;
2414
2415 if (!max_overage)
2416 return 0;
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426 penalty_jiffies = max_overage * max_overage * HZ;
2427 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2428 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2439}
2440
2441
2442
2443
2444
2445void mem_cgroup_handle_over_high(void)
2446{
2447 unsigned long penalty_jiffies;
2448 unsigned long pflags;
2449 unsigned long nr_reclaimed;
2450 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2451 int nr_retries = MAX_RECLAIM_RETRIES;
2452 struct mem_cgroup *memcg;
2453 bool in_retry = false;
2454
2455 if (likely(!nr_pages))
2456 return;
2457
2458 memcg = get_mem_cgroup_from_mm(current->mm);
2459 current->memcg_nr_pages_over_high = 0;
2460
2461retry_reclaim:
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471 nr_reclaimed = reclaim_high(memcg,
2472 in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2473 GFP_KERNEL);
2474
2475
2476
2477
2478
2479 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2480 mem_find_max_overage(memcg));
2481
2482 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2483 swap_find_max_overage(memcg));
2484
2485
2486
2487
2488
2489
2490 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2491
2492
2493
2494
2495
2496
2497
2498 if (penalty_jiffies <= HZ / 100)
2499 goto out;
2500
2501
2502
2503
2504
2505
2506 if (nr_reclaimed || nr_retries--) {
2507 in_retry = true;
2508 goto retry_reclaim;
2509 }
2510
2511
2512
2513
2514
2515
2516 psi_memstall_enter(&pflags);
2517 schedule_timeout_killable(penalty_jiffies);
2518 psi_memstall_leave(&pflags);
2519
2520out:
2521 css_put(&memcg->css);
2522}
2523
2524static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2525 unsigned int nr_pages)
2526{
2527 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2528 int nr_retries = MAX_RECLAIM_RETRIES;
2529 struct mem_cgroup *mem_over_limit;
2530 struct page_counter *counter;
2531 enum oom_status oom_status;
2532 unsigned long nr_reclaimed;
2533 bool may_swap = true;
2534 bool drained = false;
2535 unsigned long pflags;
2536
2537retry:
2538 if (consume_stock(memcg, nr_pages))
2539 return 0;
2540
2541 if (!do_memsw_account() ||
2542 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2543 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2544 goto done_restock;
2545 if (do_memsw_account())
2546 page_counter_uncharge(&memcg->memsw, batch);
2547 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2548 } else {
2549 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2550 may_swap = false;
2551 }
2552
2553 if (batch > nr_pages) {
2554 batch = nr_pages;
2555 goto retry;
2556 }
2557
2558
2559
2560
2561
2562
2563
2564 if (gfp_mask & __GFP_ATOMIC)
2565 goto force;
2566
2567
2568
2569
2570
2571
2572
2573 if (unlikely(should_force_charge()))
2574 goto force;
2575
2576
2577
2578
2579
2580
2581
2582 if (unlikely(current->flags & PF_MEMALLOC))
2583 goto force;
2584
2585 if (unlikely(task_in_memcg_oom(current)))
2586 goto nomem;
2587
2588 if (!gfpflags_allow_blocking(gfp_mask))
2589 goto nomem;
2590
2591 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2592
2593 psi_memstall_enter(&pflags);
2594 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2595 gfp_mask, may_swap);
2596 psi_memstall_leave(&pflags);
2597
2598 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2599 goto retry;
2600
2601 if (!drained) {
2602 drain_all_stock(mem_over_limit);
2603 drained = true;
2604 goto retry;
2605 }
2606
2607 if (gfp_mask & __GFP_NORETRY)
2608 goto nomem;
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2619 goto retry;
2620
2621
2622
2623
2624 if (mem_cgroup_wait_acct_move(mem_over_limit))
2625 goto retry;
2626
2627 if (nr_retries--)
2628 goto retry;
2629
2630 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2631 goto nomem;
2632
2633 if (fatal_signal_pending(current))
2634 goto force;
2635
2636
2637
2638
2639
2640
2641 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2642 get_order(nr_pages * PAGE_SIZE));
2643 switch (oom_status) {
2644 case OOM_SUCCESS:
2645 nr_retries = MAX_RECLAIM_RETRIES;
2646 goto retry;
2647 case OOM_FAILED:
2648 goto force;
2649 default:
2650 goto nomem;
2651 }
2652nomem:
2653 if (!(gfp_mask & __GFP_NOFAIL))
2654 return -ENOMEM;
2655force:
2656
2657
2658
2659
2660
2661 page_counter_charge(&memcg->memory, nr_pages);
2662 if (do_memsw_account())
2663 page_counter_charge(&memcg->memsw, nr_pages);
2664
2665 return 0;
2666
2667done_restock:
2668 if (batch > nr_pages)
2669 refill_stock(memcg, batch - nr_pages);
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680 do {
2681 bool mem_high, swap_high;
2682
2683 mem_high = page_counter_read(&memcg->memory) >
2684 READ_ONCE(memcg->memory.high);
2685 swap_high = page_counter_read(&memcg->swap) >
2686 READ_ONCE(memcg->swap.high);
2687
2688
2689 if (in_interrupt()) {
2690 if (mem_high) {
2691 schedule_work(&memcg->high_work);
2692 break;
2693 }
2694 continue;
2695 }
2696
2697 if (mem_high || swap_high) {
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707 current->memcg_nr_pages_over_high += batch;
2708 set_notify_resume(current);
2709 break;
2710 }
2711 } while ((memcg = parent_mem_cgroup(memcg)));
2712
2713 return 0;
2714}
2715
2716static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2717 unsigned int nr_pages)
2718{
2719 if (mem_cgroup_is_root(memcg))
2720 return 0;
2721
2722 return try_charge_memcg(memcg, gfp_mask, nr_pages);
2723}
2724
2725#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2726static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2727{
2728 if (mem_cgroup_is_root(memcg))
2729 return;
2730
2731 page_counter_uncharge(&memcg->memory, nr_pages);
2732 if (do_memsw_account())
2733 page_counter_uncharge(&memcg->memsw, nr_pages);
2734}
2735#endif
2736
2737static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2738{
2739 VM_BUG_ON_PAGE(page_memcg(page), page);
2740
2741
2742
2743
2744
2745
2746
2747
2748 page->memcg_data = (unsigned long)memcg;
2749}
2750
2751static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
2752{
2753 struct mem_cgroup *memcg;
2754
2755 rcu_read_lock();
2756retry:
2757 memcg = obj_cgroup_memcg(objcg);
2758 if (unlikely(!css_tryget(&memcg->css)))
2759 goto retry;
2760 rcu_read_unlock();
2761
2762 return memcg;
2763}
2764
2765#ifdef CONFIG_MEMCG_KMEM
2766
2767
2768
2769
2770
2771#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2772
2773int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2774 gfp_t gfp, bool new_page)
2775{
2776 unsigned int objects = objs_per_slab_page(s, page);
2777 unsigned long memcg_data;
2778 void *vec;
2779
2780 gfp &= ~OBJCGS_CLEAR_MASK;
2781 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2782 page_to_nid(page));
2783 if (!vec)
2784 return -ENOMEM;
2785
2786 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2787 if (new_page) {
2788
2789
2790
2791
2792
2793 page->memcg_data = memcg_data;
2794 } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
2795
2796
2797
2798
2799
2800 kfree(vec);
2801 return 0;
2802 }
2803
2804 kmemleak_not_leak(vec);
2805 return 0;
2806}
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820struct mem_cgroup *mem_cgroup_from_obj(void *p)
2821{
2822 struct page *page;
2823
2824 if (mem_cgroup_disabled())
2825 return NULL;
2826
2827 page = virt_to_head_page(p);
2828
2829
2830
2831
2832
2833
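	/*
	 * Slab objects are accounted individually, not per-page.  Memcg
	 * membership data for each individual object is saved in the
	 * page's objcgs vector.
	 */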
2834 if (page_objcgs_check(page)) {
2835 struct obj_cgroup *objcg;
2836 unsigned int off;
2837
2838 off = obj_to_index(page->slab_cache, page, p);
2839 objcg = page_objcgs(page)[off];
2840 if (objcg)
2841 return obj_cgroup_memcg(objcg);
2842
2843 return NULL;
2844 }
2845
2846
2847
2848
2849
2850
2851
2852
2853 return page_memcg_check(page);
2854}
2855
2856__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2857{
2858 struct obj_cgroup *objcg = NULL;
2859 struct mem_cgroup *memcg;
2860
2861 if (memcg_kmem_bypass())
2862 return NULL;
2863
2864 rcu_read_lock();
2865 if (unlikely(active_memcg()))
2866 memcg = active_memcg();
2867 else
2868 memcg = mem_cgroup_from_task(current);
2869
2870 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
2871 objcg = rcu_dereference(memcg->objcg);
2872 if (objcg && obj_cgroup_tryget(objcg))
2873 break;
2874 objcg = NULL;
2875 }
2876 rcu_read_unlock();
2877
2878 return objcg;
2879}
2880
2881static int memcg_alloc_cache_id(void)
2882{
2883 int id, size;
2884 int err;
2885
2886 id = ida_simple_get(&memcg_cache_ida,
2887 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2888 if (id < 0)
2889 return id;
2890
2891 if (id < memcg_nr_cache_ids)
2892 return id;
2893
2894
2895
2896
2897
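	/*
	 * The new id is above the current capacity: grow the per-memcg
	 * list_lru arrays under memcg_cache_ids_sem before using it.
	 */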
2898 down_write(&memcg_cache_ids_sem);
2899
2900 size = 2 * (id + 1);
2901 if (size < MEMCG_CACHES_MIN_SIZE)
2902 size = MEMCG_CACHES_MIN_SIZE;
2903 else if (size > MEMCG_CACHES_MAX_SIZE)
2904 size = MEMCG_CACHES_MAX_SIZE;
2905
2906 err = memcg_update_all_list_lrus(size);
2907 if (!err)
2908 memcg_nr_cache_ids = size;
2909
2910 up_write(&memcg_cache_ids_sem);
2911
2912 if (err) {
2913 ida_simple_remove(&memcg_cache_ida, id);
2914 return err;
2915 }
2916 return id;
2917}
2918
2919static void memcg_free_cache_id(int id)
2920{
2921 ida_simple_remove(&memcg_cache_ida, id);
2922}
2923
2924
2925
2926
2927
2928
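/*
 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
 * @objcg: object cgroup to uncharge
 * @nr_pages: number of pages to uncharge
 */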
2929static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2930 unsigned int nr_pages)
2931{
2932 struct mem_cgroup *memcg;
2933
2934 memcg = get_mem_cgroup_from_objcg(objcg);
2935
2936 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2937 page_counter_uncharge(&memcg->kmem, nr_pages);
2938 refill_stock(memcg, nr_pages);
2939
2940 css_put(&memcg->css);
2941}
2942
2943
2944
2945
2946
2947
2948
2949
2950
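/*
 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
 * @objcg: object cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */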
2951static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2952 unsigned int nr_pages)
2953{
2954 struct page_counter *counter;
2955 struct mem_cgroup *memcg;
2956 int ret;
2957
2958 memcg = get_mem_cgroup_from_objcg(objcg);
2959
2960 ret = try_charge_memcg(memcg, gfp, nr_pages);
2961 if (ret)
2962 goto out;
2963
2964 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2965 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2966
2967
2968
2969
2970
2971
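		/*
		 * Enforce __GFP_NOFAIL allocation because callers are not
		 * prepared to see failures and likely do not have any failure
		 * handling code.
		 */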
2972 if (gfp & __GFP_NOFAIL) {
2973 page_counter_charge(&memcg->kmem, nr_pages);
2974 goto out;
2975 }
2976 cancel_charge(memcg, nr_pages);
2977 ret = -ENOMEM;
2978 }
2979out:
2980 css_put(&memcg->css);
2981
2982 return ret;
2983}
2984
2985
2986
2987
2988
2989
2990
2991
2992
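/**
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */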
2993int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2994{
2995 struct obj_cgroup *objcg;
2996 int ret = 0;
2997
2998 objcg = get_obj_cgroup_from_current();
2999 if (objcg) {
3000 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
3001 if (!ret) {
3002 page->memcg_data = (unsigned long)objcg |
3003 MEMCG_DATA_KMEM;
3004 return 0;
3005 }
3006 obj_cgroup_put(objcg);
3007 }
3008 return ret;
3009}
3010
3011
3012
3013
3014
3015
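/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */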
3016void __memcg_kmem_uncharge_page(struct page *page, int order)
3017{
3018 struct obj_cgroup *objcg;
3019 unsigned int nr_pages = 1 << order;
3020
3021 if (!PageMemcgKmem(page))
3022 return;
3023
3024 objcg = __page_objcg(page);
3025 obj_cgroup_uncharge_pages(objcg, nr_pages);
3026 page->memcg_data = 0;
3027 obj_cgroup_put(objcg);
3028}
3029
3030void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
3031 enum node_stat_item idx, int nr)
3032{
3033 unsigned long flags;
3034 struct obj_stock *stock = get_obj_stock(&flags);
3035 int *bytes;
3036
3037
3038
3039
3040
3041
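	/*
	 * Save vmstat data in stock and skip vmstat array update unless
	 * accumulating over a page of vmstat data or when pgdat or idx
	 * changes.
	 */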
3042 if (stock->cached_objcg != objcg) {
3043 drain_obj_stock(stock);
3044 obj_cgroup_get(objcg);
3045 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3046 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3047 stock->cached_objcg = objcg;
3048 stock->cached_pgdat = pgdat;
3049 } else if (stock->cached_pgdat != pgdat) {
3050
3051 struct pglist_data *oldpg = stock->cached_pgdat;
3052
3053 if (stock->nr_slab_reclaimable_b) {
3054 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
3055 stock->nr_slab_reclaimable_b);
3056 stock->nr_slab_reclaimable_b = 0;
3057 }
3058 if (stock->nr_slab_unreclaimable_b) {
3059 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
3060 stock->nr_slab_unreclaimable_b);
3061 stock->nr_slab_unreclaimable_b = 0;
3062 }
3063 stock->cached_pgdat = pgdat;
3064 }
3065
3066 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
3067 : &stock->nr_slab_unreclaimable_b;
3068
3069
3070
3071
3072 if (!*bytes) {
3073 *bytes = nr;
3074 nr = 0;
3075 } else {
3076 *bytes += nr;
3077 if (abs(*bytes) > PAGE_SIZE) {
3078 nr = *bytes;
3079 *bytes = 0;
3080 } else {
3081 nr = 0;
3082 }
3083 }
3084 if (nr)
3085 mod_objcg_mlstate(objcg, pgdat, idx, nr);
3086
3087 put_obj_stock(flags);
3088}
3089
3090static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3091{
3092 unsigned long flags;
3093 struct obj_stock *stock = get_obj_stock(&flags);
3094 bool ret = false;
3095
3096 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3097 stock->nr_bytes -= nr_bytes;
3098 ret = true;
3099 }
3100
3101 put_obj_stock(flags);
3102
3103 return ret;
3104}
3105
3106static void drain_obj_stock(struct obj_stock *stock)
3107{
3108 struct obj_cgroup *old = stock->cached_objcg;
3109
3110 if (!old)
3111 return;
3112
3113 if (stock->nr_bytes) {
3114 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3115 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3116
3117 if (nr_pages)
3118 obj_cgroup_uncharge_pages(old, nr_pages);
3119
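		/*
		 * The leftover is flushed to the centralized per-memcg value.
		 * On the next attempt to refill obj stock it will be moved
		 * to a per-cpu stock (probably, next time it will be charged
		 * again by consume_obj_stock()).
		 *
		 * How often it's flushed is a trade-off between the memory
		 * limit enforcement accuracy and potential CPU contention,
		 * so it might be changed in the future.
		 */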
3130 atomic_add(nr_bytes, &old->nr_charged_bytes);
3131 stock->nr_bytes = 0;
3132 }
3133
3134
3135
3136
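	/*
	 * Flush the vmstat data in the current stock.
	 */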
3137 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
3138 if (stock->nr_slab_reclaimable_b) {
3139 mod_objcg_mlstate(old, stock->cached_pgdat,
3140 NR_SLAB_RECLAIMABLE_B,
3141 stock->nr_slab_reclaimable_b);
3142 stock->nr_slab_reclaimable_b = 0;
3143 }
3144 if (stock->nr_slab_unreclaimable_b) {
3145 mod_objcg_mlstate(old, stock->cached_pgdat,
3146 NR_SLAB_UNRECLAIMABLE_B,
3147 stock->nr_slab_unreclaimable_b);
3148 stock->nr_slab_unreclaimable_b = 0;
3149 }
3150 stock->cached_pgdat = NULL;
3151 }
3152
3153 obj_cgroup_put(old);
3154 stock->cached_objcg = NULL;
3155}
3156
3157static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3158 struct mem_cgroup *root_memcg)
3159{
3160 struct mem_cgroup *memcg;
3161
3162 if (in_task() && stock->task_obj.cached_objcg) {
3163 memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
3164 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3165 return true;
3166 }
3167 if (stock->irq_obj.cached_objcg) {
3168 memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
3169 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3170 return true;
3171 }
3172
3173 return false;
3174}
3175
3176static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3177 bool allow_uncharge)
3178{
3179 unsigned long flags;
3180 struct obj_stock *stock = get_obj_stock(&flags);
3181 unsigned int nr_pages = 0;
3182
3183 if (stock->cached_objcg != objcg) {
3184 drain_obj_stock(stock);
3185 obj_cgroup_get(objcg);
3186 stock->cached_objcg = objcg;
3187 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3188 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3189 allow_uncharge = true;
3190 }
3191 stock->nr_bytes += nr_bytes;
3192
3193 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3194 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3195 stock->nr_bytes &= (PAGE_SIZE - 1);
3196 }
3197
3198 put_obj_stock(flags);
3199
3200 if (nr_pages)
3201 obj_cgroup_uncharge_pages(objcg, nr_pages);
3202}
3203
3204int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3205{
3206 unsigned int nr_pages, nr_bytes;
3207 int ret;
3208
3209 if (consume_obj_stock(objcg, size))
3210 return 0;
3211
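	/*
	 * In theory, objcg->nr_charged_bytes can have enough pre-charged
	 * bytes to satisfy the allocation.  However, flushing it requires
	 * two atomic operations, it can't hold much, and it can become a
	 * contention point if all tasks of a memcg update it.  So ignore
	 * it and charge fresh pages; the stock's nr_bytes will be flushed
	 * to objcg->nr_charged_bytes later on, when the objcg changes.
	 *
	 * The stock's nr_bytes may also contain enough pre-charged bytes
	 * to allow one less page to be charged, but we can't rely on those
	 * bytes not changing outside of consume_obj_stock() and
	 * refill_obj_stock().  To avoid a page being uncharged right after
	 * it was charged, refill_obj_stock() is called with allow_uncharge
	 * set to false, temporarily letting the pre-charged bytes exceed
	 * the page size limit.
	 */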
3235 nr_pages = size >> PAGE_SHIFT;
3236 nr_bytes = size & (PAGE_SIZE - 1);
3237
3238 if (nr_bytes)
3239 nr_pages += 1;
3240
3241 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3242 if (!ret && nr_bytes)
3243 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
3244
3245 return ret;
3246}
3247
3248void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3249{
3250 refill_obj_stock(objcg, size, true);
3251}
3252
3253#endif
3254
3255
3256
3257
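/*
 * Because page_memcg(head) is not set on tails, set it now.
 */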
3258void split_page_memcg(struct page *head, unsigned int nr)
3259{
3260 struct mem_cgroup *memcg = page_memcg(head);
3261 int i;
3262
3263 if (mem_cgroup_disabled() || !memcg)
3264 return;
3265
3266 for (i = 1; i < nr; i++)
3267 head[i].memcg_data = head->memcg_data;
3268
3269 if (PageMemcgKmem(head))
3270 obj_cgroup_get_many(__page_objcg(head), nr - 1);
3271 else
3272 css_get_many(&memcg->css, nr - 1);
3273}
3274
3275#ifdef CONFIG_MEMCG_SWAP
3276
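/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge()
 * about both res and memsw, and called css_get().
 */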
3290static int mem_cgroup_move_swap_account(swp_entry_t entry,
3291 struct mem_cgroup *from, struct mem_cgroup *to)
3292{
3293 unsigned short old_id, new_id;
3294
3295 old_id = mem_cgroup_id(from);
3296 new_id = mem_cgroup_id(to);
3297
3298 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3299 mod_memcg_state(from, MEMCG_SWAP, -1);
3300 mod_memcg_state(to, MEMCG_SWAP, 1);
3301 return 0;
3302 }
3303 return -EINVAL;
3304}
3305#else
3306static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3307 struct mem_cgroup *from, struct mem_cgroup *to)
3308{
3309 return -EINVAL;
3310}
3311#endif
3312
3313static DEFINE_MUTEX(memcg_max_mutex);
3314
3315static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3316 unsigned long max, bool memsw)
3317{
3318 bool enlarge = false;
3319 bool drained = false;
3320 int ret;
3321 bool limits_invariant;
3322 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3323
3324 do {
3325 if (signal_pending(current)) {
3326 ret = -EINTR;
3327 break;
3328 }
3329
3330 mutex_lock(&memcg_max_mutex);
3331
3332
3333
3334
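		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */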
3335 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3336 max <= memcg->memsw.max;
3337 if (!limits_invariant) {
3338 mutex_unlock(&memcg_max_mutex);
3339 ret = -EINVAL;
3340 break;
3341 }
3342 if (max > counter->max)
3343 enlarge = true;
3344 ret = page_counter_set_max(counter, max);
3345 mutex_unlock(&memcg_max_mutex);
3346
3347 if (!ret)
3348 break;
3349
3350 if (!drained) {
3351 drain_all_stock(memcg);
3352 drained = true;
3353 continue;
3354 }
3355
3356 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3357 GFP_KERNEL, !memsw)) {
3358 ret = -EBUSY;
3359 break;
3360 }
3361 } while (true);
3362
3363 if (!ret && enlarge)
3364 memcg_oom_recover(memcg);
3365
3366 return ret;
3367}
3368
3369unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3370 gfp_t gfp_mask,
3371 unsigned long *total_scanned)
3372{
3373 unsigned long nr_reclaimed = 0;
3374 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3375 unsigned long reclaimed;
3376 int loop = 0;
3377 struct mem_cgroup_tree_per_node *mctz;
3378 unsigned long excess;
3379 unsigned long nr_scanned;
3380
3381 if (order > 0)
3382 return 0;
3383
3384 mctz = soft_limit_tree_node(pgdat->node_id);
3385
3386
3387
3388
3389
3390
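	/*
	 * Do not even bother to check the largest node if the root
	 * is empty.  Do it lockless to prevent lock bouncing.  Races
	 * are acceptable as soft limit is best effort anyway.
	 */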
3391 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3392 return 0;
3393
3394
3395
3396
3397
3398
3399 do {
3400 if (next_mz)
3401 mz = next_mz;
3402 else
3403 mz = mem_cgroup_largest_soft_limit_node(mctz);
3404 if (!mz)
3405 break;
3406
3407 nr_scanned = 0;
3408 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3409 gfp_mask, &nr_scanned);
3410 nr_reclaimed += reclaimed;
3411 *total_scanned += nr_scanned;
3412 spin_lock_irq(&mctz->lock);
3413 __mem_cgroup_remove_exceeded(mz, mctz);
3414
3415
3416
3417
3418
3419 next_mz = NULL;
3420 if (!reclaimed)
3421 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3422
3423 excess = soft_limit_excess(mz->memcg);
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3434 spin_unlock_irq(&mctz->lock);
3435 css_put(&mz->memcg->css);
3436 loop++;
3437
3438
3439
3440
3441
3442 if (!nr_reclaimed &&
3443 (next_mz == NULL ||
3444 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3445 break;
3446 } while (!nr_reclaimed);
3447 if (next_mz)
3448 css_put(&next_mz->memcg->css);
3449 return nr_reclaimed;
3450}
3451
3452
3453
3454
3455
3456
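/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */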
3457static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3458{
3459 int nr_retries = MAX_RECLAIM_RETRIES;
3460
3461
3462 lru_add_drain_all();
3463
3464 drain_all_stock(memcg);
3465
3466
3467 while (nr_retries && page_counter_read(&memcg->memory)) {
3468 int progress;
3469
3470 if (signal_pending(current))
3471 return -EINTR;
3472
3473 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3474 GFP_KERNEL, true);
3475 if (!progress) {
3476 nr_retries--;
3477
3478 congestion_wait(BLK_RW_ASYNC, HZ/10);
3479 }
3480
3481 }
3482
3483 return 0;
3484}
3485
3486static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3487 char *buf, size_t nbytes,
3488 loff_t off)
3489{
3490 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3491
3492 if (mem_cgroup_is_root(memcg))
3493 return -EINVAL;
3494 return mem_cgroup_force_empty(memcg) ?: nbytes;
3495}
3496
3497static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3498 struct cftype *cft)
3499{
3500 return 1;
3501}
3502
3503static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3504 struct cftype *cft, u64 val)
3505{
3506 if (val == 1)
3507 return 0;
3508
3509 pr_warn_once("Non-hierarchical mode is deprecated. "
3510 "Please report your usecase to linux-mm@kvack.org if you "
3511 "depend on this functionality.\n");
3512
3513 return -EINVAL;
3514}
3515
3516static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3517{
3518 unsigned long val;
3519
3520 if (mem_cgroup_is_root(memcg)) {
3521
3522 cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
3523 val = memcg_page_state(memcg, NR_FILE_PAGES) +
3524 memcg_page_state(memcg, NR_ANON_MAPPED);
3525 if (swap)
3526 val += memcg_page_state(memcg, MEMCG_SWAP);
3527 } else {
3528 if (!swap)
3529 val = page_counter_read(&memcg->memory);
3530 else
3531 val = page_counter_read(&memcg->memsw);
3532 }
3533 return val;
3534}
3535
3536enum {
3537 RES_USAGE,
3538 RES_LIMIT,
3539 RES_MAX_USAGE,
3540 RES_FAILCNT,
3541 RES_SOFT_LIMIT,
3542};
3543
3544static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3545 struct cftype *cft)
3546{
3547 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3548 struct page_counter *counter;
3549
3550 switch (MEMFILE_TYPE(cft->private)) {
3551 case _MEM:
3552 counter = &memcg->memory;
3553 break;
3554 case _MEMSWAP:
3555 counter = &memcg->memsw;
3556 break;
3557 case _KMEM:
3558 counter = &memcg->kmem;
3559 break;
3560 case _TCP:
3561 counter = &memcg->tcpmem;
3562 break;
3563 default:
3564 BUG();
3565 }
3566
3567 switch (MEMFILE_ATTR(cft->private)) {
3568 case RES_USAGE:
3569 if (counter == &memcg->memory)
3570 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3571 if (counter == &memcg->memsw)
3572 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3573 return (u64)page_counter_read(counter) * PAGE_SIZE;
3574 case RES_LIMIT:
3575 return (u64)counter->max * PAGE_SIZE;
3576 case RES_MAX_USAGE:
3577 return (u64)counter->watermark * PAGE_SIZE;
3578 case RES_FAILCNT:
3579 return counter->failcnt;
3580 case RES_SOFT_LIMIT:
3581 return (u64)memcg->soft_limit * PAGE_SIZE;
3582 default:
3583 BUG();
3584 }
3585}
3586
3587#ifdef CONFIG_MEMCG_KMEM
3588static int memcg_online_kmem(struct mem_cgroup *memcg)
3589{
3590 struct obj_cgroup *objcg;
3591 int memcg_id;
3592
3593 if (cgroup_memory_nokmem)
3594 return 0;
3595
3596 BUG_ON(memcg->kmemcg_id >= 0);
3597 BUG_ON(memcg->kmem_state);
3598
3599 memcg_id = memcg_alloc_cache_id();
3600 if (memcg_id < 0)
3601 return memcg_id;
3602
3603 objcg = obj_cgroup_alloc();
3604 if (!objcg) {
3605 memcg_free_cache_id(memcg_id);
3606 return -ENOMEM;
3607 }
3608 objcg->memcg = memcg;
3609 rcu_assign_pointer(memcg->objcg, objcg);
3610
3611 static_branch_enable(&memcg_kmem_enabled_key);
3612
3613 memcg->kmemcg_id = memcg_id;
3614 memcg->kmem_state = KMEM_ONLINE;
3615
3616 return 0;
3617}
3618
3619static void memcg_offline_kmem(struct mem_cgroup *memcg)
3620{
3621 struct cgroup_subsys_state *css;
3622 struct mem_cgroup *parent, *child;
3623 int kmemcg_id;
3624
3625 if (memcg->kmem_state != KMEM_ONLINE)
3626 return;
3627
3628 memcg->kmem_state = KMEM_ALLOCATED;
3629
3630 parent = parent_mem_cgroup(memcg);
3631 if (!parent)
3632 parent = root_mem_cgroup;
3633
3634 memcg_reparent_objcgs(memcg, parent);
3635
3636 kmemcg_id = memcg->kmemcg_id;
3637 BUG_ON(kmemcg_id < 0);
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647 rcu_read_lock();
3648 css_for_each_descendant_pre(css, &memcg->css) {
3649 child = mem_cgroup_from_css(css);
3650 BUG_ON(child->kmemcg_id != kmemcg_id);
3651 child->kmemcg_id = parent->kmemcg_id;
3652 }
3653 rcu_read_unlock();
3654
3655 memcg_drain_all_list_lrus(kmemcg_id, parent);
3656
3657 memcg_free_cache_id(kmemcg_id);
3658}
3659
3660static void memcg_free_kmem(struct mem_cgroup *memcg)
3661{
3662
3663 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3664 memcg_offline_kmem(memcg);
3665}
3666#else
3667static int memcg_online_kmem(struct mem_cgroup *memcg)
3668{
3669 return 0;
3670}
3671static void memcg_offline_kmem(struct mem_cgroup *memcg)
3672{
3673}
3674static void memcg_free_kmem(struct mem_cgroup *memcg)
3675{
3676}
3677#endif
3678
3679static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3680 unsigned long max)
3681{
3682 int ret;
3683
3684 mutex_lock(&memcg_max_mutex);
3685 ret = page_counter_set_max(&memcg->kmem, max);
3686 mutex_unlock(&memcg_max_mutex);
3687 return ret;
3688}
3689
3690static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3691{
3692 int ret;
3693
3694 mutex_lock(&memcg_max_mutex);
3695
3696 ret = page_counter_set_max(&memcg->tcpmem, max);
3697 if (ret)
3698 goto out;
3699
3700 if (!memcg->tcpmem_active) {
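		/*
		 * The active flag needs to be written after the static_key
		 * update.  This is what guarantees that the socket activation
		 * function is the last one to run.  See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order.  If we mark a
		 * socket as accounted, but the accounting functions are not
		 * patched in yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value changes, the code to process it is
		 * not patched in yet.
		 */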
3717 static_branch_inc(&memcg_sockets_enabled_key);
3718 memcg->tcpmem_active = true;
3719 }
3720out:
3721 mutex_unlock(&memcg_max_mutex);
3722 return ret;
3723}
3724
3725
3726
3727
3728
3729static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3730 char *buf, size_t nbytes, loff_t off)
3731{
3732 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3733 unsigned long nr_pages;
3734 int ret;
3735
3736 buf = strstrip(buf);
3737 ret = page_counter_memparse(buf, "-1", &nr_pages);
3738 if (ret)
3739 return ret;
3740
3741 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3742 case RES_LIMIT:
3743 if (mem_cgroup_is_root(memcg)) {
3744 ret = -EINVAL;
3745 break;
3746 }
3747 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3748 case _MEM:
3749 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3750 break;
3751 case _MEMSWAP:
3752 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3753 break;
3754 case _KMEM:
3755 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3756 "Please report your usecase to linux-mm@kvack.org if you "
3757 "depend on this functionality.\n");
3758 ret = memcg_update_kmem_max(memcg, nr_pages);
3759 break;
3760 case _TCP:
3761 ret = memcg_update_tcp_max(memcg, nr_pages);
3762 break;
3763 }
3764 break;
3765 case RES_SOFT_LIMIT:
3766 memcg->soft_limit = nr_pages;
3767 ret = 0;
3768 break;
3769 }
3770 return ret ?: nbytes;
3771}
3772
3773static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3774 size_t nbytes, loff_t off)
3775{
3776 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3777 struct page_counter *counter;
3778
3779 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3780 case _MEM:
3781 counter = &memcg->memory;
3782 break;
3783 case _MEMSWAP:
3784 counter = &memcg->memsw;
3785 break;
3786 case _KMEM:
3787 counter = &memcg->kmem;
3788 break;
3789 case _TCP:
3790 counter = &memcg->tcpmem;
3791 break;
3792 default:
3793 BUG();
3794 }
3795
3796 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3797 case RES_MAX_USAGE:
3798 page_counter_reset_watermark(counter);
3799 break;
3800 case RES_FAILCNT:
3801 counter->failcnt = 0;
3802 break;
3803 default:
3804 BUG();
3805 }
3806
3807 return nbytes;
3808}
3809
3810static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3811 struct cftype *cft)
3812{
3813 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3814}
3815
3816#ifdef CONFIG_MMU
3817static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3818 struct cftype *cft, u64 val)
3819{
3820 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3821
3822 if (val & ~MOVE_MASK)
3823 return -EINVAL;
3824
3825
3826
3827
3828
3829
3830
3831 memcg->move_charge_at_immigrate = val;
3832 return 0;
3833}
3834#else
3835static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3836 struct cftype *cft, u64 val)
3837{
3838 return -ENOSYS;
3839}
3840#endif
3841
3842#ifdef CONFIG_NUMA
3843
3844#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3845#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3846#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3847
3848static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3849 int nid, unsigned int lru_mask, bool tree)
3850{
3851 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3852 unsigned long nr = 0;
3853 enum lru_list lru;
3854
3855 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3856
3857 for_each_lru(lru) {
3858 if (!(BIT(lru) & lru_mask))
3859 continue;
3860 if (tree)
3861 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3862 else
3863 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3864 }
3865 return nr;
3866}
3867
3868static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3869 unsigned int lru_mask,
3870 bool tree)
3871{
3872 unsigned long nr = 0;
3873 enum lru_list lru;
3874
3875 for_each_lru(lru) {
3876 if (!(BIT(lru) & lru_mask))
3877 continue;
3878 if (tree)
3879 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3880 else
3881 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3882 }
3883 return nr;
3884}
3885
3886static int memcg_numa_stat_show(struct seq_file *m, void *v)
3887{
3888 struct numa_stat {
3889 const char *name;
3890 unsigned int lru_mask;
3891 };
3892
3893 static const struct numa_stat stats[] = {
3894 { "total", LRU_ALL },
3895 { "file", LRU_ALL_FILE },
3896 { "anon", LRU_ALL_ANON },
3897 { "unevictable", BIT(LRU_UNEVICTABLE) },
3898 };
3899 const struct numa_stat *stat;
3900 int nid;
3901 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3902
3903 cgroup_rstat_flush(memcg->css.cgroup);
3904
3905 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3906 seq_printf(m, "%s=%lu", stat->name,
3907 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3908 false));
3909 for_each_node_state(nid, N_MEMORY)
3910 seq_printf(m, " N%d=%lu", nid,
3911 mem_cgroup_node_nr_lru_pages(memcg, nid,
3912 stat->lru_mask, false));
3913 seq_putc(m, '\n');
3914 }
3915
3916 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3917
3918 seq_printf(m, "hierarchical_%s=%lu", stat->name,
3919 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3920 true));
3921 for_each_node_state(nid, N_MEMORY)
3922 seq_printf(m, " N%d=%lu", nid,
3923 mem_cgroup_node_nr_lru_pages(memcg, nid,
3924 stat->lru_mask, true));
3925 seq_putc(m, '\n');
3926 }
3927
3928 return 0;
3929}
3930#endif
3931
3932static const unsigned int memcg1_stats[] = {
3933 NR_FILE_PAGES,
3934 NR_ANON_MAPPED,
3935#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3936 NR_ANON_THPS,
3937#endif
3938 NR_SHMEM,
3939 NR_FILE_MAPPED,
3940 NR_FILE_DIRTY,
3941 NR_WRITEBACK,
3942 MEMCG_SWAP,
3943};
3944
3945static const char *const memcg1_stat_names[] = {
3946 "cache",
3947 "rss",
3948#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3949 "rss_huge",
3950#endif
3951 "shmem",
3952 "mapped_file",
3953 "dirty",
3954 "writeback",
3955 "swap",
3956};
3957
3958
3959static const unsigned int memcg1_events[] = {
3960 PGPGIN,
3961 PGPGOUT,
3962 PGFAULT,
3963 PGMAJFAULT,
3964};
3965
3966static int memcg_stat_show(struct seq_file *m, void *v)
3967{
3968 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3969 unsigned long memory, memsw;
3970 struct mem_cgroup *mi;
3971 unsigned int i;
3972
3973 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3974
3975 cgroup_rstat_flush(memcg->css.cgroup);
3976
3977 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3978 unsigned long nr;
3979
3980 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3981 continue;
3982 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
3983 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
3984 }
3985
3986 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3987 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3988 memcg_events_local(memcg, memcg1_events[i]));
3989
3990 for (i = 0; i < NR_LRU_LISTS; i++)
3991 seq_printf(m, "%s %lu\n", lru_list_name(i),
3992 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3993 PAGE_SIZE);
3994
3995
3996 memory = memsw = PAGE_COUNTER_MAX;
3997 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3998 memory = min(memory, READ_ONCE(mi->memory.max));
3999 memsw = min(memsw, READ_ONCE(mi->memsw.max));
4000 }
4001 seq_printf(m, "hierarchical_memory_limit %llu\n",
4002 (u64)memory * PAGE_SIZE);
4003 if (do_memsw_account())
4004 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4005 (u64)memsw * PAGE_SIZE);
4006
4007 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4008 unsigned long nr;
4009
4010 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4011 continue;
4012 nr = memcg_page_state(memcg, memcg1_stats[i]);
4013 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4014 (u64)nr * PAGE_SIZE);
4015 }
4016
4017 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4018 seq_printf(m, "total_%s %llu\n",
4019 vm_event_name(memcg1_events[i]),
4020 (u64)memcg_events(memcg, memcg1_events[i]));
4021
4022 for (i = 0; i < NR_LRU_LISTS; i++)
4023 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4024 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4025 PAGE_SIZE);
4026
4027#ifdef CONFIG_DEBUG_VM
4028 {
4029 pg_data_t *pgdat;
4030 struct mem_cgroup_per_node *mz;
4031 unsigned long anon_cost = 0;
4032 unsigned long file_cost = 0;
4033
4034 for_each_online_pgdat(pgdat) {
4035 mz = memcg->nodeinfo[pgdat->node_id];
4036
4037 anon_cost += mz->lruvec.anon_cost;
4038 file_cost += mz->lruvec.file_cost;
4039 }
4040 seq_printf(m, "anon_cost %lu\n", anon_cost);
4041 seq_printf(m, "file_cost %lu\n", file_cost);
4042 }
4043#endif
4044
4045 return 0;
4046}
4047
4048static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4049 struct cftype *cft)
4050{
4051 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4052
4053 return mem_cgroup_swappiness(memcg);
4054}
4055
4056static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4057 struct cftype *cft, u64 val)
4058{
4059 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4060
4061 if (val > 200)
4062 return -EINVAL;
4063
4064 if (!mem_cgroup_is_root(memcg))
4065 memcg->swappiness = val;
4066 else
4067 vm_swappiness = val;
4068
4069 return 0;
4070}
4071
4072static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4073{
4074 struct mem_cgroup_threshold_ary *t;
4075 unsigned long usage;
4076 int i;
4077
4078 rcu_read_lock();
4079 if (!swap)
4080 t = rcu_dereference(memcg->thresholds.primary);
4081 else
4082 t = rcu_dereference(memcg->memsw_thresholds.primary);
4083
4084 if (!t)
4085 goto unlock;
4086
4087 usage = mem_cgroup_usage(memcg, swap);
4088
4089
4090
4091
4092
4093
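	/*
	 * current_threshold points to the threshold just below or equal to
	 * usage.  If that's no longer true, a threshold was crossed after
	 * the last call of __mem_cgroup_threshold().
	 */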
4094 i = t->current_threshold;
4095
4096
4097
4098
4099
4100
4101
4102 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4103 eventfd_signal(t->entries[i].eventfd, 1);
4104
4105
4106 i++;
4107
4108
4109
4110
4111
4112
4113
4114 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4115 eventfd_signal(t->entries[i].eventfd, 1);
4116
4117
4118 t->current_threshold = i - 1;
4119unlock:
4120 rcu_read_unlock();
4121}
4122
4123static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4124{
4125 while (memcg) {
4126 __mem_cgroup_threshold(memcg, false);
4127 if (do_memsw_account())
4128 __mem_cgroup_threshold(memcg, true);
4129
4130 memcg = parent_mem_cgroup(memcg);
4131 }
4132}
4133
4134static int compare_thresholds(const void *a, const void *b)
4135{
4136 const struct mem_cgroup_threshold *_a = a;
4137 const struct mem_cgroup_threshold *_b = b;
4138
4139 if (_a->threshold > _b->threshold)
4140 return 1;
4141
4142 if (_a->threshold < _b->threshold)
4143 return -1;
4144
4145 return 0;
4146}
4147
4148static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4149{
4150 struct mem_cgroup_eventfd_list *ev;
4151
4152 spin_lock(&memcg_oom_lock);
4153
4154 list_for_each_entry(ev, &memcg->oom_notify, list)
4155 eventfd_signal(ev->eventfd, 1);
4156
4157 spin_unlock(&memcg_oom_lock);
4158 return 0;
4159}
4160
4161static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4162{
4163 struct mem_cgroup *iter;
4164
4165 for_each_mem_cgroup_tree(iter, memcg)
4166 mem_cgroup_oom_notify_cb(iter);
4167}
4168
4169static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4170 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4171{
4172 struct mem_cgroup_thresholds *thresholds;
4173 struct mem_cgroup_threshold_ary *new;
4174 unsigned long threshold;
4175 unsigned long usage;
4176 int i, size, ret;
4177
4178 ret = page_counter_memparse(args, "-1", &threshold);
4179 if (ret)
4180 return ret;
4181
4182 mutex_lock(&memcg->thresholds_lock);
4183
4184 if (type == _MEM) {
4185 thresholds = &memcg->thresholds;
4186 usage = mem_cgroup_usage(memcg, false);
4187 } else if (type == _MEMSWAP) {
4188 thresholds = &memcg->memsw_thresholds;
4189 usage = mem_cgroup_usage(memcg, true);
4190 } else
4191 BUG();
4192
4193
4194 if (thresholds->primary)
4195 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4196
4197 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4198
4199
4200 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4201 if (!new) {
4202 ret = -ENOMEM;
4203 goto unlock;
4204 }
4205 new->size = size;
4206
4207
4208 if (thresholds->primary)
4209 memcpy(new->entries, thresholds->primary->entries,
4210 flex_array_size(new, entries, size - 1));
4211
4212
4213 new->entries[size - 1].eventfd = eventfd;
4214 new->entries[size - 1].threshold = threshold;
4215
4216
4217 sort(new->entries, size, sizeof(*new->entries),
4218 compare_thresholds, NULL);
4219
4220
4221 new->current_threshold = -1;
4222 for (i = 0; i < size; i++) {
4223 if (new->entries[i].threshold <= usage) {
4224
4225
4226
4227
4228
4229 ++new->current_threshold;
4230 } else
4231 break;
4232 }
4233
4234
4235 kfree(thresholds->spare);
4236 thresholds->spare = thresholds->primary;
4237
4238 rcu_assign_pointer(thresholds->primary, new);
4239
4240
4241 synchronize_rcu();
4242
4243unlock:
4244 mutex_unlock(&memcg->thresholds_lock);
4245
4246 return ret;
4247}
4248
4249static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4250 struct eventfd_ctx *eventfd, const char *args)
4251{
4252 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4253}
4254
4255static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4256 struct eventfd_ctx *eventfd, const char *args)
4257{
4258 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4259}
4260
4261static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4262 struct eventfd_ctx *eventfd, enum res_type type)
4263{
4264 struct mem_cgroup_thresholds *thresholds;
4265 struct mem_cgroup_threshold_ary *new;
4266 unsigned long usage;
4267 int i, j, size, entries;
4268
4269 mutex_lock(&memcg->thresholds_lock);
4270
4271 if (type == _MEM) {
4272 thresholds = &memcg->thresholds;
4273 usage = mem_cgroup_usage(memcg, false);
4274 } else if (type == _MEMSWAP) {
4275 thresholds = &memcg->memsw_thresholds;
4276 usage = mem_cgroup_usage(memcg, true);
4277 } else
4278 BUG();
4279
4280 if (!thresholds->primary)
4281 goto unlock;
4282
4283
4284 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4285
4286
4287 size = entries = 0;
4288 for (i = 0; i < thresholds->primary->size; i++) {
4289 if (thresholds->primary->entries[i].eventfd != eventfd)
4290 size++;
4291 else
4292 entries++;
4293 }
4294
4295 new = thresholds->spare;
4296
4297
4298 if (!entries)
4299 goto unlock;
4300
4301
4302 if (!size) {
4303 kfree(new);
4304 new = NULL;
4305 goto swap_buffers;
4306 }
4307
4308 new->size = size;
4309
4310
4311 new->current_threshold = -1;
4312 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4313 if (thresholds->primary->entries[i].eventfd == eventfd)
4314 continue;
4315
4316 new->entries[j] = thresholds->primary->entries[i];
4317 if (new->entries[j].threshold <= usage) {
4318
4319
4320
4321
4322
4323 ++new->current_threshold;
4324 }
4325 j++;
4326 }
4327
4328swap_buffers:
4329
4330 thresholds->spare = thresholds->primary;
4331
4332 rcu_assign_pointer(thresholds->primary, new);
4333
4334
4335 synchronize_rcu();
4336
4337
4338 if (!new) {
4339 kfree(thresholds->spare);
4340 thresholds->spare = NULL;
4341 }
4342unlock:
4343 mutex_unlock(&memcg->thresholds_lock);
4344}
4345
4346static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4347 struct eventfd_ctx *eventfd)
4348{
4349 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4350}
4351
4352static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4353 struct eventfd_ctx *eventfd)
4354{
4355 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4356}
4357
4358static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4359 struct eventfd_ctx *eventfd, const char *args)
4360{
4361 struct mem_cgroup_eventfd_list *event;
4362
4363 event = kmalloc(sizeof(*event), GFP_KERNEL);
4364 if (!event)
4365 return -ENOMEM;
4366
4367 spin_lock(&memcg_oom_lock);
4368
4369 event->eventfd = eventfd;
4370 list_add(&event->list, &memcg->oom_notify);
4371
4372
4373 if (memcg->under_oom)
4374 eventfd_signal(eventfd, 1);
4375 spin_unlock(&memcg_oom_lock);
4376
4377 return 0;
4378}
4379
4380static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4381 struct eventfd_ctx *eventfd)
4382{
4383 struct mem_cgroup_eventfd_list *ev, *tmp;
4384
4385 spin_lock(&memcg_oom_lock);
4386
4387 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4388 if (ev->eventfd == eventfd) {
4389 list_del(&ev->list);
4390 kfree(ev);
4391 }
4392 }
4393
4394 spin_unlock(&memcg_oom_lock);
4395}
4396
4397static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4398{
4399 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4400
4401 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4402 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4403 seq_printf(sf, "oom_kill %lu\n",
4404 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4405 return 0;
4406}
4407
4408static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4409 struct cftype *cft, u64 val)
4410{
4411 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4412
4413
4414 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4415 return -EINVAL;
4416
4417 memcg->oom_kill_disable = val;
4418 if (!val)
4419 memcg_oom_recover(memcg);
4420
4421 return 0;
4422}
4423
4424#ifdef CONFIG_CGROUP_WRITEBACK
4425
4426#include <trace/events/writeback.h>
4427
4428static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4429{
4430 return wb_domain_init(&memcg->cgwb_domain, gfp);
4431}
4432
4433static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4434{
4435 wb_domain_exit(&memcg->cgwb_domain);
4436}
4437
4438static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4439{
4440 wb_domain_size_changed(&memcg->cgwb_domain);
4441}
4442
4443struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4444{
4445 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4446
4447 if (!memcg->css.parent)
4448 return NULL;
4449
4450 return &memcg->cgwb_domain;
4451}
4452
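/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors.  Note that this doesn't consider the actual amount of
 * available memory in the system.  The caller should further cap
 * *@pheadroom accordingly.
 */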
4471void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4472 unsigned long *pheadroom, unsigned long *pdirty,
4473 unsigned long *pwriteback)
4474{
4475 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4476 struct mem_cgroup *parent;
4477
4478 cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
4479
4480 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4481 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4482 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4483 memcg_page_state(memcg, NR_ACTIVE_FILE);
4484
4485 *pheadroom = PAGE_COUNTER_MAX;
4486 while ((parent = parent_mem_cgroup(memcg))) {
4487 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4488 READ_ONCE(memcg->memory.high));
4489 unsigned long used = page_counter_read(&memcg->memory);
4490
4491 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4492 memcg = parent;
4493 }
4494}
4495
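/*
 * Foreign dirty flushing
 *
 * There's an inherent mismatch between memcg and writeback.  The former
 * tracks ownership per-page while the latter per-inode.  This was a
 * deliberate design decision because honoring per-page ownership in the
 * writeback path is complicated, may lead to higher CPU and IO overheads
 * and deemed unnecessary given that write-sharing an inode across
 * different cgroups isn't a common use-case.
 *
 * Combined with inode majority-writer ownership switching, this works well
 * enough in most cases but there are pathological cases.  For example, a
 * cgroup that keeps dirtying a confined part of an inode owned by a much
 * larger cgroup can end up repeatedly and severely throttled in
 * balance_dirty_pages() while the owner never triggers background
 * writeback and the underlying IO device stays mostly idle.
 *
 * The more egregious behaviors can be avoided by remembering the most
 * recent foreign dirtying events and initiating remote flushes on them
 * when local writeback isn't enough to keep the memory clean enough.
 *
 * The following two functions implement such a mechanism: when a foreign
 * page - a page whose memcg and writeback ownerships don't match - is
 * dirtied, the page's owner memcg records the frn (foreign record), and
 * mem_cgroup_flush_foreign() later triggers remote flushes on the
 * recorded bdi_writebacks.
 */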
4540void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4541 struct bdi_writeback *wb)
4542{
4543 struct mem_cgroup *memcg = page_memcg(page);
4544 struct memcg_cgwb_frn *frn;
4545 u64 now = get_jiffies_64();
4546 u64 oldest_at = now;
4547 int oldest = -1;
4548 int i;
4549
4550 trace_track_foreign_dirty(page, wb);
4551
4552
4553
4554
4555
4556
4557 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4558 frn = &memcg->cgwb_frn[i];
4559 if (frn->bdi_id == wb->bdi->id &&
4560 frn->memcg_id == wb->memcg_css->id)
4561 break;
4562 if (time_before64(frn->at, oldest_at) &&
4563 atomic_read(&frn->done.cnt) == 1) {
4564 oldest = i;
4565 oldest_at = frn->at;
4566 }
4567 }
4568
4569 if (i < MEMCG_CGWB_FRN_CNT) {
4570
4571
4572
4573
4574
4575
4576
4577 unsigned long update_intv =
4578 min_t(unsigned long, HZ,
4579 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4580
4581 if (time_before64(frn->at, now - update_intv))
4582 frn->at = now;
4583 } else if (oldest >= 0) {
4584
4585 frn = &memcg->cgwb_frn[oldest];
4586 frn->bdi_id = wb->bdi->id;
4587 frn->memcg_id = wb->memcg_css->id;
4588 frn->at = now;
4589 }
4590}
4591
4592
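/* flush foreign cgroups associated with this wb's memcg */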
4593void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4594{
4595 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4596 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4597 u64 now = jiffies_64;
4598 int i;
4599
4600 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4601 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4602
4603
4604
4605
4606
4607
4608
4609 if (time_after64(frn->at, now - intv) &&
4610 atomic_read(&frn->done.cnt) == 1) {
4611 frn->at = 0;
4612 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4613 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
4614 WB_REASON_FOREIGN_FLUSH,
4615 &frn->done);
4616 }
4617 }
4618}
4619
4620#else
4621
4622static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4623{
4624 return 0;
4625}
4626
4627static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4628{
4629}
4630
4631static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4632{
4633}
4634
4635#endif
4636
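/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler.  Eventually we
 * may be able to remove this completely.
 *
 * memcg_event_remove() unregisters an event and frees its resources.
 * It gets called from the workqueue.
 */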
4655static void memcg_event_remove(struct work_struct *work)
4656{
4657 struct mem_cgroup_event *event =
4658 container_of(work, struct mem_cgroup_event, remove);
4659 struct mem_cgroup *memcg = event->memcg;
4660
4661 remove_wait_queue(event->wqh, &event->wait);
4662
4663 event->unregister_event(memcg, event->eventfd);
4664
4665
4666 eventfd_signal(event->eventfd, 1);
4667
4668 eventfd_ctx_put(event->eventfd);
4669 kfree(event);
4670 css_put(&memcg->css);
4671}
4672
4673
4674
4675
4676
4677
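/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */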
4678static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4679 int sync, void *key)
4680{
4681 struct mem_cgroup_event *event =
4682 container_of(wait, struct mem_cgroup_event, wait);
4683 struct mem_cgroup *memcg = event->memcg;
4684 __poll_t flags = key_to_poll(key);
4685
4686 if (flags & EPOLLHUP) {
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696 spin_lock(&memcg->event_list_lock);
4697 if (!list_empty(&event->list)) {
4698 list_del_init(&event->list);
4699
4700
4701
4702
4703 schedule_work(&event->remove);
4704 }
4705 spin_unlock(&memcg->event_list_lock);
4706 }
4707
4708 return 0;
4709}
4710
4711static void memcg_event_ptable_queue_proc(struct file *file,
4712 wait_queue_head_t *wqh, poll_table *pt)
4713{
4714 struct mem_cgroup_event *event =
4715 container_of(pt, struct mem_cgroup_event, pt);
4716
4717 event->wqh = wqh;
4718 add_wait_queue(wqh, &event->wait);
4719}
4720
4721
4722
4723
4724
4725
4726
4727
4728
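/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */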
4729static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4730 char *buf, size_t nbytes, loff_t off)
4731{
4732 struct cgroup_subsys_state *css = of_css(of);
4733 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4734 struct mem_cgroup_event *event;
4735 struct cgroup_subsys_state *cfile_css;
4736 unsigned int efd, cfd;
4737 struct fd efile;
4738 struct fd cfile;
4739 const char *name;
4740 char *endp;
4741 int ret;
4742
4743 buf = strstrip(buf);
4744
4745 efd = simple_strtoul(buf, &endp, 10);
4746 if (*endp != ' ')
4747 return -EINVAL;
4748 buf = endp + 1;
4749
4750 cfd = simple_strtoul(buf, &endp, 10);
4751 if ((*endp != ' ') && (*endp != '\0'))
4752 return -EINVAL;
4753 buf = endp + 1;
4754
4755 event = kzalloc(sizeof(*event), GFP_KERNEL);
4756 if (!event)
4757 return -ENOMEM;
4758
4759 event->memcg = memcg;
4760 INIT_LIST_HEAD(&event->list);
4761 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4762 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4763 INIT_WORK(&event->remove, memcg_event_remove);
4764
4765 efile = fdget(efd);
4766 if (!efile.file) {
4767 ret = -EBADF;
4768 goto out_kfree;
4769 }
4770
4771 event->eventfd = eventfd_ctx_fileget(efile.file);
4772 if (IS_ERR(event->eventfd)) {
4773 ret = PTR_ERR(event->eventfd);
4774 goto out_put_efile;
4775 }
4776
4777 cfile = fdget(cfd);
4778 if (!cfile.file) {
4779 ret = -EBADF;
4780 goto out_put_eventfd;
4781 }
4782
4783
4784
4785 ret = file_permission(cfile.file, MAY_READ);
4786 if (ret < 0)
4787 goto out_put_cfile;
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797 name = cfile.file->f_path.dentry->d_name.name;
4798
4799 if (!strcmp(name, "memory.usage_in_bytes")) {
4800 event->register_event = mem_cgroup_usage_register_event;
4801 event->unregister_event = mem_cgroup_usage_unregister_event;
4802 } else if (!strcmp(name, "memory.oom_control")) {
4803 event->register_event = mem_cgroup_oom_register_event;
4804 event->unregister_event = mem_cgroup_oom_unregister_event;
4805 } else if (!strcmp(name, "memory.pressure_level")) {
4806 event->register_event = vmpressure_register_event;
4807 event->unregister_event = vmpressure_unregister_event;
4808 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4809 event->register_event = memsw_cgroup_usage_register_event;
4810 event->unregister_event = memsw_cgroup_usage_unregister_event;
4811 } else {
4812 ret = -EINVAL;
4813 goto out_put_cfile;
4814 }
4815
4816
4817
4818
4819
4820
4821 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4822 &memory_cgrp_subsys);
4823 ret = -EINVAL;
4824 if (IS_ERR(cfile_css))
4825 goto out_put_cfile;
4826 if (cfile_css != css) {
4827 css_put(cfile_css);
4828 goto out_put_cfile;
4829 }
4830
4831 ret = event->register_event(memcg, event->eventfd, buf);
4832 if (ret)
4833 goto out_put_css;
4834
4835 vfs_poll(efile.file, &event->pt);
4836
4837 spin_lock_irq(&memcg->event_list_lock);
4838 list_add(&event->list, &memcg->event_list);
4839 spin_unlock_irq(&memcg->event_list_lock);
4840
4841 fdput(cfile);
4842 fdput(efile);
4843
4844 return nbytes;
4845
4846out_put_css:
4847 css_put(css);
4848out_put_cfile:
4849 fdput(cfile);
4850out_put_eventfd:
4851 eventfd_ctx_put(event->eventfd);
4852out_put_efile:
4853 fdput(efile);
4854out_kfree:
4855 kfree(event);
4856
4857 return ret;
4858}
4859
4860static struct cftype mem_cgroup_legacy_files[] = {
4861 {
4862 .name = "usage_in_bytes",
4863 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4864 .read_u64 = mem_cgroup_read_u64,
4865 },
4866 {
4867 .name = "max_usage_in_bytes",
4868 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4869 .write = mem_cgroup_reset,
4870 .read_u64 = mem_cgroup_read_u64,
4871 },
4872 {
4873 .name = "limit_in_bytes",
4874 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4875 .write = mem_cgroup_write,
4876 .read_u64 = mem_cgroup_read_u64,
4877 },
4878 {
4879 .name = "soft_limit_in_bytes",
4880 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4881 .write = mem_cgroup_write,
4882 .read_u64 = mem_cgroup_read_u64,
4883 },
4884 {
4885 .name = "failcnt",
4886 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4887 .write = mem_cgroup_reset,
4888 .read_u64 = mem_cgroup_read_u64,
4889 },
4890 {
4891 .name = "stat",
4892 .seq_show = memcg_stat_show,
4893 },
4894 {
4895 .name = "force_empty",
4896 .write = mem_cgroup_force_empty_write,
4897 },
4898 {
4899 .name = "use_hierarchy",
4900 .write_u64 = mem_cgroup_hierarchy_write,
4901 .read_u64 = mem_cgroup_hierarchy_read,
4902 },
4903 {
4904 .name = "cgroup.event_control",
4905 .write = memcg_write_event_control,
4906 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4907 },
4908 {
4909 .name = "swappiness",
4910 .read_u64 = mem_cgroup_swappiness_read,
4911 .write_u64 = mem_cgroup_swappiness_write,
4912 },
4913 {
4914 .name = "move_charge_at_immigrate",
4915 .read_u64 = mem_cgroup_move_charge_read,
4916 .write_u64 = mem_cgroup_move_charge_write,
4917 },
4918 {
4919 .name = "oom_control",
4920 .seq_show = mem_cgroup_oom_control_read,
4921 .write_u64 = mem_cgroup_oom_control_write,
4922 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4923 },
4924 {
4925 .name = "pressure_level",
4926 },
4927#ifdef CONFIG_NUMA
4928 {
4929 .name = "numa_stat",
4930 .seq_show = memcg_numa_stat_show,
4931 },
4932#endif
4933 {
4934 .name = "kmem.limit_in_bytes",
4935 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4936 .write = mem_cgroup_write,
4937 .read_u64 = mem_cgroup_read_u64,
4938 },
4939 {
4940 .name = "kmem.usage_in_bytes",
4941 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4942 .read_u64 = mem_cgroup_read_u64,
4943 },
4944 {
4945 .name = "kmem.failcnt",
4946 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4947 .write = mem_cgroup_reset,
4948 .read_u64 = mem_cgroup_read_u64,
4949 },
4950 {
4951 .name = "kmem.max_usage_in_bytes",
4952 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4953 .write = mem_cgroup_reset,
4954 .read_u64 = mem_cgroup_read_u64,
4955 },
4956#if defined(CONFIG_MEMCG_KMEM) && \
4957 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4958 {
4959 .name = "kmem.slabinfo",
4960 .seq_show = memcg_slab_show,
4961 },
4962#endif
4963 {
4964 .name = "kmem.tcp.limit_in_bytes",
4965 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4966 .write = mem_cgroup_write,
4967 .read_u64 = mem_cgroup_read_u64,
4968 },
4969 {
4970 .name = "kmem.tcp.usage_in_bytes",
4971 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4972 .read_u64 = mem_cgroup_read_u64,
4973 },
4974 {
4975 .name = "kmem.tcp.failcnt",
4976 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4977 .write = mem_cgroup_reset,
4978 .read_u64 = mem_cgroup_read_u64,
4979 },
4980 {
4981 .name = "kmem.tcp.max_usage_in_bytes",
4982 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4983 .write = mem_cgroup_reset,
4984 .read_u64 = mem_cgroup_read_u64,
4985 },
4986 { },
4987};
4988
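/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID.  We want to keep
 * those dead CSS from occupying IDs, or we might quickly exhaust the
 * limited space.
 *
 * Maintain a private 16-bit ID space for memcg, and allow the ID to
 * be freed and recycled when it's no longer needed, which is usually
 * when the CSS is offlined.
 */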
5013static DEFINE_IDR(mem_cgroup_idr);
5014
5015static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5016{
5017 if (memcg->id.id > 0) {
5018 idr_remove(&mem_cgroup_idr, memcg->id.id);
5019 memcg->id.id = 0;
5020 }
5021}
5022
5023static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5024 unsigned int n)
5025{
5026 refcount_add(n, &memcg->id.ref);
5027}
5028
5029static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5030{
5031 if (refcount_sub_and_test(n, &memcg->id.ref)) {
5032 mem_cgroup_id_remove(memcg);
5033
5034
5035 css_put(&memcg->css);
5036 }
5037}
5038
5039static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5040{
5041 mem_cgroup_id_put_many(memcg, 1);
5042}
5043
5044
5045
5046
5047
5048
5049
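/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock().
 */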
5050struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5051{
5052 WARN_ON_ONCE(!rcu_read_lock_held());
5053 return idr_find(&mem_cgroup_idr, id);
5054}
5055
5056static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5057{
5058 struct mem_cgroup_per_node *pn;
5059 int tmp = node;
5060
5061
5062
5063
5064
5065
5066
5067
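	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined.  It's better to use memory hotplug callback
	 *       function.
	 */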
5068 if (!node_state(node, N_NORMAL_MEMORY))
5069 tmp = -1;
5070 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5071 if (!pn)
5072 return 1;
5073
5074 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5075 GFP_KERNEL_ACCOUNT);
5076 if (!pn->lruvec_stats_percpu) {
5077 kfree(pn);
5078 return 1;
5079 }
5080
5081 lruvec_init(&pn->lruvec);
5082 pn->usage_in_excess = 0;
5083 pn->on_tree = false;
5084 pn->memcg = memcg;
5085
5086 memcg->nodeinfo[node] = pn;
5087 return 0;
5088}
5089
5090static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5091{
5092 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5093
5094 if (!pn)
5095 return;
5096
5097 free_percpu(pn->lruvec_stats_percpu);
5098 kfree(pn);
5099}
5100
5101static void __mem_cgroup_free(struct mem_cgroup *memcg)
5102{
5103 int node;
5104
5105 for_each_node(node)
5106 free_mem_cgroup_per_node_info(memcg, node);
5107 free_percpu(memcg->vmstats_percpu);
5108 kfree(memcg);
5109}
5110
5111static void mem_cgroup_free(struct mem_cgroup *memcg)
5112{
5113 memcg_wb_domain_exit(memcg);
5114 __mem_cgroup_free(memcg);
5115}
5116
5117static struct mem_cgroup *mem_cgroup_alloc(void)
5118{
5119 struct mem_cgroup *memcg;
5120 unsigned int size;
5121 int node;
5122 int __maybe_unused i;
5123 long error = -ENOMEM;
5124
5125 size = sizeof(struct mem_cgroup);
5126 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5127
5128 memcg = kzalloc(size, GFP_KERNEL);
5129 if (!memcg)
5130 return ERR_PTR(error);
5131
5132 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5133 1, MEM_CGROUP_ID_MAX,
5134 GFP_KERNEL);
5135 if (memcg->id.id < 0) {
5136 error = memcg->id.id;
5137 goto fail;
5138 }
5139
5140 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5141 GFP_KERNEL_ACCOUNT);
5142 if (!memcg->vmstats_percpu)
5143 goto fail;
5144
5145 for_each_node(node)
5146 if (alloc_mem_cgroup_per_node_info(memcg, node))
5147 goto fail;
5148
5149 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5150 goto fail;
5151
5152 INIT_WORK(&memcg->high_work, high_work_func);
5153 INIT_LIST_HEAD(&memcg->oom_notify);
5154 mutex_init(&memcg->thresholds_lock);
5155 spin_lock_init(&memcg->move_lock);
5156 vmpressure_init(&memcg->vmpressure);
5157 INIT_LIST_HEAD(&memcg->event_list);
5158 spin_lock_init(&memcg->event_list_lock);
5159 memcg->socket_pressure = jiffies;
5160#ifdef CONFIG_MEMCG_KMEM
5161 memcg->kmemcg_id = -1;
5162 INIT_LIST_HEAD(&memcg->objcg_list);
5163#endif
5164#ifdef CONFIG_CGROUP_WRITEBACK
5165 INIT_LIST_HEAD(&memcg->cgwb_list);
5166 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5167 memcg->cgwb_frn[i].done =
5168 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5169#endif
5170#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5171 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5172 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5173 memcg->deferred_split_queue.split_queue_len = 0;
5174#endif
5175 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5176 return memcg;
5177fail:
5178 mem_cgroup_id_remove(memcg);
5179 __mem_cgroup_free(memcg);
5180 return ERR_PTR(error);
5181}
5182
5183static struct cgroup_subsys_state * __ref
5184mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5185{
5186 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5187 struct mem_cgroup *memcg, *old_memcg;
5188 long error = -ENOMEM;
5189
5190 old_memcg = set_active_memcg(parent);
5191 memcg = mem_cgroup_alloc();
5192 set_active_memcg(old_memcg);
5193 if (IS_ERR(memcg))
5194 return ERR_CAST(memcg);
5195
5196 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5197 memcg->soft_limit = PAGE_COUNTER_MAX;
5198 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5199 if (parent) {
5200 memcg->swappiness = mem_cgroup_swappiness(parent);
5201 memcg->oom_kill_disable = parent->oom_kill_disable;
5202
5203 page_counter_init(&memcg->memory, &parent->memory);
5204 page_counter_init(&memcg->swap, &parent->swap);
5205 page_counter_init(&memcg->kmem, &parent->kmem);
5206 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5207 } else {
5208 page_counter_init(&memcg->memory, NULL);
5209 page_counter_init(&memcg->swap, NULL);
5210 page_counter_init(&memcg->kmem, NULL);
5211 page_counter_init(&memcg->tcpmem, NULL);
5212
5213 root_mem_cgroup = memcg;
5214 return &memcg->css;
5215 }
5216
5217
5218 error = memcg_online_kmem(memcg);
5219 if (error)
5220 goto fail;
5221
5222 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5223 static_branch_inc(&memcg_sockets_enabled_key);
5224
5225 return &memcg->css;
5226fail:
5227 mem_cgroup_id_remove(memcg);
5228 mem_cgroup_free(memcg);
5229 return ERR_PTR(error);
5230}
5231
5232static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5233{
5234 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5235
5236
5237
5238
5239
5240
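	/*
	 * A memcg must be visible for expand_shrinker_info()
	 * by the time the maps are allocated.  So, we allocate maps
	 * here, when for_each_mem_cgroup() can't skip it.
	 */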
5241 if (alloc_shrinker_info(memcg)) {
5242 mem_cgroup_id_remove(memcg);
5243 return -ENOMEM;
5244 }
5245
5246
5247 refcount_set(&memcg->id.ref, 1);
5248 css_get(css);
5249
5250 if (unlikely(mem_cgroup_is_root(memcg)))
5251 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
5252 2UL*HZ);
5253 return 0;
5254}
5255
5256static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5257{
5258 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5259 struct mem_cgroup_event *event, *tmp;
5260
5261
5262
5263
5264
5265
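	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */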
5266 spin_lock_irq(&memcg->event_list_lock);
5267 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5268 list_del_init(&event->list);
5269 schedule_work(&event->remove);
5270 }
5271 spin_unlock_irq(&memcg->event_list_lock);
5272
5273 page_counter_set_min(&memcg->memory, 0);
5274 page_counter_set_low(&memcg->memory, 0);
5275
5276 memcg_offline_kmem(memcg);
5277 reparent_shrinker_deferred(memcg);
5278 wb_memcg_offline(memcg);
5279
5280 drain_all_stock(memcg);
5281
5282 mem_cgroup_id_put(memcg);
5283}
5284
5285static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5286{
5287 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5288
5289 invalidate_reclaim_iterators(memcg);
5290}
5291
5292static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5293{
5294 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5295 int __maybe_unused i;
5296
5297#ifdef CONFIG_CGROUP_WRITEBACK
5298 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5299 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5300#endif
5301 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5302 static_branch_dec(&memcg_sockets_enabled_key);
5303
5304 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5305 static_branch_dec(&memcg_sockets_enabled_key);
5306
5307 vmpressure_cleanup(&memcg->vmpressure);
5308 cancel_work_sync(&memcg->high_work);
5309 mem_cgroup_remove_from_trees(memcg);
5310 free_shrinker_info(memcg);
5311 memcg_free_kmem(memcg);
5312 mem_cgroup_free(memcg);
5313}
5314
5328static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5329{
5330 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5331
5332 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5333 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5334 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5335 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5336 page_counter_set_min(&memcg->memory, 0);
5337 page_counter_set_low(&memcg->memory, 0);
5338 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5339 memcg->soft_limit = PAGE_COUNTER_MAX;
5340 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5341 memcg_wb_domain_size_changed(memcg);
5342}
5343
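/*
 * Flush the accumulated per-CPU memcg statistics into the rstat tree.
 * The trylock means concurrent callers simply skip the flush; whoever
 * holds the lock does the work for everybody.
 */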
5344void mem_cgroup_flush_stats(void)
5345{
5346 if (!spin_trylock(&stats_flush_lock))
5347 return;
5348
5349 cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
5350 spin_unlock(&stats_flush_lock);
5351}
5352
5353static void flush_memcg_stats_dwork(struct work_struct *w)
5354{
5355 mem_cgroup_flush_stats();
5356 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
5357}
5358
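/*
 * Aggregate this CPU's deltas for state counters, events and per-node
 * lruvec stats into the memcg's totals, and queue them as pending
 * deltas for the parent so the whole hierarchy converges.
 */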
5359static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
5360{
5361 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5362 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5363 struct memcg_vmstats_percpu *statc;
5364 long delta, v;
5365 int i, nid;
5366
5367 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
5368
5369 for (i = 0; i < MEMCG_NR_STAT; i++) {
/*
 * Collect the aggregated propagation counts of groups
 * below us. We're in a per-cpu loop here and this is
 * a global counter, so the first cycle will get them.
 */
5375 delta = memcg->vmstats.state_pending[i];
5376 if (delta)
5377 memcg->vmstats.state_pending[i] = 0;
5378
/* Add CPU changes on this level since the last flush */
5380 v = READ_ONCE(statc->state[i]);
5381 if (v != statc->state_prev[i]) {
5382 delta += v - statc->state_prev[i];
5383 statc->state_prev[i] = v;
5384 }
5385
5386 if (!delta)
5387 continue;
5388
/* Aggregate counts on this level and propagate upwards */
5390 memcg->vmstats.state[i] += delta;
5391 if (parent)
5392 parent->vmstats.state_pending[i] += delta;
5393 }
5394
5395 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
5396 delta = memcg->vmstats.events_pending[i];
5397 if (delta)
5398 memcg->vmstats.events_pending[i] = 0;
5399
5400 v = READ_ONCE(statc->events[i]);
5401 if (v != statc->events_prev[i]) {
5402 delta += v - statc->events_prev[i];
5403 statc->events_prev[i] = v;
5404 }
5405
5406 if (!delta)
5407 continue;
5408
5409 memcg->vmstats.events[i] += delta;
5410 if (parent)
5411 parent->vmstats.events_pending[i] += delta;
5412 }
5413
5414 for_each_node_state(nid, N_MEMORY) {
5415 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
5416 struct mem_cgroup_per_node *ppn = NULL;
5417 struct lruvec_stats_percpu *lstatc;
5418
5419 if (parent)
5420 ppn = parent->nodeinfo[nid];
5421
5422 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
5423
5424 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
5425 delta = pn->lruvec_stats.state_pending[i];
5426 if (delta)
5427 pn->lruvec_stats.state_pending[i] = 0;
5428
5429 v = READ_ONCE(lstatc->state[i]);
5430 if (v != lstatc->state_prev[i]) {
5431 delta += v - lstatc->state_prev[i];
5432 lstatc->state_prev[i] = v;
5433 }
5434
5435 if (!delta)
5436 continue;
5437
5438 pn->lruvec_stats.state[i] += delta;
5439 if (ppn)
5440 ppn->lruvec_stats.state_pending[i] += delta;
5441 }
5442 }
5443}
5444
5445#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
5447static int mem_cgroup_do_precharge(unsigned long count)
5448{
5449 int ret;
5450
/* Try a single bulk charge without reclaim first, kswapd may wake */
5452 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5453 if (!ret) {
5454 mc.precharge += count;
5455 return ret;
5456 }
5457
/* Try charges one by one with reclaim, but do not retry */
5459 while (count--) {
5460 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5461 if (ret)
5462 return ret;
5463 mc.precharge++;
5464 cond_resched();
5465 }
5466 return 0;
5467}
5468
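/* Target of a single move-charge operation: a page or a swap entry. */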
5469union mc_target {
5470 struct page *page;
5471 swp_entry_t ent;
5472};
5473
5474enum mc_target_type {
5475 MC_TARGET_NONE = 0,
5476 MC_TARGET_PAGE,
5477 MC_TARGET_SWAP,
5478 MC_TARGET_DEVICE,
5479};
5480
5481static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5482 unsigned long addr, pte_t ptent)
5483{
5484 struct page *page = vm_normal_page(vma, addr, ptent);
5485
5486 if (!page || !page_mapped(page))
5487 return NULL;
5488 if (PageAnon(page)) {
5489 if (!(mc.flags & MOVE_ANON))
5490 return NULL;
5491 } else {
5492 if (!(mc.flags & MOVE_FILE))
5493 return NULL;
5494 }
5495 if (!get_page_unless_zero(page))
5496 return NULL;
5497
5498 return page;
5499}
5500
5501#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5502static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5503 pte_t ptent, swp_entry_t *entry)
5504{
5505 struct page *page = NULL;
5506 swp_entry_t ent = pte_to_swp_entry(ptent);
5507
5508 if (!(mc.flags & MOVE_ANON))
5509 return NULL;
5510
/*
 * Handle device private pages that are not accessible by the CPU, but
 * stored as special swap entries in the page table.
 */
5516 if (is_device_private_entry(ent)) {
5517 page = pfn_swap_entry_to_page(ent);
/*
 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has a
 * refcount of 1 when free (unlike a normal page).
 */
5522 if (!page_ref_add_unless(page, 1, 1))
5523 return NULL;
5524 return page;
5525 }
5526
5527 if (non_swap_entry(ent))
5528 return NULL;
5529
/*
 * Because lookup_swap_cache() updates some statistics counter,
 * we call find_get_page() with swapper_space directly.
 */
5534 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5535 entry->val = ent.val;
5536
5537 return page;
5538}
5539#else
5540static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5541 pte_t ptent, swp_entry_t *entry)
5542{
5543 return NULL;
5544}
5545#endif
5546
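/*
 * A non-present, non-swap pte in a file-backed VMA may still have a
 * charged page in the page cache (or swap, for shmem); look it up by
 * the linear index so that it can be moved as well.
 */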
5547static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5548 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5549{
5550 if (!vma->vm_file)
5551 return NULL;
5552 if (!(mc.flags & MOVE_FILE))
5553 return NULL;
5554
/* page is moved even if it's not RSS of this task (page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
5557 return find_get_incore_page(vma->vm_file->f_mapping,
5558 linear_page_index(vma, addr));
5559}
5560
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
 *
 * This function doesn't do "charge" to the new cgroup and doesn't modify
 * the page's memcg binding on failure.
 *
 * Returns 0 on success, -EBUSY or -EINVAL on failure.
 */
5573static int mem_cgroup_move_account(struct page *page,
5574 bool compound,
5575 struct mem_cgroup *from,
5576 struct mem_cgroup *to)
5577{
5578 struct lruvec *from_vec, *to_vec;
5579 struct pglist_data *pgdat;
5580 unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
5581 int ret;
5582
5583 VM_BUG_ON(from == to);
5584 VM_BUG_ON_PAGE(PageLRU(page), page);
5585 VM_BUG_ON(compound && !PageTransHuge(page));
5586
/*
 * Prevent mem_cgroup_migrate() from looking at
 * page's memory cgroup of its source page while we change it.
 */
5591 ret = -EBUSY;
5592 if (!trylock_page(page))
5593 goto out;
5594
5595 ret = -EINVAL;
5596 if (page_memcg(page) != from)
5597 goto out_unlock;
5598
5599 pgdat = page_pgdat(page);
5600 from_vec = mem_cgroup_lruvec(from, pgdat);
5601 to_vec = mem_cgroup_lruvec(to, pgdat);
5602
5603 lock_page_memcg(page);
5604
5605 if (PageAnon(page)) {
5606 if (page_mapped(page)) {
5607 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5608 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5609 if (PageTransHuge(page)) {
5610 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5611 -nr_pages);
5612 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5613 nr_pages);
5614 }
5615 }
5616 } else {
5617 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5618 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5619
5620 if (PageSwapBacked(page)) {
5621 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5622 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5623 }
5624
5625 if (page_mapped(page)) {
5626 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5627 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5628 }
5629
5630 if (PageDirty(page)) {
5631 struct address_space *mapping = page_mapping(page);
5632
5633 if (mapping_can_writeback(mapping)) {
5634 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5635 -nr_pages);
5636 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5637 nr_pages);
5638 }
5639 }
5640 }
5641
5642 if (PageWriteback(page)) {
5643 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5644 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5645 }
5646
/*
 * All state has been migrated, let's switch to the new memcg.
 *
 * It is safe to change page's memcg here because the page
 * is referenced, charged, isolated, and locked: we can't race
 * with (un)charging, migration, LRU putback, or anything else
 * that would rely on a stable page's memory cgroup.
 *
 * Note that lock_page_memcg is a memcg lock, not a page lock,
 * to save space. As soon as we switch page's memory cgroup to a
 * new memcg that isn't locked, the above state can change
 * concurrently again. Make sure we're truly done with it.
 */
5660 smp_mb();
5661
5662 css_get(&to->css);
5663 css_put(&from->css);
5664
5665 page->memcg_data = (unsigned long)to;
5666
5667 __unlock_page_memcg(from);
5668
5669 ret = 0;
5670
5671 local_irq_disable();
5672 mem_cgroup_charge_statistics(to, page, nr_pages);
5673 memcg_check_events(to, page);
5674 mem_cgroup_charge_statistics(from, page, -nr_pages);
5675 memcg_check_events(from, page);
5676 local_irq_enable();
5677out_unlock:
5678 unlock_page(page);
5679out:
5680 return ret;
5681}
5682
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap entry will be stored in (can be NULL)
 *
 * Returns
 *   MC_TARGET_NONE: the pte is not a target for move charge.
 *   MC_TARGET_PAGE: the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers must handle it).
 *   MC_TARGET_SWAP: the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *   MC_TARGET_DEVICE: like MC_TARGET_PAGE but for a MEMORY_DEVICE_PRIVATE
 *     page (a ZONE_DEVICE page, thus not on the LRU).
 *
 * Called with pte lock held.
 */
5709static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5710 unsigned long addr, pte_t ptent, union mc_target *target)
5711{
5712 struct page *page = NULL;
5713 enum mc_target_type ret = MC_TARGET_NONE;
5714 swp_entry_t ent = { .val = 0 };
5715
5716 if (pte_present(ptent))
5717 page = mc_handle_present_pte(vma, addr, ptent);
5718 else if (is_swap_pte(ptent))
5719 page = mc_handle_swap_pte(vma, ptent, &ent);
5720 else if (pte_none(ptent))
5721 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5722
5723 if (!page && !ent.val)
5724 return ret;
5725 if (page) {
/*
 * Do only a loose check w/o serialization.
 * mem_cgroup_move_account() checks whether the page is valid
 * under LRU exclusion.
 */
5731 if (page_memcg(page) == mc.from) {
5732 ret = MC_TARGET_PAGE;
5733 if (is_device_private_page(page))
5734 ret = MC_TARGET_DEVICE;
5735 if (target)
5736 target->page = page;
5737 }
5738 if (!ret || !target)
5739 put_page(page);
5740 }
5741
/*
 * There is a swap entry and a page doesn't exist or isn't charged.
 * But we cannot move a tail-page in a THP.
 */
5745 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5746 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5747 ret = MC_TARGET_SWAP;
5748 if (target)
5749 target->ent = ent;
5750 }
5751 return ret;
5752}
5753
5754#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
5760static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5761 unsigned long addr, pmd_t pmd, union mc_target *target)
5762{
5763 struct page *page = NULL;
5764 enum mc_target_type ret = MC_TARGET_NONE;
5765
5766 if (unlikely(is_swap_pmd(pmd))) {
5767 VM_BUG_ON(thp_migration_supported() &&
5768 !is_pmd_migration_entry(pmd));
5769 return ret;
5770 }
5771 page = pmd_page(pmd);
5772 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5773 if (!(mc.flags & MOVE_ANON))
5774 return ret;
5775 if (page_memcg(page) == mc.from) {
5776 ret = MC_TARGET_PAGE;
5777 if (target) {
5778 get_page(page);
5779 target->page = page;
5780 }
5781 }
5782 return ret;
5783}
5784#else
5785static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5786 unsigned long addr, pmd_t pmd, union mc_target *target)
5787{
5788 return MC_TARGET_NONE;
5789}
5790#endif
5791
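/*
 * First pass of the move-charge walk: count how many pages and swap
 * entries are candidates for moving so they can be precharged to mc.to.
 */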
5792static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5793 unsigned long addr, unsigned long end,
5794 struct mm_walk *walk)
5795{
5796 struct vm_area_struct *vma = walk->vma;
5797 pte_t *pte;
5798 spinlock_t *ptl;
5799
5800 ptl = pmd_trans_huge_lock(pmd, vma);
5801 if (ptl) {
/*
 * Note there can not be MC_TARGET_DEVICE for now as we do not
 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
 * this might change.
 */
5807 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5808 mc.precharge += HPAGE_PMD_NR;
5809 spin_unlock(ptl);
5810 return 0;
5811 }
5812
5813 if (pmd_trans_unstable(pmd))
5814 return 0;
5815 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5816 for (; addr != end; pte++, addr += PAGE_SIZE)
5817 if (get_mctgt_type(vma, addr, *pte, NULL))
5818 mc.precharge++;
5819 pte_unmap_unlock(pte - 1, ptl);
5820 cond_resched();
5821
5822 return 0;
5823}
5824
5825static const struct mm_walk_ops precharge_walk_ops = {
5826 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5827};
5828
5829static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5830{
5831 unsigned long precharge;
5832
5833 mmap_read_lock(mm);
5834 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5835 mmap_read_unlock(mm);
5836
5837 precharge = mc.precharge;
5838 mc.precharge = 0;
5839
5840 return precharge;
5841}
5842
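/* Count the movable charges in @mm and precharge them to mc.to. */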
5843static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5844{
5845 unsigned long precharge = mem_cgroup_count_precharge(mm);
5846
5847 VM_BUG_ON(mc.moving_task);
5848 mc.moving_task = current;
5849 return mem_cgroup_do_precharge(precharge);
5850}
5851
/* cancels all extra charges on mc.from and mc.to, and wakes all waiters */
5853static void __mem_cgroup_clear_mc(void)
5854{
5855 struct mem_cgroup *from = mc.from;
5856 struct mem_cgroup *to = mc.to;
5857
/* we must uncharge all the leftover precharges from mc.to */
5859 if (mc.precharge) {
5860 cancel_charge(mc.to, mc.precharge);
5861 mc.precharge = 0;
5862 }
5863
/*
 * we didn't uncharge from mc.to at mem_cgroup_move_account(), so
 * we must uncharge here.
 */
5867 if (mc.moved_charge) {
5868 cancel_charge(mc.from, mc.moved_charge);
5869 mc.moved_charge = 0;
5870 }
5871
5872 if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
5874 if (!mem_cgroup_is_root(mc.from))
5875 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5876
5877 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5878
/*
 * we charged both to->memory and to->memsw, so we
 * should uncharge to->memory.
 */
5883 if (!mem_cgroup_is_root(mc.to))
5884 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5885
5886 mc.moved_swap = 0;
5887 }
5888 memcg_oom_recover(from);
5889 memcg_oom_recover(to);
5890 wake_up_all(&mc.waitq);
5891}
5892
5893static void mem_cgroup_clear_mc(void)
5894{
5895 struct mm_struct *mm = mc.mm;
5896
/* we must clear moving_task before waking up waiters at the end */
5901 mc.moving_task = NULL;
5902 __mem_cgroup_clear_mc();
5903 spin_lock(&mc.lock);
5904 mc.from = NULL;
5905 mc.to = NULL;
5906 mc.mm = NULL;
5907 spin_unlock(&mc.lock);
5908
5909 mmput(mm);
5910}
5911
5912static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5913{
5914 struct cgroup_subsys_state *css;
5915 struct mem_cgroup *memcg = NULL;
5916 struct mem_cgroup *from;
5917 struct task_struct *leader, *p;
5918 struct mm_struct *mm;
5919 unsigned long move_flags;
5920 int ret = 0;
5921
/* charge immigration isn't supported on the default hierarchy */
5923 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5924 return 0;
5925
/*
 * Multi-process migrations only happen on the default hierarchy
 * where charge immigration is not used.  Perform charge
 * immigration if @tset contains a leader and whine if there are
 * multiple.
 */
5932 p = NULL;
5933 cgroup_taskset_for_each_leader(leader, css, tset) {
5934 WARN_ON_ONCE(p);
5935 p = leader;
5936 memcg = mem_cgroup_from_css(css);
5937 }
5938 if (!p)
5939 return 0;
5940
/*
 * We are now committed to this value whatever it is. Changes in this
 * tunable will only affect upcoming migrations, not the current one.
 * So we need to save it, and keep it going.
 */
5946 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5947 if (!move_flags)
5948 return 0;
5949
5950 from = mem_cgroup_from_task(p);
5951
5952 VM_BUG_ON(from == memcg);
5953
5954 mm = get_task_mm(p);
5955 if (!mm)
5956 return 0;
5957
5958 if (mm->owner == p) {
5959 VM_BUG_ON(mc.from);
5960 VM_BUG_ON(mc.to);
5961 VM_BUG_ON(mc.precharge);
5962 VM_BUG_ON(mc.moved_charge);
5963 VM_BUG_ON(mc.moved_swap);
5964
5965 spin_lock(&mc.lock);
5966 mc.mm = mm;
5967 mc.from = from;
5968 mc.to = memcg;
5969 mc.flags = move_flags;
5970 spin_unlock(&mc.lock);
/* We set mc.moving_task later */
5972
5973 ret = mem_cgroup_precharge_mc(mm);
5974 if (ret)
5975 mem_cgroup_clear_mc();
5976 } else {
5977 mmput(mm);
5978 }
5979 return ret;
5980}
5981
5982static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5983{
5984 if (mc.to)
5985 mem_cgroup_clear_mc();
5986}
5987
5988static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5989 unsigned long addr, unsigned long end,
5990 struct mm_walk *walk)
5991{
5992 int ret = 0;
5993 struct vm_area_struct *vma = walk->vma;
5994 pte_t *pte;
5995 spinlock_t *ptl;
5996 enum mc_target_type target_type;
5997 union mc_target target;
5998 struct page *page;
5999
6000 ptl = pmd_trans_huge_lock(pmd, vma);
6001 if (ptl) {
6002 if (mc.precharge < HPAGE_PMD_NR) {
6003 spin_unlock(ptl);
6004 return 0;
6005 }
6006 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6007 if (target_type == MC_TARGET_PAGE) {
6008 page = target.page;
6009 if (!isolate_lru_page(page)) {
6010 if (!mem_cgroup_move_account(page, true,
6011 mc.from, mc.to)) {
6012 mc.precharge -= HPAGE_PMD_NR;
6013 mc.moved_charge += HPAGE_PMD_NR;
6014 }
6015 putback_lru_page(page);
6016 }
6017 put_page(page);
6018 } else if (target_type == MC_TARGET_DEVICE) {
6019 page = target.page;
6020 if (!mem_cgroup_move_account(page, true,
6021 mc.from, mc.to)) {
6022 mc.precharge -= HPAGE_PMD_NR;
6023 mc.moved_charge += HPAGE_PMD_NR;
6024 }
6025 put_page(page);
6026 }
6027 spin_unlock(ptl);
6028 return 0;
6029 }
6030
6031 if (pmd_trans_unstable(pmd))
6032 return 0;
6033retry:
6034 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6035 for (; addr != end; addr += PAGE_SIZE) {
6036 pte_t ptent = *(pte++);
6037 bool device = false;
6038 swp_entry_t ent;
6039
6040 if (!mc.precharge)
6041 break;
6042
6043 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6044 case MC_TARGET_DEVICE:
6045 device = true;
6046 fallthrough;
6047 case MC_TARGET_PAGE:
6048 page = target.page;
/*
 * We can have a part of the split pmd here. Moving it
 * can be done but it would be too convoluted so simply
 * ignore such a partial THP and keep it in the original
 * memcg. There should be somebody mapping the head.
 */
6055 if (PageTransCompound(page))
6056 goto put;
6057 if (!device && isolate_lru_page(page))
6058 goto put;
6059 if (!mem_cgroup_move_account(page, false,
6060 mc.from, mc.to)) {
6061 mc.precharge--;
/* we uncharge from mc.from later. */
6063 mc.moved_charge++;
6064 }
6065 if (!device)
6066 putback_lru_page(page);
6067put:
6068 put_page(page);
6069 break;
6070 case MC_TARGET_SWAP:
6071 ent = target.ent;
6072 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6073 mc.precharge--;
6074 mem_cgroup_id_get_many(mc.to, 1);
/* we fixup other refcnts and charges later. */
6076 mc.moved_swap++;
6077 }
6078 break;
6079 default:
6080 break;
6081 }
6082 }
6083 pte_unmap_unlock(pte - 1, ptl);
6084 cond_resched();
6085
6086 if (addr != end) {
/*
 * We have consumed all precharges we got in can_attach().
 * We try charge one by one, but don't do any additional
 * charges to mc.to if we have failed in charge once in attach()
 * phase.
 */
6093 ret = mem_cgroup_do_precharge(1);
6094 if (!ret)
6095 goto retry;
6096 }
6097
6098 return ret;
6099}
6100
6101static const struct mm_walk_ops charge_walk_ops = {
6102 .pmd_entry = mem_cgroup_move_charge_pte_range,
6103};
6104
6105static void mem_cgroup_move_charge(void)
6106{
6107 lru_add_drain_all();
6108
/*
 * Signal lock_page_memcg() to take the memcg's move_lock
 * while we're moving its pages to another memcg. Then wait
 * for already started RCU-only updates to finish.
 */
6113 atomic_inc(&mc.from->moving_account);
6114 synchronize_rcu();
6115retry:
6116 if (unlikely(!mmap_read_trylock(mc.mm))) {
/*
 * Someone who is holding the mmap_lock might be waiting in
 * waitq. So we cancel all extra charges, wake up all waiters,
 * and retry. Because we cancel precharges, we might not be able
 * to move enough charges, but moving charge is a best-effort
 * feature anyway, so it wouldn't be a big problem.
 */
6124 __mem_cgroup_clear_mc();
6125 cond_resched();
6126 goto retry;
6127 }
6128
/*
 * When we have consumed all precharges and failed in doing
 * additional charge, the page walk just aborts.
 */
6132 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6133 NULL);
6134
6135 mmap_read_unlock(mc.mm);
6136 atomic_dec(&mc.from->moving_account);
6137}
6138
6139static void mem_cgroup_move_task(void)
6140{
6141 if (mc.to) {
6142 mem_cgroup_move_charge();
6143 mem_cgroup_clear_mc();
6144 }
6145}
6146#else
6147static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6148{
6149 return 0;
6150}
6151static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6152{
6153}
6154static void mem_cgroup_move_task(void)
6155{
6156}
6157#endif
6158
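/* Print a page_counter tunable in bytes, or "max" for PAGE_COUNTER_MAX. */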
6159static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6160{
6161 if (value == PAGE_COUNTER_MAX)
6162 seq_puts(m, "max\n");
6163 else
6164 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6165
6166 return 0;
6167}
6168
6169static u64 memory_current_read(struct cgroup_subsys_state *css,
6170 struct cftype *cft)
6171{
6172 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6173
6174 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6175}
6176
6177static int memory_min_show(struct seq_file *m, void *v)
6178{
6179 return seq_puts_memcg_tunable(m,
6180 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6181}
6182
6183static ssize_t memory_min_write(struct kernfs_open_file *of,
6184 char *buf, size_t nbytes, loff_t off)
6185{
6186 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6187 unsigned long min;
6188 int err;
6189
6190 buf = strstrip(buf);
6191 err = page_counter_memparse(buf, "max", &min);
6192 if (err)
6193 return err;
6194
6195 page_counter_set_min(&memcg->memory, min);
6196
6197 return nbytes;
6198}
6199
6200static int memory_low_show(struct seq_file *m, void *v)
6201{
6202 return seq_puts_memcg_tunable(m,
6203 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6204}
6205
6206static ssize_t memory_low_write(struct kernfs_open_file *of,
6207 char *buf, size_t nbytes, loff_t off)
6208{
6209 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6210 unsigned long low;
6211 int err;
6212
6213 buf = strstrip(buf);
6214 err = page_counter_memparse(buf, "max", &low);
6215 if (err)
6216 return err;
6217
6218 page_counter_set_low(&memcg->memory, low);
6219
6220 return nbytes;
6221}
6222
6223static int memory_high_show(struct seq_file *m, void *v)
6224{
6225 return seq_puts_memcg_tunable(m,
6226 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6227}
6228
6229static ssize_t memory_high_write(struct kernfs_open_file *of,
6230 char *buf, size_t nbytes, loff_t off)
6231{
6232 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6233 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6234 bool drained = false;
6235 unsigned long high;
6236 int err;
6237
6238 buf = strstrip(buf);
6239 err = page_counter_memparse(buf, "max", &high);
6240 if (err)
6241 return err;
6242
6243 page_counter_set_high(&memcg->memory, high);
6244
6245 for (;;) {
6246 unsigned long nr_pages = page_counter_read(&memcg->memory);
6247 unsigned long reclaimed;
6248
6249 if (nr_pages <= high)
6250 break;
6251
6252 if (signal_pending(current))
6253 break;
6254
6255 if (!drained) {
6256 drain_all_stock(memcg);
6257 drained = true;
6258 continue;
6259 }
6260
6261 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6262 GFP_KERNEL, true);
6263
6264 if (!reclaimed && !nr_retries--)
6265 break;
6266 }
6267
6268 memcg_wb_domain_size_changed(memcg);
6269 return nbytes;
6270}
6271
6272static int memory_max_show(struct seq_file *m, void *v)
6273{
6274 return seq_puts_memcg_tunable(m,
6275 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6276}
6277
6278static ssize_t memory_max_write(struct kernfs_open_file *of,
6279 char *buf, size_t nbytes, loff_t off)
6280{
6281 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6282 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6283 bool drained = false;
6284 unsigned long max;
6285 int err;
6286
6287 buf = strstrip(buf);
6288 err = page_counter_memparse(buf, "max", &max);
6289 if (err)
6290 return err;
6291
6292 xchg(&memcg->memory.max, max);
6293
6294 for (;;) {
6295 unsigned long nr_pages = page_counter_read(&memcg->memory);
6296
6297 if (nr_pages <= max)
6298 break;
6299
6300 if (signal_pending(current))
6301 break;
6302
6303 if (!drained) {
6304 drain_all_stock(memcg);
6305 drained = true;
6306 continue;
6307 }
6308
6309 if (nr_reclaims) {
6310 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6311 GFP_KERNEL, true))
6312 nr_reclaims--;
6313 continue;
6314 }
6315
6316 memcg_memory_event(memcg, MEMCG_OOM);
6317 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6318 break;
6319 }
6320
6321 memcg_wb_domain_size_changed(memcg);
6322 return nbytes;
6323}
6324
6325static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6326{
6327 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6328 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6329 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6330 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6331 seq_printf(m, "oom_kill %lu\n",
6332 atomic_long_read(&events[MEMCG_OOM_KILL]));
6333}
6334
6335static int memory_events_show(struct seq_file *m, void *v)
6336{
6337 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6338
6339 __memory_events_show(m, memcg->memory_events);
6340 return 0;
6341}
6342
6343static int memory_events_local_show(struct seq_file *m, void *v)
6344{
6345 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6346
6347 __memory_events_show(m, memcg->memory_events_local);
6348 return 0;
6349}
6350
6351static int memory_stat_show(struct seq_file *m, void *v)
6352{
6353 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6354 char *buf;
6355
6356 buf = memory_stat_format(memcg);
6357 if (!buf)
6358 return -ENOMEM;
6359 seq_puts(m, buf);
6360 kfree(buf);
6361 return 0;
6362}
6363
6364#ifdef CONFIG_NUMA
6365static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
6366 int item)
6367{
6368 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
6369}
6370
6371static int memory_numa_stat_show(struct seq_file *m, void *v)
6372{
6373 int i;
6374 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6375
6376 cgroup_rstat_flush(memcg->css.cgroup);
6377
6378 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6379 int nid;
6380
6381 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6382 continue;
6383
6384 seq_printf(m, "%s", memory_stats[i].name);
6385 for_each_node_state(nid, N_MEMORY) {
6386 u64 size;
6387 struct lruvec *lruvec;
6388
6389 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6390 size = lruvec_page_state_output(lruvec,
6391 memory_stats[i].idx);
6392 seq_printf(m, " N%d=%llu", nid, size);
6393 }
6394 seq_putc(m, '\n');
6395 }
6396
6397 return 0;
6398}
6399#endif
6400
6401static int memory_oom_group_show(struct seq_file *m, void *v)
6402{
6403 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6404
6405 seq_printf(m, "%d\n", memcg->oom_group);
6406
6407 return 0;
6408}
6409
6410static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6411 char *buf, size_t nbytes, loff_t off)
6412{
6413 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6414 int ret, oom_group;
6415
6416 buf = strstrip(buf);
6417 if (!buf)
6418 return -EINVAL;
6419
6420 ret = kstrtoint(buf, 0, &oom_group);
6421 if (ret)
6422 return ret;
6423
6424 if (oom_group != 0 && oom_group != 1)
6425 return -EINVAL;
6426
6427 memcg->oom_group = oom_group;
6428
6429 return nbytes;
6430}
6431
6432static struct cftype memory_files[] = {
6433 {
6434 .name = "current",
6435 .flags = CFTYPE_NOT_ON_ROOT,
6436 .read_u64 = memory_current_read,
6437 },
6438 {
6439 .name = "min",
6440 .flags = CFTYPE_NOT_ON_ROOT,
6441 .seq_show = memory_min_show,
6442 .write = memory_min_write,
6443 },
6444 {
6445 .name = "low",
6446 .flags = CFTYPE_NOT_ON_ROOT,
6447 .seq_show = memory_low_show,
6448 .write = memory_low_write,
6449 },
6450 {
6451 .name = "high",
6452 .flags = CFTYPE_NOT_ON_ROOT,
6453 .seq_show = memory_high_show,
6454 .write = memory_high_write,
6455 },
6456 {
6457 .name = "max",
6458 .flags = CFTYPE_NOT_ON_ROOT,
6459 .seq_show = memory_max_show,
6460 .write = memory_max_write,
6461 },
6462 {
6463 .name = "events",
6464 .flags = CFTYPE_NOT_ON_ROOT,
6465 .file_offset = offsetof(struct mem_cgroup, events_file),
6466 .seq_show = memory_events_show,
6467 },
6468 {
6469 .name = "events.local",
6470 .flags = CFTYPE_NOT_ON_ROOT,
6471 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6472 .seq_show = memory_events_local_show,
6473 },
6474 {
6475 .name = "stat",
6476 .seq_show = memory_stat_show,
6477 },
6478#ifdef CONFIG_NUMA
6479 {
6480 .name = "numa_stat",
6481 .seq_show = memory_numa_stat_show,
6482 },
6483#endif
6484 {
6485 .name = "oom.group",
6486 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6487 .seq_show = memory_oom_group_show,
6488 .write = memory_oom_group_write,
6489 },
6490 { }
6491};
6492
6493struct cgroup_subsys memory_cgrp_subsys = {
6494 .css_alloc = mem_cgroup_css_alloc,
6495 .css_online = mem_cgroup_css_online,
6496 .css_offline = mem_cgroup_css_offline,
6497 .css_released = mem_cgroup_css_released,
6498 .css_free = mem_cgroup_css_free,
6499 .css_reset = mem_cgroup_css_reset,
6500 .css_rstat_flush = mem_cgroup_css_rstat_flush,
6501 .can_attach = mem_cgroup_can_attach,
6502 .cancel_attach = mem_cgroup_cancel_attach,
6503 .post_attach = mem_cgroup_move_task,
6504 .dfl_cftypes = memory_files,
6505 .legacy_cftypes = mem_cgroup_legacy_files,
6506 .early_init = 0,
6507};
6508
/*
 * This function calculates an individual cgroup's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing. This
 *    makes the distribution proportional, but also work-conserving:
 *    protection that is claimed but unused is available to siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A cgroup's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, any unutilized "floating" protection from up the tree is
 *    distributed in proportion to each cgroup's *usage*. This keeps
 *    the protection neutral wrt siblings, while still protecting the
 *    subtree as a whole from neighboring subtrees.
 */
6552static unsigned long effective_protection(unsigned long usage,
6553 unsigned long parent_usage,
6554 unsigned long setting,
6555 unsigned long parent_effective,
6556 unsigned long siblings_protected)
6557{
6558 unsigned long protected;
6559 unsigned long ep;
6560
6561 protected = min(usage, setting);
6562
/*
 * If all cgroups at this level combined claim and use more
 * protection than what the parent affords them, distribute
 * shares in proportion to utilization.
 *
 * We are using actual utilization rather than the statically
 * claimed protection in order to be work-conserving: claimed
 * but unused protection is available to siblings that would
 * otherwise get a smaller chunk than what they claimed.
 */
6572 if (siblings_protected > parent_effective)
6573 return protected * parent_effective / siblings_protected;
6574
/*
 * Ok, utilized protection of all children is within what the
 * parent affords them, so we know whatever this child claims
 * and utilizes is effectively protected.
 *
 * If there is unprotected usage beyond this value, reclaim
 * will apply pressure in proportion to that amount.
 *
 * If there is unutilized protection, the cgroup will be fully
 * shielded from reclaim, but we do return a smaller value for
 * protection than what the group could enjoy in theory. This
 * is okay: with the overcommit distribution above, effective
 * protection always depends on how memory is actually consumed
 * among the siblings anyway.
 */
6590 ep = protected;
6591
/*
 * If the children aren't claiming (all of) the protection
 * afforded to them by the parent, distribute the remainder in
 * proportion to the (unprotected) memory of each cgroup. That
 * way, cgroups that aren't explicitly prioritized wrt each
 * other compete freely over the allowance, but they are
 * collectively protected from neighboring trees.
 *
 * We're using unprotected memory for the weight so that if
 * some cgroups DO claim explicit protection, we don't protect
 * the same bytes twice.
 *
 * Check both usage and parent_usage against the respective
 * protected values. One should imply the other, but they
 * aren't read atomically - make sure the division is sane.
 */
6608 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6609 return ep;
6610 if (parent_effective > siblings_protected &&
6611 parent_usage > siblings_protected &&
6612 usage > protected) {
6613 unsigned long unclaimed;
6614
6615 unclaimed = parent_effective - siblings_protected;
6616 unclaimed *= usage - protected;
6617 unclaimed /= parent_usage - siblings_protected;
6618
6619 ep += unclaimed;
6620 }
6621
6622 return ep;
6623}
6624
/**
 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
6633void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6634 struct mem_cgroup *memcg)
6635{
6636 unsigned long usage, parent_usage;
6637 struct mem_cgroup *parent;
6638
6639 if (mem_cgroup_disabled())
6640 return;
6641
6642 if (!root)
6643 root = root_mem_cgroup;
6644
/*
 * Effective values of the reclaim targets are ignored so they
 * can be stale. Have a look at mem_cgroup_protection for more
 * details.
 * TODO: calculation should be more robust so that we do not need
 * to do proactive clearing of effective values in case of delegation.
 */
6652 if (memcg == root)
6653 return;
6654
6655 usage = page_counter_read(&memcg->memory);
6656 if (!usage)
6657 return;
6658
6659 parent = parent_mem_cgroup(memcg);
6660
6661 if (!parent)
6662 return;
6663
6664 if (parent == root) {
6665 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6666 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6667 return;
6668 }
6669
6670 parent_usage = page_counter_read(&parent->memory);
6671
6672 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6673 READ_ONCE(memcg->memory.min),
6674 READ_ONCE(parent->memory.emin),
6675 atomic_long_read(&parent->memory.children_min_usage)));
6676
6677 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6678 READ_ONCE(memcg->memory.low),
6679 READ_ONCE(parent->memory.elow),
6680 atomic_long_read(&parent->memory.children_low_usage)));
6681}
6682
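/*
 * Charge @page to @memcg and commit the binding, updating the memcg
 * statistics and event counters on success.
 */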
6683static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
6684{
6685 unsigned int nr_pages = thp_nr_pages(page);
6686 int ret;
6687
6688 ret = try_charge(memcg, gfp, nr_pages);
6689 if (ret)
6690 goto out;
6691
6692 css_get(&memcg->css);
6693 commit_charge(page, memcg);
6694
6695 local_irq_disable();
6696 mem_cgroup_charge_statistics(memcg, page, nr_pages);
6697 memcg_check_events(memcg, page);
6698 local_irq_enable();
6699out:
6700 return ret;
6701}
6702
/**
 * __mem_cgroup_charge - charge a newly allocated page to a cgroup
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary. If @mm is NULL, try to
 * charge to the active memcg.
 *
 * Do not use this for pages allocated for swapin.
 *
 * Returns 0 on success. Otherwise, an error code is returned.
 */
6717int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6718 gfp_t gfp_mask)
6719{
6720 struct mem_cgroup *memcg;
6721 int ret;
6722
6723 memcg = get_mem_cgroup_from_mm(mm);
6724 ret = charge_memcg(page, memcg, gfp_mask);
6725 css_put(&memcg->css);
6726
6727 return ret;
6728}
6729
/**
 * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp: reclaim mode
 * @entry: swap entry for which the page is allocated
 *
 * This function charges a page allocated for swapin. Please call this before
 * adding the page to the swapcache.
 *
 * Returns 0 on success. Otherwise, an error code is returned.
 */
6742int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
6743 gfp_t gfp, swp_entry_t entry)
6744{
6745 struct mem_cgroup *memcg;
6746 unsigned short id;
6747 int ret;
6748
6749 if (mem_cgroup_disabled())
6750 return 0;
6751
6752 id = lookup_swap_cgroup_id(entry);
6753 rcu_read_lock();
6754 memcg = mem_cgroup_from_id(id);
6755 if (!memcg || !css_tryget_online(&memcg->css))
6756 memcg = get_mem_cgroup_from_mm(mm);
6757 rcu_read_unlock();
6758
6759 ret = charge_memcg(page, memcg, gfp);
6760
6761 css_put(&memcg->css);
6762 return ret;
6763}
6764
/*
 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
 * @entry: swap entry for which the page is charged
 *
 * Call this function after successfully adding the charged page to swapcache.
 *
 * Note: This function assumes the page for which the swap slot is being
 * uncharged is an order-0 page.
 */
6774void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
6775{
/*
 * Cgroup1's unified memory+swap counter has been charged with the
 * new swapcache page, finish the transfer by uncharging the swap
 * slot. The swap slot would also get uncharged when it dies, but
 * it can stick around indefinitely and we'd count the page twice
 * the entire time.
 *
 * Cgroup2 has separate resource counters for memory and swap,
 * so this is a non-issue here. Memory and swap charge lifetimes
 * correspond 1:1 to page and swap slot lifetimes: we charge the
 * page to memory here, and uncharge the swap slot below.
 */
6788 if (!mem_cgroup_disabled() && do_memsw_account()) {
/*
 * The swap entry might not get freed for a long time,
 * let's not wait for it.  The page already received a
 * memory+swap charge, drop the swap entry duplicate.
 */
6794 mem_cgroup_uncharge_swap(entry, 1);
6795 }
6796}
6797
6798struct uncharge_gather {
6799 struct mem_cgroup *memcg;
6800 unsigned long nr_memory;
6801 unsigned long pgpgout;
6802 unsigned long nr_kmem;
6803 struct page *dummy_page;
6804};
6805
6806static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6807{
6808 memset(ug, 0, sizeof(*ug));
6809}
6810
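/* Flush a batch of uncharges gathered against a single memcg. */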
6811static void uncharge_batch(const struct uncharge_gather *ug)
6812{
6813 unsigned long flags;
6814
6815 if (ug->nr_memory) {
6816 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
6817 if (do_memsw_account())
6818 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
6819 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6820 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6821 memcg_oom_recover(ug->memcg);
6822 }
6823
6824 local_irq_save(flags);
6825 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6826 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
6827 memcg_check_events(ug->memcg, ug->dummy_page);
6828 local_irq_restore(flags);
6829
/* drop reference from uncharge_page */
6831 css_put(&ug->memcg->css);
6832}
6833
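/*
 * Gather the uncharge of one page. Batches are flushed whenever the
 * page's memcg differs from the one accumulated so far.
 */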
6834static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6835{
6836 unsigned long nr_pages;
6837 struct mem_cgroup *memcg;
6838 struct obj_cgroup *objcg;
6839 bool use_objcg = PageMemcgKmem(page);
6840
6841 VM_BUG_ON_PAGE(PageLRU(page), page);
6842
/*
 * Nobody should be changing or seriously looking at
 * page memcg or objcg at this point, we have fully
 * exclusive access to the page.
 */
6848 if (use_objcg) {
6849 objcg = __page_objcg(page);
6850
/*
 * This get matches the put at the end of the function and
 * kmem pages do not hold memcg references anymore.
 */
6854 memcg = get_mem_cgroup_from_objcg(objcg);
6855 } else {
6856 memcg = __page_memcg(page);
6857 }
6858
6859 if (!memcg)
6860 return;
6861
6862 if (ug->memcg != memcg) {
6863 if (ug->memcg) {
6864 uncharge_batch(ug);
6865 uncharge_gather_clear(ug);
6866 }
6867 ug->memcg = memcg;
6868 ug->dummy_page = page;
6869
/* pairs with css_put in uncharge_batch */
6871 css_get(&memcg->css);
6872 }
6873
6874 nr_pages = compound_nr(page);
6875
6876 if (use_objcg) {
6877 ug->nr_memory += nr_pages;
6878 ug->nr_kmem += nr_pages;
6879
6880 page->memcg_data = 0;
6881 obj_cgroup_put(objcg);
6882 } else {
/* LRU pages aren't accounted at the root level */
6884 if (!mem_cgroup_is_root(memcg))
6885 ug->nr_memory += nr_pages;
6886 ug->pgpgout++;
6887
6888 page->memcg_data = 0;
6889 }
6890
6891 css_put(&memcg->css);
6892}
6893
/**
 * __mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with __mem_cgroup_charge().
 */
6900void __mem_cgroup_uncharge(struct page *page)
6901{
6902 struct uncharge_gather ug;
6903
/* Don't touch page->lru of any random page, pre-check: */
6905 if (!page_memcg(page))
6906 return;
6907
6908 uncharge_gather_clear(&ug);
6909 uncharge_page(page, &ug);
6910 uncharge_batch(&ug);
6911}
6912
/**
 * __mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * __mem_cgroup_charge().
 */
6920void __mem_cgroup_uncharge_list(struct list_head *page_list)
6921{
6922 struct uncharge_gather ug;
6923 struct page *page;
6924
6925 uncharge_gather_clear(&ug);
6926 list_for_each_entry(page, page_list, lru)
6927 uncharge_page(page, &ug);
6928 if (ug.memcg)
6929 uncharge_batch(&ug);
6930}
6931
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6942void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6943{
6944 struct mem_cgroup *memcg;
6945 unsigned int nr_pages;
6946 unsigned long flags;
6947
6948 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6949 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6950 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6951 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6952 newpage);
6953
6954 if (mem_cgroup_disabled())
6955 return;
6956
/* Page cache replacement: new page already charged? */
6958 if (page_memcg(newpage))
6959 return;
6960
6961 memcg = page_memcg(oldpage);
6962 VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
6963 if (!memcg)
6964 return;
6965
/* Force-charge the new page. The old one will be freed soon */
6967 nr_pages = thp_nr_pages(newpage);
6968
6969 if (!mem_cgroup_is_root(memcg)) {
6970 page_counter_charge(&memcg->memory, nr_pages);
6971 if (do_memsw_account())
6972 page_counter_charge(&memcg->memsw, nr_pages);
6973 }
6974
6975 css_get(&memcg->css);
6976 commit_charge(newpage, memcg);
6977
6978 local_irq_save(flags);
6979 mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
6980 memcg_check_events(memcg, newpage);
6981 local_irq_restore(flags);
6982}
6983
6984DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6985EXPORT_SYMBOL(memcg_sockets_enabled_key);
6986
6987void mem_cgroup_sk_alloc(struct sock *sk)
6988{
6989 struct mem_cgroup *memcg;
6990
6991 if (!mem_cgroup_sockets_enabled)
6992 return;
6993
/* Do not associate the sock with unrelated interrupted task's memcg. */
6995 if (in_interrupt())
6996 return;
6997
6998 rcu_read_lock();
6999 memcg = mem_cgroup_from_task(current);
7000 if (memcg == root_mem_cgroup)
7001 goto out;
7002 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7003 goto out;
7004 if (css_tryget(&memcg->css))
7005 sk->sk_memcg = memcg;
7006out:
7007 rcu_read_unlock();
7008}
7009
7010void mem_cgroup_sk_free(struct sock *sk)
7011{
7012 if (sk->sk_memcg)
7013 css_put(&sk->sk_memcg->css);
7014}
7015
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 * @gfp_mask: reclaim mode
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if it doesn't.
 */
7025bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
7026 gfp_t gfp_mask)
7027{
7028 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7029 struct page_counter *fail;
7030
7031 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7032 memcg->tcpmem_pressure = 0;
7033 return true;
7034 }
7035 memcg->tcpmem_pressure = 1;
7036 if (gfp_mask & __GFP_NOFAIL) {
7037 page_counter_charge(&memcg->tcpmem, nr_pages);
7038 return true;
7039 }
7040 return false;
7041 }
7042
7043 if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
7044 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7045 return true;
7046 }
7047
7048 return false;
7049}
7050
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
7056void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7057{
7058 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7059 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7060 return;
7061 }
7062
7063 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7064
7065 refill_stock(memcg, nr_pages);
7066}
7067
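/* Parse the cgroup.memory= boot parameter ("nosocket", "nokmem"). */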
7068static int __init cgroup_memory(char *s)
7069{
7070 char *token;
7071
7072 while ((token = strsep(&s, ",")) != NULL) {
7073 if (!*token)
7074 continue;
7075 if (!strcmp(token, "nosocket"))
7076 cgroup_memory_nosocket = true;
7077 if (!strcmp(token, "nokmem"))
7078 cgroup_memory_nokmem = true;
7079 }
7080 return 0;
7081}
7082__setup("cgroup.memory=", cgroup_memory);
7083
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
7092static int __init mem_cgroup_init(void)
7093{
7094 int cpu, node;
7095
/*
 * Currently s32 type (can refer to struct batched_lruvec_stat) is
 * used for per-memcg-per-cpu caching of per-node statistics. In order
 * to work fine, we should make sure that the overfill threshold can't
 * exceed S32_MAX / PAGE_SIZE.
 */
7102 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7103
7104 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7105 memcg_hotplug_cpu_dead);
7106
7107 for_each_possible_cpu(cpu)
7108 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7109 drain_local_stock);
7110
7111 for_each_node(node) {
7112 struct mem_cgroup_tree_per_node *rtpn;
7113
7114 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7115 node_online(node) ? node : NUMA_NO_NODE);
7116
7117 rtpn->rb_root = RB_ROOT;
7118 rtpn->rb_rightmost = NULL;
7119 spin_lock_init(&rtpn->lock);
7120 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7121 }
7122
7123 return 0;
7124}
7125subsys_initcall(mem_cgroup_init);
7126
7127#ifdef CONFIG_MEMCG_SWAP
7128static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7129{
7130 while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
 * The root cgroup cannot be destroyed, so its refcount must
 * always be >= 1.
 */
7135 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7136 VM_BUG_ON(1);
7137 break;
7138 }
7139 memcg = parent_mem_cgroup(memcg);
7140 if (!memcg)
7141 memcg = root_mem_cgroup;
7142 }
7143 return memcg;
7144}
7145
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
7153void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
7154{
7155 struct mem_cgroup *memcg, *swap_memcg;
7156 unsigned int nr_entries;
7157 unsigned short oldid;
7158
7159 VM_BUG_ON_PAGE(PageLRU(page), page);
7160 VM_BUG_ON_PAGE(page_count(page), page);
7161
7162 if (mem_cgroup_disabled())
7163 return;
7164
7165 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7166 return;
7167
7168 memcg = page_memcg(page);
7169
7170 VM_WARN_ON_ONCE_PAGE(!memcg, page);
7171 if (!memcg)
7172 return;
7173
/*
 * In case the memcg owning these pages has been offlined and doesn't
 * have an ID allocated to it anymore, charge the closest online
 * ancestor for the swap instead and transfer the memory+swap charge.
 */
7179 swap_memcg = mem_cgroup_id_get_online(memcg);
7180 nr_entries = thp_nr_pages(page);
/* Get references for the tail pages, too */
7182 if (nr_entries > 1)
7183 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7184 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7185 nr_entries);
7186 VM_BUG_ON_PAGE(oldid, page);
7187 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7188
7189 page->memcg_data = 0;
7190
7191 if (!mem_cgroup_is_root(memcg))
7192 page_counter_uncharge(&memcg->memory, nr_entries);
7193
7194 if (!cgroup_memory_noswap && memcg != swap_memcg) {
7195 if (!mem_cgroup_is_root(swap_memcg))
7196 page_counter_charge(&swap_memcg->memsw, nr_entries);
7197 page_counter_uncharge(&memcg->memsw, nr_entries);
7198 }
7199
/*
 * Interrupts should be disabled here because the caller holds the
 * i_pages lock which is taken with interrupts-off. It is
 * important here to have the interrupts disabled because it is the
 * only synchronisation we have for updating the per-CPU variables.
 */
7206 VM_BUG_ON(!irqs_disabled());
7207 mem_cgroup_charge_statistics(memcg, page, -nr_entries);
7208 memcg_check_events(memcg, page);
7209
7210 css_put(&memcg->css);
7211}
7212
/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
7222int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7223{
7224 unsigned int nr_pages = thp_nr_pages(page);
7225 struct page_counter *counter;
7226 struct mem_cgroup *memcg;
7227 unsigned short oldid;
7228
7229 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7230 return 0;
7231
7232 memcg = page_memcg(page);
7233
7234 VM_WARN_ON_ONCE_PAGE(!memcg, page);
7235 if (!memcg)
7236 return 0;
7237
7238 if (!entry.val) {
7239 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7240 return 0;
7241 }
7242
7243 memcg = mem_cgroup_id_get_online(memcg);
7244
7245 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7246 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7247 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7248 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7249 mem_cgroup_id_put(memcg);
7250 return -ENOMEM;
7251 }
7252
/* Get references for the tail pages, too */
7254 if (nr_pages > 1)
7255 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7256 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7257 VM_BUG_ON_PAGE(oldid, page);
7258 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7259
7260 return 0;
7261}
7262
/**
 * __mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
7268void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7269{
7270 struct mem_cgroup *memcg;
7271 unsigned short id;
7272
7273 id = swap_cgroup_record(entry, 0, nr_pages);
7274 rcu_read_lock();
7275 memcg = mem_cgroup_from_id(id);
7276 if (memcg) {
7277 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7278 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7279 page_counter_uncharge(&memcg->swap, nr_pages);
7280 else
7281 page_counter_uncharge(&memcg->memsw, nr_pages);
7282 }
7283 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7284 mem_cgroup_id_put_many(memcg, nr_pages);
7285 }
7286 rcu_read_unlock();
7287}
7288
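/*
 * How many swap pages @memcg may still use: the global free swap count,
 * clamped by the remaining swap.max headroom of each ancestor.
 */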
7289long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7290{
7291 long nr_swap_pages = get_nr_swap_pages();
7292
7293 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7294 return nr_swap_pages;
7295 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7296 nr_swap_pages = min_t(long, nr_swap_pages,
7297 READ_ONCE(memcg->swap.max) -
7298 page_counter_read(&memcg->swap));
7299 return nr_swap_pages;
7300}
7301
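/*
 * Returns true if swap is getting full either globally or for any memcg
 * in @page's hierarchy (usage at or above half of swap.high or swap.max).
 */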
7302bool mem_cgroup_swap_full(struct page *page)
7303{
7304 struct mem_cgroup *memcg;
7305
7306 VM_BUG_ON_PAGE(!PageLocked(page), page);
7307
7308 if (vm_swap_full())
7309 return true;
7310 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7311 return false;
7312
7313 memcg = page_memcg(page);
7314 if (!memcg)
7315 return false;
7316
7317 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7318 unsigned long usage = page_counter_read(&memcg->swap);
7319
7320 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7321 usage * 2 >= READ_ONCE(memcg->swap.max))
7322 return true;
7323 }
7324
7325 return false;
7326}
7327
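/* Parse the swapaccount= boot parameter (0 disables, 1 enables swap accounting). */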
7328static int __init setup_swap_account(char *s)
7329{
7330 if (!strcmp(s, "1"))
7331 cgroup_memory_noswap = false;
7332 else if (!strcmp(s, "0"))
7333 cgroup_memory_noswap = true;
7334 return 1;
7335}
7336__setup("swapaccount=", setup_swap_account);
7337
7338static u64 swap_current_read(struct cgroup_subsys_state *css,
7339 struct cftype *cft)
7340{
7341 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7342
7343 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7344}
7345
7346static int swap_high_show(struct seq_file *m, void *v)
7347{
7348 return seq_puts_memcg_tunable(m,
7349 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7350}
7351
7352static ssize_t swap_high_write(struct kernfs_open_file *of,
7353 char *buf, size_t nbytes, loff_t off)
7354{
7355 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7356 unsigned long high;
7357 int err;
7358
7359 buf = strstrip(buf);
7360 err = page_counter_memparse(buf, "max", &high);
7361 if (err)
7362 return err;
7363
7364 page_counter_set_high(&memcg->swap, high);
7365
7366 return nbytes;
7367}
7368
7369static int swap_max_show(struct seq_file *m, void *v)
7370{
7371 return seq_puts_memcg_tunable(m,
7372 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7373}
7374
7375static ssize_t swap_max_write(struct kernfs_open_file *of,
7376 char *buf, size_t nbytes, loff_t off)
7377{
7378 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7379 unsigned long max;
7380 int err;
7381
7382 buf = strstrip(buf);
7383 err = page_counter_memparse(buf, "max", &max);
7384 if (err)
7385 return err;
7386
7387 xchg(&memcg->swap.max, max);
7388
7389 return nbytes;
7390}
7391
7392static int swap_events_show(struct seq_file *m, void *v)
7393{
7394 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7395
7396 seq_printf(m, "high %lu\n",
7397 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7398 seq_printf(m, "max %lu\n",
7399 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7400 seq_printf(m, "fail %lu\n",
7401 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7402
7403 return 0;
7404}
7405
7406static struct cftype swap_files[] = {
7407 {
7408 .name = "swap.current",
7409 .flags = CFTYPE_NOT_ON_ROOT,
7410 .read_u64 = swap_current_read,
7411 },
7412 {
7413 .name = "swap.high",
7414 .flags = CFTYPE_NOT_ON_ROOT,
7415 .seq_show = swap_high_show,
7416 .write = swap_high_write,
7417 },
7418 {
7419 .name = "swap.max",
7420 .flags = CFTYPE_NOT_ON_ROOT,
7421 .seq_show = swap_max_show,
7422 .write = swap_max_write,
7423 },
7424 {
7425 .name = "swap.events",
7426 .flags = CFTYPE_NOT_ON_ROOT,
7427 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7428 .seq_show = swap_events_show,
7429 },
7430 { }
7431};
7432
7433static struct cftype memsw_files[] = {
7434 {
7435 .name = "memsw.usage_in_bytes",
7436 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7437 .read_u64 = mem_cgroup_read_u64,
7438 },
7439 {
7440 .name = "memsw.max_usage_in_bytes",
7441 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7442 .write = mem_cgroup_reset,
7443 .read_u64 = mem_cgroup_read_u64,
7444 },
7445 {
7446 .name = "memsw.limit_in_bytes",
7447 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7448 .write = mem_cgroup_write,
7449 .read_u64 = mem_cgroup_read_u64,
7450 },
7451 {
7452 .name = "memsw.failcnt",
7453 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7454 .write = mem_cgroup_reset,
7455 .read_u64 = mem_cgroup_read_u64,
7456 },
7457 { },
7458};
7459
/*
 * Register the swap control files and finalize cgroup_memory_noswap.
 * This runs at core_initcall() time, before the subsys_initcall()s that
 * may consult the swap accounting state.
 */
7467static int __init mem_cgroup_swap_init(void)
7468{
/* No memory control -> no swap control */
7470 if (mem_cgroup_disabled())
7471 cgroup_memory_noswap = true;
7472
7473 if (cgroup_memory_noswap)
7474 return 0;
7475
7476 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7477 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7478
7479 return 0;
7480}
7481core_initcall(mem_cgroup_swap_init);
7482
7483#endif
7484