/* memcontrol.c - Memory Resource Controller (memcg)
 *
 * Accounts and limits the memory used by cgroups: user pages, kernel
 * memory, swap and socket buffers, with per-memcg reclaim, OOM handling
 * and writeback control.
 */
28#include <linux/page_counter.h>
29#include <linux/memcontrol.h>
30#include <linux/cgroup.h>
31#include <linux/pagewalk.h>
32#include <linux/sched/mm.h>
33#include <linux/shmem_fs.h>
34#include <linux/hugetlb.h>
35#include <linux/pagemap.h>
36#include <linux/vm_event_item.h>
37#include <linux/smp.h>
38#include <linux/page-flags.h>
39#include <linux/backing-dev.h>
40#include <linux/bit_spinlock.h>
41#include <linux/rcupdate.h>
42#include <linux/limits.h>
43#include <linux/export.h>
44#include <linux/mutex.h>
45#include <linux/rbtree.h>
46#include <linux/slab.h>
47#include <linux/swap.h>
48#include <linux/swapops.h>
49#include <linux/spinlock.h>
50#include <linux/eventfd.h>
51#include <linux/poll.h>
52#include <linux/sort.h>
53#include <linux/fs.h>
54#include <linux/seq_file.h>
55#include <linux/vmpressure.h>
56#include <linux/memremap.h>
57#include <linux/mm_inline.h>
58#include <linux/swap_cgroup.h>
59#include <linux/cpu.h>
60#include <linux/oom.h>
61#include <linux/lockdep.h>
62#include <linux/file.h>
63#include <linux/resume_user_mode.h>
64#include <linux/psi.h>
65#include <linux/seq_buf.h>
66#include "internal.h"
67#include <net/sock.h>
68#include <net/ip.h>
69#include "slab.h"
70#include "swap.h"
71
72#include <linux/uaccess.h>
73
74#include <trace/events/vmscan.h>
75
76struct cgroup_subsys memory_cgrp_subsys __read_mostly;
77EXPORT_SYMBOL(memory_cgrp_subsys);
78
79struct mem_cgroup *root_mem_cgroup __read_mostly;
80
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
static bool cgroup_memory_noswap __ro_after_init;
#else
#define cgroup_memory_noswap		1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 */
struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
127
/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set if
	 * you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
173
174static void mem_cgroup_threshold(struct mem_cgroup *memcg);
175static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * Types of charges to be moved.
 */
181#define MOVE_ANON 0x1U
182#define MOVE_FILE 0x2U
183#define MOVE_MASK (MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
186static struct move_charge_struct {
187 spinlock_t lock;
188 struct mm_struct *mm;
189 struct mem_cgroup *from;
190 struct mem_cgroup *to;
191 unsigned long flags;
192 unsigned long precharge;
193 unsigned long moved_charge;
194 unsigned long moved_swap;
195 struct task_struct *moving_task;
196 wait_queue_head_t waitq;
197} mc = {
198 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
199 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
200};

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft limit reclaim
 * to prevent infinite loops, if they ever occur.
 */
206#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
207#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

/* for encoding cft->private value on file */
210enum res_type {
211 _MEM,
212 _MEMSWAP,
213 _KMEM,
214 _TCP,
215};
216
217#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
218#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
219#define MEMFILE_ATTR(val) ((val) & 0xffff)

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for cleanup.
 */
226#define for_each_mem_cgroup_tree(iter, root) \
227 for (iter = mem_cgroup_iter(root, NULL, NULL); \
228 iter != NULL; \
229 iter = mem_cgroup_iter(root, iter, NULL))
230
231#define for_each_mem_cgroup(iter) \
232 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
233 iter != NULL; \
234 iter = mem_cgroup_iter(NULL, iter, NULL))
235
236static inline bool task_is_dying(void)
237{
238 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
239 (current->flags & PF_EXITING);
240}
241
242
243struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
244{
245 if (!memcg)
246 memcg = root_mem_cgroup;
247 return &memcg->vmpressure;
248}
249
250struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
251{
252 return container_of(vmpr, struct mem_cgroup, vmpressure);
253}
254
255#ifdef CONFIG_MEMCG_KMEM
256static DEFINE_SPINLOCK(objcg_lock);
257
258bool mem_cgroup_kmem_disabled(void)
259{
260 return cgroup_memory_nokmem;
261}
262
263static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
264 unsigned int nr_pages);
265
266static void obj_cgroup_release(struct percpu_ref *ref)
267{
268 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
269 unsigned int nr_bytes;
270 unsigned int nr_pages;
271 unsigned long flags;

	/*
	 * At this point all allocated objects are freed; whatever is left in
	 * objcg->nr_charged_bytes stems from per-cpu stock flushes racing
	 * with object frees and always adds up to whole pages, so it is
	 * uncharged as pages below.
	 */
293 nr_bytes = atomic_read(&objcg->nr_charged_bytes);
294 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
295 nr_pages = nr_bytes >> PAGE_SHIFT;
296
297 if (nr_pages)
298 obj_cgroup_uncharge_pages(objcg, nr_pages);
299
300 spin_lock_irqsave(&objcg_lock, flags);
301 list_del(&objcg->list);
302 spin_unlock_irqrestore(&objcg_lock, flags);
303
304 percpu_ref_exit(ref);
305 kfree_rcu(objcg, rcu);
306}
307
308static struct obj_cgroup *obj_cgroup_alloc(void)
309{
310 struct obj_cgroup *objcg;
311 int ret;
312
313 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
314 if (!objcg)
315 return NULL;
316
317 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
318 GFP_KERNEL);
319 if (ret) {
320 kfree(objcg);
321 return NULL;
322 }
323 INIT_LIST_HEAD(&objcg->list);
324 return objcg;
325}
326
327static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
328 struct mem_cgroup *parent)
329{
330 struct obj_cgroup *objcg, *iter;
331
332 objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
333
334 spin_lock_irq(&objcg_lock);
335
336
337 list_add(&objcg->list, &memcg->objcg_list);
338
339 list_for_each_entry(iter, &memcg->objcg_list, list)
340 WRITE_ONCE(iter->memcg, parent);
341
342 list_splice(&memcg->objcg_list, &parent->objcg_list);
343
344 spin_unlock_irq(&objcg_lock);
345
346 percpu_ref_kill(&objcg->refcnt);
347}

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.  Since the calls to memcg_slab_pre_alloc_hook()
 * are conditional on this static branch, modules that do kmem_cache_alloc
 * and the like need to see this symbol as well.
 */
355DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
356EXPORT_SYMBOL(memcg_kmem_enabled_key);
357#endif

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
370struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
371{
372 struct mem_cgroup *memcg;
373
374 memcg = page_memcg(page);
375
376 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
377 memcg = root_mem_cgroup;
378
379 return &memcg->css;
380}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
395ino_t page_cgroup_ino(struct page *page)
396{
397 struct mem_cgroup *memcg;
398 unsigned long ino = 0;
399
400 rcu_read_lock();
401 memcg = page_memcg_check(page);
402
403 while (memcg && !(memcg->css.flags & CSS_ONLINE))
404 memcg = parent_mem_cgroup(memcg);
405 if (memcg)
406 ino = cgroup_ino(memcg->css.cgroup);
407 rcu_read_unlock();
408 return ino;
409}
410
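/*
 * Insert @mz into the per-node soft-limit RB-tree, ordered by how far the
 * memcg's usage exceeds its soft limit.  The rightmost node is cached so
 * that the worst offender can be found without walking the tree.
 */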
411static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
412 struct mem_cgroup_tree_per_node *mctz,
413 unsigned long new_usage_in_excess)
414{
415 struct rb_node **p = &mctz->rb_root.rb_node;
416 struct rb_node *parent = NULL;
417 struct mem_cgroup_per_node *mz_node;
418 bool rightmost = true;
419
420 if (mz->on_tree)
421 return;
422
423 mz->usage_in_excess = new_usage_in_excess;
424 if (!mz->usage_in_excess)
425 return;
426 while (*p) {
427 parent = *p;
428 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
429 tree_node);
430 if (mz->usage_in_excess < mz_node->usage_in_excess) {
431 p = &(*p)->rb_left;
432 rightmost = false;
433 } else {
434 p = &(*p)->rb_right;
435 }
436 }
437
438 if (rightmost)
439 mctz->rb_rightmost = &mz->tree_node;
440
441 rb_link_node(&mz->tree_node, parent, p);
442 rb_insert_color(&mz->tree_node, &mctz->rb_root);
443 mz->on_tree = true;
444}
445
446static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
447 struct mem_cgroup_tree_per_node *mctz)
448{
449 if (!mz->on_tree)
450 return;
451
452 if (&mz->tree_node == mctz->rb_rightmost)
453 mctz->rb_rightmost = rb_prev(&mz->tree_node);
454
455 rb_erase(&mz->tree_node, &mctz->rb_root);
456 mz->on_tree = false;
457}
458
459static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
460 struct mem_cgroup_tree_per_node *mctz)
461{
462 unsigned long flags;
463
464 spin_lock_irqsave(&mctz->lock, flags);
465 __mem_cgroup_remove_exceeded(mz, mctz);
466 spin_unlock_irqrestore(&mctz->lock, flags);
467}
468
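/*
 * Return the number of pages by which @memcg's usage currently exceeds its
 * soft limit, or 0 if it is at or below the limit.
 */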
469static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
470{
471 unsigned long nr_pages = page_counter_read(&memcg->memory);
472 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
473 unsigned long excess = 0;
474
475 if (nr_pages > soft_limit)
476 excess = nr_pages - soft_limit;
477
478 return excess;
479}
480
481static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
482{
483 unsigned long excess;
484 struct mem_cgroup_per_node *mz;
485 struct mem_cgroup_tree_per_node *mctz;
486
487 mctz = soft_limit_tree.rb_tree_per_node[nid];
488 if (!mctz)
489 return;
490
491
492
493
494 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
495 mz = memcg->nodeinfo[nid];
496 excess = soft_limit_excess(memcg);
497
498
499
500
501 if (excess || mz->on_tree) {
502 unsigned long flags;
503
504 spin_lock_irqsave(&mctz->lock, flags);
505
506 if (mz->on_tree)
507 __mem_cgroup_remove_exceeded(mz, mctz);
508
509
510
511
512 __mem_cgroup_insert_exceeded(mz, mctz, excess);
513 spin_unlock_irqrestore(&mctz->lock, flags);
514 }
515 }
516}
517
518static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
519{
520 struct mem_cgroup_tree_per_node *mctz;
521 struct mem_cgroup_per_node *mz;
522 int nid;
523
524 for_each_node(nid) {
525 mz = memcg->nodeinfo[nid];
526 mctz = soft_limit_tree.rb_tree_per_node[nid];
527 if (mctz)
528 mem_cgroup_remove_exceeded(mz, mctz);
529 }
530}
531
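/*
 * Pop the memcg with the largest soft-limit excess from the tree.  Entries
 * that are no longer in excess, or whose css reference cannot be taken, are
 * removed and the search is retried.  Must be called with mctz->lock held.
 */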
532static struct mem_cgroup_per_node *
533__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
534{
535 struct mem_cgroup_per_node *mz;
536
537retry:
538 mz = NULL;
539 if (!mctz->rb_rightmost)
540 goto done;
541
542 mz = rb_entry(mctz->rb_rightmost,
543 struct mem_cgroup_per_node, tree_node);
544
545
546
547
548
549 __mem_cgroup_remove_exceeded(mz, mctz);
550 if (!soft_limit_excess(mz->memcg) ||
551 !css_tryget(&mz->memcg->css))
552 goto retry;
553done:
554 return mz;
555}
556
557static struct mem_cgroup_per_node *
558mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
559{
560 struct mem_cgroup_per_node *mz;
561
562 spin_lock_irq(&mctz->lock);
563 mz = __mem_cgroup_largest_soft_limit_node(mctz);
564 spin_unlock_irq(&mctz->lock);
565 return mz;
566}

/*
 * memcg and lruvec stats flushing
 *
 * Stats of all memcgs are accumulated per-cpu and only periodically
 * propagated up the memcg tree through rstat: flushing on every update
 * would be too expensive for hot paths.  Updates are therefore merely
 * counted here, and the actual flush is done by a deferrable worker
 * (every FLUSH_TIME) or synchronously once enough updates have
 * accumulated (see mem_cgroup_flush_stats()).
 */
583static void flush_memcg_stats_dwork(struct work_struct *w);
584static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
585static DEFINE_SPINLOCK(stats_flush_lock);
586static DEFINE_PER_CPU(unsigned int, stats_updates);
587static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
588static u64 flush_next_time;
589
590#define FLUSH_TIME (2UL*HZ)

/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT because
 * the per-cpu counters cannot rely on an acquired spinlock_t.  These
 * helpers are never used in hardirq context on PREEMPT_RT, so disabling
 * preemption is sufficient.
 */
598static void memcg_stats_lock(void)
599{
600#ifdef CONFIG_PREEMPT_RT
601 preempt_disable();
602#else
603 VM_BUG_ON(!irqs_disabled());
604#endif
605}
606
607static void __memcg_stats_lock(void)
608{
609#ifdef CONFIG_PREEMPT_RT
610 preempt_disable();
611#endif
612}
613
614static void memcg_stats_unlock(void)
615{
616#ifdef CONFIG_PREEMPT_RT
617 preempt_enable();
618#endif
619}
620
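/*
 * Record that a stat update happened on this CPU and, once a full charge
 * batch of updates has accumulated locally, fold it into the global
 * stats_flush_threshold that decides when a flush is worthwhile.
 */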
621static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
622{
623 unsigned int x;
624
625 cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
626
627 x = __this_cpu_add_return(stats_updates, abs(val));
628 if (x > MEMCG_CHARGE_BATCH) {
629 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
630 __this_cpu_write(stats_updates, 0);
631 }
632}
633
634static void __mem_cgroup_flush_stats(void)
635{
636 unsigned long flag;
637
638 if (!spin_trylock_irqsave(&stats_flush_lock, flag))
639 return;
640
641 flush_next_time = jiffies_64 + 2*FLUSH_TIME;
642 cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
643 atomic_set(&stats_flush_threshold, 0);
644 spin_unlock_irqrestore(&stats_flush_lock, flag);
645}
646
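/*
 * Flush the accumulated per-cpu stats into the memcg tree, but only when
 * the number of pending update batches exceeds the number of online CPUs;
 * otherwise the flush is not worth its cost.
 */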
647void mem_cgroup_flush_stats(void)
648{
649 if (atomic_read(&stats_flush_threshold) > num_online_cpus())
650 __mem_cgroup_flush_stats();
651}
652
653void mem_cgroup_flush_stats_delayed(void)
654{
655 if (time_after64(jiffies_64, flush_next_time))
656 mem_cgroup_flush_stats();
657}
658
659static void flush_memcg_stats_dwork(struct work_struct *w)
660{
661 __mem_cgroup_flush_stats();
662 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
663}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
671void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
672{
673 if (mem_cgroup_disabled())
674 return;
675
676 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
677 memcg_rstat_updated(memcg, val);
678}
679
680
681static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
682{
683 long x = 0;
684 int cpu;
685
686 for_each_possible_cpu(cpu)
687 x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
688#ifdef CONFIG_SMP
689 if (x < 0)
690 x = 0;
691#endif
692 return x;
693}
694
695void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
696 int val)
697{
698 struct mem_cgroup_per_node *pn;
699 struct mem_cgroup *memcg;
700
701 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
702 memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they
	 * never update their counter from in-interrupt context.  For these
	 * counters we check that the update is never performed from an
	 * interrupt context, while other callers need to have interrupts
	 * disabled.
	 */
710 __memcg_stats_lock();
711 if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
712 switch (idx) {
713 case NR_ANON_MAPPED:
714 case NR_FILE_MAPPED:
715 case NR_ANON_THPS:
716 case NR_SHMEM_PMDMAPPED:
717 case NR_FILE_PMDMAPPED:
718 WARN_ON_ONCE(!in_task());
719 break;
720 default:
721 WARN_ON_ONCE(!irqs_disabled());
722 }
723 }
724
725
726 __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
727
728
729 __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
730
731 memcg_rstat_updated(memcg, val);
732 memcg_stats_unlock();
733}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup.  This
 * function updates all three of the counters that are affected by a
 * change of state at this level: per-node, per-cgroup and per-lruvec.
 */
745void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
746 int val)
747{
748
749 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
750
751
752 if (!mem_cgroup_disabled())
753 __mod_memcg_lruvec_state(lruvec, idx, val);
754}
755
756void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
757 int val)
758{
759 struct page *head = compound_head(page);
760 struct mem_cgroup *memcg;
761 pg_data_t *pgdat = page_pgdat(page);
762 struct lruvec *lruvec;
763
764 rcu_read_lock();
765 memcg = page_memcg(head);
766
767 if (!memcg) {
768 rcu_read_unlock();
769 __mod_node_page_state(pgdat, idx, val);
770 return;
771 }
772
773 lruvec = mem_cgroup_lruvec(memcg, pgdat);
774 __mod_lruvec_state(lruvec, idx, val);
775 rcu_read_unlock();
776}
777EXPORT_SYMBOL(__mod_lruvec_page_state);
778
779void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
780{
781 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
782 struct mem_cgroup *memcg;
783 struct lruvec *lruvec;
784
785 rcu_read_lock();
786 memcg = mem_cgroup_from_obj(p);

	/*
	 * Untracked pages have no memcg and no lruvec, so only update the
	 * node state.  Slab objects that have been reparented to the root
	 * memcg still need the per-memcg vmstats updated to keep the root
	 * counters correct when the object is freed.
	 */
794 if (!memcg) {
795 __mod_node_page_state(pgdat, idx, val);
796 } else {
797 lruvec = mem_cgroup_lruvec(memcg, pgdat);
798 __mod_lruvec_state(lruvec, idx, val);
799 }
800 rcu_read_unlock();
801}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
809void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
810 unsigned long count)
811{
812 if (mem_cgroup_disabled())
813 return;
814
815 memcg_stats_lock();
816 __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
817 memcg_rstat_updated(memcg, count);
818 memcg_stats_unlock();
819}
820
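/*
 * memcg_events() returns the aggregated (cpu and subtree) event count
 * maintained by rstat flushing; memcg_events_local() below sums the raw
 * per-cpu counters of this memcg only.
 */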
821static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
822{
823 return READ_ONCE(memcg->vmstats.events[event]);
824}
825
826static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
827{
828 long x = 0;
829 int cpu;
830
831 for_each_possible_cpu(cpu)
832 x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
833 return x;
834}
835
836static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
837 int nr_pages)
838{
839
840 if (nr_pages > 0)
841 __count_memcg_events(memcg, PGPGIN, 1);
842 else {
843 __count_memcg_events(memcg, PGPGOUT, 1);
844 nr_pages = -nr_pages;
845 }
846
847 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
848}
849
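/*
 * Returns true once enough page events have elapsed since the last time the
 * given target fired, and arms the next per-cpu target value.  Used to
 * rate-limit threshold and soft-limit processing.
 */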
850static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
851 enum mem_cgroup_events_target target)
852{
853 unsigned long val, next;
854
855 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
856 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
857
858 if ((long)(next - val) < 0) {
859 switch (target) {
860 case MEM_CGROUP_TARGET_THRESH:
861 next = val + THRESHOLDS_EVENTS_TARGET;
862 break;
863 case MEM_CGROUP_TARGET_SOFTLIMIT:
864 next = val + SOFTLIMIT_EVENTS_TARGET;
865 break;
866 default:
867 break;
868 }
869 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
870 return true;
871 }
872 return false;
873}

/*
 * Check threshold and soft-limit events in order.
 */
879static void memcg_check_events(struct mem_cgroup *memcg, int nid)
880{
881 if (IS_ENABLED(CONFIG_PREEMPT_RT))
882 return;

	/* threshold event is triggered in finer grain than soft limit */
885 if (unlikely(mem_cgroup_event_ratelimit(memcg,
886 MEM_CGROUP_TARGET_THRESH))) {
887 bool do_softlimit;
888
889 do_softlimit = mem_cgroup_event_ratelimit(memcg,
890 MEM_CGROUP_TARGET_SOFTLIMIT);
891 mem_cgroup_threshold(memcg);
892 if (unlikely(do_softlimit))
893 mem_cgroup_update_tree(memcg, nid);
894 }
895}
896
897struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
898{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
904 if (unlikely(!p))
905 return NULL;
906
907 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
908}
909EXPORT_SYMBOL(mem_cgroup_from_task);
910
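/*
 * Return the memcg set with set_active_memcg(): the per-cpu override when
 * running in interrupt context, the per-task one otherwise.
 */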
911static __always_inline struct mem_cgroup *active_memcg(void)
912{
913 if (!in_task())
914 return this_cpu_read(int_active_memcg);
915 else
916 return current->active_memcg;
917}

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful.  If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if it is set.
 * 2) current->mm->memcg, if the memcg isn't NULL.
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
930struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
931{
932 struct mem_cgroup *memcg;
933
934 if (mem_cgroup_disabled())
935 return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
946 if (unlikely(!mm)) {
947 memcg = active_memcg();
948 if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
950 css_get(&memcg->css);
951 return memcg;
952 }
953 mm = current->mm;
954 if (unlikely(!mm))
955 return root_mem_cgroup;
956 }
957
958 rcu_read_lock();
959 do {
960 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
961 if (unlikely(!memcg))
962 memcg = root_mem_cgroup;
963 } while (!css_tryget(&memcg->css));
964 rcu_read_unlock();
965 return memcg;
966}
967EXPORT_SYMBOL(get_mem_cgroup_from_mm);
968
969static __always_inline bool memcg_kmem_bypass(void)
970{
	/* Allow remote memcg charging from any context. */
972 if (unlikely(active_memcg()))
973 return false;

	/* Memcg to charge can't be determined. */
976 if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
977 return true;
978
979 return false;
980}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
999struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1000 struct mem_cgroup *prev,
1001 struct mem_cgroup_reclaim_cookie *reclaim)
1002{
1003 struct mem_cgroup_reclaim_iter *iter;
1004 struct cgroup_subsys_state *css = NULL;
1005 struct mem_cgroup *memcg = NULL;
1006 struct mem_cgroup *pos = NULL;
1007
1008 if (mem_cgroup_disabled())
1009 return NULL;
1010
1011 if (!root)
1012 root = root_mem_cgroup;
1013
1014 rcu_read_lock();
1015
1016 if (reclaim) {
1017 struct mem_cgroup_per_node *mz;
1018
1019 mz = root->nodeinfo[reclaim->pgdat->node_id];
1020 iter = &mz->iter;
1021
1022
1023
1024
1025
1026 if (!prev)
1027 reclaim->generation = iter->generation;
1028 else if (reclaim->generation != iter->generation)
1029 goto out_unlock;
1030
1031 while (1) {
1032 pos = READ_ONCE(iter->position);
1033 if (!pos || css_tryget(&pos->css))
1034 break;
1035
1036
1037
1038
1039
1040
1041
1042
1043 (void)cmpxchg(&iter->position, pos, NULL);
1044 }
1045 } else if (prev) {
1046 pos = prev;
1047 }
1048
1049 if (pos)
1050 css = &pos->css;
1051
1052 for (;;) {
1053 css = css_next_descendant_pre(css, &root->css);
1054 if (!css) {
1055
1056
1057
1058
1059
1060
1061 if (!prev)
1062 continue;
1063 break;
1064 }
1065
1066
1067
1068
1069
1070
1071 if (css == &root->css || css_tryget(css)) {
1072 memcg = mem_cgroup_from_css(css);
1073 break;
1074 }
1075 }
1076
1077 if (reclaim) {
1078
1079
1080
1081
1082
1083 (void)cmpxchg(&iter->position, pos, memcg);
1084
1085 if (pos)
1086 css_put(&pos->css);
1087
1088 if (!memcg)
1089 iter->generation++;
1090 }
1091
1092out_unlock:
1093 rcu_read_unlock();
1094 if (prev && prev != root)
1095 css_put(&prev->css);
1096
1097 return memcg;
1098}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
1105void mem_cgroup_iter_break(struct mem_cgroup *root,
1106 struct mem_cgroup *prev)
1107{
1108 if (!root)
1109 root = root_mem_cgroup;
1110 if (prev && prev != root)
1111 css_put(&prev->css);
1112}
1113
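/*
 * Clear any cached reclaim iterator positions in @from's per-node state that
 * still point at @dead_memcg, so that a dying memcg is never dereferenced by
 * a later shared reclaim walk.
 */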
1114static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1115 struct mem_cgroup *dead_memcg)
1116{
1117 struct mem_cgroup_reclaim_iter *iter;
1118 struct mem_cgroup_per_node *mz;
1119 int nid;
1120
1121 for_each_node(nid) {
1122 mz = from->nodeinfo[nid];
1123 iter = &mz->iter;
1124 cmpxchg(&iter->position, dead_memcg, NULL);
1125 }
1126}
1127
1128static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1129{
1130 struct mem_cgroup *memcg = dead_memcg;
1131 struct mem_cgroup *last;
1132
1133 do {
1134 __invalidate_reclaim_iterators(memcg, dead_memcg);
1135 last = memcg;
1136 } while ((memcg = parent_mem_cgroup(memcg)));
1137
1138
1139
1140
1141
1142
1143
1144 if (last != root_mem_cgroup)
1145 __invalidate_reclaim_iterators(root_mem_cgroup,
1146 dead_memcg);
1147}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task.  If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
1162int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1163 int (*fn)(struct task_struct *, void *), void *arg)
1164{
1165 struct mem_cgroup *iter;
1166 int ret = 0;
1167
1168 BUG_ON(memcg == root_mem_cgroup);
1169
1170 for_each_mem_cgroup_tree(iter, memcg) {
1171 struct css_task_iter it;
1172 struct task_struct *task;
1173
1174 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1175 while (!ret && (task = css_task_iter_next(&it)))
1176 ret = fn(task, arg);
1177 css_task_iter_end(&it);
1178 if (ret) {
1179 mem_cgroup_iter_break(memcg, iter);
1180 break;
1181 }
1182 }
1183 return ret;
1184}
1185
1186#ifdef CONFIG_DEBUG_VM
1187void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
1188{
1189 struct mem_cgroup *memcg;
1190
1191 if (mem_cgroup_disabled())
1192 return;
1193
1194 memcg = folio_memcg(folio);
1195
1196 if (!memcg)
1197 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
1198 else
1199 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
1200}
1201#endif

/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held.
 */
1215struct lruvec *folio_lruvec_lock(struct folio *folio)
1216{
1217 struct lruvec *lruvec = folio_lruvec(folio);
1218
1219 spin_lock(&lruvec->lru_lock);
1220 lruvec_memcg_debug(lruvec, folio);
1221
1222 return lruvec;
1223}

/**
 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
1238struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1239{
1240 struct lruvec *lruvec = folio_lruvec(folio);
1241
1242 spin_lock_irq(&lruvec->lru_lock);
1243 lruvec_memcg_debug(lruvec, folio);
1244
1245 return lruvec;
1246}

/**
 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 * @flags: Pointer to irqsave flags.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
1262struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1263 unsigned long *flags)
1264{
1265 struct lruvec *lruvec = folio_lruvec(folio);
1266
1267 spin_lock_irqsave(&lruvec->lru_lock, *flags);
1268 lruvec_memcg_debug(lruvec, folio);
1269
1270 return lruvec;
1271}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 */
1283void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1284 int zid, int nr_pages)
1285{
1286 struct mem_cgroup_per_node *mz;
1287 unsigned long *lru_size;
1288 long size;
1289
1290 if (mem_cgroup_disabled())
1291 return;
1292
1293 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1294 lru_size = &mz->lru_zone_size[zid][lru];
1295
1296 if (nr_pages < 0)
1297 *lru_size += nr_pages;
1298
1299 size = *lru_size;
1300 if (WARN_ONCE(size < 0,
1301 "%s(%p, %d, %d): lru_size %ld\n",
1302 __func__, lruvec, lru, nr_pages, size)) {
1303 VM_BUG_ON(1);
1304 *lru_size = 0;
1305 }
1306
1307 if (nr_pages > 0)
1308 *lru_size += nr_pages;
1309}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
1318static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1319{
1320 unsigned long margin = 0;
1321 unsigned long count;
1322 unsigned long limit;
1323
1324 count = page_counter_read(&memcg->memory);
1325 limit = READ_ONCE(memcg->memory.max);
1326 if (count < limit)
1327 margin = limit - count;
1328
1329 if (do_memsw_account()) {
1330 count = page_counter_read(&memcg->memsw);
1331 limit = READ_ONCE(memcg->memsw.max);
1332 if (count < limit)
1333 margin = min(margin, limit - count);
1334 else
1335 margin = 0;
1336 }
1337
1338 return margin;
1339}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from or mc.to or in the hierarchy of the
 * moving cgroups.  This is for waiting at high memory pressure caused by
 * "move".
 */
1348static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1349{
1350 struct mem_cgroup *from;
1351 struct mem_cgroup *to;
1352 bool ret = false;
1353
1354
1355
1356
1357 spin_lock(&mc.lock);
1358 from = mc.from;
1359 to = mc.to;
1360 if (!from)
1361 goto unlock;
1362
1363 ret = mem_cgroup_is_descendant(from, memcg) ||
1364 mem_cgroup_is_descendant(to, memcg);
1365unlock:
1366 spin_unlock(&mc.lock);
1367 return ret;
1368}
1369
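/*
 * If a charge-moving operation involving @memcg is in flight on behalf of
 * another task, sleep until it completes.  Returns true if we waited, so
 * that the caller can retry its charge.
 */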
1370static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1371{
1372 if (mc.moving_task && current != mc.moving_task) {
1373 if (mem_cgroup_under_move(memcg)) {
1374 DEFINE_WAIT(wait);
1375 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1376
1377 if (mc.moving_task)
1378 schedule();
1379 finish_wait(&mc.waitq, &wait);
1380 return true;
1381 }
1382 }
1383 return false;
1384}
1385
1386struct memory_stat {
1387 const char *name;
1388 unsigned int idx;
1389};
1390
1391static const struct memory_stat memory_stats[] = {
1392 { "anon", NR_ANON_MAPPED },
1393 { "file", NR_FILE_PAGES },
1394 { "kernel", MEMCG_KMEM },
1395 { "kernel_stack", NR_KERNEL_STACK_KB },
1396 { "pagetables", NR_PAGETABLE },
1397 { "percpu", MEMCG_PERCPU_B },
1398 { "sock", MEMCG_SOCK },
1399 { "vmalloc", MEMCG_VMALLOC },
1400 { "shmem", NR_SHMEM },
1401#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
1402 { "zswap", MEMCG_ZSWAP_B },
1403 { "zswapped", MEMCG_ZSWAPPED },
1404#endif
1405 { "file_mapped", NR_FILE_MAPPED },
1406 { "file_dirty", NR_FILE_DIRTY },
1407 { "file_writeback", NR_WRITEBACK },
1408#ifdef CONFIG_SWAP
1409 { "swapcached", NR_SWAPCACHE },
1410#endif
1411#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1412 { "anon_thp", NR_ANON_THPS },
1413 { "file_thp", NR_FILE_THPS },
1414 { "shmem_thp", NR_SHMEM_THPS },
1415#endif
1416 { "inactive_anon", NR_INACTIVE_ANON },
1417 { "active_anon", NR_ACTIVE_ANON },
1418 { "inactive_file", NR_INACTIVE_FILE },
1419 { "active_file", NR_ACTIVE_FILE },
1420 { "unevictable", NR_UNEVICTABLE },
1421 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
1422 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
1423
1424
1425 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
1426 { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
1427 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
1428 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
1429 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
1430 { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
1431 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
1432};

/* Translate stat items to the correct unit for memory.stat output */
1435static int memcg_page_state_unit(int item)
1436{
1437 switch (item) {
1438 case MEMCG_PERCPU_B:
1439 case MEMCG_ZSWAP_B:
1440 case NR_SLAB_RECLAIMABLE_B:
1441 case NR_SLAB_UNRECLAIMABLE_B:
1442 case WORKINGSET_REFAULT_ANON:
1443 case WORKINGSET_REFAULT_FILE:
1444 case WORKINGSET_ACTIVATE_ANON:
1445 case WORKINGSET_ACTIVATE_FILE:
1446 case WORKINGSET_RESTORE_ANON:
1447 case WORKINGSET_RESTORE_FILE:
1448 case WORKINGSET_NODERECLAIM:
1449 return 1;
1450 case NR_KERNEL_STACK_KB:
1451 return SZ_1K;
1452 default:
1453 return PAGE_SIZE;
1454 }
1455}
1456
1457static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
1458 int item)
1459{
1460 return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
1461}
1462
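/*
 * Format the memory.stat contents of @memcg into a freshly allocated buffer.
 * Returns NULL on allocation failure; otherwise the caller must kfree() the
 * returned buffer.
 */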
1463static char *memory_stat_format(struct mem_cgroup *memcg)
1464{
1465 struct seq_buf s;
1466 int i;
1467
1468 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1469 if (!s.buffer)
1470 return NULL;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
1482 mem_cgroup_flush_stats();
1483
1484 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1485 u64 size;
1486
1487 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1488 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1489
1490 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1491 size += memcg_page_state_output(memcg,
1492 NR_SLAB_RECLAIMABLE_B);
1493 seq_buf_printf(&s, "slab %llu\n", size);
1494 }
1495 }
1496
1497
1498
1499 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1500 memcg_events(memcg, PGFAULT));
1501 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1502 memcg_events(memcg, PGMAJFAULT));
1503 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1504 memcg_events(memcg, PGREFILL));
1505 seq_buf_printf(&s, "pgscan %lu\n",
1506 memcg_events(memcg, PGSCAN_KSWAPD) +
1507 memcg_events(memcg, PGSCAN_DIRECT));
1508 seq_buf_printf(&s, "pgsteal %lu\n",
1509 memcg_events(memcg, PGSTEAL_KSWAPD) +
1510 memcg_events(memcg, PGSTEAL_DIRECT));
1511 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1512 memcg_events(memcg, PGACTIVATE));
1513 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1514 memcg_events(memcg, PGDEACTIVATE));
1515 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1516 memcg_events(memcg, PGLAZYFREE));
1517 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1518 memcg_events(memcg, PGLAZYFREED));
1519
1520#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
1521 seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
1522 memcg_events(memcg, ZSWPIN));
1523 seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
1524 memcg_events(memcg, ZSWPOUT));
1525#endif
1526
1527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1528 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1529 memcg_events(memcg, THP_FAULT_ALLOC));
1530 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1531 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1532#endif
1533
1534
1535 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1536
1537 return s.buffer;
1538}
1539
1540#define K(x) ((x) << (PAGE_SHIFT-10))

/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
1550void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1551{
1552 rcu_read_lock();
1553
1554 if (memcg) {
1555 pr_cont(",oom_memcg=");
1556 pr_cont_cgroup_path(memcg->css.cgroup);
1557 } else
1558 pr_cont(",global_oom");
1559 if (p) {
1560 pr_cont(",task_memcg=");
1561 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1562 }
1563 rcu_read_unlock();
1564}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
1571void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1572{
1573 char *buf;
1574
1575 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1576 K((u64)page_counter_read(&memcg->memory)),
1577 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1578 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1579 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1580 K((u64)page_counter_read(&memcg->swap)),
1581 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1582 else {
1583 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1584 K((u64)page_counter_read(&memcg->memsw)),
1585 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1586 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1587 K((u64)page_counter_read(&memcg->kmem)),
1588 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1589 }
1590
1591 pr_info("Memory cgroup stats for ");
1592 pr_cont_cgroup_path(memcg->css.cgroup);
1593 pr_cont(":");
1594 buf = memory_stat_format(memcg);
1595 if (!buf)
1596 return;
1597 pr_info("%s", buf);
1598 kfree(buf);
1599}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
1604unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1605{
1606 unsigned long max = READ_ONCE(memcg->memory.max);
1607
1608 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1609 if (mem_cgroup_swappiness(memcg))
1610 max += min(READ_ONCE(memcg->swap.max),
1611 (unsigned long)total_swap_pages);
1612 } else {
1613 if (mem_cgroup_swappiness(memcg)) {
1614
1615 unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1616
1617 max += min(swap, (unsigned long)total_swap_pages);
1618 }
1619 }
1620 return max;
1621}
1622
1623unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1624{
1625 return page_counter_read(&memcg->memory);
1626}
1627
1628static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1629 int order)
1630{
1631 struct oom_control oc = {
1632 .zonelist = NULL,
1633 .nodemask = NULL,
1634 .memcg = memcg,
1635 .gfp_mask = gfp_mask,
1636 .order = order,
1637 };
1638 bool ret = true;
1639
1640 if (mutex_lock_killable(&oom_lock))
1641 return true;
1642
1643 if (mem_cgroup_margin(memcg) >= (1 << order))
1644 goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out.  Therefore, check again after holding oom_lock.
	 */
1650 ret = task_is_dying() || out_of_memory(&oc);
1651
1652unlock:
1653 mutex_unlock(&oom_lock);
1654 return ret;
1655}
1656
1657static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1658 pg_data_t *pgdat,
1659 gfp_t gfp_mask,
1660 unsigned long *total_scanned)
1661{
1662 struct mem_cgroup *victim = NULL;
1663 int total = 0;
1664 int loop = 0;
1665 unsigned long excess;
1666 unsigned long nr_scanned;
1667 struct mem_cgroup_reclaim_cookie reclaim = {
1668 .pgdat = pgdat,
1669 };
1670
1671 excess = soft_limit_excess(root_memcg);
1672
1673 while (1) {
1674 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1675 if (!victim) {
1676 loop++;
1677 if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
1683 if (!total)
1684 break;
1685
				/*
				 * We want to do more targeted reclaim:
				 * excess >> 2 is not too much to reclaim,
				 * nor so little that we keep coming back
				 * to reclaim from this cgroup forever.
				 */
1691 if (total >= (excess >> 2) ||
1692 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1693 break;
1694 }
1695 continue;
1696 }
1697 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1698 pgdat, &nr_scanned);
1699 *total_scanned += nr_scanned;
1700 if (!soft_limit_excess(root_memcg))
1701 break;
1702 }
1703 mem_cgroup_iter_break(root_memcg, victim);
1704 return total;
1705}
1706
1707#ifdef CONFIG_LOCKDEP
1708static struct lockdep_map memcg_oom_lock_dep_map = {
1709 .name = "memcg_oom_lock",
1710};
1711#endif
1712
1713static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
1719static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1720{
1721 struct mem_cgroup *iter, *failed = NULL;
1722
1723 spin_lock(&memcg_oom_lock);
1724
1725 for_each_mem_cgroup_tree(iter, memcg) {
1726 if (iter->oom_lock) {
1727
1728
1729
1730
1731 failed = iter;
1732 mem_cgroup_iter_break(memcg, iter);
1733 break;
1734 } else
1735 iter->oom_lock = true;
1736 }
1737
1738 if (failed) {
1739
1740
1741
1742
1743 for_each_mem_cgroup_tree(iter, memcg) {
1744 if (iter == failed) {
1745 mem_cgroup_iter_break(memcg, iter);
1746 break;
1747 }
1748 iter->oom_lock = false;
1749 }
1750 } else
1751 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1752
1753 spin_unlock(&memcg_oom_lock);
1754
1755 return !failed;
1756}
1757
1758static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1759{
1760 struct mem_cgroup *iter;
1761
1762 spin_lock(&memcg_oom_lock);
1763 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1764 for_each_mem_cgroup_tree(iter, memcg)
1765 iter->oom_lock = false;
1766 spin_unlock(&memcg_oom_lock);
1767}
1768
1769static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1770{
1771 struct mem_cgroup *iter;
1772
1773 spin_lock(&memcg_oom_lock);
1774 for_each_mem_cgroup_tree(iter, memcg)
1775 iter->under_oom++;
1776 spin_unlock(&memcg_oom_lock);
1777}
1778
1779static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1780{
1781 struct mem_cgroup *iter;
1782
1783
1784
1785
1786
1787 spin_lock(&memcg_oom_lock);
1788 for_each_mem_cgroup_tree(iter, memcg)
1789 if (iter->under_oom > 0)
1790 iter->under_oom--;
1791 spin_unlock(&memcg_oom_lock);
1792}
1793
1794static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1795
1796struct oom_wait_info {
1797 struct mem_cgroup *memcg;
1798 wait_queue_entry_t wait;
1799};
1800
1801static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1802 unsigned mode, int sync, void *arg)
1803{
1804 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1805 struct mem_cgroup *oom_wait_memcg;
1806 struct oom_wait_info *oom_wait_info;
1807
1808 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1809 oom_wait_memcg = oom_wait_info->memcg;
1810
1811 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1812 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1813 return 0;
1814 return autoremove_wake_function(wait, mode, sync, arg);
1815}
1816
1817static void memcg_oom_recover(struct mem_cgroup *memcg)
1818{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM.  This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
1827 if (memcg && memcg->under_oom)
1828 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1829}

/*
 * Returns true if successfully killed one or more processes.  Though in some
 * corner cases it can return true even without killing any process.
 */
1835static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1836{
1837 bool locked, ret;
1838
1839 if (order > PAGE_ALLOC_COSTLY_ORDER)
1840 return false;
1841
1842 memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async
	 * victim memory reclaim (oom_reaper), which means we are not solely
	 * relying on the oom victim to make forward progress and can invoke
	 * the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
1862 if (memcg->oom_kill_disable) {
1863 if (current->in_user_fault) {
1864 css_get(&memcg->css);
1865 current->memcg_in_oom = memcg;
1866 current->memcg_oom_gfp_mask = mask;
1867 current->memcg_oom_order = order;
1868 }
1869 return false;
1870 }
1871
1872 mem_cgroup_mark_under_oom(memcg);
1873
1874 locked = mem_cgroup_oom_trylock(memcg);
1875
1876 if (locked)
1877 mem_cgroup_oom_notify(memcg);
1878
1879 mem_cgroup_unmark_under_oom(memcg);
1880 ret = mem_cgroup_out_of_memory(memcg, mask, order);
1881
1882 if (locked)
1883 mem_cgroup_oom_unlock(memcg);
1884
1885 return ret;
1886}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
1905bool mem_cgroup_oom_synchronize(bool handle)
1906{
1907 struct mem_cgroup *memcg = current->memcg_in_oom;
1908 struct oom_wait_info owait;
1909 bool locked;
1910
1911
1912 if (!memcg)
1913 return false;
1914
1915 if (!handle)
1916 goto cleanup;
1917
1918 owait.memcg = memcg;
1919 owait.wait.flags = 0;
1920 owait.wait.func = memcg_oom_wake_function;
1921 owait.wait.private = current;
1922 INIT_LIST_HEAD(&owait.wait.entry);
1923
1924 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1925 mem_cgroup_mark_under_oom(memcg);
1926
1927 locked = mem_cgroup_oom_trylock(memcg);
1928
1929 if (locked)
1930 mem_cgroup_oom_notify(memcg);
1931
1932 if (locked && !memcg->oom_kill_disable) {
1933 mem_cgroup_unmark_under_oom(memcg);
1934 finish_wait(&memcg_oom_waitq, &owait.wait);
1935 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1936 current->memcg_oom_order);
1937 } else {
1938 schedule();
1939 mem_cgroup_unmark_under_oom(memcg);
1940 finish_wait(&memcg_oom_waitq, &owait.wait);
1941 }
1942
1943 if (locked) {
1944 mem_cgroup_oom_unlock(memcg);
1945
1946
1947
1948
1949
1950 memcg_oom_recover(memcg);
1951 }
1952cleanup:
1953 current->memcg_in_oom = NULL;
1954 css_put(&memcg->css);
1955 return true;
1956}

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
1968struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1969 struct mem_cgroup *oom_domain)
1970{
1971 struct mem_cgroup *oom_group = NULL;
1972 struct mem_cgroup *memcg;
1973
1974 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1975 return NULL;
1976
1977 if (!oom_domain)
1978 oom_domain = root_mem_cgroup;
1979
1980 rcu_read_lock();
1981
1982 memcg = mem_cgroup_from_task(victim);
1983 if (memcg == root_mem_cgroup)
1984 goto out;
1985
1986
1987
1988
1989
1990
1991 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1992 goto out;
1993
1994
1995
1996
1997
1998
1999 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2000 if (memcg->oom_group)
2001 oom_group = memcg;
2002
2003 if (memcg == oom_domain)
2004 break;
2005 }
2006
2007 if (oom_group)
2008 css_get(&oom_group->css);
2009out:
2010 rcu_read_unlock();
2011
2012 return oom_group;
2013}
2014
2015void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2016{
2017 pr_info("Tasks in ");
2018 pr_cont_cgroup_path(memcg->css.cgroup);
2019 pr_cont(" are going to be killed due to memory.oom.group set\n");
2020}

/**
 * folio_memcg_lock - Bind a folio to its memcg.
 * @folio: The folio.
 *
 * This function prevents unlocked LRU folios from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the bound memcg.  The caller is responsible
 * for the lifetime of the folio.
 */
2032void folio_memcg_lock(struct folio *folio)
2033{
2034 struct mem_cgroup *memcg;
2035 unsigned long flags;
2036
2037
2038
2039
2040
2041
2042 rcu_read_lock();
2043
2044 if (mem_cgroup_disabled())
2045 return;
2046again:
2047 memcg = folio_memcg(folio);
2048 if (unlikely(!memcg))
2049 return;
2050
2051#ifdef CONFIG_PROVE_LOCKING
2052 local_irq_save(flags);
2053 might_lock(&memcg->move_lock);
2054 local_irq_restore(flags);
2055#endif
2056
2057 if (atomic_read(&memcg->moving_account) <= 0)
2058 return;
2059
2060 spin_lock_irqsave(&memcg->move_lock, flags);
2061 if (memcg != folio_memcg(folio)) {
2062 spin_unlock_irqrestore(&memcg->move_lock, flags);
2063 goto again;
2064 }
2065
2066
2067
2068
2069
2070
2071
2072 memcg->move_lock_task = current;
2073 memcg->move_lock_flags = flags;
2074}
2075
2076void lock_page_memcg(struct page *page)
2077{
2078 folio_memcg_lock(page_folio(page));
2079}
2080
2081static void __folio_memcg_unlock(struct mem_cgroup *memcg)
2082{
2083 if (memcg && memcg->move_lock_task == current) {
2084 unsigned long flags = memcg->move_lock_flags;
2085
2086 memcg->move_lock_task = NULL;
2087 memcg->move_lock_flags = 0;
2088
2089 spin_unlock_irqrestore(&memcg->move_lock, flags);
2090 }
2091
2092 rcu_read_unlock();
2093}

/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * This releases the binding created by folio_memcg_lock().  This does
 * not change the accounting of this folio to its memcg, but it does
 * permit others to change it.
 */
2103void folio_memcg_unlock(struct folio *folio)
2104{
2105 __folio_memcg_unlock(folio_memcg(folio));
2106}
2107
2108void unlock_page_memcg(struct page *page)
2109{
2110 folio_memcg_unlock(page_folio(page));
2111}
2112
2113struct memcg_stock_pcp {
2114 local_lock_t stock_lock;
2115 struct mem_cgroup *cached;
2116 unsigned int nr_pages;
2117
2118#ifdef CONFIG_MEMCG_KMEM
2119 struct obj_cgroup *cached_objcg;
2120 struct pglist_data *cached_pgdat;
2121 unsigned int nr_bytes;
2122 int nr_slab_reclaimable_b;
2123 int nr_slab_unreclaimable_b;
2124#endif
2125
2126 struct work_struct work;
2127 unsigned long flags;
2128#define FLUSHING_CACHED_CHARGE 0
2129};
2130static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
2131 .stock_lock = INIT_LOCAL_LOCK(stock_lock),
2132};
2133static DEFINE_MUTEX(percpu_charge_mutex);
2134
2135#ifdef CONFIG_MEMCG_KMEM
2136static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
2137static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2138 struct mem_cgroup *root_memcg);
2139static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
2140
2141#else
2142static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
2143{
2144 return NULL;
2145}
2146static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2147 struct mem_cgroup *root_memcg)
2148{
2149 return false;
2150}
2151static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2152{
2153}
2154#endif

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
2167static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2168{
2169 struct memcg_stock_pcp *stock;
2170 unsigned long flags;
2171 bool ret = false;
2172
2173 if (nr_pages > MEMCG_CHARGE_BATCH)
2174 return ret;
2175
2176 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2177
2178 stock = this_cpu_ptr(&memcg_stock);
2179 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2180 stock->nr_pages -= nr_pages;
2181 ret = true;
2182 }
2183
2184 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2185
2186 return ret;
2187}

/*
 * Return the stocked charge to the counters of the cached memcg and
 * reset the cached information.
 */
2192static void drain_stock(struct memcg_stock_pcp *stock)
2193{
2194 struct mem_cgroup *old = stock->cached;
2195
2196 if (!old)
2197 return;
2198
2199 if (stock->nr_pages) {
2200 page_counter_uncharge(&old->memory, stock->nr_pages);
2201 if (do_memsw_account())
2202 page_counter_uncharge(&old->memsw, stock->nr_pages);
2203 stock->nr_pages = 0;
2204 }
2205
2206 css_put(&old->css);
2207 stock->cached = NULL;
2208}
2209
2210static void drain_local_stock(struct work_struct *dummy)
2211{
2212 struct memcg_stock_pcp *stock;
2213 struct obj_cgroup *old = NULL;
2214 unsigned long flags;
2215
2216
2217
2218
2219
2220
2221 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2222
2223 stock = this_cpu_ptr(&memcg_stock);
2224 old = drain_obj_stock(stock);
2225 drain_stock(stock);
2226 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2227
2228 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2229 if (old)
2230 obj_cgroup_put(old);
2231}

/*
 * Cache charges in the local per-cpu stock, to be consumed later by
 * consume_stock().  The caller must hold memcg_stock.stock_lock.
 */
2237static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2238{
2239 struct memcg_stock_pcp *stock;
2240
2241 stock = this_cpu_ptr(&memcg_stock);
2242 if (stock->cached != memcg) {
2243 drain_stock(stock);
2244 css_get(&memcg->css);
2245 stock->cached = memcg;
2246 }
2247 stock->nr_pages += nr_pages;
2248
2249 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2250 drain_stock(stock);
2251}
2252
2253static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2254{
2255 unsigned long flags;
2256
2257 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2258 __refill_stock(memcg, nr_pages);
2259 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2260}

/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
2266static void drain_all_stock(struct mem_cgroup *root_memcg)
2267{
2268 int cpu, curcpu;
2269
2270
2271 if (!mutex_trylock(&percpu_charge_mutex))
2272 return;
2273
2274
2275
2276
2277
2278
2279 migrate_disable();
2280 curcpu = smp_processor_id();
2281 for_each_online_cpu(cpu) {
2282 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2283 struct mem_cgroup *memcg;
2284 bool flush = false;
2285
2286 rcu_read_lock();
2287 memcg = stock->cached;
2288 if (memcg && stock->nr_pages &&
2289 mem_cgroup_is_descendant(memcg, root_memcg))
2290 flush = true;
2291 else if (obj_stock_flush_required(stock, root_memcg))
2292 flush = true;
2293 rcu_read_unlock();
2294
2295 if (flush &&
2296 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2297 if (cpu == curcpu)
2298 drain_local_stock(&stock->work);
2299 else
2300 schedule_work_on(cpu, &stock->work);
2301 }
2302 }
2303 migrate_enable();
2304 mutex_unlock(&percpu_charge_mutex);
2305}
2306
2307static int memcg_hotplug_cpu_dead(unsigned int cpu)
2308{
2309 struct memcg_stock_pcp *stock;
2310
2311 stock = &per_cpu(memcg_stock, cpu);
2312 drain_stock(stock);
2313
2314 return 0;
2315}
2316
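/*
 * Walk @memcg and its ancestors and try to reclaim @nr_pages from every
 * level that is above its memory.high limit.  Returns the total number of
 * pages reclaimed.
 */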
2317static unsigned long reclaim_high(struct mem_cgroup *memcg,
2318 unsigned int nr_pages,
2319 gfp_t gfp_mask)
2320{
2321 unsigned long nr_reclaimed = 0;
2322
2323 do {
2324 unsigned long pflags;
2325
2326 if (page_counter_read(&memcg->memory) <=
2327 READ_ONCE(memcg->memory.high))
2328 continue;
2329
2330 memcg_memory_event(memcg, MEMCG_HIGH);
2331
2332 psi_memstall_enter(&pflags);
2333 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2334 gfp_mask, true);
2335 psi_memstall_leave(&pflags);
2336 } while ((memcg = parent_mem_cgroup(memcg)) &&
2337 !mem_cgroup_is_root(memcg));
2338
2339 return nr_reclaimed;
2340}
2341
2342static void high_work_func(struct work_struct *work)
2343{
2344 struct mem_cgroup *memcg;
2345
2346 memcg = container_of(work, struct mem_cgroup, high_work);
2347 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2348}

/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds.  This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
2355#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these either side of the exponentiation
 * to maintain precision and scale to a reasonable number of jiffies:
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies,
 *   and to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially.
 */
#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14
2402
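/*
 * Express how far @usage is above @high as a fixed-point ratio with
 * MEMCG_DELAY_PRECISION_SHIFT bits of precision.  Returns 0 when usage is
 * at or below the high limit.
 */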
2403static u64 calculate_overage(unsigned long usage, unsigned long high)
2404{
2405 u64 overage;
2406
2407 if (usage <= high)
2408 return 0;
2409
2410
2411
2412
2413
2414 high = max(high, 1UL);
2415
2416 overage = usage - high;
2417 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2418 return div64_u64(overage, high);
2419}
2420
2421static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2422{
2423 u64 overage, max_overage = 0;
2424
2425 do {
2426 overage = calculate_overage(page_counter_read(&memcg->memory),
2427 READ_ONCE(memcg->memory.high));
2428 max_overage = max(overage, max_overage);
2429 } while ((memcg = parent_mem_cgroup(memcg)) &&
2430 !mem_cgroup_is_root(memcg));
2431
2432 return max_overage;
2433}
2434
2435static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2436{
2437 u64 overage, max_overage = 0;
2438
2439 do {
2440 overage = calculate_overage(page_counter_read(&memcg->swap),
2441 READ_ONCE(memcg->swap.high));
2442 if (overage)
2443 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2444 max_overage = max(overage, max_overage);
2445 } while ((memcg = parent_mem_cgroup(memcg)) &&
2446 !mem_cgroup_is_root(memcg));
2447
2448 return max_overage;
2449}
2450
2451
2452
2453
2454
2455static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2456 unsigned int nr_pages,
2457 u64 max_overage)
2458{
2459 unsigned long penalty_jiffies;
2460
2461 if (!max_overage)
2462 return 0;
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472 penalty_jiffies = max_overage * max_overage * HZ;
2473 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2474 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2485}

/*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 */
2491void mem_cgroup_handle_over_high(void)
2492{
2493 unsigned long penalty_jiffies;
2494 unsigned long pflags;
2495 unsigned long nr_reclaimed;
2496 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2497 int nr_retries = MAX_RECLAIM_RETRIES;
2498 struct mem_cgroup *memcg;
2499 bool in_retry = false;
2500
2501 if (likely(!nr_pages))
2502 return;
2503
2504 memcg = get_mem_cgroup_from_mm(current->mm);
2505 current->memcg_nr_pages_over_high = 0;
2506
2507retry_reclaim:
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517 nr_reclaimed = reclaim_high(memcg,
2518 in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2519 GFP_KERNEL);
2520
2521
2522
2523
2524
2525 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2526 mem_find_max_overage(memcg));
2527
2528 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2529 swap_find_max_overage(memcg));
2530
2531
2532
2533
2534
2535
2536 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2537
2538
2539
2540
2541
2542
2543
2544 if (penalty_jiffies <= HZ / 100)
2545 goto out;
2546
2547
2548
2549
2550
2551
2552 if (nr_reclaimed || nr_retries--) {
2553 in_retry = true;
2554 goto retry_reclaim;
2555 }
2556
2557
2558
2559
2560
2561
2562 psi_memstall_enter(&pflags);
2563 schedule_timeout_killable(penalty_jiffies);
2564 psi_memstall_leave(&pflags);
2565
2566out:
2567 css_put(&memcg->css);
2568}
2569
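/*
 * The core charge routine: consume the per-cpu stock if possible, otherwise
 * try the page counters, and on failure fall back to direct reclaim,
 * draining of the per-cpu stocks and finally the memcg OOM killer.  Excess
 * over memory.high is recorded and throttled on return to userspace.
 */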
2570static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2571 unsigned int nr_pages)
2572{
2573 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2574 int nr_retries = MAX_RECLAIM_RETRIES;
2575 struct mem_cgroup *mem_over_limit;
2576 struct page_counter *counter;
2577 unsigned long nr_reclaimed;
2578 bool passed_oom = false;
2579 bool may_swap = true;
2580 bool drained = false;
2581 unsigned long pflags;
2582
2583retry:
2584 if (consume_stock(memcg, nr_pages))
2585 return 0;
2586
2587 if (!do_memsw_account() ||
2588 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2589 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2590 goto done_restock;
2591 if (do_memsw_account())
2592 page_counter_uncharge(&memcg->memsw, batch);
2593 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2594 } else {
2595 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2596 may_swap = false;
2597 }
2598
2599 if (batch > nr_pages) {
2600 batch = nr_pages;
2601 goto retry;
2602 }

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory.  This might exceed the limits temporarily, but we
	 * prefer facilitating memory reclaim and getting back under the limit
	 * over triggering OOM kills in these cases.
	 */
2610 if (unlikely(current->flags & PF_MEMALLOC))
2611 goto force;
2612
2613 if (unlikely(task_in_memcg_oom(current)))
2614 goto nomem;
2615
2616 if (!gfpflags_allow_blocking(gfp_mask))
2617 goto nomem;
2618
2619 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2620
2621 psi_memstall_enter(&pflags);
2622 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2623 gfp_mask, may_swap);
2624 psi_memstall_leave(&pflags);
2625
2626 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2627 goto retry;
2628
2629 if (!drained) {
2630 drain_all_stock(mem_over_limit);
2631 drained = true;
2632 goto retry;
2633 }
2634
2635 if (gfp_mask & __GFP_NORETRY)
2636 goto nomem;
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2647 goto retry;
2648
2649
2650
2651
2652 if (mem_cgroup_wait_acct_move(mem_over_limit))
2653 goto retry;
2654
2655 if (nr_retries--)
2656 goto retry;
2657
2658 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2659 goto nomem;
2660
2661
2662 if (passed_oom && task_is_dying())
2663 goto nomem;
2664
2665
2666
2667
2668
2669
2670 if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2671 get_order(nr_pages * PAGE_SIZE))) {
2672 passed_oom = true;
2673 nr_retries = MAX_RECLAIM_RETRIES;
2674 goto retry;
2675 }
2676nomem:
	/*
	 * Memcg has no dedicated reserve for atomic allocations, so
	 * __GFP_HIGH and __GFP_NOFAIL requests may overshoot the limit
	 * rather than fail outright.
	 */
2683 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2684 return -ENOMEM;
2685force:
	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon.  Allow memory usage to go over the limit
	 * temporarily by force charging it.
	 */
2691 page_counter_charge(&memcg->memory, nr_pages);
2692 if (do_memsw_account())
2693 page_counter_charge(&memcg->memsw, nr_pages);
2694
2695 return 0;
2696
2697done_restock:
2698 if (batch > nr_pages)
2699 refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland.  We can perform reclaim here
	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime.  As the high limit is checked again before
	 * reclaim, the cost of a mismatch is negligible.
	 */
2710 do {
2711 bool mem_high, swap_high;
2712
2713 mem_high = page_counter_read(&memcg->memory) >
2714 READ_ONCE(memcg->memory.high);
2715 swap_high = page_counter_read(&memcg->swap) >
2716 READ_ONCE(memcg->swap.high);
2717
2718
2719 if (!in_task()) {
2720 if (mem_high) {
2721 schedule_work(&memcg->high_work);
2722 break;
2723 }
2724 continue;
2725 }
2726
2727 if (mem_high || swap_high) {
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737 current->memcg_nr_pages_over_high += batch;
2738 set_notify_resume(current);
2739 break;
2740 }
2741 } while ((memcg = parent_mem_cgroup(memcg)));
2742
2743 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2744 !(current->flags & PF_MEMALLOC) &&
2745 gfpflags_allow_blocking(gfp_mask)) {
2746 mem_cgroup_handle_over_high();
2747 }
2748 return 0;
2749}
2750
2751static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2752 unsigned int nr_pages)
2753{
2754 if (mem_cgroup_is_root(memcg))
2755 return 0;
2756
2757 return try_charge_memcg(memcg, gfp_mask, nr_pages);
2758}
2759
2760static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2761{
2762 if (mem_cgroup_is_root(memcg))
2763 return;
2764
2765 page_counter_uncharge(&memcg->memory, nr_pages);
2766 if (do_memsw_account())
2767 page_counter_uncharge(&memcg->memsw, nr_pages);
2768}
2769
2770static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2771{
2772 VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
	/*
	 * Any of the following ensures page's memcg stability:
	 *
	 * - the page lock
	 * - LRU isolation
	 * - lock_page_memcg()
	 * - exclusive reference
	 */
2781 folio->memcg_data = (unsigned long)memcg;
2782}
2783
2784#ifdef CONFIG_MEMCG_KMEM
/*
 * The allocated objcg pointers array is not accounted directly.
 * Moreover, it should not come from DMA buffer and is not readily
 * reclaimable. So those GFP bits should be masked off.
 */
2790#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2791
/*
 * mod_objcg_mlstate() may be called with irq enabled, so
 * mod_memcg_lruvec_state() should be used.
 */
2796static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
2797 struct pglist_data *pgdat,
2798 enum node_stat_item idx, int nr)
2799{
2800 struct mem_cgroup *memcg;
2801 struct lruvec *lruvec;
2802
2803 rcu_read_lock();
2804 memcg = obj_cgroup_memcg(objcg);
2805 lruvec = mem_cgroup_lruvec(memcg, pgdat);
2806 mod_memcg_lruvec_state(lruvec, idx, nr);
2807 rcu_read_unlock();
2808}
2809
2810int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
2811 gfp_t gfp, bool new_slab)
2812{
2813 unsigned int objects = objs_per_slab(s, slab);
2814 unsigned long memcg_data;
2815 void *vec;
2816
2817 gfp &= ~OBJCGS_CLEAR_MASK;
2818 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2819 slab_nid(slab));
2820 if (!vec)
2821 return -ENOMEM;
2822
2823 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2824 if (new_slab) {
		/*
		 * If the slab is brand new and nobody can yet access its
		 * memcg_data, no synchronization is required and memcg_data
		 * can be simply assigned.
		 */
2830 slab->memcg_data = memcg_data;
2831 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
		/*
		 * If the slab is already in use, somebody can allocate and
		 * assign obj_cgroups in parallel. In this case the existing
		 * objcg vector should be reused.
		 */
2837 kfree(vec);
2838 return 0;
2839 }
2840
2841 kmemleak_not_leak(vec);
2842 return 0;
2843}
2844
/*
 * Returns a pointer to the memory cgroup to which the kernel object is
 * charged.
 *
 * A passed kernel object can be a slab object or a generic kernel page, so
 * different mechanisms for getting the memory cgroup pointer should be used.
 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the
 * caller can not know for sure how the kernel object is implemented.
 * mem_cgroup_from_obj() can be safely used in such cases.
 *
 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
 * cgroup_mutex, etc.
 */
2857struct mem_cgroup *mem_cgroup_from_obj(void *p)
2858{
2859 struct folio *folio;
2860
2861 if (mem_cgroup_disabled())
2862 return NULL;
2863
2864 folio = virt_to_folio(p);
2865
	/*
	 * Slab objects are accounted individually, not per-page.  The
	 * slab's memcg_data points to a vector of obj_cgroup pointers,
	 * one per object, so look up the slot for this particular
	 * object.
	 */
2871 if (folio_test_slab(folio)) {
2872 struct obj_cgroup **objcgs;
2873 struct slab *slab;
2874 unsigned int off;
2875
2876 slab = folio_slab(folio);
2877 objcgs = slab_objcgs(slab);
2878 if (!objcgs)
2879 return NULL;
2880
2881 off = obj_to_index(slab->slab_cache, slab, p);
2882 if (objcgs[off])
2883 return obj_cgroup_memcg(objcgs[off]);
2884
2885 return NULL;
2886 }
2887
	/*
	 * This is a regular page, not a slab object.  page_memcg_check()
	 * is used because the page can be a kmem page, where memcg_data
	 * points to an obj_cgroup, or a page charged directly to a memcg,
	 * or a page that was never accounted at all (NULL).
	 */
2895 return page_memcg_check(folio_page(folio, 0));
2896}
2897
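/*
 * Walk up the memcg hierarchy from @memcg and return the nearest ancestor's
 * obj_cgroup for which a reference could be taken, or NULL if there is none
 * (e.g. when starting at the root).  Must be called under rcu_read_lock().
 */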
2898static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
2899{
2900 struct obj_cgroup *objcg = NULL;
2901
2902 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
2903 objcg = rcu_dereference(memcg->objcg);
2904 if (objcg && obj_cgroup_tryget(objcg))
2905 break;
2906 objcg = NULL;
2907 }
2908 return objcg;
2909}
2910
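/*
 * Return a referenced obj_cgroup that kernel allocations from the current
 * context should be charged to: the active memcg override if one is set,
 * otherwise current's memcg.  Returns NULL when kmem accounting is bypassed
 * for this context.
 */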
2911__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2912{
2913 struct obj_cgroup *objcg = NULL;
2914 struct mem_cgroup *memcg;
2915
2916 if (memcg_kmem_bypass())
2917 return NULL;
2918
2919 rcu_read_lock();
2920 if (unlikely(active_memcg()))
2921 memcg = active_memcg();
2922 else
2923 memcg = mem_cgroup_from_task(current);
2924 objcg = __get_obj_cgroup_from_memcg(memcg);
2925 rcu_read_unlock();
2926 return objcg;
2927}
2928
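/*
 * Return a referenced obj_cgroup for @page: the objcg the page is charged to
 * if it is a kmem page, otherwise an objcg derived from the memcg owning the
 * page.  Returns NULL if kmem accounting is disabled or bypassed, or if the
 * page is not charged.
 */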
2929struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
2930{
2931 struct obj_cgroup *objcg;
2932
2933 if (!memcg_kmem_enabled() || memcg_kmem_bypass())
2934 return NULL;
2935
2936 if (PageMemcgKmem(page)) {
2937 objcg = __folio_objcg(page_folio(page));
2938 obj_cgroup_get(objcg);
2939 } else {
2940 struct mem_cgroup *memcg;
2941
2942 rcu_read_lock();
2943 memcg = __folio_memcg(page_folio(page));
2944 if (memcg)
2945 objcg = __get_obj_cgroup_from_memcg(memcg);
2946 else
2947 objcg = NULL;
2948 rcu_read_unlock();
2949 }
2950 return objcg;
2951}
2952
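/*
 * Update the MEMCG_KMEM statistic by @nr_pages (which may be negative) and,
 * on cgroup1, keep the dedicated kmem page counter in sync.
 */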
2953static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2954{
2955 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
2956 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
2957 if (nr_pages > 0)
2958 page_counter_charge(&memcg->kmem, nr_pages);
2959 else
2960 page_counter_uncharge(&memcg->kmem, -nr_pages);
2961 }
2962}
2963
/*
 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
 * @objcg: object cgroup to uncharge
 * @nr_pages: number of pages to uncharge
 */
2970static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2971 unsigned int nr_pages)
2972{
2973 struct mem_cgroup *memcg;
2974
2975 memcg = get_mem_cgroup_from_objcg(objcg);
2976
2977 memcg_account_kmem(memcg, -nr_pages);
2978 refill_stock(memcg, nr_pages);
2979
2980 css_put(&memcg->css);
2981}
2982
/*
 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
 * @objcg: object cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */
2991static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2992 unsigned int nr_pages)
2993{
2994 struct mem_cgroup *memcg;
2995 int ret;
2996
2997 memcg = get_mem_cgroup_from_objcg(objcg);
2998
2999 ret = try_charge_memcg(memcg, gfp, nr_pages);
3000 if (ret)
3001 goto out;
3002
3003 memcg_account_kmem(memcg, nr_pages);
3004out:
3005 css_put(&memcg->css);
3006
3007 return ret;
3008}
3009
/**
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
3018int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3019{
3020 struct obj_cgroup *objcg;
3021 int ret = 0;
3022
3023 objcg = get_obj_cgroup_from_current();
3024 if (objcg) {
3025 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
3026 if (!ret) {
3027 page->memcg_data = (unsigned long)objcg |
3028 MEMCG_DATA_KMEM;
3029 return 0;
3030 }
3031 obj_cgroup_put(objcg);
3032 }
3033 return ret;
3034}
3035
/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
3041void __memcg_kmem_uncharge_page(struct page *page, int order)
3042{
3043 struct folio *folio = page_folio(page);
3044 struct obj_cgroup *objcg;
3045 unsigned int nr_pages = 1 << order;
3046
3047 if (!folio_memcg_kmem(folio))
3048 return;
3049
3050 objcg = __folio_objcg(folio);
3051 obj_cgroup_uncharge_pages(objcg, nr_pages);
3052 folio->memcg_data = 0;
3053 obj_cgroup_put(objcg);
3054}
3055
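/*
 * Account slab vmstat changes for @objcg on @pgdat.  Deltas are batched in
 * the per-cpu object stock and only folded into the memcg/lruvec counters
 * once they exceed a page worth of bytes or the cached objcg or pgdat
 * changes.
 */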
3056void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
3057 enum node_stat_item idx, int nr)
3058{
3059 struct memcg_stock_pcp *stock;
3060 struct obj_cgroup *old = NULL;
3061 unsigned long flags;
3062 int *bytes;
3063
3064 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3065 stock = this_cpu_ptr(&memcg_stock);
3066
	/*
	 * Save vmstat data in stock and skip vmstat array update unless
	 * accumulating over a page of vmstat data or when pgdat or idx
	 * changes.
	 */
3072 if (stock->cached_objcg != objcg) {
3073 old = drain_obj_stock(stock);
3074 obj_cgroup_get(objcg);
3075 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3076 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3077 stock->cached_objcg = objcg;
3078 stock->cached_pgdat = pgdat;
3079 } else if (stock->cached_pgdat != pgdat) {
3080
3081 struct pglist_data *oldpg = stock->cached_pgdat;
3082
3083 if (stock->nr_slab_reclaimable_b) {
3084 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
3085 stock->nr_slab_reclaimable_b);
3086 stock->nr_slab_reclaimable_b = 0;
3087 }
3088 if (stock->nr_slab_unreclaimable_b) {
3089 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
3090 stock->nr_slab_unreclaimable_b);
3091 stock->nr_slab_unreclaimable_b = 0;
3092 }
3093 stock->cached_pgdat = pgdat;
3094 }
3095
3096 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
3097 : &stock->nr_slab_unreclaimable_b;
3098
3099
3100
3101
3102 if (!*bytes) {
3103 *bytes = nr;
3104 nr = 0;
3105 } else {
3106 *bytes += nr;
3107 if (abs(*bytes) > PAGE_SIZE) {
3108 nr = *bytes;
3109 *bytes = 0;
3110 } else {
3111 nr = 0;
3112 }
3113 }
3114 if (nr)
3115 mod_objcg_mlstate(objcg, pgdat, idx, nr);
3116
3117 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3118 if (old)
3119 obj_cgroup_put(old);
3120}
3121
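/*
 * Try to satisfy an object charge of @nr_bytes from the per-cpu stock.
 * Returns true on success, false if the stock is cached for a different
 * objcg or holds too few bytes.
 */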
3122static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3123{
3124 struct memcg_stock_pcp *stock;
3125 unsigned long flags;
3126 bool ret = false;
3127
3128 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3129
3130 stock = this_cpu_ptr(&memcg_stock);
3131 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3132 stock->nr_bytes -= nr_bytes;
3133 ret = true;
3134 }
3135
3136 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3137
3138 return ret;
3139}
3140
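/*
 * Flush the cached objcg state out of @stock: return whole pre-charged pages
 * to the memcg page stock, push leftover bytes to objcg->nr_charged_bytes
 * and fold any batched slab vmstat deltas.  Returns the old objcg, which the
 * caller must put with obj_cgroup_put() outside of the stock lock.
 */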
3141static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
3142{
3143 struct obj_cgroup *old = stock->cached_objcg;
3144
3145 if (!old)
3146 return NULL;
3147
3148 if (stock->nr_bytes) {
3149 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3150 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3151
3152 if (nr_pages) {
3153 struct mem_cgroup *memcg;
3154
3155 memcg = get_mem_cgroup_from_objcg(old);
3156
3157 memcg_account_kmem(memcg, -nr_pages);
3158 __refill_stock(memcg, nr_pages);
3159
3160 css_put(&memcg->css);
3161 }
3162
		/*
		 * The leftover is flushed to the centralized per-memcg value.
		 * On the next attempt to refill obj stock it will be moved
		 * to a per-cpu stock (probably, on an other CPU), see
		 * refill_obj_stock().
		 *
		 * How often it's flushed is a trade-off between the memory
		 * limit enforcement accuracy and potential CPU contention,
		 * so it might be changed in the future.
		 */
3173 atomic_add(nr_bytes, &old->nr_charged_bytes);
3174 stock->nr_bytes = 0;
3175 }
3176
3177
3178
3179
3180 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
3181 if (stock->nr_slab_reclaimable_b) {
3182 mod_objcg_mlstate(old, stock->cached_pgdat,
3183 NR_SLAB_RECLAIMABLE_B,
3184 stock->nr_slab_reclaimable_b);
3185 stock->nr_slab_reclaimable_b = 0;
3186 }
3187 if (stock->nr_slab_unreclaimable_b) {
3188 mod_objcg_mlstate(old, stock->cached_pgdat,
3189 NR_SLAB_UNRECLAIMABLE_B,
3190 stock->nr_slab_unreclaimable_b);
3191 stock->nr_slab_unreclaimable_b = 0;
3192 }
3193 stock->cached_pgdat = NULL;
3194 }
3195
3196 stock->cached_objcg = NULL;
3197
3198
3199
3200
3201 return old;
3202}
3203
3204static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3205 struct mem_cgroup *root_memcg)
3206{
3207 struct mem_cgroup *memcg;
3208
3209 if (stock->cached_objcg) {
3210 memcg = obj_cgroup_memcg(stock->cached_objcg);
3211 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3212 return true;
3213 }
3214
3215 return false;
3216}
3217
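/*
 * Add @nr_bytes of pre-charged bytes for @objcg to the per-cpu stock.  When
 * @allow_uncharge is true and more than a page worth of bytes has
 * accumulated, the whole pages are uncharged back to the memcg.
 */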
3218static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3219 bool allow_uncharge)
3220{
3221 struct memcg_stock_pcp *stock;
3222 struct obj_cgroup *old = NULL;
3223 unsigned long flags;
3224 unsigned int nr_pages = 0;
3225
3226 local_lock_irqsave(&memcg_stock.stock_lock, flags);
3227
3228 stock = this_cpu_ptr(&memcg_stock);
3229 if (stock->cached_objcg != objcg) {
3230 old = drain_obj_stock(stock);
3231 obj_cgroup_get(objcg);
3232 stock->cached_objcg = objcg;
3233 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3234 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3235 allow_uncharge = true;
3236 }
3237 stock->nr_bytes += nr_bytes;
3238
3239 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3240 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3241 stock->nr_bytes &= (PAGE_SIZE - 1);
3242 }
3243
3244 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3245 if (old)
3246 obj_cgroup_put(old);
3247
3248 if (nr_pages)
3249 obj_cgroup_uncharge_pages(objcg, nr_pages);
3250}
3251
3252int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3253{
3254 unsigned int nr_pages, nr_bytes;
3255 int ret;
3256
3257 if (consume_obj_stock(objcg, size))
3258 return 0;
3259
	/*
	 * In theory, objcg->nr_charged_bytes can have enough
	 * pre-charged bytes to satisfy the allocation. However,
	 * flushing objcg->nr_charged_bytes requires two atomic
	 * operations, and objcg->nr_charged_bytes can't be big.
	 * Leftover bytes are instead picked up lazily when the per-cpu
	 * stock switches to this objcg (see refill_obj_stock()).
	 *
	 * So charge at page granularity: round @size up to full pages,
	 * charge those to the memcg, and put the unused remainder of
	 * the last page into the per-cpu object stock, where later
	 * sub-page allocations can consume it without touching the
	 * page counters.
	 */
3283 nr_pages = size >> PAGE_SHIFT;
3284 nr_bytes = size & (PAGE_SIZE - 1);
3285
3286 if (nr_bytes)
3287 nr_pages += 1;
3288
3289 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3290 if (!ret && nr_bytes)
3291 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
3292
3293 return ret;
3294}
3295
3296void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3297{
3298 refill_obj_stock(objcg, size, true);
3299}
3300
3301#endif
3302
/*
 * Because page_memcg(head) is not set on tails, set it now.
 */
3306void split_page_memcg(struct page *head, unsigned int nr)
3307{
3308 struct folio *folio = page_folio(head);
3309 struct mem_cgroup *memcg = folio_memcg(folio);
3310 int i;
3311
3312 if (mem_cgroup_disabled() || !memcg)
3313 return;
3314
3315 for (i = 1; i < nr; i++)
3316 folio_page(folio, i)->memcg_data = folio->memcg_data;
3317
3318 if (folio_memcg_kmem(folio))
3319 obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
3320 else
3321 css_get_many(&memcg->css, nr - 1);
3322}
3323
3324#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge()
 * about both res and memsw, and called css_get().
 */
3339static int mem_cgroup_move_swap_account(swp_entry_t entry,
3340 struct mem_cgroup *from, struct mem_cgroup *to)
3341{
3342 unsigned short old_id, new_id;
3343
3344 old_id = mem_cgroup_id(from);
3345 new_id = mem_cgroup_id(to);
3346
3347 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3348 mod_memcg_state(from, MEMCG_SWAP, -1);
3349 mod_memcg_state(to, MEMCG_SWAP, 1);
3350 return 0;
3351 }
3352 return -EINVAL;
3353}
3354#else
3355static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3356 struct mem_cgroup *from, struct mem_cgroup *to)
3357{
3358 return -EINVAL;
3359}
3360#endif
3361
3362static DEFINE_MUTEX(memcg_max_mutex);
3363
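/*
 * Set a new hard limit (memory or memsw for cgroup1).  If the new limit is
 * below the current usage, reclaim is retried until the usage fits or stops
 * making progress, in which case -EBUSY is returned.
 */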
3364static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3365 unsigned long max, bool memsw)
3366{
3367 bool enlarge = false;
3368 bool drained = false;
3369 int ret;
3370 bool limits_invariant;
3371 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3372
3373 do {
3374 if (signal_pending(current)) {
3375 ret = -EINTR;
3376 break;
3377 }
3378
3379 mutex_lock(&memcg_max_mutex);
3380
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
3384 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3385 max <= memcg->memsw.max;
3386 if (!limits_invariant) {
3387 mutex_unlock(&memcg_max_mutex);
3388 ret = -EINVAL;
3389 break;
3390 }
3391 if (max > counter->max)
3392 enlarge = true;
3393 ret = page_counter_set_max(counter, max);
3394 mutex_unlock(&memcg_max_mutex);
3395
3396 if (!ret)
3397 break;
3398
3399 if (!drained) {
3400 drain_all_stock(memcg);
3401 drained = true;
3402 continue;
3403 }
3404
3405 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3406 GFP_KERNEL, !memsw)) {
3407 ret = -EBUSY;
3408 break;
3409 }
3410 } while (true);
3411
3412 if (!ret && enlarge)
3413 memcg_oom_recover(memcg);
3414
3415 return ret;
3416}
3417
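/*
 * Reclaim from the memcgs that exceed their soft limit the most on @pgdat,
 * as tracked by the per-node soft limit rb-tree.  Used by global reclaim;
 * returns the number of pages reclaimed.
 */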
3418unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3419 gfp_t gfp_mask,
3420 unsigned long *total_scanned)
3421{
3422 unsigned long nr_reclaimed = 0;
3423 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3424 unsigned long reclaimed;
3425 int loop = 0;
3426 struct mem_cgroup_tree_per_node *mctz;
3427 unsigned long excess;
3428
3429 if (order > 0)
3430 return 0;
3431
3432 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
3433
	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
3439 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3440 return 0;
3441
3442
3443
3444
3445
3446
3447 do {
3448 if (next_mz)
3449 mz = next_mz;
3450 else
3451 mz = mem_cgroup_largest_soft_limit_node(mctz);
3452 if (!mz)
3453 break;
3454
3455 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3456 gfp_mask, total_scanned);
3457 nr_reclaimed += reclaimed;
3458 spin_lock_irq(&mctz->lock);
3459
3460
3461
3462
3463
3464 next_mz = NULL;
3465 if (!reclaimed)
3466 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3467
3468 excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
3478 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3479 spin_unlock_irq(&mctz->lock);
3480 css_put(&mz->memcg->css);
3481 loop++;
3482
3483
3484
3485
3486
3487 if (!nr_reclaimed &&
3488 (next_mz == NULL ||
3489 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3490 break;
3491 } while (!nr_reclaimed);
3492 if (next_mz)
3493 css_put(&next_mz->memcg->css);
3494 return nr_reclaimed;
3495}
3496
/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
3502static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3503{
3504 int nr_retries = MAX_RECLAIM_RETRIES;
3505
3506
3507 lru_add_drain_all();
3508
3509 drain_all_stock(memcg);
3510
3511
3512 while (nr_retries && page_counter_read(&memcg->memory)) {
3513 if (signal_pending(current))
3514 return -EINTR;
3515
3516 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
3517 nr_retries--;
3518 }
3519
3520 return 0;
3521}
3522
3523static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3524 char *buf, size_t nbytes,
3525 loff_t off)
3526{
3527 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3528
3529 if (mem_cgroup_is_root(memcg))
3530 return -EINVAL;
3531 return mem_cgroup_force_empty(memcg) ?: nbytes;
3532}
3533
3534static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3535 struct cftype *cft)
3536{
3537 return 1;
3538}
3539
3540static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3541 struct cftype *cft, u64 val)
3542{
3543 if (val == 1)
3544 return 0;
3545
3546 pr_warn_once("Non-hierarchical mode is deprecated. "
3547 "Please report your usecase to linux-mm@kvack.org if you "
3548 "depend on this functionality.\n");
3549
3550 return -EINVAL;
3551}
3552
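/*
 * Current memory usage of @memcg in pages.  The root cgroup doesn't track
 * its own usage in the page counters, so it is approximated from the
 * aggregated file, anon and (optionally) swap statistics instead.
 */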
3553static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3554{
3555 unsigned long val;
3556
3557 if (mem_cgroup_is_root(memcg)) {
3558 mem_cgroup_flush_stats();
3559 val = memcg_page_state(memcg, NR_FILE_PAGES) +
3560 memcg_page_state(memcg, NR_ANON_MAPPED);
3561 if (swap)
3562 val += memcg_page_state(memcg, MEMCG_SWAP);
3563 } else {
3564 if (!swap)
3565 val = page_counter_read(&memcg->memory);
3566 else
3567 val = page_counter_read(&memcg->memsw);
3568 }
3569 return val;
3570}
3571
3572enum {
3573 RES_USAGE,
3574 RES_LIMIT,
3575 RES_MAX_USAGE,
3576 RES_FAILCNT,
3577 RES_SOFT_LIMIT,
3578};
3579
3580static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3581 struct cftype *cft)
3582{
3583 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3584 struct page_counter *counter;
3585
3586 switch (MEMFILE_TYPE(cft->private)) {
3587 case _MEM:
3588 counter = &memcg->memory;
3589 break;
3590 case _MEMSWAP:
3591 counter = &memcg->memsw;
3592 break;
3593 case _KMEM:
3594 counter = &memcg->kmem;
3595 break;
3596 case _TCP:
3597 counter = &memcg->tcpmem;
3598 break;
3599 default:
3600 BUG();
3601 }
3602
3603 switch (MEMFILE_ATTR(cft->private)) {
3604 case RES_USAGE:
3605 if (counter == &memcg->memory)
3606 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3607 if (counter == &memcg->memsw)
3608 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3609 return (u64)page_counter_read(counter) * PAGE_SIZE;
3610 case RES_LIMIT:
3611 return (u64)counter->max * PAGE_SIZE;
3612 case RES_MAX_USAGE:
3613 return (u64)counter->watermark * PAGE_SIZE;
3614 case RES_FAILCNT:
3615 return counter->failcnt;
3616 case RES_SOFT_LIMIT:
3617 return (u64)memcg->soft_limit * PAGE_SIZE;
3618 default:
3619 BUG();
3620 }
3621}
3622
3623#ifdef CONFIG_MEMCG_KMEM
3624static int memcg_online_kmem(struct mem_cgroup *memcg)
3625{
3626 struct obj_cgroup *objcg;
3627
3628 if (cgroup_memory_nokmem)
3629 return 0;
3630
3631 if (unlikely(mem_cgroup_is_root(memcg)))
3632 return 0;
3633
3634 objcg = obj_cgroup_alloc();
3635 if (!objcg)
3636 return -ENOMEM;
3637
3638 objcg->memcg = memcg;
3639 rcu_assign_pointer(memcg->objcg, objcg);
3640
3641 static_branch_enable(&memcg_kmem_enabled_key);
3642
3643 memcg->kmemcg_id = memcg->id.id;
3644
3645 return 0;
3646}
3647
3648static void memcg_offline_kmem(struct mem_cgroup *memcg)
3649{
3650 struct mem_cgroup *parent;
3651
3652 if (cgroup_memory_nokmem)
3653 return;
3654
3655 if (unlikely(mem_cgroup_is_root(memcg)))
3656 return;
3657
3658 parent = parent_mem_cgroup(memcg);
3659 if (!parent)
3660 parent = root_mem_cgroup;
3661
3662 memcg_reparent_objcgs(memcg, parent);
3663
3664
3665
3666
3667
3668
3669
3670 memcg_reparent_list_lrus(memcg, parent);
3671}
3672#else
3673static int memcg_online_kmem(struct mem_cgroup *memcg)
3674{
3675 return 0;
3676}
3677static void memcg_offline_kmem(struct mem_cgroup *memcg)
3678{
3679}
3680#endif
3681
3682static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3683{
3684 int ret;
3685
3686 mutex_lock(&memcg_max_mutex);
3687
3688 ret = page_counter_set_max(&memcg->tcpmem, max);
3689 if (ret)
3690 goto out;
3691
3692 if (!memcg->tcpmem_active) {
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709 static_branch_inc(&memcg_sockets_enabled_key);
3710 memcg->tcpmem_active = true;
3711 }
3712out:
3713 mutex_unlock(&memcg_max_mutex);
3714 return ret;
3715}
3716
3717
3718
3719
3720
3721static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3722 char *buf, size_t nbytes, loff_t off)
3723{
3724 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3725 unsigned long nr_pages;
3726 int ret;
3727
3728 buf = strstrip(buf);
3729 ret = page_counter_memparse(buf, "-1", &nr_pages);
3730 if (ret)
3731 return ret;
3732
3733 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3734 case RES_LIMIT:
3735 if (mem_cgroup_is_root(memcg)) {
3736 ret = -EINVAL;
3737 break;
3738 }
3739 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3740 case _MEM:
3741 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3742 break;
3743 case _MEMSWAP:
3744 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3745 break;
3746 case _KMEM:
3747
3748 ret = -EOPNOTSUPP;
3749 break;
3750 case _TCP:
3751 ret = memcg_update_tcp_max(memcg, nr_pages);
3752 break;
3753 }
3754 break;
3755 case RES_SOFT_LIMIT:
3756 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
3757 ret = -EOPNOTSUPP;
3758 } else {
3759 memcg->soft_limit = nr_pages;
3760 ret = 0;
3761 }
3762 break;
3763 }
3764 return ret ?: nbytes;
3765}
3766
3767static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3768 size_t nbytes, loff_t off)
3769{
3770 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3771 struct page_counter *counter;
3772
3773 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3774 case _MEM:
3775 counter = &memcg->memory;
3776 break;
3777 case _MEMSWAP:
3778 counter = &memcg->memsw;
3779 break;
3780 case _KMEM:
3781 counter = &memcg->kmem;
3782 break;
3783 case _TCP:
3784 counter = &memcg->tcpmem;
3785 break;
3786 default:
3787 BUG();
3788 }
3789
3790 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3791 case RES_MAX_USAGE:
3792 page_counter_reset_watermark(counter);
3793 break;
3794 case RES_FAILCNT:
3795 counter->failcnt = 0;
3796 break;
3797 default:
3798 BUG();
3799 }
3800
3801 return nbytes;
3802}
3803
3804static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3805 struct cftype *cft)
3806{
3807 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3808}
3809
3810#ifdef CONFIG_MMU
3811static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3812 struct cftype *cft, u64 val)
3813{
3814 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3815
3816 if (val & ~MOVE_MASK)
3817 return -EINVAL;
3818
3819
3820
3821
3822
3823
3824
3825 memcg->move_charge_at_immigrate = val;
3826 return 0;
3827}
3828#else
3829static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3830 struct cftype *cft, u64 val)
3831{
3832 return -ENOSYS;
3833}
3834#endif
3835
3836#ifdef CONFIG_NUMA
3837
3838#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3839#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3840#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3841
3842static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3843 int nid, unsigned int lru_mask, bool tree)
3844{
3845 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3846 unsigned long nr = 0;
3847 enum lru_list lru;
3848
3849 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3850
3851 for_each_lru(lru) {
3852 if (!(BIT(lru) & lru_mask))
3853 continue;
3854 if (tree)
3855 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3856 else
3857 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3858 }
3859 return nr;
3860}
3861
3862static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3863 unsigned int lru_mask,
3864 bool tree)
3865{
3866 unsigned long nr = 0;
3867 enum lru_list lru;
3868
3869 for_each_lru(lru) {
3870 if (!(BIT(lru) & lru_mask))
3871 continue;
3872 if (tree)
3873 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3874 else
3875 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3876 }
3877 return nr;
3878}
3879
3880static int memcg_numa_stat_show(struct seq_file *m, void *v)
3881{
3882 struct numa_stat {
3883 const char *name;
3884 unsigned int lru_mask;
3885 };
3886
3887 static const struct numa_stat stats[] = {
3888 { "total", LRU_ALL },
3889 { "file", LRU_ALL_FILE },
3890 { "anon", LRU_ALL_ANON },
3891 { "unevictable", BIT(LRU_UNEVICTABLE) },
3892 };
3893 const struct numa_stat *stat;
3894 int nid;
3895 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3896
3897 mem_cgroup_flush_stats();
3898
3899 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3900 seq_printf(m, "%s=%lu", stat->name,
3901 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3902 false));
3903 for_each_node_state(nid, N_MEMORY)
3904 seq_printf(m, " N%d=%lu", nid,
3905 mem_cgroup_node_nr_lru_pages(memcg, nid,
3906 stat->lru_mask, false));
3907 seq_putc(m, '\n');
3908 }
3909
3910 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3911
3912 seq_printf(m, "hierarchical_%s=%lu", stat->name,
3913 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3914 true));
3915 for_each_node_state(nid, N_MEMORY)
3916 seq_printf(m, " N%d=%lu", nid,
3917 mem_cgroup_node_nr_lru_pages(memcg, nid,
3918 stat->lru_mask, true));
3919 seq_putc(m, '\n');
3920 }
3921
3922 return 0;
3923}
3924#endif
3925
3926static const unsigned int memcg1_stats[] = {
3927 NR_FILE_PAGES,
3928 NR_ANON_MAPPED,
3929#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3930 NR_ANON_THPS,
3931#endif
3932 NR_SHMEM,
3933 NR_FILE_MAPPED,
3934 NR_FILE_DIRTY,
3935 NR_WRITEBACK,
3936 MEMCG_SWAP,
3937};
3938
3939static const char *const memcg1_stat_names[] = {
3940 "cache",
3941 "rss",
3942#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3943 "rss_huge",
3944#endif
3945 "shmem",
3946 "mapped_file",
3947 "dirty",
3948 "writeback",
3949 "swap",
3950};
3951
3952
3953static const unsigned int memcg1_events[] = {
3954 PGPGIN,
3955 PGPGOUT,
3956 PGFAULT,
3957 PGMAJFAULT,
3958};
3959
3960static int memcg_stat_show(struct seq_file *m, void *v)
3961{
3962 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3963 unsigned long memory, memsw;
3964 struct mem_cgroup *mi;
3965 unsigned int i;
3966
3967 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3968
3969 mem_cgroup_flush_stats();
3970
3971 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3972 unsigned long nr;
3973
3974 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3975 continue;
3976 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
3977 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
3978 }
3979
3980 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3981 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3982 memcg_events_local(memcg, memcg1_events[i]));
3983
3984 for (i = 0; i < NR_LRU_LISTS; i++)
3985 seq_printf(m, "%s %lu\n", lru_list_name(i),
3986 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3987 PAGE_SIZE);
3988
3989
3990 memory = memsw = PAGE_COUNTER_MAX;
3991 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3992 memory = min(memory, READ_ONCE(mi->memory.max));
3993 memsw = min(memsw, READ_ONCE(mi->memsw.max));
3994 }
3995 seq_printf(m, "hierarchical_memory_limit %llu\n",
3996 (u64)memory * PAGE_SIZE);
3997 if (do_memsw_account())
3998 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3999 (u64)memsw * PAGE_SIZE);
4000
4001 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4002 unsigned long nr;
4003
4004 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4005 continue;
4006 nr = memcg_page_state(memcg, memcg1_stats[i]);
4007 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4008 (u64)nr * PAGE_SIZE);
4009 }
4010
4011 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4012 seq_printf(m, "total_%s %llu\n",
4013 vm_event_name(memcg1_events[i]),
4014 (u64)memcg_events(memcg, memcg1_events[i]));
4015
4016 for (i = 0; i < NR_LRU_LISTS; i++)
4017 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4018 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4019 PAGE_SIZE);
4020
4021#ifdef CONFIG_DEBUG_VM
4022 {
4023 pg_data_t *pgdat;
4024 struct mem_cgroup_per_node *mz;
4025 unsigned long anon_cost = 0;
4026 unsigned long file_cost = 0;
4027
4028 for_each_online_pgdat(pgdat) {
4029 mz = memcg->nodeinfo[pgdat->node_id];
4030
4031 anon_cost += mz->lruvec.anon_cost;
4032 file_cost += mz->lruvec.file_cost;
4033 }
4034 seq_printf(m, "anon_cost %lu\n", anon_cost);
4035 seq_printf(m, "file_cost %lu\n", file_cost);
4036 }
4037#endif
4038
4039 return 0;
4040}
4041
4042static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4043 struct cftype *cft)
4044{
4045 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4046
4047 return mem_cgroup_swappiness(memcg);
4048}
4049
4050static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4051 struct cftype *cft, u64 val)
4052{
4053 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4054
4055 if (val > 200)
4056 return -EINVAL;
4057
4058 if (!mem_cgroup_is_root(memcg))
4059 memcg->swappiness = val;
4060 else
4061 vm_swappiness = val;
4062
4063 return 0;
4064}
4065
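/*
 * Check which usage thresholds have been crossed since the last check and
 * signal the corresponding eventfds, walking downwards and then upwards
 * from the last threshold known to be below the usage.
 */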
4066static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4067{
4068 struct mem_cgroup_threshold_ary *t;
4069 unsigned long usage;
4070 int i;
4071
4072 rcu_read_lock();
4073 if (!swap)
4074 t = rcu_dereference(memcg->thresholds.primary);
4075 else
4076 t = rcu_dereference(memcg->memsw_thresholds.primary);
4077
4078 if (!t)
4079 goto unlock;
4080
4081 usage = mem_cgroup_usage(memcg, swap);
4082
4083
4084
4085
4086
4087
4088 i = t->current_threshold;
4089
4090
4091
4092
4093
4094
4095
4096 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4097 eventfd_signal(t->entries[i].eventfd, 1);
4098
4099
4100 i++;
4101
4102
4103
4104
4105
4106
4107
4108 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4109 eventfd_signal(t->entries[i].eventfd, 1);
4110
4111
4112 t->current_threshold = i - 1;
4113unlock:
4114 rcu_read_unlock();
4115}
4116
4117static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4118{
4119 while (memcg) {
4120 __mem_cgroup_threshold(memcg, false);
4121 if (do_memsw_account())
4122 __mem_cgroup_threshold(memcg, true);
4123
4124 memcg = parent_mem_cgroup(memcg);
4125 }
4126}
4127
4128static int compare_thresholds(const void *a, const void *b)
4129{
4130 const struct mem_cgroup_threshold *_a = a;
4131 const struct mem_cgroup_threshold *_b = b;
4132
4133 if (_a->threshold > _b->threshold)
4134 return 1;
4135
4136 if (_a->threshold < _b->threshold)
4137 return -1;
4138
4139 return 0;
4140}
4141
4142static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4143{
4144 struct mem_cgroup_eventfd_list *ev;
4145
4146 spin_lock(&memcg_oom_lock);
4147
4148 list_for_each_entry(ev, &memcg->oom_notify, list)
4149 eventfd_signal(ev->eventfd, 1);
4150
4151 spin_unlock(&memcg_oom_lock);
4152 return 0;
4153}
4154
4155static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4156{
4157 struct mem_cgroup *iter;
4158
4159 for_each_mem_cgroup_tree(iter, memcg)
4160 mem_cgroup_oom_notify_cb(iter);
4161}
4162
4163static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4164 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4165{
4166 struct mem_cgroup_thresholds *thresholds;
4167 struct mem_cgroup_threshold_ary *new;
4168 unsigned long threshold;
4169 unsigned long usage;
4170 int i, size, ret;
4171
4172 ret = page_counter_memparse(args, "-1", &threshold);
4173 if (ret)
4174 return ret;
4175
4176 mutex_lock(&memcg->thresholds_lock);
4177
4178 if (type == _MEM) {
4179 thresholds = &memcg->thresholds;
4180 usage = mem_cgroup_usage(memcg, false);
4181 } else if (type == _MEMSWAP) {
4182 thresholds = &memcg->memsw_thresholds;
4183 usage = mem_cgroup_usage(memcg, true);
4184 } else
4185 BUG();
4186
4187
4188 if (thresholds->primary)
4189 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4190
4191 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4192
4193
4194 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4195 if (!new) {
4196 ret = -ENOMEM;
4197 goto unlock;
4198 }
4199 new->size = size;
4200
4201
4202 if (thresholds->primary)
4203 memcpy(new->entries, thresholds->primary->entries,
4204 flex_array_size(new, entries, size - 1));
4205
4206
4207 new->entries[size - 1].eventfd = eventfd;
4208 new->entries[size - 1].threshold = threshold;
4209
4210
4211 sort(new->entries, size, sizeof(*new->entries),
4212 compare_thresholds, NULL);
4213
4214
4215 new->current_threshold = -1;
4216 for (i = 0; i < size; i++) {
4217 if (new->entries[i].threshold <= usage) {
4218
4219
4220
4221
4222
4223 ++new->current_threshold;
4224 } else
4225 break;
4226 }
4227
4228
4229 kfree(thresholds->spare);
4230 thresholds->spare = thresholds->primary;
4231
4232 rcu_assign_pointer(thresholds->primary, new);
4233
4234
4235 synchronize_rcu();
4236
4237unlock:
4238 mutex_unlock(&memcg->thresholds_lock);
4239
4240 return ret;
4241}
4242
4243static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4244 struct eventfd_ctx *eventfd, const char *args)
4245{
4246 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4247}
4248
4249static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4250 struct eventfd_ctx *eventfd, const char *args)
4251{
4252 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4253}
4254
4255static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4256 struct eventfd_ctx *eventfd, enum res_type type)
4257{
4258 struct mem_cgroup_thresholds *thresholds;
4259 struct mem_cgroup_threshold_ary *new;
4260 unsigned long usage;
4261 int i, j, size, entries;
4262
4263 mutex_lock(&memcg->thresholds_lock);
4264
4265 if (type == _MEM) {
4266 thresholds = &memcg->thresholds;
4267 usage = mem_cgroup_usage(memcg, false);
4268 } else if (type == _MEMSWAP) {
4269 thresholds = &memcg->memsw_thresholds;
4270 usage = mem_cgroup_usage(memcg, true);
4271 } else
4272 BUG();
4273
4274 if (!thresholds->primary)
4275 goto unlock;
4276
4277
4278 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4279
4280
4281 size = entries = 0;
4282 for (i = 0; i < thresholds->primary->size; i++) {
4283 if (thresholds->primary->entries[i].eventfd != eventfd)
4284 size++;
4285 else
4286 entries++;
4287 }
4288
4289 new = thresholds->spare;
4290
4291
4292 if (!entries)
4293 goto unlock;
4294
4295
4296 if (!size) {
4297 kfree(new);
4298 new = NULL;
4299 goto swap_buffers;
4300 }
4301
4302 new->size = size;
4303
4304
4305 new->current_threshold = -1;
4306 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4307 if (thresholds->primary->entries[i].eventfd == eventfd)
4308 continue;
4309
4310 new->entries[j] = thresholds->primary->entries[i];
4311 if (new->entries[j].threshold <= usage) {
4312
4313
4314
4315
4316
4317 ++new->current_threshold;
4318 }
4319 j++;
4320 }
4321
4322swap_buffers:
4323
4324 thresholds->spare = thresholds->primary;
4325
4326 rcu_assign_pointer(thresholds->primary, new);
4327
4328
4329 synchronize_rcu();
4330
4331
4332 if (!new) {
4333 kfree(thresholds->spare);
4334 thresholds->spare = NULL;
4335 }
4336unlock:
4337 mutex_unlock(&memcg->thresholds_lock);
4338}
4339
4340static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4341 struct eventfd_ctx *eventfd)
4342{
4343 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4344}
4345
4346static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4347 struct eventfd_ctx *eventfd)
4348{
4349 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4350}
4351
4352static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4353 struct eventfd_ctx *eventfd, const char *args)
4354{
4355 struct mem_cgroup_eventfd_list *event;
4356
4357 event = kmalloc(sizeof(*event), GFP_KERNEL);
4358 if (!event)
4359 return -ENOMEM;
4360
4361 spin_lock(&memcg_oom_lock);
4362
4363 event->eventfd = eventfd;
4364 list_add(&event->list, &memcg->oom_notify);
4365
4366
4367 if (memcg->under_oom)
4368 eventfd_signal(eventfd, 1);
4369 spin_unlock(&memcg_oom_lock);
4370
4371 return 0;
4372}
4373
4374static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4375 struct eventfd_ctx *eventfd)
4376{
4377 struct mem_cgroup_eventfd_list *ev, *tmp;
4378
4379 spin_lock(&memcg_oom_lock);
4380
4381 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4382 if (ev->eventfd == eventfd) {
4383 list_del(&ev->list);
4384 kfree(ev);
4385 }
4386 }
4387
4388 spin_unlock(&memcg_oom_lock);
4389}
4390
4391static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4392{
4393 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4394
4395 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4396 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4397 seq_printf(sf, "oom_kill %lu\n",
4398 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4399 return 0;
4400}
4401
4402static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4403 struct cftype *cft, u64 val)
4404{
4405 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4406
4407
4408 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4409 return -EINVAL;
4410
4411 memcg->oom_kill_disable = val;
4412 if (!val)
4413 memcg_oom_recover(memcg);
4414
4415 return 0;
4416}
4417
4418#ifdef CONFIG_CGROUP_WRITEBACK
4419
4420#include <trace/events/writeback.h>
4421
4422static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4423{
4424 return wb_domain_init(&memcg->cgwb_domain, gfp);
4425}
4426
4427static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4428{
4429 wb_domain_exit(&memcg->cgwb_domain);
4430}
4431
4432static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4433{
4434 wb_domain_size_changed(&memcg->cgwb_domain);
4435}
4436
4437struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4438{
4439 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4440
4441 if (!memcg->css.parent)
4442 return NULL;
4443
4444 return &memcg->cgwb_domain;
4445}
4446
/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors.  Note that this doesn't consider the actual amount of
 * available memory in the system.  The caller should further cap
 * *@pheadroom accordingly.
 */
4465void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4466 unsigned long *pheadroom, unsigned long *pdirty,
4467 unsigned long *pwriteback)
4468{
4469 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4470 struct mem_cgroup *parent;
4471
4472 mem_cgroup_flush_stats();
4473
4474 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4475 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4476 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4477 memcg_page_state(memcg, NR_ACTIVE_FILE);
4478
4479 *pheadroom = PAGE_COUNTER_MAX;
4480 while ((parent = parent_mem_cgroup(memcg))) {
4481 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4482 READ_ONCE(memcg->memory.high));
4483 unsigned long used = page_counter_read(&memcg->memory);
4484
4485 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4486 memcg = parent;
4487 }
4488}
4489
/*
 * Foreign dirty flushing
 *
 * There's an inherent mismatch between memcg and writeback.  The former
 * tracks ownership per-page while the latter per-inode.  This was a
 * deliberate design decision because honoring per-page ownership in the
 * writeback path is complicated, may lead to higher CPU and IO overheads
 * and deemed unnecessary given that write-sharing an inode across
 * different cgroups isn't a common use-case.
 *
 * Combined with inode majority-writer ownership switching, this works well
 * enough in most cases but there are some pathological cases.  For
 * example, let's say there are two cgroups A and B which keep writing to
 * different but confined parts of the same inode.  B owns the inode and
 * A's memory is limited far below B's.  A's dirty ratio can rise enough to
 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
 * triggering background writeback.  A will be slowed down without a way to
 * make writeback of the dirty pages happen.
 *
 * The more egregious behaviors can be avoided by remembering the most
 * recent foreign dirtying events and initiating remote flushes on them
 * when local writeback isn't enough to keep the memory clean enough.
 *
 * The following two functions implement such a mechanism.  When a foreign
 * page - a page whose memcg and writeback ownerships don't match - is
 * dirtied, mem_cgroup_track_foreign_dirty_slowpath() records the inode
 * owning bdi_writeback on the page owning memcg.  When
 * balance_dirty_pages() decides that the memcg needs to sleep due to high
 * dirty ratio, it calls mem_cgroup_flush_foreign() which queues writeback
 * on the recorded foreign bdi_writebacks which haven't expired.  Both the
 * numbers of recorded bdi_writebacks and concurrent in-flight foreign
 * writebacks are limited to MEMCG_CGWB_FRN_CNT.
 *
 * The mechanism only remembers IDs and doesn't hold any object references.
 * As being wrong occasionally doesn't matter, updates and accesses to the
 * records are lockless and racy.
 */
4534void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
4535 struct bdi_writeback *wb)
4536{
4537 struct mem_cgroup *memcg = folio_memcg(folio);
4538 struct memcg_cgwb_frn *frn;
4539 u64 now = get_jiffies_64();
4540 u64 oldest_at = now;
4541 int oldest = -1;
4542 int i;
4543
4544 trace_track_foreign_dirty(folio, wb);
4545
4546
4547
4548
4549
4550
4551 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4552 frn = &memcg->cgwb_frn[i];
4553 if (frn->bdi_id == wb->bdi->id &&
4554 frn->memcg_id == wb->memcg_css->id)
4555 break;
4556 if (time_before64(frn->at, oldest_at) &&
4557 atomic_read(&frn->done.cnt) == 1) {
4558 oldest = i;
4559 oldest_at = frn->at;
4560 }
4561 }
4562
4563 if (i < MEMCG_CGWB_FRN_CNT) {
4564
4565
4566
4567
4568
4569
4570
4571 unsigned long update_intv =
4572 min_t(unsigned long, HZ,
4573 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4574
4575 if (time_before64(frn->at, now - update_intv))
4576 frn->at = now;
4577 } else if (oldest >= 0) {
4578
4579 frn = &memcg->cgwb_frn[oldest];
4580 frn->bdi_id = wb->bdi->id;
4581 frn->memcg_id = wb->memcg_css->id;
4582 frn->at = now;
4583 }
4584}
4585
4586
4587void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4588{
4589 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4590 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4591 u64 now = jiffies_64;
4592 int i;
4593
4594 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4595 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4596
4597
4598
4599
4600
4601
4602
4603 if (time_after64(frn->at, now - intv) &&
4604 atomic_read(&frn->done.cnt) == 1) {
4605 frn->at = 0;
4606 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4607 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
4608 WB_REASON_FOREIGN_FLUSH,
4609 &frn->done);
4610 }
4611 }
4612}
4613
4614#else
4615
4616static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4617{
4618 return 0;
4619}
4620
4621static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4622{
4623}
4624
4625static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4626{
4627}
4628
4629#endif
4630
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
4649static void memcg_event_remove(struct work_struct *work)
4650{
4651 struct mem_cgroup_event *event =
4652 container_of(work, struct mem_cgroup_event, remove);
4653 struct mem_cgroup *memcg = event->memcg;
4654
4655 remove_wait_queue(event->wqh, &event->wait);
4656
4657 event->unregister_event(memcg, event->eventfd);
4658
4659
4660 eventfd_signal(event->eventfd, 1);
4661
4662 eventfd_ctx_put(event->eventfd);
4663 kfree(event);
4664 css_put(&memcg->css);
4665}
4666
4667
4668
4669
4670
4671
4672static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4673 int sync, void *key)
4674{
4675 struct mem_cgroup_event *event =
4676 container_of(wait, struct mem_cgroup_event, wait);
4677 struct mem_cgroup *memcg = event->memcg;
4678 __poll_t flags = key_to_poll(key);
4679
4680 if (flags & EPOLLHUP) {
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690 spin_lock(&memcg->event_list_lock);
4691 if (!list_empty(&event->list)) {
4692 list_del_init(&event->list);
4693
4694
4695
4696
4697 schedule_work(&event->remove);
4698 }
4699 spin_unlock(&memcg->event_list_lock);
4700 }
4701
4702 return 0;
4703}
4704
4705static void memcg_event_ptable_queue_proc(struct file *file,
4706 wait_queue_head_t *wqh, poll_table *pt)
4707{
4708 struct mem_cgroup_event *event =
4709 container_of(pt, struct mem_cgroup_event, pt);
4710
4711 event->wqh = wqh;
4712 add_wait_queue(wqh, &event->wait);
4713}
4714
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
4723static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4724 char *buf, size_t nbytes, loff_t off)
4725{
4726 struct cgroup_subsys_state *css = of_css(of);
4727 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4728 struct mem_cgroup_event *event;
4729 struct cgroup_subsys_state *cfile_css;
4730 unsigned int efd, cfd;
4731 struct fd efile;
4732 struct fd cfile;
4733 const char *name;
4734 char *endp;
4735 int ret;
4736
4737 if (IS_ENABLED(CONFIG_PREEMPT_RT))
4738 return -EOPNOTSUPP;
4739
4740 buf = strstrip(buf);
4741
4742 efd = simple_strtoul(buf, &endp, 10);
4743 if (*endp != ' ')
4744 return -EINVAL;
4745 buf = endp + 1;
4746
4747 cfd = simple_strtoul(buf, &endp, 10);
4748 if ((*endp != ' ') && (*endp != '\0'))
4749 return -EINVAL;
4750 buf = endp + 1;
4751
4752 event = kzalloc(sizeof(*event), GFP_KERNEL);
4753 if (!event)
4754 return -ENOMEM;
4755
4756 event->memcg = memcg;
4757 INIT_LIST_HEAD(&event->list);
4758 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4759 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4760 INIT_WORK(&event->remove, memcg_event_remove);
4761
4762 efile = fdget(efd);
4763 if (!efile.file) {
4764 ret = -EBADF;
4765 goto out_kfree;
4766 }
4767
4768 event->eventfd = eventfd_ctx_fileget(efile.file);
4769 if (IS_ERR(event->eventfd)) {
4770 ret = PTR_ERR(event->eventfd);
4771 goto out_put_efile;
4772 }
4773
4774 cfile = fdget(cfd);
4775 if (!cfile.file) {
4776 ret = -EBADF;
4777 goto out_put_eventfd;
4778 }
4779
4780
4781
4782 ret = file_permission(cfile.file, MAY_READ);
4783 if (ret < 0)
4784 goto out_put_cfile;
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794 name = cfile.file->f_path.dentry->d_name.name;
4795
4796 if (!strcmp(name, "memory.usage_in_bytes")) {
4797 event->register_event = mem_cgroup_usage_register_event;
4798 event->unregister_event = mem_cgroup_usage_unregister_event;
4799 } else if (!strcmp(name, "memory.oom_control")) {
4800 event->register_event = mem_cgroup_oom_register_event;
4801 event->unregister_event = mem_cgroup_oom_unregister_event;
4802 } else if (!strcmp(name, "memory.pressure_level")) {
4803 event->register_event = vmpressure_register_event;
4804 event->unregister_event = vmpressure_unregister_event;
4805 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4806 event->register_event = memsw_cgroup_usage_register_event;
4807 event->unregister_event = memsw_cgroup_usage_unregister_event;
4808 } else {
4809 ret = -EINVAL;
4810 goto out_put_cfile;
4811 }
4812
4813
4814
4815
4816
4817
4818 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4819 &memory_cgrp_subsys);
4820 ret = -EINVAL;
4821 if (IS_ERR(cfile_css))
4822 goto out_put_cfile;
4823 if (cfile_css != css) {
4824 css_put(cfile_css);
4825 goto out_put_cfile;
4826 }
4827
4828 ret = event->register_event(memcg, event->eventfd, buf);
4829 if (ret)
4830 goto out_put_css;
4831
4832 vfs_poll(efile.file, &event->pt);
4833
4834 spin_lock_irq(&memcg->event_list_lock);
4835 list_add(&event->list, &memcg->event_list);
4836 spin_unlock_irq(&memcg->event_list_lock);
4837
4838 fdput(cfile);
4839 fdput(efile);
4840
4841 return nbytes;
4842
4843out_put_css:
4844 css_put(css);
4845out_put_cfile:
4846 fdput(cfile);
4847out_put_eventfd:
4848 eventfd_ctx_put(event->eventfd);
4849out_put_efile:
4850 fdput(efile);
4851out_kfree:
4852 kfree(event);
4853
4854 return ret;
4855}
4856
4857#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4858static int mem_cgroup_slab_show(struct seq_file *m, void *p)
4859{
4860
4861
4862
4863
4864 return 0;
4865}
4866#endif
4867
4868static struct cftype mem_cgroup_legacy_files[] = {
4869 {
4870 .name = "usage_in_bytes",
4871 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4872 .read_u64 = mem_cgroup_read_u64,
4873 },
4874 {
4875 .name = "max_usage_in_bytes",
4876 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4877 .write = mem_cgroup_reset,
4878 .read_u64 = mem_cgroup_read_u64,
4879 },
4880 {
4881 .name = "limit_in_bytes",
4882 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4883 .write = mem_cgroup_write,
4884 .read_u64 = mem_cgroup_read_u64,
4885 },
4886 {
4887 .name = "soft_limit_in_bytes",
4888 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4889 .write = mem_cgroup_write,
4890 .read_u64 = mem_cgroup_read_u64,
4891 },
4892 {
4893 .name = "failcnt",
4894 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4895 .write = mem_cgroup_reset,
4896 .read_u64 = mem_cgroup_read_u64,
4897 },
4898 {
4899 .name = "stat",
4900 .seq_show = memcg_stat_show,
4901 },
4902 {
4903 .name = "force_empty",
4904 .write = mem_cgroup_force_empty_write,
4905 },
4906 {
4907 .name = "use_hierarchy",
4908 .write_u64 = mem_cgroup_hierarchy_write,
4909 .read_u64 = mem_cgroup_hierarchy_read,
4910 },
4911 {
4912 .name = "cgroup.event_control",
4913 .write = memcg_write_event_control,
4914 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4915 },
4916 {
4917 .name = "swappiness",
4918 .read_u64 = mem_cgroup_swappiness_read,
4919 .write_u64 = mem_cgroup_swappiness_write,
4920 },
4921 {
4922 .name = "move_charge_at_immigrate",
4923 .read_u64 = mem_cgroup_move_charge_read,
4924 .write_u64 = mem_cgroup_move_charge_write,
4925 },
4926 {
4927 .name = "oom_control",
4928 .seq_show = mem_cgroup_oom_control_read,
4929 .write_u64 = mem_cgroup_oom_control_write,
4930 },
4931 {
4932 .name = "pressure_level",
4933 },
4934#ifdef CONFIG_NUMA
4935 {
4936 .name = "numa_stat",
4937 .seq_show = memcg_numa_stat_show,
4938 },
4939#endif
4940 {
4941 .name = "kmem.limit_in_bytes",
4942 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4943 .write = mem_cgroup_write,
4944 .read_u64 = mem_cgroup_read_u64,
4945 },
4946 {
4947 .name = "kmem.usage_in_bytes",
4948 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4949 .read_u64 = mem_cgroup_read_u64,
4950 },
4951 {
4952 .name = "kmem.failcnt",
4953 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4954 .write = mem_cgroup_reset,
4955 .read_u64 = mem_cgroup_read_u64,
4956 },
4957 {
4958 .name = "kmem.max_usage_in_bytes",
4959 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4960 .write = mem_cgroup_reset,
4961 .read_u64 = mem_cgroup_read_u64,
4962 },
4963#if defined(CONFIG_MEMCG_KMEM) && \
4964 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4965 {
4966 .name = "kmem.slabinfo",
4967 .seq_show = mem_cgroup_slab_show,
4968 },
4969#endif
4970 {
4971 .name = "kmem.tcp.limit_in_bytes",
4972 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4973 .write = mem_cgroup_write,
4974 .read_u64 = mem_cgroup_read_u64,
4975 },
4976 {
4977 .name = "kmem.tcp.usage_in_bytes",
4978 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4979 .read_u64 = mem_cgroup_read_u64,
4980 },
4981 {
4982 .name = "kmem.tcp.failcnt",
4983 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4984 .write = mem_cgroup_reset,
4985 .read_u64 = mem_cgroup_read_u64,
4986 },
4987 {
4988 .name = "kmem.tcp.max_usage_in_bytes",
4989 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4990 .write = mem_cgroup_reset,
4991 .read_u64 = mem_cgroup_read_u64,
4992 },
4993 { },
4994};
4995
/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so this private IDR hands out small
 * IDs (up to MEM_CGROUP_ID_MAX) instead of reusing the css ID.
 *
 * The ID is refcounted separately from the css: such records keep the ID
 * (and thus mem_cgroup_from_id() lookups) valid after the cgroup has been
 * offlined, and the css reference pinned at online time is only dropped
 * when the last ID reference goes away (see mem_cgroup_id_put_many()).
 */
5020static DEFINE_IDR(mem_cgroup_idr);
5021
5022static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5023{
5024 if (memcg->id.id > 0) {
5025 idr_remove(&mem_cgroup_idr, memcg->id.id);
5026 memcg->id.id = 0;
5027 }
5028}
5029
5030static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5031 unsigned int n)
5032{
5033 refcount_add(n, &memcg->id.ref);
5034}
5035
5036static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5037{
5038 if (refcount_sub_and_test(n, &memcg->id.ref)) {
5039 mem_cgroup_id_remove(memcg);
5040
5041
5042 css_put(&memcg->css);
5043 }
5044}
5045
5046static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5047{
5048 mem_cgroup_id_put_many(memcg, 1);
5049}
5050
/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock().
 */
5057struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5058{
5059 WARN_ON_ONCE(!rcu_read_lock_held());
5060 return idr_find(&mem_cgroup_idr, id);
5061}
5062
5063static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5064{
5065 struct mem_cgroup_per_node *pn;
5066
5067 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
5068 if (!pn)
5069 return 1;
5070
5071 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5072 GFP_KERNEL_ACCOUNT);
5073 if (!pn->lruvec_stats_percpu) {
5074 kfree(pn);
5075 return 1;
5076 }
5077
5078 lruvec_init(&pn->lruvec);
5079 pn->memcg = memcg;
5080
5081 memcg->nodeinfo[node] = pn;
5082 return 0;
5083}
5084
5085static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5086{
5087 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5088
5089 if (!pn)
5090 return;
5091
5092 free_percpu(pn->lruvec_stats_percpu);
5093 kfree(pn);
5094}
5095
5096static void __mem_cgroup_free(struct mem_cgroup *memcg)
5097{
5098 int node;
5099
5100 for_each_node(node)
5101 free_mem_cgroup_per_node_info(memcg, node);
5102 free_percpu(memcg->vmstats_percpu);
5103 kfree(memcg);
5104}
5105
5106static void mem_cgroup_free(struct mem_cgroup *memcg)
5107{
5108 memcg_wb_domain_exit(memcg);
5109 __mem_cgroup_free(memcg);
5110}
5111
5112static struct mem_cgroup *mem_cgroup_alloc(void)
5113{
5114 struct mem_cgroup *memcg;
5115 int node;
5116 int __maybe_unused i;
5117 long error = -ENOMEM;
5118
5119 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
5120 if (!memcg)
5121 return ERR_PTR(error);
5122
5123 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5124 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
5125 if (memcg->id.id < 0) {
5126 error = memcg->id.id;
5127 goto fail;
5128 }
5129
5130 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5131 GFP_KERNEL_ACCOUNT);
5132 if (!memcg->vmstats_percpu)
5133 goto fail;
5134
5135 for_each_node(node)
5136 if (alloc_mem_cgroup_per_node_info(memcg, node))
5137 goto fail;
5138
5139 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5140 goto fail;
5141
5142 INIT_WORK(&memcg->high_work, high_work_func);
5143 INIT_LIST_HEAD(&memcg->oom_notify);
5144 mutex_init(&memcg->thresholds_lock);
5145 spin_lock_init(&memcg->move_lock);
5146 vmpressure_init(&memcg->vmpressure);
5147 INIT_LIST_HEAD(&memcg->event_list);
5148 spin_lock_init(&memcg->event_list_lock);
5149 memcg->socket_pressure = jiffies;
5150#ifdef CONFIG_MEMCG_KMEM
5151 memcg->kmemcg_id = -1;
5152 INIT_LIST_HEAD(&memcg->objcg_list);
5153#endif
5154#ifdef CONFIG_CGROUP_WRITEBACK
5155 INIT_LIST_HEAD(&memcg->cgwb_list);
5156 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5157 memcg->cgwb_frn[i].done =
5158 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5159#endif
5160#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5161 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5162 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5163 memcg->deferred_split_queue.split_queue_len = 0;
5164#endif
5165 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5166 return memcg;
5167fail:
5168 mem_cgroup_id_remove(memcg);
5169 __mem_cgroup_free(memcg);
5170 return ERR_PTR(error);
5171}
5172
5173static struct cgroup_subsys_state * __ref
5174mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5175{
5176 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5177 struct mem_cgroup *memcg, *old_memcg;
5178
5179 old_memcg = set_active_memcg(parent);
5180 memcg = mem_cgroup_alloc();
5181 set_active_memcg(old_memcg);
5182 if (IS_ERR(memcg))
5183 return ERR_CAST(memcg);
5184
5185 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5186 memcg->soft_limit = PAGE_COUNTER_MAX;
5187#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
5188 memcg->zswap_max = PAGE_COUNTER_MAX;
5189#endif
5190 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5191 if (parent) {
5192 memcg->swappiness = mem_cgroup_swappiness(parent);
5193 memcg->oom_kill_disable = parent->oom_kill_disable;
5194
5195 page_counter_init(&memcg->memory, &parent->memory);
5196 page_counter_init(&memcg->swap, &parent->swap);
5197 page_counter_init(&memcg->kmem, &parent->kmem);
5198 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5199 } else {
5200 page_counter_init(&memcg->memory, NULL);
5201 page_counter_init(&memcg->swap, NULL);
5202 page_counter_init(&memcg->kmem, NULL);
5203 page_counter_init(&memcg->tcpmem, NULL);
5204
5205 root_mem_cgroup = memcg;
5206 return &memcg->css;
5207 }
5208
5209 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5210 static_branch_inc(&memcg_sockets_enabled_key);
5211
5212 return &memcg->css;
5213}
5214
5215static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5216{
5217 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5218
5219 if (memcg_online_kmem(memcg))
5220 goto remove_id;
5221
5222
5223
5224
5225
5226
5227 if (alloc_shrinker_info(memcg))
5228 goto offline_kmem;
5229
5230
5231 refcount_set(&memcg->id.ref, 1);
5232 css_get(css);
5233
5234 if (unlikely(mem_cgroup_is_root(memcg)))
5235 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
5236 2UL*HZ);
5237 return 0;
5238offline_kmem:
5239 memcg_offline_kmem(memcg);
5240remove_id:
5241 mem_cgroup_id_remove(memcg);
5242 return -ENOMEM;
5243}
5244
5245static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5246{
5247 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5248 struct mem_cgroup_event *event, *tmp;
5249
5250
5251
5252
5253
5254
5255 spin_lock_irq(&memcg->event_list_lock);
5256 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5257 list_del_init(&event->list);
5258 schedule_work(&event->remove);
5259 }
5260 spin_unlock_irq(&memcg->event_list_lock);
5261
5262 page_counter_set_min(&memcg->memory, 0);
5263 page_counter_set_low(&memcg->memory, 0);
5264
5265 memcg_offline_kmem(memcg);
5266 reparent_shrinker_deferred(memcg);
5267 wb_memcg_offline(memcg);
5268
5269 drain_all_stock(memcg);
5270
5271 mem_cgroup_id_put(memcg);
5272}
5273
5274static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5275{
5276 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5277
5278 invalidate_reclaim_iterators(memcg);
5279}
5280
5281static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5282{
5283 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5284 int __maybe_unused i;
5285
5286#ifdef CONFIG_CGROUP_WRITEBACK
5287 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5288 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5289#endif
5290 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5291 static_branch_dec(&memcg_sockets_enabled_key);
5292
5293 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5294 static_branch_dec(&memcg_sockets_enabled_key);
5295
5296 vmpressure_cleanup(&memcg->vmpressure);
5297 cancel_work_sync(&memcg->high_work);
5298 mem_cgroup_remove_from_trees(memcg);
5299 free_shrinker_info(memcg);
5300 mem_cgroup_free(memcg);
5301}
5302
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when userland requests disabling on the default hierarchy but
 * the memcg is pinned through a dependency.  The memcg should stop
 * applying policies and revert to the vanilla state, as it may be made
 * visible again.
 *
 * The current implementation only resets the essential configurations.
 */
5316static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5317{
5318 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5319
5320 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5321 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5322 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5323 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5324 page_counter_set_min(&memcg->memory, 0);
5325 page_counter_set_low(&memcg->memory, 0);
5326 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5327 memcg->soft_limit = PAGE_COUNTER_MAX;
5328 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5329 memcg_wb_domain_size_changed(memcg);
5330}
5331
5332static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
5333{
5334 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5335 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5336 struct memcg_vmstats_percpu *statc;
5337 long delta, v;
5338 int i, nid;
5339
5340 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
5341
5342 for (i = 0; i < MEMCG_NR_STAT; i++) {
		/*
		 * Collect the pending counts that child cgroups have
		 * already propagated up to this memcg during their own
		 * flushes.
		 */
5348 delta = memcg->vmstats.state_pending[i];
5349 if (delta)
5350 memcg->vmstats.state_pending[i] = 0;
5351
5352
5353 v = READ_ONCE(statc->state[i]);
5354 if (v != statc->state_prev[i]) {
5355 delta += v - statc->state_prev[i];
5356 statc->state_prev[i] = v;
5357 }
5358
5359 if (!delta)
5360 continue;
5361
5362
5363 memcg->vmstats.state[i] += delta;
5364 if (parent)
5365 parent->vmstats.state_pending[i] += delta;
5366 }
5367
5368 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
5369 delta = memcg->vmstats.events_pending[i];
5370 if (delta)
5371 memcg->vmstats.events_pending[i] = 0;
5372
5373 v = READ_ONCE(statc->events[i]);
5374 if (v != statc->events_prev[i]) {
5375 delta += v - statc->events_prev[i];
5376 statc->events_prev[i] = v;
5377 }
5378
5379 if (!delta)
5380 continue;
5381
5382 memcg->vmstats.events[i] += delta;
5383 if (parent)
5384 parent->vmstats.events_pending[i] += delta;
5385 }
5386
5387 for_each_node_state(nid, N_MEMORY) {
5388 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
5389 struct mem_cgroup_per_node *ppn = NULL;
5390 struct lruvec_stats_percpu *lstatc;
5391
5392 if (parent)
5393 ppn = parent->nodeinfo[nid];
5394
5395 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
5396
5397 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
5398 delta = pn->lruvec_stats.state_pending[i];
5399 if (delta)
5400 pn->lruvec_stats.state_pending[i] = 0;
5401
5402 v = READ_ONCE(lstatc->state[i]);
5403 if (v != lstatc->state_prev[i]) {
5404 delta += v - lstatc->state_prev[i];
5405 lstatc->state_prev[i] = v;
5406 }
5407
5408 if (!delta)
5409 continue;
5410
5411 pn->lruvec_stats.state[i] += delta;
5412 if (ppn)
5413 ppn->lruvec_stats.state_pending[i] += delta;
5414 }
5415 }
5416}
5417
5418#ifdef CONFIG_MMU
5419
5420static int mem_cgroup_do_precharge(unsigned long count)
5421{
5422 int ret;
5423
5424
5425 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5426 if (!ret) {
5427 mc.precharge += count;
5428 return ret;
5429 }
5430
5431
5432 while (count--) {
5433 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5434 if (ret)
5435 return ret;
5436 mc.precharge++;
5437 cond_resched();
5438 }
5439 return 0;
5440}
5441
5442union mc_target {
5443 struct page *page;
5444 swp_entry_t ent;
5445};
5446
5447enum mc_target_type {
5448 MC_TARGET_NONE = 0,
5449 MC_TARGET_PAGE,
5450 MC_TARGET_SWAP,
5451 MC_TARGET_DEVICE,
5452};
5453
5454static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5455 unsigned long addr, pte_t ptent)
5456{
5457 struct page *page = vm_normal_page(vma, addr, ptent);
5458
5459 if (!page || !page_mapped(page))
5460 return NULL;
5461 if (PageAnon(page)) {
5462 if (!(mc.flags & MOVE_ANON))
5463 return NULL;
5464 } else {
5465 if (!(mc.flags & MOVE_FILE))
5466 return NULL;
5467 }
5468 if (!get_page_unless_zero(page))
5469 return NULL;
5470
5471 return page;
5472}
5473
5474#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5475static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5476 pte_t ptent, swp_entry_t *entry)
5477{
5478 struct page *page = NULL;
5479 swp_entry_t ent = pte_to_swp_entry(ptent);
5480
5481 if (!(mc.flags & MOVE_ANON))
5482 return NULL;
5483
	/*
	 * Handle device private pages that are not accessible by the CPU,
	 * but are stored as special swap entries in the page table.
	 */
5488 if (is_device_private_entry(ent)) {
5489 page = pfn_swap_entry_to_page(ent);
5490 if (!get_page_unless_zero(page))
5491 return NULL;
5492 return page;
5493 }
5494
5495 if (non_swap_entry(ent))
5496 return NULL;
5497
	/*
	 * Because lookup_swap_cache() updates some statistics counters,
	 * look the page up in the swap address space directly with
	 * find_get_page().
	 */
5502 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5503 entry->val = ent.val;
5504
5505 return page;
5506}
5507#else
5508static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5509 pte_t ptent, swp_entry_t *entry)
5510{
5511 return NULL;
5512}
5513#endif
5514
5515static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5516 unsigned long addr, pte_t ptent)
5517{
5518 if (!vma->vm_file)
5519 return NULL;
5520 if (!(mc.flags & MOVE_FILE))
5521 return NULL;
5522
5523
5524
5525 return find_get_incore_page(vma->vm_file->f_mapping,
5526 linear_page_index(vma, addr));
5527}
5528
/**
 * mem_cgroup_move_account - move accounting of a page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from
 * @to: mem_cgroup which the page is moved to; @from != @to
 *
 * The caller must make sure the page is not on the LRU
 * (isolate_lru_page() is useful for that).
 *
 * This function doesn't charge the new cgroup and doesn't uncharge the
 * old one; it only moves the existing accounting.
 */
5541static int mem_cgroup_move_account(struct page *page,
5542 bool compound,
5543 struct mem_cgroup *from,
5544 struct mem_cgroup *to)
5545{
5546 struct folio *folio = page_folio(page);
5547 struct lruvec *from_vec, *to_vec;
5548 struct pglist_data *pgdat;
5549 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
5550 int nid, ret;
5551
5552 VM_BUG_ON(from == to);
5553 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
5554 VM_BUG_ON(compound && !folio_test_large(folio));
5555
5556
5557
5558
5559
5560 ret = -EBUSY;
5561 if (!folio_trylock(folio))
5562 goto out;
5563
5564 ret = -EINVAL;
5565 if (folio_memcg(folio) != from)
5566 goto out_unlock;
5567
5568 pgdat = folio_pgdat(folio);
5569 from_vec = mem_cgroup_lruvec(from, pgdat);
5570 to_vec = mem_cgroup_lruvec(to, pgdat);
5571
5572 folio_memcg_lock(folio);
5573
5574 if (folio_test_anon(folio)) {
5575 if (folio_mapped(folio)) {
5576 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5577 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5578 if (folio_test_transhuge(folio)) {
5579 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5580 -nr_pages);
5581 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5582 nr_pages);
5583 }
5584 }
5585 } else {
5586 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5587 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5588
5589 if (folio_test_swapbacked(folio)) {
5590 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5591 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5592 }
5593
5594 if (folio_mapped(folio)) {
5595 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5596 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5597 }
5598
5599 if (folio_test_dirty(folio)) {
5600 struct address_space *mapping = folio_mapping(folio);
5601
5602 if (mapping_can_writeback(mapping)) {
5603 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5604 -nr_pages);
5605 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5606 nr_pages);
5607 }
5608 }
5609 }
5610
5611 if (folio_test_writeback(folio)) {
5612 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5613 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5614 }
5615
	/*
	 * All state has been migrated, let's switch to the new memcg.
	 *
	 * It is safe to change the page's memcg here because the page is
	 * referenced, charged, isolated, and locked: we can't race with
	 * (un)charging, migration, LRU putback, or anything else that
	 * would rely on a stable page memcg.
	 *
	 * Note that folio_memcg_lock() is a memcg lock, not a page lock:
	 * as soon as we switch to a new memcg that isn't locked, the
	 * above state can change concurrently again.  Make sure we're
	 * truly done with it before the switch.
	 */
5629 smp_mb();
5630
5631 css_get(&to->css);
5632 css_put(&from->css);
5633
5634 folio->memcg_data = (unsigned long)to;
5635
5636 __folio_memcg_unlock(from);
5637
5638 ret = 0;
5639 nid = folio_nid(folio);
5640
5641 local_irq_disable();
5642 mem_cgroup_charge_statistics(to, nr_pages);
5643 memcg_check_events(to, nid);
5644 mem_cgroup_charge_statistics(from, -nr_pages);
5645 memcg_check_events(from, nid);
5646 local_irq_enable();
5647out_unlock:
5648 folio_unlock(folio);
5649out:
5650 return ret;
5651}
5652
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (can be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE    if the pte is not a target for move charge.
 *   MC_TARGET_PAGE    if the page corresponding to this pte is a target
 *                     for move charge; if @target is not NULL, the page
 *                     is stored in target->page with an extra reference
 *                     taken (callers should handle it).
 *   MC_TARGET_SWAP    if the swap entry corresponding to this pte is a
 *                     target for charge migration; if @target is not
 *                     NULL, the entry is stored in target->ent.
 *   MC_TARGET_DEVICE  like MC_TARGET_PAGE, but the page is device memory
 *                     and thus not on the LRU.  Such pages are charged
 *                     like regular pages.
 *
 * Called with the pte lock held.
 */
5679static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5680 unsigned long addr, pte_t ptent, union mc_target *target)
5681{
5682 struct page *page = NULL;
5683 enum mc_target_type ret = MC_TARGET_NONE;
5684 swp_entry_t ent = { .val = 0 };
5685
5686 if (pte_present(ptent))
5687 page = mc_handle_present_pte(vma, addr, ptent);
5688 else if (pte_none_mostly(ptent))
5689
5690
5691
5692
5693 page = mc_handle_file_pte(vma, addr, ptent);
5694 else if (is_swap_pte(ptent))
5695 page = mc_handle_swap_pte(vma, ptent, &ent);
5696
5697 if (!page && !ent.val)
5698 return ret;
5699 if (page) {
5700
5701
5702
5703
5704
5705 if (page_memcg(page) == mc.from) {
5706 ret = MC_TARGET_PAGE;
5707 if (is_device_private_page(page))
5708 ret = MC_TARGET_DEVICE;
5709 if (target)
5710 target->page = page;
5711 }
5712 if (!ret || !target)
5713 put_page(page);
5714 }
5715
5716
5717
5718
5719 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5720 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5721 ret = MC_TARGET_SWAP;
5722 if (target)
5723 target->ent = ent;
5724 }
5725 return ret;
5726}
5727
5728#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD-mapped swapping or file-mapped pages because THP
 * does not support them for now.  The caller should make sure that
 * pmd_trans_huge(pmd) is true.
 */
5734static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5735 unsigned long addr, pmd_t pmd, union mc_target *target)
5736{
5737 struct page *page = NULL;
5738 enum mc_target_type ret = MC_TARGET_NONE;
5739
5740 if (unlikely(is_swap_pmd(pmd))) {
5741 VM_BUG_ON(thp_migration_supported() &&
5742 !is_pmd_migration_entry(pmd));
5743 return ret;
5744 }
5745 page = pmd_page(pmd);
5746 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5747 if (!(mc.flags & MOVE_ANON))
5748 return ret;
5749 if (page_memcg(page) == mc.from) {
5750 ret = MC_TARGET_PAGE;
5751 if (target) {
5752 get_page(page);
5753 target->page = page;
5754 }
5755 }
5756 return ret;
5757}
5758#else
5759static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5760 unsigned long addr, pmd_t pmd, union mc_target *target)
5761{
5762 return MC_TARGET_NONE;
5763}
5764#endif
5765
5766static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5767 unsigned long addr, unsigned long end,
5768 struct mm_walk *walk)
5769{
5770 struct vm_area_struct *vma = walk->vma;
5771 pte_t *pte;
5772 spinlock_t *ptl;
5773
5774 ptl = pmd_trans_huge_lock(pmd, vma);
5775 if (ptl) {
5776
5777
5778
5779
5780
5781 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5782 mc.precharge += HPAGE_PMD_NR;
5783 spin_unlock(ptl);
5784 return 0;
5785 }
5786
5787 if (pmd_trans_unstable(pmd))
5788 return 0;
5789 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5790 for (; addr != end; pte++, addr += PAGE_SIZE)
5791 if (get_mctgt_type(vma, addr, *pte, NULL))
5792 mc.precharge++;
5793 pte_unmap_unlock(pte - 1, ptl);
5794 cond_resched();
5795
5796 return 0;
5797}
5798
5799static const struct mm_walk_ops precharge_walk_ops = {
5800 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5801};
5802
5803static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5804{
5805 unsigned long precharge;
5806
5807 mmap_read_lock(mm);
5808 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5809 mmap_read_unlock(mm);
5810
5811 precharge = mc.precharge;
5812 mc.precharge = 0;
5813
5814 return precharge;
5815}
5816
5817static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5818{
5819 unsigned long precharge = mem_cgroup_count_precharge(mm);
5820
5821 VM_BUG_ON(mc.moving_task);
5822 mc.moving_task = current;
5823 return mem_cgroup_do_precharge(precharge);
5824}
5825
5826
5827static void __mem_cgroup_clear_mc(void)
5828{
5829 struct mem_cgroup *from = mc.from;
5830 struct mem_cgroup *to = mc.to;
5831
5832
5833 if (mc.precharge) {
5834 cancel_charge(mc.to, mc.precharge);
5835 mc.precharge = 0;
5836 }
5837
5838
5839
5840
5841 if (mc.moved_charge) {
5842 cancel_charge(mc.from, mc.moved_charge);
5843 mc.moved_charge = 0;
5844 }
5845
5846 if (mc.moved_swap) {
5847
5848 if (!mem_cgroup_is_root(mc.from))
5849 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5850
5851 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5852
5853
5854
5855
5856
5857 if (!mem_cgroup_is_root(mc.to))
5858 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5859
5860 mc.moved_swap = 0;
5861 }
5862 memcg_oom_recover(from);
5863 memcg_oom_recover(to);
5864 wake_up_all(&mc.waitq);
5865}
5866
5867static void mem_cgroup_clear_mc(void)
5868{
5869 struct mm_struct *mm = mc.mm;
5870
5871
5872
5873
5874
5875 mc.moving_task = NULL;
5876 __mem_cgroup_clear_mc();
5877 spin_lock(&mc.lock);
5878 mc.from = NULL;
5879 mc.to = NULL;
5880 mc.mm = NULL;
5881 spin_unlock(&mc.lock);
5882
5883 mmput(mm);
5884}
5885
5886static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5887{
5888 struct cgroup_subsys_state *css;
5889 struct mem_cgroup *memcg = NULL;
5890 struct mem_cgroup *from;
5891 struct task_struct *leader, *p;
5892 struct mm_struct *mm;
5893 unsigned long move_flags;
5894 int ret = 0;
5895
5896
5897 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5898 return 0;
5899
	/*
	 * Multi-process migrations only happen on the default hierarchy,
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
5906 p = NULL;
5907 cgroup_taskset_for_each_leader(leader, css, tset) {
5908 WARN_ON_ONCE(p);
5909 p = leader;
5910 memcg = mem_cgroup_from_css(css);
5911 }
5912 if (!p)
5913 return 0;
5914
5915
5916
5917
5918
5919
5920 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5921 if (!move_flags)
5922 return 0;
5923
5924 from = mem_cgroup_from_task(p);
5925
5926 VM_BUG_ON(from == memcg);
5927
5928 mm = get_task_mm(p);
5929 if (!mm)
5930 return 0;
5931
5932 if (mm->owner == p) {
5933 VM_BUG_ON(mc.from);
5934 VM_BUG_ON(mc.to);
5935 VM_BUG_ON(mc.precharge);
5936 VM_BUG_ON(mc.moved_charge);
5937 VM_BUG_ON(mc.moved_swap);
5938
5939 spin_lock(&mc.lock);
5940 mc.mm = mm;
5941 mc.from = from;
5942 mc.to = memcg;
5943 mc.flags = move_flags;
5944 spin_unlock(&mc.lock);
5945
5946
5947 ret = mem_cgroup_precharge_mc(mm);
5948 if (ret)
5949 mem_cgroup_clear_mc();
5950 } else {
5951 mmput(mm);
5952 }
5953 return ret;
5954}
5955
5956static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5957{
5958 if (mc.to)
5959 mem_cgroup_clear_mc();
5960}
5961
5962static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5963 unsigned long addr, unsigned long end,
5964 struct mm_walk *walk)
5965{
5966 int ret = 0;
5967 struct vm_area_struct *vma = walk->vma;
5968 pte_t *pte;
5969 spinlock_t *ptl;
5970 enum mc_target_type target_type;
5971 union mc_target target;
5972 struct page *page;
5973
5974 ptl = pmd_trans_huge_lock(pmd, vma);
5975 if (ptl) {
5976 if (mc.precharge < HPAGE_PMD_NR) {
5977 spin_unlock(ptl);
5978 return 0;
5979 }
5980 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5981 if (target_type == MC_TARGET_PAGE) {
5982 page = target.page;
5983 if (!isolate_lru_page(page)) {
5984 if (!mem_cgroup_move_account(page, true,
5985 mc.from, mc.to)) {
5986 mc.precharge -= HPAGE_PMD_NR;
5987 mc.moved_charge += HPAGE_PMD_NR;
5988 }
5989 putback_lru_page(page);
5990 }
5991 put_page(page);
5992 } else if (target_type == MC_TARGET_DEVICE) {
5993 page = target.page;
5994 if (!mem_cgroup_move_account(page, true,
5995 mc.from, mc.to)) {
5996 mc.precharge -= HPAGE_PMD_NR;
5997 mc.moved_charge += HPAGE_PMD_NR;
5998 }
5999 put_page(page);
6000 }
6001 spin_unlock(ptl);
6002 return 0;
6003 }
6004
6005 if (pmd_trans_unstable(pmd))
6006 return 0;
6007retry:
6008 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6009 for (; addr != end; addr += PAGE_SIZE) {
6010 pte_t ptent = *(pte++);
6011 bool device = false;
6012 swp_entry_t ent;
6013
6014 if (!mc.precharge)
6015 break;
6016
6017 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6018 case MC_TARGET_DEVICE:
6019 device = true;
6020 fallthrough;
6021 case MC_TARGET_PAGE:
6022 page = target.page;
			/*
			 * We can have a part of a split pmd here.  Moving
			 * it could be done but would be too convoluted, so
			 * simply skip such a partial THP and leave it in
			 * the original memcg.  There should be somebody
			 * mapping the head.
			 */
6029 if (PageTransCompound(page))
6030 goto put;
6031 if (!device && isolate_lru_page(page))
6032 goto put;
6033 if (!mem_cgroup_move_account(page, false,
6034 mc.from, mc.to)) {
6035 mc.precharge--;
6036
6037 mc.moved_charge++;
6038 }
6039 if (!device)
6040 putback_lru_page(page);
6041put:
6042 put_page(page);
6043 break;
6044 case MC_TARGET_SWAP:
6045 ent = target.ent;
6046 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6047 mc.precharge--;
6048 mem_cgroup_id_get_many(mc.to, 1);
6049
6050 mc.moved_swap++;
6051 }
6052 break;
6053 default:
6054 break;
6055 }
6056 }
6057 pte_unmap_unlock(pte - 1, ptl);
6058 cond_resched();
6059
6060 if (addr != end) {
		/*
		 * We have consumed all the precharges we got in
		 * can_attach().  We try to charge one by one, but don't do
		 * any additional charges to mc.to if we have failed a
		 * charge once already during the attach() phase.
		 */
6067 ret = mem_cgroup_do_precharge(1);
6068 if (!ret)
6069 goto retry;
6070 }
6071
6072 return ret;
6073}
6074
6075static const struct mm_walk_ops charge_walk_ops = {
6076 .pmd_entry = mem_cgroup_move_charge_pte_range,
6077};
6078
6079static void mem_cgroup_move_charge(void)
6080{
6081 lru_add_drain_all();
6082
6083
6084
6085
6086
6087 atomic_inc(&mc.from->moving_account);
6088 synchronize_rcu();
6089retry:
6090 if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone holding the mmap_lock may be waiting in our
		 * waitq, so cancel all extra charges, wake up all waiters
		 * and retry.  Because we cancel precharges we might not be
		 * able to move all charges, but charge moving is a
		 * best-effort feature anyway.
		 */
6098 __mem_cgroup_clear_mc();
6099 cond_resched();
6100 goto retry;
6101 }
6102
6103
6104
6105
6106 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6107 NULL);
6108
6109 mmap_read_unlock(mc.mm);
6110 atomic_dec(&mc.from->moving_account);
6111}
6112
6113static void mem_cgroup_move_task(void)
6114{
6115 if (mc.to) {
6116 mem_cgroup_move_charge();
6117 mem_cgroup_clear_mc();
6118 }
6119}
6120#else
6121static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6122{
6123 return 0;
6124}
6125static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6126{
6127}
6128static void mem_cgroup_move_task(void)
6129{
6130}
6131#endif
6132
6133static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6134{
6135 if (value == PAGE_COUNTER_MAX)
6136 seq_puts(m, "max\n");
6137 else
6138 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6139
6140 return 0;
6141}
6142
6143static u64 memory_current_read(struct cgroup_subsys_state *css,
6144 struct cftype *cft)
6145{
6146 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6147
6148 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6149}
6150
6151static u64 memory_peak_read(struct cgroup_subsys_state *css,
6152 struct cftype *cft)
6153{
6154 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6155
6156 return (u64)memcg->memory.watermark * PAGE_SIZE;
6157}
6158
6159static int memory_min_show(struct seq_file *m, void *v)
6160{
6161 return seq_puts_memcg_tunable(m,
6162 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6163}
6164
6165static ssize_t memory_min_write(struct kernfs_open_file *of,
6166 char *buf, size_t nbytes, loff_t off)
6167{
6168 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6169 unsigned long min;
6170 int err;
6171
6172 buf = strstrip(buf);
6173 err = page_counter_memparse(buf, "max", &min);
6174 if (err)
6175 return err;
6176
6177 page_counter_set_min(&memcg->memory, min);
6178
6179 return nbytes;
6180}
6181
6182static int memory_low_show(struct seq_file *m, void *v)
6183{
6184 return seq_puts_memcg_tunable(m,
6185 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6186}
6187
6188static ssize_t memory_low_write(struct kernfs_open_file *of,
6189 char *buf, size_t nbytes, loff_t off)
6190{
6191 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6192 unsigned long low;
6193 int err;
6194
6195 buf = strstrip(buf);
6196 err = page_counter_memparse(buf, "max", &low);
6197 if (err)
6198 return err;
6199
6200 page_counter_set_low(&memcg->memory, low);
6201
6202 return nbytes;
6203}
6204
6205static int memory_high_show(struct seq_file *m, void *v)
6206{
6207 return seq_puts_memcg_tunable(m,
6208 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6209}
6210
6211static ssize_t memory_high_write(struct kernfs_open_file *of,
6212 char *buf, size_t nbytes, loff_t off)
6213{
6214 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6215 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6216 bool drained = false;
6217 unsigned long high;
6218 int err;
6219
6220 buf = strstrip(buf);
6221 err = page_counter_memparse(buf, "max", &high);
6222 if (err)
6223 return err;
6224
6225 page_counter_set_high(&memcg->memory, high);
6226
6227 for (;;) {
6228 unsigned long nr_pages = page_counter_read(&memcg->memory);
6229 unsigned long reclaimed;
6230
6231 if (nr_pages <= high)
6232 break;
6233
6234 if (signal_pending(current))
6235 break;
6236
6237 if (!drained) {
6238 drain_all_stock(memcg);
6239 drained = true;
6240 continue;
6241 }
6242
6243 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6244 GFP_KERNEL, true);
6245
6246 if (!reclaimed && !nr_retries--)
6247 break;
6248 }
6249
6250 memcg_wb_domain_size_changed(memcg);
6251 return nbytes;
6252}
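
/*
 * Illustrative usage of the memory.high handler above (added example,
 * not part of the original source): lowering the high boundary
 * immediately tries to reclaim the cgroup back under the new value,
 * e.g.
 *
 *     echo 512M > /sys/fs/cgroup/<group>/memory.high
 *
 * Unlike memory.max, running above memory.high throttles and reclaims
 * the offending cgroup rather than invoking the OOM killer.
 */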
6253
6254static int memory_max_show(struct seq_file *m, void *v)
6255{
6256 return seq_puts_memcg_tunable(m,
6257 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6258}
6259
6260static ssize_t memory_max_write(struct kernfs_open_file *of,
6261 char *buf, size_t nbytes, loff_t off)
6262{
6263 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6264 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6265 bool drained = false;
6266 unsigned long max;
6267 int err;
6268
6269 buf = strstrip(buf);
6270 err = page_counter_memparse(buf, "max", &max);
6271 if (err)
6272 return err;
6273
6274 xchg(&memcg->memory.max, max);
6275
6276 for (;;) {
6277 unsigned long nr_pages = page_counter_read(&memcg->memory);
6278
6279 if (nr_pages <= max)
6280 break;
6281
6282 if (signal_pending(current))
6283 break;
6284
6285 if (!drained) {
6286 drain_all_stock(memcg);
6287 drained = true;
6288 continue;
6289 }
6290
6291 if (nr_reclaims) {
6292 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6293 GFP_KERNEL, true))
6294 nr_reclaims--;
6295 continue;
6296 }
6297
6298 memcg_memory_event(memcg, MEMCG_OOM);
6299 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6300 break;
6301 }
6302
6303 memcg_wb_domain_size_changed(memcg);
6304 return nbytes;
6305}
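
/*
 * Illustrative usage of the memory.max handler above (added example, not
 * part of the original source): writing a new hard limit reclaims the
 * cgroup down to it and, failing that, invokes the OOM killer, e.g.
 *
 *     echo 1G > /sys/fs/cgroup/<group>/memory.max
 *     echo max > /sys/fs/cgroup/<group>/memory.max   # remove the limit
 */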
6306
6307static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6308{
6309 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6310 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6311 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6312 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6313 seq_printf(m, "oom_kill %lu\n",
6314 atomic_long_read(&events[MEMCG_OOM_KILL]));
6315 seq_printf(m, "oom_group_kill %lu\n",
6316 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
6317}
6318
6319static int memory_events_show(struct seq_file *m, void *v)
6320{
6321 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6322
6323 __memory_events_show(m, memcg->memory_events);
6324 return 0;
6325}
6326
6327static int memory_events_local_show(struct seq_file *m, void *v)
6328{
6329 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6330
6331 __memory_events_show(m, memcg->memory_events_local);
6332 return 0;
6333}
6334
6335static int memory_stat_show(struct seq_file *m, void *v)
6336{
6337 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6338 char *buf;
6339
6340 buf = memory_stat_format(memcg);
6341 if (!buf)
6342 return -ENOMEM;
6343 seq_puts(m, buf);
6344 kfree(buf);
6345 return 0;
6346}
6347
6348#ifdef CONFIG_NUMA
6349static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
6350 int item)
6351{
6352 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
6353}
6354
6355static int memory_numa_stat_show(struct seq_file *m, void *v)
6356{
6357 int i;
6358 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6359
6360 mem_cgroup_flush_stats();
6361
6362 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6363 int nid;
6364
6365 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6366 continue;
6367
6368 seq_printf(m, "%s", memory_stats[i].name);
6369 for_each_node_state(nid, N_MEMORY) {
6370 u64 size;
6371 struct lruvec *lruvec;
6372
6373 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6374 size = lruvec_page_state_output(lruvec,
6375 memory_stats[i].idx);
6376 seq_printf(m, " N%d=%llu", nid, size);
6377 }
6378 seq_putc(m, '\n');
6379 }
6380
6381 return 0;
6382}
6383#endif
6384
6385static int memory_oom_group_show(struct seq_file *m, void *v)
6386{
6387 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6388
6389 seq_printf(m, "%d\n", memcg->oom_group);
6390
6391 return 0;
6392}
6393
6394static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6395 char *buf, size_t nbytes, loff_t off)
6396{
6397 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6398 int ret, oom_group;
6399
6400 buf = strstrip(buf);
6401 if (!buf)
6402 return -EINVAL;
6403
6404 ret = kstrtoint(buf, 0, &oom_group);
6405 if (ret)
6406 return ret;
6407
6408 if (oom_group != 0 && oom_group != 1)
6409 return -EINVAL;
6410
6411 memcg->oom_group = oom_group;
6412
6413 return nbytes;
6414}
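
/*
 * Illustrative usage (added example, not part of the original source):
 * after
 *
 *     echo 1 > /sys/fs/cgroup/<group>/memory.oom.group
 *
 * an OOM kill in this cgroup takes out all of its tasks as a unit
 * instead of picking a single victim, which shows up in the
 * oom_group_kill counter reported by memory.events above.
 */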
6415
6416static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
6417 size_t nbytes, loff_t off)
6418{
6419 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6420 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6421 unsigned long nr_to_reclaim, nr_reclaimed = 0;
6422 int err;
6423
6424 buf = strstrip(buf);
6425 err = page_counter_memparse(buf, "", &nr_to_reclaim);
6426 if (err)
6427 return err;
6428
6429 while (nr_reclaimed < nr_to_reclaim) {
6430 unsigned long reclaimed;
6431
6432 if (signal_pending(current))
6433 return -EINTR;
6434
		/*
		 * This is the final attempt, drain percpu lru caches in the
		 * hope of introducing more evictable pages for
		 * try_to_free_mem_cgroup_pages().
		 */
6440 if (!nr_retries)
6441 lru_add_drain_all();
6442
6443 reclaimed = try_to_free_mem_cgroup_pages(memcg,
6444 nr_to_reclaim - nr_reclaimed,
6445 GFP_KERNEL, true);
6446
6447 if (!reclaimed && !nr_retries--)
6448 return -EAGAIN;
6449
6450 nr_reclaimed += reclaimed;
6451 }
6452
6453 return nbytes;
6454}
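
/*
 * Illustrative usage of the memory.reclaim handler above (added example,
 * not part of the original source): writing an amount triggers proactive
 * reclaim of roughly that many bytes from the cgroup, e.g.
 *
 *     echo 256M > /sys/fs/cgroup/<group>/memory.reclaim
 *
 * The write returns -EAGAIN if the requested amount could not be
 * reclaimed within the retry budget, and -EINTR if interrupted by a
 * signal.
 */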
6455
6456static struct cftype memory_files[] = {
6457 {
6458 .name = "current",
6459 .flags = CFTYPE_NOT_ON_ROOT,
6460 .read_u64 = memory_current_read,
6461 },
6462 {
6463 .name = "peak",
6464 .flags = CFTYPE_NOT_ON_ROOT,
6465 .read_u64 = memory_peak_read,
6466 },
6467 {
6468 .name = "min",
6469 .flags = CFTYPE_NOT_ON_ROOT,
6470 .seq_show = memory_min_show,
6471 .write = memory_min_write,
6472 },
6473 {
6474 .name = "low",
6475 .flags = CFTYPE_NOT_ON_ROOT,
6476 .seq_show = memory_low_show,
6477 .write = memory_low_write,
6478 },
6479 {
6480 .name = "high",
6481 .flags = CFTYPE_NOT_ON_ROOT,
6482 .seq_show = memory_high_show,
6483 .write = memory_high_write,
6484 },
6485 {
6486 .name = "max",
6487 .flags = CFTYPE_NOT_ON_ROOT,
6488 .seq_show = memory_max_show,
6489 .write = memory_max_write,
6490 },
6491 {
6492 .name = "events",
6493 .flags = CFTYPE_NOT_ON_ROOT,
6494 .file_offset = offsetof(struct mem_cgroup, events_file),
6495 .seq_show = memory_events_show,
6496 },
6497 {
6498 .name = "events.local",
6499 .flags = CFTYPE_NOT_ON_ROOT,
6500 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6501 .seq_show = memory_events_local_show,
6502 },
6503 {
6504 .name = "stat",
6505 .seq_show = memory_stat_show,
6506 },
6507#ifdef CONFIG_NUMA
6508 {
6509 .name = "numa_stat",
6510 .seq_show = memory_numa_stat_show,
6511 },
6512#endif
6513 {
6514 .name = "oom.group",
6515 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6516 .seq_show = memory_oom_group_show,
6517 .write = memory_oom_group_write,
6518 },
6519 {
6520 .name = "reclaim",
6521 .flags = CFTYPE_NS_DELEGATABLE,
6522 .write = memory_reclaim,
6523 },
6524 { }
6525};
6526
6527struct cgroup_subsys memory_cgrp_subsys = {
6528 .css_alloc = mem_cgroup_css_alloc,
6529 .css_online = mem_cgroup_css_online,
6530 .css_offline = mem_cgroup_css_offline,
6531 .css_released = mem_cgroup_css_released,
6532 .css_free = mem_cgroup_css_free,
6533 .css_reset = mem_cgroup_css_reset,
6534 .css_rstat_flush = mem_cgroup_css_rstat_flush,
6535 .can_attach = mem_cgroup_can_attach,
6536 .cancel_attach = mem_cgroup_cancel_attach,
6537 .post_attach = mem_cgroup_move_task,
6538 .dfl_cftypes = memory_files,
6539 .legacy_cftypes = mem_cgroup_legacy_files,
6540 .early_init = 0,
6541};
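
/*
 * Overview of the css lifecycle wired up above (added for orientation,
 * not part of the original source): css_alloc creates the mem_cgroup and
 * its page counters under the parent; css_online brings kernel-memory
 * accounting and shrinker info online and pins the memcg ID; css_offline
 * flushes pending userspace events, reparents kmem and deferred shrinker
 * work, drains the per-cpu stock and drops the ID reference; css_released
 * invalidates reclaim iterators; css_free waits for outstanding writeback
 * work and releases the remaining memory.
 */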
6542
/*
 * This calculates an individual cgroup's effective memory protection
 * from its own memory.min/memory.low settings, its parent's and
 * siblings' settings, and the actual memory distribution in the tree:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. The user is allowed to overcommit the declared protection at a
 *    given level.  If that is the case, the parent's effective
 *    protection is distributed to the children in proportion to how
 *    much protection they have declared and how much of it they are
 *    actually using.  This makes the distribution proportional, but
 *    also work-conserving: protection that a cgroup is not using has
 *    no bearing on the effective protection of its siblings.
 */
6586static unsigned long effective_protection(unsigned long usage,
6587 unsigned long parent_usage,
6588 unsigned long setting,
6589 unsigned long parent_effective,
6590 unsigned long siblings_protected)
6591{
6592 unsigned long protected;
6593 unsigned long ep;
6594
6595 protected = min(usage, setting);
6596
	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute shares
	 * in proportion to utilization.
	 *
	 * We are using actual utilization rather than the statically
	 * claimed protection in order to be work-conserving: claimed but
	 * unused protection is available to siblings that would otherwise
	 * get a smaller chunk than what they claimed.
	 */
6606 if (siblings_protected > parent_effective)
6607 return protected * parent_effective / siblings_protected;
6608
	/*
	 * Ok, the utilized protection of all children is within what the
	 * parent affords them, so whatever this child claims and utilizes
	 * is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim will
	 * apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but we do return a smaller value for
	 * protection than what the group could enjoy in theory.  This is
	 * okay: with the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
6624 ep = protected;
6625
	/*
	 * With the memory_recursiveprot mount option, protection that the
	 * children do not explicitly claim is distributed among them in
	 * proportion to their unprotected usage.  That way cgroups that
	 * are not explicitly prioritized against each other compete
	 * freely over the remaining allowance, while the subtree as a
	 * whole stays protected from neighboring trees.
	 *
	 * Unprotected memory is used for the weights so that cgroups
	 * which DO claim explicit protection don't get the same bytes
	 * protected twice.
	 */
6642 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6643 return ep;
6644 if (parent_effective > siblings_protected &&
6645 parent_usage > siblings_protected &&
6646 usage > protected) {
6647 unsigned long unclaimed;
6648
6649 unclaimed = parent_effective - siblings_protected;
6650 unclaimed *= usage - protected;
6651 unclaimed /= parent_usage - siblings_protected;
6652
6653 ep += unclaimed;
6654 }
6655
6656 return ep;
6657}
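
/*
 * Worked example of the proportional scaling above (added illustration,
 * not from the original source): if the parent's effective protection is
 * 2G but its children together claim and use 4G (siblings_protected), a
 * child that claims and uses 3G ends up with
 *
 *     ep = 3G * 2G / 4G = 1.5G
 *
 * i.e. the parent's allowance is split in proportion to each child's
 * utilized claim.
 */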
6658
/**
 * mem_cgroup_calculate_protection - check if memory consumption is in the
 * normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless!  It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
6667void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6668 struct mem_cgroup *memcg)
6669{
6670 unsigned long usage, parent_usage;
6671 struct mem_cgroup *parent;
6672
6673 if (mem_cgroup_disabled())
6674 return;
6675
6676 if (!root)
6677 root = root_mem_cgroup;
6678
6679
6680
6681
6682
6683
6684
6685
6686 if (memcg == root)
6687 return;
6688
6689 usage = page_counter_read(&memcg->memory);
6690 if (!usage)
6691 return;
6692
6693 parent = parent_mem_cgroup(memcg);
6694
6695 if (parent == root) {
6696 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6697 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6698 return;
6699 }
6700
6701 parent_usage = page_counter_read(&parent->memory);
6702
6703 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6704 READ_ONCE(memcg->memory.min),
6705 READ_ONCE(parent->memory.emin),
6706 atomic_long_read(&parent->memory.children_min_usage)));
6707
6708 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6709 READ_ONCE(memcg->memory.low),
6710 READ_ONCE(parent->memory.elow),
6711 atomic_long_read(&parent->memory.children_low_usage)));
6712}
6713
6714static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
6715 gfp_t gfp)
6716{
6717 long nr_pages = folio_nr_pages(folio);
6718 int ret;
6719
6720 ret = try_charge(memcg, gfp, nr_pages);
6721 if (ret)
6722 goto out;
6723
6724 css_get(&memcg->css);
6725 commit_charge(folio, memcg);
6726
6727 local_irq_disable();
6728 mem_cgroup_charge_statistics(memcg, nr_pages);
6729 memcg_check_events(memcg, folio_nid(folio));
6730 local_irq_enable();
6731out:
6732 return ret;
6733}
6734
6735int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
6736{
6737 struct mem_cgroup *memcg;
6738 int ret;
6739
6740 memcg = get_mem_cgroup_from_mm(mm);
6741 ret = charge_memcg(folio, memcg, gfp);
6742 css_put(&memcg->css);
6743
6744 return ret;
6745}
6746
/**
 * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp: reclaim mode
 * @entry: swap entry for which the page is allocated
 *
 * This function charges a page allocated for swapin.  Please call this
 * before adding the page to the swapcache.
 *
 * Returns 0 on success.  Otherwise, an error code is returned.
 */
6759int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
6760 gfp_t gfp, swp_entry_t entry)
6761{
6762 struct folio *folio = page_folio(page);
6763 struct mem_cgroup *memcg;
6764 unsigned short id;
6765 int ret;
6766
6767 if (mem_cgroup_disabled())
6768 return 0;
6769
6770 id = lookup_swap_cgroup_id(entry);
6771 rcu_read_lock();
6772 memcg = mem_cgroup_from_id(id);
6773 if (!memcg || !css_tryget_online(&memcg->css))
6774 memcg = get_mem_cgroup_from_mm(mm);
6775 rcu_read_unlock();
6776
6777 ret = charge_memcg(folio, memcg, gfp);
6778
6779 css_put(&memcg->css);
6780 return ret;
6781}
6782
/*
 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
 * @entry: swap entry for which the page is charged
 *
 * Call this function after successfully adding the charged page to the
 * swapcache.
 *
 * Note: This function assumes the page for which the swap slot is being
 * uncharged is an order-0 page.
 */
6792void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
6793{
	/*
	 * Cgroup1's unified memory+swap counter has been charged with the
	 * new swapcache page, finish the transfer by uncharging the swap
	 * slot.  The swap slot would also get uncharged when it dies, but
	 * it can stick around indefinitely and we'd count the page twice
	 * the entire time.
	 *
	 * Cgroup2 has separate resource counters for memory and swap,
	 * so this is a non-issue here.  Memory and swap charge lifetimes
	 * correspond 1:1 to page and swap slot lifetimes: we charge the
	 * page to memory here, and uncharge swap when the slot is freed.
	 */
6806 if (!mem_cgroup_disabled() && do_memsw_account()) {
6807
6808
6809
6810
6811
6812 mem_cgroup_uncharge_swap(entry, 1);
6813 }
6814}
6815
6816struct uncharge_gather {
6817 struct mem_cgroup *memcg;
6818 unsigned long nr_memory;
6819 unsigned long pgpgout;
6820 unsigned long nr_kmem;
6821 int nid;
6822};
6823
6824static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6825{
6826 memset(ug, 0, sizeof(*ug));
6827}
6828
6829static void uncharge_batch(const struct uncharge_gather *ug)
6830{
6831 unsigned long flags;
6832
6833 if (ug->nr_memory) {
6834 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
6835 if (do_memsw_account())
6836 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
6837 if (ug->nr_kmem)
6838 memcg_account_kmem(ug->memcg, -ug->nr_kmem);
6839 memcg_oom_recover(ug->memcg);
6840 }
6841
6842 local_irq_save(flags);
6843 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6844 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
6845 memcg_check_events(ug->memcg, ug->nid);
6846 local_irq_restore(flags);
6847
6848
6849 css_put(&ug->memcg->css);
6850}
6851
6852static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
6853{
6854 long nr_pages;
6855 struct mem_cgroup *memcg;
6856 struct obj_cgroup *objcg;
6857
6858 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
6859
6860
6861
6862
6863
6864
6865 if (folio_memcg_kmem(folio)) {
6866 objcg = __folio_objcg(folio);
6867
6868
6869
6870
6871 memcg = get_mem_cgroup_from_objcg(objcg);
6872 } else {
6873 memcg = __folio_memcg(folio);
6874 }
6875
6876 if (!memcg)
6877 return;
6878
6879 if (ug->memcg != memcg) {
6880 if (ug->memcg) {
6881 uncharge_batch(ug);
6882 uncharge_gather_clear(ug);
6883 }
6884 ug->memcg = memcg;
6885 ug->nid = folio_nid(folio);
6886
6887
6888 css_get(&memcg->css);
6889 }
6890
6891 nr_pages = folio_nr_pages(folio);
6892
6893 if (folio_memcg_kmem(folio)) {
6894 ug->nr_memory += nr_pages;
6895 ug->nr_kmem += nr_pages;
6896
6897 folio->memcg_data = 0;
6898 obj_cgroup_put(objcg);
6899 } else {
6900
6901 if (!mem_cgroup_is_root(memcg))
6902 ug->nr_memory += nr_pages;
6903 ug->pgpgout++;
6904
6905 folio->memcg_data = 0;
6906 }
6907
6908 css_put(&memcg->css);
6909}
6910
6911void __mem_cgroup_uncharge(struct folio *folio)
6912{
6913 struct uncharge_gather ug;
6914
6915
6916 if (!folio_memcg(folio))
6917 return;
6918
6919 uncharge_gather_clear(&ug);
6920 uncharge_folio(folio, &ug);
6921 uncharge_batch(&ug);
6922}
6923
/**
 * __mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with __mem_cgroup_charge().
 */
6931void __mem_cgroup_uncharge_list(struct list_head *page_list)
6932{
6933 struct uncharge_gather ug;
6934 struct folio *folio;
6935
6936 uncharge_gather_clear(&ug);
6937 list_for_each_entry(folio, page_list, lru)
6938 uncharge_folio(folio, &ug);
6939 if (ug.memcg)
6940 uncharge_batch(&ug);
6941}
6942
/**
 * mem_cgroup_migrate - Charge a folio's replacement.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Charge @new as a replacement folio for @old.  @old will be uncharged
 * upon free.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
6953void mem_cgroup_migrate(struct folio *old, struct folio *new)
6954{
6955 struct mem_cgroup *memcg;
6956 long nr_pages = folio_nr_pages(new);
6957 unsigned long flags;
6958
6959 VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
6960 VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
6961 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
6962 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
6963
6964 if (mem_cgroup_disabled())
6965 return;
6966
6967
6968 if (folio_memcg(new))
6969 return;
6970
6971 memcg = folio_memcg(old);
6972 VM_WARN_ON_ONCE_FOLIO(!memcg, old);
6973 if (!memcg)
6974 return;
6975
6976
6977 if (!mem_cgroup_is_root(memcg)) {
6978 page_counter_charge(&memcg->memory, nr_pages);
6979 if (do_memsw_account())
6980 page_counter_charge(&memcg->memsw, nr_pages);
6981 }
6982
6983 css_get(&memcg->css);
6984 commit_charge(new, memcg);
6985
6986 local_irq_save(flags);
6987 mem_cgroup_charge_statistics(memcg, nr_pages);
6988 memcg_check_events(memcg, folio_nid(new));
6989 local_irq_restore(flags);
6990}
6991
6992DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6993EXPORT_SYMBOL(memcg_sockets_enabled_key);
6994
6995void mem_cgroup_sk_alloc(struct sock *sk)
6996{
6997 struct mem_cgroup *memcg;
6998
6999 if (!mem_cgroup_sockets_enabled)
7000 return;
7001
7002
7003 if (!in_task())
7004 return;
7005
7006 rcu_read_lock();
7007 memcg = mem_cgroup_from_task(current);
7008 if (memcg == root_mem_cgroup)
7009 goto out;
7010 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7011 goto out;
7012 if (css_tryget(&memcg->css))
7013 sk->sk_memcg = memcg;
7014out:
7015 rcu_read_unlock();
7016}
7017
7018void mem_cgroup_sk_free(struct sock *sk)
7019{
7020 if (sk->sk_memcg)
7021 css_put(&sk->sk_memcg->css);
7022}
7023
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 * @gfp_mask: reclaim mode
 *
 * Charges @nr_pages to @memcg.  Returns %true if the charge fit within
 * @memcg's configured limit, %false if it doesn't.
 */
7033bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
7034 gfp_t gfp_mask)
7035{
7036 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7037 struct page_counter *fail;
7038
7039 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7040 memcg->tcpmem_pressure = 0;
7041 return true;
7042 }
7043 memcg->tcpmem_pressure = 1;
7044 if (gfp_mask & __GFP_NOFAIL) {
7045 page_counter_charge(&memcg->tcpmem, nr_pages);
7046 return true;
7047 }
7048 return false;
7049 }
7050
7051 if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
7052 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7053 return true;
7054 }
7055
7056 return false;
7057}
7058
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
7064void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7065{
7066 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7067 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7068 return;
7069 }
7070
7071 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7072
7073 refill_stock(memcg, nr_pages);
7074}
7075
7076static int __init cgroup_memory(char *s)
7077{
7078 char *token;
7079
7080 while ((token = strsep(&s, ",")) != NULL) {
7081 if (!*token)
7082 continue;
7083 if (!strcmp(token, "nosocket"))
7084 cgroup_memory_nosocket = true;
7085 if (!strcmp(token, "nokmem"))
7086 cgroup_memory_nokmem = true;
7087 }
7088 return 1;
7089}
7090__setup("cgroup.memory=", cgroup_memory);
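
/*
 * Illustrative boot-time usage (added example, not part of the original
 * source):
 *
 *     cgroup.memory=nosocket,nokmem
 *
 * disables socket memory accounting and kernel memory accounting
 * respectively; tokens not recognized by the loop above are silently
 * ignored.
 */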
7091
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from
 * this context because of lock dependencies (cgroup_lock -> cpu hotplug),
 * but basically everything else that doesn't depend on a specific
 * mem_cgroup structure should be initialized from here.
 */
7100static int __init mem_cgroup_init(void)
7101{
7102 int cpu, node;
7103
7104
7105
7106
7107
7108
7109
7110 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7111
7112 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7113 memcg_hotplug_cpu_dead);
7114
7115 for_each_possible_cpu(cpu)
7116 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7117 drain_local_stock);
7118
7119 for_each_node(node) {
7120 struct mem_cgroup_tree_per_node *rtpn;
7121
7122 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7123 node_online(node) ? node : NUMA_NO_NODE);
7124
7125 rtpn->rb_root = RB_ROOT;
7126 rtpn->rb_rightmost = NULL;
7127 spin_lock_init(&rtpn->lock);
7128 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7129 }
7130
7131 return 0;
7132}
7133subsys_initcall(mem_cgroup_init);
7134
7135#ifdef CONFIG_MEMCG_SWAP
7136static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7137{
7138 while (!refcount_inc_not_zero(&memcg->id.ref)) {
7139
7140
7141
7142
7143 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7144 VM_BUG_ON(1);
7145 break;
7146 }
7147 memcg = parent_mem_cgroup(memcg);
7148 if (!memcg)
7149 memcg = root_mem_cgroup;
7150 }
7151 return memcg;
7152}
7153
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 */
7161void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
7162{
7163 struct mem_cgroup *memcg, *swap_memcg;
7164 unsigned int nr_entries;
7165 unsigned short oldid;
7166
7167 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
7168 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
7169
7170 if (mem_cgroup_disabled())
7171 return;
7172
7173 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7174 return;
7175
7176 memcg = folio_memcg(folio);
7177
7178 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7179 if (!memcg)
7180 return;
7181
7182
7183
7184
7185
7186
7187 swap_memcg = mem_cgroup_id_get_online(memcg);
7188 nr_entries = folio_nr_pages(folio);
7189
7190 if (nr_entries > 1)
7191 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7192 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7193 nr_entries);
7194 VM_BUG_ON_FOLIO(oldid, folio);
7195 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7196
7197 folio->memcg_data = 0;
7198
7199 if (!mem_cgroup_is_root(memcg))
7200 page_counter_uncharge(&memcg->memory, nr_entries);
7201
7202 if (!cgroup_memory_noswap && memcg != swap_memcg) {
7203 if (!mem_cgroup_is_root(swap_memcg))
7204 page_counter_charge(&swap_memcg->memsw, nr_entries);
7205 page_counter_uncharge(&memcg->memsw, nr_entries);
7206 }
7207
7208
7209
7210
7211
7212
7213
7214 memcg_stats_lock();
7215 mem_cgroup_charge_statistics(memcg, -nr_entries);
7216 memcg_stats_unlock();
7217 memcg_check_events(memcg, folio_nid(folio));
7218
7219 css_put(&memcg->css);
7220}
7221
/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
 * @folio: folio being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @folio's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
7231int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
7232{
7233 unsigned int nr_pages = folio_nr_pages(folio);
7234 struct page_counter *counter;
7235 struct mem_cgroup *memcg;
7236 unsigned short oldid;
7237
7238 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7239 return 0;
7240
7241 memcg = folio_memcg(folio);
7242
7243 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
7244 if (!memcg)
7245 return 0;
7246
7247 if (!entry.val) {
7248 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7249 return 0;
7250 }
7251
7252 memcg = mem_cgroup_id_get_online(memcg);
7253
7254 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7255 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7256 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7257 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7258 mem_cgroup_id_put(memcg);
7259 return -ENOMEM;
7260 }
7261
7262
7263 if (nr_pages > 1)
7264 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7265 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7266 VM_BUG_ON_FOLIO(oldid, folio);
7267 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7268
7269 return 0;
7270}
7271
/**
 * __mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
7277void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7278{
7279 struct mem_cgroup *memcg;
7280 unsigned short id;
7281
7282 id = swap_cgroup_record(entry, 0, nr_pages);
7283 rcu_read_lock();
7284 memcg = mem_cgroup_from_id(id);
7285 if (memcg) {
7286 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7287 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7288 page_counter_uncharge(&memcg->swap, nr_pages);
7289 else
7290 page_counter_uncharge(&memcg->memsw, nr_pages);
7291 }
7292 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7293 mem_cgroup_id_put_many(memcg, nr_pages);
7294 }
7295 rcu_read_unlock();
7296}
7297
7298long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7299{
7300 long nr_swap_pages = get_nr_swap_pages();
7301
7302 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7303 return nr_swap_pages;
7304 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7305 nr_swap_pages = min_t(long, nr_swap_pages,
7306 READ_ONCE(memcg->swap.max) -
7307 page_counter_read(&memcg->swap));
7308 return nr_swap_pages;
7309}
7310
7311bool mem_cgroup_swap_full(struct page *page)
7312{
7313 struct mem_cgroup *memcg;
7314
7315 VM_BUG_ON_PAGE(!PageLocked(page), page);
7316
7317 if (vm_swap_full())
7318 return true;
7319 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7320 return false;
7321
7322 memcg = page_memcg(page);
7323 if (!memcg)
7324 return false;
7325
7326 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7327 unsigned long usage = page_counter_read(&memcg->swap);
7328
7329 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7330 usage * 2 >= READ_ONCE(memcg->swap.max))
7331 return true;
7332 }
7333
7334 return false;
7335}
7336
7337static int __init setup_swap_account(char *s)
7338{
7339 if (!strcmp(s, "1"))
7340 cgroup_memory_noswap = false;
7341 else if (!strcmp(s, "0"))
7342 cgroup_memory_noswap = true;
7343 return 1;
7344}
7345__setup("swapaccount=", setup_swap_account);
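
/*
 * Illustrative boot-time usage (added example, not part of the original
 * source): "swapaccount=0" disables swap accounting, "swapaccount=1"
 * enables it; any other value leaves the compiled-in default untouched.
 */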
7346
7347static u64 swap_current_read(struct cgroup_subsys_state *css,
7348 struct cftype *cft)
7349{
7350 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7351
7352 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7353}
7354
7355static int swap_high_show(struct seq_file *m, void *v)
7356{
7357 return seq_puts_memcg_tunable(m,
7358 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7359}
7360
7361static ssize_t swap_high_write(struct kernfs_open_file *of,
7362 char *buf, size_t nbytes, loff_t off)
7363{
7364 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7365 unsigned long high;
7366 int err;
7367
7368 buf = strstrip(buf);
7369 err = page_counter_memparse(buf, "max", &high);
7370 if (err)
7371 return err;
7372
7373 page_counter_set_high(&memcg->swap, high);
7374
7375 return nbytes;
7376}
7377
7378static int swap_max_show(struct seq_file *m, void *v)
7379{
7380 return seq_puts_memcg_tunable(m,
7381 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7382}
7383
7384static ssize_t swap_max_write(struct kernfs_open_file *of,
7385 char *buf, size_t nbytes, loff_t off)
7386{
7387 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7388 unsigned long max;
7389 int err;
7390
7391 buf = strstrip(buf);
7392 err = page_counter_memparse(buf, "max", &max);
7393 if (err)
7394 return err;
7395
7396 xchg(&memcg->swap.max, max);
7397
7398 return nbytes;
7399}
7400
7401static int swap_events_show(struct seq_file *m, void *v)
7402{
7403 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7404
7405 seq_printf(m, "high %lu\n",
7406 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7407 seq_printf(m, "max %lu\n",
7408 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7409 seq_printf(m, "fail %lu\n",
7410 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7411
7412 return 0;
7413}
7414
7415static struct cftype swap_files[] = {
7416 {
7417 .name = "swap.current",
7418 .flags = CFTYPE_NOT_ON_ROOT,
7419 .read_u64 = swap_current_read,
7420 },
7421 {
7422 .name = "swap.high",
7423 .flags = CFTYPE_NOT_ON_ROOT,
7424 .seq_show = swap_high_show,
7425 .write = swap_high_write,
7426 },
7427 {
7428 .name = "swap.max",
7429 .flags = CFTYPE_NOT_ON_ROOT,
7430 .seq_show = swap_max_show,
7431 .write = swap_max_write,
7432 },
7433 {
7434 .name = "swap.events",
7435 .flags = CFTYPE_NOT_ON_ROOT,
7436 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7437 .seq_show = swap_events_show,
7438 },
7439 { }
7440};
7441
7442static struct cftype memsw_files[] = {
7443 {
7444 .name = "memsw.usage_in_bytes",
7445 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7446 .read_u64 = mem_cgroup_read_u64,
7447 },
7448 {
7449 .name = "memsw.max_usage_in_bytes",
7450 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7451 .write = mem_cgroup_reset,
7452 .read_u64 = mem_cgroup_read_u64,
7453 },
7454 {
7455 .name = "memsw.limit_in_bytes",
7456 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7457 .write = mem_cgroup_write,
7458 .read_u64 = mem_cgroup_read_u64,
7459 },
7460 {
7461 .name = "memsw.failcnt",
7462 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7463 .write = mem_cgroup_reset,
7464 .read_u64 = mem_cgroup_read_u64,
7465 },
7466 { },
7467};
7468
7469#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either.  But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
7482bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
7483{
7484 struct mem_cgroup *memcg, *original_memcg;
7485 bool ret = true;
7486
7487 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7488 return true;
7489
7490 original_memcg = get_mem_cgroup_from_objcg(objcg);
7491 for (memcg = original_memcg; memcg != root_mem_cgroup;
7492 memcg = parent_mem_cgroup(memcg)) {
7493 unsigned long max = READ_ONCE(memcg->zswap_max);
7494 unsigned long pages;
7495
7496 if (max == PAGE_COUNTER_MAX)
7497 continue;
7498 if (max == 0) {
7499 ret = false;
7500 break;
7501 }
7502
7503 cgroup_rstat_flush(memcg->css.cgroup);
7504 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
7505 if (pages < max)
7506 continue;
7507 ret = false;
7508 break;
7509 }
7510 mem_cgroup_put(original_memcg);
7511 return ret;
7512}
7513
/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
7522void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
7523{
7524 struct mem_cgroup *memcg;
7525
7526 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7527 return;
7528
7529 VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
7530
7531
7532 if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
7533 VM_WARN_ON_ONCE(1);
7534
7535 rcu_read_lock();
7536 memcg = obj_cgroup_memcg(objcg);
7537 mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
7538 mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
7539 rcu_read_unlock();
7540}
7541
/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page-in.
 */
7549void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
7550{
7551 struct mem_cgroup *memcg;
7552
7553 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7554 return;
7555
7556 obj_cgroup_uncharge(objcg, size);
7557
7558 rcu_read_lock();
7559 memcg = obj_cgroup_memcg(objcg);
7560 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
7561 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
7562 rcu_read_unlock();
7563}
7564
7565static u64 zswap_current_read(struct cgroup_subsys_state *css,
7566 struct cftype *cft)
7567{
7568 cgroup_rstat_flush(css->cgroup);
7569 return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
7570}
7571
7572static int zswap_max_show(struct seq_file *m, void *v)
7573{
7574 return seq_puts_memcg_tunable(m,
7575 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
7576}
7577
7578static ssize_t zswap_max_write(struct kernfs_open_file *of,
7579 char *buf, size_t nbytes, loff_t off)
7580{
7581 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7582 unsigned long max;
7583 int err;
7584
7585 buf = strstrip(buf);
7586 err = page_counter_memparse(buf, "max", &max);
7587 if (err)
7588 return err;
7589
7590 xchg(&memcg->zswap_max, max);
7591
7592 return nbytes;
7593}
7594
7595static struct cftype zswap_files[] = {
7596 {
7597 .name = "zswap.current",
7598 .flags = CFTYPE_NOT_ON_ROOT,
7599 .read_u64 = zswap_current_read,
7600 },
7601 {
7602 .name = "zswap.max",
7603 .flags = CFTYPE_NOT_ON_ROOT,
7604 .seq_show = zswap_max_show,
7605 .write = zswap_max_write,
7606 },
7607 { }
7608};
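
/*
 * Illustrative usage of the zswap interface above (added example, not
 * part of the original source): writing 0 disables zswap for the cgroup,
 * because obj_cgroup_may_zswap() rejects stores once the limit is zero,
 * e.g.
 *
 *     echo 0 > /sys/fs/cgroup/<group>/memory.zswap.max
 */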
7609#endif
7610
/*
 * This must run from core_initcall() rather than subsys_initcall():
 * otherwise cgroup_memory_noswap could still be false when memcg is
 * disabled via the "cgroup_disable=memory" boot parameter, which could
 * trip mem_cgroup_get_nr_swap_pages() in corner cases.
 */
7618static int __init mem_cgroup_swap_init(void)
7619{
	/* No memory control -> no swap control */
7621 if (mem_cgroup_disabled())
7622 cgroup_memory_noswap = true;
7623
7624 if (cgroup_memory_noswap)
7625 return 0;
7626
7627 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7628 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7629#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
7630 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
7631#endif
7632 return 0;
7633}
7634core_initcall(mem_cgroup_swap_init);
7635
7636#endif
7637