/*
 * memcontrol.c - kernel memory controller (memcg)
 *
 * Accounts and limits the memory used by control groups: page charging
 * with per-cpu charge stocks, per-node soft-limit trees, OOM handling,
 * memory.high throttling, and kernel-memory and socket-memory accounting.
 */
25#include <linux/page_counter.h>
26#include <linux/memcontrol.h>
27#include <linux/cgroup.h>
28#include <linux/pagewalk.h>
29#include <linux/sched/mm.h>
30#include <linux/shmem_fs.h>
31#include <linux/hugetlb.h>
32#include <linux/pagemap.h>
33#include <linux/vm_event_item.h>
34#include <linux/smp.h>
35#include <linux/page-flags.h>
36#include <linux/backing-dev.h>
37#include <linux/bit_spinlock.h>
38#include <linux/rcupdate.h>
39#include <linux/limits.h>
40#include <linux/export.h>
41#include <linux/mutex.h>
42#include <linux/rbtree.h>
43#include <linux/slab.h>
44#include <linux/swap.h>
45#include <linux/swapops.h>
46#include <linux/spinlock.h>
47#include <linux/eventfd.h>
48#include <linux/poll.h>
49#include <linux/sort.h>
50#include <linux/fs.h>
51#include <linux/seq_file.h>
52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h>
54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h>
56#include <linux/oom.h>
57#include <linux/lockdep.h>
58#include <linux/file.h>
59#include <linux/tracehook.h>
60#include <linux/psi.h>
61#include <linux/seq_buf.h>
62#include "internal.h"
63#include <net/sock.h>
64#include <net/ip.h>
65#include "slab.h"
66
67#include <linux/uaccess.h>
68
69#include <trace/events/vmscan.h>
70
71struct cgroup_subsys memory_cgrp_subsys __read_mostly;
72EXPORT_SYMBOL(memory_cgrp_subsys);
73
74struct mem_cgroup *root_mem_cgroup __read_mostly;
75
76#define MEM_CGROUP_RECLAIM_RETRIES 5
77
78
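/* Socket memory accounting disabled? */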
79static bool cgroup_memory_nosocket;
80
81
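/* Kernel memory (slab etc.) accounting disabled? */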
82static bool cgroup_memory_nokmem;
83
84
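/* Swap space usage accounting disabled? */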
85#ifdef CONFIG_MEMCG_SWAP
86bool cgroup_memory_noswap __read_mostly;
87#else
88#define cgroup_memory_noswap 1
89#endif
90
91#ifdef CONFIG_CGROUP_WRITEBACK
92static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
93#endif
94
95
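/*
 * Combined memory+swap accounting is only used on the legacy (cgroup v1)
 * hierarchy, and only when swap accounting has not been disabled.
 */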
96static bool do_memsw_account(void)
97{
98 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
99}
100
101#define THRESHOLDS_EVENTS_TARGET 128
102#define SOFTLIMIT_EVENTS_TARGET 1024
103
104
105
106
107
108
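/*
 * Per-node rb-trees of memcgs that exceed their soft limit, ordered by
 * the amount of excess; the rightmost node is the worst offender.
 */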
109struct mem_cgroup_tree_per_node {
110 struct rb_root rb_root;
111 struct rb_node *rb_rightmost;
112 spinlock_t lock;
113};
114
115struct mem_cgroup_tree {
116 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
117};
118
119static struct mem_cgroup_tree soft_limit_tree __read_mostly;
120
121
122struct mem_cgroup_eventfd_list {
123 struct list_head list;
124 struct eventfd_ctx *eventfd;
125};
126
127
128
129
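/*
 * cgroup v1 eventfd-based event registration: ties an eventfd to a memcg
 * event source and tears it down again when the eventfd goes away.
 */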
130struct mem_cgroup_event {
131
132
133
134 struct mem_cgroup *memcg;
135
136
137
138 struct eventfd_ctx *eventfd;
139
140
141
142 struct list_head list;
143
144
145
146
147
148 int (*register_event)(struct mem_cgroup *memcg,
149 struct eventfd_ctx *eventfd, const char *args);
150
151
152
153
154
155 void (*unregister_event)(struct mem_cgroup *memcg,
156 struct eventfd_ctx *eventfd);
157
158
159
160
161 poll_table pt;
162 wait_queue_head_t *wqh;
163 wait_queue_entry_t wait;
164 struct work_struct remove;
165};
166
167static void mem_cgroup_threshold(struct mem_cgroup *memcg);
168static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
169
170
171
172
173
174#define MOVE_ANON 0x1U
175#define MOVE_FILE 0x2U
176#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
177
178
179static struct move_charge_struct {
180 spinlock_t lock;
181 struct mm_struct *mm;
182 struct mem_cgroup *from;
183 struct mem_cgroup *to;
184 unsigned long flags;
185 unsigned long precharge;
186 unsigned long moved_charge;
187 unsigned long moved_swap;
188 struct task_struct *moving_task;
189 wait_queue_head_t waitq;
190} mc = {
191 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
192 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
193};
194
195
196
197
198
199#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
200#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
201
202enum charge_type {
203 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
204 MEM_CGROUP_CHARGE_TYPE_ANON,
205 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
206 MEM_CGROUP_CHARGE_TYPE_DROP,
207 NR_CHARGE_TYPE,
208};
209
210
211enum res_type {
212 _MEM,
213 _MEMSWAP,
214 _OOM_TYPE,
215 _KMEM,
216 _TCP,
217};
218
219#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
220#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
221#define MEMFILE_ATTR(val) ((val) & 0xffff)
222
223#define OOM_CONTROL (0)
224
225
226
227
228
229
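/*
 * Iterate over the memcg hierarchy below @root (or the whole tree).
 * A caller that breaks out early must call mem_cgroup_iter_break().
 */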
230#define for_each_mem_cgroup_tree(iter, root) \
231 for (iter = mem_cgroup_iter(root, NULL, NULL); \
232 iter != NULL; \
233 iter = mem_cgroup_iter(root, iter, NULL))
234
235#define for_each_mem_cgroup(iter) \
236 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
237 iter != NULL; \
238 iter = mem_cgroup_iter(NULL, iter, NULL))
239
240static inline bool should_force_charge(void)
241{
242 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
243 (current->flags & PF_EXITING);
244}
245
246
247struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
248{
249 if (!memcg)
250 memcg = root_mem_cgroup;
251 return &memcg->vmpressure;
252}
253
254struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
255{
256 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
257}
258
259#ifdef CONFIG_MEMCG_KMEM
260
261
262
263
264
265
266
267
268
269
270
271static DEFINE_IDA(memcg_cache_ida);
272int memcg_nr_cache_ids;
273
274
275static DECLARE_RWSEM(memcg_cache_ids_sem);
276
277void memcg_get_cache_ids(void)
278{
279 down_read(&memcg_cache_ids_sem);
280}
281
282void memcg_put_cache_ids(void)
283{
284 up_read(&memcg_cache_ids_sem);
285}
286
287
288
289
290
291
292
293
294
295
296
297
298
299#define MEMCG_CACHES_MIN_SIZE 4
300#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
301
302
303
304
305
306
307
308DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
309EXPORT_SYMBOL(memcg_kmem_enabled_key);
310
311struct workqueue_struct *memcg_kmem_cache_wq;
312#endif
313
314static int memcg_shrinker_map_size;
315static DEFINE_MUTEX(memcg_shrinker_map_mutex);
316
317static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
318{
319 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
320}
321
322static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
323 int size, int old_size)
324{
325 struct memcg_shrinker_map *new, *old;
326 int nid;
327
328 lockdep_assert_held(&memcg_shrinker_map_mutex);
329
330 for_each_node(nid) {
331 old = rcu_dereference_protected(
332 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
333
334 if (!old)
335 return 0;
336
337 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
338 if (!new)
339 return -ENOMEM;
340
341
342 memset(new->map, (int)0xff, old_size);
343 memset((void *)new->map + old_size, 0, size - old_size);
344
345 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
346 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
347 }
348
349 return 0;
350}
351
352static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
353{
354 struct mem_cgroup_per_node *pn;
355 struct memcg_shrinker_map *map;
356 int nid;
357
358 if (mem_cgroup_is_root(memcg))
359 return;
360
361 for_each_node(nid) {
362 pn = mem_cgroup_nodeinfo(memcg, nid);
363 map = rcu_dereference_protected(pn->shrinker_map, true);
364 if (map)
365 kvfree(map);
366 rcu_assign_pointer(pn->shrinker_map, NULL);
367 }
368}
369
370static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
371{
372 struct memcg_shrinker_map *map;
373 int nid, size, ret = 0;
374
375 if (mem_cgroup_is_root(memcg))
376 return 0;
377
378 mutex_lock(&memcg_shrinker_map_mutex);
379 size = memcg_shrinker_map_size;
380 for_each_node(nid) {
381 map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
382 if (!map) {
383 memcg_free_shrinker_maps(memcg);
384 ret = -ENOMEM;
385 break;
386 }
387 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
388 }
389 mutex_unlock(&memcg_shrinker_map_mutex);
390
391 return ret;
392}
393
394int memcg_expand_shrinker_maps(int new_id)
395{
396 int size, old_size, ret = 0;
397 struct mem_cgroup *memcg;
398
399 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
400 old_size = memcg_shrinker_map_size;
401 if (size <= old_size)
402 return 0;
403
404 mutex_lock(&memcg_shrinker_map_mutex);
405 if (!root_mem_cgroup)
406 goto unlock;
407
408 for_each_mem_cgroup(memcg) {
409 if (mem_cgroup_is_root(memcg))
410 continue;
411 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
412 if (ret) {
413 mem_cgroup_iter_break(NULL, memcg);
414 goto unlock;
415 }
416 }
417unlock:
418 if (!ret)
419 memcg_shrinker_map_size = size;
420 mutex_unlock(&memcg_shrinker_map_mutex);
421 return ret;
422}
423
424void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
425{
426 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
427 struct memcg_shrinker_map *map;
428
429 rcu_read_lock();
430 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
431
432 smp_mb__before_atomic();
433 set_bit(shrinker_id, map->map);
434 rcu_read_unlock();
435 }
436}
437
438
439
440
441
442
443
444
445
446
447
448
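/*
 * mem_cgroup_css_from_page - css of the memcg associated with a page.
 * On cgroup v1, or for uncharged pages, the root css is returned.
 */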
449struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
450{
451 struct mem_cgroup *memcg;
452
453 memcg = page->mem_cgroup;
454
455 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
456 memcg = root_mem_cgroup;
457
458 return &memcg->css;
459}
460
461
462
463
464
465
466
467
468
469
470
471
472
473
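/*
 * page_cgroup_ino - return the inode number of the cgroup of the closest
 * online ancestor of the memcg the page is charged to, or 0 if none.
 */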
474ino_t page_cgroup_ino(struct page *page)
475{
476 struct mem_cgroup *memcg;
477 unsigned long ino = 0;
478
479 rcu_read_lock();
480 if (PageSlab(page) && !PageTail(page))
481 memcg = memcg_from_slab_page(page);
482 else
483 memcg = READ_ONCE(page->mem_cgroup);
484 while (memcg && !(memcg->css.flags & CSS_ONLINE))
485 memcg = parent_mem_cgroup(memcg);
486 if (memcg)
487 ino = cgroup_ino(memcg->css.cgroup);
488 rcu_read_unlock();
489 return ino;
490}
491
492static struct mem_cgroup_per_node *
493mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
494{
495 int nid = page_to_nid(page);
496
497 return memcg->nodeinfo[nid];
498}
499
500static struct mem_cgroup_tree_per_node *
501soft_limit_tree_node(int nid)
502{
503 return soft_limit_tree.rb_tree_per_node[nid];
504}
505
506static struct mem_cgroup_tree_per_node *
507soft_limit_tree_from_page(struct page *page)
508{
509 int nid = page_to_nid(page);
510
511 return soft_limit_tree.rb_tree_per_node[nid];
512}
513
514static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
515 struct mem_cgroup_tree_per_node *mctz,
516 unsigned long new_usage_in_excess)
517{
518 struct rb_node **p = &mctz->rb_root.rb_node;
519 struct rb_node *parent = NULL;
520 struct mem_cgroup_per_node *mz_node;
521 bool rightmost = true;
522
523 if (mz->on_tree)
524 return;
525
526 mz->usage_in_excess = new_usage_in_excess;
527 if (!mz->usage_in_excess)
528 return;
529 while (*p) {
530 parent = *p;
531 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
532 tree_node);
533 if (mz->usage_in_excess < mz_node->usage_in_excess) {
534 p = &(*p)->rb_left;
535 rightmost = false;
536 }
537
538
539
540
541
542 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
543 p = &(*p)->rb_right;
544 }
545
546 if (rightmost)
547 mctz->rb_rightmost = &mz->tree_node;
548
549 rb_link_node(&mz->tree_node, parent, p);
550 rb_insert_color(&mz->tree_node, &mctz->rb_root);
551 mz->on_tree = true;
552}
553
554static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
555 struct mem_cgroup_tree_per_node *mctz)
556{
557 if (!mz->on_tree)
558 return;
559
560 if (&mz->tree_node == mctz->rb_rightmost)
561 mctz->rb_rightmost = rb_prev(&mz->tree_node);
562
563 rb_erase(&mz->tree_node, &mctz->rb_root);
564 mz->on_tree = false;
565}
566
567static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
568 struct mem_cgroup_tree_per_node *mctz)
569{
570 unsigned long flags;
571
572 spin_lock_irqsave(&mctz->lock, flags);
573 __mem_cgroup_remove_exceeded(mz, mctz);
574 spin_unlock_irqrestore(&mctz->lock, flags);
575}
576
577static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
578{
579 unsigned long nr_pages = page_counter_read(&memcg->memory);
580 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
581 unsigned long excess = 0;
582
583 if (nr_pages > soft_limit)
584 excess = nr_pages - soft_limit;
585
586 return excess;
587}
588
589static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
590{
591 unsigned long excess;
592 struct mem_cgroup_per_node *mz;
593 struct mem_cgroup_tree_per_node *mctz;
594
595 mctz = soft_limit_tree_from_page(page);
596 if (!mctz)
597 return;
598
599
600
601
602 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
603 mz = mem_cgroup_page_nodeinfo(memcg, page);
604 excess = soft_limit_excess(memcg);
605
606
607
608
609 if (excess || mz->on_tree) {
610 unsigned long flags;
611
612 spin_lock_irqsave(&mctz->lock, flags);
613
614 if (mz->on_tree)
615 __mem_cgroup_remove_exceeded(mz, mctz);
616
617
618
619
620 __mem_cgroup_insert_exceeded(mz, mctz, excess);
621 spin_unlock_irqrestore(&mctz->lock, flags);
622 }
623 }
624}
625
626static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
627{
628 struct mem_cgroup_tree_per_node *mctz;
629 struct mem_cgroup_per_node *mz;
630 int nid;
631
632 for_each_node(nid) {
633 mz = mem_cgroup_nodeinfo(memcg, nid);
634 mctz = soft_limit_tree_node(nid);
635 if (mctz)
636 mem_cgroup_remove_exceeded(mz, mctz);
637 }
638}
639
640static struct mem_cgroup_per_node *
641__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
642{
643 struct mem_cgroup_per_node *mz;
644
645retry:
646 mz = NULL;
647 if (!mctz->rb_rightmost)
648 goto done;
649
650 mz = rb_entry(mctz->rb_rightmost,
651 struct mem_cgroup_per_node, tree_node);
652
653
654
655
656
657 __mem_cgroup_remove_exceeded(mz, mctz);
658 if (!soft_limit_excess(mz->memcg) ||
659 !css_tryget(&mz->memcg->css))
660 goto retry;
661done:
662 return mz;
663}
664
665static struct mem_cgroup_per_node *
666mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
667{
668 struct mem_cgroup_per_node *mz;
669
670 spin_lock_irq(&mctz->lock);
671 mz = __mem_cgroup_largest_soft_limit_node(mctz);
672 spin_unlock_irq(&mctz->lock);
673 return mz;
674}
675
676
677
678
679
680
681
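/*
 * __mod_memcg_state - update a memcg statistic.
 *
 * Deltas are batched per-cpu and only flushed up the hierarchy into the
 * atomic counters once they exceed MEMCG_CHARGE_BATCH.
 */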
682void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
683{
684 long x;
685
686 if (mem_cgroup_disabled())
687 return;
688
689 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
690 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
691 struct mem_cgroup *mi;
692
693
694
695
696
697 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
698 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
699 atomic_long_add(x, &mi->vmstats[idx]);
700 x = 0;
701 }
702 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
703}
704
705static struct mem_cgroup_per_node *
706parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
707{
708 struct mem_cgroup *parent;
709
710 parent = parent_mem_cgroup(pn->memcg);
711 if (!parent)
712 return NULL;
713 return mem_cgroup_nodeinfo(parent, nid);
714}
715
716
717
718
719
720
721
722
723
724
725
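/*
 * __mod_lruvec_state - update lruvec (per-memcg, per-node) state, as well
 * as the node-wide counter for the same item.
 */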
726void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
727 int val)
728{
729 pg_data_t *pgdat = lruvec_pgdat(lruvec);
730 struct mem_cgroup_per_node *pn;
731 struct mem_cgroup *memcg;
732 long x;
733
734
735 __mod_node_page_state(pgdat, idx, val);
736
737 if (mem_cgroup_disabled())
738 return;
739
740 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
741 memcg = pn->memcg;
742
743
744 __mod_memcg_state(memcg, idx, val);
745
746
747 __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
748
749 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
750 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
751 struct mem_cgroup_per_node *pi;
752
753 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
754 atomic_long_add(x, &pi->lruvec_stat[idx]);
755 x = 0;
756 }
757 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
758}
759
760void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
761{
762 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
763 struct mem_cgroup *memcg;
764 struct lruvec *lruvec;
765
766 rcu_read_lock();
767 memcg = mem_cgroup_from_obj(p);
768
769
770 if (!memcg || memcg == root_mem_cgroup) {
771 __mod_node_page_state(pgdat, idx, val);
772 } else {
773 lruvec = mem_cgroup_lruvec(memcg, pgdat);
774 __mod_lruvec_state(lruvec, idx, val);
775 }
776 rcu_read_unlock();
777}
778
779void mod_memcg_obj_state(void *p, int idx, int val)
780{
781 struct mem_cgroup *memcg;
782
783 rcu_read_lock();
784 memcg = mem_cgroup_from_obj(p);
785 if (memcg)
786 mod_memcg_state(memcg, idx, val);
787 rcu_read_unlock();
788}
789
790
791
792
793
794
795
796void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
797 unsigned long count)
798{
799 unsigned long x;
800
801 if (mem_cgroup_disabled())
802 return;
803
804 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
805 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
806 struct mem_cgroup *mi;
807
808
809
810
811
812 __this_cpu_add(memcg->vmstats_local->events[idx], x);
813 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
814 atomic_long_add(x, &mi->vmevents[idx]);
815 x = 0;
816 }
817 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
818}
819
820static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
821{
822 return atomic_long_read(&memcg->vmevents[event]);
823}
824
825static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
826{
827 long x = 0;
828 int cpu;
829
830 for_each_possible_cpu(cpu)
831 x += per_cpu(memcg->vmstats_local->events[event], cpu);
832 return x;
833}
834
835static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
836 struct page *page,
837 int nr_pages)
838{
839
840 if (nr_pages > 0)
841 __count_memcg_events(memcg, PGPGIN, 1);
842 else {
843 __count_memcg_events(memcg, PGPGOUT, 1);
844 nr_pages = -nr_pages;
845 }
846
847 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
848}
849
850static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
851 enum mem_cgroup_events_target target)
852{
853 unsigned long val, next;
854
855 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
856 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
857
858 if ((long)(next - val) < 0) {
859 switch (target) {
860 case MEM_CGROUP_TARGET_THRESH:
861 next = val + THRESHOLDS_EVENTS_TARGET;
862 break;
863 case MEM_CGROUP_TARGET_SOFTLIMIT:
864 next = val + SOFTLIMIT_EVENTS_TARGET;
865 break;
866 default:
867 break;
868 }
869 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
870 return true;
871 }
872 return false;
873}
874
875
876
877
878
879static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
880{
881
882 if (unlikely(mem_cgroup_event_ratelimit(memcg,
883 MEM_CGROUP_TARGET_THRESH))) {
884 bool do_softlimit;
885
886 do_softlimit = mem_cgroup_event_ratelimit(memcg,
887 MEM_CGROUP_TARGET_SOFTLIMIT);
888 mem_cgroup_threshold(memcg);
889 if (unlikely(do_softlimit))
890 mem_cgroup_update_tree(memcg, page);
891 }
892}
893
894struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
895{
896
897
898
899
900
901 if (unlikely(!p))
902 return NULL;
903
904 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
905}
906EXPORT_SYMBOL(mem_cgroup_from_task);
907
908
909
910
911
912
913
914
915
916struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
917{
918 struct mem_cgroup *memcg;
919
920 if (mem_cgroup_disabled())
921 return NULL;
922
923 rcu_read_lock();
924 do {
925
926
927
928
929
930 if (unlikely(!mm))
931 memcg = root_mem_cgroup;
932 else {
933 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
934 if (unlikely(!memcg))
935 memcg = root_mem_cgroup;
936 }
937 } while (!css_tryget(&memcg->css));
938 rcu_read_unlock();
939 return memcg;
940}
941EXPORT_SYMBOL(get_mem_cgroup_from_mm);
942
943
944
945
946
947
948
949
950struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
951{
952 struct mem_cgroup *memcg = page->mem_cgroup;
953
954 if (mem_cgroup_disabled())
955 return NULL;
956
957 rcu_read_lock();
958
959 if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
960 memcg = root_mem_cgroup;
961 rcu_read_unlock();
962 return memcg;
963}
964EXPORT_SYMBOL(get_mem_cgroup_from_page);
965
966
967
968
969static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
970{
971 if (unlikely(current->active_memcg)) {
972 struct mem_cgroup *memcg;
973
974 rcu_read_lock();
975
		if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
977 memcg = root_mem_cgroup;
978 else
979 memcg = current->active_memcg;
980 rcu_read_unlock();
981 return memcg;
982 }
983 return get_mem_cgroup_from_mm(current->mm);
984}
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
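/*
 * mem_cgroup_iter - pre-order walk of the memcg hierarchy under @root.
 *
 * With @reclaim, the walk continues from a per-node cursor shared between
 * concurrent reclaimers.  The caller must either walk the hierarchy to
 * completion or call mem_cgroup_iter_break() to drop the reference taken
 * on the last returned memcg.
 */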
1003struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1004 struct mem_cgroup *prev,
1005 struct mem_cgroup_reclaim_cookie *reclaim)
1006{
1007 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1008 struct cgroup_subsys_state *css = NULL;
1009 struct mem_cgroup *memcg = NULL;
1010 struct mem_cgroup *pos = NULL;
1011
1012 if (mem_cgroup_disabled())
1013 return NULL;
1014
1015 if (!root)
1016 root = root_mem_cgroup;
1017
1018 if (prev && !reclaim)
1019 pos = prev;
1020
1021 if (!root->use_hierarchy && root != root_mem_cgroup) {
1022 if (prev)
1023 goto out;
1024 return root;
1025 }
1026
1027 rcu_read_lock();
1028
1029 if (reclaim) {
1030 struct mem_cgroup_per_node *mz;
1031
1032 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1033 iter = &mz->iter;
1034
1035 if (prev && reclaim->generation != iter->generation)
1036 goto out_unlock;
1037
1038 while (1) {
1039 pos = READ_ONCE(iter->position);
1040 if (!pos || css_tryget(&pos->css))
1041 break;
1042
1043
1044
1045
1046
1047
1048
1049
1050 (void)cmpxchg(&iter->position, pos, NULL);
1051 }
1052 }
1053
1054 if (pos)
1055 css = &pos->css;
1056
1057 for (;;) {
1058 css = css_next_descendant_pre(css, &root->css);
1059 if (!css) {
1060
1061
1062
1063
1064
1065
1066 if (!prev)
1067 continue;
1068 break;
1069 }
1070
1071
1072
1073
1074
1075
1076 memcg = mem_cgroup_from_css(css);
1077
1078 if (css == &root->css)
1079 break;
1080
1081 if (css_tryget(css))
1082 break;
1083
1084 memcg = NULL;
1085 }
1086
1087 if (reclaim) {
1088
1089
1090
1091
1092
1093 (void)cmpxchg(&iter->position, pos, memcg);
1094
1095 if (pos)
1096 css_put(&pos->css);
1097
1098 if (!memcg)
1099 iter->generation++;
1100 else if (!prev)
1101 reclaim->generation = iter->generation;
1102 }
1103
1104out_unlock:
1105 rcu_read_unlock();
1106out:
1107 if (prev && prev != root)
1108 css_put(&prev->css);
1109
1110 return memcg;
1111}
1112
1113
1114
1115
1116
1117
1118void mem_cgroup_iter_break(struct mem_cgroup *root,
1119 struct mem_cgroup *prev)
1120{
1121 if (!root)
1122 root = root_mem_cgroup;
1123 if (prev && prev != root)
1124 css_put(&prev->css);
1125}
1126
1127static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1128 struct mem_cgroup *dead_memcg)
1129{
1130 struct mem_cgroup_reclaim_iter *iter;
1131 struct mem_cgroup_per_node *mz;
1132 int nid;
1133
1134 for_each_node(nid) {
1135 mz = mem_cgroup_nodeinfo(from, nid);
1136 iter = &mz->iter;
1137 cmpxchg(&iter->position, dead_memcg, NULL);
1138 }
1139}
1140
1141static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1142{
1143 struct mem_cgroup *memcg = dead_memcg;
1144 struct mem_cgroup *last;
1145
1146 do {
1147 __invalidate_reclaim_iterators(memcg, dead_memcg);
1148 last = memcg;
1149 } while ((memcg = parent_mem_cgroup(memcg)));
1150
1151
1152
1153
1154
1155
1156
1157 if (last != root_mem_cgroup)
1158 __invalidate_reclaim_iterators(root_mem_cgroup,
1159 dead_memcg);
1160}
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
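/*
 * mem_cgroup_scan_tasks - invoke @fn on every task in @memcg and its
 * descendants; the walk stops as soon as @fn returns non-zero, and that
 * value is returned to the caller.
 */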
1175int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1176 int (*fn)(struct task_struct *, void *), void *arg)
1177{
1178 struct mem_cgroup *iter;
1179 int ret = 0;
1180
1181 BUG_ON(memcg == root_mem_cgroup);
1182
1183 for_each_mem_cgroup_tree(iter, memcg) {
1184 struct css_task_iter it;
1185 struct task_struct *task;
1186
1187 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1188 while (!ret && (task = css_task_iter_next(&it)))
1189 ret = fn(task, arg);
1190 css_task_iter_end(&it);
1191 if (ret) {
1192 mem_cgroup_iter_break(memcg, iter);
1193 break;
1194 }
1195 }
1196 return ret;
1197}
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1208{
1209 struct mem_cgroup_per_node *mz;
1210 struct mem_cgroup *memcg;
1211 struct lruvec *lruvec;
1212
1213 if (mem_cgroup_disabled()) {
1214 lruvec = &pgdat->__lruvec;
1215 goto out;
1216 }
1217
1218 memcg = page->mem_cgroup;
1219
1220
1221
1222
1223 if (!memcg)
1224 memcg = root_mem_cgroup;
1225
1226 mz = mem_cgroup_page_nodeinfo(memcg, page);
1227 lruvec = &mz->lruvec;
1228out:
1229
1230
1231
1232
1233
1234 if (unlikely(lruvec->pgdat != pgdat))
1235 lruvec->pgdat = pgdat;
1236 return lruvec;
1237}
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1251 int zid, int nr_pages)
1252{
1253 struct mem_cgroup_per_node *mz;
1254 unsigned long *lru_size;
1255 long size;
1256
1257 if (mem_cgroup_disabled())
1258 return;
1259
1260 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1261 lru_size = &mz->lru_zone_size[zid][lru];
1262
1263 if (nr_pages < 0)
1264 *lru_size += nr_pages;
1265
1266 size = *lru_size;
1267 if (WARN_ONCE(size < 0,
1268 "%s(%p, %d, %d): lru_size %ld\n",
1269 __func__, lruvec, lru, nr_pages, size)) {
1270 VM_BUG_ON(1);
1271 *lru_size = 0;
1272 }
1273
1274 if (nr_pages > 0)
1275 *lru_size += nr_pages;
1276}
1277
1278
1279
1280
1281
1282
1283
1284
1285static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1286{
1287 unsigned long margin = 0;
1288 unsigned long count;
1289 unsigned long limit;
1290
1291 count = page_counter_read(&memcg->memory);
1292 limit = READ_ONCE(memcg->memory.max);
1293 if (count < limit)
1294 margin = limit - count;
1295
1296 if (do_memsw_account()) {
1297 count = page_counter_read(&memcg->memsw);
1298 limit = READ_ONCE(memcg->memsw.max);
1299 if (count < limit)
1300 margin = min(margin, limit - count);
1301 else
1302 margin = 0;
1303 }
1304
1305 return margin;
1306}
1307
1308
1309
1310
1311
1312
1313
1314
1315static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1316{
1317 struct mem_cgroup *from;
1318 struct mem_cgroup *to;
1319 bool ret = false;
1320
1321
1322
1323
1324 spin_lock(&mc.lock);
1325 from = mc.from;
1326 to = mc.to;
1327 if (!from)
1328 goto unlock;
1329
1330 ret = mem_cgroup_is_descendant(from, memcg) ||
1331 mem_cgroup_is_descendant(to, memcg);
1332unlock:
1333 spin_unlock(&mc.lock);
1334 return ret;
1335}
1336
1337static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1338{
1339 if (mc.moving_task && current != mc.moving_task) {
1340 if (mem_cgroup_under_move(memcg)) {
1341 DEFINE_WAIT(wait);
1342 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1343
1344 if (mc.moving_task)
1345 schedule();
1346 finish_wait(&mc.waitq, &wait);
1347 return true;
1348 }
1349 }
1350 return false;
1351}
1352
1353static char *memory_stat_format(struct mem_cgroup *memcg)
1354{
1355 struct seq_buf s;
1356 int i;
1357
1358 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1359 if (!s.buffer)
1360 return NULL;
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373 seq_buf_printf(&s, "anon %llu\n",
1374 (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
1375 PAGE_SIZE);
1376 seq_buf_printf(&s, "file %llu\n",
1377 (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
1378 PAGE_SIZE);
1379 seq_buf_printf(&s, "kernel_stack %llu\n",
1380 (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1381 1024);
1382 seq_buf_printf(&s, "slab %llu\n",
1383 (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1384 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1385 PAGE_SIZE);
1386 seq_buf_printf(&s, "sock %llu\n",
1387 (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1388 PAGE_SIZE);
1389
1390 seq_buf_printf(&s, "shmem %llu\n",
1391 (u64)memcg_page_state(memcg, NR_SHMEM) *
1392 PAGE_SIZE);
1393 seq_buf_printf(&s, "file_mapped %llu\n",
1394 (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1395 PAGE_SIZE);
1396 seq_buf_printf(&s, "file_dirty %llu\n",
1397 (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1398 PAGE_SIZE);
1399 seq_buf_printf(&s, "file_writeback %llu\n",
1400 (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1401 PAGE_SIZE);
1402
1403#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1404 seq_buf_printf(&s, "anon_thp %llu\n",
1405 (u64)memcg_page_state(memcg, NR_ANON_THPS) *
1406 HPAGE_PMD_SIZE);
1407#endif
1408
1409 for (i = 0; i < NR_LRU_LISTS; i++)
1410 seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
1411 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1412 PAGE_SIZE);
1413
1414 seq_buf_printf(&s, "slab_reclaimable %llu\n",
1415 (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1416 PAGE_SIZE);
1417 seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1418 (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1419 PAGE_SIZE);
1420
1421
1422
1423 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1424 memcg_events(memcg, PGFAULT));
1425 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1426 memcg_events(memcg, PGMAJFAULT));
1427
1428 seq_buf_printf(&s, "workingset_refault %lu\n",
1429 memcg_page_state(memcg, WORKINGSET_REFAULT));
1430 seq_buf_printf(&s, "workingset_activate %lu\n",
1431 memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1432 seq_buf_printf(&s, "workingset_restore %lu\n",
1433 memcg_page_state(memcg, WORKINGSET_RESTORE));
1434 seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1435 memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1436
1437 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1438 memcg_events(memcg, PGREFILL));
1439 seq_buf_printf(&s, "pgscan %lu\n",
1440 memcg_events(memcg, PGSCAN_KSWAPD) +
1441 memcg_events(memcg, PGSCAN_DIRECT));
1442 seq_buf_printf(&s, "pgsteal %lu\n",
1443 memcg_events(memcg, PGSTEAL_KSWAPD) +
1444 memcg_events(memcg, PGSTEAL_DIRECT));
1445 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1446 memcg_events(memcg, PGACTIVATE));
1447 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1448 memcg_events(memcg, PGDEACTIVATE));
1449 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1450 memcg_events(memcg, PGLAZYFREE));
1451 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1452 memcg_events(memcg, PGLAZYFREED));
1453
1454#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1455 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1456 memcg_events(memcg, THP_FAULT_ALLOC));
1457 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1458 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1459#endif
1460
1461
1462 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1463
1464 return s.buffer;
1465}
1466
1467#define K(x) ((x) << (PAGE_SHIFT-10))
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1478{
1479 rcu_read_lock();
1480
1481 if (memcg) {
1482 pr_cont(",oom_memcg=");
1483 pr_cont_cgroup_path(memcg->css.cgroup);
1484 } else
1485 pr_cont(",global_oom");
1486 if (p) {
1487 pr_cont(",task_memcg=");
1488 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1489 }
1490 rcu_read_unlock();
1491}
1492
1493
1494
1495
1496
1497
1498void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1499{
1500 char *buf;
1501
1502 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1503 K((u64)page_counter_read(&memcg->memory)),
1504 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1505 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1506 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1507 K((u64)page_counter_read(&memcg->swap)),
1508 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1509 else {
1510 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1511 K((u64)page_counter_read(&memcg->memsw)),
1512 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1513 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1514 K((u64)page_counter_read(&memcg->kmem)),
1515 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1516 }
1517
1518 pr_info("Memory cgroup stats for ");
1519 pr_cont_cgroup_path(memcg->css.cgroup);
1520 pr_cont(":");
1521 buf = memory_stat_format(memcg);
1522 if (!buf)
1523 return;
1524 pr_info("%s", buf);
1525 kfree(buf);
1526}
1527
1528
1529
1530
1531unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1532{
1533 unsigned long max;
1534
1535 max = READ_ONCE(memcg->memory.max);
1536 if (mem_cgroup_swappiness(memcg)) {
1537 unsigned long memsw_max;
1538 unsigned long swap_max;
1539
1540 memsw_max = memcg->memsw.max;
1541 swap_max = READ_ONCE(memcg->swap.max);
1542 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1543 max = min(max + swap_max, memsw_max);
1544 }
1545 return max;
1546}
1547
1548unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1549{
1550 return page_counter_read(&memcg->memory);
1551}
1552
1553static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1554 int order)
1555{
1556 struct oom_control oc = {
1557 .zonelist = NULL,
1558 .nodemask = NULL,
1559 .memcg = memcg,
1560 .gfp_mask = gfp_mask,
1561 .order = order,
1562 };
1563 bool ret;
1564
1565 if (mutex_lock_killable(&oom_lock))
1566 return true;
1567
1568
1569
1570
1571 ret = should_force_charge() || out_of_memory(&oc);
1572 mutex_unlock(&oom_lock);
1573 return ret;
1574}
1575
1576static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1577 pg_data_t *pgdat,
1578 gfp_t gfp_mask,
1579 unsigned long *total_scanned)
1580{
1581 struct mem_cgroup *victim = NULL;
1582 int total = 0;
1583 int loop = 0;
1584 unsigned long excess;
1585 unsigned long nr_scanned;
1586 struct mem_cgroup_reclaim_cookie reclaim = {
1587 .pgdat = pgdat,
1588 };
1589
1590 excess = soft_limit_excess(root_memcg);
1591
1592 while (1) {
1593 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1594 if (!victim) {
1595 loop++;
1596 if (loop >= 2) {
1597
1598
1599
1600
1601
1602 if (!total)
1603 break;
1604
1605
1606
1607
1608
1609
1610 if (total >= (excess >> 2) ||
1611 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1612 break;
1613 }
1614 continue;
1615 }
1616 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1617 pgdat, &nr_scanned);
1618 *total_scanned += nr_scanned;
1619 if (!soft_limit_excess(root_memcg))
1620 break;
1621 }
1622 mem_cgroup_iter_break(root_memcg, victim);
1623 return total;
1624}
1625
1626#ifdef CONFIG_LOCKDEP
1627static struct lockdep_map memcg_oom_lock_dep_map = {
1628 .name = "memcg_oom_lock",
1629};
1630#endif
1631
1632static DEFINE_SPINLOCK(memcg_oom_lock);
1633
1634
1635
1636
1637
1638static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1639{
1640 struct mem_cgroup *iter, *failed = NULL;
1641
1642 spin_lock(&memcg_oom_lock);
1643
1644 for_each_mem_cgroup_tree(iter, memcg) {
1645 if (iter->oom_lock) {
1646
1647
1648
1649
1650 failed = iter;
1651 mem_cgroup_iter_break(memcg, iter);
1652 break;
1653 } else
1654 iter->oom_lock = true;
1655 }
1656
1657 if (failed) {
1658
1659
1660
1661
1662 for_each_mem_cgroup_tree(iter, memcg) {
1663 if (iter == failed) {
1664 mem_cgroup_iter_break(memcg, iter);
1665 break;
1666 }
1667 iter->oom_lock = false;
1668 }
1669 } else
1670 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1671
1672 spin_unlock(&memcg_oom_lock);
1673
1674 return !failed;
1675}
1676
1677static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1678{
1679 struct mem_cgroup *iter;
1680
1681 spin_lock(&memcg_oom_lock);
1682 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1683 for_each_mem_cgroup_tree(iter, memcg)
1684 iter->oom_lock = false;
1685 spin_unlock(&memcg_oom_lock);
1686}
1687
1688static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1689{
1690 struct mem_cgroup *iter;
1691
1692 spin_lock(&memcg_oom_lock);
1693 for_each_mem_cgroup_tree(iter, memcg)
1694 iter->under_oom++;
1695 spin_unlock(&memcg_oom_lock);
1696}
1697
1698static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1699{
1700 struct mem_cgroup *iter;
1701
1702
1703
1704
1705
1706 spin_lock(&memcg_oom_lock);
1707 for_each_mem_cgroup_tree(iter, memcg)
1708 if (iter->under_oom > 0)
1709 iter->under_oom--;
1710 spin_unlock(&memcg_oom_lock);
1711}
1712
1713static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1714
1715struct oom_wait_info {
1716 struct mem_cgroup *memcg;
1717 wait_queue_entry_t wait;
1718};
1719
1720static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1721 unsigned mode, int sync, void *arg)
1722{
1723 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1724 struct mem_cgroup *oom_wait_memcg;
1725 struct oom_wait_info *oom_wait_info;
1726
1727 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1728 oom_wait_memcg = oom_wait_info->memcg;
1729
1730 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1731 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1732 return 0;
1733 return autoremove_wake_function(wait, mode, sync, arg);
1734}
1735
1736static void memcg_oom_recover(struct mem_cgroup *memcg)
1737{
1738
1739
1740
1741
1742
1743
1744
1745
1746 if (memcg && memcg->under_oom)
1747 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1748}
1749
1750enum oom_status {
1751 OOM_SUCCESS,
1752 OOM_FAILED,
1753 OOM_ASYNC,
1754 OOM_SKIPPED
1755};
1756
1757static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1758{
1759 enum oom_status ret;
1760 bool locked;
1761
1762 if (order > PAGE_ALLOC_COSTLY_ORDER)
1763 return OOM_SKIPPED;
1764
1765 memcg_memory_event(memcg, MEMCG_OOM);
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785 if (memcg->oom_kill_disable) {
1786 if (!current->in_user_fault)
1787 return OOM_SKIPPED;
1788 css_get(&memcg->css);
1789 current->memcg_in_oom = memcg;
1790 current->memcg_oom_gfp_mask = mask;
1791 current->memcg_oom_order = order;
1792
1793 return OOM_ASYNC;
1794 }
1795
1796 mem_cgroup_mark_under_oom(memcg);
1797
1798 locked = mem_cgroup_oom_trylock(memcg);
1799
1800 if (locked)
1801 mem_cgroup_oom_notify(memcg);
1802
1803 mem_cgroup_unmark_under_oom(memcg);
1804 if (mem_cgroup_out_of_memory(memcg, mask, order))
1805 ret = OOM_SUCCESS;
1806 else
1807 ret = OOM_FAILED;
1808
1809 if (locked)
1810 mem_cgroup_oom_unlock(memcg);
1811
1812 return ret;
1813}
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
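/*
 * mem_cgroup_oom_synchronize - finish a deferred memcg OOM at the end of
 * the page fault, used when oom_kill_disable made mem_cgroup_oom() return
 * OOM_ASYNC.  Returns true if an OOM context was handled.
 */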
1832bool mem_cgroup_oom_synchronize(bool handle)
1833{
1834 struct mem_cgroup *memcg = current->memcg_in_oom;
1835 struct oom_wait_info owait;
1836 bool locked;
1837
1838
1839 if (!memcg)
1840 return false;
1841
1842 if (!handle)
1843 goto cleanup;
1844
1845 owait.memcg = memcg;
1846 owait.wait.flags = 0;
1847 owait.wait.func = memcg_oom_wake_function;
1848 owait.wait.private = current;
1849 INIT_LIST_HEAD(&owait.wait.entry);
1850
1851 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1852 mem_cgroup_mark_under_oom(memcg);
1853
1854 locked = mem_cgroup_oom_trylock(memcg);
1855
1856 if (locked)
1857 mem_cgroup_oom_notify(memcg);
1858
1859 if (locked && !memcg->oom_kill_disable) {
1860 mem_cgroup_unmark_under_oom(memcg);
1861 finish_wait(&memcg_oom_waitq, &owait.wait);
1862 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1863 current->memcg_oom_order);
1864 } else {
1865 schedule();
1866 mem_cgroup_unmark_under_oom(memcg);
1867 finish_wait(&memcg_oom_waitq, &owait.wait);
1868 }
1869
1870 if (locked) {
1871 mem_cgroup_oom_unlock(memcg);
1872
1873
1874
1875
1876
1877 memcg_oom_recover(memcg);
1878 }
1879cleanup:
1880 current->memcg_in_oom = NULL;
1881 css_put(&memcg->css);
1882 return true;
1883}
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1896 struct mem_cgroup *oom_domain)
1897{
1898 struct mem_cgroup *oom_group = NULL;
1899 struct mem_cgroup *memcg;
1900
1901 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1902 return NULL;
1903
1904 if (!oom_domain)
1905 oom_domain = root_mem_cgroup;
1906
1907 rcu_read_lock();
1908
1909 memcg = mem_cgroup_from_task(victim);
1910 if (memcg == root_mem_cgroup)
1911 goto out;
1912
1913
1914
1915
1916
1917
1918 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1919 goto out;
1920
1921
1922
1923
1924
1925
1926 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1927 if (memcg->oom_group)
1928 oom_group = memcg;
1929
1930 if (memcg == oom_domain)
1931 break;
1932 }
1933
1934 if (oom_group)
1935 css_get(&oom_group->css);
1936out:
1937 rcu_read_unlock();
1938
1939 return oom_group;
1940}
1941
1942void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1943{
1944 pr_info("Tasks in ");
1945 pr_cont_cgroup_path(memcg->css.cgroup);
1946 pr_cont(" are going to be killed due to memory.oom.group set\n");
1947}
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
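/*
 * lock_page_memcg - pin the page's memcg binding so per-memcg page state
 * can be updated; pair with unlock_page_memcg().  Returns the memcg, or
 * NULL for uncharged pages.
 */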
1960struct mem_cgroup *lock_page_memcg(struct page *page)
1961{
1962 struct page *head = compound_head(page);
1963 struct mem_cgroup *memcg;
1964 unsigned long flags;
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977 rcu_read_lock();
1978
1979 if (mem_cgroup_disabled())
1980 return NULL;
1981again:
1982 memcg = head->mem_cgroup;
1983 if (unlikely(!memcg))
1984 return NULL;
1985
1986 if (atomic_read(&memcg->moving_account) <= 0)
1987 return memcg;
1988
1989 spin_lock_irqsave(&memcg->move_lock, flags);
1990 if (memcg != head->mem_cgroup) {
1991 spin_unlock_irqrestore(&memcg->move_lock, flags);
1992 goto again;
1993 }
1994
1995
1996
1997
1998
1999
2000 memcg->move_lock_task = current;
2001 memcg->move_lock_flags = flags;
2002
2003 return memcg;
2004}
2005EXPORT_SYMBOL(lock_page_memcg);
2006
2007
2008
2009
2010
2011
2012
2013void __unlock_page_memcg(struct mem_cgroup *memcg)
2014{
2015 if (memcg && memcg->move_lock_task == current) {
2016 unsigned long flags = memcg->move_lock_flags;
2017
2018 memcg->move_lock_task = NULL;
2019 memcg->move_lock_flags = 0;
2020
2021 spin_unlock_irqrestore(&memcg->move_lock, flags);
2022 }
2023
2024 rcu_read_unlock();
2025}
2026
2027
2028
2029
2030
2031void unlock_page_memcg(struct page *page)
2032{
2033 struct page *head = compound_head(page);
2034
2035 __unlock_page_memcg(head->mem_cgroup);
2036}
2037EXPORT_SYMBOL(unlock_page_memcg);
2038
2039struct memcg_stock_pcp {
2040 struct mem_cgroup *cached;
2041 unsigned int nr_pages;
2042 struct work_struct work;
2043 unsigned long flags;
2044#define FLUSHING_CACHED_CHARGE 0
2045};
2046static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2047static DEFINE_MUTEX(percpu_charge_mutex);
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
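/*
 * consume_stock - try to satisfy a charge of @nr_pages from the per-cpu
 * stock of pre-charged pages cached for @memcg.  Returns true on success.
 */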
2060static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2061{
2062 struct memcg_stock_pcp *stock;
2063 unsigned long flags;
2064 bool ret = false;
2065
2066 if (nr_pages > MEMCG_CHARGE_BATCH)
2067 return ret;
2068
2069 local_irq_save(flags);
2070
2071 stock = this_cpu_ptr(&memcg_stock);
2072 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2073 stock->nr_pages -= nr_pages;
2074 ret = true;
2075 }
2076
2077 local_irq_restore(flags);
2078
2079 return ret;
2080}
2081
2082
2083
2084
2085static void drain_stock(struct memcg_stock_pcp *stock)
2086{
2087 struct mem_cgroup *old = stock->cached;
2088
2089 if (stock->nr_pages) {
2090 page_counter_uncharge(&old->memory, stock->nr_pages);
2091 if (do_memsw_account())
2092 page_counter_uncharge(&old->memsw, stock->nr_pages);
2093 css_put_many(&old->css, stock->nr_pages);
2094 stock->nr_pages = 0;
2095 }
2096 stock->cached = NULL;
2097}
2098
2099static void drain_local_stock(struct work_struct *dummy)
2100{
2101 struct memcg_stock_pcp *stock;
2102 unsigned long flags;
2103
2104
2105
2106
2107
2108 local_irq_save(flags);
2109
2110 stock = this_cpu_ptr(&memcg_stock);
2111 drain_stock(stock);
2112 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2113
2114 local_irq_restore(flags);
2115}
2116
2117
2118
2119
2120
2121static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2122{
2123 struct memcg_stock_pcp *stock;
2124 unsigned long flags;
2125
2126 local_irq_save(flags);
2127
2128 stock = this_cpu_ptr(&memcg_stock);
2129 if (stock->cached != memcg) {
2130 drain_stock(stock);
2131 stock->cached = memcg;
2132 }
2133 stock->nr_pages += nr_pages;
2134
2135 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2136 drain_stock(stock);
2137
2138 local_irq_restore(flags);
2139}
2140
2141
2142
2143
2144
2145static void drain_all_stock(struct mem_cgroup *root_memcg)
2146{
2147 int cpu, curcpu;
2148
2149
2150 if (!mutex_trylock(&percpu_charge_mutex))
2151 return;
2152
2153
2154
2155
2156
2157
2158 curcpu = get_cpu();
2159 for_each_online_cpu(cpu) {
2160 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2161 struct mem_cgroup *memcg;
2162 bool flush = false;
2163
2164 rcu_read_lock();
2165 memcg = stock->cached;
2166 if (memcg && stock->nr_pages &&
2167 mem_cgroup_is_descendant(memcg, root_memcg))
2168 flush = true;
2169 rcu_read_unlock();
2170
2171 if (flush &&
2172 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2173 if (cpu == curcpu)
2174 drain_local_stock(&stock->work);
2175 else
2176 schedule_work_on(cpu, &stock->work);
2177 }
2178 }
2179 put_cpu();
2180 mutex_unlock(&percpu_charge_mutex);
2181}
2182
2183static int memcg_hotplug_cpu_dead(unsigned int cpu)
2184{
2185 struct memcg_stock_pcp *stock;
2186 struct mem_cgroup *memcg, *mi;
2187
2188 stock = &per_cpu(memcg_stock, cpu);
2189 drain_stock(stock);
2190
2191 for_each_mem_cgroup(memcg) {
2192 int i;
2193
2194 for (i = 0; i < MEMCG_NR_STAT; i++) {
2195 int nid;
2196 long x;
2197
2198 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2199 if (x)
2200 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2201 atomic_long_add(x, &memcg->vmstats[i]);
2202
2203 if (i >= NR_VM_NODE_STAT_ITEMS)
2204 continue;
2205
2206 for_each_node(nid) {
2207 struct mem_cgroup_per_node *pn;
2208
2209 pn = mem_cgroup_nodeinfo(memcg, nid);
2210 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2211 if (x)
2212 do {
2213 atomic_long_add(x, &pn->lruvec_stat[i]);
2214 } while ((pn = parent_nodeinfo(pn, nid)));
2215 }
2216 }
2217
2218 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2219 long x;
2220
2221 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2222 if (x)
2223 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2224 atomic_long_add(x, &memcg->vmevents[i]);
2225 }
2226 }
2227
2228 return 0;
2229}
2230
2231static void reclaim_high(struct mem_cgroup *memcg,
2232 unsigned int nr_pages,
2233 gfp_t gfp_mask)
2234{
2235 do {
2236 if (page_counter_read(&memcg->memory) <=
2237 READ_ONCE(memcg->memory.high))
2238 continue;
2239 memcg_memory_event(memcg, MEMCG_HIGH);
2240 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2241 } while ((memcg = parent_mem_cgroup(memcg)) &&
2242 !mem_cgroup_is_root(memcg));
2243}
2244
2245static void high_work_func(struct work_struct *work)
2246{
2247 struct mem_cgroup *memcg;
2248
2249 memcg = container_of(work, struct mem_cgroup, high_work);
2250 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2251}
2252
2253
2254
2255
2256
2257
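/* Upper bound on the throttling delay applied per return to userspace. */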
2258#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2259

/*
 * When a memcg is pushed over its memory.high boundary, the offending
 * task is throttled on its next return to userspace.  The penalty grows
 * roughly quadratically with the relative overage:
 *
 *   overage = ((usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high
 *   penalty = overage * overage * HZ
 *             >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT)
 *
 * The result is scaled by the size of the charge (nr_pages relative to
 * MEMCG_CHARGE_BATCH) and clamped to MEMCG_MAX_HIGH_DELAY_JIFFIES, so a
 * runaway workload is slowed down sharply without ever being put to sleep
 * for an unreasonably long time.
 */
2303 #define MEMCG_DELAY_PRECISION_SHIFT 20
2304 #define MEMCG_DELAY_SCALING_SHIFT 14
2305
2306static u64 calculate_overage(unsigned long usage, unsigned long high)
2307{
2308 u64 overage;
2309
2310 if (usage <= high)
2311 return 0;
2312
2313
2314
2315
2316
2317 high = max(high, 1UL);
2318
2319 overage = usage - high;
2320 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2321 return div64_u64(overage, high);
2322}
2323
2324static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2325{
2326 u64 overage, max_overage = 0;
2327
2328 do {
2329 overage = calculate_overage(page_counter_read(&memcg->memory),
2330 READ_ONCE(memcg->memory.high));
2331 max_overage = max(overage, max_overage);
2332 } while ((memcg = parent_mem_cgroup(memcg)) &&
2333 !mem_cgroup_is_root(memcg));
2334
2335 return max_overage;
2336}
2337
2338static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2339{
2340 u64 overage, max_overage = 0;
2341
2342 do {
2343 overage = calculate_overage(page_counter_read(&memcg->swap),
2344 READ_ONCE(memcg->swap.high));
2345 if (overage)
2346 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2347 max_overage = max(overage, max_overage);
2348 } while ((memcg = parent_mem_cgroup(memcg)) &&
2349 !mem_cgroup_is_root(memcg));
2350
2351 return max_overage;
2352}
2353
2354
2355
2356
2357
2358static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2359 unsigned int nr_pages,
2360 u64 max_overage)
2361{
2362 unsigned long penalty_jiffies;
2363
2364 if (!max_overage)
2365 return 0;
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375 penalty_jiffies = max_overage * max_overage * HZ;
2376 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2377 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2388}
2389
2390
2391
2392
2393
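/*
 * Reclaim memory over the high limit on return to userspace and, if the
 * overage is severe, throttle the task for a while.
 */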
2394void mem_cgroup_handle_over_high(void)
2395{
2396 unsigned long penalty_jiffies;
2397 unsigned long pflags;
2398 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2399 struct mem_cgroup *memcg;
2400
2401 if (likely(!nr_pages))
2402 return;
2403
2404 memcg = get_mem_cgroup_from_mm(current->mm);
2405 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2406 current->memcg_nr_pages_over_high = 0;
2407
2408
2409
2410
2411
2412 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2413 mem_find_max_overage(memcg));
2414
2415 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2416 swap_find_max_overage(memcg));
2417
2418
2419
2420
2421
2422
2423 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2424
2425
2426
2427
2428
2429
2430
2431 if (penalty_jiffies <= HZ / 100)
2432 goto out;
2433
2434
2435
2436
2437
2438
2439 psi_memstall_enter(&pflags);
2440 schedule_timeout_killable(penalty_jiffies);
2441 psi_memstall_leave(&pflags);
2442
2443out:
2444 css_put(&memcg->css);
2445}
2446
2447static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2448 unsigned int nr_pages)
2449{
2450 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2451 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2452 struct mem_cgroup *mem_over_limit;
2453 struct page_counter *counter;
2454 unsigned long nr_reclaimed;
2455 bool may_swap = true;
2456 bool drained = false;
2457 enum oom_status oom_status;
2458
2459 if (mem_cgroup_is_root(memcg))
2460 return 0;
2461retry:
2462 if (consume_stock(memcg, nr_pages))
2463 return 0;
2464
2465 if (!do_memsw_account() ||
2466 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2467 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2468 goto done_restock;
2469 if (do_memsw_account())
2470 page_counter_uncharge(&memcg->memsw, batch);
2471 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2472 } else {
2473 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2474 may_swap = false;
2475 }
2476
2477 if (batch > nr_pages) {
2478 batch = nr_pages;
2479 goto retry;
2480 }
2481
2482
2483
2484
2485
2486
2487
2488 if (gfp_mask & __GFP_ATOMIC)
2489 goto force;
2490
2491
2492
2493
2494
2495
2496
2497 if (unlikely(should_force_charge()))
2498 goto force;
2499
2500
2501
2502
2503
2504
2505
2506 if (unlikely(current->flags & PF_MEMALLOC))
2507 goto force;
2508
2509 if (unlikely(task_in_memcg_oom(current)))
2510 goto nomem;
2511
2512 if (!gfpflags_allow_blocking(gfp_mask))
2513 goto nomem;
2514
2515 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2516
2517 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2518 gfp_mask, may_swap);
2519
2520 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2521 goto retry;
2522
2523 if (!drained) {
2524 drain_all_stock(mem_over_limit);
2525 drained = true;
2526 goto retry;
2527 }
2528
2529 if (gfp_mask & __GFP_NORETRY)
2530 goto nomem;
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2541 goto retry;
2542
2543
2544
2545
2546 if (mem_cgroup_wait_acct_move(mem_over_limit))
2547 goto retry;
2548
2549 if (nr_retries--)
2550 goto retry;
2551
2552 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2553 goto nomem;
2554
2555 if (gfp_mask & __GFP_NOFAIL)
2556 goto force;
2557
2558 if (fatal_signal_pending(current))
2559 goto force;
2560
2561
2562
2563
2564
2565
2566 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2567 get_order(nr_pages * PAGE_SIZE));
2568 switch (oom_status) {
2569 case OOM_SUCCESS:
2570 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2571 goto retry;
2572 case OOM_FAILED:
2573 goto force;
2574 default:
2575 goto nomem;
2576 }
2577nomem:
2578 if (!(gfp_mask & __GFP_NOFAIL))
2579 return -ENOMEM;
2580force:
2581
2582
2583
2584
2585
2586 page_counter_charge(&memcg->memory, nr_pages);
2587 if (do_memsw_account())
2588 page_counter_charge(&memcg->memsw, nr_pages);
2589 css_get_many(&memcg->css, nr_pages);
2590
2591 return 0;
2592
2593done_restock:
2594 css_get_many(&memcg->css, batch);
2595 if (batch > nr_pages)
2596 refill_stock(memcg, batch - nr_pages);
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607 do {
2608 bool mem_high, swap_high;
2609
2610 mem_high = page_counter_read(&memcg->memory) >
2611 READ_ONCE(memcg->memory.high);
2612 swap_high = page_counter_read(&memcg->swap) >
2613 READ_ONCE(memcg->swap.high);
2614
2615
2616 if (in_interrupt()) {
2617 if (mem_high) {
2618 schedule_work(&memcg->high_work);
2619 break;
2620 }
2621 continue;
2622 }
2623
2624 if (mem_high || swap_high) {
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634 current->memcg_nr_pages_over_high += batch;
2635 set_notify_resume(current);
2636 break;
2637 }
2638 } while ((memcg = parent_mem_cgroup(memcg)));
2639
2640 return 0;
2641}
2642
2643#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2644static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2645{
2646 if (mem_cgroup_is_root(memcg))
2647 return;
2648
2649 page_counter_uncharge(&memcg->memory, nr_pages);
2650 if (do_memsw_account())
2651 page_counter_uncharge(&memcg->memsw, nr_pages);
2652
2653 css_put_many(&memcg->css, nr_pages);
2654}
2655#endif
2656
2657static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2658{
2659 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2660
2661
2662
2663
2664
2665
2666
2667
2668 page->mem_cgroup = memcg;
2669}
2670
2671#ifdef CONFIG_MEMCG_KMEM
2672
2673
2674
2675
2676
2677
2678struct mem_cgroup *mem_cgroup_from_obj(void *p)
2679{
2680 struct page *page;
2681
2682 if (mem_cgroup_disabled())
2683 return NULL;
2684
2685 page = virt_to_head_page(p);
2686
2687
2688
2689
2690
2691
2692 if (PageSlab(page))
2693 return memcg_from_slab_page(page);
2694
2695
2696 return page->mem_cgroup;
2697}
2698
2699static int memcg_alloc_cache_id(void)
2700{
2701 int id, size;
2702 int err;
2703
2704 id = ida_simple_get(&memcg_cache_ida,
2705 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2706 if (id < 0)
2707 return id;
2708
2709 if (id < memcg_nr_cache_ids)
2710 return id;
2711
2712
2713
2714
2715
2716 down_write(&memcg_cache_ids_sem);
2717
2718 size = 2 * (id + 1);
2719 if (size < MEMCG_CACHES_MIN_SIZE)
2720 size = MEMCG_CACHES_MIN_SIZE;
2721 else if (size > MEMCG_CACHES_MAX_SIZE)
2722 size = MEMCG_CACHES_MAX_SIZE;
2723
2724 err = memcg_update_all_caches(size);
2725 if (!err)
2726 err = memcg_update_all_list_lrus(size);
2727 if (!err)
2728 memcg_nr_cache_ids = size;
2729
2730 up_write(&memcg_cache_ids_sem);
2731
2732 if (err) {
2733 ida_simple_remove(&memcg_cache_ida, id);
2734 return err;
2735 }
2736 return id;
2737}
2738
2739static void memcg_free_cache_id(int id)
2740{
2741 ida_simple_remove(&memcg_cache_ida, id);
2742}
2743
2744struct memcg_kmem_cache_create_work {
2745 struct mem_cgroup *memcg;
2746 struct kmem_cache *cachep;
2747 struct work_struct work;
2748};
2749
2750static void memcg_kmem_cache_create_func(struct work_struct *w)
2751{
2752 struct memcg_kmem_cache_create_work *cw =
2753 container_of(w, struct memcg_kmem_cache_create_work, work);
2754 struct mem_cgroup *memcg = cw->memcg;
2755 struct kmem_cache *cachep = cw->cachep;
2756
2757 memcg_create_kmem_cache(memcg, cachep);
2758
2759 css_put(&memcg->css);
2760 kfree(cw);
2761}
2762
2763
2764
2765
2766static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2767 struct kmem_cache *cachep)
2768{
2769 struct memcg_kmem_cache_create_work *cw;
2770
2771 if (!css_tryget_online(&memcg->css))
2772 return;
2773
2774 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2775 if (!cw) {
2776 css_put(&memcg->css);
2777 return;
2778 }
2779
2780 cw->memcg = memcg;
2781 cw->cachep = cachep;
2782 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2783
2784 queue_work(memcg_kmem_cache_wq, &cw->work);
2785}
2786
2787static inline bool memcg_kmem_bypass(void)
2788{
2789 if (in_interrupt())
2790 return true;
2791
	/* Allow remote memcg charging in kthread contexts. */
2793 if ((!current->mm || (current->flags & PF_KTHREAD)) &&
2794 !current->active_memcg)
2795 return true;
2796 return false;
2797}
2798
/**
 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache
 *
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet and we are the first user of it, we
 * create it asynchronously in a workqueue and let the current allocation
 * go through with the original cache.
 *
 * This function takes a reference to the cache it returns to assure it
 * won't get destroyed while we are working with it.  Once the caller is
 * done with it, memcg_kmem_put_cache() must be called to release the
 * reference.
 */
2815struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2816{
2817 struct mem_cgroup *memcg;
2818 struct kmem_cache *memcg_cachep;
2819 struct memcg_cache_array *arr;
2820 int kmemcg_id;
2821
2822 VM_BUG_ON(!is_root_cache(cachep));
2823
2824 if (memcg_kmem_bypass())
2825 return cachep;
2826
2827 rcu_read_lock();
2828
2829 if (unlikely(current->active_memcg))
2830 memcg = current->active_memcg;
2831 else
2832 memcg = mem_cgroup_from_task(current);
2833
2834 if (!memcg || memcg == root_mem_cgroup)
2835 goto out_unlock;
2836
2837 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2838 if (kmemcg_id < 0)
2839 goto out_unlock;
2840
2841 arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2842
2843
2844
2845
2846
2847
2848 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2849
	/*
	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.  This
	 * would guarantee that the allocation being performed already
	 * belongs in the new cache.
	 *
	 * However, there are some clashes that can arrive from locking:
	 * memcg_create_kmem_cache() takes the slab_mutex, so no further
	 * slab allocation could happen with that mutex held.  It is much
	 * simpler to defer the creation to a workqueue and let this
	 * allocation go through with the original (root) cache.
	 */
2869 if (unlikely(!memcg_cachep))
2870 memcg_schedule_kmem_cache_create(memcg, cachep);
2871 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2872 cachep = memcg_cachep;
2873out_unlock:
2874 rcu_read_unlock();
2875 return cachep;
2876}
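/*
 * Editorial example, not part of the original file: how a slab allocation
 * site could pair memcg_kmem_get_cache() with memcg_kmem_put_cache().  The
 * real callers live in the slab allocators; example_accounted_alloc() below
 * is a hypothetical, simplified rendering, guarded out of the build.
 */
#if 0
static void *example_accounted_alloc(struct kmem_cache *root_cachep, gfp_t gfp)
{
	struct kmem_cache *s;
	void *obj;

	s = memcg_kmem_get_cache(root_cachep);	/* may return root_cachep itself */
	obj = kmem_cache_alloc(s, gfp);
	memcg_kmem_put_cache(s);		/* drop the reference taken above */
	return obj;
}
#endif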
2877
/**
 * memcg_kmem_put_cache: drop the reference taken by memcg_kmem_get_cache
 * @cachep: the cache returned by memcg_kmem_get_cache
 */
2882void memcg_kmem_put_cache(struct kmem_cache *cachep)
2883{
2884 if (!is_root_cache(cachep))
2885 percpu_ref_put(&cachep->memcg_params.refcnt);
2886}
2887
/**
 * __memcg_kmem_charge: charge a number of kernel pages to a memcg
 * @memcg: memory cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */
2896int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
2897 unsigned int nr_pages)
2898{
2899 struct page_counter *counter;
2900 int ret;
2901
2902 ret = try_charge(memcg, gfp, nr_pages);
2903 if (ret)
2904 return ret;
2905
2906 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2907 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2908
		/*
		 * Enforce __GFP_NOFAIL allocation because callers are not
		 * prepared to see failures and likely do not have any failure
		 * handling code.
		 */
2914 if (gfp & __GFP_NOFAIL) {
2915 page_counter_charge(&memcg->kmem, nr_pages);
2916 return 0;
2917 }
2918 cancel_charge(memcg, nr_pages);
2919 return -ENOMEM;
2920 }
2921 return 0;
2922}
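/*
 * Editorial example, not part of the original file: charges made with
 * __memcg_kmem_charge() must be undone with __memcg_kmem_uncharge() against
 * the same memcg and page count.  example_charge_buffer() is a hypothetical
 * sketch, guarded out of the build.
 */
#if 0
static int example_charge_buffer(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	int ret = __memcg_kmem_charge(memcg, GFP_KERNEL, nr_pages);

	if (ret)
		return ret;	/* over the limit and reclaim did not help */
	/* ... use the accounted memory ... */
	__memcg_kmem_uncharge(memcg, nr_pages);
	return 0;
}
#endif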
2923
/**
 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
2929void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
2930{
2931 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2932 page_counter_uncharge(&memcg->kmem, nr_pages);
2933
2934 page_counter_uncharge(&memcg->memory, nr_pages);
2935 if (do_memsw_account())
2936 page_counter_uncharge(&memcg->memsw, nr_pages);
2937}
2938
/**
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
2947int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2948{
2949 struct mem_cgroup *memcg;
2950 int ret = 0;
2951
2952 if (memcg_kmem_bypass())
2953 return 0;
2954
2955 memcg = get_mem_cgroup_from_current();
2956 if (!mem_cgroup_is_root(memcg)) {
2957 ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
2958 if (!ret) {
2959 page->mem_cgroup = memcg;
2960 __SetPageKmemcg(page);
2961 }
2962 }
2963 css_put(&memcg->css);
2964 return ret;
2965}
2966
/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
2972void __memcg_kmem_uncharge_page(struct page *page, int order)
2973{
2974 struct mem_cgroup *memcg = page->mem_cgroup;
2975 unsigned int nr_pages = 1 << order;
2976
2977 if (!memcg)
2978 return;
2979
2980 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2981 __memcg_kmem_uncharge(memcg, nr_pages);
2982 page->mem_cgroup = NULL;
2983
	/* slab pages do not have the PageKmemcg flag set */
2985 if (PageKmemcg(page))
2986 __ClearPageKmemcg(page);
2987
2988 css_put_many(&memcg->css, nr_pages);
2989}
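/*
 * Editorial example, not part of the original file: the page-based variants
 * above are normally driven by the page allocator for __GFP_ACCOUNT
 * allocations, but the pairing they expect looks like the hypothetical
 * sketch below (guarded out of the build).
 */
#if 0
static struct page *example_accounted_pages(gfp_t gfp, int order)
{
	struct page *page = alloc_pages(gfp, order);

	if (page && __memcg_kmem_charge_page(page, gfp, order)) {
		__free_pages(page, order);
		return NULL;
	}
	/* the owner later calls __memcg_kmem_uncharge_page(page, order) */
	return page;
}
#endif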
2990#endif
2991
2992#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2993
/*
 * Because the tail pages are not marked as "used", set page->mem_cgroup on
 * them.  We're under pgdat->lru_lock and migration entries are set up in all
 * page mappings.
 */
2998void mem_cgroup_split_huge_fixup(struct page *head)
2999{
3000 int i;
3001
3002 if (mem_cgroup_disabled())
3003 return;
3004
3005 for (i = 1; i < HPAGE_PMD_NR; i++)
3006 head[i].mem_cgroup = head->mem_cgroup;
3007}
3008#endif
3009
3010#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge()
 * about both res and memsw, and called css_get().
 */
3025static int mem_cgroup_move_swap_account(swp_entry_t entry,
3026 struct mem_cgroup *from, struct mem_cgroup *to)
3027{
3028 unsigned short old_id, new_id;
3029
3030 old_id = mem_cgroup_id(from);
3031 new_id = mem_cgroup_id(to);
3032
3033 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3034 mod_memcg_state(from, MEMCG_SWAP, -1);
3035 mod_memcg_state(to, MEMCG_SWAP, 1);
3036 return 0;
3037 }
3038 return -EINVAL;
3039}
3040#else
3041static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3042 struct mem_cgroup *from, struct mem_cgroup *to)
3043{
3044 return -EINVAL;
3045}
3046#endif
3047
3048static DEFINE_MUTEX(memcg_max_mutex);
3049
3050static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3051 unsigned long max, bool memsw)
3052{
3053 bool enlarge = false;
3054 bool drained = false;
3055 int ret;
3056 bool limits_invariant;
3057 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3058
3059 do {
3060 if (signal_pending(current)) {
3061 ret = -EINTR;
3062 break;
3063 }
3064
3065 mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
3070 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3071 max <= memcg->memsw.max;
3072 if (!limits_invariant) {
3073 mutex_unlock(&memcg_max_mutex);
3074 ret = -EINVAL;
3075 break;
3076 }
3077 if (max > counter->max)
3078 enlarge = true;
3079 ret = page_counter_set_max(counter, max);
3080 mutex_unlock(&memcg_max_mutex);
3081
3082 if (!ret)
3083 break;
3084
3085 if (!drained) {
3086 drain_all_stock(memcg);
3087 drained = true;
3088 continue;
3089 }
3090
3091 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3092 GFP_KERNEL, !memsw)) {
3093 ret = -EBUSY;
3094 break;
3095 }
3096 } while (true);
3097
3098 if (!ret && enlarge)
3099 memcg_oom_recover(memcg);
3100
3101 return ret;
3102}
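/*
 * Editorial note, not part of the original file: this is the backend for
 * writes to the cgroup v1 files memory.limit_in_bytes (memsw == false) and
 * memory.memsw.limit_in_bytes (memsw == true); see mem_cgroup_write() below.
 * Shrinking the limit below current usage loops over drain_all_stock() and
 * targeted reclaim until the new limit can be installed, and gives up with
 * -EBUSY if no further progress can be made.
 */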
3103
3104unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3105 gfp_t gfp_mask,
3106 unsigned long *total_scanned)
3107{
3108 unsigned long nr_reclaimed = 0;
3109 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3110 unsigned long reclaimed;
3111 int loop = 0;
3112 struct mem_cgroup_tree_per_node *mctz;
3113 unsigned long excess;
3114 unsigned long nr_scanned;
3115
3116 if (order > 0)
3117 return 0;
3118
3119 mctz = soft_limit_tree_node(pgdat->node_id);
3120
	/*
	 * Do not even bother to check the largest node if the root
	 * is empty.  Do it lockless to prevent lock bouncing.  Races
	 * are acceptable as soft limit is best effort anyway.
	 */
3126 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3127 return 0;
3128
	/*
	 * This loop can run a while, specially if mem_cgroups continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure.
	 */
3134 do {
3135 if (next_mz)
3136 mz = next_mz;
3137 else
3138 mz = mem_cgroup_largest_soft_limit_node(mctz);
3139 if (!mz)
3140 break;
3141
3142 nr_scanned = 0;
3143 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3144 gfp_mask, &nr_scanned);
3145 nr_reclaimed += reclaimed;
3146 *total_scanned += nr_scanned;
3147 spin_lock_irq(&mctz->lock);
3148 __mem_cgroup_remove_exceeded(mz, mctz);
3149
3150
3151
3152
3153
3154 next_mz = NULL;
3155 if (!reclaimed)
3156 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3157
3158 excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add back the
		 * node to the tree if reclaim returns 0.  But our reclaim
		 * could return 0 simply because, due to priority, we are
		 * exposing a smaller subset of memory to reclaim from.
		 * Consider this as a longer-term TODO.
		 *
		 * If excess == 0, no tree ops.
		 */
3168 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3169 spin_unlock_irq(&mctz->lock);
3170 css_put(&mz->memcg->css);
3171 loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
3177 if (!nr_reclaimed &&
3178 (next_mz == NULL ||
3179 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3180 break;
3181 } while (!nr_reclaimed);
3182 if (next_mz)
3183 css_put(&next_mz->memcg->css);
3184 return nr_reclaimed;
3185}
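/*
 * Editorial note, not part of the original file: soft limit reclaim is
 * driven from global reclaim in mm/vmscan.c (kswapd and direct reclaim),
 * for order-0 requests only, and walks the per-node rb-tree of the cgroups
 * that exceed their memory.soft_limit_in_bytes by the largest amount.
 */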
3186
/*
 * Test whether @memcg has children, dead or alive.  Note that this
 * function doesn't care whether @memcg has use_hierarchy enabled and
 * returns %true if there are child csses according to the cgroup
 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
 */
3193static inline bool memcg_has_children(struct mem_cgroup *memcg)
3194{
3195 bool ret;
3196
3197 rcu_read_lock();
3198 ret = css_next_child(NULL, &memcg->css);
3199 rcu_read_unlock();
3200 return ret;
3201}
3202
/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * The caller is responsible for holding a css reference on @memcg.
 */
3208static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3209{
3210 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3211
3212
3213 lru_add_drain_all();
3214
3215 drain_all_stock(memcg);
3216
3217
3218 while (nr_retries && page_counter_read(&memcg->memory)) {
3219 int progress;
3220
3221 if (signal_pending(current))
3222 return -EINTR;
3223
3224 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3225 GFP_KERNEL, true);
3226 if (!progress) {
3227 nr_retries--;
3228
3229 congestion_wait(BLK_RW_ASYNC, HZ/10);
3230 }
3231
3232 }
3233
3234 return 0;
3235}
3236
3237static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3238 char *buf, size_t nbytes,
3239 loff_t off)
3240{
3241 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3242
3243 if (mem_cgroup_is_root(memcg))
3244 return -EINVAL;
3245 return mem_cgroup_force_empty(memcg) ?: nbytes;
3246}
3247
3248static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3249 struct cftype *cft)
3250{
3251 return mem_cgroup_from_css(css)->use_hierarchy;
3252}
3253
3254static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3255 struct cftype *cft, u64 val)
3256{
3257 int retval = 0;
3258 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3259 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3260
3261 if (memcg->use_hierarchy == val)
3262 return 0;
3263
	/*
	 * If the parent's use_hierarchy is set, we can't make any
	 * modifications in the child subtrees.  If it is unset, then the
	 * change can occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_memcg is NULL, so we allow the value
	 * to be set if there are no children.
	 */
3272 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3273 (val == 1 || val == 0)) {
3274 if (!memcg_has_children(memcg))
3275 memcg->use_hierarchy = val;
3276 else
3277 retval = -EBUSY;
3278 } else
3279 retval = -EINVAL;
3280
3281 return retval;
3282}
3283
3284static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3285{
3286 unsigned long val;
3287
3288 if (mem_cgroup_is_root(memcg)) {
3289 val = memcg_page_state(memcg, NR_FILE_PAGES) +
3290 memcg_page_state(memcg, NR_ANON_MAPPED);
3291 if (swap)
3292 val += memcg_page_state(memcg, MEMCG_SWAP);
3293 } else {
3294 if (!swap)
3295 val = page_counter_read(&memcg->memory);
3296 else
3297 val = page_counter_read(&memcg->memsw);
3298 }
3299 return val;
3300}
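/*
 * Editorial note, not part of the original file: mem_cgroup_usage() returns
 * a page count; mem_cgroup_read_u64() below multiplies by PAGE_SIZE before
 * reporting it through usage_in_bytes.  The root cgroup carries no
 * page_counter charges of its own, so its usage is reconstructed from the
 * vmstat counters instead.
 */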
3301
3302enum {
3303 RES_USAGE,
3304 RES_LIMIT,
3305 RES_MAX_USAGE,
3306 RES_FAILCNT,
3307 RES_SOFT_LIMIT,
3308};
3309
3310static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3311 struct cftype *cft)
3312{
3313 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3314 struct page_counter *counter;
3315
3316 switch (MEMFILE_TYPE(cft->private)) {
3317 case _MEM:
3318 counter = &memcg->memory;
3319 break;
3320 case _MEMSWAP:
3321 counter = &memcg->memsw;
3322 break;
3323 case _KMEM:
3324 counter = &memcg->kmem;
3325 break;
3326 case _TCP:
3327 counter = &memcg->tcpmem;
3328 break;
3329 default:
3330 BUG();
3331 }
3332
3333 switch (MEMFILE_ATTR(cft->private)) {
3334 case RES_USAGE:
3335 if (counter == &memcg->memory)
3336 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3337 if (counter == &memcg->memsw)
3338 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3339 return (u64)page_counter_read(counter) * PAGE_SIZE;
3340 case RES_LIMIT:
3341 return (u64)counter->max * PAGE_SIZE;
3342 case RES_MAX_USAGE:
3343 return (u64)counter->watermark * PAGE_SIZE;
3344 case RES_FAILCNT:
3345 return counter->failcnt;
3346 case RES_SOFT_LIMIT:
3347 return (u64)memcg->soft_limit * PAGE_SIZE;
3348 default:
3349 BUG();
3350 }
3351}
3352
3353static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3354{
3355 unsigned long stat[MEMCG_NR_STAT] = {0};
3356 struct mem_cgroup *mi;
3357 int node, cpu, i;
3358
3359 for_each_online_cpu(cpu)
3360 for (i = 0; i < MEMCG_NR_STAT; i++)
3361 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3362
3363 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3364 for (i = 0; i < MEMCG_NR_STAT; i++)
3365 atomic_long_add(stat[i], &mi->vmstats[i]);
3366
3367 for_each_node(node) {
3368 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3369 struct mem_cgroup_per_node *pi;
3370
3371 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3372 stat[i] = 0;
3373
3374 for_each_online_cpu(cpu)
3375 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3376 stat[i] += per_cpu(
3377 pn->lruvec_stat_cpu->count[i], cpu);
3378
3379 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3380 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3381 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3382 }
3383}
3384
3385static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3386{
3387 unsigned long events[NR_VM_EVENT_ITEMS];
3388 struct mem_cgroup *mi;
3389 int cpu, i;
3390
3391 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3392 events[i] = 0;
3393
3394 for_each_online_cpu(cpu)
3395 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3396 events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3397 cpu);
3398
3399 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3400 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3401 atomic_long_add(events[i], &mi->vmevents[i]);
3402}
3403
3404#ifdef CONFIG_MEMCG_KMEM
3405static int memcg_online_kmem(struct mem_cgroup *memcg)
3406{
3407 int memcg_id;
3408
3409 if (cgroup_memory_nokmem)
3410 return 0;
3411
3412 BUG_ON(memcg->kmemcg_id >= 0);
3413 BUG_ON(memcg->kmem_state);
3414
3415 memcg_id = memcg_alloc_cache_id();
3416 if (memcg_id < 0)
3417 return memcg_id;
3418
3419 static_branch_inc(&memcg_kmem_enabled_key);
	/*
	 * A memory cgroup is considered kmem-online as soon as it gets
	 * kmemcg_id.  Setting the id after enabling static branching will
	 * guarantee no one starts accounting before all call sites are
	 * patched.
	 */
3426 memcg->kmemcg_id = memcg_id;
3427 memcg->kmem_state = KMEM_ONLINE;
3428 INIT_LIST_HEAD(&memcg->kmem_caches);
3429
3430 return 0;
3431}
3432
3433static void memcg_offline_kmem(struct mem_cgroup *memcg)
3434{
3435 struct cgroup_subsys_state *css;
3436 struct mem_cgroup *parent, *child;
3437 int kmemcg_id;
3438
3439 if (memcg->kmem_state != KMEM_ONLINE)
3440 return;
3441
	/*
	 * Clear the online state before deactivating the kmem caches, so
	 * that no new per-memcg cache will be created for this cgroup
	 * after we are done (see memcg_create_kmem_cache()).
	 */
3447 memcg->kmem_state = KMEM_ALLOCATED;
3448
3449 parent = parent_mem_cgroup(memcg);
3450 if (!parent)
3451 parent = root_mem_cgroup;
3452
3453
3454
3455
3456 memcg_deactivate_kmem_caches(memcg, parent);
3457
3458 kmemcg_id = memcg->kmemcg_id;
3459 BUG_ON(kmemcg_id < 0);
3460
	/*
	 * Change kmemcg_id of this cgroup and all its descendants to the
	 * parent's id, and then move all entries from this cgroup's list_lrus
	 * to the parent's ones.  After we have finished, all list_lrus
	 * corresponding to this cgroup are guaranteed to remain empty.  The
	 * ordering is imposed by list_lru_node->lock taken by
	 * memcg_drain_all_list_lrus().
	 */
3469 rcu_read_lock();
3470 css_for_each_descendant_pre(css, &memcg->css) {
3471 child = mem_cgroup_from_css(css);
3472 BUG_ON(child->kmemcg_id != kmemcg_id);
3473 child->kmemcg_id = parent->kmemcg_id;
3474 if (!memcg->use_hierarchy)
3475 break;
3476 }
3477 rcu_read_unlock();
3478
3479 memcg_drain_all_list_lrus(kmemcg_id, parent);
3480
3481 memcg_free_cache_id(kmemcg_id);
3482}
3483
3484static void memcg_free_kmem(struct mem_cgroup *memcg)
3485{
3486
3487 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3488 memcg_offline_kmem(memcg);
3489
3490 if (memcg->kmem_state == KMEM_ALLOCATED) {
3491 WARN_ON(!list_empty(&memcg->kmem_caches));
3492 static_branch_dec(&memcg_kmem_enabled_key);
3493 }
3494}
3495#else
3496static int memcg_online_kmem(struct mem_cgroup *memcg)
3497{
3498 return 0;
3499}
3500static void memcg_offline_kmem(struct mem_cgroup *memcg)
3501{
3502}
3503static void memcg_free_kmem(struct mem_cgroup *memcg)
3504{
3505}
3506#endif
3507
3508static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3509 unsigned long max)
3510{
3511 int ret;
3512
3513 mutex_lock(&memcg_max_mutex);
3514 ret = page_counter_set_max(&memcg->kmem, max);
3515 mutex_unlock(&memcg_max_mutex);
3516 return ret;
3517}
3518
3519static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3520{
3521 int ret;
3522
3523 mutex_lock(&memcg_max_mutex);
3524
3525 ret = page_counter_set_max(&memcg->tcpmem, max);
3526 if (ret)
3527 goto out;
3528
3529 if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update.  This is what guarantees that the socket activation
		 * function is the last one to run (see mem_cgroup_sk_alloc()),
		 * and that no socket is marked as belonging to this memcg
		 * until that flag is up.
		 *
		 * We need to do this because static keys are patched at
		 * multiple sites whose order we can't control.  If we marked
		 * a socket as accounted before the accounting functions were
		 * patched in, we would lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value changes, the code that processes
		 * it is not patched in yet.
		 */
3546 static_branch_inc(&memcg_sockets_enabled_key);
3547 memcg->tcpmem_active = true;
3548 }
3549out:
3550 mutex_unlock(&memcg_max_mutex);
3551 return ret;
3552}
3553
3554
3555
3556
3557
3558static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3559 char *buf, size_t nbytes, loff_t off)
3560{
3561 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3562 unsigned long nr_pages;
3563 int ret;
3564
3565 buf = strstrip(buf);
3566 ret = page_counter_memparse(buf, "-1", &nr_pages);
3567 if (ret)
3568 return ret;
3569
3570 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3571 case RES_LIMIT:
3572 if (mem_cgroup_is_root(memcg)) {
3573 ret = -EINVAL;
3574 break;
3575 }
3576 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3577 case _MEM:
3578 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3579 break;
3580 case _MEMSWAP:
3581 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3582 break;
3583 case _KMEM:
3584 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3585 "Please report your usecase to linux-mm@kvack.org if you "
3586 "depend on this functionality.\n");
3587 ret = memcg_update_kmem_max(memcg, nr_pages);
3588 break;
3589 case _TCP:
3590 ret = memcg_update_tcp_max(memcg, nr_pages);
3591 break;
3592 }
3593 break;
3594 case RES_SOFT_LIMIT:
3595 memcg->soft_limit = nr_pages;
3596 ret = 0;
3597 break;
3598 }
3599 return ret ?: nbytes;
3600}
3601
3602static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3603 size_t nbytes, loff_t off)
3604{
3605 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3606 struct page_counter *counter;
3607
3608 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3609 case _MEM:
3610 counter = &memcg->memory;
3611 break;
3612 case _MEMSWAP:
3613 counter = &memcg->memsw;
3614 break;
3615 case _KMEM:
3616 counter = &memcg->kmem;
3617 break;
3618 case _TCP:
3619 counter = &memcg->tcpmem;
3620 break;
3621 default:
3622 BUG();
3623 }
3624
3625 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3626 case RES_MAX_USAGE:
3627 page_counter_reset_watermark(counter);
3628 break;
3629 case RES_FAILCNT:
3630 counter->failcnt = 0;
3631 break;
3632 default:
3633 BUG();
3634 }
3635
3636 return nbytes;
3637}
3638
3639static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3640 struct cftype *cft)
3641{
3642 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3643}
3644
3645#ifdef CONFIG_MMU
3646static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3647 struct cftype *cft, u64 val)
3648{
3649 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3650
3651 if (val & ~MOVE_MASK)
3652 return -EINVAL;
3653
	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once in the beginning of the process, and then
	 * carry on with stale data.  This means that changes to this value
	 * will only affect task migrations starting after the change.
	 */
3660 memcg->move_charge_at_immigrate = val;
3661 return 0;
3662}
3663#else
3664static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3665 struct cftype *cft, u64 val)
3666{
3667 return -ENOSYS;
3668}
3669#endif
3670
3671#ifdef CONFIG_NUMA
3672
3673#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3674#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3675#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3676
3677static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3678 int nid, unsigned int lru_mask, bool tree)
3679{
3680 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3681 unsigned long nr = 0;
3682 enum lru_list lru;
3683
3684 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3685
3686 for_each_lru(lru) {
3687 if (!(BIT(lru) & lru_mask))
3688 continue;
3689 if (tree)
3690 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3691 else
3692 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3693 }
3694 return nr;
3695}
3696
3697static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3698 unsigned int lru_mask,
3699 bool tree)
3700{
3701 unsigned long nr = 0;
3702 enum lru_list lru;
3703
3704 for_each_lru(lru) {
3705 if (!(BIT(lru) & lru_mask))
3706 continue;
3707 if (tree)
3708 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3709 else
3710 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3711 }
3712 return nr;
3713}
3714
3715static int memcg_numa_stat_show(struct seq_file *m, void *v)
3716{
3717 struct numa_stat {
3718 const char *name;
3719 unsigned int lru_mask;
3720 };
3721
3722 static const struct numa_stat stats[] = {
3723 { "total", LRU_ALL },
3724 { "file", LRU_ALL_FILE },
3725 { "anon", LRU_ALL_ANON },
3726 { "unevictable", BIT(LRU_UNEVICTABLE) },
3727 };
3728 const struct numa_stat *stat;
3729 int nid;
3730 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3731
3732 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3733 seq_printf(m, "%s=%lu", stat->name,
3734 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3735 false));
3736 for_each_node_state(nid, N_MEMORY)
3737 seq_printf(m, " N%d=%lu", nid,
3738 mem_cgroup_node_nr_lru_pages(memcg, nid,
3739 stat->lru_mask, false));
3740 seq_putc(m, '\n');
3741 }
3742
3743 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3744
3745 seq_printf(m, "hierarchical_%s=%lu", stat->name,
3746 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3747 true));
3748 for_each_node_state(nid, N_MEMORY)
3749 seq_printf(m, " N%d=%lu", nid,
3750 mem_cgroup_node_nr_lru_pages(memcg, nid,
3751 stat->lru_mask, true));
3752 seq_putc(m, '\n');
3753 }
3754
3755 return 0;
3756}
3757#endif
3758
3759static const unsigned int memcg1_stats[] = {
3760 NR_FILE_PAGES,
3761 NR_ANON_MAPPED,
3762#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3763 NR_ANON_THPS,
3764#endif
3765 NR_SHMEM,
3766 NR_FILE_MAPPED,
3767 NR_FILE_DIRTY,
3768 NR_WRITEBACK,
3769 MEMCG_SWAP,
3770};
3771
3772static const char *const memcg1_stat_names[] = {
3773 "cache",
3774 "rss",
3775#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3776 "rss_huge",
3777#endif
3778 "shmem",
3779 "mapped_file",
3780 "dirty",
3781 "writeback",
3782 "swap",
3783};
3784
3785
3786static const unsigned int memcg1_events[] = {
3787 PGPGIN,
3788 PGPGOUT,
3789 PGFAULT,
3790 PGMAJFAULT,
3791};
3792
3793static int memcg_stat_show(struct seq_file *m, void *v)
3794{
3795 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3796 unsigned long memory, memsw;
3797 struct mem_cgroup *mi;
3798 unsigned int i;
3799
3800 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3801
3802 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3803 unsigned long nr;
3804
3805 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3806 continue;
3807 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
3808#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3809 if (memcg1_stats[i] == NR_ANON_THPS)
3810 nr *= HPAGE_PMD_NR;
3811#endif
3812 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
3813 }
3814
3815 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3816 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3817 memcg_events_local(memcg, memcg1_events[i]));
3818
3819 for (i = 0; i < NR_LRU_LISTS; i++)
3820 seq_printf(m, "%s %lu\n", lru_list_name(i),
3821 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3822 PAGE_SIZE);
3823
3824
3825 memory = memsw = PAGE_COUNTER_MAX;
3826 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3827 memory = min(memory, READ_ONCE(mi->memory.max));
3828 memsw = min(memsw, READ_ONCE(mi->memsw.max));
3829 }
3830 seq_printf(m, "hierarchical_memory_limit %llu\n",
3831 (u64)memory * PAGE_SIZE);
3832 if (do_memsw_account())
3833 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3834 (u64)memsw * PAGE_SIZE);
3835
3836 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3837 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3838 continue;
3839 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3840 (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3841 PAGE_SIZE);
3842 }
3843
3844 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3845 seq_printf(m, "total_%s %llu\n",
3846 vm_event_name(memcg1_events[i]),
3847 (u64)memcg_events(memcg, memcg1_events[i]));
3848
3849 for (i = 0; i < NR_LRU_LISTS; i++)
3850 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
3851 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3852 PAGE_SIZE);
3853
3854#ifdef CONFIG_DEBUG_VM
3855 {
3856 pg_data_t *pgdat;
3857 struct mem_cgroup_per_node *mz;
3858 unsigned long anon_cost = 0;
3859 unsigned long file_cost = 0;
3860
3861 for_each_online_pgdat(pgdat) {
3862 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3863
3864 anon_cost += mz->lruvec.anon_cost;
3865 file_cost += mz->lruvec.file_cost;
3866 }
3867 seq_printf(m, "anon_cost %lu\n", anon_cost);
3868 seq_printf(m, "file_cost %lu\n", file_cost);
3869 }
3870#endif
3871
3872 return 0;
3873}
3874
3875static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3876 struct cftype *cft)
3877{
3878 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3879
3880 return mem_cgroup_swappiness(memcg);
3881}
3882
3883static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3884 struct cftype *cft, u64 val)
3885{
3886 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3887
3888 if (val > 100)
3889 return -EINVAL;
3890
3891 if (css->parent)
3892 memcg->swappiness = val;
3893 else
3894 vm_swappiness = val;
3895
3896 return 0;
3897}
3898
3899static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3900{
3901 struct mem_cgroup_threshold_ary *t;
3902 unsigned long usage;
3903 int i;
3904
3905 rcu_read_lock();
3906 if (!swap)
3907 t = rcu_dereference(memcg->thresholds.primary);
3908 else
3909 t = rcu_dereference(memcg->memsw_thresholds.primary);
3910
3911 if (!t)
3912 goto unlock;
3913
3914 usage = mem_cgroup_usage(memcg, swap);
3915
3916
3917
3918
3919
3920
3921 i = t->current_threshold;
3922
3923
3924
3925
3926
3927
3928
3929 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3930 eventfd_signal(t->entries[i].eventfd, 1);
3931
3932
3933 i++;
3934
3935
3936
3937
3938
3939
3940
3941 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3942 eventfd_signal(t->entries[i].eventfd, 1);
3943
3944
3945 t->current_threshold = i - 1;
3946unlock:
3947 rcu_read_unlock();
3948}
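/*
 * Editorial worked example, not part of the original file: with thresholds
 * at {4M, 8M, 16M} and current_threshold pointing at 8M, a usage reading of
 * 5M signals only the 8M eventfd (backward scan) and moves current_threshold
 * down to 4M; a reading of 20M signals the 16M eventfd (forward scan) and
 * moves current_threshold up to 16M.
 */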
3949
3950static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3951{
3952 while (memcg) {
3953 __mem_cgroup_threshold(memcg, false);
3954 if (do_memsw_account())
3955 __mem_cgroup_threshold(memcg, true);
3956
3957 memcg = parent_mem_cgroup(memcg);
3958 }
3959}
3960
3961static int compare_thresholds(const void *a, const void *b)
3962{
3963 const struct mem_cgroup_threshold *_a = a;
3964 const struct mem_cgroup_threshold *_b = b;
3965
3966 if (_a->threshold > _b->threshold)
3967 return 1;
3968
3969 if (_a->threshold < _b->threshold)
3970 return -1;
3971
3972 return 0;
3973}
3974
3975static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3976{
3977 struct mem_cgroup_eventfd_list *ev;
3978
3979 spin_lock(&memcg_oom_lock);
3980
3981 list_for_each_entry(ev, &memcg->oom_notify, list)
3982 eventfd_signal(ev->eventfd, 1);
3983
3984 spin_unlock(&memcg_oom_lock);
3985 return 0;
3986}
3987
3988static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3989{
3990 struct mem_cgroup *iter;
3991
3992 for_each_mem_cgroup_tree(iter, memcg)
3993 mem_cgroup_oom_notify_cb(iter);
3994}
3995
3996static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3997 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3998{
3999 struct mem_cgroup_thresholds *thresholds;
4000 struct mem_cgroup_threshold_ary *new;
4001 unsigned long threshold;
4002 unsigned long usage;
4003 int i, size, ret;
4004
4005 ret = page_counter_memparse(args, "-1", &threshold);
4006 if (ret)
4007 return ret;
4008
4009 mutex_lock(&memcg->thresholds_lock);
4010
4011 if (type == _MEM) {
4012 thresholds = &memcg->thresholds;
4013 usage = mem_cgroup_usage(memcg, false);
4014 } else if (type == _MEMSWAP) {
4015 thresholds = &memcg->memsw_thresholds;
4016 usage = mem_cgroup_usage(memcg, true);
4017 } else
4018 BUG();
4019
4020
4021 if (thresholds->primary)
4022 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4023
4024 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4025
4026
4027 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4028 if (!new) {
4029 ret = -ENOMEM;
4030 goto unlock;
4031 }
4032 new->size = size;
4033
4034
4035 if (thresholds->primary) {
4036 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4037 sizeof(struct mem_cgroup_threshold));
4038 }
4039
4040
4041 new->entries[size - 1].eventfd = eventfd;
4042 new->entries[size - 1].threshold = threshold;
4043
4044
4045 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4046 compare_thresholds, NULL);
4047
4048
4049 new->current_threshold = -1;
4050 for (i = 0; i < size; i++) {
4051 if (new->entries[i].threshold <= usage) {
4052
4053
4054
4055
4056
4057 ++new->current_threshold;
4058 } else
4059 break;
4060 }
4061
4062
4063 kfree(thresholds->spare);
4064 thresholds->spare = thresholds->primary;
4065
4066 rcu_assign_pointer(thresholds->primary, new);
4067
4068
4069 synchronize_rcu();
4070
4071unlock:
4072 mutex_unlock(&memcg->thresholds_lock);
4073
4074 return ret;
4075}
4076
4077static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4078 struct eventfd_ctx *eventfd, const char *args)
4079{
4080 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4081}
4082
4083static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4084 struct eventfd_ctx *eventfd, const char *args)
4085{
4086 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4087}
4088
4089static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4090 struct eventfd_ctx *eventfd, enum res_type type)
4091{
4092 struct mem_cgroup_thresholds *thresholds;
4093 struct mem_cgroup_threshold_ary *new;
4094 unsigned long usage;
4095 int i, j, size, entries;
4096
4097 mutex_lock(&memcg->thresholds_lock);
4098
4099 if (type == _MEM) {
4100 thresholds = &memcg->thresholds;
4101 usage = mem_cgroup_usage(memcg, false);
4102 } else if (type == _MEMSWAP) {
4103 thresholds = &memcg->memsw_thresholds;
4104 usage = mem_cgroup_usage(memcg, true);
4105 } else
4106 BUG();
4107
4108 if (!thresholds->primary)
4109 goto unlock;
4110
4111
4112 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4113
4114
4115 size = entries = 0;
4116 for (i = 0; i < thresholds->primary->size; i++) {
4117 if (thresholds->primary->entries[i].eventfd != eventfd)
4118 size++;
4119 else
4120 entries++;
4121 }
4122
4123 new = thresholds->spare;
4124
4125
4126 if (!entries)
4127 goto unlock;
4128
4129
4130 if (!size) {
4131 kfree(new);
4132 new = NULL;
4133 goto swap_buffers;
4134 }
4135
4136 new->size = size;
4137
4138
4139 new->current_threshold = -1;
4140 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4141 if (thresholds->primary->entries[i].eventfd == eventfd)
4142 continue;
4143
4144 new->entries[j] = thresholds->primary->entries[i];
4145 if (new->entries[j].threshold <= usage) {
4146
4147
4148
4149
4150
4151 ++new->current_threshold;
4152 }
4153 j++;
4154 }
4155
4156swap_buffers:
4157
4158 thresholds->spare = thresholds->primary;
4159
4160 rcu_assign_pointer(thresholds->primary, new);
4161
4162
4163 synchronize_rcu();
4164
4165
4166 if (!new) {
4167 kfree(thresholds->spare);
4168 thresholds->spare = NULL;
4169 }
4170unlock:
4171 mutex_unlock(&memcg->thresholds_lock);
4172}
4173
4174static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4175 struct eventfd_ctx *eventfd)
4176{
4177 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4178}
4179
4180static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4181 struct eventfd_ctx *eventfd)
4182{
4183 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4184}
4185
4186static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4187 struct eventfd_ctx *eventfd, const char *args)
4188{
4189 struct mem_cgroup_eventfd_list *event;
4190
4191 event = kmalloc(sizeof(*event), GFP_KERNEL);
4192 if (!event)
4193 return -ENOMEM;
4194
4195 spin_lock(&memcg_oom_lock);
4196
4197 event->eventfd = eventfd;
4198 list_add(&event->list, &memcg->oom_notify);
4199
4200
4201 if (memcg->under_oom)
4202 eventfd_signal(eventfd, 1);
4203 spin_unlock(&memcg_oom_lock);
4204
4205 return 0;
4206}
4207
4208static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4209 struct eventfd_ctx *eventfd)
4210{
4211 struct mem_cgroup_eventfd_list *ev, *tmp;
4212
4213 spin_lock(&memcg_oom_lock);
4214
4215 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4216 if (ev->eventfd == eventfd) {
4217 list_del(&ev->list);
4218 kfree(ev);
4219 }
4220 }
4221
4222 spin_unlock(&memcg_oom_lock);
4223}
4224
4225static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4226{
4227 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4228
4229 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4230 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4231 seq_printf(sf, "oom_kill %lu\n",
4232 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4233 return 0;
4234}
4235
4236static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4237 struct cftype *cft, u64 val)
4238{
4239 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4240
4241
4242 if (!css->parent || !((val == 0) || (val == 1)))
4243 return -EINVAL;
4244
4245 memcg->oom_kill_disable = val;
4246 if (!val)
4247 memcg_oom_recover(memcg);
4248
4249 return 0;
4250}
4251
4252#ifdef CONFIG_CGROUP_WRITEBACK
4253
4254#include <trace/events/writeback.h>
4255
4256static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4257{
4258 return wb_domain_init(&memcg->cgwb_domain, gfp);
4259}
4260
4261static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4262{
4263 wb_domain_exit(&memcg->cgwb_domain);
4264}
4265
4266static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4267{
4268 wb_domain_size_changed(&memcg->cgwb_domain);
4269}
4270
4271struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4272{
4273 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4274
4275 if (!memcg->css.parent)
4276 return NULL;
4277
4278 return &memcg->cgwb_domain;
4279}
4280
/*
 * Return the exact (non-batched) value of a memcg vmstat counter by folding
 * the pending per-cpu deltas into the atomic total; negative intermediate
 * sums are clamped to 0.
 */
4285static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4286{
4287 long x = atomic_long_read(&memcg->vmstats[idx]);
4288 int cpu;
4289
4290 for_each_online_cpu(cpu)
4291 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4292 if (x < 0)
4293 x = 0;
4294 return x;
4295}
4296
/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors.  Note that this doesn't consider the actual amount of
 * available memory in the system.  The caller should further cap
 * *@pheadroom accordingly.
 */
4315void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4316 unsigned long *pheadroom, unsigned long *pdirty,
4317 unsigned long *pwriteback)
4318{
4319 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4320 struct mem_cgroup *parent;
4321
4322 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4323
4324 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4325 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4326 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4327 *pheadroom = PAGE_COUNTER_MAX;
4328
4329 while ((parent = parent_mem_cgroup(memcg))) {
4330 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4331 READ_ONCE(memcg->memory.high));
4332 unsigned long used = page_counter_read(&memcg->memory);
4333
4334 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4335 memcg = parent;
4336 }
4337}
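/*
 * Editorial worked example, not part of the original file: for a cgroup with
 * memory.max = 100M, memory.high = 80M and 60M of charged memory, the
 * ceiling is min(100M, 80M) = 80M and the local headroom is 80M - 60M = 20M;
 * walking up the ancestors keeps the smallest headroom seen, so a tighter
 * parent further limits how much dirty memory the wb may target.
 */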
4338
/*
 * Foreign dirty flushing
 *
 * There is an inherent mismatch between memcg and writeback.  The former
 * tracks ownership per page while the latter depends on per-inode dirty
 * throttling: a page can be dirtied by a cgroup that does not own the
 * inode's writeback domain ("foreign" dirtying), and such pages are only
 * cleaned when the owning wb is written back, which can leave the dirtier
 * throttled against the wrong domain.
 *
 * To mitigate this, each memcg remembers a small number of recently seen
 * foreign (bdi, memcg) pairs in cgwb_frn[]; mem_cgroup_flush_foreign()
 * later triggers explicit writeback on those foreign wbs so that the
 * foreign dirty pages drain and the mismatch stays temporary.
 */
4383void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4384 struct bdi_writeback *wb)
4385{
4386 struct mem_cgroup *memcg = page->mem_cgroup;
4387 struct memcg_cgwb_frn *frn;
4388 u64 now = get_jiffies_64();
4389 u64 oldest_at = now;
4390 int oldest = -1;
4391 int i;
4392
4393 trace_track_foreign_dirty(page, wb);
4394
4395
4396
4397
4398
4399
4400 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4401 frn = &memcg->cgwb_frn[i];
4402 if (frn->bdi_id == wb->bdi->id &&
4403 frn->memcg_id == wb->memcg_css->id)
4404 break;
4405 if (time_before64(frn->at, oldest_at) &&
4406 atomic_read(&frn->done.cnt) == 1) {
4407 oldest = i;
4408 oldest_at = frn->at;
4409 }
4410 }
4411
4412 if (i < MEMCG_CGWB_FRN_CNT) {
4413
4414
4415
4416
4417
4418
4419
4420 unsigned long update_intv =
4421 min_t(unsigned long, HZ,
4422 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4423
4424 if (time_before64(frn->at, now - update_intv))
4425 frn->at = now;
4426 } else if (oldest >= 0) {
4427
4428 frn = &memcg->cgwb_frn[oldest];
4429 frn->bdi_id = wb->bdi->id;
4430 frn->memcg_id = wb->memcg_css->id;
4431 frn->at = now;
4432 }
4433}
4434
4435
4436void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4437{
4438 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4439 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4440 u64 now = jiffies_64;
4441 int i;
4442
4443 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4444 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4445
4446
4447
4448
4449
4450
4451
4452 if (time_after64(frn->at, now - intv) &&
4453 atomic_read(&frn->done.cnt) == 1) {
4454 frn->at = 0;
4455 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4456 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4457 WB_REASON_FOREIGN_FLUSH,
4458 &frn->done);
4459 }
4460 }
4461}
4462
4463#else
4464
4465static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4466{
4467 return 0;
4468}
4469
4470static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4471{
4472}
4473
4474static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4475{
4476}
4477
4478#endif
4479
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
4498static void memcg_event_remove(struct work_struct *work)
4499{
4500 struct mem_cgroup_event *event =
4501 container_of(work, struct mem_cgroup_event, remove);
4502 struct mem_cgroup *memcg = event->memcg;
4503
4504 remove_wait_queue(event->wqh, &event->wait);
4505
4506 event->unregister_event(memcg, event->eventfd);
4507
4508
4509 eventfd_signal(event->eventfd, 1);
4510
4511 eventfd_ctx_put(event->eventfd);
4512 kfree(event);
4513 css_put(&memcg->css);
4514}
4515
/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
4521static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4522 int sync, void *key)
4523{
4524 struct mem_cgroup_event *event =
4525 container_of(wait, struct mem_cgroup_event, wait);
4526 struct mem_cgroup *memcg = event->memcg;
4527 __poll_t flags = key_to_poll(key);
4528
4529 if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
4539 spin_lock(&memcg->event_list_lock);
4540 if (!list_empty(&event->list)) {
4541 list_del_init(&event->list);
4542
4543
4544
4545
4546 schedule_work(&event->remove);
4547 }
4548 spin_unlock(&memcg->event_list_lock);
4549 }
4550
4551 return 0;
4552}
4553
4554static void memcg_event_ptable_queue_proc(struct file *file,
4555 wait_queue_head_t *wqh, poll_table *pt)
4556{
4557 struct mem_cgroup_event *event =
4558 container_of(pt, struct mem_cgroup_event, pt);
4559
4560 event->wqh = wqh;
4561 add_wait_queue(wqh, &event->wait);
4562}
4563
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register a new cgroup event handler.
 *
 * Input must be in the format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by the control file implementation.
 */
4572static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4573 char *buf, size_t nbytes, loff_t off)
4574{
4575 struct cgroup_subsys_state *css = of_css(of);
4576 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4577 struct mem_cgroup_event *event;
4578 struct cgroup_subsys_state *cfile_css;
4579 unsigned int efd, cfd;
4580 struct fd efile;
4581 struct fd cfile;
4582 const char *name;
4583 char *endp;
4584 int ret;
4585
4586 buf = strstrip(buf);
4587
4588 efd = simple_strtoul(buf, &endp, 10);
4589 if (*endp != ' ')
4590 return -EINVAL;
4591 buf = endp + 1;
4592
4593 cfd = simple_strtoul(buf, &endp, 10);
4594 if ((*endp != ' ') && (*endp != '\0'))
4595 return -EINVAL;
4596 buf = endp + 1;
4597
4598 event = kzalloc(sizeof(*event), GFP_KERNEL);
4599 if (!event)
4600 return -ENOMEM;
4601
4602 event->memcg = memcg;
4603 INIT_LIST_HEAD(&event->list);
4604 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4605 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4606 INIT_WORK(&event->remove, memcg_event_remove);
4607
4608 efile = fdget(efd);
4609 if (!efile.file) {
4610 ret = -EBADF;
4611 goto out_kfree;
4612 }
4613
4614 event->eventfd = eventfd_ctx_fileget(efile.file);
4615 if (IS_ERR(event->eventfd)) {
4616 ret = PTR_ERR(event->eventfd);
4617 goto out_put_efile;
4618 }
4619
4620 cfile = fdget(cfd);
4621 if (!cfile.file) {
4622 ret = -EBADF;
4623 goto out_put_eventfd;
4624 }
4625
4626
4627
4628 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4629 if (ret < 0)
4630 goto out_put_cfile;
4631
	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
4640 name = cfile.file->f_path.dentry->d_name.name;
4641
4642 if (!strcmp(name, "memory.usage_in_bytes")) {
4643 event->register_event = mem_cgroup_usage_register_event;
4644 event->unregister_event = mem_cgroup_usage_unregister_event;
4645 } else if (!strcmp(name, "memory.oom_control")) {
4646 event->register_event = mem_cgroup_oom_register_event;
4647 event->unregister_event = mem_cgroup_oom_unregister_event;
4648 } else if (!strcmp(name, "memory.pressure_level")) {
4649 event->register_event = vmpressure_register_event;
4650 event->unregister_event = vmpressure_unregister_event;
4651 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4652 event->register_event = memsw_cgroup_usage_register_event;
4653 event->unregister_event = memsw_cgroup_usage_unregister_event;
4654 } else {
4655 ret = -EINVAL;
4656 goto out_put_cfile;
4657 }
4658
	/*
	 * Verify that @cfile belongs to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
4664 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4665 &memory_cgrp_subsys);
4666 ret = -EINVAL;
4667 if (IS_ERR(cfile_css))
4668 goto out_put_cfile;
4669 if (cfile_css != css) {
4670 css_put(cfile_css);
4671 goto out_put_cfile;
4672 }
4673
4674 ret = event->register_event(memcg, event->eventfd, buf);
4675 if (ret)
4676 goto out_put_css;
4677
4678 vfs_poll(efile.file, &event->pt);
4679
4680 spin_lock(&memcg->event_list_lock);
4681 list_add(&event->list, &memcg->event_list);
4682 spin_unlock(&memcg->event_list_lock);
4683
4684 fdput(cfile);
4685 fdput(efile);
4686
4687 return nbytes;
4688
4689out_put_css:
4690 css_put(css);
4691out_put_cfile:
4692 fdput(cfile);
4693out_put_eventfd:
4694 eventfd_ctx_put(event->eventfd);
4695out_put_efile:
4696 fdput(efile);
4697out_kfree:
4698 kfree(event);
4699
4700 return ret;
4701}
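/*
 * Editorial example, not part of the original file: from userspace, a v1
 * event is armed by writing "<eventfd fd> <target file fd> <args>" to
 * cgroup.event_control, e.g. the fd of an eventfd, the fd of this cgroup's
 * memory.usage_in_bytes, and a threshold in bytes; the eventfd then pops
 * whenever the threshold is crossed (see the register handlers wired up
 * above).
 */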
4702
4703static struct cftype mem_cgroup_legacy_files[] = {
4704 {
4705 .name = "usage_in_bytes",
4706 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4707 .read_u64 = mem_cgroup_read_u64,
4708 },
4709 {
4710 .name = "max_usage_in_bytes",
4711 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4712 .write = mem_cgroup_reset,
4713 .read_u64 = mem_cgroup_read_u64,
4714 },
4715 {
4716 .name = "limit_in_bytes",
4717 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4718 .write = mem_cgroup_write,
4719 .read_u64 = mem_cgroup_read_u64,
4720 },
4721 {
4722 .name = "soft_limit_in_bytes",
4723 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4724 .write = mem_cgroup_write,
4725 .read_u64 = mem_cgroup_read_u64,
4726 },
4727 {
4728 .name = "failcnt",
4729 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4730 .write = mem_cgroup_reset,
4731 .read_u64 = mem_cgroup_read_u64,
4732 },
4733 {
4734 .name = "stat",
4735 .seq_show = memcg_stat_show,
4736 },
4737 {
4738 .name = "force_empty",
4739 .write = mem_cgroup_force_empty_write,
4740 },
4741 {
4742 .name = "use_hierarchy",
4743 .write_u64 = mem_cgroup_hierarchy_write,
4744 .read_u64 = mem_cgroup_hierarchy_read,
4745 },
4746 {
4747 .name = "cgroup.event_control",
4748 .write = memcg_write_event_control,
4749 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4750 },
4751 {
4752 .name = "swappiness",
4753 .read_u64 = mem_cgroup_swappiness_read,
4754 .write_u64 = mem_cgroup_swappiness_write,
4755 },
4756 {
4757 .name = "move_charge_at_immigrate",
4758 .read_u64 = mem_cgroup_move_charge_read,
4759 .write_u64 = mem_cgroup_move_charge_write,
4760 },
4761 {
4762 .name = "oom_control",
4763 .seq_show = mem_cgroup_oom_control_read,
4764 .write_u64 = mem_cgroup_oom_control_write,
4765 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4766 },
4767 {
4768 .name = "pressure_level",
4769 },
4770#ifdef CONFIG_NUMA
4771 {
4772 .name = "numa_stat",
4773 .seq_show = memcg_numa_stat_show,
4774 },
4775#endif
4776 {
4777 .name = "kmem.limit_in_bytes",
4778 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4779 .write = mem_cgroup_write,
4780 .read_u64 = mem_cgroup_read_u64,
4781 },
4782 {
4783 .name = "kmem.usage_in_bytes",
4784 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4785 .read_u64 = mem_cgroup_read_u64,
4786 },
4787 {
4788 .name = "kmem.failcnt",
4789 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4790 .write = mem_cgroup_reset,
4791 .read_u64 = mem_cgroup_read_u64,
4792 },
4793 {
4794 .name = "kmem.max_usage_in_bytes",
4795 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4796 .write = mem_cgroup_reset,
4797 .read_u64 = mem_cgroup_read_u64,
4798 },
4799#if defined(CONFIG_MEMCG_KMEM) && \
4800 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
4801 {
4802 .name = "kmem.slabinfo",
4803 .seq_start = memcg_slab_start,
4804 .seq_next = memcg_slab_next,
4805 .seq_stop = memcg_slab_stop,
4806 .seq_show = memcg_slab_show,
4807 },
4808#endif
4809 {
4810 .name = "kmem.tcp.limit_in_bytes",
4811 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4812 .write = mem_cgroup_write,
4813 .read_u64 = mem_cgroup_read_u64,
4814 },
4815 {
4816 .name = "kmem.tcp.usage_in_bytes",
4817 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4818 .read_u64 = mem_cgroup_read_u64,
4819 },
4820 {
4821 .name = "kmem.tcp.failcnt",
4822 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4823 .write = mem_cgroup_reset,
4824 .read_u64 = mem_cgroup_read_u64,
4825 },
4826 {
4827 .name = "kmem.tcp.max_usage_in_bytes",
4828 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4829 .write = mem_cgroup_reset,
4830 .read_u64 = mem_cgroup_read_u64,
4831 },
4832 { },
4833};
4834
/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID.  We want to keep
 * those dead CSSes from occupying IDs, or we might quickly exhaust the
 * limited space.
 *
 * Instead of tying the ID's lifetime to the CSS, the ID has its own
 * reference counter: the CSS holds a single reference that is dropped
 * on offlining, while other users (e.g. swap records) take and drop
 * references of their own, so the ID is freed only once the last user
 * is done with it.
 */
4859static DEFINE_IDR(mem_cgroup_idr);
4860
4861static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4862{
4863 if (memcg->id.id > 0) {
4864 idr_remove(&mem_cgroup_idr, memcg->id.id);
4865 memcg->id.id = 0;
4866 }
4867}
4868
4869static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
4870 unsigned int n)
4871{
4872 refcount_add(n, &memcg->id.ref);
4873}
4874
4875static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4876{
4877 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4878 mem_cgroup_id_remove(memcg);
4879
4880
4881 css_put(&memcg->css);
4882 }
4883}
4884
4885static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4886{
4887 mem_cgroup_id_put_many(memcg, 1);
4888}
4889
/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock().
 */
4896struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4897{
4898 WARN_ON_ONCE(!rcu_read_lock_held());
4899 return idr_find(&mem_cgroup_idr, id);
4900}
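/*
 * Editorial example, not part of the original file: typical lookup pattern
 * for a recorded memcg ID (e.g. from a swap or shadow entry).  The ID does
 * not pin an online css, so callers that need to operate on the cgroup still
 * have to try to acquire a reference under RCU.  example_lookup_memcg() is a
 * hypothetical sketch, guarded out of the build.
 */
#if 0
static struct mem_cgroup *example_lookup_memcg(unsigned short id)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg && !css_tryget_online(&memcg->css))
		memcg = NULL;	/* cgroup is going away, treat as unowned */
	rcu_read_unlock();
	return memcg;		/* caller does css_put(&memcg->css) when done */
}
#endif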
4901
4902static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4903{
4904 struct mem_cgroup_per_node *pn;
4905 int tmp = node;
4906
	/*
	 * This routine is called against possible nodes, but it's a BUG to
	 * call kzalloc_node() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 * never be onlined.  It's better to use a memory hotplug callback
	 * function.
	 */
4914 if (!node_state(node, N_NORMAL_MEMORY))
4915 tmp = -1;
4916 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4917 if (!pn)
4918 return 1;
4919
4920 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4921 if (!pn->lruvec_stat_local) {
4922 kfree(pn);
4923 return 1;
4924 }
4925
4926 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4927 if (!pn->lruvec_stat_cpu) {
4928 free_percpu(pn->lruvec_stat_local);
4929 kfree(pn);
4930 return 1;
4931 }
4932
4933 lruvec_init(&pn->lruvec);
4934 pn->usage_in_excess = 0;
4935 pn->on_tree = false;
4936 pn->memcg = memcg;
4937
4938 memcg->nodeinfo[node] = pn;
4939 return 0;
4940}
4941
4942static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4943{
4944 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4945
4946 if (!pn)
4947 return;
4948
4949 free_percpu(pn->lruvec_stat_cpu);
4950 free_percpu(pn->lruvec_stat_local);
4951 kfree(pn);
4952}
4953
4954static void __mem_cgroup_free(struct mem_cgroup *memcg)
4955{
4956 int node;
4957
4958 for_each_node(node)
4959 free_mem_cgroup_per_node_info(memcg, node);
4960 free_percpu(memcg->vmstats_percpu);
4961 free_percpu(memcg->vmstats_local);
4962 kfree(memcg);
4963}
4964
4965static void mem_cgroup_free(struct mem_cgroup *memcg)
4966{
4967 memcg_wb_domain_exit(memcg);
4968
4969
4970
4971
4972 memcg_flush_percpu_vmstats(memcg);
4973 memcg_flush_percpu_vmevents(memcg);
4974 __mem_cgroup_free(memcg);
4975}
4976
4977static struct mem_cgroup *mem_cgroup_alloc(void)
4978{
4979 struct mem_cgroup *memcg;
4980 unsigned int size;
4981 int node;
4982 int __maybe_unused i;
4983 long error = -ENOMEM;
4984
4985 size = sizeof(struct mem_cgroup);
4986 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4987
4988 memcg = kzalloc(size, GFP_KERNEL);
4989 if (!memcg)
4990 return ERR_PTR(error);
4991
4992 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4993 1, MEM_CGROUP_ID_MAX,
4994 GFP_KERNEL);
4995 if (memcg->id.id < 0) {
4996 error = memcg->id.id;
4997 goto fail;
4998 }
4999
5000 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
5001 if (!memcg->vmstats_local)
5002 goto fail;
5003
5004 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
5005 if (!memcg->vmstats_percpu)
5006 goto fail;
5007
5008 for_each_node(node)
5009 if (alloc_mem_cgroup_per_node_info(memcg, node))
5010 goto fail;
5011
5012 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5013 goto fail;
5014
5015 INIT_WORK(&memcg->high_work, high_work_func);
5016 INIT_LIST_HEAD(&memcg->oom_notify);
5017 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock);
5019 vmpressure_init(&memcg->vmpressure);
5020 INIT_LIST_HEAD(&memcg->event_list);
5021 spin_lock_init(&memcg->event_list_lock);
5022 memcg->socket_pressure = jiffies;
5023#ifdef CONFIG_MEMCG_KMEM
5024 memcg->kmemcg_id = -1;
5025#endif
5026#ifdef CONFIG_CGROUP_WRITEBACK
5027 INIT_LIST_HEAD(&memcg->cgwb_list);
5028 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5029 memcg->cgwb_frn[i].done =
5030 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5031#endif
5032#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5033 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5034 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5035 memcg->deferred_split_queue.split_queue_len = 0;
5036#endif
5037 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5038 return memcg;
5039fail:
5040 mem_cgroup_id_remove(memcg);
5041 __mem_cgroup_free(memcg);
5042 return ERR_PTR(error);
5043}
5044
5045static struct cgroup_subsys_state * __ref
5046mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5047{
5048 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5049 struct mem_cgroup *memcg;
5050 long error = -ENOMEM;
5051
5052 memcg = mem_cgroup_alloc();
5053 if (IS_ERR(memcg))
5054 return ERR_CAST(memcg);
5055
5056 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5057 memcg->soft_limit = PAGE_COUNTER_MAX;
5058 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5059 if (parent) {
5060 memcg->swappiness = mem_cgroup_swappiness(parent);
5061 memcg->oom_kill_disable = parent->oom_kill_disable;
5062 }
5063 if (parent && parent->use_hierarchy) {
5064 memcg->use_hierarchy = true;
5065 page_counter_init(&memcg->memory, &parent->memory);
5066 page_counter_init(&memcg->swap, &parent->swap);
5067 page_counter_init(&memcg->memsw, &parent->memsw);
5068 page_counter_init(&memcg->kmem, &parent->kmem);
5069 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5070 } else {
5071 page_counter_init(&memcg->memory, NULL);
5072 page_counter_init(&memcg->swap, NULL);
5073 page_counter_init(&memcg->memsw, NULL);
5074 page_counter_init(&memcg->kmem, NULL);
5075 page_counter_init(&memcg->tcpmem, NULL);
5076
5077
5078
5079
5080
5081 if (parent != root_mem_cgroup)
5082 memory_cgrp_subsys.broken_hierarchy = true;
5083 }
5084
5085
5086 if (!parent) {
5087#ifdef CONFIG_MEMCG_KMEM
5088 INIT_LIST_HEAD(&memcg->kmem_caches);
5089#endif
5090 root_mem_cgroup = memcg;
5091 return &memcg->css;
5092 }
5093
5094 error = memcg_online_kmem(memcg);
5095 if (error)
5096 goto fail;
5097
5098 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5099 static_branch_inc(&memcg_sockets_enabled_key);
5100
5101 return &memcg->css;
5102fail:
5103 mem_cgroup_id_remove(memcg);
5104 mem_cgroup_free(memcg);
5105 return ERR_PTR(error);
5106}
5107
5108static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5109{
5110 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5111
	/*
	 * A memcg must be visible for memcg_expand_shrinker_maps()
	 * by the time the maps are allocated, so we allocate the maps
	 * here, when for_each_mem_cgroup() can't skip it.
	 */
5117 if (memcg_alloc_shrinker_maps(memcg)) {
5118 mem_cgroup_id_remove(memcg);
5119 return -ENOMEM;
5120 }
5121
5122
5123 refcount_set(&memcg->id.ref, 1);
5124 css_get(css);
5125 return 0;
5126}
5127
5128static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5129{
5130 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5131 struct mem_cgroup_event *event, *tmp;
5132
5133
5134
5135
5136
5137
5138 spin_lock(&memcg->event_list_lock);
5139 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5140 list_del_init(&event->list);
5141 schedule_work(&event->remove);
5142 }
5143 spin_unlock(&memcg->event_list_lock);
5144
5145 page_counter_set_min(&memcg->memory, 0);
5146 page_counter_set_low(&memcg->memory, 0);
5147
5148 memcg_offline_kmem(memcg);
5149 wb_memcg_offline(memcg);
5150
5151 drain_all_stock(memcg);
5152
5153 mem_cgroup_id_put(memcg);
5154}
5155
5156static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5157{
5158 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5159
5160 invalidate_reclaim_iterators(memcg);
5161}
5162
5163static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5164{
5165 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5166 int __maybe_unused i;
5167
5168#ifdef CONFIG_CGROUP_WRITEBACK
5169 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5170 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5171#endif
5172 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5173 static_branch_dec(&memcg_sockets_enabled_key);
5174
5175 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5176 static_branch_dec(&memcg_sockets_enabled_key);
5177
5178 vmpressure_cleanup(&memcg->vmpressure);
5179 cancel_work_sync(&memcg->high_work);
5180 mem_cgroup_remove_from_trees(memcg);
5181 memcg_free_shrinker_maps(memcg);
5182 memcg_free_kmem(memcg);
5183 mem_cgroup_free(memcg);
5184}
5185
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
5199static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5200{
5201 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5202
5203 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5204 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5205 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5206 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5207 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5208 page_counter_set_min(&memcg->memory, 0);
5209 page_counter_set_low(&memcg->memory, 0);
5210 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5211 memcg->soft_limit = PAGE_COUNTER_MAX;
5212 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5213 memcg_wb_domain_size_changed(memcg);
5214}
5215
5216#ifdef CONFIG_MMU
5217
5218static int mem_cgroup_do_precharge(unsigned long count)
5219{
5220 int ret;
5221
5222
5223 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5224 if (!ret) {
5225 mc.precharge += count;
5226 return ret;
5227 }
5228
5229
5230 while (count--) {
5231 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5232 if (ret)
5233 return ret;
5234 mc.precharge++;
5235 cond_resched();
5236 }
5237 return 0;
5238}
5239
5240union mc_target {
5241 struct page *page;
5242 swp_entry_t ent;
5243};
5244
5245enum mc_target_type {
5246 MC_TARGET_NONE = 0,
5247 MC_TARGET_PAGE,
5248 MC_TARGET_SWAP,
5249 MC_TARGET_DEVICE,
5250};
5251
5252static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5253 unsigned long addr, pte_t ptent)
5254{
5255 struct page *page = vm_normal_page(vma, addr, ptent);
5256
5257 if (!page || !page_mapped(page))
5258 return NULL;
5259 if (PageAnon(page)) {
5260 if (!(mc.flags & MOVE_ANON))
5261 return NULL;
5262 } else {
5263 if (!(mc.flags & MOVE_FILE))
5264 return NULL;
5265 }
5266 if (!get_page_unless_zero(page))
5267 return NULL;
5268
5269 return page;
5270}
5271
5272#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5273static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5274 pte_t ptent, swp_entry_t *entry)
5275{
5276 struct page *page = NULL;
5277 swp_entry_t ent = pte_to_swp_entry(ptent);
5278
5279 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5280 return NULL;
5281
5282
5283
5284
5285
5286
5287 if (is_device_private_entry(ent)) {
5288 page = device_private_entry_to_page(ent);
5289
5290
5291
5292
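		/*
		 * MEMORY_DEVICE_PRIVATE (ZONE_DEVICE) pages have a refcount
		 * of 1 when free, so only take a reference if the page is
		 * still in use.
		 */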
5293 if (!page_ref_add_unless(page, 1, 1))
5294 return NULL;
5295 return page;
5296 }
5297
5298
5299
5300
5301
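	/*
	 * Look the page up in the swap cache directly; lookup_swap_cache()
	 * would update statistics counters that are not wanted here.
	 */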
5302 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5303 entry->val = ent.val;
5304
5305 return page;
5306}
5307#else
5308static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5309 pte_t ptent, swp_entry_t *entry)
5310{
5311 return NULL;
5312}
5313#endif
5314
5315static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5316 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5317{
5318 struct page *page = NULL;
5319 struct address_space *mapping;
5320 pgoff_t pgoff;
5321
5322 if (!vma->vm_file)
5323 return NULL;
5324 if (!(mc.flags & MOVE_FILE))
5325 return NULL;
5326
5327 mapping = vma->vm_file->f_mapping;
5328 pgoff = linear_page_index(vma, addr);
5329
5330
5331#ifdef CONFIG_SWAP
5332
5333 if (shmem_mapping(mapping)) {
5334 page = find_get_entry(mapping, pgoff);
5335 if (xa_is_value(page)) {
5336 swp_entry_t swp = radix_to_swp_entry(page);
5337 *entry = swp;
5338 page = find_get_page(swap_address_space(swp),
5339 swp_offset(swp));
5340 }
5341 } else
5342 page = find_get_page(mapping, pgoff);
5343#else
5344 page = find_get_page(mapping, pgoff);
5345#endif
5346 return page;
5347}
5348
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on LRU (isolate_lru_page() is
 * useful here).
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */
5361static int mem_cgroup_move_account(struct page *page,
5362 bool compound,
5363 struct mem_cgroup *from,
5364 struct mem_cgroup *to)
5365{
5366 struct lruvec *from_vec, *to_vec;
5367 struct pglist_data *pgdat;
5368 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5369 int ret;
5370
5371 VM_BUG_ON(from == to);
5372 VM_BUG_ON_PAGE(PageLRU(page), page);
5373 VM_BUG_ON(compound && !PageTransHuge(page));
5374
5375
5376
5377
5378
5379 ret = -EBUSY;
5380 if (!trylock_page(page))
5381 goto out;
5382
5383 ret = -EINVAL;
5384 if (page->mem_cgroup != from)
5385 goto out_unlock;
5386
5387 pgdat = page_pgdat(page);
5388 from_vec = mem_cgroup_lruvec(from, pgdat);
5389 to_vec = mem_cgroup_lruvec(to, pgdat);
5390
5391 lock_page_memcg(page);
5392
5393 if (PageAnon(page)) {
5394 if (page_mapped(page)) {
5395 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5396 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5397 if (PageTransHuge(page)) {
5398 __mod_lruvec_state(from_vec, NR_ANON_THPS,
5399 -nr_pages);
5400 __mod_lruvec_state(to_vec, NR_ANON_THPS,
5401 nr_pages);
5402 }
5403
5404 }
5405 } else {
5406 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5407 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5408
5409 if (PageSwapBacked(page)) {
5410 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5411 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5412 }
5413
5414 if (page_mapped(page)) {
5415 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5416 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5417 }
5418
5419 if (PageDirty(page)) {
5420 struct address_space *mapping = page_mapping(page);
5421
5422 if (mapping_cap_account_dirty(mapping)) {
5423 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5424 -nr_pages);
5425 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5426 nr_pages);
5427 }
5428 }
5429 }
5430
5431 if (PageWriteback(page)) {
5432 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5433 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5434 }
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
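	/*
	 * All state has been migrated, switch to the new memcg.
	 *
	 * It is safe to change page->mem_cgroup here because the page
	 * is referenced, charged, isolated, and locked: we can't race
	 * with (un)charging, migration, LRU putback, or anything else
	 * that would rely on a stable page->mem_cgroup.
	 *
	 * As soon as page->mem_cgroup points at a memcg that isn't
	 * locked by lock_page_memcg(), the above state can change
	 * concurrently again, so make sure we're truly done with it.
	 */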
5449 smp_mb();
5450
5451 page->mem_cgroup = to;
5452
5453 __unlock_page_memcg(from);
5454
5455 ret = 0;
5456
5457 local_irq_disable();
5458 mem_cgroup_charge_statistics(to, page, nr_pages);
5459 memcg_check_events(to, page);
5460 mem_cgroup_charge_statistics(from, page, -nr_pages);
5461 memcg_check_events(from, page);
5462 local_irq_enable();
5463out_unlock:
5464 unlock_page(page);
5465out:
5466 return ret;
5467}
5468
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored (can be NULL)
 *
 * Returns
 *   MC_TARGET_NONE: if the pte is not a target for move charge.
 *   MC_TARGET_PAGE: if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers should handle it).
 *   MC_TARGET_SWAP: if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *   MC_TARGET_DEVICE: like MC_TARGET_PAGE but the page is
 *     MEMORY_DEVICE_PRIVATE (a ZONE_DEVICE page, thus not on the LRU).
 *
 * Called with pte lock held.
 */
5495static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5496 unsigned long addr, pte_t ptent, union mc_target *target)
5497{
5498 struct page *page = NULL;
5499 enum mc_target_type ret = MC_TARGET_NONE;
5500 swp_entry_t ent = { .val = 0 };
5501
5502 if (pte_present(ptent))
5503 page = mc_handle_present_pte(vma, addr, ptent);
5504 else if (is_swap_pte(ptent))
5505 page = mc_handle_swap_pte(vma, ptent, &ent);
5506 else if (pte_none(ptent))
5507 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5508
5509 if (!page && !ent.val)
5510 return ret;
5511 if (page) {
5512
5513
5514
5515
5516
5517 if (page->mem_cgroup == mc.from) {
5518 ret = MC_TARGET_PAGE;
5519 if (is_device_private_page(page))
5520 ret = MC_TARGET_DEVICE;
5521 if (target)
5522 target->page = page;
5523 }
5524 if (!ret || !target)
5525 put_page(page);
5526 }
5527
5528
5529
5530
5531 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5532 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5533 ret = MC_TARGET_SWAP;
5534 if (target)
5535 target->ent = ent;
5536 }
5537 return ret;
5538}
5539
5540#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5541
5542
5543
5544
5545
5546static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5547 unsigned long addr, pmd_t pmd, union mc_target *target)
5548{
5549 struct page *page = NULL;
5550 enum mc_target_type ret = MC_TARGET_NONE;
5551
5552 if (unlikely(is_swap_pmd(pmd))) {
5553 VM_BUG_ON(thp_migration_supported() &&
5554 !is_pmd_migration_entry(pmd));
5555 return ret;
5556 }
5557 page = pmd_page(pmd);
5558 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5559 if (!(mc.flags & MOVE_ANON))
5560 return ret;
5561 if (page->mem_cgroup == mc.from) {
5562 ret = MC_TARGET_PAGE;
5563 if (target) {
5564 get_page(page);
5565 target->page = page;
5566 }
5567 }
5568 return ret;
5569}
5570#else
5571static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5572 unsigned long addr, pmd_t pmd, union mc_target *target)
5573{
5574 return MC_TARGET_NONE;
5575}
5576#endif
5577
5578static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5579 unsigned long addr, unsigned long end,
5580 struct mm_walk *walk)
5581{
5582 struct vm_area_struct *vma = walk->vma;
5583 pte_t *pte;
5584 spinlock_t *ptl;
5585
5586 ptl = pmd_trans_huge_lock(pmd, vma);
5587 if (ptl) {
5588
5589
5590
5591
5592
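		/*
		 * get_mctgt_type_thp() only reports MC_TARGET_PAGE for a
		 * huge pmd, so precharge a full PMD's worth of pages for it.
		 */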
5593 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5594 mc.precharge += HPAGE_PMD_NR;
5595 spin_unlock(ptl);
5596 return 0;
5597 }
5598
5599 if (pmd_trans_unstable(pmd))
5600 return 0;
5601 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5602 for (; addr != end; pte++, addr += PAGE_SIZE)
5603 if (get_mctgt_type(vma, addr, *pte, NULL))
5604 mc.precharge++;
5605 pte_unmap_unlock(pte - 1, ptl);
5606 cond_resched();
5607
5608 return 0;
5609}
5610
5611static const struct mm_walk_ops precharge_walk_ops = {
5612 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5613};
5614
5615static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5616{
5617 unsigned long precharge;
5618
5619 mmap_read_lock(mm);
5620 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5621 mmap_read_unlock(mm);
5622
5623 precharge = mc.precharge;
5624 mc.precharge = 0;
5625
5626 return precharge;
5627}
5628
5629static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5630{
5631 unsigned long precharge = mem_cgroup_count_precharge(mm);
5632
5633 VM_BUG_ON(mc.moving_task);
5634 mc.moving_task = current;
5635 return mem_cgroup_do_precharge(precharge);
5636}
5637
5638
5639static void __mem_cgroup_clear_mc(void)
5640{
5641 struct mem_cgroup *from = mc.from;
5642 struct mem_cgroup *to = mc.to;
5643
5644
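	/* We must uncharge all the leftover precharges from mc.to. */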
5645 if (mc.precharge) {
5646 cancel_charge(mc.to, mc.precharge);
5647 mc.precharge = 0;
5648 }
5649
5650
5651
5652
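	/*
	 * We didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * the moved charges have to be uncharged here.
	 */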
5653 if (mc.moved_charge) {
5654 cancel_charge(mc.from, mc.moved_charge);
5655 mc.moved_charge = 0;
5656 }
5657
5658 if (mc.moved_swap) {
5659
5660 if (!mem_cgroup_is_root(mc.from))
5661 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5662
5663 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5664
5665
5666
5667
5668
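		/*
		 * Both to->memory and to->memsw were charged for the moved
		 * swap entries, so drop the duplicate memory charge here.
		 */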
5669 if (!mem_cgroup_is_root(mc.to))
5670 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5671
5672 css_put_many(&mc.to->css, mc.moved_swap);
5673
5674 mc.moved_swap = 0;
5675 }
5676 memcg_oom_recover(from);
5677 memcg_oom_recover(to);
5678 wake_up_all(&mc.waitq);
5679}
5680
5681static void mem_cgroup_clear_mc(void)
5682{
5683 struct mm_struct *mm = mc.mm;
5684
5685
5686
5687
5688
5689 mc.moving_task = NULL;
5690 __mem_cgroup_clear_mc();
5691 spin_lock(&mc.lock);
5692 mc.from = NULL;
5693 mc.to = NULL;
5694 mc.mm = NULL;
5695 spin_unlock(&mc.lock);
5696
5697 mmput(mm);
5698}
5699
5700static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5701{
5702 struct cgroup_subsys_state *css;
5703 struct mem_cgroup *memcg = NULL;
5704 struct mem_cgroup *from;
5705 struct task_struct *leader, *p;
5706 struct mm_struct *mm;
5707 unsigned long move_flags;
5708 int ret = 0;
5709
5710
5711 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5712 return 0;
5713
5714
5715
5716
5717
5718
5719
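	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and warn if there are
	 * multiple.
	 */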
5720 p = NULL;
5721 cgroup_taskset_for_each_leader(leader, css, tset) {
5722 WARN_ON_ONCE(p);
5723 p = leader;
5724 memcg = mem_cgroup_from_css(css);
5725 }
5726 if (!p)
5727 return 0;
5728
5729
5730
5731
5732
5733
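	/*
	 * We are now committed to this value whatever it is.  Changes in
	 * this tunable will only affect upcoming migrations, not the
	 * current one, so save it and keep using it.
	 */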
5734 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5735 if (!move_flags)
5736 return 0;
5737
5738 from = mem_cgroup_from_task(p);
5739
5740 VM_BUG_ON(from == memcg);
5741
5742 mm = get_task_mm(p);
5743 if (!mm)
5744 return 0;
5745
5746 if (mm->owner == p) {
5747 VM_BUG_ON(mc.from);
5748 VM_BUG_ON(mc.to);
5749 VM_BUG_ON(mc.precharge);
5750 VM_BUG_ON(mc.moved_charge);
5751 VM_BUG_ON(mc.moved_swap);
5752
5753 spin_lock(&mc.lock);
5754 mc.mm = mm;
5755 mc.from = from;
5756 mc.to = memcg;
5757 mc.flags = move_flags;
5758 spin_unlock(&mc.lock);
5759
5760
5761 ret = mem_cgroup_precharge_mc(mm);
5762 if (ret)
5763 mem_cgroup_clear_mc();
5764 } else {
5765 mmput(mm);
5766 }
5767 return ret;
5768}
5769
5770static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5771{
5772 if (mc.to)
5773 mem_cgroup_clear_mc();
5774}
5775
5776static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5777 unsigned long addr, unsigned long end,
5778 struct mm_walk *walk)
5779{
5780 int ret = 0;
5781 struct vm_area_struct *vma = walk->vma;
5782 pte_t *pte;
5783 spinlock_t *ptl;
5784 enum mc_target_type target_type;
5785 union mc_target target;
5786 struct page *page;
5787
5788 ptl = pmd_trans_huge_lock(pmd, vma);
5789 if (ptl) {
5790 if (mc.precharge < HPAGE_PMD_NR) {
5791 spin_unlock(ptl);
5792 return 0;
5793 }
5794 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5795 if (target_type == MC_TARGET_PAGE) {
5796 page = target.page;
5797 if (!isolate_lru_page(page)) {
5798 if (!mem_cgroup_move_account(page, true,
5799 mc.from, mc.to)) {
5800 mc.precharge -= HPAGE_PMD_NR;
5801 mc.moved_charge += HPAGE_PMD_NR;
5802 }
5803 putback_lru_page(page);
5804 }
5805 put_page(page);
5806 } else if (target_type == MC_TARGET_DEVICE) {
5807 page = target.page;
5808 if (!mem_cgroup_move_account(page, true,
5809 mc.from, mc.to)) {
5810 mc.precharge -= HPAGE_PMD_NR;
5811 mc.moved_charge += HPAGE_PMD_NR;
5812 }
5813 put_page(page);
5814 }
5815 spin_unlock(ptl);
5816 return 0;
5817 }
5818
5819 if (pmd_trans_unstable(pmd))
5820 return 0;
5821retry:
5822 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5823 for (; addr != end; addr += PAGE_SIZE) {
5824 pte_t ptent = *(pte++);
5825 bool device = false;
5826 swp_entry_t ent;
5827
5828 if (!mc.precharge)
5829 break;
5830
5831 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5832 case MC_TARGET_DEVICE:
5833 device = true;
5834 fallthrough;
5835 case MC_TARGET_PAGE:
5836 page = target.page;
5837
5838
5839
5840
5841
5842
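			/*
			 * We can have a part of a split pmd here.  Moving it
			 * can be done but it would be too convoluted, so
			 * simply ignore such a partial THP and keep it in the
			 * original memcg.  There should be somebody mapping
			 * the head.
			 */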
5843 if (PageTransCompound(page))
5844 goto put;
5845 if (!device && isolate_lru_page(page))
5846 goto put;
5847 if (!mem_cgroup_move_account(page, false,
5848 mc.from, mc.to)) {
5849 mc.precharge--;
5850
5851 mc.moved_charge++;
5852 }
5853 if (!device)
5854 putback_lru_page(page);
5855put:
5856 put_page(page);
5857 break;
5858 case MC_TARGET_SWAP:
5859 ent = target.ent;
5860 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5861 mc.precharge--;
5862 mem_cgroup_id_get_many(mc.to, 1);
5863
5864 mc.moved_swap++;
5865 }
5866 break;
5867 default:
5868 break;
5869 }
5870 }
5871 pte_unmap_unlock(pte - 1, ptl);
5872 cond_resched();
5873
5874 if (addr != end) {
5875
5876
5877
5878
5879
5880
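		/*
		 * We have consumed all precharges we got in can_attach().
		 * Try charging one by one, but don't do any additional
		 * charges to mc.to if we already failed a charge during the
		 * attach phase.
		 */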
5881 ret = mem_cgroup_do_precharge(1);
5882 if (!ret)
5883 goto retry;
5884 }
5885
5886 return ret;
5887}
5888
5889static const struct mm_walk_ops charge_walk_ops = {
5890 .pmd_entry = mem_cgroup_move_charge_pte_range,
5891};
5892
5893static void mem_cgroup_move_charge(void)
5894{
5895 lru_add_drain_all();
5896
5897
5898
5899
5900
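	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg.  Then wait
	 * for already started RCU-only updates to finish.
	 */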
5901 atomic_inc(&mc.from->moving_account);
5902 synchronize_rcu();
5903retry:
5904 if (unlikely(!mmap_read_trylock(mc.mm))) {
5905
5906
5907
5908
5909
5910
5911
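		/*
		 * Someone holding the mmap_lock may be waiting on our waitq.
		 * Cancel all extra charges, wake up all waiters, and retry.
		 * Because precharges are cancelled we might not be able to
		 * move all charges, but charge moving is a best-effort
		 * feature anyway, so this isn't a big problem.
		 */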
5912 __mem_cgroup_clear_mc();
5913 cond_resched();
5914 goto retry;
5915 }
5916
5917
5918
5919
5920 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
5921 NULL);
5922
5923 mmap_read_unlock(mc.mm);
5924 atomic_dec(&mc.from->moving_account);
5925}
5926
5927static void mem_cgroup_move_task(void)
5928{
5929 if (mc.to) {
5930 mem_cgroup_move_charge();
5931 mem_cgroup_clear_mc();
5932 }
5933}
5934#else
5935static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5936{
5937 return 0;
5938}
5939static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5940{
5941}
5942static void mem_cgroup_move_task(void)
5943{
5944}
5945#endif
5946
/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify sanity of a few settings.
 */
5952static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5953{
5954
5955
5956
5957
5958
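	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */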
5959 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5960 root_mem_cgroup->use_hierarchy = true;
5961 else
5962 root_mem_cgroup->use_hierarchy = false;
5963}
5964
5965static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
5966{
5967 if (value == PAGE_COUNTER_MAX)
5968 seq_puts(m, "max\n");
5969 else
5970 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
5971
5972 return 0;
5973}
5974
5975static u64 memory_current_read(struct cgroup_subsys_state *css,
5976 struct cftype *cft)
5977{
5978 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5979
5980 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5981}
5982
5983static int memory_min_show(struct seq_file *m, void *v)
5984{
5985 return seq_puts_memcg_tunable(m,
5986 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
5987}
5988
5989static ssize_t memory_min_write(struct kernfs_open_file *of,
5990 char *buf, size_t nbytes, loff_t off)
5991{
5992 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5993 unsigned long min;
5994 int err;
5995
5996 buf = strstrip(buf);
5997 err = page_counter_memparse(buf, "max", &min);
5998 if (err)
5999 return err;
6000
6001 page_counter_set_min(&memcg->memory, min);
6002
6003 return nbytes;
6004}
6005
6006static int memory_low_show(struct seq_file *m, void *v)
6007{
6008 return seq_puts_memcg_tunable(m,
6009 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6010}
6011
6012static ssize_t memory_low_write(struct kernfs_open_file *of,
6013 char *buf, size_t nbytes, loff_t off)
6014{
6015 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6016 unsigned long low;
6017 int err;
6018
6019 buf = strstrip(buf);
6020 err = page_counter_memparse(buf, "max", &low);
6021 if (err)
6022 return err;
6023
6024 page_counter_set_low(&memcg->memory, low);
6025
6026 return nbytes;
6027}
6028
6029static int memory_high_show(struct seq_file *m, void *v)
6030{
6031 return seq_puts_memcg_tunable(m,
6032 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6033}
6034
6035static ssize_t memory_high_write(struct kernfs_open_file *of,
6036 char *buf, size_t nbytes, loff_t off)
6037{
6038 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6039 unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
6040 bool drained = false;
6041 unsigned long high;
6042 int err;
6043
6044 buf = strstrip(buf);
6045 err = page_counter_memparse(buf, "max", &high);
6046 if (err)
6047 return err;
6048
6049 page_counter_set_high(&memcg->memory, high);
6050
6051 for (;;) {
6052 unsigned long nr_pages = page_counter_read(&memcg->memory);
6053 unsigned long reclaimed;
6054
6055 if (nr_pages <= high)
6056 break;
6057
6058 if (signal_pending(current))
6059 break;
6060
6061 if (!drained) {
6062 drain_all_stock(memcg);
6063 drained = true;
6064 continue;
6065 }
6066
6067 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6068 GFP_KERNEL, true);
6069
6070 if (!reclaimed && !nr_retries--)
6071 break;
6072 }
6073
6074 return nbytes;
6075}
6076
6077static int memory_max_show(struct seq_file *m, void *v)
6078{
6079 return seq_puts_memcg_tunable(m,
6080 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6081}
6082
6083static ssize_t memory_max_write(struct kernfs_open_file *of,
6084 char *buf, size_t nbytes, loff_t off)
6085{
6086 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6087 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6088 bool drained = false;
6089 unsigned long max;
6090 int err;
6091
6092 buf = strstrip(buf);
6093 err = page_counter_memparse(buf, "max", &max);
6094 if (err)
6095 return err;
6096
6097 xchg(&memcg->memory.max, max);
6098
6099 for (;;) {
6100 unsigned long nr_pages = page_counter_read(&memcg->memory);
6101
6102 if (nr_pages <= max)
6103 break;
6104
6105 if (signal_pending(current))
6106 break;
6107
6108 if (!drained) {
6109 drain_all_stock(memcg);
6110 drained = true;
6111 continue;
6112 }
6113
6114 if (nr_reclaims) {
6115 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6116 GFP_KERNEL, true))
6117 nr_reclaims--;
6118 continue;
6119 }
6120
6121 memcg_memory_event(memcg, MEMCG_OOM);
6122 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6123 break;
6124 }
6125
6126 memcg_wb_domain_size_changed(memcg);
6127 return nbytes;
6128}
6129
6130static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6131{
6132 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6133 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6134 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6135 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6136 seq_printf(m, "oom_kill %lu\n",
6137 atomic_long_read(&events[MEMCG_OOM_KILL]));
6138}
6139
6140static int memory_events_show(struct seq_file *m, void *v)
6141{
6142 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6143
6144 __memory_events_show(m, memcg->memory_events);
6145 return 0;
6146}
6147
6148static int memory_events_local_show(struct seq_file *m, void *v)
6149{
6150 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6151
6152 __memory_events_show(m, memcg->memory_events_local);
6153 return 0;
6154}
6155
6156static int memory_stat_show(struct seq_file *m, void *v)
6157{
6158 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6159 char *buf;
6160
6161 buf = memory_stat_format(memcg);
6162 if (!buf)
6163 return -ENOMEM;
6164 seq_puts(m, buf);
6165 kfree(buf);
6166 return 0;
6167}
6168
6169static int memory_oom_group_show(struct seq_file *m, void *v)
6170{
6171 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6172
6173 seq_printf(m, "%d\n", memcg->oom_group);
6174
6175 return 0;
6176}
6177
6178static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6179 char *buf, size_t nbytes, loff_t off)
6180{
6181 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6182 int ret, oom_group;
6183
6184 buf = strstrip(buf);
6185 if (!buf)
6186 return -EINVAL;
6187
6188 ret = kstrtoint(buf, 0, &oom_group);
6189 if (ret)
6190 return ret;
6191
6192 if (oom_group != 0 && oom_group != 1)
6193 return -EINVAL;
6194
6195 memcg->oom_group = oom_group;
6196
6197 return nbytes;
6198}
6199
6200static struct cftype memory_files[] = {
6201 {
6202 .name = "current",
6203 .flags = CFTYPE_NOT_ON_ROOT,
6204 .read_u64 = memory_current_read,
6205 },
6206 {
6207 .name = "min",
6208 .flags = CFTYPE_NOT_ON_ROOT,
6209 .seq_show = memory_min_show,
6210 .write = memory_min_write,
6211 },
6212 {
6213 .name = "low",
6214 .flags = CFTYPE_NOT_ON_ROOT,
6215 .seq_show = memory_low_show,
6216 .write = memory_low_write,
6217 },
6218 {
6219 .name = "high",
6220 .flags = CFTYPE_NOT_ON_ROOT,
6221 .seq_show = memory_high_show,
6222 .write = memory_high_write,
6223 },
6224 {
6225 .name = "max",
6226 .flags = CFTYPE_NOT_ON_ROOT,
6227 .seq_show = memory_max_show,
6228 .write = memory_max_write,
6229 },
6230 {
6231 .name = "events",
6232 .flags = CFTYPE_NOT_ON_ROOT,
6233 .file_offset = offsetof(struct mem_cgroup, events_file),
6234 .seq_show = memory_events_show,
6235 },
6236 {
6237 .name = "events.local",
6238 .flags = CFTYPE_NOT_ON_ROOT,
6239 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6240 .seq_show = memory_events_local_show,
6241 },
6242 {
6243 .name = "stat",
6244 .seq_show = memory_stat_show,
6245 },
6246 {
6247 .name = "oom.group",
6248 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6249 .seq_show = memory_oom_group_show,
6250 .write = memory_oom_group_write,
6251 },
6252 { }
6253};
6254
6255struct cgroup_subsys memory_cgrp_subsys = {
6256 .css_alloc = mem_cgroup_css_alloc,
6257 .css_online = mem_cgroup_css_online,
6258 .css_offline = mem_cgroup_css_offline,
6259 .css_released = mem_cgroup_css_released,
6260 .css_free = mem_cgroup_css_free,
6261 .css_reset = mem_cgroup_css_reset,
6262 .can_attach = mem_cgroup_can_attach,
6263 .cancel_attach = mem_cgroup_cancel_attach,
6264 .post_attach = mem_cgroup_move_task,
6265 .bind = mem_cgroup_bind,
6266 .dfl_cftypes = memory_files,
6267 .legacy_cftypes = mem_cgroup_legacy_files,
6268 .early_init = 0,
6269};
6270
/*
 * Compute a cgroup's effective memory protection from its own setting
 * (memory.min or memory.low), its usage, and what its parent can pass
 * down to its children:
 *
 * 1. A cgroup can claim at most min(usage, setting) of protection.
 *
 * 2. If the children of a parent collectively claim more protection than
 *    the parent has effectively available, the parent's effective
 *    protection is distributed among them in proportion to their claims.
 *
 * 3. With the "memory_recursive_prot" mount option, protection the parent
 *    has available but which is not explicitly claimed by its children is
 *    additionally passed down in proportion to each child's share of the
 *    parent's otherwise unprotected usage.
 */
6314static unsigned long effective_protection(unsigned long usage,
6315 unsigned long parent_usage,
6316 unsigned long setting,
6317 unsigned long parent_effective,
6318 unsigned long siblings_protected)
6319{
6320 unsigned long protected;
6321 unsigned long ep;
6322
6323 protected = min(usage, setting);
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334 if (siblings_protected > parent_effective)
6335 return protected * parent_effective / siblings_protected;
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352 ep = protected;
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6371 return ep;
6372 if (parent_effective > siblings_protected &&
6373 parent_usage > siblings_protected &&
6374 usage > protected) {
6375 unsigned long unclaimed;
6376
6377 unclaimed = parent_effective - siblings_protected;
6378 unclaimed *= usage - protected;
6379 unclaimed /= parent_usage - siblings_protected;
6380
6381 ep += unclaimed;
6382 }
6383
6384 return ep;
6385}
6386
/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless!  It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_MIN: cgroup memory is protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is an
 *     unprotected supply of reclaimable memory from other cgroups
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 */
6401enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
6402 struct mem_cgroup *memcg)
6403{
6404 unsigned long usage, parent_usage;
6405 struct mem_cgroup *parent;
6406
6407 if (mem_cgroup_disabled())
6408 return MEMCG_PROT_NONE;
6409
6410 if (!root)
6411 root = root_mem_cgroup;
6412 if (memcg == root)
6413 return MEMCG_PROT_NONE;
6414
6415 usage = page_counter_read(&memcg->memory);
6416 if (!usage)
6417 return MEMCG_PROT_NONE;
6418
6419 parent = parent_mem_cgroup(memcg);
6420
6421 if (!parent)
6422 return MEMCG_PROT_NONE;
6423
6424 if (parent == root) {
6425 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6426 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6427 goto out;
6428 }
6429
6430 parent_usage = page_counter_read(&parent->memory);
6431
6432 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6433 READ_ONCE(memcg->memory.min),
6434 READ_ONCE(parent->memory.emin),
6435 atomic_long_read(&parent->memory.children_min_usage)));
6436
6437 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6438 READ_ONCE(memcg->memory.low),
6439 READ_ONCE(parent->memory.elow),
6440 atomic_long_read(&parent->memory.children_low_usage)));
6441
6442out:
6443 if (usage <= memcg->memory.emin)
6444 return MEMCG_PROT_MIN;
6445 else if (usage <= memcg->memory.elow)
6446 return MEMCG_PROT_LOW;
6447 else
6448 return MEMCG_PROT_NONE;
6449}
6450
/**
 * mem_cgroup_charge - charge a newly allocated page to a cgroup
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success.  Otherwise, an error code is returned.
 */
6462int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
6463{
6464 unsigned int nr_pages = hpage_nr_pages(page);
6465 struct mem_cgroup *memcg = NULL;
6466 int ret = 0;
6467
6468 if (mem_cgroup_disabled())
6469 goto out;
6470
6471 if (PageSwapCache(page)) {
6472 swp_entry_t ent = { .val = page_private(page), };
6473 unsigned short id;
6474
6475
6476
6477
6478
6479
6480
6481
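		/*
		 * Every swap fault against a single page tries to charge the
		 * page, so bail as early as possible.  shmem_unuse()
		 * encounters already charged pages, too.  page->mem_cgroup
		 * is protected by the page lock, which serializes swap cache
		 * removal, which in turn serializes uncharging.
		 */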
6482 VM_BUG_ON_PAGE(!PageLocked(page), page);
6483 if (compound_head(page)->mem_cgroup)
6484 goto out;
6485
6486 id = lookup_swap_cgroup_id(ent);
6487 rcu_read_lock();
6488 memcg = mem_cgroup_from_id(id);
6489 if (memcg && !css_tryget_online(&memcg->css))
6490 memcg = NULL;
6491 rcu_read_unlock();
6492 }
6493
6494 if (!memcg)
6495 memcg = get_mem_cgroup_from_mm(mm);
6496
6497 ret = try_charge(memcg, gfp_mask, nr_pages);
6498 if (ret)
6499 goto out_put;
6500
6501 commit_charge(page, memcg);
6502
6503 local_irq_disable();
6504 mem_cgroup_charge_statistics(memcg, page, nr_pages);
6505 memcg_check_events(memcg, page);
6506 local_irq_enable();
6507
6508 if (PageSwapCache(page)) {
6509 swp_entry_t entry = { .val = page_private(page) };
6510
6511
6512
6513
6514
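		/*
		 * The swap entry might not get freed for a long time, so
		 * don't wait for it.  The page already received a
		 * memory+swap charge; drop the swap entry duplicate.
		 */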
6515 mem_cgroup_uncharge_swap(entry, nr_pages);
6516 }
6517
6518out_put:
6519 css_put(&memcg->css);
6520out:
6521 return ret;
6522}
6523
6524struct uncharge_gather {
6525 struct mem_cgroup *memcg;
6526 unsigned long nr_pages;
6527 unsigned long pgpgout;
6528 unsigned long nr_kmem;
6529 struct page *dummy_page;
6530};
6531
6532static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6533{
6534 memset(ug, 0, sizeof(*ug));
6535}
6536
6537static void uncharge_batch(const struct uncharge_gather *ug)
6538{
6539 unsigned long flags;
6540
6541 if (!mem_cgroup_is_root(ug->memcg)) {
6542 page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
6543 if (do_memsw_account())
6544 page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
6545 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6546 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6547 memcg_oom_recover(ug->memcg);
6548 }
6549
6550 local_irq_save(flags);
6551 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6552 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
6553 memcg_check_events(ug->memcg, ug->dummy_page);
6554 local_irq_restore(flags);
6555
6556 if (!mem_cgroup_is_root(ug->memcg))
6557 css_put_many(&ug->memcg->css, ug->nr_pages);
6558}
6559
6560static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6561{
6562 unsigned long nr_pages;
6563
6564 VM_BUG_ON_PAGE(PageLRU(page), page);
6565
6566 if (!page->mem_cgroup)
6567 return;
6568
6569
6570
6571
6572
6573
6574
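	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point; we have fully exclusive
	 * access to the page.
	 */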
6575 if (ug->memcg != page->mem_cgroup) {
6576 if (ug->memcg) {
6577 uncharge_batch(ug);
6578 uncharge_gather_clear(ug);
6579 }
6580 ug->memcg = page->mem_cgroup;
6581 }
6582
6583 nr_pages = compound_nr(page);
6584 ug->nr_pages += nr_pages;
6585
6586 if (!PageKmemcg(page)) {
6587 ug->pgpgout++;
6588 } else {
6589 ug->nr_kmem += nr_pages;
6590 __ClearPageKmemcg(page);
6591 }
6592
6593 ug->dummy_page = page;
6594 page->mem_cgroup = NULL;
6595}
6596
6597static void uncharge_list(struct list_head *page_list)
6598{
6599 struct uncharge_gather ug;
6600 struct list_head *next;
6601
6602 uncharge_gather_clear(&ug);
6603
6604
6605
6606
6607
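	/*
	 * Note that the list can be a single page->lru; hence the
	 * do-while loop instead of a simple list_for_each_entry().
	 */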
6608 next = page_list->next;
6609 do {
6610 struct page *page;
6611
6612 page = list_entry(next, struct page, lru);
6613 next = page->lru.next;
6614
6615 uncharge_page(page, &ug);
6616 } while (next != page_list);
6617
6618 if (ug.memcg)
6619 uncharge_batch(&ug);
6620}
6621
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_charge().
 */
6628void mem_cgroup_uncharge(struct page *page)
6629{
6630 struct uncharge_gather ug;
6631
6632 if (mem_cgroup_disabled())
6633 return;
6634
6635
6636 if (!page->mem_cgroup)
6637 return;
6638
6639 uncharge_gather_clear(&ug);
6640 uncharge_page(page, &ug);
6641 uncharge_batch(&ug);
6642}
6643
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_charge().
 */
6651void mem_cgroup_uncharge_list(struct list_head *page_list)
6652{
6653 if (mem_cgroup_disabled())
6654 return;
6655
6656 if (!list_empty(page_list))
6657 uncharge_list(page_list);
6658}
6659
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage.  @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6670void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6671{
6672 struct mem_cgroup *memcg;
6673 unsigned int nr_pages;
6674 unsigned long flags;
6675
6676 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6677 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6678 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6679 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6680 newpage);
6681
6682 if (mem_cgroup_disabled())
6683 return;
6684
6685
6686 if (newpage->mem_cgroup)
6687 return;
6688
6689
6690 memcg = oldpage->mem_cgroup;
6691 if (!memcg)
6692 return;
6693
6694
6695 nr_pages = hpage_nr_pages(newpage);
6696
6697 page_counter_charge(&memcg->memory, nr_pages);
6698 if (do_memsw_account())
6699 page_counter_charge(&memcg->memsw, nr_pages);
6700 css_get_many(&memcg->css, nr_pages);
6701
6702 commit_charge(newpage, memcg);
6703
6704 local_irq_save(flags);
6705 mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
6706 memcg_check_events(memcg, newpage);
6707 local_irq_restore(flags);
6708}
6709
6710DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6711EXPORT_SYMBOL(memcg_sockets_enabled_key);
6712
6713void mem_cgroup_sk_alloc(struct sock *sk)
6714{
6715 struct mem_cgroup *memcg;
6716
6717 if (!mem_cgroup_sockets_enabled)
6718 return;
6719
6720
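	/* Do not associate the sock with an unrelated interrupted task's memcg. */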
6721 if (in_interrupt())
6722 return;
6723
6724 rcu_read_lock();
6725 memcg = mem_cgroup_from_task(current);
6726 if (memcg == root_mem_cgroup)
6727 goto out;
6728 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6729 goto out;
6730 if (css_tryget(&memcg->css))
6731 sk->sk_memcg = memcg;
6732out:
6733 rcu_read_unlock();
6734}
6735
6736void mem_cgroup_sk_free(struct sock *sk)
6737{
6738 if (sk->sk_memcg)
6739 css_put(&sk->sk_memcg->css);
6740}
6741
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg.  Returns %true if the charge fit within
 * @memcg's configured limit, %false if the charge had to be forced.
 */
6750bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6751{
6752 gfp_t gfp_mask = GFP_KERNEL;
6753
6754 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6755 struct page_counter *fail;
6756
6757 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6758 memcg->tcpmem_pressure = 0;
6759 return true;
6760 }
6761 page_counter_charge(&memcg->tcpmem, nr_pages);
6762 memcg->tcpmem_pressure = 1;
6763 return false;
6764 }
6765
6766
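	/* Don't block in the packet receive path. */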
6767 if (in_softirq())
6768 gfp_mask = GFP_NOWAIT;
6769
6770 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6771
6772 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6773 return true;
6774
6775 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6776 return false;
6777}
6778
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
6784void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6785{
6786 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6787 page_counter_uncharge(&memcg->tcpmem, nr_pages);
6788 return;
6789 }
6790
6791 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6792
6793 refill_stock(memcg, nr_pages);
6794}
6795
6796static int __init cgroup_memory(char *s)
6797{
6798 char *token;
6799
6800 while ((token = strsep(&s, ",")) != NULL) {
6801 if (!*token)
6802 continue;
6803 if (!strcmp(token, "nosocket"))
6804 cgroup_memory_nosocket = true;
6805 if (!strcmp(token, "nokmem"))
6806 cgroup_memory_nokmem = true;
6807 }
6808 return 0;
6809}
6810__setup("cgroup.memory=", cgroup_memory);
6811
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything else may not be ready until cgroup_init().
 */
6820static int __init mem_cgroup_init(void)
6821{
6822 int cpu, node;
6823
6824#ifdef CONFIG_MEMCG_KMEM
6825
6826
6827
6828
6829
6830
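	/*
	 * Kmem cache creation is mostly done with the slab_mutex held, so
	 * use a workqueue with limited concurrency to avoid stalling all
	 * worker threads in case lots of cgroups are created and destroyed
	 * simultaneously.
	 */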
6831 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6832 BUG_ON(!memcg_kmem_cache_wq);
6833#endif
6834
6835 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6836 memcg_hotplug_cpu_dead);
6837
6838 for_each_possible_cpu(cpu)
6839 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6840 drain_local_stock);
6841
6842 for_each_node(node) {
6843 struct mem_cgroup_tree_per_node *rtpn;
6844
6845 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6846 node_online(node) ? node : NUMA_NO_NODE);
6847
6848 rtpn->rb_root = RB_ROOT;
6849 rtpn->rb_rightmost = NULL;
6850 spin_lock_init(&rtpn->lock);
6851 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6852 }
6853
6854 return 0;
6855}
6856subsys_initcall(mem_cgroup_init);
6857
6858#ifdef CONFIG_MEMCG_SWAP
6859static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6860{
6861 while (!refcount_inc_not_zero(&memcg->id.ref)) {
6862
6863
6864
6865
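		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */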
6866 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
6867 VM_BUG_ON(1);
6868 break;
6869 }
6870 memcg = parent_mem_cgroup(memcg);
6871 if (!memcg)
6872 memcg = root_mem_cgroup;
6873 }
6874 return memcg;
6875}
6876
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
6884void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6885{
6886 struct mem_cgroup *memcg, *swap_memcg;
6887 unsigned int nr_entries;
6888 unsigned short oldid;
6889
6890 VM_BUG_ON_PAGE(PageLRU(page), page);
6891 VM_BUG_ON_PAGE(page_count(page), page);
6892
6893 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6894 return;
6895
6896 memcg = page->mem_cgroup;
6897
6898
6899 if (!memcg)
6900 return;
6901
6902
6903
6904
6905
6906
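	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */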
6907 swap_memcg = mem_cgroup_id_get_online(memcg);
6908 nr_entries = hpage_nr_pages(page);
6909
6910 if (nr_entries > 1)
6911 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
6912 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
6913 nr_entries);
6914 VM_BUG_ON_PAGE(oldid, page);
6915 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
6916
6917 page->mem_cgroup = NULL;
6918
6919 if (!mem_cgroup_is_root(memcg))
6920 page_counter_uncharge(&memcg->memory, nr_entries);
6921
6922 if (!cgroup_memory_noswap && memcg != swap_memcg) {
6923 if (!mem_cgroup_is_root(swap_memcg))
6924 page_counter_charge(&swap_memcg->memsw, nr_entries);
6925 page_counter_uncharge(&memcg->memsw, nr_entries);
6926 }
6927
6928
6929
6930
6931
6932
6933
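	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock, which is taken with interrupts off.  That is also
	 * the only synchronisation we have for updating the per-CPU
	 * variables below.
	 */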
6934 VM_BUG_ON(!irqs_disabled());
6935 mem_cgroup_charge_statistics(memcg, page, -nr_entries);
6936 memcg_check_events(memcg, page);
6937
6938 if (!mem_cgroup_is_root(memcg))
6939 css_put_many(&memcg->css, nr_entries);
6940}
6941
/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
6951int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
6952{
6953 unsigned int nr_pages = hpage_nr_pages(page);
6954 struct page_counter *counter;
6955 struct mem_cgroup *memcg;
6956 unsigned short oldid;
6957
6958 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
6959 return 0;
6960
6961 memcg = page->mem_cgroup;
6962
6963
6964 if (!memcg)
6965 return 0;
6966
6967 if (!entry.val) {
6968 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6969 return 0;
6970 }
6971
6972 memcg = mem_cgroup_id_get_online(memcg);
6973
6974 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
6975 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
6976 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
6977 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6978 mem_cgroup_id_put(memcg);
6979 return -ENOMEM;
6980 }
6981
6982
6983 if (nr_pages > 1)
6984 mem_cgroup_id_get_many(memcg, nr_pages - 1);
6985 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
6986 VM_BUG_ON_PAGE(oldid, page);
6987 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
6988
6989 return 0;
6990}
6991
/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
6997void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
6998{
6999 struct mem_cgroup *memcg;
7000 unsigned short id;
7001
7002 id = swap_cgroup_record(entry, 0, nr_pages);
7003 rcu_read_lock();
7004 memcg = mem_cgroup_from_id(id);
7005 if (memcg) {
7006 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7007 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7008 page_counter_uncharge(&memcg->swap, nr_pages);
7009 else
7010 page_counter_uncharge(&memcg->memsw, nr_pages);
7011 }
7012 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7013 mem_cgroup_id_put_many(memcg, nr_pages);
7014 }
7015 rcu_read_unlock();
7016}
7017
7018long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7019{
7020 long nr_swap_pages = get_nr_swap_pages();
7021
7022 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7023 return nr_swap_pages;
7024 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7025 nr_swap_pages = min_t(long, nr_swap_pages,
7026 READ_ONCE(memcg->swap.max) -
7027 page_counter_read(&memcg->swap));
7028 return nr_swap_pages;
7029}
7030
7031bool mem_cgroup_swap_full(struct page *page)
7032{
7033 struct mem_cgroup *memcg;
7034
7035 VM_BUG_ON_PAGE(!PageLocked(page), page);
7036
7037 if (vm_swap_full())
7038 return true;
7039 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7040 return false;
7041
7042 memcg = page->mem_cgroup;
7043 if (!memcg)
7044 return false;
7045
7046 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7047 unsigned long usage = page_counter_read(&memcg->swap);
7048
7049 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7050 usage * 2 >= READ_ONCE(memcg->swap.max))
7051 return true;
7052 }
7053
7054 return false;
7055}
7056
7057static int __init setup_swap_account(char *s)
7058{
7059 if (!strcmp(s, "1"))
7060 cgroup_memory_noswap = 0;
7061 else if (!strcmp(s, "0"))
7062 cgroup_memory_noswap = 1;
7063 return 1;
7064}
7065__setup("swapaccount=", setup_swap_account);
7066
7067static u64 swap_current_read(struct cgroup_subsys_state *css,
7068 struct cftype *cft)
7069{
7070 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7071
7072 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7073}
7074
7075static int swap_high_show(struct seq_file *m, void *v)
7076{
7077 return seq_puts_memcg_tunable(m,
7078 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7079}
7080
7081static ssize_t swap_high_write(struct kernfs_open_file *of,
7082 char *buf, size_t nbytes, loff_t off)
7083{
7084 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7085 unsigned long high;
7086 int err;
7087
7088 buf = strstrip(buf);
7089 err = page_counter_memparse(buf, "max", &high);
7090 if (err)
7091 return err;
7092
7093 page_counter_set_high(&memcg->swap, high);
7094
7095 return nbytes;
7096}
7097
7098static int swap_max_show(struct seq_file *m, void *v)
7099{
7100 return seq_puts_memcg_tunable(m,
7101 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7102}
7103
7104static ssize_t swap_max_write(struct kernfs_open_file *of,
7105 char *buf, size_t nbytes, loff_t off)
7106{
7107 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7108 unsigned long max;
7109 int err;
7110
7111 buf = strstrip(buf);
7112 err = page_counter_memparse(buf, "max", &max);
7113 if (err)
7114 return err;
7115
7116 xchg(&memcg->swap.max, max);
7117
7118 return nbytes;
7119}
7120
7121static int swap_events_show(struct seq_file *m, void *v)
7122{
7123 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7124
7125 seq_printf(m, "high %lu\n",
7126 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7127 seq_printf(m, "max %lu\n",
7128 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7129 seq_printf(m, "fail %lu\n",
7130 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7131
7132 return 0;
7133}
7134
7135static struct cftype swap_files[] = {
7136 {
7137 .name = "swap.current",
7138 .flags = CFTYPE_NOT_ON_ROOT,
7139 .read_u64 = swap_current_read,
7140 },
7141 {
7142 .name = "swap.high",
7143 .flags = CFTYPE_NOT_ON_ROOT,
7144 .seq_show = swap_high_show,
7145 .write = swap_high_write,
7146 },
7147 {
7148 .name = "swap.max",
7149 .flags = CFTYPE_NOT_ON_ROOT,
7150 .seq_show = swap_max_show,
7151 .write = swap_max_write,
7152 },
7153 {
7154 .name = "swap.events",
7155 .flags = CFTYPE_NOT_ON_ROOT,
7156 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7157 .seq_show = swap_events_show,
7158 },
7159 { }
7160};
7161
7162static struct cftype memsw_files[] = {
7163 {
7164 .name = "memsw.usage_in_bytes",
7165 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7166 .read_u64 = mem_cgroup_read_u64,
7167 },
7168 {
7169 .name = "memsw.max_usage_in_bytes",
7170 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7171 .write = mem_cgroup_reset,
7172 .read_u64 = mem_cgroup_read_u64,
7173 },
7174 {
7175 .name = "memsw.limit_in_bytes",
7176 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7177 .write = mem_cgroup_write,
7178 .read_u64 = mem_cgroup_read_u64,
7179 },
7180 {
7181 .name = "memsw.failcnt",
7182 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7183 .write = mem_cgroup_reset,
7184 .read_u64 = mem_cgroup_read_u64,
7185 },
7186 { },
7187};
7188
/*
 * Swap accounting setup.  When the memory controller itself is disabled,
 * swap accounting is forced off as well; the v2 swap.* files and the
 * legacy memsw.* files are only registered when swap accounting is
 * enabled.
 */
7196static int __init mem_cgroup_swap_init(void)
7197{
7198
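	/* No memory control -> no swap control. */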
7199 if (mem_cgroup_disabled())
7200 cgroup_memory_noswap = true;
7201
7202 if (cgroup_memory_noswap)
7203 return 0;
7204
7205 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7206 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7207
7208 return 0;
7209}
7210core_initcall(mem_cgroup_swap_init);
7211
7212#endif
7213