/*
 * memcontrol.c - kernel memory controller
 *
 * Charges, limits and accounts the memory, swap, kernel and socket
 * memory used by cgroups ("memcg"), and implements the memory.*
 * interface files for both cgroup v1 and v2.
 */
25#include <linux/page_counter.h>
26#include <linux/memcontrol.h>
27#include <linux/cgroup.h>
28#include <linux/pagewalk.h>
29#include <linux/sched/mm.h>
30#include <linux/shmem_fs.h>
31#include <linux/hugetlb.h>
32#include <linux/pagemap.h>
33#include <linux/vm_event_item.h>
34#include <linux/smp.h>
35#include <linux/page-flags.h>
36#include <linux/backing-dev.h>
37#include <linux/bit_spinlock.h>
38#include <linux/rcupdate.h>
39#include <linux/limits.h>
40#include <linux/export.h>
41#include <linux/mutex.h>
42#include <linux/rbtree.h>
43#include <linux/slab.h>
44#include <linux/swap.h>
45#include <linux/swapops.h>
46#include <linux/spinlock.h>
47#include <linux/eventfd.h>
48#include <linux/poll.h>
49#include <linux/sort.h>
50#include <linux/fs.h>
51#include <linux/seq_file.h>
52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h>
54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h>
56#include <linux/oom.h>
57#include <linux/lockdep.h>
58#include <linux/file.h>
59#include <linux/tracehook.h>
60#include <linux/psi.h>
61#include <linux/seq_buf.h>
62#include "internal.h"
63#include <net/sock.h>
64#include <net/ip.h>
65#include "slab.h"
66
67#include <linux/uaccess.h>
68
69#include <trace/events/vmscan.h>
70
71struct cgroup_subsys memory_cgrp_subsys __read_mostly;
72EXPORT_SYMBOL(memory_cgrp_subsys);
73
74struct mem_cgroup *root_mem_cgroup __read_mostly;
75
#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}
100
101#define THRESHOLDS_EVENTS_TARGET 128
102#define SOFTLIMIT_EVENTS_TARGET 1024
103
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};
114
115struct mem_cgroup_tree {
116 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
117};
118
119static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
126
/*
 * cgroup_event represents events which userspace want to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
166
167static void mem_cgroup_threshold(struct mem_cgroup *memcg);
168static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
169
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
180 spinlock_t lock;
181 struct mm_struct *mm;
182 struct mem_cgroup *from;
183 struct mem_cgroup *to;
184 unsigned long flags;
185 unsigned long precharge;
186 unsigned long moved_charge;
187 unsigned long moved_swap;
188 struct task_struct *moving_task;
189 wait_queue_head_t waitq;
190} mc = {
191 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
192 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
193};
194
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
201
202enum charge_type {
203 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
204 MEM_CGROUP_CHARGE_TYPE_ANON,
205 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
206 MEM_CGROUP_CHARGE_TYPE_DROP,
207 NR_CHARGE_TYPE,
208};
209
/* for encoding cft->private value on file */
enum res_type {
212 _MEM,
213 _MEMSWAP,
214 _OOM_TYPE,
215 _KMEM,
216 _TCP,
217};
218
219#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
220#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
221#define MEMFILE_ATTR(val) ((val) & 0xffff)
222
223#define OOM_CONTROL (0)
224
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
239
240static inline bool should_force_charge(void)
241{
242 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
243 (current->flags & PF_EXITING);
244}
245
246
247struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
248{
249 if (!memcg)
250 memcg = root_mem_cgroup;
251 return &memcg->vmpressure;
252}
253
254struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
255{
256 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
257}
258
259#ifdef CONFIG_MEMCG_KMEM
260
261
262
263
264
265
266
267
268
269
270
271static DEFINE_IDA(memcg_cache_ida);
272int memcg_nr_cache_ids;
273
274
275static DECLARE_RWSEM(memcg_cache_ids_sem);
276
277void memcg_get_cache_ids(void)
278{
279 down_read(&memcg_cache_ids_sem);
280}
281
282void memcg_put_cache_ids(void)
283{
284 up_read(&memcg_cache_ids_sem);
285}
286
287
288
289
290
291
292
293
294
295
296
297
298
299#define MEMCG_CACHES_MIN_SIZE 4
300#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
301
302
303
304
305
306
307
308DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
309EXPORT_SYMBOL(memcg_kmem_enabled_key);
310
311struct workqueue_struct *memcg_kmem_cache_wq;
312#endif
313
314static int memcg_shrinker_map_size;
315static DEFINE_MUTEX(memcg_shrinker_map_mutex);
316
317static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
318{
319 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
320}
321
322static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
323 int size, int old_size)
324{
325 struct memcg_shrinker_map *new, *old;
326 int nid;
327
328 lockdep_assert_held(&memcg_shrinker_map_mutex);
329
330 for_each_node(nid) {
331 old = rcu_dereference_protected(
332 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
333
334 if (!old)
335 return 0;
336
337 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
338 if (!new)
339 return -ENOMEM;
340
341
342 memset(new->map, (int)0xff, old_size);
343 memset((void *)new->map + old_size, 0, size - old_size);
344
345 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
346 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
347 }
348
349 return 0;
350}
351
352static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
353{
354 struct mem_cgroup_per_node *pn;
355 struct memcg_shrinker_map *map;
356 int nid;
357
358 if (mem_cgroup_is_root(memcg))
359 return;
360
361 for_each_node(nid) {
362 pn = mem_cgroup_nodeinfo(memcg, nid);
363 map = rcu_dereference_protected(pn->shrinker_map, true);
364 if (map)
365 kvfree(map);
366 rcu_assign_pointer(pn->shrinker_map, NULL);
367 }
368}
369
370static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
371{
372 struct memcg_shrinker_map *map;
373 int nid, size, ret = 0;
374
375 if (mem_cgroup_is_root(memcg))
376 return 0;
377
378 mutex_lock(&memcg_shrinker_map_mutex);
379 size = memcg_shrinker_map_size;
380 for_each_node(nid) {
381 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
382 if (!map) {
383 memcg_free_shrinker_maps(memcg);
384 ret = -ENOMEM;
385 break;
386 }
387 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
388 }
389 mutex_unlock(&memcg_shrinker_map_mutex);
390
391 return ret;
392}
393
394int memcg_expand_shrinker_maps(int new_id)
395{
396 int size, old_size, ret = 0;
397 struct mem_cgroup *memcg;
398
399 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
400 old_size = memcg_shrinker_map_size;
401 if (size <= old_size)
402 return 0;
403
404 mutex_lock(&memcg_shrinker_map_mutex);
405 if (!root_mem_cgroup)
406 goto unlock;
407
408 for_each_mem_cgroup(memcg) {
409 if (mem_cgroup_is_root(memcg))
410 continue;
411 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
412 if (ret)
413 goto unlock;
414 }
415unlock:
416 if (!ret)
417 memcg_shrinker_map_size = size;
418 mutex_unlock(&memcg_shrinker_map_mutex);
419 return ret;
420}
421
422void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
423{
424 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
425 struct memcg_shrinker_map *map;
426
427 rcu_read_lock();
428 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
429
430 smp_mb__before_atomic();
431 set_bit(shrinker_id, map->map);
432 rcu_read_unlock();
433 }
434}
435
436
437
438
439
440
441
442
443
444
445
446
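/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */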
447struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
448{
449 struct mem_cgroup *memcg;
450
451 memcg = page->mem_cgroup;
452
453 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
454 memcg = root_mem_cgroup;
455
456 return &memcg->css;
457}
458
459
460
461
462
463
464
465
466
467
468
469
470
471
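/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged
 * to and return its inode number or 0 if @page is not charged to any cgroup.
 * It is safe to call this function without holding a reference to @page.
 */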
472ino_t page_cgroup_ino(struct page *page)
473{
474 struct mem_cgroup *memcg;
475 unsigned long ino = 0;
476
477 rcu_read_lock();
478 if (PageSlab(page) && !PageTail(page))
479 memcg = memcg_from_slab_page(page);
480 else
481 memcg = READ_ONCE(page->mem_cgroup);
482 while (memcg && !(memcg->css.flags & CSS_ONLINE))
483 memcg = parent_mem_cgroup(memcg);
484 if (memcg)
485 ino = cgroup_ino(memcg->css.cgroup);
486 rcu_read_unlock();
487 return ino;
488}
489
490static struct mem_cgroup_per_node *
491mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
492{
493 int nid = page_to_nid(page);
494
495 return memcg->nodeinfo[nid];
496}
497
498static struct mem_cgroup_tree_per_node *
499soft_limit_tree_node(int nid)
500{
501 return soft_limit_tree.rb_tree_per_node[nid];
502}
503
504static struct mem_cgroup_tree_per_node *
505soft_limit_tree_from_page(struct page *page)
506{
507 int nid = page_to_nid(page);
508
509 return soft_limit_tree.rb_tree_per_node[nid];
510}
511
512static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
513 struct mem_cgroup_tree_per_node *mctz,
514 unsigned long new_usage_in_excess)
515{
516 struct rb_node **p = &mctz->rb_root.rb_node;
517 struct rb_node *parent = NULL;
518 struct mem_cgroup_per_node *mz_node;
519 bool rightmost = true;
520
521 if (mz->on_tree)
522 return;
523
524 mz->usage_in_excess = new_usage_in_excess;
525 if (!mz->usage_in_excess)
526 return;
527 while (*p) {
528 parent = *p;
529 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
530 tree_node);
531 if (mz->usage_in_excess < mz_node->usage_in_excess) {
532 p = &(*p)->rb_left;
533 rightmost = false;
534 }
535
536
537
538
539
540 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
541 p = &(*p)->rb_right;
542 }
543
544 if (rightmost)
545 mctz->rb_rightmost = &mz->tree_node;
546
547 rb_link_node(&mz->tree_node, parent, p);
548 rb_insert_color(&mz->tree_node, &mctz->rb_root);
549 mz->on_tree = true;
550}
551
552static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
553 struct mem_cgroup_tree_per_node *mctz)
554{
555 if (!mz->on_tree)
556 return;
557
558 if (&mz->tree_node == mctz->rb_rightmost)
559 mctz->rb_rightmost = rb_prev(&mz->tree_node);
560
561 rb_erase(&mz->tree_node, &mctz->rb_root);
562 mz->on_tree = false;
563}
564
565static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
566 struct mem_cgroup_tree_per_node *mctz)
567{
568 unsigned long flags;
569
570 spin_lock_irqsave(&mctz->lock, flags);
571 __mem_cgroup_remove_exceeded(mz, mctz);
572 spin_unlock_irqrestore(&mctz->lock, flags);
573}
574
575static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
576{
577 unsigned long nr_pages = page_counter_read(&memcg->memory);
578 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
579 unsigned long excess = 0;
580
581 if (nr_pages > soft_limit)
582 excess = nr_pages - soft_limit;
583
584 return excess;
585}
586
587static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
588{
589 unsigned long excess;
590 struct mem_cgroup_per_node *mz;
591 struct mem_cgroup_tree_per_node *mctz;
592
593 mctz = soft_limit_tree_from_page(page);
594 if (!mctz)
595 return;
596
597
598
599
600 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
601 mz = mem_cgroup_page_nodeinfo(memcg, page);
602 excess = soft_limit_excess(memcg);
603
604
605
606
607 if (excess || mz->on_tree) {
608 unsigned long flags;
609
610 spin_lock_irqsave(&mctz->lock, flags);
611
612 if (mz->on_tree)
613 __mem_cgroup_remove_exceeded(mz, mctz);
614
615
616
617
618 __mem_cgroup_insert_exceeded(mz, mctz, excess);
619 spin_unlock_irqrestore(&mctz->lock, flags);
620 }
621 }
622}
623
624static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
625{
626 struct mem_cgroup_tree_per_node *mctz;
627 struct mem_cgroup_per_node *mz;
628 int nid;
629
630 for_each_node(nid) {
631 mz = mem_cgroup_nodeinfo(memcg, nid);
632 mctz = soft_limit_tree_node(nid);
633 if (mctz)
634 mem_cgroup_remove_exceeded(mz, mctz);
635 }
636}
637
638static struct mem_cgroup_per_node *
639__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
640{
641 struct mem_cgroup_per_node *mz;
642
643retry:
644 mz = NULL;
645 if (!mctz->rb_rightmost)
646 goto done;
647
648 mz = rb_entry(mctz->rb_rightmost,
649 struct mem_cgroup_per_node, tree_node);
650
651
652
653
654
655 __mem_cgroup_remove_exceeded(mz, mctz);
656 if (!soft_limit_excess(mz->memcg) ||
657 !css_tryget_online(&mz->memcg->css))
658 goto retry;
659done:
660 return mz;
661}
662
663static struct mem_cgroup_per_node *
664mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
665{
666 struct mem_cgroup_per_node *mz;
667
668 spin_lock_irq(&mctz->lock);
669 mz = __mem_cgroup_largest_soft_limit_node(mctz);
670 spin_unlock_irq(&mctz->lock);
671 return mz;
672}
673

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
681{
682 long x;
683
684 if (mem_cgroup_disabled())
685 return;
686
687 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
688 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
689 struct mem_cgroup *mi;
690
691
692
693
694
695 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
696 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
697 atomic_long_add(x, &mi->vmstats[idx]);
698 x = 0;
699 }
700 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
701}
702
703static struct mem_cgroup_per_node *
704parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
705{
706 struct mem_cgroup *parent;
707
708 parent = parent_mem_cgroup(pn->memcg);
709 if (!parent)
710 return NULL;
711 return mem_cgroup_nodeinfo(parent, nid);
712}
713

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a change
 * of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
726{
727 pg_data_t *pgdat = lruvec_pgdat(lruvec);
728 struct mem_cgroup_per_node *pn;
729 struct mem_cgroup *memcg;
730 long x;
731
732
733 __mod_node_page_state(pgdat, idx, val);
734
735 if (mem_cgroup_disabled())
736 return;
737
738 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
739 memcg = pn->memcg;
740
741
742 __mod_memcg_state(memcg, idx, val);
743
744
745 __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
746
747 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
748 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
749 struct mem_cgroup_per_node *pi;
750
751 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
752 atomic_long_add(x, &pi->lruvec_stat[idx]);
753 x = 0;
754 }
755 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
756}
757
758void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
759{
760 struct page *page = virt_to_head_page(p);
761 pg_data_t *pgdat = page_pgdat(page);
762 struct mem_cgroup *memcg;
763 struct lruvec *lruvec;
764
765 rcu_read_lock();
766 memcg = memcg_from_slab_page(page);
767
768
769 if (!memcg || memcg == root_mem_cgroup) {
770 __mod_node_page_state(pgdat, idx, val);
771 } else {
772 lruvec = mem_cgroup_lruvec(memcg, pgdat);
773 __mod_lruvec_state(lruvec, idx, val);
774 }
775 rcu_read_unlock();
776}
777

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
786{
787 unsigned long x;
788
789 if (mem_cgroup_disabled())
790 return;
791
792 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
793 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
794 struct mem_cgroup *mi;
795
796
797
798
799
800 __this_cpu_add(memcg->vmstats_local->events[idx], x);
801 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
802 atomic_long_add(x, &mi->vmevents[idx]);
803 x = 0;
804 }
805 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
806}
807
808static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
809{
810 return atomic_long_read(&memcg->vmevents[event]);
811}
812
813static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
814{
815 long x = 0;
816 int cpu;
817
818 for_each_possible_cpu(cpu)
819 x += per_cpu(memcg->vmstats_local->events[event], cpu);
820 return x;
821}
822
823static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
824 struct page *page,
825 bool compound, int nr_pages)
826{
827
828
829
830
831 if (PageAnon(page))
832 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
833 else {
834 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
835 if (PageSwapBacked(page))
836 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
837 }
838
839 if (compound) {
840 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
841 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
842 }
843
844
845 if (nr_pages > 0)
846 __count_memcg_events(memcg, PGPGIN, 1);
847 else {
848 __count_memcg_events(memcg, PGPGOUT, 1);
849 nr_pages = -nr_pages;
850 }
851
852 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
853}
854
855static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
856 enum mem_cgroup_events_target target)
857{
858 unsigned long val, next;
859
860 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
861 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
862
863 if ((long)(next - val) < 0) {
864 switch (target) {
865 case MEM_CGROUP_TARGET_THRESH:
866 next = val + THRESHOLDS_EVENTS_TARGET;
867 break;
868 case MEM_CGROUP_TARGET_SOFTLIMIT:
869 next = val + SOFTLIMIT_EVENTS_TARGET;
870 break;
871 default:
872 break;
873 }
874 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
875 return true;
876 }
877 return false;
878}
879
880
881
882
883
884static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
885{
886
887 if (unlikely(mem_cgroup_event_ratelimit(memcg,
888 MEM_CGROUP_TARGET_THRESH))) {
889 bool do_softlimit;
890
891 do_softlimit = mem_cgroup_event_ratelimit(memcg,
892 MEM_CGROUP_TARGET_SOFTLIMIT);
893 mem_cgroup_threshold(memcg);
894 if (unlikely(do_softlimit))
895 mem_cgroup_update_tree(memcg, page);
896 }
897}
898
899struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
900{
901
902
903
904
905
906 if (unlikely(!p))
907 return NULL;
908
909 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
910}
911EXPORT_SYMBOL(mem_cgroup_from_task);
912

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
922{
923 struct mem_cgroup *memcg;
924
925 if (mem_cgroup_disabled())
926 return NULL;
927
928 rcu_read_lock();
929 do {
930
931
932
933
934
935 if (unlikely(!mm))
936 memcg = root_mem_cgroup;
937 else {
938 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
939 if (unlikely(!memcg))
940 memcg = root_mem_cgroup;
941 }
942 } while (!css_tryget(&memcg->css));
943 rcu_read_unlock();
944 return memcg;
945}
946EXPORT_SYMBOL(get_mem_cgroup_from_mm);
947

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
956{
957 struct mem_cgroup *memcg = page->mem_cgroup;
958
959 if (mem_cgroup_disabled())
960 return NULL;
961
962 rcu_read_lock();
963 if (!memcg || !css_tryget_online(&memcg->css))
964 memcg = root_mem_cgroup;
965 rcu_read_unlock();
966 return memcg;
967}
968EXPORT_SYMBOL(get_mem_cgroup_from_page);
969

/*
 * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
974{
975 if (unlikely(current->active_memcg)) {
976 struct mem_cgroup *memcg = root_mem_cgroup;
977
978 rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
980 memcg = current->active_memcg;
981 rcu_read_unlock();
982 return memcg;
983 }
984 return get_mem_cgroup_from_mm(current->mm);
985}
986

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
1007{
1008 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1009 struct cgroup_subsys_state *css = NULL;
1010 struct mem_cgroup *memcg = NULL;
1011 struct mem_cgroup *pos = NULL;
1012
1013 if (mem_cgroup_disabled())
1014 return NULL;
1015
1016 if (!root)
1017 root = root_mem_cgroup;
1018
1019 if (prev && !reclaim)
1020 pos = prev;
1021
1022 if (!root->use_hierarchy && root != root_mem_cgroup) {
1023 if (prev)
1024 goto out;
1025 return root;
1026 }
1027
1028 rcu_read_lock();
1029
1030 if (reclaim) {
1031 struct mem_cgroup_per_node *mz;
1032
1033 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1034 iter = &mz->iter;
1035
1036 if (prev && reclaim->generation != iter->generation)
1037 goto out_unlock;
1038
1039 while (1) {
1040 pos = READ_ONCE(iter->position);
1041 if (!pos || css_tryget(&pos->css))
1042 break;
1043
1044
1045
1046
1047
1048
1049
1050
1051 (void)cmpxchg(&iter->position, pos, NULL);
1052 }
1053 }
1054
1055 if (pos)
1056 css = &pos->css;
1057
1058 for (;;) {
1059 css = css_next_descendant_pre(css, &root->css);
1060 if (!css) {
1061
1062
1063
1064
1065
1066
1067 if (!prev)
1068 continue;
1069 break;
1070 }
1071
1072
1073
1074
1075
1076
1077 memcg = mem_cgroup_from_css(css);
1078
1079 if (css == &root->css)
1080 break;
1081
1082 if (css_tryget(css))
1083 break;
1084
1085 memcg = NULL;
1086 }
1087
1088 if (reclaim) {
1089
1090
1091
1092
1093
1094 (void)cmpxchg(&iter->position, pos, memcg);
1095
1096 if (pos)
1097 css_put(&pos->css);
1098
1099 if (!memcg)
1100 iter->generation++;
1101 else if (!prev)
1102 reclaim->generation = iter->generation;
1103 }
1104
1105out_unlock:
1106 rcu_read_unlock();
1107out:
1108 if (prev && prev != root)
1109 css_put(&prev->css);
1110
1111 return memcg;
1112}
1113

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
1121{
1122 if (!root)
1123 root = root_mem_cgroup;
1124 if (prev && prev != root)
1125 css_put(&prev->css);
1126}
1127
1128static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1129 struct mem_cgroup *dead_memcg)
1130{
1131 struct mem_cgroup_reclaim_iter *iter;
1132 struct mem_cgroup_per_node *mz;
1133 int nid;
1134
1135 for_each_node(nid) {
1136 mz = mem_cgroup_nodeinfo(from, nid);
1137 iter = &mz->iter;
1138 cmpxchg(&iter->position, dead_memcg, NULL);
1139 }
1140}
1141
1142static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1143{
1144 struct mem_cgroup *memcg = dead_memcg;
1145 struct mem_cgroup *last;
1146
1147 do {
1148 __invalidate_reclaim_iterators(memcg, dead_memcg);
1149 last = memcg;
1150 } while ((memcg = parent_mem_cgroup(memcg)));
1151
1152
1153
1154
1155
1156
1157
1158 if (last != root_mem_cgroup)
1159 __invalidate_reclaim_iterators(root_mem_cgroup,
1160 dead_memcg);
1161}
1162

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
1178{
1179 struct mem_cgroup *iter;
1180 int ret = 0;
1181
1182 BUG_ON(memcg == root_mem_cgroup);
1183
1184 for_each_mem_cgroup_tree(iter, memcg) {
1185 struct css_task_iter it;
1186 struct task_struct *task;
1187
1188 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1189 while (!ret && (task = css_task_iter_next(&it)))
1190 ret = fn(task, arg);
1191 css_task_iter_end(&it);
1192 if (ret) {
1193 mem_cgroup_iter_break(memcg, iter);
1194 break;
1195 }
1196 }
1197 return ret;
1198}
1199

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1210{
1211 struct mem_cgroup_per_node *mz;
1212 struct mem_cgroup *memcg;
1213 struct lruvec *lruvec;
1214
1215 if (mem_cgroup_disabled()) {
1216 lruvec = &pgdat->__lruvec;
1217 goto out;
1218 }
1219
1220 memcg = page->mem_cgroup;
1221
1222
1223
1224
1225 if (!memcg)
1226 memcg = root_mem_cgroup;
1227
1228 mz = mem_cgroup_page_nodeinfo(memcg, page);
1229 lruvec = &mz->lruvec;
1230out:
1231
1232
1233
1234
1235
1236 if (unlikely(lruvec->pgdat != pgdat))
1237 lruvec->pgdat = pgdat;
1238 return lruvec;
1239}
1240

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
1254{
1255 struct mem_cgroup_per_node *mz;
1256 unsigned long *lru_size;
1257 long size;
1258
1259 if (mem_cgroup_disabled())
1260 return;
1261
1262 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1263 lru_size = &mz->lru_zone_size[zid][lru];
1264
1265 if (nr_pages < 0)
1266 *lru_size += nr_pages;
1267
1268 size = *lru_size;
1269 if (WARN_ONCE(size < 0,
1270 "%s(%p, %d, %d): lru_size %ld\n",
1271 __func__, lruvec, lru, nr_pages, size)) {
1272 VM_BUG_ON(1);
1273 *lru_size = 0;
1274 }
1275
1276 if (nr_pages > 0)
1277 *lru_size += nr_pages;
1278}
1279

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1288{
1289 unsigned long margin = 0;
1290 unsigned long count;
1291 unsigned long limit;
1292
1293 count = page_counter_read(&memcg->memory);
1294 limit = READ_ONCE(memcg->memory.max);
1295 if (count < limit)
1296 margin = limit - count;
1297
1298 if (do_memsw_account()) {
1299 count = page_counter_read(&memcg->memsw);
1300 limit = READ_ONCE(memcg->memsw.max);
1301 if (count <= limit)
1302 margin = min(margin, limit - count);
1303 else
1304 margin = 0;
1305 }
1306
1307 return margin;
1308}
1309
1310
1311
1312
1313
1314
1315
1316
1317static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1318{
1319 struct mem_cgroup *from;
1320 struct mem_cgroup *to;
1321 bool ret = false;
1322
1323
1324
1325
1326 spin_lock(&mc.lock);
1327 from = mc.from;
1328 to = mc.to;
1329 if (!from)
1330 goto unlock;
1331
1332 ret = mem_cgroup_is_descendant(from, memcg) ||
1333 mem_cgroup_is_descendant(to, memcg);
1334unlock:
1335 spin_unlock(&mc.lock);
1336 return ret;
1337}
1338
1339static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1340{
1341 if (mc.moving_task && current != mc.moving_task) {
1342 if (mem_cgroup_under_move(memcg)) {
1343 DEFINE_WAIT(wait);
1344 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1345
1346 if (mc.moving_task)
1347 schedule();
1348 finish_wait(&mc.waitq, &wait);
1349 return true;
1350 }
1351 }
1352 return false;
1353}
1354
1355static char *memory_stat_format(struct mem_cgroup *memcg)
1356{
1357 struct seq_buf s;
1358 int i;
1359
1360 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1361 if (!s.buffer)
1362 return NULL;
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375 seq_buf_printf(&s, "anon %llu\n",
1376 (u64)memcg_page_state(memcg, MEMCG_RSS) *
1377 PAGE_SIZE);
1378 seq_buf_printf(&s, "file %llu\n",
1379 (u64)memcg_page_state(memcg, MEMCG_CACHE) *
1380 PAGE_SIZE);
1381 seq_buf_printf(&s, "kernel_stack %llu\n",
1382 (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1383 1024);
1384 seq_buf_printf(&s, "slab %llu\n",
1385 (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1386 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1387 PAGE_SIZE);
1388 seq_buf_printf(&s, "sock %llu\n",
1389 (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1390 PAGE_SIZE);
1391
1392 seq_buf_printf(&s, "shmem %llu\n",
1393 (u64)memcg_page_state(memcg, NR_SHMEM) *
1394 PAGE_SIZE);
1395 seq_buf_printf(&s, "file_mapped %llu\n",
1396 (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1397 PAGE_SIZE);
1398 seq_buf_printf(&s, "file_dirty %llu\n",
1399 (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1400 PAGE_SIZE);
1401 seq_buf_printf(&s, "file_writeback %llu\n",
1402 (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1403 PAGE_SIZE);
1404
1405
1406
1407
1408
1409
1410
1411 seq_buf_printf(&s, "anon_thp %llu\n",
1412 (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
1413 PAGE_SIZE);
1414
1415 for (i = 0; i < NR_LRU_LISTS; i++)
1416 seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
1417 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1418 PAGE_SIZE);
1419
1420 seq_buf_printf(&s, "slab_reclaimable %llu\n",
1421 (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1422 PAGE_SIZE);
1423 seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1424 (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1425 PAGE_SIZE);
1426
1427
1428
1429 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1430 memcg_events(memcg, PGFAULT));
1431 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1432 memcg_events(memcg, PGMAJFAULT));
1433
1434 seq_buf_printf(&s, "workingset_refault %lu\n",
1435 memcg_page_state(memcg, WORKINGSET_REFAULT));
1436 seq_buf_printf(&s, "workingset_activate %lu\n",
1437 memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1438 seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1439 memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1440
1441 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1442 memcg_events(memcg, PGREFILL));
1443 seq_buf_printf(&s, "pgscan %lu\n",
1444 memcg_events(memcg, PGSCAN_KSWAPD) +
1445 memcg_events(memcg, PGSCAN_DIRECT));
1446 seq_buf_printf(&s, "pgsteal %lu\n",
1447 memcg_events(memcg, PGSTEAL_KSWAPD) +
1448 memcg_events(memcg, PGSTEAL_DIRECT));
1449 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1450 memcg_events(memcg, PGACTIVATE));
1451 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1452 memcg_events(memcg, PGDEACTIVATE));
1453 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1454 memcg_events(memcg, PGLAZYFREE));
1455 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1456 memcg_events(memcg, PGLAZYFREED));
1457
1458#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1459 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1460 memcg_events(memcg, THP_FAULT_ALLOC));
1461 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1462 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1463#endif
1464
1465
1466 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1467
1468 return s.buffer;
1469}
1470
1471#define K(x) ((x) << (PAGE_SHIFT-10))
1472
1473
1474
1475
1476
1477
1478
1479
1480
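/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */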
1481void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1482{
1483 rcu_read_lock();
1484
1485 if (memcg) {
1486 pr_cont(",oom_memcg=");
1487 pr_cont_cgroup_path(memcg->css.cgroup);
1488 } else
1489 pr_cont(",global_oom");
1490 if (p) {
1491 pr_cont(",task_memcg=");
1492 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1493 }
1494 rcu_read_unlock();
1495}
1496
1497
1498
1499
1500
1501
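/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */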
1502void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1503{
1504 char *buf;
1505
1506 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1507 K((u64)page_counter_read(&memcg->memory)),
1508 K((u64)memcg->memory.max), memcg->memory.failcnt);
1509 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1510 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1511 K((u64)page_counter_read(&memcg->swap)),
1512 K((u64)memcg->swap.max), memcg->swap.failcnt);
1513 else {
1514 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1515 K((u64)page_counter_read(&memcg->memsw)),
1516 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1517 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1518 K((u64)page_counter_read(&memcg->kmem)),
1519 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1520 }
1521
1522 pr_info("Memory cgroup stats for ");
1523 pr_cont_cgroup_path(memcg->css.cgroup);
1524 pr_cont(":");
1525 buf = memory_stat_format(memcg);
1526 if (!buf)
1527 return;
1528 pr_info("%s", buf);
1529 kfree(buf);
1530}
1531
1532
1533
1534
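/*
 * Return the memory (and swap, if the swap controller is active) limit
 * for a memcg.
 */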
1535unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1536{
1537 unsigned long max;
1538
1539 max = memcg->memory.max;
1540 if (mem_cgroup_swappiness(memcg)) {
1541 unsigned long memsw_max;
1542 unsigned long swap_max;
1543
1544 memsw_max = memcg->memsw.max;
1545 swap_max = memcg->swap.max;
1546 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1547 max = min(max + swap_max, memsw_max);
1548 }
1549 return max;
1550}
1551
1552unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1553{
1554 return page_counter_read(&memcg->memory);
1555}
1556
1557static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1558 int order)
1559{
1560 struct oom_control oc = {
1561 .zonelist = NULL,
1562 .nodemask = NULL,
1563 .memcg = memcg,
1564 .gfp_mask = gfp_mask,
1565 .order = order,
1566 };
1567 bool ret;
1568
1569 if (mutex_lock_killable(&oom_lock))
1570 return true;
1571
1572
1573
1574
1575 ret = should_force_charge() || out_of_memory(&oc);
1576 mutex_unlock(&oom_lock);
1577 return ret;
1578}
1579
1580static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1581 pg_data_t *pgdat,
1582 gfp_t gfp_mask,
1583 unsigned long *total_scanned)
1584{
1585 struct mem_cgroup *victim = NULL;
1586 int total = 0;
1587 int loop = 0;
1588 unsigned long excess;
1589 unsigned long nr_scanned;
1590 struct mem_cgroup_reclaim_cookie reclaim = {
1591 .pgdat = pgdat,
1592 };
1593
1594 excess = soft_limit_excess(root_memcg);
1595
1596 while (1) {
1597 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1598 if (!victim) {
1599 loop++;
1600 if (loop >= 2) {
1601
1602
1603
1604
1605
1606 if (!total)
1607 break;
1608
1609
1610
1611
1612
1613
1614 if (total >= (excess >> 2) ||
1615 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1616 break;
1617 }
1618 continue;
1619 }
1620 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1621 pgdat, &nr_scanned);
1622 *total_scanned += nr_scanned;
1623 if (!soft_limit_excess(root_memcg))
1624 break;
1625 }
1626 mem_cgroup_iter_break(root_memcg, victim);
1627 return total;
1628}
1629
1630#ifdef CONFIG_LOCKDEP
1631static struct lockdep_map memcg_oom_lock_dep_map = {
1632 .name = "memcg_oom_lock",
1633};
1634#endif
1635
1636static DEFINE_SPINLOCK(memcg_oom_lock);
1637
1638
1639
1640
1641
1642static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1643{
1644 struct mem_cgroup *iter, *failed = NULL;
1645
1646 spin_lock(&memcg_oom_lock);
1647
1648 for_each_mem_cgroup_tree(iter, memcg) {
1649 if (iter->oom_lock) {
1650
1651
1652
1653
1654 failed = iter;
1655 mem_cgroup_iter_break(memcg, iter);
1656 break;
1657 } else
1658 iter->oom_lock = true;
1659 }
1660
1661 if (failed) {
1662
1663
1664
1665
1666 for_each_mem_cgroup_tree(iter, memcg) {
1667 if (iter == failed) {
1668 mem_cgroup_iter_break(memcg, iter);
1669 break;
1670 }
1671 iter->oom_lock = false;
1672 }
1673 } else
1674 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1675
1676 spin_unlock(&memcg_oom_lock);
1677
1678 return !failed;
1679}
1680
1681static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1682{
1683 struct mem_cgroup *iter;
1684
1685 spin_lock(&memcg_oom_lock);
1686 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1687 for_each_mem_cgroup_tree(iter, memcg)
1688 iter->oom_lock = false;
1689 spin_unlock(&memcg_oom_lock);
1690}
1691
1692static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1693{
1694 struct mem_cgroup *iter;
1695
1696 spin_lock(&memcg_oom_lock);
1697 for_each_mem_cgroup_tree(iter, memcg)
1698 iter->under_oom++;
1699 spin_unlock(&memcg_oom_lock);
1700}
1701
1702static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1703{
1704 struct mem_cgroup *iter;
1705
1706
1707
1708
1709
1710 spin_lock(&memcg_oom_lock);
1711 for_each_mem_cgroup_tree(iter, memcg)
1712 if (iter->under_oom > 0)
1713 iter->under_oom--;
1714 spin_unlock(&memcg_oom_lock);
1715}
1716
1717static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1718
1719struct oom_wait_info {
1720 struct mem_cgroup *memcg;
1721 wait_queue_entry_t wait;
1722};
1723
1724static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1725 unsigned mode, int sync, void *arg)
1726{
1727 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1728 struct mem_cgroup *oom_wait_memcg;
1729 struct oom_wait_info *oom_wait_info;
1730
1731 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1732 oom_wait_memcg = oom_wait_info->memcg;
1733
1734 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1735 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1736 return 0;
1737 return autoremove_wake_function(wait, mode, sync, arg);
1738}
1739
1740static void memcg_oom_recover(struct mem_cgroup *memcg)
1741{
1742
1743
1744
1745
1746
1747
1748
1749
1750 if (memcg && memcg->under_oom)
1751 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1752}
1753
1754enum oom_status {
1755 OOM_SUCCESS,
1756 OOM_FAILED,
1757 OOM_ASYNC,
1758 OOM_SKIPPED
1759};
1760
1761static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1762{
1763 enum oom_status ret;
1764 bool locked;
1765
1766 if (order > PAGE_ALLOC_COSTLY_ORDER)
1767 return OOM_SKIPPED;
1768
1769 memcg_memory_event(memcg, MEMCG_OOM);
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789 if (memcg->oom_kill_disable) {
1790 if (!current->in_user_fault)
1791 return OOM_SKIPPED;
1792 css_get(&memcg->css);
1793 current->memcg_in_oom = memcg;
1794 current->memcg_oom_gfp_mask = mask;
1795 current->memcg_oom_order = order;
1796
1797 return OOM_ASYNC;
1798 }
1799
1800 mem_cgroup_mark_under_oom(memcg);
1801
1802 locked = mem_cgroup_oom_trylock(memcg);
1803
1804 if (locked)
1805 mem_cgroup_oom_notify(memcg);
1806
1807 mem_cgroup_unmark_under_oom(memcg);
1808 if (mem_cgroup_out_of_memory(memcg, mask, order))
1809 ret = OOM_SUCCESS;
1810 else
1811 ret = OOM_FAILED;
1812
1813 if (locked)
1814 mem_cgroup_oom_unlock(memcg);
1815
1816 return ret;
1817}
1818

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() is called at the end of
 * the fault when the stack is unwound, the locks are released, and when
 * we know whether the fault was overall successful.
 */
bool mem_cgroup_oom_synchronize(bool handle)
1837{
1838 struct mem_cgroup *memcg = current->memcg_in_oom;
1839 struct oom_wait_info owait;
1840 bool locked;
1841
1842
1843 if (!memcg)
1844 return false;
1845
1846 if (!handle)
1847 goto cleanup;
1848
1849 owait.memcg = memcg;
1850 owait.wait.flags = 0;
1851 owait.wait.func = memcg_oom_wake_function;
1852 owait.wait.private = current;
1853 INIT_LIST_HEAD(&owait.wait.entry);
1854
1855 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1856 mem_cgroup_mark_under_oom(memcg);
1857
1858 locked = mem_cgroup_oom_trylock(memcg);
1859
1860 if (locked)
1861 mem_cgroup_oom_notify(memcg);
1862
1863 if (locked && !memcg->oom_kill_disable) {
1864 mem_cgroup_unmark_under_oom(memcg);
1865 finish_wait(&memcg_oom_waitq, &owait.wait);
1866 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1867 current->memcg_oom_order);
1868 } else {
1869 schedule();
1870 mem_cgroup_unmark_under_oom(memcg);
1871 finish_wait(&memcg_oom_waitq, &owait.wait);
1872 }
1873
1874 if (locked) {
1875 mem_cgroup_oom_unlock(memcg);
1876
1877
1878
1879
1880
1881 memcg_oom_recover(memcg);
1882 }
1883cleanup:
1884 current->memcg_in_oom = NULL;
1885 css_put(&memcg->css);
1886 return true;
1887}
1888

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
1901{
1902 struct mem_cgroup *oom_group = NULL;
1903 struct mem_cgroup *memcg;
1904
1905 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1906 return NULL;
1907
1908 if (!oom_domain)
1909 oom_domain = root_mem_cgroup;
1910
1911 rcu_read_lock();
1912
1913 memcg = mem_cgroup_from_task(victim);
1914 if (memcg == root_mem_cgroup)
1915 goto out;
1916
1917
1918
1919
1920
1921
1922 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1923 if (memcg->oom_group)
1924 oom_group = memcg;
1925
1926 if (memcg == oom_domain)
1927 break;
1928 }
1929
1930 if (oom_group)
1931 css_get(&oom_group->css);
1932out:
1933 rcu_read_unlock();
1934
1935 return oom_group;
1936}
1937
1938void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1939{
1940 pr_info("Tasks in ");
1941 pr_cont_cgroup_path(memcg->css.cgroup);
1942 pr_cont(" are going to be killed due to memory.oom.group set\n");
1943}
1944

/**
 * lock_page_memcg - lock a page->mem_cgroup binding
 * @page: the page
 *
 * This function protects unlocked LRU pages from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the returned memcg. Caller is responsible
 * for the lifetime of the page; __unlock_page_memcg() is available
 * when @page might get freed inside the locked section.
 */
struct mem_cgroup *lock_page_memcg(struct page *page)
1957{
1958 struct mem_cgroup *memcg;
1959 unsigned long flags;
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972 rcu_read_lock();
1973
1974 if (mem_cgroup_disabled())
1975 return NULL;
1976again:
1977 memcg = page->mem_cgroup;
1978 if (unlikely(!memcg))
1979 return NULL;
1980
1981 if (atomic_read(&memcg->moving_account) <= 0)
1982 return memcg;
1983
1984 spin_lock_irqsave(&memcg->move_lock, flags);
1985 if (memcg != page->mem_cgroup) {
1986 spin_unlock_irqrestore(&memcg->move_lock, flags);
1987 goto again;
1988 }
1989
1990
1991
1992
1993
1994
1995 memcg->move_lock_task = current;
1996 memcg->move_lock_flags = flags;
1997
1998 return memcg;
1999}
2000EXPORT_SYMBOL(lock_page_memcg);
2001
2002
2003
2004
2005
2006
2007
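/**
 * __unlock_page_memcg - unlock and unpin a memcg
 * @memcg: the memcg
 *
 * Unlock and unpin a memcg returned by lock_page_memcg().
 */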
2008void __unlock_page_memcg(struct mem_cgroup *memcg)
2009{
2010 if (memcg && memcg->move_lock_task == current) {
2011 unsigned long flags = memcg->move_lock_flags;
2012
2013 memcg->move_lock_task = NULL;
2014 memcg->move_lock_flags = 0;
2015
2016 spin_unlock_irqrestore(&memcg->move_lock, flags);
2017 }
2018
2019 rcu_read_unlock();
2020}
2021
2022
2023
2024
2025
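/**
 * unlock_page_memcg - unlock a page->mem_cgroup binding
 * @page: the page
 */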
2026void unlock_page_memcg(struct page *page)
2027{
2028 __unlock_page_memcg(page->mem_cgroup);
2029}
2030EXPORT_SYMBOL(unlock_page_memcg);
2031
2032struct memcg_stock_pcp {
2033 struct mem_cgroup *cached;
2034 unsigned int nr_pages;
2035 struct work_struct work;
2036 unsigned long flags;
2037#define FLUSHING_CACHED_CHARGE 0
2038};
2039static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2040static DEFINE_MUTEX(percpu_charge_mutex);
2041

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2054{
2055 struct memcg_stock_pcp *stock;
2056 unsigned long flags;
2057 bool ret = false;
2058
2059 if (nr_pages > MEMCG_CHARGE_BATCH)
2060 return ret;
2061
2062 local_irq_save(flags);
2063
2064 stock = this_cpu_ptr(&memcg_stock);
2065 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2066 stock->nr_pages -= nr_pages;
2067 ret = true;
2068 }
2069
2070 local_irq_restore(flags);
2071
2072 return ret;
2073}
2074
2075
2076
2077
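/*
 * Returns stocks cached in percpu and reset cached information.
 */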
2078static void drain_stock(struct memcg_stock_pcp *stock)
2079{
2080 struct mem_cgroup *old = stock->cached;
2081
2082 if (stock->nr_pages) {
2083 page_counter_uncharge(&old->memory, stock->nr_pages);
2084 if (do_memsw_account())
2085 page_counter_uncharge(&old->memsw, stock->nr_pages);
2086 css_put_many(&old->css, stock->nr_pages);
2087 stock->nr_pages = 0;
2088 }
2089 stock->cached = NULL;
2090}
2091
2092static void drain_local_stock(struct work_struct *dummy)
2093{
2094 struct memcg_stock_pcp *stock;
2095 unsigned long flags;
2096
2097
2098
2099
2100
2101 local_irq_save(flags);
2102
2103 stock = this_cpu_ptr(&memcg_stock);
2104 drain_stock(stock);
2105 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2106
2107 local_irq_restore(flags);
2108}
2109
2110
2111
2112
2113
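/*
 * Cache charges(nr_pages) to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */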
2114static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2115{
2116 struct memcg_stock_pcp *stock;
2117 unsigned long flags;
2118
2119 local_irq_save(flags);
2120
2121 stock = this_cpu_ptr(&memcg_stock);
2122 if (stock->cached != memcg) {
2123 drain_stock(stock);
2124 stock->cached = memcg;
2125 }
2126 stock->nr_pages += nr_pages;
2127
2128 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2129 drain_stock(stock);
2130
2131 local_irq_restore(flags);
2132}
2133
2134
2135
2136
2137
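/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */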
2138static void drain_all_stock(struct mem_cgroup *root_memcg)
2139{
2140 int cpu, curcpu;
2141
2142
2143 if (!mutex_trylock(&percpu_charge_mutex))
2144 return;
2145
2146
2147
2148
2149
2150
2151 curcpu = get_cpu();
2152 for_each_online_cpu(cpu) {
2153 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2154 struct mem_cgroup *memcg;
2155 bool flush = false;
2156
2157 rcu_read_lock();
2158 memcg = stock->cached;
2159 if (memcg && stock->nr_pages &&
2160 mem_cgroup_is_descendant(memcg, root_memcg))
2161 flush = true;
2162 rcu_read_unlock();
2163
2164 if (flush &&
2165 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2166 if (cpu == curcpu)
2167 drain_local_stock(&stock->work);
2168 else
2169 schedule_work_on(cpu, &stock->work);
2170 }
2171 }
2172 put_cpu();
2173 mutex_unlock(&percpu_charge_mutex);
2174}
2175
2176static int memcg_hotplug_cpu_dead(unsigned int cpu)
2177{
2178 struct memcg_stock_pcp *stock;
2179 struct mem_cgroup *memcg, *mi;
2180
2181 stock = &per_cpu(memcg_stock, cpu);
2182 drain_stock(stock);
2183
2184 for_each_mem_cgroup(memcg) {
2185 int i;
2186
2187 for (i = 0; i < MEMCG_NR_STAT; i++) {
2188 int nid;
2189 long x;
2190
2191 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2192 if (x)
2193 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2194 atomic_long_add(x, &memcg->vmstats[i]);
2195
2196 if (i >= NR_VM_NODE_STAT_ITEMS)
2197 continue;
2198
2199 for_each_node(nid) {
2200 struct mem_cgroup_per_node *pn;
2201
2202 pn = mem_cgroup_nodeinfo(memcg, nid);
2203 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2204 if (x)
2205 do {
2206 atomic_long_add(x, &pn->lruvec_stat[i]);
2207 } while ((pn = parent_nodeinfo(pn, nid)));
2208 }
2209 }
2210
2211 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2212 long x;
2213
2214 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2215 if (x)
2216 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2217 atomic_long_add(x, &memcg->vmevents[i]);
2218 }
2219 }
2220
2221 return 0;
2222}
2223
2224static void reclaim_high(struct mem_cgroup *memcg,
2225 unsigned int nr_pages,
2226 gfp_t gfp_mask)
2227{
2228 do {
2229 if (page_counter_read(&memcg->memory) <= memcg->high)
2230 continue;
2231 memcg_memory_event(memcg, MEMCG_HIGH);
2232 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2233 } while ((memcg = parent_mem_cgroup(memcg)));
2234}
2235
2236static void high_work_func(struct work_struct *work)
2237{
2238 struct mem_cgroup *memcg;
2239
2240 memcg = container_of(work, struct mem_cgroup, high_work);
2241 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2242}
2243
/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these either side of the exponentiation
 * to maintain precision and scale to a reasonable number of jiffies:
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies,
 *   and to produce a reasonable delay curve.
 */
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14

/*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 */
void mem_cgroup_handle_over_high(void)
2302{
2303 unsigned long usage, high, clamped_high;
2304 unsigned long pflags;
2305 unsigned long penalty_jiffies, overage;
2306 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2307 struct mem_cgroup *memcg;
2308
2309 if (likely(!nr_pages))
2310 return;
2311
2312 memcg = get_mem_cgroup_from_mm(current->mm);
2313 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2314 current->memcg_nr_pages_over_high = 0;
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328 usage = page_counter_read(&memcg->memory);
2329 high = READ_ONCE(memcg->high);
2330
2331 if (usage <= high)
2332 goto out;
2333
2334
2335
2336
2337
2338 clamped_high = max(high, 1UL);
2339
2340 overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
2341 clamped_high);
2342
2343 penalty_jiffies = ((u64)overage * overage * HZ)
2344 >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354 penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2355
2356
2357
2358
2359
2360
2361 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2362
2363
2364
2365
2366
2367
2368
2369 if (penalty_jiffies <= HZ / 100)
2370 goto out;
2371
2372
2373
2374
2375
2376
2377 psi_memstall_enter(&pflags);
2378 schedule_timeout_killable(penalty_jiffies);
2379 psi_memstall_leave(&pflags);
2380
2381out:
2382 css_put(&memcg->css);
2383}
2384
2385static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2386 unsigned int nr_pages)
2387{
2388 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2389 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2390 struct mem_cgroup *mem_over_limit;
2391 struct page_counter *counter;
2392 unsigned long nr_reclaimed;
2393 bool may_swap = true;
2394 bool drained = false;
2395 enum oom_status oom_status;
2396
2397 if (mem_cgroup_is_root(memcg))
2398 return 0;
2399retry:
2400 if (consume_stock(memcg, nr_pages))
2401 return 0;
2402
2403 if (!do_memsw_account() ||
2404 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2405 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2406 goto done_restock;
2407 if (do_memsw_account())
2408 page_counter_uncharge(&memcg->memsw, batch);
2409 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2410 } else {
2411 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2412 may_swap = false;
2413 }
2414
2415 if (batch > nr_pages) {
2416 batch = nr_pages;
2417 goto retry;
2418 }
2419
2420
2421
2422
2423
2424
2425
2426 if (gfp_mask & __GFP_ATOMIC)
2427 goto force;
2428
2429
2430
2431
2432
2433
2434
2435 if (unlikely(should_force_charge()))
2436 goto force;
2437
2438
2439
2440
2441
2442
2443
2444 if (unlikely(current->flags & PF_MEMALLOC))
2445 goto force;
2446
2447 if (unlikely(task_in_memcg_oom(current)))
2448 goto nomem;
2449
2450 if (!gfpflags_allow_blocking(gfp_mask))
2451 goto nomem;
2452
2453 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2454
2455 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2456 gfp_mask, may_swap);
2457
2458 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2459 goto retry;
2460
2461 if (!drained) {
2462 drain_all_stock(mem_over_limit);
2463 drained = true;
2464 goto retry;
2465 }
2466
2467 if (gfp_mask & __GFP_NORETRY)
2468 goto nomem;
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2479 goto retry;
2480
2481
2482
2483
2484 if (mem_cgroup_wait_acct_move(mem_over_limit))
2485 goto retry;
2486
2487 if (nr_retries--)
2488 goto retry;
2489
2490 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2491 goto nomem;
2492
2493 if (gfp_mask & __GFP_NOFAIL)
2494 goto force;
2495
2496 if (fatal_signal_pending(current))
2497 goto force;
2498
2499
2500
2501
2502
2503
2504 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2505 get_order(nr_pages * PAGE_SIZE));
2506 switch (oom_status) {
2507 case OOM_SUCCESS:
2508 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2509 goto retry;
2510 case OOM_FAILED:
2511 goto force;
2512 default:
2513 goto nomem;
2514 }
2515nomem:
2516 if (!(gfp_mask & __GFP_NOFAIL))
2517 return -ENOMEM;
2518force:
2519
2520
2521
2522
2523
2524 page_counter_charge(&memcg->memory, nr_pages);
2525 if (do_memsw_account())
2526 page_counter_charge(&memcg->memsw, nr_pages);
2527 css_get_many(&memcg->css, nr_pages);
2528
2529 return 0;
2530
2531done_restock:
2532 css_get_many(&memcg->css, batch);
2533 if (batch > nr_pages)
2534 refill_stock(memcg, batch - nr_pages);
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545 do {
2546 if (page_counter_read(&memcg->memory) > memcg->high) {
2547
2548 if (in_interrupt()) {
2549 schedule_work(&memcg->high_work);
2550 break;
2551 }
2552 current->memcg_nr_pages_over_high += batch;
2553 set_notify_resume(current);
2554 break;
2555 }
2556 } while ((memcg = parent_mem_cgroup(memcg)));
2557
2558 return 0;
2559}
2560
2561static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2562{
2563 if (mem_cgroup_is_root(memcg))
2564 return;
2565
2566 page_counter_uncharge(&memcg->memory, nr_pages);
2567 if (do_memsw_account())
2568 page_counter_uncharge(&memcg->memsw, nr_pages);
2569
2570 css_put_many(&memcg->css, nr_pages);
2571}
2572
2573static void lock_page_lru(struct page *page, int *isolated)
2574{
2575 pg_data_t *pgdat = page_pgdat(page);
2576
2577 spin_lock_irq(&pgdat->lru_lock);
2578 if (PageLRU(page)) {
2579 struct lruvec *lruvec;
2580
2581 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2582 ClearPageLRU(page);
2583 del_page_from_lru_list(page, lruvec, page_lru(page));
2584 *isolated = 1;
2585 } else
2586 *isolated = 0;
2587}
2588
2589static void unlock_page_lru(struct page *page, int isolated)
2590{
2591 pg_data_t *pgdat = page_pgdat(page);
2592
2593 if (isolated) {
2594 struct lruvec *lruvec;
2595
2596 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2597 VM_BUG_ON_PAGE(PageLRU(page), page);
2598 SetPageLRU(page);
2599 add_page_to_lru_list(page, lruvec, page_lru(page));
2600 }
2601 spin_unlock_irq(&pgdat->lru_lock);
2602}
2603
2604static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2605 bool lrucare)
2606{
2607 int isolated;
2608
2609 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2610
2611
2612
2613
2614
2615 if (lrucare)
2616 lock_page_lru(page, &isolated);
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632 page->mem_cgroup = memcg;
2633
2634 if (lrucare)
2635 unlock_page_lru(page, isolated);
2636}
2637
2638#ifdef CONFIG_MEMCG_KMEM
2639static int memcg_alloc_cache_id(void)
2640{
2641 int id, size;
2642 int err;
2643
2644 id = ida_simple_get(&memcg_cache_ida,
2645 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2646 if (id < 0)
2647 return id;
2648
2649 if (id < memcg_nr_cache_ids)
2650 return id;
2651
2652
2653
2654
2655
2656 down_write(&memcg_cache_ids_sem);
2657
2658 size = 2 * (id + 1);
2659 if (size < MEMCG_CACHES_MIN_SIZE)
2660 size = MEMCG_CACHES_MIN_SIZE;
2661 else if (size > MEMCG_CACHES_MAX_SIZE)
2662 size = MEMCG_CACHES_MAX_SIZE;
2663
2664 err = memcg_update_all_caches(size);
2665 if (!err)
2666 err = memcg_update_all_list_lrus(size);
2667 if (!err)
2668 memcg_nr_cache_ids = size;
2669
2670 up_write(&memcg_cache_ids_sem);
2671
2672 if (err) {
2673 ida_simple_remove(&memcg_cache_ida, id);
2674 return err;
2675 }
2676 return id;
2677}
2678
2679static void memcg_free_cache_id(int id)
2680{
2681 ida_simple_remove(&memcg_cache_ida, id);
2682}
2683
2684struct memcg_kmem_cache_create_work {
2685 struct mem_cgroup *memcg;
2686 struct kmem_cache *cachep;
2687 struct work_struct work;
2688};
2689
2690static void memcg_kmem_cache_create_func(struct work_struct *w)
2691{
2692 struct memcg_kmem_cache_create_work *cw =
2693 container_of(w, struct memcg_kmem_cache_create_work, work);
2694 struct mem_cgroup *memcg = cw->memcg;
2695 struct kmem_cache *cachep = cw->cachep;
2696
2697 memcg_create_kmem_cache(memcg, cachep);
2698
2699 css_put(&memcg->css);
2700 kfree(cw);
2701}
2702
2703
2704
2705
2706static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2707 struct kmem_cache *cachep)
2708{
2709 struct memcg_kmem_cache_create_work *cw;
2710
2711 if (!css_tryget_online(&memcg->css))
2712 return;
2713
2714 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2715 if (!cw)
2716 return;
2717
2718 cw->memcg = memcg;
2719 cw->cachep = cachep;
2720 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2721
2722 queue_work(memcg_kmem_cache_wq, &cw->work);
2723}
2724
2725static inline bool memcg_kmem_bypass(void)
2726{
2727 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2728 return true;
2729 return false;
2730}
2731

/**
 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache
 *
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it, we
 * create it asynchronously in a workqueue and let the current allocation
 * go through with the original cache.
 *
 * This function takes a reference to the cache it returns to assure it
 * won't get destroyed while we are working with it. Once the caller is
 * done with it, memcg_kmem_put_cache() must be called to release the
 * reference.
 */
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2749{
2750 struct mem_cgroup *memcg;
2751 struct kmem_cache *memcg_cachep;
2752 struct memcg_cache_array *arr;
2753 int kmemcg_id;
2754
2755 VM_BUG_ON(!is_root_cache(cachep));
2756
2757 if (memcg_kmem_bypass())
2758 return cachep;
2759
2760 rcu_read_lock();
2761
2762 if (unlikely(current->active_memcg))
2763 memcg = current->active_memcg;
2764 else
2765 memcg = mem_cgroup_from_task(current);
2766
2767 if (!memcg || memcg == root_mem_cgroup)
2768 goto out_unlock;
2769
2770 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2771 if (kmemcg_id < 0)
2772 goto out_unlock;
2773
2774 arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2775
2776
2777
2778
2779
2780
2781 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2782
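	/*
	 * If the per-memcg cache doesn't exist yet, schedule its creation
	 * and serve this allocation from the root cache: creating it here
	 * would require slab_mutex, which cannot be taken from this
	 * context.
	 *
	 * If the cache exists but its refcount cannot be acquired, it is
	 * on its way to destruction (the memcg went offline), so fall back
	 * to the root cache as well.
	 */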
2802 if (unlikely(!memcg_cachep))
2803 memcg_schedule_kmem_cache_create(memcg, cachep);
2804 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2805 cachep = memcg_cachep;
2806out_unlock:
2807 rcu_read_unlock();
2808 return cachep;
2809}
2810
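/**
 * memcg_kmem_put_cache: drop the reference taken by memcg_kmem_get_cache
 * @cachep: the cache returned by memcg_kmem_get_cache()
 */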
2815void memcg_kmem_put_cache(struct kmem_cache *cachep)
2816{
2817 if (!is_root_cache(cachep))
2818 percpu_ref_put(&cachep->memcg_params.refcnt);
2819}
2820
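/**
 * __memcg_kmem_charge_memcg: charge a kmem page to the given memcg
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 * @memcg: memory cgroup to charge
 *
 * Returns 0 on success, an error code on failure.
 */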
2830int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2831 struct mem_cgroup *memcg)
2832{
2833 unsigned int nr_pages = 1 << order;
2834 struct page_counter *counter;
2835 int ret;
2836
2837 ret = try_charge(memcg, gfp, nr_pages);
2838 if (ret)
2839 return ret;
2840
2841 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2842 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
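		/*
		 * Enforce __GFP_NOFAIL allocations over the kmem limit,
		 * because callers are not prepared to see failures and
		 * likely have no failure handling code.
		 */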
2849 if (gfp & __GFP_NOFAIL) {
2850 page_counter_charge(&memcg->kmem, nr_pages);
2851 return 0;
2852 }
2853 cancel_charge(memcg, nr_pages);
2854 return -ENOMEM;
2855 }
2856 return 0;
2857}
2858
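/**
 * __memcg_kmem_charge: charge a kmem page to the current task's memcg
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */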
2867int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2868{
2869 struct mem_cgroup *memcg;
2870 int ret = 0;
2871
2872 if (memcg_kmem_bypass())
2873 return 0;
2874
2875 memcg = get_mem_cgroup_from_current();
2876 if (!mem_cgroup_is_root(memcg)) {
2877 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2878 if (!ret) {
2879 page->mem_cgroup = memcg;
2880 __SetPageKmemcg(page);
2881 }
2882 }
2883 css_put(&memcg->css);
2884 return ret;
2885}
2886
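/**
 * __memcg_kmem_uncharge_memcg: uncharge kmem pages from a memcg
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */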
2892void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
2893 unsigned int nr_pages)
2894{
2895 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2896 page_counter_uncharge(&memcg->kmem, nr_pages);
2897
2898 page_counter_uncharge(&memcg->memory, nr_pages);
2899 if (do_memsw_account())
2900 page_counter_uncharge(&memcg->memsw, nr_pages);
2901}
2902
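/**
 * __memcg_kmem_uncharge: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */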
2907void __memcg_kmem_uncharge(struct page *page, int order)
2908{
2909 struct mem_cgroup *memcg = page->mem_cgroup;
2910 unsigned int nr_pages = 1 << order;
2911
2912 if (!memcg)
2913 return;
2914
2915 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2916 __memcg_kmem_uncharge_memcg(memcg, nr_pages);
2917 page->mem_cgroup = NULL;
2918
2919
2920 if (PageKmemcg(page))
2921 __ClearPageKmemcg(page);
2922
2923 css_put_many(&memcg->css, nr_pages);
2924}
2925#endif
2926
2927#ifdef CONFIG_TRANSPARENT_HUGEPAGE
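/*
 * When a huge page is split, copy the head page's memcg binding to every
 * tail page and drop the MEMCG_RSS_HUGE accounting for the head, since
 * the memory is now tracked as base pages.
 */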
2933void mem_cgroup_split_huge_fixup(struct page *head)
2934{
2935 int i;
2936
2937 if (mem_cgroup_disabled())
2938 return;
2939
2940 for (i = 1; i < HPAGE_PMD_NR; i++)
2941 head[i].mem_cgroup = head->mem_cgroup;
2942
2943 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2944}
2945#endif
2946
2947#ifdef CONFIG_MEMCG_SWAP
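/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from: mem_cgroup which the entry is moved from
 * @to: mem_cgroup which the entry is moved to
 *
 * The swap_cgroup record is updated only if it currently points at @from;
 * on success the MEMCG_SWAP statistics are transferred as well.
 *
 * Returns 0 on success, -EINVAL if the record did not match @from.
 */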
2962static int mem_cgroup_move_swap_account(swp_entry_t entry,
2963 struct mem_cgroup *from, struct mem_cgroup *to)
2964{
2965 unsigned short old_id, new_id;
2966
2967 old_id = mem_cgroup_id(from);
2968 new_id = mem_cgroup_id(to);
2969
2970 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2971 mod_memcg_state(from, MEMCG_SWAP, -1);
2972 mod_memcg_state(to, MEMCG_SWAP, 1);
2973 return 0;
2974 }
2975 return -EINVAL;
2976}
2977#else
2978static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2979 struct mem_cgroup *from, struct mem_cgroup *to)
2980{
2981 return -EINVAL;
2982}
2983#endif
2984
2985static DEFINE_MUTEX(memcg_max_mutex);
2986
2987static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2988 unsigned long max, bool memsw)
2989{
2990 bool enlarge = false;
2991 bool drained = false;
2992 int ret;
2993 bool limits_invariant;
2994 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2995
2996 do {
2997 if (signal_pending(current)) {
2998 ret = -EINTR;
2999 break;
3000 }
3001
3002 mutex_lock(&memcg_max_mutex);
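		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */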
3007 limits_invariant = memsw ? max >= memcg->memory.max :
3008 max <= memcg->memsw.max;
3009 if (!limits_invariant) {
3010 mutex_unlock(&memcg_max_mutex);
3011 ret = -EINVAL;
3012 break;
3013 }
3014 if (max > counter->max)
3015 enlarge = true;
3016 ret = page_counter_set_max(counter, max);
3017 mutex_unlock(&memcg_max_mutex);
3018
3019 if (!ret)
3020 break;
3021
3022 if (!drained) {
3023 drain_all_stock(memcg);
3024 drained = true;
3025 continue;
3026 }
3027
3028 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3029 GFP_KERNEL, !memsw)) {
3030 ret = -EBUSY;
3031 break;
3032 }
3033 } while (true);
3034
3035 if (!ret && enlarge)
3036 memcg_oom_recover(memcg);
3037
3038 return ret;
3039}
3040
3041unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3042 gfp_t gfp_mask,
3043 unsigned long *total_scanned)
3044{
3045 unsigned long nr_reclaimed = 0;
3046 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3047 unsigned long reclaimed;
3048 int loop = 0;
3049 struct mem_cgroup_tree_per_node *mctz;
3050 unsigned long excess;
3051 unsigned long nr_scanned;
3052
3053 if (order > 0)
3054 return 0;
3055
3056 mctz = soft_limit_tree_node(pgdat->node_id);
3057
3058
3059
3060
3061
3062
3063 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3064 return 0;
3065
3066
3067
3068
3069
3070
3071 do {
3072 if (next_mz)
3073 mz = next_mz;
3074 else
3075 mz = mem_cgroup_largest_soft_limit_node(mctz);
3076 if (!mz)
3077 break;
3078
3079 nr_scanned = 0;
3080 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3081 gfp_mask, &nr_scanned);
3082 nr_reclaimed += reclaimed;
3083 *total_scanned += nr_scanned;
3084 spin_lock_irq(&mctz->lock);
3085 __mem_cgroup_remove_exceeded(mz, mctz);
3086
3087
3088
3089
3090
3091 next_mz = NULL;
3092 if (!reclaimed)
3093 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3094
3095 excess = soft_limit_excess(mz->memcg);
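		/*
		 * Put the memcg back on the tree with its updated excess even
		 * if nothing was reclaimed this round: reclaim may simply not
		 * have had enough time (e.g. dirty or writeback pages), and
		 * dropping the memcg from the tree would exempt it from
		 * further soft limit reclaim.
		 */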
3105 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3106 spin_unlock_irq(&mctz->lock);
3107 css_put(&mz->memcg->css);
3108 loop++;
3109
3110
3111
3112
3113
3114 if (!nr_reclaimed &&
3115 (next_mz == NULL ||
3116 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3117 break;
3118 } while (!nr_reclaimed);
3119 if (next_mz)
3120 css_put(&next_mz->memcg->css);
3121 return nr_reclaimed;
3122}
3123
3124
3125
3126
3127
3128
3129
3130static inline bool memcg_has_children(struct mem_cgroup *memcg)
3131{
3132 bool ret;
3133
3134 rcu_read_lock();
3135 ret = css_next_child(NULL, &memcg->css);
3136 rcu_read_unlock();
3137 return ret;
3138}
3139
3140
3141
3142
3143
3144
3145static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3146{
3147 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3148
3149
3150 lru_add_drain_all();
3151
3152 drain_all_stock(memcg);
3153
3154
3155 while (nr_retries && page_counter_read(&memcg->memory)) {
3156 int progress;
3157
3158 if (signal_pending(current))
3159 return -EINTR;
3160
3161 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3162 GFP_KERNEL, true);
3163 if (!progress) {
3164 nr_retries--;
3165
3166 congestion_wait(BLK_RW_ASYNC, HZ/10);
3167 }
3168
3169 }
3170
3171 return 0;
3172}
3173
3174static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3175 char *buf, size_t nbytes,
3176 loff_t off)
3177{
3178 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3179
3180 if (mem_cgroup_is_root(memcg))
3181 return -EINVAL;
3182 return mem_cgroup_force_empty(memcg) ?: nbytes;
3183}
3184
3185static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3186 struct cftype *cft)
3187{
3188 return mem_cgroup_from_css(css)->use_hierarchy;
3189}
3190
3191static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3192 struct cftype *cft, u64 val)
3193{
3194 int retval = 0;
3195 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3196 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3197
3198 if (memcg->use_hierarchy == val)
3199 return 0;
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3210 (val == 1 || val == 0)) {
3211 if (!memcg_has_children(memcg))
3212 memcg->use_hierarchy = val;
3213 else
3214 retval = -EBUSY;
3215 } else
3216 retval = -EINVAL;
3217
3218 return retval;
3219}
3220
3221static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3222{
3223 unsigned long val;
3224
3225 if (mem_cgroup_is_root(memcg)) {
3226 val = memcg_page_state(memcg, MEMCG_CACHE) +
3227 memcg_page_state(memcg, MEMCG_RSS);
3228 if (swap)
3229 val += memcg_page_state(memcg, MEMCG_SWAP);
3230 } else {
3231 if (!swap)
3232 val = page_counter_read(&memcg->memory);
3233 else
3234 val = page_counter_read(&memcg->memsw);
3235 }
3236 return val;
3237}
3238
3239enum {
3240 RES_USAGE,
3241 RES_LIMIT,
3242 RES_MAX_USAGE,
3243 RES_FAILCNT,
3244 RES_SOFT_LIMIT,
3245};
3246
3247static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3248 struct cftype *cft)
3249{
3250 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3251 struct page_counter *counter;
3252
3253 switch (MEMFILE_TYPE(cft->private)) {
3254 case _MEM:
3255 counter = &memcg->memory;
3256 break;
3257 case _MEMSWAP:
3258 counter = &memcg->memsw;
3259 break;
3260 case _KMEM:
3261 counter = &memcg->kmem;
3262 break;
3263 case _TCP:
3264 counter = &memcg->tcpmem;
3265 break;
3266 default:
3267 BUG();
3268 }
3269
3270 switch (MEMFILE_ATTR(cft->private)) {
3271 case RES_USAGE:
3272 if (counter == &memcg->memory)
3273 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3274 if (counter == &memcg->memsw)
3275 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3276 return (u64)page_counter_read(counter) * PAGE_SIZE;
3277 case RES_LIMIT:
3278 return (u64)counter->max * PAGE_SIZE;
3279 case RES_MAX_USAGE:
3280 return (u64)counter->watermark * PAGE_SIZE;
3281 case RES_FAILCNT:
3282 return counter->failcnt;
3283 case RES_SOFT_LIMIT:
3284 return (u64)memcg->soft_limit * PAGE_SIZE;
3285 default:
3286 BUG();
3287 }
3288}
3289
3290static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3291{
3292 unsigned long stat[MEMCG_NR_STAT] = {0};
3293 struct mem_cgroup *mi;
3294 int node, cpu, i;
3295
3296 for_each_online_cpu(cpu)
3297 for (i = 0; i < MEMCG_NR_STAT; i++)
3298 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3299
3300 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3301 for (i = 0; i < MEMCG_NR_STAT; i++)
3302 atomic_long_add(stat[i], &mi->vmstats[i]);
3303
3304 for_each_node(node) {
3305 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3306 struct mem_cgroup_per_node *pi;
3307
3308 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3309 stat[i] = 0;
3310
3311 for_each_online_cpu(cpu)
3312 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3313 stat[i] += per_cpu(
3314 pn->lruvec_stat_cpu->count[i], cpu);
3315
3316 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3317 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3318 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3319 }
3320}
3321
3322static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3323{
3324 unsigned long events[NR_VM_EVENT_ITEMS];
3325 struct mem_cgroup *mi;
3326 int cpu, i;
3327
3328 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3329 events[i] = 0;
3330
3331 for_each_online_cpu(cpu)
3332 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3333 events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3334 cpu);
3335
3336 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3337 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3338 atomic_long_add(events[i], &mi->vmevents[i]);
3339}
3340
3341#ifdef CONFIG_MEMCG_KMEM
3342static int memcg_online_kmem(struct mem_cgroup *memcg)
3343{
3344 int memcg_id;
3345
3346 if (cgroup_memory_nokmem)
3347 return 0;
3348
3349 BUG_ON(memcg->kmemcg_id >= 0);
3350 BUG_ON(memcg->kmem_state);
3351
3352 memcg_id = memcg_alloc_cache_id();
3353 if (memcg_id < 0)
3354 return memcg_id;
3355
3356 static_branch_inc(&memcg_kmem_enabled_key);
3357
3358
3359
3360
3361
3362
3363 memcg->kmemcg_id = memcg_id;
3364 memcg->kmem_state = KMEM_ONLINE;
3365 INIT_LIST_HEAD(&memcg->kmem_caches);
3366
3367 return 0;
3368}
3369
3370static void memcg_offline_kmem(struct mem_cgroup *memcg)
3371{
3372 struct cgroup_subsys_state *css;
3373 struct mem_cgroup *parent, *child;
3374 int kmemcg_id;
3375
3376 if (memcg->kmem_state != KMEM_ONLINE)
3377 return;
3378
3379
3380
3381
3382
3383
3384 memcg->kmem_state = KMEM_ALLOCATED;
3385
3386 parent = parent_mem_cgroup(memcg);
3387 if (!parent)
3388 parent = root_mem_cgroup;
3389
3390
3391
3392
3393 memcg_deactivate_kmem_caches(memcg, parent);
3394
3395 kmemcg_id = memcg->kmemcg_id;
3396 BUG_ON(kmemcg_id < 0);
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406 rcu_read_lock();
3407 css_for_each_descendant_pre(css, &memcg->css) {
3408 child = mem_cgroup_from_css(css);
3409 BUG_ON(child->kmemcg_id != kmemcg_id);
3410 child->kmemcg_id = parent->kmemcg_id;
3411 if (!memcg->use_hierarchy)
3412 break;
3413 }
3414 rcu_read_unlock();
3415
3416 memcg_drain_all_list_lrus(kmemcg_id, parent);
3417
3418 memcg_free_cache_id(kmemcg_id);
3419}
3420
3421static void memcg_free_kmem(struct mem_cgroup *memcg)
3422{
3423
3424 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3425 memcg_offline_kmem(memcg);
3426
3427 if (memcg->kmem_state == KMEM_ALLOCATED) {
3428 WARN_ON(!list_empty(&memcg->kmem_caches));
3429 static_branch_dec(&memcg_kmem_enabled_key);
3430 }
3431}
3432#else
3433static int memcg_online_kmem(struct mem_cgroup *memcg)
3434{
3435 return 0;
3436}
3437static void memcg_offline_kmem(struct mem_cgroup *memcg)
3438{
3439}
3440static void memcg_free_kmem(struct mem_cgroup *memcg)
3441{
3442}
3443#endif
3444
3445static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3446 unsigned long max)
3447{
3448 int ret;
3449
3450 mutex_lock(&memcg_max_mutex);
3451 ret = page_counter_set_max(&memcg->kmem, max);
3452 mutex_unlock(&memcg_max_mutex);
3453 return ret;
3454}
3455
3456static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3457{
3458 int ret;
3459
3460 mutex_lock(&memcg_max_mutex);
3461
3462 ret = page_counter_set_max(&memcg->tcpmem, max);
3463 if (ret)
3464 goto out;
3465
3466 if (!memcg->tcpmem_active) {
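		/*
		 * Enable the static branch before setting tcpmem_active:
		 * sockets are only marked as accounted once the flag is up,
		 * which guarantees the accounting functions are patched in
		 * by then and no charges are lost.
		 */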
3483 static_branch_inc(&memcg_sockets_enabled_key);
3484 memcg->tcpmem_active = true;
3485 }
3486out:
3487 mutex_unlock(&memcg_max_mutex);
3488 return ret;
3489}
3490
3491
3492
3493
3494
3495static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3496 char *buf, size_t nbytes, loff_t off)
3497{
3498 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3499 unsigned long nr_pages;
3500 int ret;
3501
3502 buf = strstrip(buf);
3503 ret = page_counter_memparse(buf, "-1", &nr_pages);
3504 if (ret)
3505 return ret;
3506
3507 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3508 case RES_LIMIT:
3509 if (mem_cgroup_is_root(memcg)) {
3510 ret = -EINVAL;
3511 break;
3512 }
3513 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3514 case _MEM:
3515 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3516 break;
3517 case _MEMSWAP:
3518 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3519 break;
3520 case _KMEM:
3521 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3522 "Please report your usecase to linux-mm@kvack.org if you "
3523 "depend on this functionality.\n");
3524 ret = memcg_update_kmem_max(memcg, nr_pages);
3525 break;
3526 case _TCP:
3527 ret = memcg_update_tcp_max(memcg, nr_pages);
3528 break;
3529 }
3530 break;
3531 case RES_SOFT_LIMIT:
3532 memcg->soft_limit = nr_pages;
3533 ret = 0;
3534 break;
3535 }
3536 return ret ?: nbytes;
3537}
3538
3539static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3540 size_t nbytes, loff_t off)
3541{
3542 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3543 struct page_counter *counter;
3544
3545 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3546 case _MEM:
3547 counter = &memcg->memory;
3548 break;
3549 case _MEMSWAP:
3550 counter = &memcg->memsw;
3551 break;
3552 case _KMEM:
3553 counter = &memcg->kmem;
3554 break;
3555 case _TCP:
3556 counter = &memcg->tcpmem;
3557 break;
3558 default:
3559 BUG();
3560 }
3561
3562 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3563 case RES_MAX_USAGE:
3564 page_counter_reset_watermark(counter);
3565 break;
3566 case RES_FAILCNT:
3567 counter->failcnt = 0;
3568 break;
3569 default:
3570 BUG();
3571 }
3572
3573 return nbytes;
3574}
3575
3576static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3577 struct cftype *cft)
3578{
3579 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3580}
3581
3582#ifdef CONFIG_MMU
3583static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3584 struct cftype *cft, u64 val)
3585{
3586 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3587
3588 if (val & ~MOVE_MASK)
3589 return -EINVAL;
3590
3591
3592
3593
3594
3595
3596
3597 memcg->move_charge_at_immigrate = val;
3598 return 0;
3599}
3600#else
3601static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3602 struct cftype *cft, u64 val)
3603{
3604 return -ENOSYS;
3605}
3606#endif
3607
3608#ifdef CONFIG_NUMA
3609
3610#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3611#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3612#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3613
3614static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3615 int nid, unsigned int lru_mask)
3616{
3617 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3618 unsigned long nr = 0;
3619 enum lru_list lru;
3620
3621 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3622
3623 for_each_lru(lru) {
3624 if (!(BIT(lru) & lru_mask))
3625 continue;
3626 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3627 }
3628 return nr;
3629}
3630
3631static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3632 unsigned int lru_mask)
3633{
3634 unsigned long nr = 0;
3635 enum lru_list lru;
3636
3637 for_each_lru(lru) {
3638 if (!(BIT(lru) & lru_mask))
3639 continue;
3640 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3641 }
3642 return nr;
3643}
3644
3645static int memcg_numa_stat_show(struct seq_file *m, void *v)
3646{
3647 struct numa_stat {
3648 const char *name;
3649 unsigned int lru_mask;
3650 };
3651
3652 static const struct numa_stat stats[] = {
3653 { "total", LRU_ALL },
3654 { "file", LRU_ALL_FILE },
3655 { "anon", LRU_ALL_ANON },
3656 { "unevictable", BIT(LRU_UNEVICTABLE) },
3657 };
3658 const struct numa_stat *stat;
3659 int nid;
3660 unsigned long nr;
3661 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3662
3663 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3664 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3665 seq_printf(m, "%s=%lu", stat->name, nr);
3666 for_each_node_state(nid, N_MEMORY) {
3667 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3668 stat->lru_mask);
3669 seq_printf(m, " N%d=%lu", nid, nr);
3670 }
3671 seq_putc(m, '\n');
3672 }
3673
3674 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3675 struct mem_cgroup *iter;
3676
3677 nr = 0;
3678 for_each_mem_cgroup_tree(iter, memcg)
3679 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3680 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3681 for_each_node_state(nid, N_MEMORY) {
3682 nr = 0;
3683 for_each_mem_cgroup_tree(iter, memcg)
3684 nr += mem_cgroup_node_nr_lru_pages(
3685 iter, nid, stat->lru_mask);
3686 seq_printf(m, " N%d=%lu", nid, nr);
3687 }
3688 seq_putc(m, '\n');
3689 }
3690
3691 return 0;
3692}
3693#endif
3694
3695static const unsigned int memcg1_stats[] = {
3696 MEMCG_CACHE,
3697 MEMCG_RSS,
3698 MEMCG_RSS_HUGE,
3699 NR_SHMEM,
3700 NR_FILE_MAPPED,
3701 NR_FILE_DIRTY,
3702 NR_WRITEBACK,
3703 MEMCG_SWAP,
3704};
3705
3706static const char *const memcg1_stat_names[] = {
3707 "cache",
3708 "rss",
3709 "rss_huge",
3710 "shmem",
3711 "mapped_file",
3712 "dirty",
3713 "writeback",
3714 "swap",
3715};
3716
3717
3718static const unsigned int memcg1_events[] = {
3719 PGPGIN,
3720 PGPGOUT,
3721 PGFAULT,
3722 PGMAJFAULT,
3723};
3724
3725static int memcg_stat_show(struct seq_file *m, void *v)
3726{
3727 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3728 unsigned long memory, memsw;
3729 struct mem_cgroup *mi;
3730 unsigned int i;
3731
3732 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3733
3734 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3735 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3736 continue;
3737 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3738 memcg_page_state_local(memcg, memcg1_stats[i]) *
3739 PAGE_SIZE);
3740 }
3741
3742 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3743 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3744 memcg_events_local(memcg, memcg1_events[i]));
3745
3746 for (i = 0; i < NR_LRU_LISTS; i++)
3747 seq_printf(m, "%s %lu\n", lru_list_name(i),
3748 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3749 PAGE_SIZE);
3750
3751
3752 memory = memsw = PAGE_COUNTER_MAX;
3753 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3754 memory = min(memory, mi->memory.max);
3755 memsw = min(memsw, mi->memsw.max);
3756 }
3757 seq_printf(m, "hierarchical_memory_limit %llu\n",
3758 (u64)memory * PAGE_SIZE);
3759 if (do_memsw_account())
3760 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3761 (u64)memsw * PAGE_SIZE);
3762
3763 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3764 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3765 continue;
3766 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3767 (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3768 PAGE_SIZE);
3769 }
3770
3771 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3772 seq_printf(m, "total_%s %llu\n",
3773 vm_event_name(memcg1_events[i]),
3774 (u64)memcg_events(memcg, memcg1_events[i]));
3775
3776 for (i = 0; i < NR_LRU_LISTS; i++)
3777 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
3778 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3779 PAGE_SIZE);
3780
3781#ifdef CONFIG_DEBUG_VM
3782 {
3783 pg_data_t *pgdat;
3784 struct mem_cgroup_per_node *mz;
3785 struct zone_reclaim_stat *rstat;
3786 unsigned long recent_rotated[2] = {0, 0};
3787 unsigned long recent_scanned[2] = {0, 0};
3788
3789 for_each_online_pgdat(pgdat) {
3790 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3791 rstat = &mz->lruvec.reclaim_stat;
3792
3793 recent_rotated[0] += rstat->recent_rotated[0];
3794 recent_rotated[1] += rstat->recent_rotated[1];
3795 recent_scanned[0] += rstat->recent_scanned[0];
3796 recent_scanned[1] += rstat->recent_scanned[1];
3797 }
3798 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3799 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3800 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3801 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3802 }
3803#endif
3804
3805 return 0;
3806}
3807
3808static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3809 struct cftype *cft)
3810{
3811 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3812
3813 return mem_cgroup_swappiness(memcg);
3814}
3815
3816static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3817 struct cftype *cft, u64 val)
3818{
3819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3820
3821 if (val > 100)
3822 return -EINVAL;
3823
3824 if (css->parent)
3825 memcg->swappiness = val;
3826 else
3827 vm_swappiness = val;
3828
3829 return 0;
3830}
3831
3832static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3833{
3834 struct mem_cgroup_threshold_ary *t;
3835 unsigned long usage;
3836 int i;
3837
3838 rcu_read_lock();
3839 if (!swap)
3840 t = rcu_dereference(memcg->thresholds.primary);
3841 else
3842 t = rcu_dereference(memcg->memsw_thresholds.primary);
3843
3844 if (!t)
3845 goto unlock;
3846
3847 usage = mem_cgroup_usage(memcg, swap);
3848
3849
3850
3851
3852
3853
3854 i = t->current_threshold;
3855
3856
3857
3858
3859
3860
3861
3862 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3863 eventfd_signal(t->entries[i].eventfd, 1);
3864
3865
3866 i++;
3867
3868
3869
3870
3871
3872
3873
3874 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3875 eventfd_signal(t->entries[i].eventfd, 1);
3876
3877
3878 t->current_threshold = i - 1;
3879unlock:
3880 rcu_read_unlock();
3881}
3882
3883static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3884{
3885 while (memcg) {
3886 __mem_cgroup_threshold(memcg, false);
3887 if (do_memsw_account())
3888 __mem_cgroup_threshold(memcg, true);
3889
3890 memcg = parent_mem_cgroup(memcg);
3891 }
3892}
3893
3894static int compare_thresholds(const void *a, const void *b)
3895{
3896 const struct mem_cgroup_threshold *_a = a;
3897 const struct mem_cgroup_threshold *_b = b;
3898
3899 if (_a->threshold > _b->threshold)
3900 return 1;
3901
3902 if (_a->threshold < _b->threshold)
3903 return -1;
3904
3905 return 0;
3906}
3907
3908static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3909{
3910 struct mem_cgroup_eventfd_list *ev;
3911
3912 spin_lock(&memcg_oom_lock);
3913
3914 list_for_each_entry(ev, &memcg->oom_notify, list)
3915 eventfd_signal(ev->eventfd, 1);
3916
3917 spin_unlock(&memcg_oom_lock);
3918 return 0;
3919}
3920
3921static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3922{
3923 struct mem_cgroup *iter;
3924
3925 for_each_mem_cgroup_tree(iter, memcg)
3926 mem_cgroup_oom_notify_cb(iter);
3927}
3928
3929static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3930 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3931{
3932 struct mem_cgroup_thresholds *thresholds;
3933 struct mem_cgroup_threshold_ary *new;
3934 unsigned long threshold;
3935 unsigned long usage;
3936 int i, size, ret;
3937
3938 ret = page_counter_memparse(args, "-1", &threshold);
3939 if (ret)
3940 return ret;
3941
3942 mutex_lock(&memcg->thresholds_lock);
3943
3944 if (type == _MEM) {
3945 thresholds = &memcg->thresholds;
3946 usage = mem_cgroup_usage(memcg, false);
3947 } else if (type == _MEMSWAP) {
3948 thresholds = &memcg->memsw_thresholds;
3949 usage = mem_cgroup_usage(memcg, true);
3950 } else
3951 BUG();
3952
3953
3954 if (thresholds->primary)
3955 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3956
3957 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3958
3959
3960 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
3961 if (!new) {
3962 ret = -ENOMEM;
3963 goto unlock;
3964 }
3965 new->size = size;
3966
3967
3968 if (thresholds->primary) {
3969 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3970 sizeof(struct mem_cgroup_threshold));
3971 }
3972
3973
3974 new->entries[size - 1].eventfd = eventfd;
3975 new->entries[size - 1].threshold = threshold;
3976
3977
3978 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3979 compare_thresholds, NULL);
3980
3981
3982 new->current_threshold = -1;
3983 for (i = 0; i < size; i++) {
3984 if (new->entries[i].threshold <= usage) {
3985
3986
3987
3988
3989
3990 ++new->current_threshold;
3991 } else
3992 break;
3993 }
3994
3995
3996 kfree(thresholds->spare);
3997 thresholds->spare = thresholds->primary;
3998
3999 rcu_assign_pointer(thresholds->primary, new);
4000
4001
4002 synchronize_rcu();
4003
4004unlock:
4005 mutex_unlock(&memcg->thresholds_lock);
4006
4007 return ret;
4008}
4009
4010static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4011 struct eventfd_ctx *eventfd, const char *args)
4012{
4013 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4014}
4015
4016static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4017 struct eventfd_ctx *eventfd, const char *args)
4018{
4019 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4020}
4021
4022static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4023 struct eventfd_ctx *eventfd, enum res_type type)
4024{
4025 struct mem_cgroup_thresholds *thresholds;
4026 struct mem_cgroup_threshold_ary *new;
4027 unsigned long usage;
4028 int i, j, size;
4029
4030 mutex_lock(&memcg->thresholds_lock);
4031
4032 if (type == _MEM) {
4033 thresholds = &memcg->thresholds;
4034 usage = mem_cgroup_usage(memcg, false);
4035 } else if (type == _MEMSWAP) {
4036 thresholds = &memcg->memsw_thresholds;
4037 usage = mem_cgroup_usage(memcg, true);
4038 } else
4039 BUG();
4040
4041 if (!thresholds->primary)
4042 goto unlock;
4043
4044
4045 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4046
4047
4048 size = 0;
4049 for (i = 0; i < thresholds->primary->size; i++) {
4050 if (thresholds->primary->entries[i].eventfd != eventfd)
4051 size++;
4052 }
4053
4054 new = thresholds->spare;
4055
4056
4057 if (!size) {
4058 kfree(new);
4059 new = NULL;
4060 goto swap_buffers;
4061 }
4062
4063 new->size = size;
4064
4065
4066 new->current_threshold = -1;
4067 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4068 if (thresholds->primary->entries[i].eventfd == eventfd)
4069 continue;
4070
4071 new->entries[j] = thresholds->primary->entries[i];
4072 if (new->entries[j].threshold <= usage) {
4073
4074
4075
4076
4077
4078 ++new->current_threshold;
4079 }
4080 j++;
4081 }
4082
4083swap_buffers:
4084
4085 thresholds->spare = thresholds->primary;
4086
4087 rcu_assign_pointer(thresholds->primary, new);
4088
4089
4090 synchronize_rcu();
4091
4092
4093 if (!new) {
4094 kfree(thresholds->spare);
4095 thresholds->spare = NULL;
4096 }
4097unlock:
4098 mutex_unlock(&memcg->thresholds_lock);
4099}
4100
4101static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4102 struct eventfd_ctx *eventfd)
4103{
4104 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4105}
4106
4107static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4108 struct eventfd_ctx *eventfd)
4109{
4110 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4111}
4112
4113static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4114 struct eventfd_ctx *eventfd, const char *args)
4115{
4116 struct mem_cgroup_eventfd_list *event;
4117
4118 event = kmalloc(sizeof(*event), GFP_KERNEL);
4119 if (!event)
4120 return -ENOMEM;
4121
4122 spin_lock(&memcg_oom_lock);
4123
4124 event->eventfd = eventfd;
4125 list_add(&event->list, &memcg->oom_notify);
4126
4127
4128 if (memcg->under_oom)
4129 eventfd_signal(eventfd, 1);
4130 spin_unlock(&memcg_oom_lock);
4131
4132 return 0;
4133}
4134
4135static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4136 struct eventfd_ctx *eventfd)
4137{
4138 struct mem_cgroup_eventfd_list *ev, *tmp;
4139
4140 spin_lock(&memcg_oom_lock);
4141
4142 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4143 if (ev->eventfd == eventfd) {
4144 list_del(&ev->list);
4145 kfree(ev);
4146 }
4147 }
4148
4149 spin_unlock(&memcg_oom_lock);
4150}
4151
4152static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4153{
4154 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4155
4156 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4157 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4158 seq_printf(sf, "oom_kill %lu\n",
4159 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4160 return 0;
4161}
4162
4163static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4164 struct cftype *cft, u64 val)
4165{
4166 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4167
4168
4169 if (!css->parent || !((val == 0) || (val == 1)))
4170 return -EINVAL;
4171
4172 memcg->oom_kill_disable = val;
4173 if (!val)
4174 memcg_oom_recover(memcg);
4175
4176 return 0;
4177}
4178
4179#ifdef CONFIG_CGROUP_WRITEBACK
4180
4181#include <trace/events/writeback.h>
4182
4183static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4184{
4185 return wb_domain_init(&memcg->cgwb_domain, gfp);
4186}
4187
4188static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4189{
4190 wb_domain_exit(&memcg->cgwb_domain);
4191}
4192
4193static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4194{
4195 wb_domain_size_changed(&memcg->cgwb_domain);
4196}
4197
4198struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4199{
4200 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4201
4202 if (!memcg->css.parent)
4203 return NULL;
4204
4205 return &memcg->cgwb_domain;
4206}
4207
4208
4209
4210
4211
4212static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4213{
4214 long x = atomic_long_read(&memcg->vmstats[idx]);
4215 int cpu;
4216
4217 for_each_online_cpu(cpu)
4218 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4219 if (x < 0)
4220 x = 0;
4221 return x;
4222}
4223
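/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors.  Note that this doesn't consider the actual amount of
 * available memory in the system.  The caller should further cap
 * *@pheadroom accordingly.
 */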
4242void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4243 unsigned long *pheadroom, unsigned long *pdirty,
4244 unsigned long *pwriteback)
4245{
4246 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4247 struct mem_cgroup *parent;
4248
4249 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4250
4251
4252 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4253 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4254 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4255 *pheadroom = PAGE_COUNTER_MAX;
4256
4257 while ((parent = parent_mem_cgroup(memcg))) {
4258 unsigned long ceiling = min(memcg->memory.max, memcg->high);
4259 unsigned long used = page_counter_read(&memcg->memory);
4260
4261 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4262 memcg = parent;
4263 }
4264}
4265
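/*
 * Foreign dirty flushing.
 *
 * There is an inherent mismatch between memcg and writeback: the former
 * tracks ownership per page, the latter per inode.  A page can therefore
 * be dirtied against a bdi_writeback (wb) whose memcg differs from the
 * page owner's.  When that keeps happening, the owner's dirty pages are
 * only written back at the foreign wb's pace and the owning cgroup can
 * end up throttled while the underlying device sits mostly idle.
 *
 * To mitigate this, each memcg remembers the most recent foreign wbs
 * that dirtied its pages in cgwb_frn[]
 * (mem_cgroup_track_foreign_dirty_slowpath()) and, when local writeback
 * isn't enough to keep the memcg's memory clean, initiates one-off
 * remote flushes on them (mem_cgroup_flush_foreign()).
 */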
4310void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4311 struct bdi_writeback *wb)
4312{
4313 struct mem_cgroup *memcg = page->mem_cgroup;
4314 struct memcg_cgwb_frn *frn;
4315 u64 now = get_jiffies_64();
4316 u64 oldest_at = now;
4317 int oldest = -1;
4318 int i;
4319
4320 trace_track_foreign_dirty(page, wb);
4321
4322
4323
4324
4325
4326
4327 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4328 frn = &memcg->cgwb_frn[i];
4329 if (frn->bdi_id == wb->bdi->id &&
4330 frn->memcg_id == wb->memcg_css->id)
4331 break;
4332 if (time_before64(frn->at, oldest_at) &&
4333 atomic_read(&frn->done.cnt) == 1) {
4334 oldest = i;
4335 oldest_at = frn->at;
4336 }
4337 }
4338
4339 if (i < MEMCG_CGWB_FRN_CNT) {
4340
4341
4342
4343
4344
4345
4346
4347 unsigned long update_intv =
4348 min_t(unsigned long, HZ,
4349 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4350
4351 if (time_before64(frn->at, now - update_intv))
4352 frn->at = now;
4353 } else if (oldest >= 0) {
4354
4355 frn = &memcg->cgwb_frn[oldest];
4356 frn->bdi_id = wb->bdi->id;
4357 frn->memcg_id = wb->memcg_css->id;
4358 frn->at = now;
4359 }
4360}
4361
4362
4363void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4364{
4365 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4366 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4367 u64 now = jiffies_64;
4368 int i;
4369
4370 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4371 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4372
4373
4374
4375
4376
4377
4378
4379 if (time_after64(frn->at, now - intv) &&
4380 atomic_read(&frn->done.cnt) == 1) {
4381 frn->at = 0;
4382 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4383 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4384 WB_REASON_FOREIGN_FLUSH,
4385 &frn->done);
4386 }
4387 }
4388}
4389
4390#else
4391
4392static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4393{
4394 return 0;
4395}
4396
4397static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4398{
4399}
4400
4401static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4402{
4403}
4404
4405#endif
4406
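/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation for the legacy (v1) hierarchy:
 * userspace registers an eventfd against a memcg control file and is
 * notified on usage threshold, OOM and pressure events.
 */

/*
 * Unregister the event and free its resources.  Runs from a workqueue.
 */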
4425static void memcg_event_remove(struct work_struct *work)
4426{
4427 struct mem_cgroup_event *event =
4428 container_of(work, struct mem_cgroup_event, remove);
4429 struct mem_cgroup *memcg = event->memcg;
4430
4431 remove_wait_queue(event->wqh, &event->wait);
4432
4433 event->unregister_event(memcg, event->eventfd);
4434
4435
4436 eventfd_signal(event->eventfd, 1);
4437
4438 eventfd_ctx_put(event->eventfd);
4439 kfree(event);
4440 css_put(&memcg->css);
4441}
4442
4443
4444
4445
4446
4447
4448static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4449 int sync, void *key)
4450{
4451 struct mem_cgroup_event *event =
4452 container_of(wait, struct mem_cgroup_event, wait);
4453 struct mem_cgroup *memcg = event->memcg;
4454 __poll_t flags = key_to_poll(key);
4455
4456 if (flags & EPOLLHUP) {
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466 spin_lock(&memcg->event_list_lock);
4467 if (!list_empty(&event->list)) {
4468 list_del_init(&event->list);
4469
4470
4471
4472
4473 schedule_work(&event->remove);
4474 }
4475 spin_unlock(&memcg->event_list_lock);
4476 }
4477
4478 return 0;
4479}
4480
4481static void memcg_event_ptable_queue_proc(struct file *file,
4482 wait_queue_head_t *wqh, poll_table *pt)
4483{
4484 struct mem_cgroup_event *event =
4485 container_of(pt, struct mem_cgroup_event, pt);
4486
4487 event->wqh = wqh;
4488 add_wait_queue(wqh, &event->wait);
4489}
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4500 char *buf, size_t nbytes, loff_t off)
4501{
4502 struct cgroup_subsys_state *css = of_css(of);
4503 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4504 struct mem_cgroup_event *event;
4505 struct cgroup_subsys_state *cfile_css;
4506 unsigned int efd, cfd;
4507 struct fd efile;
4508 struct fd cfile;
4509 const char *name;
4510 char *endp;
4511 int ret;
4512
4513 buf = strstrip(buf);
4514
4515 efd = simple_strtoul(buf, &endp, 10);
4516 if (*endp != ' ')
4517 return -EINVAL;
4518 buf = endp + 1;
4519
4520 cfd = simple_strtoul(buf, &endp, 10);
4521 if ((*endp != ' ') && (*endp != '\0'))
4522 return -EINVAL;
4523 buf = endp + 1;
4524
4525 event = kzalloc(sizeof(*event), GFP_KERNEL);
4526 if (!event)
4527 return -ENOMEM;
4528
4529 event->memcg = memcg;
4530 INIT_LIST_HEAD(&event->list);
4531 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4532 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4533 INIT_WORK(&event->remove, memcg_event_remove);
4534
4535 efile = fdget(efd);
4536 if (!efile.file) {
4537 ret = -EBADF;
4538 goto out_kfree;
4539 }
4540
4541 event->eventfd = eventfd_ctx_fileget(efile.file);
4542 if (IS_ERR(event->eventfd)) {
4543 ret = PTR_ERR(event->eventfd);
4544 goto out_put_efile;
4545 }
4546
4547 cfile = fdget(cfd);
4548 if (!cfile.file) {
4549 ret = -EBADF;
4550 goto out_put_eventfd;
4551 }
4552
4553
4554
4555 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4556 if (ret < 0)
4557 goto out_put_cfile;
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567 name = cfile.file->f_path.dentry->d_name.name;
4568
4569 if (!strcmp(name, "memory.usage_in_bytes")) {
4570 event->register_event = mem_cgroup_usage_register_event;
4571 event->unregister_event = mem_cgroup_usage_unregister_event;
4572 } else if (!strcmp(name, "memory.oom_control")) {
4573 event->register_event = mem_cgroup_oom_register_event;
4574 event->unregister_event = mem_cgroup_oom_unregister_event;
4575 } else if (!strcmp(name, "memory.pressure_level")) {
4576 event->register_event = vmpressure_register_event;
4577 event->unregister_event = vmpressure_unregister_event;
4578 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4579 event->register_event = memsw_cgroup_usage_register_event;
4580 event->unregister_event = memsw_cgroup_usage_unregister_event;
4581 } else {
4582 ret = -EINVAL;
4583 goto out_put_cfile;
4584 }
4585
4586
4587
4588
4589
4590
4591 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4592 &memory_cgrp_subsys);
4593 ret = -EINVAL;
4594 if (IS_ERR(cfile_css))
4595 goto out_put_cfile;
4596 if (cfile_css != css) {
4597 css_put(cfile_css);
4598 goto out_put_cfile;
4599 }
4600
4601 ret = event->register_event(memcg, event->eventfd, buf);
4602 if (ret)
4603 goto out_put_css;
4604
4605 vfs_poll(efile.file, &event->pt);
4606
4607 spin_lock(&memcg->event_list_lock);
4608 list_add(&event->list, &memcg->event_list);
4609 spin_unlock(&memcg->event_list_lock);
4610
4611 fdput(cfile);
4612 fdput(efile);
4613
4614 return nbytes;
4615
4616out_put_css:
4617 css_put(css);
4618out_put_cfile:
4619 fdput(cfile);
4620out_put_eventfd:
4621 eventfd_ctx_put(event->eventfd);
4622out_put_efile:
4623 fdput(efile);
4624out_kfree:
4625 kfree(event);
4626
4627 return ret;
4628}
4629
4630static struct cftype mem_cgroup_legacy_files[] = {
4631 {
4632 .name = "usage_in_bytes",
4633 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4634 .read_u64 = mem_cgroup_read_u64,
4635 },
4636 {
4637 .name = "max_usage_in_bytes",
4638 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4639 .write = mem_cgroup_reset,
4640 .read_u64 = mem_cgroup_read_u64,
4641 },
4642 {
4643 .name = "limit_in_bytes",
4644 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4645 .write = mem_cgroup_write,
4646 .read_u64 = mem_cgroup_read_u64,
4647 },
4648 {
4649 .name = "soft_limit_in_bytes",
4650 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4651 .write = mem_cgroup_write,
4652 .read_u64 = mem_cgroup_read_u64,
4653 },
4654 {
4655 .name = "failcnt",
4656 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4657 .write = mem_cgroup_reset,
4658 .read_u64 = mem_cgroup_read_u64,
4659 },
4660 {
4661 .name = "stat",
4662 .seq_show = memcg_stat_show,
4663 },
4664 {
4665 .name = "force_empty",
4666 .write = mem_cgroup_force_empty_write,
4667 },
4668 {
4669 .name = "use_hierarchy",
4670 .write_u64 = mem_cgroup_hierarchy_write,
4671 .read_u64 = mem_cgroup_hierarchy_read,
4672 },
4673 {
4674 .name = "cgroup.event_control",
4675 .write = memcg_write_event_control,
4676 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4677 },
4678 {
4679 .name = "swappiness",
4680 .read_u64 = mem_cgroup_swappiness_read,
4681 .write_u64 = mem_cgroup_swappiness_write,
4682 },
4683 {
4684 .name = "move_charge_at_immigrate",
4685 .read_u64 = mem_cgroup_move_charge_read,
4686 .write_u64 = mem_cgroup_move_charge_write,
4687 },
4688 {
4689 .name = "oom_control",
4690 .seq_show = mem_cgroup_oom_control_read,
4691 .write_u64 = mem_cgroup_oom_control_write,
4692 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4693 },
4694 {
4695 .name = "pressure_level",
4696 },
4697#ifdef CONFIG_NUMA
4698 {
4699 .name = "numa_stat",
4700 .seq_show = memcg_numa_stat_show,
4701 },
4702#endif
4703 {
4704 .name = "kmem.limit_in_bytes",
4705 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4706 .write = mem_cgroup_write,
4707 .read_u64 = mem_cgroup_read_u64,
4708 },
4709 {
4710 .name = "kmem.usage_in_bytes",
4711 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4712 .read_u64 = mem_cgroup_read_u64,
4713 },
4714 {
4715 .name = "kmem.failcnt",
4716 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4717 .write = mem_cgroup_reset,
4718 .read_u64 = mem_cgroup_read_u64,
4719 },
4720 {
4721 .name = "kmem.max_usage_in_bytes",
4722 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4723 .write = mem_cgroup_reset,
4724 .read_u64 = mem_cgroup_read_u64,
4725 },
4726#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4727 {
4728 .name = "kmem.slabinfo",
4729 .seq_start = memcg_slab_start,
4730 .seq_next = memcg_slab_next,
4731 .seq_stop = memcg_slab_stop,
4732 .seq_show = memcg_slab_show,
4733 },
4734#endif
4735 {
4736 .name = "kmem.tcp.limit_in_bytes",
4737 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4738 .write = mem_cgroup_write,
4739 .read_u64 = mem_cgroup_read_u64,
4740 },
4741 {
4742 .name = "kmem.tcp.usage_in_bytes",
4743 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4744 .read_u64 = mem_cgroup_read_u64,
4745 },
4746 {
4747 .name = "kmem.tcp.failcnt",
4748 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4749 .write = mem_cgroup_reset,
4750 .read_u64 = mem_cgroup_read_u64,
4751 },
4752 {
4753 .name = "kmem.tcp.max_usage_in_bytes",
4754 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4755 .write = mem_cgroup_reset,
4756 .read_u64 = mem_cgroup_read_u64,
4757 },
4758 { },
4759};
4760
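/*
 * Private memory cgroup IDR.
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, which limits the number of
 * memory-controlled cgroups.  Many long-lived references to an offline
 * css (page cache, reclaimable slab) don't actually need the ID, so the
 * ID's lifetime is refcounted separately (memcg->id.ref) and the ID is
 * released as soon as the last user that needs it is gone, keeping dead
 * cgroups from exhausting the ID space.
 */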
4785static DEFINE_IDR(mem_cgroup_idr);
4786
4787static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4788{
4789 if (memcg->id.id > 0) {
4790 idr_remove(&mem_cgroup_idr, memcg->id.id);
4791 memcg->id.id = 0;
4792 }
4793}
4794
4795static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4796{
4797 refcount_add(n, &memcg->id.ref);
4798}
4799
4800static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4801{
4802 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4803 mem_cgroup_id_remove(memcg);
4804
4805
4806 css_put(&memcg->css);
4807 }
4808}
4809
4810static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4811{
4812 mem_cgroup_id_put_many(memcg, 1);
4813}
4814
4815
4816
4817
4818
4819
4820
4821struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4822{
4823 WARN_ON_ONCE(!rcu_read_lock_held());
4824 return idr_find(&mem_cgroup_idr, id);
4825}
4826
4827static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4828{
4829 struct mem_cgroup_per_node *pn;
4830 int tmp = node;
4831
4832
4833
4834
4835
4836
4837
4838
4839 if (!node_state(node, N_NORMAL_MEMORY))
4840 tmp = -1;
4841 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4842 if (!pn)
4843 return 1;
4844
4845 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4846 if (!pn->lruvec_stat_local) {
4847 kfree(pn);
4848 return 1;
4849 }
4850
4851 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4852 if (!pn->lruvec_stat_cpu) {
4853 free_percpu(pn->lruvec_stat_local);
4854 kfree(pn);
4855 return 1;
4856 }
4857
4858 lruvec_init(&pn->lruvec);
4859 pn->usage_in_excess = 0;
4860 pn->on_tree = false;
4861 pn->memcg = memcg;
4862
4863 memcg->nodeinfo[node] = pn;
4864 return 0;
4865}
4866
4867static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4868{
4869 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4870
4871 if (!pn)
4872 return;
4873
4874 free_percpu(pn->lruvec_stat_cpu);
4875 free_percpu(pn->lruvec_stat_local);
4876 kfree(pn);
4877}
4878
4879static void __mem_cgroup_free(struct mem_cgroup *memcg)
4880{
4881 int node;
4882
4883 for_each_node(node)
4884 free_mem_cgroup_per_node_info(memcg, node);
4885 free_percpu(memcg->vmstats_percpu);
4886 free_percpu(memcg->vmstats_local);
4887 kfree(memcg);
4888}
4889
4890static void mem_cgroup_free(struct mem_cgroup *memcg)
4891{
4892 memcg_wb_domain_exit(memcg);
4893
4894
4895
4896
4897 memcg_flush_percpu_vmstats(memcg);
4898 memcg_flush_percpu_vmevents(memcg);
4899 __mem_cgroup_free(memcg);
4900}
4901
4902static struct mem_cgroup *mem_cgroup_alloc(void)
4903{
4904 struct mem_cgroup *memcg;
4905 unsigned int size;
4906 int node;
4907 int __maybe_unused i;
4908
4909 size = sizeof(struct mem_cgroup);
4910 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4911
4912 memcg = kzalloc(size, GFP_KERNEL);
4913 if (!memcg)
4914 return NULL;
4915
4916 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4917 1, MEM_CGROUP_ID_MAX,
4918 GFP_KERNEL);
4919 if (memcg->id.id < 0)
4920 goto fail;
4921
4922 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
4923 if (!memcg->vmstats_local)
4924 goto fail;
4925
4926 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
4927 if (!memcg->vmstats_percpu)
4928 goto fail;
4929
4930 for_each_node(node)
4931 if (alloc_mem_cgroup_per_node_info(memcg, node))
4932 goto fail;
4933
4934 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4935 goto fail;
4936
4937 INIT_WORK(&memcg->high_work, high_work_func);
4938 INIT_LIST_HEAD(&memcg->oom_notify);
4939 mutex_init(&memcg->thresholds_lock);
4940 spin_lock_init(&memcg->move_lock);
4941 vmpressure_init(&memcg->vmpressure);
4942 INIT_LIST_HEAD(&memcg->event_list);
4943 spin_lock_init(&memcg->event_list_lock);
4944 memcg->socket_pressure = jiffies;
4945#ifdef CONFIG_MEMCG_KMEM
4946 memcg->kmemcg_id = -1;
4947#endif
4948#ifdef CONFIG_CGROUP_WRITEBACK
4949 INIT_LIST_HEAD(&memcg->cgwb_list);
4950 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
4951 memcg->cgwb_frn[i].done =
4952 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
4953#endif
4954#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4955 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
4956 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
4957 memcg->deferred_split_queue.split_queue_len = 0;
4958#endif
4959 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4960 return memcg;
4961fail:
4962 mem_cgroup_id_remove(memcg);
4963 __mem_cgroup_free(memcg);
4964 return NULL;
4965}
4966
4967static struct cgroup_subsys_state * __ref
4968mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4969{
4970 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4971 struct mem_cgroup *memcg;
4972 long error = -ENOMEM;
4973
4974 memcg = mem_cgroup_alloc();
4975 if (!memcg)
4976 return ERR_PTR(error);
4977
4978 memcg->high = PAGE_COUNTER_MAX;
4979 memcg->soft_limit = PAGE_COUNTER_MAX;
4980 if (parent) {
4981 memcg->swappiness = mem_cgroup_swappiness(parent);
4982 memcg->oom_kill_disable = parent->oom_kill_disable;
4983 }
4984 if (parent && parent->use_hierarchy) {
4985 memcg->use_hierarchy = true;
4986 page_counter_init(&memcg->memory, &parent->memory);
4987 page_counter_init(&memcg->swap, &parent->swap);
4988 page_counter_init(&memcg->memsw, &parent->memsw);
4989 page_counter_init(&memcg->kmem, &parent->kmem);
4990 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4991 } else {
4992 page_counter_init(&memcg->memory, NULL);
4993 page_counter_init(&memcg->swap, NULL);
4994 page_counter_init(&memcg->memsw, NULL);
4995 page_counter_init(&memcg->kmem, NULL);
4996 page_counter_init(&memcg->tcpmem, NULL);
4997
4998
4999
5000
5001
5002 if (parent != root_mem_cgroup)
5003 memory_cgrp_subsys.broken_hierarchy = true;
5004 }
5005
5006
5007 if (!parent) {
5008#ifdef CONFIG_MEMCG_KMEM
5009 INIT_LIST_HEAD(&memcg->kmem_caches);
5010#endif
5011 root_mem_cgroup = memcg;
5012 return &memcg->css;
5013 }
5014
5015 error = memcg_online_kmem(memcg);
5016 if (error)
5017 goto fail;
5018
5019 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5020 static_branch_inc(&memcg_sockets_enabled_key);
5021
5022 return &memcg->css;
5023fail:
5024 mem_cgroup_id_remove(memcg);
5025 mem_cgroup_free(memcg);
5026 return ERR_PTR(-ENOMEM);
5027}
5028
5029static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5030{
5031 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5032
5033
5034
5035
5036
5037
5038 if (memcg_alloc_shrinker_maps(memcg)) {
5039 mem_cgroup_id_remove(memcg);
5040 return -ENOMEM;
5041 }
5042
5043
5044 refcount_set(&memcg->id.ref, 1);
5045 css_get(css);
5046 return 0;
5047}
5048
5049static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5050{
5051 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5052 struct mem_cgroup_event *event, *tmp;
5053
5054
5055
5056
5057
5058
5059 spin_lock(&memcg->event_list_lock);
5060 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5061 list_del_init(&event->list);
5062 schedule_work(&event->remove);
5063 }
5064 spin_unlock(&memcg->event_list_lock);
5065
5066 page_counter_set_min(&memcg->memory, 0);
5067 page_counter_set_low(&memcg->memory, 0);
5068
5069 memcg_offline_kmem(memcg);
5070 wb_memcg_offline(memcg);
5071
5072 drain_all_stock(memcg);
5073
5074 mem_cgroup_id_put(memcg);
5075}
5076
5077static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5078{
5079 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5080
5081 invalidate_reclaim_iterators(memcg);
5082}
5083
5084static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5085{
5086 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5087 int __maybe_unused i;
5088
5089#ifdef CONFIG_CGROUP_WRITEBACK
5090 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5091 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5092#endif
5093 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5094 static_branch_dec(&memcg_sockets_enabled_key);
5095
5096 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5097 static_branch_dec(&memcg_sockets_enabled_key);
5098
5099 vmpressure_cleanup(&memcg->vmpressure);
5100 cancel_work_sync(&memcg->high_work);
5101 mem_cgroup_remove_from_trees(memcg);
5102 memcg_free_shrinker_maps(memcg);
5103 memcg_free_kmem(memcg);
5104 mem_cgroup_free(memcg);
5105}
5106
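/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */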
5120static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5121{
5122 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5123
5124 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5125 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5126 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5127 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5128 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5129 page_counter_set_min(&memcg->memory, 0);
5130 page_counter_set_low(&memcg->memory, 0);
5131 memcg->high = PAGE_COUNTER_MAX;
5132 memcg->soft_limit = PAGE_COUNTER_MAX;
5133 memcg_wb_domain_size_changed(memcg);
5134}
5135
5136#ifdef CONFIG_MMU
5137
5138static int mem_cgroup_do_precharge(unsigned long count)
5139{
5140 int ret;
5141
5142
5143 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5144 if (!ret) {
5145 mc.precharge += count;
5146 return ret;
5147 }
5148
5149
5150 while (count--) {
5151 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5152 if (ret)
5153 return ret;
5154 mc.precharge++;
5155 cond_resched();
5156 }
5157 return 0;
5158}
5159
5160union mc_target {
5161 struct page *page;
5162 swp_entry_t ent;
5163};
5164
5165enum mc_target_type {
5166 MC_TARGET_NONE = 0,
5167 MC_TARGET_PAGE,
5168 MC_TARGET_SWAP,
5169 MC_TARGET_DEVICE,
5170};
5171
5172static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5173 unsigned long addr, pte_t ptent)
5174{
5175 struct page *page = vm_normal_page(vma, addr, ptent);
5176
5177 if (!page || !page_mapped(page))
5178 return NULL;
5179 if (PageAnon(page)) {
5180 if (!(mc.flags & MOVE_ANON))
5181 return NULL;
5182 } else {
5183 if (!(mc.flags & MOVE_FILE))
5184 return NULL;
5185 }
5186 if (!get_page_unless_zero(page))
5187 return NULL;
5188
5189 return page;
5190}
5191
5192#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5193static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5194 pte_t ptent, swp_entry_t *entry)
5195{
5196 struct page *page = NULL;
5197 swp_entry_t ent = pte_to_swp_entry(ptent);
5198
5199 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5200 return NULL;
5201
5202
5203
5204
5205
5206
5207 if (is_device_private_entry(ent)) {
5208 page = device_private_entry_to_page(ent);
5209
5210
5211
5212
5213 if (!page_ref_add_unless(page, 1, 1))
5214 return NULL;
5215 return page;
5216 }
5217
5218
5219
5220
5221
5222 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5223 if (do_memsw_account())
5224 entry->val = ent.val;
5225
5226 return page;
5227}
5228#else
5229static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5230 pte_t ptent, swp_entry_t *entry)
5231{
5232 return NULL;
5233}
5234#endif
5235
5236static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5237 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5238{
5239 struct page *page = NULL;
5240 struct address_space *mapping;
5241 pgoff_t pgoff;
5242
5243 if (!vma->vm_file)
5244 return NULL;
5245 if (!(mc.flags & MOVE_FILE))
5246 return NULL;
5247
5248 mapping = vma->vm_file->f_mapping;
5249 pgoff = linear_page_index(vma, addr);
5250
5251
5252#ifdef CONFIG_SWAP
5253
5254 if (shmem_mapping(mapping)) {
5255 page = find_get_entry(mapping, pgoff);
5256 if (xa_is_value(page)) {
5257 swp_entry_t swp = radix_to_swp_entry(page);
5258 if (do_memsw_account())
5259 *entry = swp;
5260 page = find_get_page(swap_address_space(swp),
5261 swp_offset(swp));
5262 }
5263 } else
5264 page = find_get_page(mapping, pgoff);
5265#else
5266 page = find_get_page(mapping, pgoff);
5267#endif
5268 return page;
5269}
5270
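/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on LRU (isolate_page() is
 * useful.)
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */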
5283static int mem_cgroup_move_account(struct page *page,
5284 bool compound,
5285 struct mem_cgroup *from,
5286 struct mem_cgroup *to)
5287{
5288 struct lruvec *from_vec, *to_vec;
5289 struct pglist_data *pgdat;
5290 unsigned long flags;
5291 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5292 int ret;
5293 bool anon;
5294
5295 VM_BUG_ON(from == to);
5296 VM_BUG_ON_PAGE(PageLRU(page), page);
5297 VM_BUG_ON(compound && !PageTransHuge(page));
5298
 /*
 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
 * of its source page while we change it.
 */
5303 ret = -EBUSY;
5304 if (!trylock_page(page))
5305 goto out;
5306
5307 ret = -EINVAL;
5308 if (page->mem_cgroup != from)
5309 goto out_unlock;
5310
5311 anon = PageAnon(page);
5312
5313 pgdat = page_pgdat(page);
5314 from_vec = mem_cgroup_lruvec(from, pgdat);
5315 to_vec = mem_cgroup_lruvec(to, pgdat);
5316
5317 spin_lock_irqsave(&from->move_lock, flags);
5318
5319 if (!anon && page_mapped(page)) {
5320 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5321 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5322 }
5323
 /*
 * move_lock was grabbed above and the caller set from->moving_account,
 * so updates to PageDirty are serialized against this move.
 * The mapping is therefore stable for dirty pages.
 */
5329 if (!anon && PageDirty(page)) {
5330 struct address_space *mapping = page_mapping(page);
5331
5332 if (mapping_cap_account_dirty(mapping)) {
5333 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
5334 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
5335 }
5336 }
5337
5338 if (PageWriteback(page)) {
5339 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5340 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5341 }
5342
5343#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5344 if (compound && !list_empty(page_deferred_list(page))) {
5345 spin_lock(&from->deferred_split_queue.split_queue_lock);
5346 list_del_init(page_deferred_list(page));
5347 from->deferred_split_queue.split_queue_len--;
5348 spin_unlock(&from->deferred_split_queue.split_queue_lock);
5349 }
5350#endif
5351
 /*
 * It is safe to change page->mem_cgroup here because the page is
 * referenced, charged, and isolated - we can't race with uncharging,
 * charging, migration, or LRU putback.
 */

 /* caller should have done css_get */
5358 page->mem_cgroup = to;
5359
5360#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5361 if (compound && list_empty(page_deferred_list(page))) {
5362 spin_lock(&to->deferred_split_queue.split_queue_lock);
5363 list_add_tail(page_deferred_list(page),
5364 &to->deferred_split_queue.split_queue);
5365 to->deferred_split_queue.split_queue_len++;
5366 spin_unlock(&to->deferred_split_queue.split_queue_lock);
5367 }
5368#endif
5369
5370 spin_unlock_irqrestore(&from->move_lock, flags);
5371
5372 ret = 0;
5373
5374 local_irq_disable();
5375 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5376 memcg_check_events(to, page);
5377 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5378 memcg_check_events(from, page);
5379 local_irq_enable();
5380out_unlock:
5381 unlock_page(page);
5382out:
5383 return ret;
5384}
5385
/**
 * get_mctgt_type - classify a pte as a move-charge target
 * @vma: the vma the pte belongs to
 * @addr: the address corresponding to the pte
 * @ptent: the pte to be checked
 * @target: where to store the target page or swap entry (may be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE   - the pte is not a target for move charge.
 *   MC_TARGET_PAGE   - the page corresponding to this pte is a target for
 *                      move charge. If @target is not NULL, the page is
 *                      stored in target->page with an extra reference
 *                      taken (the caller must drop it).
 *   MC_TARGET_SWAP   - the swap entry corresponding to this pte is a
 *                      target for charge migration. If @target is not
 *                      NULL, the entry is stored in target->ent.
 *   MC_TARGET_DEVICE - like MC_TARGET_PAGE, but the page is
 *                      MEMORY_DEVICE_PRIVATE (a ZONE_DEVICE page, thus
 *                      not on the LRU). Such pages are charged like
 *                      regular pages; for all intents and purposes they
 *                      are just special memory taking the place of a
 *                      regular page.
 *
 *                      See Documentation/vm/hmm.rst.
 *
 * Called with the pte lock held.
 */
5412static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5413 unsigned long addr, pte_t ptent, union mc_target *target)
5414{
5415 struct page *page = NULL;
5416 enum mc_target_type ret = MC_TARGET_NONE;
5417 swp_entry_t ent = { .val = 0 };
5418
5419 if (pte_present(ptent))
5420 page = mc_handle_present_pte(vma, addr, ptent);
5421 else if (is_swap_pte(ptent))
5422 page = mc_handle_swap_pte(vma, ptent, &ent);
5423 else if (pte_none(ptent))
5424 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5425
5426 if (!page && !ent.val)
5427 return ret;
5428 if (page) {
 /*
 * Do only a loose check without serialization.
 * mem_cgroup_move_account() checks whether the page is valid or
 * not under LRU exclusion.
 */
5434 if (page->mem_cgroup == mc.from) {
5435 ret = MC_TARGET_PAGE;
5436 if (is_device_private_page(page))
5437 ret = MC_TARGET_DEVICE;
5438 if (target)
5439 target->page = page;
5440 }
5441 if (!ret || !target)
5442 put_page(page);
5443 }
5444
 /*
 * There is a swap entry and a page doesn't exist or isn't charged.
 * But we cannot move a tail page of a THP.
 */
5448 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5449 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5450 ret = MC_TARGET_SWAP;
5451 if (target)
5452 target->ent = ent;
5453 }
5454 return ret;
5455}
5456
5457#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD-mapped swapping or file-mapped pages because THP
 * does not support them for now.
 * The caller must make sure that pmd_trans_huge(pmd) is true.
 */
5463static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5464 unsigned long addr, pmd_t pmd, union mc_target *target)
5465{
5466 struct page *page = NULL;
5467 enum mc_target_type ret = MC_TARGET_NONE;
5468
5469 if (unlikely(is_swap_pmd(pmd))) {
5470 VM_BUG_ON(thp_migration_supported() &&
5471 !is_pmd_migration_entry(pmd));
5472 return ret;
5473 }
5474 page = pmd_page(pmd);
5475 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5476 if (!(mc.flags & MOVE_ANON))
5477 return ret;
5478 if (page->mem_cgroup == mc.from) {
5479 ret = MC_TARGET_PAGE;
5480 if (target) {
5481 get_page(page);
5482 target->page = page;
5483 }
5484 }
5485 return ret;
5486}
5487#else
5488static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5489 unsigned long addr, pmd_t pmd, union mc_target *target)
5490{
5491 return MC_TARGET_NONE;
5492}
5493#endif
5494
5495static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5496 unsigned long addr, unsigned long end,
5497 struct mm_walk *walk)
5498{
5499 struct vm_area_struct *vma = walk->vma;
5500 pte_t *pte;
5501 spinlock_t *ptl;
5502
5503 ptl = pmd_trans_huge_lock(pmd, vma);
5504 if (ptl) {
 /*
 * Note that there can be no MC_TARGET_DEVICE here for now, as we do
 * not support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
 * this might change.
 */
5510 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5511 mc.precharge += HPAGE_PMD_NR;
5512 spin_unlock(ptl);
5513 return 0;
5514 }
5515
5516 if (pmd_trans_unstable(pmd))
5517 return 0;
5518 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5519 for (; addr != end; pte++, addr += PAGE_SIZE)
5520 if (get_mctgt_type(vma, addr, *pte, NULL))
5521 mc.precharge++;
5522 pte_unmap_unlock(pte - 1, ptl);
5523 cond_resched();
5524
5525 return 0;
5526}
5527
5528static const struct mm_walk_ops precharge_walk_ops = {
5529 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5530};
5531
5532static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5533{
5534 unsigned long precharge;
5535
5536 down_read(&mm->mmap_sem);
5537 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5538 up_read(&mm->mmap_sem);
5539
5540 precharge = mc.precharge;
5541 mc.precharge = 0;
5542
5543 return precharge;
5544}
5545
5546static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5547{
5548 unsigned long precharge = mem_cgroup_count_precharge(mm);
5549
5550 VM_BUG_ON(mc.moving_task);
5551 mc.moving_task = current;
5552 return mem_cgroup_do_precharge(precharge);
5553}
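
/*
 * Charge moving is a two-pass protocol: mem_cgroup_can_attach() walks the
 * mm once to count movable pages and precharges that many pages to the
 * destination (the function above), then mem_cgroup_move_task() walks the
 * mm again and switches page->mem_cgroup while consuming the precharge.
 * Roughly (the ordering is driven by the cgroup core, not by this file):
 *
 *	mem_cgroup_can_attach()  -> mem_cgroup_precharge_mc()
 *	mem_cgroup_move_task()   -> mem_cgroup_move_charge()
 *	                         -> mem_cgroup_clear_mc()
 */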
5554
5555
5556static void __mem_cgroup_clear_mc(void)
5557{
5558 struct mem_cgroup *from = mc.from;
5559 struct mem_cgroup *to = mc.to;
5560
 /* we must uncharge all the leftover precharges from mc.to */
5562 if (mc.precharge) {
5563 cancel_charge(mc.to, mc.precharge);
5564 mc.precharge = 0;
5565 }

 /*
 * we didn't uncharge from mc.to at mem_cgroup_move_account(), so
 * we must uncharge here.
 */
5570 if (mc.moved_charge) {
5571 cancel_charge(mc.from, mc.moved_charge);
5572 mc.moved_charge = 0;
5573 }
5574
5575 if (mc.moved_swap) {
 /* uncharge swap account from the old cgroup */
5577 if (!mem_cgroup_is_root(mc.from))
5578 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5579
5580 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5581
 /*
 * we charged both to->memory and to->memsw, so we should
 * uncharge to->memory here.
 */
5586 if (!mem_cgroup_is_root(mc.to))
5587 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5588
5589 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5590 css_put_many(&mc.to->css, mc.moved_swap);
5591
5592 mc.moved_swap = 0;
5593 }
5594 memcg_oom_recover(from);
5595 memcg_oom_recover(to);
5596 wake_up_all(&mc.waitq);
5597}
5598
5599static void mem_cgroup_clear_mc(void)
5600{
5601 struct mm_struct *mm = mc.mm;
5602
 /*
 * We must clear moving_task before waking up waiters at the end of
 * task migration.
 */
5607 mc.moving_task = NULL;
5608 __mem_cgroup_clear_mc();
5609 spin_lock(&mc.lock);
5610 mc.from = NULL;
5611 mc.to = NULL;
5612 mc.mm = NULL;
5613 spin_unlock(&mc.lock);
5614
5615 mmput(mm);
5616}
5617
5618static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5619{
5620 struct cgroup_subsys_state *css;
5621 struct mem_cgroup *memcg = NULL;
5622 struct mem_cgroup *from;
5623 struct task_struct *leader, *p;
5624 struct mm_struct *mm;
5625 unsigned long move_flags;
5626 int ret = 0;
5627
 /* charge immigration isn't supported on the default hierarchy */
5629 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5630 return 0;
5631
 /*
 * Multi-process migrations only happen on the default hierarchy
 * where charge immigration is not used.  Perform charge immigration
 * if @tset contains a leader and whine if there are multiple.
 */
5638 p = NULL;
5639 cgroup_taskset_for_each_leader(leader, css, tset) {
5640 WARN_ON_ONCE(p);
5641 p = leader;
5642 memcg = mem_cgroup_from_css(css);
5643 }
5644 if (!p)
5645 return 0;
5646
 /*
 * We are now committed to this value whatever it is. Changes in this
 * tunable will only affect upcoming migrations, not the current one.
 * So we need to save it, and keep it going.
 */
5652 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5653 if (!move_flags)
5654 return 0;
5655
5656 from = mem_cgroup_from_task(p);
5657
5658 VM_BUG_ON(from == memcg);
5659
5660 mm = get_task_mm(p);
5661 if (!mm)
5662 return 0;
5663
5664 if (mm->owner == p) {
5665 VM_BUG_ON(mc.from);
5666 VM_BUG_ON(mc.to);
5667 VM_BUG_ON(mc.precharge);
5668 VM_BUG_ON(mc.moved_charge);
5669 VM_BUG_ON(mc.moved_swap);
5670
5671 spin_lock(&mc.lock);
5672 mc.mm = mm;
5673 mc.from = from;
5674 mc.to = memcg;
5675 mc.flags = move_flags;
5676 spin_unlock(&mc.lock);
5677
5678
5679 ret = mem_cgroup_precharge_mc(mm);
5680 if (ret)
5681 mem_cgroup_clear_mc();
5682 } else {
5683 mmput(mm);
5684 }
5685 return ret;
5686}
5687
5688static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5689{
5690 if (mc.to)
5691 mem_cgroup_clear_mc();
5692}
5693
5694static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5695 unsigned long addr, unsigned long end,
5696 struct mm_walk *walk)
5697{
5698 int ret = 0;
5699 struct vm_area_struct *vma = walk->vma;
5700 pte_t *pte;
5701 spinlock_t *ptl;
5702 enum mc_target_type target_type;
5703 union mc_target target;
5704 struct page *page;
5705
5706 ptl = pmd_trans_huge_lock(pmd, vma);
5707 if (ptl) {
5708 if (mc.precharge < HPAGE_PMD_NR) {
5709 spin_unlock(ptl);
5710 return 0;
5711 }
5712 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5713 if (target_type == MC_TARGET_PAGE) {
5714 page = target.page;
5715 if (!isolate_lru_page(page)) {
5716 if (!mem_cgroup_move_account(page, true,
5717 mc.from, mc.to)) {
5718 mc.precharge -= HPAGE_PMD_NR;
5719 mc.moved_charge += HPAGE_PMD_NR;
5720 }
5721 putback_lru_page(page);
5722 }
5723 put_page(page);
5724 } else if (target_type == MC_TARGET_DEVICE) {
5725 page = target.page;
5726 if (!mem_cgroup_move_account(page, true,
5727 mc.from, mc.to)) {
5728 mc.precharge -= HPAGE_PMD_NR;
5729 mc.moved_charge += HPAGE_PMD_NR;
5730 }
5731 put_page(page);
5732 }
5733 spin_unlock(ptl);
5734 return 0;
5735 }
5736
5737 if (pmd_trans_unstable(pmd))
5738 return 0;
5739retry:
5740 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5741 for (; addr != end; addr += PAGE_SIZE) {
5742 pte_t ptent = *(pte++);
5743 bool device = false;
5744 swp_entry_t ent;
5745
5746 if (!mc.precharge)
5747 break;
5748
5749 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5750 case MC_TARGET_DEVICE:
5751 device = true;
 /* fall through */
5753 case MC_TARGET_PAGE:
5754 page = target.page;
 /*
 * We can have a part of the split pmd here. Moving it can be
 * done, but it would be too convoluted, so simply ignore such a
 * partial THP and keep it in the original memcg. There should be
 * somebody mapping the head.
 */
5761 if (PageTransCompound(page))
5762 goto put;
5763 if (!device && isolate_lru_page(page))
5764 goto put;
5765 if (!mem_cgroup_move_account(page, false,
5766 mc.from, mc.to)) {
5767 mc.precharge--;
 /* we uncharge from mc.from later. */
5769 mc.moved_charge++;
5770 }
5771 if (!device)
5772 putback_lru_page(page);
5773put:
5774 put_page(page);
5775 break;
5776 case MC_TARGET_SWAP:
5777 ent = target.ent;
5778 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5779 mc.precharge--;
 /* we fixup other refcnts and charges later. */
5781 mc.moved_swap++;
5782 }
5783 break;
5784 default:
5785 break;
5786 }
5787 }
5788 pte_unmap_unlock(pte - 1, ptl);
5789 cond_resched();
5790
5791 if (addr != end) {
 /*
 * We have consumed all precharges we got in can_attach().  We try
 * to charge one by one, but don't do any additional charges to
 * mc.to if we have failed in charge once already during the attach
 * phase.
 */
5798 ret = mem_cgroup_do_precharge(1);
5799 if (!ret)
5800 goto retry;
5801 }
5802
5803 return ret;
5804}
5805
5806static const struct mm_walk_ops charge_walk_ops = {
5807 .pmd_entry = mem_cgroup_move_charge_pte_range,
5808};
5809
5810static void mem_cgroup_move_charge(void)
5811{
5812 lru_add_drain_all();
 /*
 * Signal lock_page_memcg() to take the memcg's move_lock while we're
 * moving its pages to another memcg. Then wait for already started
 * RCU-only updates to finish.
 */
5818 atomic_inc(&mc.from->moving_account);
5819 synchronize_rcu();
5820retry:
5821 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
 /*
 * Someone holding the mmap_sem might be waiting in our waitq, so
 * cancel all extra charges, wake up all waiters, and retry.
 * Because we cancel precharges, we might not be able to move
 * enough charges, but moving charge is a best-effort feature
 * anyway, so it wouldn't be a big problem.
 */
5829 __mem_cgroup_clear_mc();
5830 cond_resched();
5831 goto retry;
5832 }
5833
 /*
 * When we have consumed all precharges and failed in doing additional
 * charge, the page walk just aborts.
 */
5837 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
5838 NULL);
5839
5840 up_read(&mc.mm->mmap_sem);
5841 atomic_dec(&mc.from->moving_account);
5842}
5843
5844static void mem_cgroup_move_task(void)
5845{
5846 if (mc.to) {
5847 mem_cgroup_move_charge();
5848 mem_cgroup_clear_mc();
5849 }
5850}
5851#else
5852static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5853{
5854 return 0;
5855}
5856static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5857{
5858}
5859static void mem_cgroup_move_task(void)
5860{
5861}
5862#endif
5863
/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify whether we're attached to the default hierarchy on each mount
 * attempt.
 */
5869static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5870{
 /*
 * use_hierarchy is forced on the default hierarchy.  cgroup core
 * guarantees that @root doesn't have any children, so turning it on
 * for the root memcg is enough.
 */
5876 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5877 root_mem_cgroup->use_hierarchy = true;
5878 else
5879 root_mem_cgroup->use_hierarchy = false;
5880}
5881
5882static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
5883{
5884 if (value == PAGE_COUNTER_MAX)
5885 seq_puts(m, "max\n");
5886 else
5887 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
5888
5889 return 0;
5890}
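
/*
 * Example output (illustrative): a value of PAGE_COUNTER_MAX prints as
 * "max"; any other value prints as bytes, e.g. 262144 pages with a 4 KiB
 * page size prints as "1073741824".
 */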
5891
5892static u64 memory_current_read(struct cgroup_subsys_state *css,
5893 struct cftype *cft)
5894{
5895 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5896
5897 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5898}
5899
5900static int memory_min_show(struct seq_file *m, void *v)
5901{
5902 return seq_puts_memcg_tunable(m,
5903 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
5904}
5905
5906static ssize_t memory_min_write(struct kernfs_open_file *of,
5907 char *buf, size_t nbytes, loff_t off)
5908{
5909 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5910 unsigned long min;
5911 int err;
5912
5913 buf = strstrip(buf);
5914 err = page_counter_memparse(buf, "max", &min);
5915 if (err)
5916 return err;
5917
5918 page_counter_set_min(&memcg->memory, min);
5919
5920 return nbytes;
5921}
5922
5923static int memory_low_show(struct seq_file *m, void *v)
5924{
5925 return seq_puts_memcg_tunable(m,
5926 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
5927}
5928
5929static ssize_t memory_low_write(struct kernfs_open_file *of,
5930 char *buf, size_t nbytes, loff_t off)
5931{
5932 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5933 unsigned long low;
5934 int err;
5935
5936 buf = strstrip(buf);
5937 err = page_counter_memparse(buf, "max", &low);
5938 if (err)
5939 return err;
5940
5941 page_counter_set_low(&memcg->memory, low);
5942
5943 return nbytes;
5944}
5945
5946static int memory_high_show(struct seq_file *m, void *v)
5947{
5948 return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
5949}
5950
5951static ssize_t memory_high_write(struct kernfs_open_file *of,
5952 char *buf, size_t nbytes, loff_t off)
5953{
5954 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5955 unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
5956 bool drained = false;
5957 unsigned long high;
5958 int err;
5959
5960 buf = strstrip(buf);
5961 err = page_counter_memparse(buf, "max", &high);
5962 if (err)
5963 return err;
5964
5965 memcg->high = high;
5966
5967 for (;;) {
5968 unsigned long nr_pages = page_counter_read(&memcg->memory);
5969 unsigned long reclaimed;
5970
5971 if (nr_pages <= high)
5972 break;
5973
5974 if (signal_pending(current))
5975 break;
5976
5977 if (!drained) {
5978 drain_all_stock(memcg);
5979 drained = true;
5980 continue;
5981 }
5982
5983 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5984 GFP_KERNEL, true);
5985
5986 if (!reclaimed && !nr_retries--)
5987 break;
5988 }
5989
5990 return nbytes;
5991}
5992
5993static int memory_max_show(struct seq_file *m, void *v)
5994{
5995 return seq_puts_memcg_tunable(m,
5996 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
5997}
5998
5999static ssize_t memory_max_write(struct kernfs_open_file *of,
6000 char *buf, size_t nbytes, loff_t off)
6001{
6002 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6003 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6004 bool drained = false;
6005 unsigned long max;
6006 int err;
6007
6008 buf = strstrip(buf);
6009 err = page_counter_memparse(buf, "max", &max);
6010 if (err)
6011 return err;
6012
6013 xchg(&memcg->memory.max, max);
6014
6015 for (;;) {
6016 unsigned long nr_pages = page_counter_read(&memcg->memory);
6017
6018 if (nr_pages <= max)
6019 break;
6020
6021 if (signal_pending(current))
6022 break;
6023
6024 if (!drained) {
6025 drain_all_stock(memcg);
6026 drained = true;
6027 continue;
6028 }
6029
6030 if (nr_reclaims) {
6031 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6032 GFP_KERNEL, true))
6033 nr_reclaims--;
6034 continue;
6035 }
6036
6037 memcg_memory_event(memcg, MEMCG_OOM);
6038 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6039 break;
6040 }
6041
6042 memcg_wb_domain_size_changed(memcg);
6043 return nbytes;
6044}
6045
6046static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6047{
6048 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6049 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6050 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6051 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6052 seq_printf(m, "oom_kill %lu\n",
6053 atomic_long_read(&events[MEMCG_OOM_KILL]));
6054}
6055
6056static int memory_events_show(struct seq_file *m, void *v)
6057{
6058 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6059
6060 __memory_events_show(m, memcg->memory_events);
6061 return 0;
6062}
6063
6064static int memory_events_local_show(struct seq_file *m, void *v)
6065{
6066 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6067
6068 __memory_events_show(m, memcg->memory_events_local);
6069 return 0;
6070}
6071
6072static int memory_stat_show(struct seq_file *m, void *v)
6073{
6074 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6075 char *buf;
6076
6077 buf = memory_stat_format(memcg);
6078 if (!buf)
6079 return -ENOMEM;
6080 seq_puts(m, buf);
6081 kfree(buf);
6082 return 0;
6083}
6084
6085static int memory_oom_group_show(struct seq_file *m, void *v)
6086{
6087 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6088
6089 seq_printf(m, "%d\n", memcg->oom_group);
6090
6091 return 0;
6092}
6093
6094static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6095 char *buf, size_t nbytes, loff_t off)
6096{
6097 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6098 int ret, oom_group;
6099
6100 buf = strstrip(buf);
6101 if (!buf)
6102 return -EINVAL;
6103
6104 ret = kstrtoint(buf, 0, &oom_group);
6105 if (ret)
6106 return ret;
6107
6108 if (oom_group != 0 && oom_group != 1)
6109 return -EINVAL;
6110
6111 memcg->oom_group = oom_group;
6112
6113 return nbytes;
6114}
6115
6116static struct cftype memory_files[] = {
6117 {
6118 .name = "current",
6119 .flags = CFTYPE_NOT_ON_ROOT,
6120 .read_u64 = memory_current_read,
6121 },
6122 {
6123 .name = "min",
6124 .flags = CFTYPE_NOT_ON_ROOT,
6125 .seq_show = memory_min_show,
6126 .write = memory_min_write,
6127 },
6128 {
6129 .name = "low",
6130 .flags = CFTYPE_NOT_ON_ROOT,
6131 .seq_show = memory_low_show,
6132 .write = memory_low_write,
6133 },
6134 {
6135 .name = "high",
6136 .flags = CFTYPE_NOT_ON_ROOT,
6137 .seq_show = memory_high_show,
6138 .write = memory_high_write,
6139 },
6140 {
6141 .name = "max",
6142 .flags = CFTYPE_NOT_ON_ROOT,
6143 .seq_show = memory_max_show,
6144 .write = memory_max_write,
6145 },
6146 {
6147 .name = "events",
6148 .flags = CFTYPE_NOT_ON_ROOT,
6149 .file_offset = offsetof(struct mem_cgroup, events_file),
6150 .seq_show = memory_events_show,
6151 },
6152 {
6153 .name = "events.local",
6154 .flags = CFTYPE_NOT_ON_ROOT,
6155 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6156 .seq_show = memory_events_local_show,
6157 },
6158 {
6159 .name = "stat",
6160 .flags = CFTYPE_NOT_ON_ROOT,
6161 .seq_show = memory_stat_show,
6162 },
6163 {
6164 .name = "oom.group",
6165 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6166 .seq_show = memory_oom_group_show,
6167 .write = memory_oom_group_write,
6168 },
6169 { }
6170};
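
/*
 * These entries back the cgroup v2 interface files of this controller
 * (memory.current, memory.min, memory.low, memory.high, memory.max,
 * memory.events, memory.events.local, memory.stat and memory.oom.group).
 * Illustrative shell usage, assuming a v2 hierarchy mounted at
 * /sys/fs/cgroup with a child group "foo":
 *
 *	echo 512M > /sys/fs/cgroup/foo/memory.high
 *	echo max  > /sys/fs/cgroup/foo/memory.max
 *	cat /sys/fs/cgroup/foo/memory.current
 *
 * "max" and size suffixes are handled by page_counter_memparse() in the
 * write handlers above.
 */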
6171
6172struct cgroup_subsys memory_cgrp_subsys = {
6173 .css_alloc = mem_cgroup_css_alloc,
6174 .css_online = mem_cgroup_css_online,
6175 .css_offline = mem_cgroup_css_offline,
6176 .css_released = mem_cgroup_css_released,
6177 .css_free = mem_cgroup_css_free,
6178 .css_reset = mem_cgroup_css_reset,
6179 .can_attach = mem_cgroup_can_attach,
6180 .cancel_attach = mem_cgroup_cancel_attach,
6181 .post_attach = mem_cgroup_move_task,
6182 .bind = mem_cgroup_bind,
6183 .dfl_cftypes = memory_files,
6184 .legacy_cftypes = mem_cgroup_legacy_files,
6185 .early_init = 0,
6186};
6187
/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is an
 *     unprotected supply of reclaimable memory from other cgroups.
 *   MEMCG_PROT_MIN: cgroup memory is protected
 *
 * @root is exclusive; it is never protected when looked at directly.
 *
 * To provide proper hierarchical behavior, effective memory.min/low values
 * are used. Below is a description of how effective memory.low is
 * calculated; effective memory.min is calculated in the same way.
 *
 * Effective memory.low is always equal to or less than the original
 * memory.low. If there is no memory.low overcommitment (which is always
 * true for top-level memory cgroups), these two values are equal.
 * Otherwise, it is a part of the parent's effective memory.low, calculated
 * as the cgroup's memory.low usage divided by the sum of the siblings'
 * memory.low usages, where memory.low usage is the size of actually
 * protected memory:
 *
 *                                             low_usage
 *     elow = min( memory.low, parent->elow * ------------------ ),
 *                                            siblings_low_usage
 *
 *                 | memory.current, if memory.current < memory.low
 *     low_usage = |
 *                 | 0, otherwise.
 *
 * Such a definition of effective memory.low provides the expected
 * hierarchical behavior: the parent's memory.low value limits its
 * children, unprotected memory is reclaimed first, and cgroups which are
 * not using their guarantee do not affect the actual memory distribution.
 *
 * These calculations require constant tracking of the actual low usages
 * (see propagate_protected_usage()) as well as recursive calculation of
 * effective memory.low values. But since mem_cgroup_protected() is called
 * for each memory cgroup top-down from the reclaim path, the calculated
 * values can be cached for the next use. This part is intentionally racy,
 * which is acceptable: memory.low is a best-effort setting used only for
 * soft limiting, and it does not require strict tracking of statistics.
 */
6258enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
6259 struct mem_cgroup *memcg)
6260{
6261 struct mem_cgroup *parent;
6262 unsigned long emin, parent_emin;
6263 unsigned long elow, parent_elow;
6264 unsigned long usage;
6265
6266 if (mem_cgroup_disabled())
6267 return MEMCG_PROT_NONE;
6268
6269 if (!root)
6270 root = root_mem_cgroup;
6271 if (memcg == root)
6272 return MEMCG_PROT_NONE;
6273
6274 usage = page_counter_read(&memcg->memory);
6275 if (!usage)
6276 return MEMCG_PROT_NONE;
6277
6278 emin = memcg->memory.min;
6279 elow = memcg->memory.low;
6280
6281 parent = parent_mem_cgroup(memcg);
6282
6283 if (!parent)
6284 return MEMCG_PROT_NONE;
6285
6286 if (parent == root)
6287 goto exit;
6288
6289 parent_emin = READ_ONCE(parent->memory.emin);
6290 emin = min(emin, parent_emin);
6291 if (emin && parent_emin) {
6292 unsigned long min_usage, siblings_min_usage;
6293
6294 min_usage = min(usage, memcg->memory.min);
6295 siblings_min_usage = atomic_long_read(
6296 &parent->memory.children_min_usage);
6297
6298 if (min_usage && siblings_min_usage)
6299 emin = min(emin, parent_emin * min_usage /
6300 siblings_min_usage);
6301 }
6302
6303 parent_elow = READ_ONCE(parent->memory.elow);
6304 elow = min(elow, parent_elow);
6305 if (elow && parent_elow) {
6306 unsigned long low_usage, siblings_low_usage;
6307
6308 low_usage = min(usage, memcg->memory.low);
6309 siblings_low_usage = atomic_long_read(
6310 &parent->memory.children_low_usage);
6311
6312 if (low_usage && siblings_low_usage)
6313 elow = min(elow, parent_elow * low_usage /
6314 siblings_low_usage);
6315 }
6316
6317exit:
6318 memcg->memory.emin = emin;
6319 memcg->memory.elow = elow;
6320
6321 if (usage <= emin)
6322 return MEMCG_PROT_MIN;
6323 else if (usage <= elow)
6324 return MEMCG_PROT_LOW;
6325 else
6326 return MEMCG_PROT_NONE;
6327}
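
/*
 * Illustrative use from a reclaim loop (a sketch only; "target_memcg" and
 * "low_reclaim_pass" are hypothetical names, the real consumer lives in
 * mm/vmscan.c):
 *
 *	switch (mem_cgroup_protected(target_memcg, memcg)) {
 *	case MEMCG_PROT_MIN:
 *		continue;	// hard protection: skip this group
 *	case MEMCG_PROT_LOW:
 *		if (!low_reclaim_pass)
 *			continue;	// soft protection: skip for now
 *		break;
 *	case MEMCG_PROT_NONE:
 *		break;
 *	}
 */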
6328
/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge(), or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
6347int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6348 gfp_t gfp_mask, struct mem_cgroup **memcgp,
6349 bool compound)
6350{
6351 struct mem_cgroup *memcg = NULL;
6352 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6353 int ret = 0;
6354
6355 if (mem_cgroup_disabled())
6356 goto out;
6357
6358 if (PageSwapCache(page)) {
 /*
 * Every swap fault against a single page tries to charge the
 * page, bail as early as possible.  shmem_unuse() encounters
 * already charged pages, too.  The USED bit is protected by
 * the page lock, which serializes swap cache removal, which
 * in turn serializes uncharging.
 */
6366 VM_BUG_ON_PAGE(!PageLocked(page), page);
6367 if (compound_head(page)->mem_cgroup)
6368 goto out;
6369
6370 if (do_swap_account) {
6371 swp_entry_t ent = { .val = page_private(page), };
6372 unsigned short id = lookup_swap_cgroup_id(ent);
6373
6374 rcu_read_lock();
6375 memcg = mem_cgroup_from_id(id);
6376 if (memcg && !css_tryget_online(&memcg->css))
6377 memcg = NULL;
6378 rcu_read_unlock();
6379 }
6380 }
6381
6382 if (!memcg)
6383 memcg = get_mem_cgroup_from_mm(mm);
6384
6385 ret = try_charge(memcg, gfp_mask, nr_pages);
6386
6387 css_put(&memcg->css);
6388out:
6389 *memcgp = memcg;
6390 return ret;
6391}
6392
6393int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6394 gfp_t gfp_mask, struct mem_cgroup **memcgp,
6395 bool compound)
6396{
6397 struct mem_cgroup *memcg;
6398 int ret;
6399
6400 ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6401 memcg = *memcgp;
6402 mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6403 return ret;
6404}
6405
/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 * @compound: charge the page as compound or small page
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
6423void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6424 bool lrucare, bool compound)
6425{
6426 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6427
6428 VM_BUG_ON_PAGE(!page->mapping, page);
6429 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6430
6431 if (mem_cgroup_disabled())
6432 return;
6433
 /*
 * A swap fault can try to charge the same page multiple times;
 * mem_cgroup_try_charge() reports an already charged page with a
 * NULL memcg, in which case there is nothing to commit.
 */
6438 if (!memcg)
6439 return;
6440
6441 commit_charge(page, memcg, lrucare);
6442
6443 local_irq_disable();
6444 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6445 memcg_check_events(memcg, page);
6446 local_irq_enable();
6447
6448 if (do_memsw_account() && PageSwapCache(page)) {
6449 swp_entry_t entry = { .val = page_private(page) };
 /*
 * The swap entry might not get freed for a long time, let's not
 * wait for it.  The page already received a memory+swap charge,
 * drop the swap entry duplicate.
 */
6455 mem_cgroup_uncharge_swap(entry, nr_pages);
6456 }
6457}
6458
/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @compound: charge the page as compound or small page
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
6467void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6468 bool compound)
6469{
6470 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6471
6472 if (mem_cgroup_disabled())
6473 return;
6474
 /*
 * As in mem_cgroup_commit_charge(), a NULL memcg means the page was
 * already charged (e.g. a swapcache page), so there is no pending
 * charge to cancel.
 */
6479 if (!memcg)
6480 return;
6481
6482 cancel_charge(memcg, nr_pages);
6483}
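
/*
 * Illustrative caller pattern for the try/commit/cancel protocol above (a
 * sketch only; "setup_failed" stands for whatever can go wrong between the
 * try and the commit in a real caller):
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
 *		return -ENOMEM;
 *	// ... install the page into the page tables or page cache ...
 *	if (setup_failed) {
 *		mem_cgroup_cancel_charge(page, memcg, false);
 *		return -ENOMEM;
 *	}
 *	mem_cgroup_commit_charge(page, memcg, false, false);
 */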
6484
6485struct uncharge_gather {
6486 struct mem_cgroup *memcg;
6487 unsigned long pgpgout;
6488 unsigned long nr_anon;
6489 unsigned long nr_file;
6490 unsigned long nr_kmem;
6491 unsigned long nr_huge;
6492 unsigned long nr_shmem;
6493 struct page *dummy_page;
6494};
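
/*
 * Uncharging is batched: uncharge_page() accumulates pages belonging to the
 * same memcg in a struct uncharge_gather, and uncharge_batch() then flushes
 * the page counters, vmstats and events in one go, so that bulk frees don't
 * hammer the counters once per page.
 */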
6495
6496static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6497{
6498 memset(ug, 0, sizeof(*ug));
6499}
6500
6501static void uncharge_batch(const struct uncharge_gather *ug)
6502{
6503 unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
6504 unsigned long flags;
6505
6506 if (!mem_cgroup_is_root(ug->memcg)) {
6507 page_counter_uncharge(&ug->memcg->memory, nr_pages);
6508 if (do_memsw_account())
6509 page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6510 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6511 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6512 memcg_oom_recover(ug->memcg);
6513 }
6514
6515 local_irq_save(flags);
6516 __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6517 __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6518 __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6519 __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6520 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6521 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
6522 memcg_check_events(ug->memcg, ug->dummy_page);
6523 local_irq_restore(flags);
6524
6525 if (!mem_cgroup_is_root(ug->memcg))
6526 css_put_many(&ug->memcg->css, nr_pages);
6527}
6528
6529static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6530{
6531 VM_BUG_ON_PAGE(PageLRU(page), page);
6532 VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6533 !PageHWPoison(page) , page);
6534
6535 if (!page->mem_cgroup)
6536 return;
6537
 /*
 * Nobody should be changing or seriously looking at
 * page->mem_cgroup at this point; we have fully exclusive
 * access to the page.
 */
6544 if (ug->memcg != page->mem_cgroup) {
6545 if (ug->memcg) {
6546 uncharge_batch(ug);
6547 uncharge_gather_clear(ug);
6548 }
6549 ug->memcg = page->mem_cgroup;
6550 }
6551
6552 if (!PageKmemcg(page)) {
6553 unsigned int nr_pages = 1;
6554
6555 if (PageTransHuge(page)) {
6556 nr_pages = compound_nr(page);
6557 ug->nr_huge += nr_pages;
6558 }
6559 if (PageAnon(page))
6560 ug->nr_anon += nr_pages;
6561 else {
6562 ug->nr_file += nr_pages;
6563 if (PageSwapBacked(page))
6564 ug->nr_shmem += nr_pages;
6565 }
6566 ug->pgpgout++;
6567 } else {
6568 ug->nr_kmem += compound_nr(page);
6569 __ClearPageKmemcg(page);
6570 }
6571
6572 ug->dummy_page = page;
6573 page->mem_cgroup = NULL;
6574}
6575
6576static void uncharge_list(struct list_head *page_list)
6577{
6578 struct uncharge_gather ug;
6579 struct list_head *next;
6580
6581 uncharge_gather_clear(&ug);
6582
 /*
 * Note that the list can be a single page->lru; hence the do-while
 * loop instead of a simple list_for_each_entry().
 */
6587 next = page_list->next;
6588 do {
6589 struct page *page;
6590
6591 page = list_entry(next, struct page, lru);
6592 next = page->lru.next;
6593
6594 uncharge_page(page, &ug);
6595 } while (next != page_list);
6596
6597 if (ug.memcg)
6598 uncharge_batch(&ug);
6599}
6600
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
6608void mem_cgroup_uncharge(struct page *page)
6609{
6610 struct uncharge_gather ug;
6611
6612 if (mem_cgroup_disabled())
6613 return;
6614
 /* Don't touch page->lru of any random page, pre-check: */
6616 if (!page->mem_cgroup)
6617 return;
6618
6619 uncharge_gather_clear(&ug);
6620 uncharge_page(page, &ug);
6621 uncharge_batch(&ug);
6622}
6623
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
6631void mem_cgroup_uncharge_list(struct list_head *page_list)
6632{
6633 if (mem_cgroup_disabled())
6634 return;
6635
6636 if (!list_empty(page_list))
6637 uncharge_list(page_list);
6638}
6639
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6650void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6651{
6652 struct mem_cgroup *memcg;
6653 unsigned int nr_pages;
6654 bool compound;
6655 unsigned long flags;
6656
6657 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6658 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6659 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6660 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6661 newpage);
6662
6663 if (mem_cgroup_disabled())
6664 return;
6665
 /* Page cache replacement: new page already charged? */
6667 if (newpage->mem_cgroup)
6668 return;
6669
 /* Swapcache readahead pages can get replaced before being charged */
6671 memcg = oldpage->mem_cgroup;
6672 if (!memcg)
6673 return;
6674
 /* Force-charge the new page. The old one will be freed soon. */
6676 compound = PageTransHuge(newpage);
6677 nr_pages = compound ? hpage_nr_pages(newpage) : 1;
6678
6679 page_counter_charge(&memcg->memory, nr_pages);
6680 if (do_memsw_account())
6681 page_counter_charge(&memcg->memsw, nr_pages);
6682 css_get_many(&memcg->css, nr_pages);
6683
6684 commit_charge(newpage, memcg, false);
6685
6686 local_irq_save(flags);
6687 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
6688 memcg_check_events(memcg, newpage);
6689 local_irq_restore(flags);
6690}
6691
6692DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6693EXPORT_SYMBOL(memcg_sockets_enabled_key);
6694
6695void mem_cgroup_sk_alloc(struct sock *sk)
6696{
6697 struct mem_cgroup *memcg;
6698
6699 if (!mem_cgroup_sockets_enabled)
6700 return;
6701
 /*
 * Socket cloning can throw us here with sk_memcg already filled.
 * It won't, however, necessarily happen from process context, so
 * testing the current task's memcg against the root memcg won't
 * help us in that case.
 *
 * Respecting the original socket's memcg is the better decision
 * here.
 */
6711 if (sk->sk_memcg) {
6712 css_get(&sk->sk_memcg->css);
6713 return;
6714 }
6715
6716 rcu_read_lock();
6717 memcg = mem_cgroup_from_task(current);
6718 if (memcg == root_mem_cgroup)
6719 goto out;
6720 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6721 goto out;
6722 if (css_tryget_online(&memcg->css))
6723 sk->sk_memcg = memcg;
6724out:
6725 rcu_read_unlock();
6726}
6727
6728void mem_cgroup_sk_free(struct sock *sk)
6729{
6730 if (sk->sk_memcg)
6731 css_put(&sk->sk_memcg->css);
6732}
6733
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if it doesn't.
 */
6742bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6743{
6744 gfp_t gfp_mask = GFP_KERNEL;
6745
6746 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6747 struct page_counter *fail;
6748
6749 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6750 memcg->tcpmem_pressure = 0;
6751 return true;
6752 }
6753 page_counter_charge(&memcg->tcpmem, nr_pages);
6754 memcg->tcpmem_pressure = 1;
6755 return false;
6756 }
6757
 /* Don't block in the packet receive path */
6759 if (in_softirq())
6760 gfp_mask = GFP_NOWAIT;
6761
6762 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6763
6764 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6765 return true;
6766
6767 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6768 return false;
6769}
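
/*
 * Illustrative caller pattern (a sketch; the actual callers are the socket
 * memory accounting helpers in the network stack):
 *
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages))
 *		treat_as_memory_pressure();	// hypothetical reaction
 *
 * Note that the charge is forced even when %false is returned (to tcpmem on
 * the legacy hierarchy, via __GFP_NOFAIL above on the default hierarchy),
 * so a matching mem_cgroup_uncharge_skmem() is still required on release.
 */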
6770
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
6776void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6777{
6778 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6779 page_counter_uncharge(&memcg->tcpmem, nr_pages);
6780 return;
6781 }
6782
6783 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6784
6785 refill_stock(memcg, nr_pages);
6786}
6787
6788static int __init cgroup_memory(char *s)
6789{
6790 char *token;
6791
6792 while ((token = strsep(&s, ",")) != NULL) {
6793 if (!*token)
6794 continue;
6795 if (!strcmp(token, "nosocket"))
6796 cgroup_memory_nosocket = true;
6797 if (!strcmp(token, "nokmem"))
6798 cgroup_memory_nokmem = true;
6799 }
6800 return 0;
6801}
6802__setup("cgroup.memory=", cgroup_memory);
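
/*
 * Example (illustrative): booting with "cgroup.memory=nosocket,nokmem" on
 * the kernel command line disables both socket memory accounting and
 * kernel memory accounting in this controller.
 */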
6803
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from
 * this context because of lock dependencies (cgroup_lock -> cpu hotplug),
 * but basically everything else that doesn't depend on a specific
 * mem_cgroup structure should be initialized from here as well.
 */
6812static int __init mem_cgroup_init(void)
6813{
6814 int cpu, node;
6815
6816#ifdef CONFIG_MEMCG_KMEM
 /*
 * Kmem cache creation is mostly done with the slab_mutex held, so use
 * a workqueue with limited concurrency to avoid stalling all worker
 * threads in case lots of cgroups are created and destroyed
 * simultaneously.
 */
6823 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6824 BUG_ON(!memcg_kmem_cache_wq);
6825#endif
6826
6827 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6828 memcg_hotplug_cpu_dead);
6829
6830 for_each_possible_cpu(cpu)
6831 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6832 drain_local_stock);
6833
6834 for_each_node(node) {
6835 struct mem_cgroup_tree_per_node *rtpn;
6836
6837 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6838 node_online(node) ? node : NUMA_NO_NODE);
6839
6840 rtpn->rb_root = RB_ROOT;
6841 rtpn->rb_rightmost = NULL;
6842 spin_lock_init(&rtpn->lock);
6843 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6844 }
6845
6846 return 0;
6847}
6848subsys_initcall(mem_cgroup_init);
6849
6850#ifdef CONFIG_MEMCG_SWAP
6851static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6852{
6853 while (!refcount_inc_not_zero(&memcg->id.ref)) {
 /*
 * The root cgroup cannot be destroyed, so its refcount must always
 * be >= 1.
 */
6858 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
6859 VM_BUG_ON(1);
6860 break;
6861 }
6862 memcg = parent_mem_cgroup(memcg);
6863 if (!memcg)
6864 memcg = root_mem_cgroup;
6865 }
6866 return memcg;
6867}
6868
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
6876void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6877{
6878 struct mem_cgroup *memcg, *swap_memcg;
6879 unsigned int nr_entries;
6880 unsigned short oldid;
6881
6882 VM_BUG_ON_PAGE(PageLRU(page), page);
6883 VM_BUG_ON_PAGE(page_count(page), page);
6884
6885 if (!do_memsw_account())
6886 return;
6887
6888 memcg = page->mem_cgroup;
6889
 /* Readahead page, never charged */
6891 if (!memcg)
6892 return;
6893
 /*
 * In case the memcg owning these pages has been offlined and doesn't
 * have an ID allocated to it anymore, charge the closest online
 * ancestor for the swap instead and transfer the memory+swap charge.
 */
6899 swap_memcg = mem_cgroup_id_get_online(memcg);
6900 nr_entries = hpage_nr_pages(page);
6901
6902 if (nr_entries > 1)
6903 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
6904 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
6905 nr_entries);
6906 VM_BUG_ON_PAGE(oldid, page);
6907 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
6908
6909 page->mem_cgroup = NULL;
6910
6911 if (!mem_cgroup_is_root(memcg))
6912 page_counter_uncharge(&memcg->memory, nr_entries);
6913
6914 if (memcg != swap_memcg) {
6915 if (!mem_cgroup_is_root(swap_memcg))
6916 page_counter_charge(&swap_memcg->memsw, nr_entries);
6917 page_counter_uncharge(&memcg->memsw, nr_entries);
6918 }
6919
 /*
 * Interrupts should be disabled here because the caller holds the
 * i_pages lock which is taken with interrupts off. It is important
 * to have interrupts disabled here because it is the only
 * synchronisation we have for updating the per-CPU variables.
 */
6926 VM_BUG_ON(!irqs_disabled());
6927 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6928 -nr_entries);
6929 memcg_check_events(memcg, page);
6930
6931 if (!mem_cgroup_is_root(memcg))
6932 css_put_many(&memcg->css, nr_entries);
6933}
6934
/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
6944int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
6945{
6946 unsigned int nr_pages = hpage_nr_pages(page);
6947 struct page_counter *counter;
6948 struct mem_cgroup *memcg;
6949 unsigned short oldid;
6950
6951 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
6952 return 0;
6953
6954 memcg = page->mem_cgroup;
6955
 /* Readahead page, never charged */
6957 if (!memcg)
6958 return 0;
6959
6960 if (!entry.val) {
6961 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6962 return 0;
6963 }
6964
6965 memcg = mem_cgroup_id_get_online(memcg);
6966
6967 if (!mem_cgroup_is_root(memcg) &&
6968 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
6969 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
6970 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
6971 mem_cgroup_id_put(memcg);
6972 return -ENOMEM;
6973 }
6974
 /* Get references for the tail pages, too */
6976 if (nr_pages > 1)
6977 mem_cgroup_id_get_many(memcg, nr_pages - 1);
6978 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
6979 VM_BUG_ON_PAGE(oldid, page);
6980 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
6981
6982 return 0;
6983}
6984
/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
6990void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
6991{
6992 struct mem_cgroup *memcg;
6993 unsigned short id;
6994
6995 if (!do_swap_account)
6996 return;
6997
6998 id = swap_cgroup_record(entry, 0, nr_pages);
6999 rcu_read_lock();
7000 memcg = mem_cgroup_from_id(id);
7001 if (memcg) {
7002 if (!mem_cgroup_is_root(memcg)) {
7003 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7004 page_counter_uncharge(&memcg->swap, nr_pages);
7005 else
7006 page_counter_uncharge(&memcg->memsw, nr_pages);
7007 }
7008 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7009 mem_cgroup_id_put_many(memcg, nr_pages);
7010 }
7011 rcu_read_unlock();
7012}
7013
7014long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7015{
7016 long nr_swap_pages = get_nr_swap_pages();
7017
7018 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7019 return nr_swap_pages;
7020 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7021 nr_swap_pages = min_t(long, nr_swap_pages,
7022 READ_ONCE(memcg->swap.max) -
7023 page_counter_read(&memcg->swap));
7024 return nr_swap_pages;
7025}
7026
7027bool mem_cgroup_swap_full(struct page *page)
7028{
7029 struct mem_cgroup *memcg;
7030
7031 VM_BUG_ON_PAGE(!PageLocked(page), page);
7032
7033 if (vm_swap_full())
7034 return true;
7035 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7036 return false;
7037
7038 memcg = page->mem_cgroup;
7039 if (!memcg)
7040 return false;
7041
7042 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7043 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7044 return true;
7045
7046 return false;
7047}
7048
/* Whether swap accounting is enabled at boot; see the swapaccount= option below. */
7050#ifdef CONFIG_MEMCG_SWAP_ENABLED
7051static int really_do_swap_account __initdata = 1;
7052#else
7053static int really_do_swap_account __initdata;
7054#endif
7055
7056static int __init enable_swap_account(char *s)
7057{
7058 if (!strcmp(s, "1"))
7059 really_do_swap_account = 1;
7060 else if (!strcmp(s, "0"))
7061 really_do_swap_account = 0;
7062 return 1;
7063}
7064__setup("swapaccount=", enable_swap_account);
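
/*
 * Example (illustrative): booting with "swapaccount=0" disables swap
 * accounting even when CONFIG_MEMCG_SWAP_ENABLED is set, while
 * "swapaccount=1" enables it on kernels where it is not the default.
 */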
7065
7066static u64 swap_current_read(struct cgroup_subsys_state *css,
7067 struct cftype *cft)
7068{
7069 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7070
7071 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7072}
7073
7074static int swap_max_show(struct seq_file *m, void *v)
7075{
7076 return seq_puts_memcg_tunable(m,
7077 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7078}
7079
7080static ssize_t swap_max_write(struct kernfs_open_file *of,
7081 char *buf, size_t nbytes, loff_t off)
7082{
7083 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7084 unsigned long max;
7085 int err;
7086
7087 buf = strstrip(buf);
7088 err = page_counter_memparse(buf, "max", &max);
7089 if (err)
7090 return err;
7091
7092 xchg(&memcg->swap.max, max);
7093
7094 return nbytes;
7095}
7096
7097static int swap_events_show(struct seq_file *m, void *v)
7098{
7099 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7100
7101 seq_printf(m, "max %lu\n",
7102 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7103 seq_printf(m, "fail %lu\n",
7104 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7105
7106 return 0;
7107}
7108
7109static struct cftype swap_files[] = {
7110 {
7111 .name = "swap.current",
7112 .flags = CFTYPE_NOT_ON_ROOT,
7113 .read_u64 = swap_current_read,
7114 },
7115 {
7116 .name = "swap.max",
7117 .flags = CFTYPE_NOT_ON_ROOT,
7118 .seq_show = swap_max_show,
7119 .write = swap_max_write,
7120 },
7121 {
7122 .name = "swap.events",
7123 .flags = CFTYPE_NOT_ON_ROOT,
7124 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7125 .seq_show = swap_events_show,
7126 },
7127 { }
7128};
7129
7130static struct cftype memsw_cgroup_files[] = {
7131 {
7132 .name = "memsw.usage_in_bytes",
7133 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7134 .read_u64 = mem_cgroup_read_u64,
7135 },
7136 {
7137 .name = "memsw.max_usage_in_bytes",
7138 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7139 .write = mem_cgroup_reset,
7140 .read_u64 = mem_cgroup_read_u64,
7141 },
7142 {
7143 .name = "memsw.limit_in_bytes",
7144 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7145 .write = mem_cgroup_write,
7146 .read_u64 = mem_cgroup_read_u64,
7147 },
7148 {
7149 .name = "memsw.failcnt",
7150 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7151 .write = mem_cgroup_reset,
7152 .read_u64 = mem_cgroup_read_u64,
7153 },
7154 { },
7155};
7156
7157static int __init mem_cgroup_swap_init(void)
7158{
7159 if (!mem_cgroup_disabled() && really_do_swap_account) {
7160 do_swap_account = 1;
7161 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
7162 swap_files));
7163 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
7164 memsw_cgroup_files));
7165 }
7166 return 0;
7167}
7168subsys_initcall(mem_cgroup_swap_init);
7169
7170#endif
7171