/*
 * memcontrol.c - kernel memory controller for control groups (memcg):
 * accounts and limits the memory, swap and kernel memory used by cgroups.
 */
25#include <linux/page_counter.h>
26#include <linux/memcontrol.h>
27#include <linux/cgroup.h>
28#include <linux/pagewalk.h>
29#include <linux/sched/mm.h>
30#include <linux/shmem_fs.h>
31#include <linux/hugetlb.h>
32#include <linux/pagemap.h>
33#include <linux/vm_event_item.h>
34#include <linux/smp.h>
35#include <linux/page-flags.h>
36#include <linux/backing-dev.h>
37#include <linux/bit_spinlock.h>
38#include <linux/rcupdate.h>
39#include <linux/limits.h>
40#include <linux/export.h>
41#include <linux/mutex.h>
42#include <linux/rbtree.h>
43#include <linux/slab.h>
44#include <linux/swap.h>
45#include <linux/swapops.h>
46#include <linux/spinlock.h>
47#include <linux/eventfd.h>
48#include <linux/poll.h>
49#include <linux/sort.h>
50#include <linux/fs.h>
51#include <linux/seq_file.h>
52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h>
54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h>
56#include <linux/oom.h>
57#include <linux/lockdep.h>
58#include <linux/file.h>
59#include <linux/tracehook.h>
60#include <linux/psi.h>
61#include <linux/seq_buf.h>
62#include "internal.h"
63#include <net/sock.h>
64#include <net/ip.h>
65#include "slab.h"
66
67#include <linux/uaccess.h>
68
69#include <trace/events/vmscan.h>
70
71struct cgroup_subsys memory_cgrp_subsys __read_mostly;
72EXPORT_SYMBOL(memory_cgrp_subsys);
73
74struct mem_cgroup *root_mem_cgroup __read_mostly;
75
76#define MEM_CGROUP_RECLAIM_RETRIES 5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account 0
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
96static bool do_memsw_account(void)
97{
98 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
99}
100
101static const char *const mem_cgroup_lru_names[] = {
102 "inactive_anon",
103 "active_anon",
104 "inactive_file",
105 "active_file",
106 "unevictable",
107};
108
109#define THRESHOLDS_EVENTS_TARGET 128
110#define SOFTLIMIT_EVENTS_TARGET 1024
111#define NUMAINFO_EVENTS_TARGET 1024

/*
 * Cgroups above their soft limit are maintained in a per-node RB-tree,
 * independent of their position in the hierarchy.
 */
118struct mem_cgroup_tree_per_node {
119 struct rb_root rb_root;
120 struct rb_node *rb_rightmost;
121 spinlock_t lock;
122};
123
124struct mem_cgroup_tree {
125 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
126};
127
128static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
139struct mem_cgroup_event {
140
141
142
143 struct mem_cgroup *memcg;
144
145
146
147 struct eventfd_ctx *eventfd;
148
149
150
151 struct list_head list;
152
153
154
155
156
157 int (*register_event)(struct mem_cgroup *memcg,
158 struct eventfd_ctx *eventfd, const char *args);
159
160
161
162
163
164 void (*unregister_event)(struct mem_cgroup *memcg,
165 struct eventfd_ctx *eventfd);
166
167
168
169
170 poll_table pt;
171 wait_queue_head_t *wqh;
172 wait_queue_entry_t wait;
173 struct work_struct remove;
174};
175
176static void mem_cgroup_threshold(struct mem_cgroup *memcg);
177static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
178
179
180
181
182
183#define MOVE_ANON 0x1U
184#define MOVE_FILE 0x2U
185#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
186
187
188static struct move_charge_struct {
189 spinlock_t lock;
190 struct mm_struct *mm;
191 struct mem_cgroup *from;
192 struct mem_cgroup *to;
193 unsigned long flags;
194 unsigned long precharge;
195 unsigned long moved_charge;
196 unsigned long moved_swap;
197 struct task_struct *moving_task;
198 wait_queue_head_t waitq;
199} mc = {
200 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
201 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
202};
203
204
205
206
207
208#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
209#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
210
211enum charge_type {
212 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
213 MEM_CGROUP_CHARGE_TYPE_ANON,
214 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
215 MEM_CGROUP_CHARGE_TYPE_DROP,
216 NR_CHARGE_TYPE,
217};
218
219
220enum res_type {
221 _MEM,
222 _MEMSWAP,
223 _OOM_TYPE,
224 _KMEM,
225 _TCP,
226};
227
228#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
229#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
230#define MEMFILE_ATTR(val) ((val) & 0xffff)
231
232#define OOM_CONTROL (0)
233
234
235
236
237
238
239#define for_each_mem_cgroup_tree(iter, root) \
240 for (iter = mem_cgroup_iter(root, NULL, NULL); \
241 iter != NULL; \
242 iter = mem_cgroup_iter(root, iter, NULL))
243
244#define for_each_mem_cgroup(iter) \
245 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
246 iter != NULL; \
247 iter = mem_cgroup_iter(NULL, iter, NULL))
248
249static inline bool should_force_charge(void)
250{
251 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
252 (current->flags & PF_EXITING);
253}
254
255
256struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
257{
258 if (!memcg)
259 memcg = root_mem_cgroup;
260 return &memcg->vmpressure;
261}
262
263struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
264{
265 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
266}
267
268#ifdef CONFIG_MEMCG_KMEM
269
270
271
272
273
274
275
276
277
278
279
280static DEFINE_IDA(memcg_cache_ida);
281int memcg_nr_cache_ids;
282
283
284static DECLARE_RWSEM(memcg_cache_ids_sem);
285
286void memcg_get_cache_ids(void)
287{
288 down_read(&memcg_cache_ids_sem);
289}
290
291void memcg_put_cache_ids(void)
292{
293 up_read(&memcg_cache_ids_sem);
294}
295
296
297
298
299
300
301
302
303
304
305
306
307
308#define MEMCG_CACHES_MIN_SIZE 4
309#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
310
311
312
313
314
315
316
317DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
318EXPORT_SYMBOL(memcg_kmem_enabled_key);
319
320struct workqueue_struct *memcg_kmem_cache_wq;
321#endif
322
323static int memcg_shrinker_map_size;
324static DEFINE_MUTEX(memcg_shrinker_map_mutex);
325
326static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
327{
328 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
329}
330
331static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
332 int size, int old_size)
333{
334 struct memcg_shrinker_map *new, *old;
335 int nid;
336
337 lockdep_assert_held(&memcg_shrinker_map_mutex);
338
339 for_each_node(nid) {
340 old = rcu_dereference_protected(
341 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
343 if (!old)
344 return 0;
345
346 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
347 if (!new)
348 return -ENOMEM;

		/* Set all old bits, clear all new bits */
351 memset(new->map, (int)0xff, old_size);
352 memset((void *)new->map + old_size, 0, size - old_size);
353
354 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
355 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
356 }
357
358 return 0;
359}
360
361static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
362{
363 struct mem_cgroup_per_node *pn;
364 struct memcg_shrinker_map *map;
365 int nid;
366
367 if (mem_cgroup_is_root(memcg))
368 return;
369
370 for_each_node(nid) {
371 pn = mem_cgroup_nodeinfo(memcg, nid);
372 map = rcu_dereference_protected(pn->shrinker_map, true);
373 if (map)
374 kvfree(map);
375 rcu_assign_pointer(pn->shrinker_map, NULL);
376 }
377}
378
379static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
380{
381 struct memcg_shrinker_map *map;
382 int nid, size, ret = 0;
383
384 if (mem_cgroup_is_root(memcg))
385 return 0;
386
387 mutex_lock(&memcg_shrinker_map_mutex);
388 size = memcg_shrinker_map_size;
389 for_each_node(nid) {
390 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
391 if (!map) {
392 memcg_free_shrinker_maps(memcg);
393 ret = -ENOMEM;
394 break;
395 }
396 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
397 }
398 mutex_unlock(&memcg_shrinker_map_mutex);
399
400 return ret;
401}
402
403int memcg_expand_shrinker_maps(int new_id)
404{
405 int size, old_size, ret = 0;
406 struct mem_cgroup *memcg;
407
408 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
409 old_size = memcg_shrinker_map_size;
410 if (size <= old_size)
411 return 0;
412
413 mutex_lock(&memcg_shrinker_map_mutex);
414 if (!root_mem_cgroup)
415 goto unlock;
416
417 for_each_mem_cgroup(memcg) {
418 if (mem_cgroup_is_root(memcg))
419 continue;
420 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
421 if (ret)
422 goto unlock;
423 }
424unlock:
425 if (!ret)
426 memcg_shrinker_map_size = size;
427 mutex_unlock(&memcg_shrinker_map_mutex);
428 return ret;
429}
430
431void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
432{
433 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
434 struct memcg_shrinker_map *map;
435
436 rcu_read_lock();
437 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp_mb() in shrink_slab() */
439 smp_mb__before_atomic();
440 set_bit(shrinker_id, map->map);
441 rcu_read_unlock();
442 }
443}

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, the css of the memcg
 * associated with @page is returned.  The returned css remains
 * associated with @page until it is released.
 *
 * If memcg is bound to a legacy hierarchy, the css of root_mem_cgroup
 * is returned.
 */
456struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
457{
458 struct mem_cgroup *memcg;
459
460 memcg = page->mem_cgroup;
461
462 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
463 memcg = root_mem_cgroup;
464
465 return &memcg->css;
466}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is
 * charged to and return its inode number, or 0 if @page is not charged
 * to any cgroup.  It is safe to call this without holding a reference
 * to @page.
 *
 * Note that this function is inherently racy: nothing prevents the
 * cgroup inode from being torn down and reallocated right after it
 * returns, so it should only be used by callers that do not care
 * (such as procfs interfaces).
 */
481ino_t page_cgroup_ino(struct page *page)
482{
483 struct mem_cgroup *memcg;
484 unsigned long ino = 0;
485
486 rcu_read_lock();
487 if (PageSlab(page) && !PageTail(page))
488 memcg = memcg_from_slab_page(page);
489 else
490 memcg = READ_ONCE(page->mem_cgroup);
491 while (memcg && !(memcg->css.flags & CSS_ONLINE))
492 memcg = parent_mem_cgroup(memcg);
493 if (memcg)
494 ino = cgroup_ino(memcg->css.cgroup);
495 rcu_read_unlock();
496 return ino;
497}
498
499static struct mem_cgroup_per_node *
500mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
501{
502 int nid = page_to_nid(page);
503
504 return memcg->nodeinfo[nid];
505}
506
507static struct mem_cgroup_tree_per_node *
508soft_limit_tree_node(int nid)
509{
510 return soft_limit_tree.rb_tree_per_node[nid];
511}
512
513static struct mem_cgroup_tree_per_node *
514soft_limit_tree_from_page(struct page *page)
515{
516 int nid = page_to_nid(page);
517
518 return soft_limit_tree.rb_tree_per_node[nid];
519}
520
521static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
522 struct mem_cgroup_tree_per_node *mctz,
523 unsigned long new_usage_in_excess)
524{
525 struct rb_node **p = &mctz->rb_root.rb_node;
526 struct rb_node *parent = NULL;
527 struct mem_cgroup_per_node *mz_node;
528 bool rightmost = true;
529
530 if (mz->on_tree)
531 return;
532
533 mz->usage_in_excess = new_usage_in_excess;
534 if (!mz->usage_in_excess)
535 return;
536 while (*p) {
537 parent = *p;
538 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
539 tree_node);
540 if (mz->usage_in_excess < mz_node->usage_in_excess) {
541 p = &(*p)->rb_left;
542 rightmost = false;
543 }
544
545
546
547
548
549 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
550 p = &(*p)->rb_right;
551 }
552
553 if (rightmost)
554 mctz->rb_rightmost = &mz->tree_node;
555
556 rb_link_node(&mz->tree_node, parent, p);
557 rb_insert_color(&mz->tree_node, &mctz->rb_root);
558 mz->on_tree = true;
559}
560
561static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
562 struct mem_cgroup_tree_per_node *mctz)
563{
564 if (!mz->on_tree)
565 return;
566
567 if (&mz->tree_node == mctz->rb_rightmost)
568 mctz->rb_rightmost = rb_prev(&mz->tree_node);
569
570 rb_erase(&mz->tree_node, &mctz->rb_root);
571 mz->on_tree = false;
572}
573
574static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
575 struct mem_cgroup_tree_per_node *mctz)
576{
577 unsigned long flags;
578
579 spin_lock_irqsave(&mctz->lock, flags);
580 __mem_cgroup_remove_exceeded(mz, mctz);
581 spin_unlock_irqrestore(&mctz->lock, flags);
582}
583
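/* Number of pages by which memory usage exceeds the memcg's soft limit. */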
584static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
585{
586 unsigned long nr_pages = page_counter_read(&memcg->memory);
587 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
588 unsigned long excess = 0;
589
590 if (nr_pages > soft_limit)
591 excess = nr_pages - soft_limit;
592
593 return excess;
594}
595
596static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
597{
598 unsigned long excess;
599 struct mem_cgroup_per_node *mz;
600 struct mem_cgroup_tree_per_node *mctz;
601
602 mctz = soft_limit_tree_from_page(page);
603 if (!mctz)
604 return;
605
606
607
608
609 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
610 mz = mem_cgroup_page_nodeinfo(memcg, page);
611 excess = soft_limit_excess(memcg);
612
613
614
615
616 if (excess || mz->on_tree) {
617 unsigned long flags;
618
619 spin_lock_irqsave(&mctz->lock, flags);
620
621 if (mz->on_tree)
622 __mem_cgroup_remove_exceeded(mz, mctz);
623
624
625
626
627 __mem_cgroup_insert_exceeded(mz, mctz, excess);
628 spin_unlock_irqrestore(&mctz->lock, flags);
629 }
630 }
631}
632
633static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
634{
635 struct mem_cgroup_tree_per_node *mctz;
636 struct mem_cgroup_per_node *mz;
637 int nid;
638
639 for_each_node(nid) {
640 mz = mem_cgroup_nodeinfo(memcg, nid);
641 mctz = soft_limit_tree_node(nid);
642 if (mctz)
643 mem_cgroup_remove_exceeded(mz, mctz);
644 }
645}
646
647static struct mem_cgroup_per_node *
648__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
649{
650 struct mem_cgroup_per_node *mz;
651
652retry:
653 mz = NULL;
654 if (!mctz->rb_rightmost)
655 goto done;
656
657 mz = rb_entry(mctz->rb_rightmost,
658 struct mem_cgroup_per_node, tree_node);
659
660
661
662
663
664 __mem_cgroup_remove_exceeded(mz, mctz);
665 if (!soft_limit_excess(mz->memcg) ||
666 !css_tryget_online(&mz->memcg->css))
667 goto retry;
668done:
669 return mz;
670}
671
672static struct mem_cgroup_per_node *
673mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
674{
675 struct mem_cgroup_per_node *mz;
676
677 spin_lock_irq(&mctz->lock);
678 mz = __mem_cgroup_largest_soft_limit_node(mctz);
679 spin_unlock_irq(&mctz->lock);
680 return mz;
681}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
689void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
690{
691 long x;
692
693 if (mem_cgroup_disabled())
694 return;
695
696 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
697 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
698 struct mem_cgroup *mi;
699
700
701
702
703
704 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
705 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
706 atomic_long_add(x, &mi->vmstats[idx]);
707 x = 0;
708 }
709 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
710}
711
712static struct mem_cgroup_per_node *
713parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
714{
715 struct mem_cgroup *parent;
716
717 parent = parent_mem_cgroup(pn->memcg);
718 if (!parent)
719 return NULL;
720 return mem_cgroup_nodeinfo(parent, nid);
721}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup.  This
 * function updates all the counters that are affected by a change of
 * state at this level: per-node, per-cgroup and per-lruvec.
 */
733void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
734 int val)
735{
736 pg_data_t *pgdat = lruvec_pgdat(lruvec);
737 struct mem_cgroup_per_node *pn;
738 struct mem_cgroup *memcg;
739 long x;
740
741
742 __mod_node_page_state(pgdat, idx, val);
743
744 if (mem_cgroup_disabled())
745 return;
746
747 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
748 memcg = pn->memcg;
749
750
751 __mod_memcg_state(memcg, idx, val);
752
753
754 __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
755
756 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
757 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
758 struct mem_cgroup_per_node *pi;
759
760 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
761 atomic_long_add(x, &pi->lruvec_stat[idx]);
762 x = 0;
763 }
764 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
765}
766
767void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
768{
769 struct page *page = virt_to_head_page(p);
770 pg_data_t *pgdat = page_pgdat(page);
771 struct mem_cgroup *memcg;
772 struct lruvec *lruvec;
773
774 rcu_read_lock();
775 memcg = memcg_from_slab_page(page);
776
777
778 if (!memcg || memcg == root_mem_cgroup) {
779 __mod_node_page_state(pgdat, idx, val);
780 } else {
781 lruvec = mem_cgroup_lruvec(pgdat, memcg);
782 __mod_lruvec_state(lruvec, idx, val);
783 }
784 rcu_read_unlock();
785}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
793void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
794 unsigned long count)
795{
796 unsigned long x;
797
798 if (mem_cgroup_disabled())
799 return;
800
801 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
802 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
803 struct mem_cgroup *mi;
804
805
806
807
808
809 __this_cpu_add(memcg->vmstats_local->events[idx], x);
810 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
811 atomic_long_add(x, &mi->vmevents[idx]);
812 x = 0;
813 }
814 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
815}
816
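/* Hierarchical (subtree-wide) count of a VM event for @memcg. */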
817static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
818{
819 return atomic_long_read(&memcg->vmevents[event]);
820}
821
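/* CPU-summed local (non-hierarchical) count of a VM event for @memcg. */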
822static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
823{
824 long x = 0;
825 int cpu;
826
827 for_each_possible_cpu(cpu)
828 x += per_cpu(memcg->vmstats_local->events[event], cpu);
829 return x;
830}
831
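/*
 * Update the per-memcg page statistics and PGPGIN/PGPGOUT event counters
 * for a charge or uncharge of @nr_pages pages (@nr_pages is negative on
 * uncharge).
 */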
832static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
833 struct page *page,
834 bool compound, int nr_pages)
835{
836
837
838
839
840 if (PageAnon(page))
841 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
842 else {
843 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
844 if (PageSwapBacked(page))
845 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
846 }
847
848 if (compound) {
849 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
850 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
851 }
852
853
854 if (nr_pages > 0)
855 __count_memcg_events(memcg, PGPGIN, 1);
856 else {
857 __count_memcg_events(memcg, PGPGOUT, 1);
858 nr_pages = -nr_pages;
859 }
860
861 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
862}
863
864static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
865 enum mem_cgroup_events_target target)
866{
867 unsigned long val, next;
868
869 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
870 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
871
872 if ((long)(next - val) < 0) {
873 switch (target) {
874 case MEM_CGROUP_TARGET_THRESH:
875 next = val + THRESHOLDS_EVENTS_TARGET;
876 break;
877 case MEM_CGROUP_TARGET_SOFTLIMIT:
878 next = val + SOFTLIMIT_EVENTS_TARGET;
879 break;
880 case MEM_CGROUP_TARGET_NUMAINFO:
881 next = val + NUMAINFO_EVENTS_TARGET;
882 break;
883 default:
884 break;
885 }
886 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
887 return true;
888 }
889 return false;
890}
891
892
893
894
895
896static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
897{
898
899 if (unlikely(mem_cgroup_event_ratelimit(memcg,
900 MEM_CGROUP_TARGET_THRESH))) {
901 bool do_softlimit;
902 bool do_numainfo __maybe_unused;
903
904 do_softlimit = mem_cgroup_event_ratelimit(memcg,
905 MEM_CGROUP_TARGET_SOFTLIMIT);
906#if MAX_NUMNODES > 1
907 do_numainfo = mem_cgroup_event_ratelimit(memcg,
908 MEM_CGROUP_TARGET_NUMAINFO);
909#endif
910 mem_cgroup_threshold(memcg);
911 if (unlikely(do_softlimit))
912 mem_cgroup_update_tree(memcg, page);
913#if MAX_NUMNODES > 1
914 if (unlikely(do_numainfo))
915 atomic_inc(&memcg->numainfo_events);
916#endif
917 }
918}
919
920struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
921{
922
923
924
925
926
927 if (unlikely(!p))
928 return NULL;
929
930 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
931}
932EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm - obtain a reference on a mm_struct's memcg
 * @mm: mm from which the memcg should be extracted, may be NULL
 *
 * Obtains a reference on mm->owner's memcg and returns it if successful;
 * otherwise root_mem_cgroup is returned.
 */
942struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
943{
944 struct mem_cgroup *memcg;
945
946 if (mem_cgroup_disabled())
947 return NULL;
948
949 rcu_read_lock();
950 do {
951
952
953
954
955
956 if (unlikely(!mm))
957 memcg = root_mem_cgroup;
958 else {
959 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
960 if (unlikely(!memcg))
961 memcg = root_mem_cgroup;
962 }
963 } while (!css_tryget(&memcg->css));
964 rcu_read_unlock();
965 return memcg;
966}
967EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page - obtain a reference on a page's memcg
 * @page: page from which the memcg should be extracted
 *
 * Obtains a reference on page->mem_cgroup and returns it if successful;
 * otherwise root_mem_cgroup is returned.
 */
976struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
977{
978 struct mem_cgroup *memcg = page->mem_cgroup;
979
980 if (mem_cgroup_disabled())
981 return NULL;
982
983 rcu_read_lock();
984 if (!memcg || !css_tryget_online(&memcg->css))
985 memcg = root_mem_cgroup;
986 rcu_read_unlock();
987 return memcg;
988}
989EXPORT_SYMBOL(get_mem_cgroup_from_page);
990
991
992
993
994static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
995{
996 if (unlikely(current->active_memcg)) {
997 struct mem_cgroup *memcg = root_mem_cgroup;
998
999 rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
1001 memcg = current->active_memcg;
1002 rcu_read_unlock();
1003 return memcg;
1004 }
1005 return get_mem_cgroup_from_mm(current->mm);
1006}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
1025struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1026 struct mem_cgroup *prev,
1027 struct mem_cgroup_reclaim_cookie *reclaim)
1028{
1029 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1030 struct cgroup_subsys_state *css = NULL;
1031 struct mem_cgroup *memcg = NULL;
1032 struct mem_cgroup *pos = NULL;
1033
1034 if (mem_cgroup_disabled())
1035 return NULL;
1036
1037 if (!root)
1038 root = root_mem_cgroup;
1039
1040 if (prev && !reclaim)
1041 pos = prev;
1042
1043 if (!root->use_hierarchy && root != root_mem_cgroup) {
1044 if (prev)
1045 goto out;
1046 return root;
1047 }
1048
1049 rcu_read_lock();
1050
1051 if (reclaim) {
1052 struct mem_cgroup_per_node *mz;
1053
1054 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1055 iter = &mz->iter[reclaim->priority];
1056
1057 if (prev && reclaim->generation != iter->generation)
1058 goto out_unlock;
1059
1060 while (1) {
1061 pos = READ_ONCE(iter->position);
1062 if (!pos || css_tryget(&pos->css))
1063 break;
1064
1065
1066
1067
1068
1069
1070
1071
1072 (void)cmpxchg(&iter->position, pos, NULL);
1073 }
1074 }
1075
1076 if (pos)
1077 css = &pos->css;
1078
1079 for (;;) {
1080 css = css_next_descendant_pre(css, &root->css);
1081 if (!css) {
1082
1083
1084
1085
1086
1087
1088 if (!prev)
1089 continue;
1090 break;
1091 }
1092
1093
1094
1095
1096
1097
1098 memcg = mem_cgroup_from_css(css);
1099
1100 if (css == &root->css)
1101 break;
1102
1103 if (css_tryget(css))
1104 break;
1105
1106 memcg = NULL;
1107 }
1108
1109 if (reclaim) {
1110
1111
1112
1113
1114
1115 (void)cmpxchg(&iter->position, pos, memcg);
1116
1117 if (pos)
1118 css_put(&pos->css);
1119
1120 if (!memcg)
1121 iter->generation++;
1122 else if (!prev)
1123 reclaim->generation = iter->generation;
1124 }
1125
1126out_unlock:
1127 rcu_read_unlock();
1128out:
1129 if (prev && prev != root)
1130 css_put(&prev->css);
1131
1132 return memcg;
1133}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
1140void mem_cgroup_iter_break(struct mem_cgroup *root,
1141 struct mem_cgroup *prev)
1142{
1143 if (!root)
1144 root = root_mem_cgroup;
1145 if (prev && prev != root)
1146 css_put(&prev->css);
1147}
1148
1149static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1150 struct mem_cgroup *dead_memcg)
1151{
1152 struct mem_cgroup_reclaim_iter *iter;
1153 struct mem_cgroup_per_node *mz;
1154 int nid;
1155 int i;
1156
1157 for_each_node(nid) {
1158 mz = mem_cgroup_nodeinfo(from, nid);
1159 for (i = 0; i <= DEF_PRIORITY; i++) {
1160 iter = &mz->iter[i];
1161 cmpxchg(&iter->position,
1162 dead_memcg, NULL);
1163 }
1164 }
1165}
1166
1167static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1168{
1169 struct mem_cgroup *memcg = dead_memcg;
1170 struct mem_cgroup *last;
1171
1172 do {
1173 __invalidate_reclaim_iterators(memcg, dead_memcg);
1174 last = memcg;
1175 } while ((memcg = parent_mem_cgroup(memcg)));
1176
1177
1178
1179
1180
1181
1182
1183 if (last != root_mem_cgroup)
1184 __invalidate_reclaim_iterators(root_mem_cgroup,
1185 dead_memcg);
1186}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task.  If @fn returns a non-zero
 * value, the iteration is aborted and that value is returned; otherwise
 * all tasks are visited and 0 is returned.
 *
 * Must not be called for the root memory cgroup.
 */
1201int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1202 int (*fn)(struct task_struct *, void *), void *arg)
1203{
1204 struct mem_cgroup *iter;
1205 int ret = 0;
1206
1207 BUG_ON(memcg == root_mem_cgroup);
1208
1209 for_each_mem_cgroup_tree(iter, memcg) {
1210 struct css_task_iter it;
1211 struct task_struct *task;
1212
1213 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1214 while (!ret && (task = css_task_iter_next(&it)))
1215 ret = fn(task, arg);
1216 css_task_iter_end(&it);
1217 if (ret) {
1218 mem_cgroup_iter_break(memcg, iter);
1219 break;
1220 }
1221 }
1222 return ret;
1223}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
1234struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1235{
1236 struct mem_cgroup_per_node *mz;
1237 struct mem_cgroup *memcg;
1238 struct lruvec *lruvec;
1239
1240 if (mem_cgroup_disabled()) {
1241 lruvec = &pgdat->lruvec;
1242 goto out;
1243 }
1244
1245 memcg = page->mem_cgroup;
1246
1247
1248
1249
1250 if (!memcg)
1251 memcg = root_mem_cgroup;
1252
1253 mz = mem_cgroup_page_nodeinfo(memcg, page);
1254 lruvec = &mz->lruvec;
1255out:
1256
1257
1258
1259
1260
1261 if (unlikely(lruvec->pgdat != pgdat))
1262 lruvec->pgdat = pgdat;
1263 return lruvec;
1264}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
1277void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1278 int zid, int nr_pages)
1279{
1280 struct mem_cgroup_per_node *mz;
1281 unsigned long *lru_size;
1282 long size;
1283
1284 if (mem_cgroup_disabled())
1285 return;
1286
1287 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1288 lru_size = &mz->lru_zone_size[zid][lru];
1289
1290 if (nr_pages < 0)
1291 *lru_size += nr_pages;
1292
1293 size = *lru_size;
1294 if (WARN_ONCE(size < 0,
1295 "%s(%p, %d, %d): lru_size %ld\n",
1296 __func__, lruvec, lru, nr_pages, size)) {
1297 VM_BUG_ON(1);
1298 *lru_size = 0;
1299 }
1300
1301 if (nr_pages > 0)
1302 *lru_size += nr_pages;
1303}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
1312static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1313{
1314 unsigned long margin = 0;
1315 unsigned long count;
1316 unsigned long limit;
1317
1318 count = page_counter_read(&memcg->memory);
1319 limit = READ_ONCE(memcg->memory.max);
1320 if (count < limit)
1321 margin = limit - count;
1322
1323 if (do_memsw_account()) {
1324 count = page_counter_read(&memcg->memsw);
1325 limit = READ_ONCE(memcg->memsw.max);
1326 if (count <= limit)
1327 margin = min(margin, limit - count);
1328 else
1329 margin = 0;
1330 }
1331
1332 return margin;
1333}
1334
1335
1336
1337
1338
1339
1340
1341
1342static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1343{
1344 struct mem_cgroup *from;
1345 struct mem_cgroup *to;
1346 bool ret = false;
1347
1348
1349
1350
1351 spin_lock(&mc.lock);
1352 from = mc.from;
1353 to = mc.to;
1354 if (!from)
1355 goto unlock;
1356
1357 ret = mem_cgroup_is_descendant(from, memcg) ||
1358 mem_cgroup_is_descendant(to, memcg);
1359unlock:
1360 spin_unlock(&mc.lock);
1361 return ret;
1362}
1363
1364static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1365{
1366 if (mc.moving_task && current != mc.moving_task) {
1367 if (mem_cgroup_under_move(memcg)) {
1368 DEFINE_WAIT(wait);
1369 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1370
1371 if (mc.moving_task)
1372 schedule();
1373 finish_wait(&mc.waitq, &wait);
1374 return true;
1375 }
1376 }
1377 return false;
1378}
1379
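/*
 * Format the memory statistics of @memcg into a freshly allocated buffer.
 * Returns NULL on allocation failure; otherwise the caller must kfree()
 * the returned buffer.
 */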
1380static char *memory_stat_format(struct mem_cgroup *memcg)
1381{
1382 struct seq_buf s;
1383 int i;
1384
1385 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1386 if (!s.buffer)
1387 return NULL;
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400 seq_buf_printf(&s, "anon %llu\n",
1401 (u64)memcg_page_state(memcg, MEMCG_RSS) *
1402 PAGE_SIZE);
1403 seq_buf_printf(&s, "file %llu\n",
1404 (u64)memcg_page_state(memcg, MEMCG_CACHE) *
1405 PAGE_SIZE);
1406 seq_buf_printf(&s, "kernel_stack %llu\n",
1407 (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1408 1024);
1409 seq_buf_printf(&s, "slab %llu\n",
1410 (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1411 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1412 PAGE_SIZE);
1413 seq_buf_printf(&s, "sock %llu\n",
1414 (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1415 PAGE_SIZE);
1416
1417 seq_buf_printf(&s, "shmem %llu\n",
1418 (u64)memcg_page_state(memcg, NR_SHMEM) *
1419 PAGE_SIZE);
1420 seq_buf_printf(&s, "file_mapped %llu\n",
1421 (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1422 PAGE_SIZE);
1423 seq_buf_printf(&s, "file_dirty %llu\n",
1424 (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1425 PAGE_SIZE);
1426 seq_buf_printf(&s, "file_writeback %llu\n",
1427 (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1428 PAGE_SIZE);
1429
1430
1431
1432
1433
1434
1435
1436 seq_buf_printf(&s, "anon_thp %llu\n",
1437 (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
1438 PAGE_SIZE);
1439
1440 for (i = 0; i < NR_LRU_LISTS; i++)
1441 seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
1442 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1443 PAGE_SIZE);
1444
1445 seq_buf_printf(&s, "slab_reclaimable %llu\n",
1446 (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1447 PAGE_SIZE);
1448 seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1449 (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1450 PAGE_SIZE);
1451
1452
1453
1454 seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
1455 seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
1456
1457 seq_buf_printf(&s, "workingset_refault %lu\n",
1458 memcg_page_state(memcg, WORKINGSET_REFAULT));
1459 seq_buf_printf(&s, "workingset_activate %lu\n",
1460 memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1461 seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1462 memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1463
1464 seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
1465 seq_buf_printf(&s, "pgscan %lu\n",
1466 memcg_events(memcg, PGSCAN_KSWAPD) +
1467 memcg_events(memcg, PGSCAN_DIRECT));
1468 seq_buf_printf(&s, "pgsteal %lu\n",
1469 memcg_events(memcg, PGSTEAL_KSWAPD) +
1470 memcg_events(memcg, PGSTEAL_DIRECT));
1471 seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
1472 seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
1473 seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
1474 seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
1475
1476#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1477 seq_buf_printf(&s, "thp_fault_alloc %lu\n",
1478 memcg_events(memcg, THP_FAULT_ALLOC));
1479 seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
1480 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1481#endif
1482
1483
1484 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1485
1486 return s.buffer;
1487}
1488
1489#define K(x) ((x) << (PAGE_SHIFT-10))
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1500{
1501 rcu_read_lock();
1502
1503 if (memcg) {
1504 pr_cont(",oom_memcg=");
1505 pr_cont_cgroup_path(memcg->css.cgroup);
1506 } else
1507 pr_cont(",global_oom");
1508 if (p) {
1509 pr_cont(",task_memcg=");
1510 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1511 }
1512 rcu_read_unlock();
1513}
1514
1515
1516
1517
1518
1519
1520void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1521{
1522 char *buf;
1523
1524 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1525 K((u64)page_counter_read(&memcg->memory)),
1526 K((u64)memcg->memory.max), memcg->memory.failcnt);
1527 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1528 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1529 K((u64)page_counter_read(&memcg->swap)),
1530 K((u64)memcg->swap.max), memcg->swap.failcnt);
1531 else {
1532 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1533 K((u64)page_counter_read(&memcg->memsw)),
1534 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1535 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1536 K((u64)page_counter_read(&memcg->kmem)),
1537 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1538 }
1539
1540 pr_info("Memory cgroup stats for ");
1541 pr_cont_cgroup_path(memcg->css.cgroup);
1542 pr_cont(":");
1543 buf = memory_stat_format(memcg);
1544 if (!buf)
1545 return;
1546 pr_info("%s", buf);
1547 kfree(buf);
1548}
1549
1550
1551
1552
1553unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1554{
1555 unsigned long max;
1556
1557 max = memcg->memory.max;
1558 if (mem_cgroup_swappiness(memcg)) {
1559 unsigned long memsw_max;
1560 unsigned long swap_max;
1561
1562 memsw_max = memcg->memsw.max;
1563 swap_max = memcg->swap.max;
1564 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1565 max = min(max + swap_max, memsw_max);
1566 }
1567 return max;
1568}
1569
1570unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1571{
1572 return page_counter_read(&memcg->memory);
1573}
1574
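/*
 * Invoke the OOM killer on behalf of @memcg.  Returns true when the
 * charge that triggered it may proceed or be retried (a victim was
 * selected, or current is itself dying and should be forced through).
 */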
1575static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1576 int order)
1577{
1578 struct oom_control oc = {
1579 .zonelist = NULL,
1580 .nodemask = NULL,
1581 .memcg = memcg,
1582 .gfp_mask = gfp_mask,
1583 .order = order,
1584 };
1585 bool ret;
1586
1587 if (mutex_lock_killable(&oom_lock))
1588 return true;
1589
1590
1591
1592
1593 ret = should_force_charge() || out_of_memory(&oc);
1594 mutex_unlock(&oom_lock);
1595 return ret;
1596}
1597
1598#if MAX_NUMNODES > 1
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1611 int nid, bool noswap)
1612{
1613 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
1614
1615 if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
1616 lruvec_page_state(lruvec, NR_ACTIVE_FILE))
1617 return true;
1618 if (noswap || !total_swap_pages)
1619 return false;
1620 if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
1621 lruvec_page_state(lruvec, NR_ACTIVE_ANON))
1622 return true;
1623 return false;
1624
1625}
1626
1627
1628
1629
1630
1631
1632
1633static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1634{
1635 int nid;
1636
1637
1638
1639
1640 if (!atomic_read(&memcg->numainfo_events))
1641 return;
1642 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1643 return;
1644
1645
1646 memcg->scan_nodes = node_states[N_MEMORY];
1647
1648 for_each_node_mask(nid, node_states[N_MEMORY]) {
1649
1650 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1651 node_clear(nid, memcg->scan_nodes);
1652 }
1653
1654 atomic_set(&memcg->numainfo_events, 0);
1655 atomic_set(&memcg->numainfo_updating, 0);
1656}
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1671{
1672 int node;
1673
1674 mem_cgroup_may_update_nodemask(memcg);
1675 node = memcg->last_scanned_node;
1676
1677 node = next_node_in(node, memcg->scan_nodes);
1678
1679
1680
1681
1682
1683 if (unlikely(node == MAX_NUMNODES))
1684 node = numa_node_id();
1685
1686 memcg->last_scanned_node = node;
1687 return node;
1688}
1689#else
1690int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1691{
1692 return 0;
1693}
1694#endif
1695
1696static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1697 pg_data_t *pgdat,
1698 gfp_t gfp_mask,
1699 unsigned long *total_scanned)
1700{
1701 struct mem_cgroup *victim = NULL;
1702 int total = 0;
1703 int loop = 0;
1704 unsigned long excess;
1705 unsigned long nr_scanned;
1706 struct mem_cgroup_reclaim_cookie reclaim = {
1707 .pgdat = pgdat,
1708 .priority = 0,
1709 };
1710
1711 excess = soft_limit_excess(root_memcg);
1712
1713 while (1) {
1714 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1715 if (!victim) {
1716 loop++;
1717 if (loop >= 2) {
1718
1719
1720
1721
1722
1723 if (!total)
1724 break;
1725
1726
1727
1728
1729
1730
1731 if (total >= (excess >> 2) ||
1732 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1733 break;
1734 }
1735 continue;
1736 }
1737 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1738 pgdat, &nr_scanned);
1739 *total_scanned += nr_scanned;
1740 if (!soft_limit_excess(root_memcg))
1741 break;
1742 }
1743 mem_cgroup_iter_break(root_memcg, victim);
1744 return total;
1745}
1746
1747#ifdef CONFIG_LOCKDEP
1748static struct lockdep_map memcg_oom_lock_dep_map = {
1749 .name = "memcg_oom_lock",
1750};
1751#endif
1752
1753static DEFINE_SPINLOCK(memcg_oom_lock);
1754
1755
1756
1757
1758
1759static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1760{
1761 struct mem_cgroup *iter, *failed = NULL;
1762
1763 spin_lock(&memcg_oom_lock);
1764
1765 for_each_mem_cgroup_tree(iter, memcg) {
1766 if (iter->oom_lock) {
1767
1768
1769
1770
1771 failed = iter;
1772 mem_cgroup_iter_break(memcg, iter);
1773 break;
1774 } else
1775 iter->oom_lock = true;
1776 }
1777
1778 if (failed) {
1779
1780
1781
1782
1783 for_each_mem_cgroup_tree(iter, memcg) {
1784 if (iter == failed) {
1785 mem_cgroup_iter_break(memcg, iter);
1786 break;
1787 }
1788 iter->oom_lock = false;
1789 }
1790 } else
1791 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1792
1793 spin_unlock(&memcg_oom_lock);
1794
1795 return !failed;
1796}
1797
1798static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1799{
1800 struct mem_cgroup *iter;
1801
1802 spin_lock(&memcg_oom_lock);
1803 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1804 for_each_mem_cgroup_tree(iter, memcg)
1805 iter->oom_lock = false;
1806 spin_unlock(&memcg_oom_lock);
1807}
1808
1809static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1810{
1811 struct mem_cgroup *iter;
1812
1813 spin_lock(&memcg_oom_lock);
1814 for_each_mem_cgroup_tree(iter, memcg)
1815 iter->under_oom++;
1816 spin_unlock(&memcg_oom_lock);
1817}
1818
1819static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1820{
1821 struct mem_cgroup *iter;
1822
1823
1824
1825
1826
1827 spin_lock(&memcg_oom_lock);
1828 for_each_mem_cgroup_tree(iter, memcg)
1829 if (iter->under_oom > 0)
1830 iter->under_oom--;
1831 spin_unlock(&memcg_oom_lock);
1832}
1833
1834static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1835
1836struct oom_wait_info {
1837 struct mem_cgroup *memcg;
1838 wait_queue_entry_t wait;
1839};
1840
1841static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1842 unsigned mode, int sync, void *arg)
1843{
1844 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1845 struct mem_cgroup *oom_wait_memcg;
1846 struct oom_wait_info *oom_wait_info;
1847
1848 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1849 oom_wait_memcg = oom_wait_info->memcg;
1850
1851 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1852 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1853 return 0;
1854 return autoremove_wake_function(wait, mode, sync, arg);
1855}
1856
1857static void memcg_oom_recover(struct mem_cgroup *memcg)
1858{
1859
1860
1861
1862
1863
1864
1865
1866
1867 if (memcg && memcg->under_oom)
1868 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1869}
1870
1871enum oom_status {
1872 OOM_SUCCESS,
1873 OOM_FAILED,
1874 OOM_ASYNC,
1875 OOM_SKIPPED
1876};
1877
1878static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1879{
1880 enum oom_status ret;
1881 bool locked;
1882
1883 if (order > PAGE_ALLOC_COSTLY_ORDER)
1884 return OOM_SKIPPED;
1885
1886 memcg_memory_event(memcg, MEMCG_OOM);
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906 if (memcg->oom_kill_disable) {
1907 if (!current->in_user_fault)
1908 return OOM_SKIPPED;
1909 css_get(&memcg->css);
1910 current->memcg_in_oom = memcg;
1911 current->memcg_oom_gfp_mask = mask;
1912 current->memcg_oom_order = order;
1913
1914 return OOM_ASYNC;
1915 }
1916
1917 mem_cgroup_mark_under_oom(memcg);
1918
1919 locked = mem_cgroup_oom_trylock(memcg);
1920
1921 if (locked)
1922 mem_cgroup_oom_notify(memcg);
1923
1924 mem_cgroup_unmark_under_oom(memcg);
1925 if (mem_cgroup_out_of_memory(memcg, mask, order))
1926 ret = OOM_SUCCESS;
1927 else
1928 ret = OOM_FAILED;
1929
1930 if (locked)
1931 mem_cgroup_oom_unlock(memcg);
1932
1933 return ret;
1934}
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953bool mem_cgroup_oom_synchronize(bool handle)
1954{
1955 struct mem_cgroup *memcg = current->memcg_in_oom;
1956 struct oom_wait_info owait;
1957 bool locked;
1958
1959
1960 if (!memcg)
1961 return false;
1962
1963 if (!handle)
1964 goto cleanup;
1965
1966 owait.memcg = memcg;
1967 owait.wait.flags = 0;
1968 owait.wait.func = memcg_oom_wake_function;
1969 owait.wait.private = current;
1970 INIT_LIST_HEAD(&owait.wait.entry);
1971
1972 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1973 mem_cgroup_mark_under_oom(memcg);
1974
1975 locked = mem_cgroup_oom_trylock(memcg);
1976
1977 if (locked)
1978 mem_cgroup_oom_notify(memcg);
1979
1980 if (locked && !memcg->oom_kill_disable) {
1981 mem_cgroup_unmark_under_oom(memcg);
1982 finish_wait(&memcg_oom_waitq, &owait.wait);
1983 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1984 current->memcg_oom_order);
1985 } else {
1986 schedule();
1987 mem_cgroup_unmark_under_oom(memcg);
1988 finish_wait(&memcg_oom_waitq, &owait.wait);
1989 }
1990
1991 if (locked) {
1992 mem_cgroup_oom_unlock(memcg);
1993
1994
1995
1996
1997
1998 memcg_oom_recover(memcg);
1999 }
2000cleanup:
2001 current->memcg_in_oom = NULL;
2002 css_put(&memcg->css);
2003 return true;
2004}

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * The caller must drop the css reference of the returned non-NULL memcg.
 */
2016struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2017 struct mem_cgroup *oom_domain)
2018{
2019 struct mem_cgroup *oom_group = NULL;
2020 struct mem_cgroup *memcg;
2021
2022 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2023 return NULL;
2024
2025 if (!oom_domain)
2026 oom_domain = root_mem_cgroup;
2027
2028 rcu_read_lock();
2029
2030 memcg = mem_cgroup_from_task(victim);
2031 if (memcg == root_mem_cgroup)
2032 goto out;
2033
2034
2035
2036
2037
2038
2039 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2040 if (memcg->oom_group)
2041 oom_group = memcg;
2042
2043 if (memcg == oom_domain)
2044 break;
2045 }
2046
2047 if (oom_group)
2048 css_get(&oom_group->css);
2049out:
2050 rcu_read_unlock();
2051
2052 return oom_group;
2053}
2054
2055void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2056{
2057 pr_info("Tasks in ");
2058 pr_cont_cgroup_path(memcg->css.cgroup);
2059 pr_cont(" are going to be killed due to memory.oom.group set\n");
2060}

/**
 * lock_page_memcg - lock a page->mem_cgroup binding
 * @page: the page
 *
 * This function protects unlocked LRU pages from being moved to
 * another cgroup.
 *
 * It ensures the lifetime of the returned memcg.  The caller is
 * responsible for the lifetime of the page; __unlock_page_memcg()
 * is available when @page might get freed inside the locked section.
 */
2073struct mem_cgroup *lock_page_memcg(struct page *page)
2074{
2075 struct mem_cgroup *memcg;
2076 unsigned long flags;
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089 rcu_read_lock();
2090
2091 if (mem_cgroup_disabled())
2092 return NULL;
2093again:
2094 memcg = page->mem_cgroup;
2095 if (unlikely(!memcg))
2096 return NULL;
2097
2098 if (atomic_read(&memcg->moving_account) <= 0)
2099 return memcg;
2100
2101 spin_lock_irqsave(&memcg->move_lock, flags);
2102 if (memcg != page->mem_cgroup) {
2103 spin_unlock_irqrestore(&memcg->move_lock, flags);
2104 goto again;
2105 }
2106
2107
2108
2109
2110
2111
2112 memcg->move_lock_task = current;
2113 memcg->move_lock_flags = flags;
2114
2115 return memcg;
2116}
2117EXPORT_SYMBOL(lock_page_memcg);
2118
2119
2120
2121
2122
2123
2124
2125void __unlock_page_memcg(struct mem_cgroup *memcg)
2126{
2127 if (memcg && memcg->move_lock_task == current) {
2128 unsigned long flags = memcg->move_lock_flags;
2129
2130 memcg->move_lock_task = NULL;
2131 memcg->move_lock_flags = 0;
2132
2133 spin_unlock_irqrestore(&memcg->move_lock, flags);
2134 }
2135
2136 rcu_read_unlock();
2137}
2138
2139
2140
2141
2142
2143void unlock_page_memcg(struct page *page)
2144{
2145 __unlock_page_memcg(page->mem_cgroup);
2146}
2147EXPORT_SYMBOL(unlock_page_memcg);
2148
2149struct memcg_stock_pcp {
2150 struct mem_cgroup *cached;
2151 unsigned int nr_pages;
2152 struct work_struct work;
2153 unsigned long flags;
2154#define FLUSHING_CACHED_CHARGE 0
2155};
2156static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2157static DEFINE_MUTEX(percpu_charge_mutex);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * Returns %true if successful, %false otherwise.
 */
2170static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2171{
2172 struct memcg_stock_pcp *stock;
2173 unsigned long flags;
2174 bool ret = false;
2175
2176 if (nr_pages > MEMCG_CHARGE_BATCH)
2177 return ret;
2178
2179 local_irq_save(flags);
2180
2181 stock = this_cpu_ptr(&memcg_stock);
2182 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2183 stock->nr_pages -= nr_pages;
2184 ret = true;
2185 }
2186
2187 local_irq_restore(flags);
2188
2189 return ret;
2190}
2191
2192
2193
2194
2195static void drain_stock(struct memcg_stock_pcp *stock)
2196{
2197 struct mem_cgroup *old = stock->cached;
2198
2199 if (stock->nr_pages) {
2200 page_counter_uncharge(&old->memory, stock->nr_pages);
2201 if (do_memsw_account())
2202 page_counter_uncharge(&old->memsw, stock->nr_pages);
2203 css_put_many(&old->css, stock->nr_pages);
2204 stock->nr_pages = 0;
2205 }
2206 stock->cached = NULL;
2207}
2208
2209static void drain_local_stock(struct work_struct *dummy)
2210{
2211 struct memcg_stock_pcp *stock;
2212 unsigned long flags;
2213
2214
2215
2216
2217
2218 local_irq_save(flags);
2219
2220 stock = this_cpu_ptr(&memcg_stock);
2221 drain_stock(stock);
2222 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2223
2224 local_irq_restore(flags);
2225}
2226
2227
2228
2229
2230
2231static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2232{
2233 struct memcg_stock_pcp *stock;
2234 unsigned long flags;
2235
2236 local_irq_save(flags);
2237
2238 stock = this_cpu_ptr(&memcg_stock);
2239 if (stock->cached != memcg) {
2240 drain_stock(stock);
2241 stock->cached = memcg;
2242 }
2243 stock->nr_pages += nr_pages;
2244
2245 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2246 drain_stock(stock);
2247
2248 local_irq_restore(flags);
2249}
2250
2251
2252
2253
2254
2255static void drain_all_stock(struct mem_cgroup *root_memcg)
2256{
2257 int cpu, curcpu;
2258
2259
2260 if (!mutex_trylock(&percpu_charge_mutex))
2261 return;
2262
2263
2264
2265
2266
2267
2268 curcpu = get_cpu();
2269 for_each_online_cpu(cpu) {
2270 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2271 struct mem_cgroup *memcg;
2272 bool flush = false;
2273
2274 rcu_read_lock();
2275 memcg = stock->cached;
2276 if (memcg && stock->nr_pages &&
2277 mem_cgroup_is_descendant(memcg, root_memcg))
2278 flush = true;
2279 rcu_read_unlock();
2280
2281 if (flush &&
2282 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2283 if (cpu == curcpu)
2284 drain_local_stock(&stock->work);
2285 else
2286 schedule_work_on(cpu, &stock->work);
2287 }
2288 }
2289 put_cpu();
2290 mutex_unlock(&percpu_charge_mutex);
2291}
2292
2293static int memcg_hotplug_cpu_dead(unsigned int cpu)
2294{
2295 struct memcg_stock_pcp *stock;
2296 struct mem_cgroup *memcg, *mi;
2297
2298 stock = &per_cpu(memcg_stock, cpu);
2299 drain_stock(stock);
2300
2301 for_each_mem_cgroup(memcg) {
2302 int i;
2303
2304 for (i = 0; i < MEMCG_NR_STAT; i++) {
2305 int nid;
2306 long x;
2307
2308 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2309 if (x)
2310 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2311 atomic_long_add(x, &memcg->vmstats[i]);
2312
2313 if (i >= NR_VM_NODE_STAT_ITEMS)
2314 continue;
2315
2316 for_each_node(nid) {
2317 struct mem_cgroup_per_node *pn;
2318
2319 pn = mem_cgroup_nodeinfo(memcg, nid);
2320 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2321 if (x)
2322 do {
2323 atomic_long_add(x, &pn->lruvec_stat[i]);
2324 } while ((pn = parent_nodeinfo(pn, nid)));
2325 }
2326 }
2327
2328 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2329 long x;
2330
2331 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2332 if (x)
2333 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2334 atomic_long_add(x, &memcg->vmevents[i]);
2335 }
2336 }
2337
2338 return 0;
2339}
2340
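/*
 * Reclaim from @memcg and every ancestor whose usage is above its
 * memory.high limit.
 */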
2341static void reclaim_high(struct mem_cgroup *memcg,
2342 unsigned int nr_pages,
2343 gfp_t gfp_mask)
2344{
2345 do {
2346 if (page_counter_read(&memcg->memory) <= memcg->high)
2347 continue;
2348 memcg_memory_event(memcg, MEMCG_HIGH);
2349 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2350 } while ((memcg = parent_mem_cgroup(memcg)));
2351}
2352
2353static void high_work_func(struct work_struct *work)
2354{
2355 struct mem_cgroup *memcg;
2356
2357 memcg = container_of(work, struct mem_cgroup, high_work);
2358 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2359}
2360
2361
2362
2363
2364
2365
2366#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
2413
2414
2415
2416
2417
2418void mem_cgroup_handle_over_high(void)
2419{
2420 unsigned long usage, high, clamped_high;
2421 unsigned long pflags;
2422 unsigned long penalty_jiffies, overage;
2423 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2424 struct mem_cgroup *memcg;
2425
2426 if (likely(!nr_pages))
2427 return;
2428
2429 memcg = get_mem_cgroup_from_mm(current->mm);
2430 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2431 current->memcg_nr_pages_over_high = 0;
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445 usage = page_counter_read(&memcg->memory);
2446 high = READ_ONCE(memcg->high);
2447
2448 if (usage <= high)
2449 goto out;
2450
2451
2452
2453
2454
2455 clamped_high = max(high, 1UL);
2456
2457 overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
2458 clamped_high);
2459
2460 penalty_jiffies = ((u64)overage * overage * HZ)
2461 >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471 penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2472
2473
2474
2475
2476
2477
2478 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2479
2480
2481
2482
2483
2484
2485
2486 if (penalty_jiffies <= HZ / 100)
2487 goto out;
2488
2489
2490
2491
2492
2493
2494 psi_memstall_enter(&pflags);
2495 schedule_timeout_killable(penalty_jiffies);
2496 psi_memstall_leave(&pflags);
2497
2498out:
2499 css_put(&memcg->css);
2500}
2501
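/*
 * Charge @nr_pages pages to @memcg, reclaiming and invoking the OOM
 * killer as necessary.  Returns 0 on success; returns -ENOMEM only for
 * requests without __GFP_NOFAIL.
 */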
2502static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2503 unsigned int nr_pages)
2504{
2505 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2506 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2507 struct mem_cgroup *mem_over_limit;
2508 struct page_counter *counter;
2509 unsigned long nr_reclaimed;
2510 bool may_swap = true;
2511 bool drained = false;
2512 enum oom_status oom_status;
2513
2514 if (mem_cgroup_is_root(memcg))
2515 return 0;
2516retry:
2517 if (consume_stock(memcg, nr_pages))
2518 return 0;
2519
2520 if (!do_memsw_account() ||
2521 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2522 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2523 goto done_restock;
2524 if (do_memsw_account())
2525 page_counter_uncharge(&memcg->memsw, batch);
2526 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2527 } else {
2528 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2529 may_swap = false;
2530 }
2531
2532 if (batch > nr_pages) {
2533 batch = nr_pages;
2534 goto retry;
2535 }
2536
2537
2538
2539
2540
2541
2542
2543 if (gfp_mask & __GFP_ATOMIC)
2544 goto force;
2545
2546
2547
2548
2549
2550
2551
2552 if (unlikely(should_force_charge()))
2553 goto force;
2554
2555
2556
2557
2558
2559
2560
2561 if (unlikely(current->flags & PF_MEMALLOC))
2562 goto force;
2563
2564 if (unlikely(task_in_memcg_oom(current)))
2565 goto nomem;
2566
2567 if (!gfpflags_allow_blocking(gfp_mask))
2568 goto nomem;
2569
2570 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2571
2572 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2573 gfp_mask, may_swap);
2574
2575 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2576 goto retry;
2577
2578 if (!drained) {
2579 drain_all_stock(mem_over_limit);
2580 drained = true;
2581 goto retry;
2582 }
2583
2584 if (gfp_mask & __GFP_NORETRY)
2585 goto nomem;
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2596 goto retry;
2597
2598
2599
2600
2601 if (mem_cgroup_wait_acct_move(mem_over_limit))
2602 goto retry;
2603
2604 if (nr_retries--)
2605 goto retry;
2606
2607 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2608 goto nomem;
2609
2610 if (gfp_mask & __GFP_NOFAIL)
2611 goto force;
2612
2613 if (fatal_signal_pending(current))
2614 goto force;
2615
2616
2617
2618
2619
2620
2621 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2622 get_order(nr_pages * PAGE_SIZE));
2623 switch (oom_status) {
2624 case OOM_SUCCESS:
2625 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2626 goto retry;
2627 case OOM_FAILED:
2628 goto force;
2629 default:
2630 goto nomem;
2631 }
2632nomem:
2633 if (!(gfp_mask & __GFP_NOFAIL))
2634 return -ENOMEM;
2635force:
2636
2637
2638
2639
2640
2641 page_counter_charge(&memcg->memory, nr_pages);
2642 if (do_memsw_account())
2643 page_counter_charge(&memcg->memsw, nr_pages);
2644 css_get_many(&memcg->css, nr_pages);
2645
2646 return 0;
2647
2648done_restock:
2649 css_get_many(&memcg->css, batch);
2650 if (batch > nr_pages)
2651 refill_stock(memcg, batch - nr_pages);
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662 do {
2663 if (page_counter_read(&memcg->memory) > memcg->high) {
2664
2665 if (in_interrupt()) {
2666 schedule_work(&memcg->high_work);
2667 break;
2668 }
2669 current->memcg_nr_pages_over_high += batch;
2670 set_notify_resume(current);
2671 break;
2672 }
2673 } while ((memcg = parent_mem_cgroup(memcg)));
2674
2675 return 0;
2676}
2677
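/* Undo a successful try_charge() that was never committed to a page. */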
2678static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2679{
2680 if (mem_cgroup_is_root(memcg))
2681 return;
2682
2683 page_counter_uncharge(&memcg->memory, nr_pages);
2684 if (do_memsw_account())
2685 page_counter_uncharge(&memcg->memsw, nr_pages);
2686
2687 css_put_many(&memcg->css, nr_pages);
2688}
2689
2690static void lock_page_lru(struct page *page, int *isolated)
2691{
2692 pg_data_t *pgdat = page_pgdat(page);
2693
2694 spin_lock_irq(&pgdat->lru_lock);
2695 if (PageLRU(page)) {
2696 struct lruvec *lruvec;
2697
2698 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2699 ClearPageLRU(page);
2700 del_page_from_lru_list(page, lruvec, page_lru(page));
2701 *isolated = 1;
2702 } else
2703 *isolated = 0;
2704}
2705
2706static void unlock_page_lru(struct page *page, int isolated)
2707{
2708 pg_data_t *pgdat = page_pgdat(page);
2709
2710 if (isolated) {
2711 struct lruvec *lruvec;
2712
2713 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2714 VM_BUG_ON_PAGE(PageLRU(page), page);
2715 SetPageLRU(page);
2716 add_page_to_lru_list(page, lruvec, page_lru(page));
2717 }
2718 spin_unlock_irq(&pgdat->lru_lock);
2719}
2720
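/*
 * Bind the charged @memcg to @page.  With @lrucare the page may already
 * sit on an LRU list and is isolated while page->mem_cgroup is updated.
 */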
2721static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2722 bool lrucare)
2723{
2724 int isolated;
2725
2726 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2727
2728
2729
2730
2731
2732 if (lrucare)
2733 lock_page_lru(page, &isolated);
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749 page->mem_cgroup = memcg;
2750
2751 if (lrucare)
2752 unlock_page_lru(page, isolated);
2753}
2754
2755#ifdef CONFIG_MEMCG_KMEM
2756static int memcg_alloc_cache_id(void)
2757{
2758 int id, size;
2759 int err;
2760
2761 id = ida_simple_get(&memcg_cache_ida,
2762 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2763 if (id < 0)
2764 return id;
2765
2766 if (id < memcg_nr_cache_ids)
2767 return id;
2768
2769
2770
2771
2772
2773 down_write(&memcg_cache_ids_sem);
2774
2775 size = 2 * (id + 1);
2776 if (size < MEMCG_CACHES_MIN_SIZE)
2777 size = MEMCG_CACHES_MIN_SIZE;
2778 else if (size > MEMCG_CACHES_MAX_SIZE)
2779 size = MEMCG_CACHES_MAX_SIZE;
2780
2781 err = memcg_update_all_caches(size);
2782 if (!err)
2783 err = memcg_update_all_list_lrus(size);
2784 if (!err)
2785 memcg_nr_cache_ids = size;
2786
2787 up_write(&memcg_cache_ids_sem);
2788
2789 if (err) {
2790 ida_simple_remove(&memcg_cache_ida, id);
2791 return err;
2792 }
2793 return id;
2794}
2795
2796static void memcg_free_cache_id(int id)
2797{
2798 ida_simple_remove(&memcg_cache_ida, id);
2799}
2800
2801struct memcg_kmem_cache_create_work {
2802 struct mem_cgroup *memcg;
2803 struct kmem_cache *cachep;
2804 struct work_struct work;
2805};
2806
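/* Worker callback: create the per-memcg clone of a root kmem_cache. */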
2807static void memcg_kmem_cache_create_func(struct work_struct *w)
2808{
2809 struct memcg_kmem_cache_create_work *cw =
2810 container_of(w, struct memcg_kmem_cache_create_work, work);
2811 struct mem_cgroup *memcg = cw->memcg;
2812 struct kmem_cache *cachep = cw->cachep;
2813
2814 memcg_create_kmem_cache(memcg, cachep);
2815
2816 css_put(&memcg->css);
2817 kfree(cw);
2818}
2819
/*
 * Enqueue the creation of a per-memcg kmem_cache.
 */
2823static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2824 struct kmem_cache *cachep)
2825{
2826 struct memcg_kmem_cache_create_work *cw;
2827
2828 if (!css_tryget_online(&memcg->css))
2829 return;
2830
2831 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2832 if (!cw)
2833 return;
2834
2835 cw->memcg = memcg;
2836 cw->cachep = cachep;
2837 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2838
2839 queue_work(memcg_kmem_cache_wq, &cw->work);
2840}
2841
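/*
 * Kmem accounting is bypassed in interrupt context and for tasks without
 * an mm, such as kernel threads.
 */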
2842static inline bool memcg_kmem_bypass(void)
2843{
2844 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2845 return true;
2846 return false;
2847}
2848
/**
 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache
 *
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it, we
 * create it asynchronously in a workqueue and let the current allocation
 * go through with the original cache.
 *
 * This function takes a reference to the cache it returns to assure it
 * won't get destroyed while we are working with it.  Once the caller is
 * done with it, memcg_kmem_put_cache() must be called to release the
 * reference.
 */
2865struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2866{
2867 struct mem_cgroup *memcg;
2868 struct kmem_cache *memcg_cachep;
2869 struct memcg_cache_array *arr;
2870 int kmemcg_id;
2871
2872 VM_BUG_ON(!is_root_cache(cachep));
2873
2874 if (memcg_kmem_bypass())
2875 return cachep;
2876
2877 rcu_read_lock();
2878
2879 if (unlikely(current->active_memcg))
2880 memcg = current->active_memcg;
2881 else
2882 memcg = mem_cgroup_from_task(current);
2883
2884 if (!memcg || memcg == root_mem_cgroup)
2885 goto out_unlock;
2886
2887 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2888 if (kmemcg_id < 0)
2889 goto out_unlock;
2890
2891 arr = rcu_dereference(cachep->memcg_params.memcg_caches);
	/*
	 * Make sure we will access the up-to-date value.  The code updating
	 * memcg_caches issues a write barrier to match the data dependency
	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
	 */
2898 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
	/*
	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.
	 * This would guarantee that the allocation being performed
	 * already belongs in the new cache.
	 *
	 * However, there are some clashes that can arrive from locking.
	 * For instance, because we acquire the slab_mutex while doing
	 * memcg_create_kmem_cache, this means no further allocation
	 * could happen with the slab_mutex held.  So it's better to
	 * defer everything.
	 *
	 * If the memcg is dying or the cache is about to be released,
	 * don't bother creating new kmem_caches.  Because memcg_cachep
	 * is ZEROed as the first step of kmem offlining, we don't need
	 * percpu_ref_tryget_live() here; the css_tryget_online() check
	 * in memcg_schedule_kmem_cache_create() prevents creation of a
	 * new kmem_cache in that case.
	 */
2919 if (unlikely(!memcg_cachep))
2920 memcg_schedule_kmem_cache_create(memcg, cachep);
2921 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2922 cachep = memcg_cachep;
2923out_unlock:
2924 rcu_read_unlock();
2925 return cachep;
2926}
2927
/**
 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
 * @cachep: the cache returned by memcg_kmem_get_cache
 */
2932void memcg_kmem_put_cache(struct kmem_cache *cachep)
2933{
2934 if (!is_root_cache(cachep))
2935 percpu_ref_put(&cachep->memcg_params.refcnt);
2936}
2937
/**
 * __memcg_kmem_charge_memcg: charge a kmem page
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 * @memcg: memory cgroup to charge
 *
 * Returns 0 on success, an error code on failure.
 */
2947int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2948 struct mem_cgroup *memcg)
2949{
2950 unsigned int nr_pages = 1 << order;
2951 struct page_counter *counter;
2952 int ret;
2953
2954 ret = try_charge(memcg, gfp, nr_pages);
2955 if (ret)
2956 return ret;
2957
2958 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2959 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
		/*
		 * Enforce __GFP_NOFAIL allocation because callers are not
		 * prepared to see failures and likely do not have any failure
		 * handling code.
		 */
2966 if (gfp & __GFP_NOFAIL) {
2967 page_counter_charge(&memcg->kmem, nr_pages);
2968 return 0;
2969 }
2970 cancel_charge(memcg, nr_pages);
2971 return -ENOMEM;
2972 }
2973 return 0;
2974}
2975
/**
 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
2984int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2985{
2986 struct mem_cgroup *memcg;
2987 int ret = 0;
2988
2989 if (memcg_kmem_bypass())
2990 return 0;
2991
2992 memcg = get_mem_cgroup_from_current();
2993 if (!mem_cgroup_is_root(memcg)) {
2994 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2995 if (!ret) {
2996 page->mem_cgroup = memcg;
2997 __SetPageKmemcg(page);
2998 }
2999 }
3000 css_put(&memcg->css);
3001 return ret;
3002}
3003
/**
 * __memcg_kmem_uncharge_memcg: uncharge a kmem page
 * @memcg: memcg the uncharged page belongs to
 * @nr_pages: number of pages to uncharge
 */
3009void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
3010 unsigned int nr_pages)
3011{
3012 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3013 page_counter_uncharge(&memcg->kmem, nr_pages);
3014
3015 page_counter_uncharge(&memcg->memory, nr_pages);
3016 if (do_memsw_account())
3017 page_counter_uncharge(&memcg->memsw, nr_pages);
3018}
3019
/**
 * __memcg_kmem_uncharge: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
3024void __memcg_kmem_uncharge(struct page *page, int order)
3025{
3026 struct mem_cgroup *memcg = page->mem_cgroup;
3027 unsigned int nr_pages = 1 << order;
3028
3029 if (!memcg)
3030 return;
3031
3032 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3033 __memcg_kmem_uncharge_memcg(memcg, nr_pages);
3034 page->mem_cgroup = NULL;
	/* slab pages do not have PageKmemcg flag set */
3037 if (PageKmemcg(page))
3038 __ClearPageKmemcg(page);
3039
3040 css_put_many(&memcg->css, nr_pages);
3041}
3042#endif
3043
3044#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Because tail pages are not marked as "used", set it.  We're under
 * pgdat->lru_lock and migration entries setup in all page mappings.
 */
3050void mem_cgroup_split_huge_fixup(struct page *head)
3051{
3052 int i;
3053
3054 if (mem_cgroup_disabled())
3055 return;
3056
3057 for (i = 1; i < HPAGE_PMD_NR; i++)
3058 head[i].mem_cgroup = head->mem_cgroup;
3059
3060 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3061}
3062#endif
3063
3064#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge()
 * about this swap entry.
 */
3079static int mem_cgroup_move_swap_account(swp_entry_t entry,
3080 struct mem_cgroup *from, struct mem_cgroup *to)
3081{
3082 unsigned short old_id, new_id;
3083
3084 old_id = mem_cgroup_id(from);
3085 new_id = mem_cgroup_id(to);
3086
3087 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3088 mod_memcg_state(from, MEMCG_SWAP, -1);
3089 mod_memcg_state(to, MEMCG_SWAP, 1);
3090 return 0;
3091 }
3092 return -EINVAL;
3093}
3094#else
3095static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3096 struct mem_cgroup *from, struct mem_cgroup *to)
3097{
3098 return -EINVAL;
3099}
3100#endif
3101
3102static DEFINE_MUTEX(memcg_max_mutex);
3103
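/*
 * Resize the memory or memsw limit of @memcg, reclaiming pages as needed
 * until the new limit can be installed or no more progress can be made.
 */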
3104static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3105 unsigned long max, bool memsw)
3106{
3107 bool enlarge = false;
3108 bool drained = false;
3109 int ret;
3110 bool limits_invariant;
3111 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3112
3113 do {
3114 if (signal_pending(current)) {
3115 ret = -EINTR;
3116 break;
3117 }
3118
3119 mutex_lock(&memcg_max_mutex);
3120
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
3124 limits_invariant = memsw ? max >= memcg->memory.max :
3125 max <= memcg->memsw.max;
3126 if (!limits_invariant) {
3127 mutex_unlock(&memcg_max_mutex);
3128 ret = -EINVAL;
3129 break;
3130 }
3131 if (max > counter->max)
3132 enlarge = true;
3133 ret = page_counter_set_max(counter, max);
3134 mutex_unlock(&memcg_max_mutex);
3135
3136 if (!ret)
3137 break;
3138
3139 if (!drained) {
3140 drain_all_stock(memcg);
3141 drained = true;
3142 continue;
3143 }
3144
3145 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3146 GFP_KERNEL, !memsw)) {
3147 ret = -EBUSY;
3148 break;
3149 }
3150 } while (true);
3151
3152 if (!ret && enlarge)
3153 memcg_oom_recover(memcg);
3154
3155 return ret;
3156}
3157
3158unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3159 gfp_t gfp_mask,
3160 unsigned long *total_scanned)
3161{
3162 unsigned long nr_reclaimed = 0;
3163 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3164 unsigned long reclaimed;
3165 int loop = 0;
3166 struct mem_cgroup_tree_per_node *mctz;
3167 unsigned long excess;
3168 unsigned long nr_scanned;
3169
3170 if (order > 0)
3171 return 0;
3172
3173 mctz = soft_limit_tree_node(pgdat->node_id);
3174
	/*
	 * Do not even bother to check the largest node if the root
	 * is empty.  Do it lockless to prevent lock bouncing.  Might
	 * be a little pessimistic but shouldn't matter in practice.
	 */
3180 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3181 return 0;
3182
	/*
	 * This loop can run a while, specially if mem_cgroups keep
	 * exceeding their soft limit and putting the system under
	 * pressure.
	 */
3188 do {
3189 if (next_mz)
3190 mz = next_mz;
3191 else
3192 mz = mem_cgroup_largest_soft_limit_node(mctz);
3193 if (!mz)
3194 break;
3195
3196 nr_scanned = 0;
3197 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3198 gfp_mask, &nr_scanned);
3199 nr_reclaimed += reclaimed;
3200 *total_scanned += nr_scanned;
3201 spin_lock_irq(&mctz->lock);
3202 __mem_cgroup_remove_exceeded(mz, mctz);
		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup.
		 */
3208 next_mz = NULL;
3209 if (!reclaimed)
3210 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3211
3212 excess = soft_limit_excess(mz->memcg);
		/*
		 * Put the memcg back on the tree with its current excess:
		 * even when reclaim made no progress here, the recorded
		 * excess may have changed and a later pass can still
		 * succeed.
		 */
3222 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3223 spin_unlock_irq(&mctz->lock);
3224 css_put(&mz->memcg->css);
3225 loop++;
3226
3227
3228
3229
3230
3231 if (!nr_reclaimed &&
3232 (next_mz == NULL ||
3233 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3234 break;
3235 } while (!nr_reclaimed);
3236 if (next_mz)
3237 css_put(&next_mz->memcg->css);
3238 return nr_reclaimed;
3239}
3240
/*
 * Test whether @memcg has children, dead or alive.  Note that this
 * function doesn't care whether they're online or not.
 */
3247static inline bool memcg_has_children(struct mem_cgroup *memcg)
3248{
3249 bool ret;
3250
3251 rcu_read_lock();
3252 ret = css_next_child(NULL, &memcg->css);
3253 rcu_read_unlock();
3254 return ret;
3255}
3256
/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
3262static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3263{
3264 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3265
3266
3267 lru_add_drain_all();
3268
3269 drain_all_stock(memcg);
3270
3271
3272 while (nr_retries && page_counter_read(&memcg->memory)) {
3273 int progress;
3274
3275 if (signal_pending(current))
3276 return -EINTR;
3277
3278 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3279 GFP_KERNEL, true);
3280 if (!progress) {
3281 nr_retries--;
3282
3283 congestion_wait(BLK_RW_ASYNC, HZ/10);
3284 }
3285
3286 }
3287
3288 return 0;
3289}
3290
3291static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3292 char *buf, size_t nbytes,
3293 loff_t off)
3294{
3295 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3296
3297 if (mem_cgroup_is_root(memcg))
3298 return -EINVAL;
3299 return mem_cgroup_force_empty(memcg) ?: nbytes;
3300}
3301
3302static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3303 struct cftype *cft)
3304{
3305 return mem_cgroup_from_css(css)->use_hierarchy;
3306}
3307
3308static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3309 struct cftype *cft, u64 val)
3310{
3311 int retval = 0;
3312 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3313 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3314
3315 if (memcg->use_hierarchy == val)
3316 return 0;
3317
	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees.  If it is unset, then the change can occur,
	 * provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, we allow value to be set
	 * if there are no children.
	 */
3326 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3327 (val == 1 || val == 0)) {
3328 if (!memcg_has_children(memcg))
3329 memcg->use_hierarchy = val;
3330 else
3331 retval = -EBUSY;
3332 } else
3333 retval = -EINVAL;
3334
3335 return retval;
3336}
3337
3338static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3339{
3340 unsigned long val;
3341
3342 if (mem_cgroup_is_root(memcg)) {
3343 val = memcg_page_state(memcg, MEMCG_CACHE) +
3344 memcg_page_state(memcg, MEMCG_RSS);
3345 if (swap)
3346 val += memcg_page_state(memcg, MEMCG_SWAP);
3347 } else {
3348 if (!swap)
3349 val = page_counter_read(&memcg->memory);
3350 else
3351 val = page_counter_read(&memcg->memsw);
3352 }
3353 return val;
3354}
3355
3356enum {
3357 RES_USAGE,
3358 RES_LIMIT,
3359 RES_MAX_USAGE,
3360 RES_FAILCNT,
3361 RES_SOFT_LIMIT,
3362};
3363
3364static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3365 struct cftype *cft)
3366{
3367 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3368 struct page_counter *counter;
3369
3370 switch (MEMFILE_TYPE(cft->private)) {
3371 case _MEM:
3372 counter = &memcg->memory;
3373 break;
3374 case _MEMSWAP:
3375 counter = &memcg->memsw;
3376 break;
3377 case _KMEM:
3378 counter = &memcg->kmem;
3379 break;
3380 case _TCP:
3381 counter = &memcg->tcpmem;
3382 break;
3383 default:
3384 BUG();
3385 }
3386
3387 switch (MEMFILE_ATTR(cft->private)) {
3388 case RES_USAGE:
3389 if (counter == &memcg->memory)
3390 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3391 if (counter == &memcg->memsw)
3392 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3393 return (u64)page_counter_read(counter) * PAGE_SIZE;
3394 case RES_LIMIT:
3395 return (u64)counter->max * PAGE_SIZE;
3396 case RES_MAX_USAGE:
3397 return (u64)counter->watermark * PAGE_SIZE;
3398 case RES_FAILCNT:
3399 return counter->failcnt;
3400 case RES_SOFT_LIMIT:
3401 return (u64)memcg->soft_limit * PAGE_SIZE;
3402 default:
3403 BUG();
3404 }
3405}
3406
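/*
 * Fold the remaining percpu vmstat counters of @memcg into its atomic
 * counters and those of all its ancestors.  With @slab_only set, only the
 * slab counters are flushed.
 */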
3407static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
3408{
3409 unsigned long stat[MEMCG_NR_STAT];
3410 struct mem_cgroup *mi;
3411 int node, cpu, i;
3412 int min_idx, max_idx;
3413
3414 if (slab_only) {
3415 min_idx = NR_SLAB_RECLAIMABLE;
3416 max_idx = NR_SLAB_UNRECLAIMABLE;
3417 } else {
3418 min_idx = 0;
3419 max_idx = MEMCG_NR_STAT;
3420 }
3421
3422 for (i = min_idx; i < max_idx; i++)
3423 stat[i] = 0;
3424
3425 for_each_online_cpu(cpu)
3426 for (i = min_idx; i < max_idx; i++)
3427 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3428
3429 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3430 for (i = min_idx; i < max_idx; i++)
3431 atomic_long_add(stat[i], &mi->vmstats[i]);
3432
3433 if (!slab_only)
3434 max_idx = NR_VM_NODE_STAT_ITEMS;
3435
3436 for_each_node(node) {
3437 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3438 struct mem_cgroup_per_node *pi;
3439
3440 for (i = min_idx; i < max_idx; i++)
3441 stat[i] = 0;
3442
3443 for_each_online_cpu(cpu)
3444 for (i = min_idx; i < max_idx; i++)
3445 stat[i] += per_cpu(
3446 pn->lruvec_stat_cpu->count[i], cpu);
3447
3448 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3449 for (i = min_idx; i < max_idx; i++)
3450 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3451 }
3452}
3453
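/* Fold the percpu vm event counters into @memcg and all of its ancestors. */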
3454static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3455{
3456 unsigned long events[NR_VM_EVENT_ITEMS];
3457 struct mem_cgroup *mi;
3458 int cpu, i;
3459
3460 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3461 events[i] = 0;
3462
3463 for_each_online_cpu(cpu)
3464 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3465 events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3466 cpu);
3467
3468 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3469 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3470 atomic_long_add(events[i], &mi->vmevents[i]);
3471}
3472
3473#ifdef CONFIG_MEMCG_KMEM
3474static int memcg_online_kmem(struct mem_cgroup *memcg)
3475{
3476 int memcg_id;
3477
3478 if (cgroup_memory_nokmem)
3479 return 0;
3480
3481 BUG_ON(memcg->kmemcg_id >= 0);
3482 BUG_ON(memcg->kmem_state);
3483
3484 memcg_id = memcg_alloc_cache_id();
3485 if (memcg_id < 0)
3486 return memcg_id;
3487
3488 static_branch_inc(&memcg_kmem_enabled_key);
	/*
	 * The static branch is enabled before kmemcg_id is published, so
	 * that a charge path which observes a valid kmemcg_id also sees
	 * kmem accounting enabled.
	 */
3495 memcg->kmemcg_id = memcg_id;
3496 memcg->kmem_state = KMEM_ONLINE;
3497 INIT_LIST_HEAD(&memcg->kmem_caches);
3498
3499 return 0;
3500}
3501
3502static void memcg_offline_kmem(struct mem_cgroup *memcg)
3503{
3504 struct cgroup_subsys_state *css;
3505 struct mem_cgroup *parent, *child;
3506 int kmemcg_id;
3507
3508 if (memcg->kmem_state != KMEM_ONLINE)
3509 return;
3510
3511
3512
3513
3514
3515
3516 memcg->kmem_state = KMEM_ALLOCATED;
3517
3518 parent = parent_mem_cgroup(memcg);
3519 if (!parent)
3520 parent = root_mem_cgroup;
3521
3522
3523
3524
3525
3526
3527
3528 memcg_deactivate_kmem_caches(memcg, parent);
3529 memcg_flush_percpu_vmstats(memcg, true);
3530
3531 kmemcg_id = memcg->kmemcg_id;
3532 BUG_ON(kmemcg_id < 0);
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542 rcu_read_lock();
3543 css_for_each_descendant_pre(css, &memcg->css) {
3544 child = mem_cgroup_from_css(css);
3545 BUG_ON(child->kmemcg_id != kmemcg_id);
3546 child->kmemcg_id = parent->kmemcg_id;
3547 if (!memcg->use_hierarchy)
3548 break;
3549 }
3550 rcu_read_unlock();
3551
3552 memcg_drain_all_list_lrus(kmemcg_id, parent);
3553
3554 memcg_free_cache_id(kmemcg_id);
3555}
3556
3557static void memcg_free_kmem(struct mem_cgroup *memcg)
3558{
3559
3560 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3561 memcg_offline_kmem(memcg);
3562
3563 if (memcg->kmem_state == KMEM_ALLOCATED) {
3564 WARN_ON(!list_empty(&memcg->kmem_caches));
3565 static_branch_dec(&memcg_kmem_enabled_key);
3566 }
3567}
3568#else
3569static int memcg_online_kmem(struct mem_cgroup *memcg)
3570{
3571 return 0;
3572}
3573static void memcg_offline_kmem(struct mem_cgroup *memcg)
3574{
3575}
3576static void memcg_free_kmem(struct mem_cgroup *memcg)
3577{
3578}
3579#endif
3580
3581static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3582 unsigned long max)
3583{
3584 int ret;
3585
3586 mutex_lock(&memcg_max_mutex);
3587 ret = page_counter_set_max(&memcg->kmem, max);
3588 mutex_unlock(&memcg_max_mutex);
3589 return ret;
3590}
3591
3592static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3593{
3594 int ret;
3595
3596 mutex_lock(&memcg_max_mutex);
3597
3598 ret = page_counter_set_max(&memcg->tcpmem, max);
3599 if (ret)
3600 goto out;
3601
3602 if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update.  This is what guarantees that the socket activation
		 * function is the last one to run.  See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order.  If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value change, the code to process it is not
		 * patched in yet.
		 */
3619 static_branch_inc(&memcg_sockets_enabled_key);
3620 memcg->tcpmem_active = true;
3621 }
3622out:
3623 mutex_unlock(&memcg_max_mutex);
3624 return ret;
3625}
3626
/*
 * Common write handler for the limit_in_bytes, soft_limit_in_bytes and the
 * kmem/tcp limit control files (RES_LIMIT and RES_SOFT_LIMIT).
 */
3631static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3632 char *buf, size_t nbytes, loff_t off)
3633{
3634 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3635 unsigned long nr_pages;
3636 int ret;
3637
3638 buf = strstrip(buf);
3639 ret = page_counter_memparse(buf, "-1", &nr_pages);
3640 if (ret)
3641 return ret;
3642
3643 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3644 case RES_LIMIT:
3645 if (mem_cgroup_is_root(memcg)) {
3646 ret = -EINVAL;
3647 break;
3648 }
3649 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3650 case _MEM:
3651 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3652 break;
3653 case _MEMSWAP:
3654 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3655 break;
3656 case _KMEM:
3657 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3658 "Please report your usecase to linux-mm@kvack.org if you "
3659 "depend on this functionality.\n");
3660 ret = memcg_update_kmem_max(memcg, nr_pages);
3661 break;
3662 case _TCP:
3663 ret = memcg_update_tcp_max(memcg, nr_pages);
3664 break;
3665 }
3666 break;
3667 case RES_SOFT_LIMIT:
3668 memcg->soft_limit = nr_pages;
3669 ret = 0;
3670 break;
3671 }
3672 return ret ?: nbytes;
3673}
3674
3675static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3676 size_t nbytes, loff_t off)
3677{
3678 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3679 struct page_counter *counter;
3680
3681 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3682 case _MEM:
3683 counter = &memcg->memory;
3684 break;
3685 case _MEMSWAP:
3686 counter = &memcg->memsw;
3687 break;
3688 case _KMEM:
3689 counter = &memcg->kmem;
3690 break;
3691 case _TCP:
3692 counter = &memcg->tcpmem;
3693 break;
3694 default:
3695 BUG();
3696 }
3697
3698 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3699 case RES_MAX_USAGE:
3700 page_counter_reset_watermark(counter);
3701 break;
3702 case RES_FAILCNT:
3703 counter->failcnt = 0;
3704 break;
3705 default:
3706 BUG();
3707 }
3708
3709 return nbytes;
3710}
3711
3712static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3713 struct cftype *cft)
3714{
3715 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3716}
3717
3718#ifdef CONFIG_MMU
3719static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3720 struct cftype *cft, u64 val)
3721{
3722 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3723
3724 if (val & ~MOVE_MASK)
3725 return -EINVAL;
3726
	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once in the beginning of the process, and then carry
	 * on with stale data.  This means that changes to this value will only
	 * affect task migrations starting after the change.
	 */
3733 memcg->move_charge_at_immigrate = val;
3734 return 0;
3735}
3736#else
3737static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3738 struct cftype *cft, u64 val)
3739{
3740 return -ENOSYS;
3741}
3742#endif
3743
3744#ifdef CONFIG_NUMA
3745
3746#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3747#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3748#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3749
3750static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3751 int nid, unsigned int lru_mask)
3752{
3753 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
3754 unsigned long nr = 0;
3755 enum lru_list lru;
3756
3757 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3758
3759 for_each_lru(lru) {
3760 if (!(BIT(lru) & lru_mask))
3761 continue;
3762 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3763 }
3764 return nr;
3765}
3766
3767static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3768 unsigned int lru_mask)
3769{
3770 unsigned long nr = 0;
3771 enum lru_list lru;
3772
3773 for_each_lru(lru) {
3774 if (!(BIT(lru) & lru_mask))
3775 continue;
3776 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3777 }
3778 return nr;
3779}
3780
3781static int memcg_numa_stat_show(struct seq_file *m, void *v)
3782{
3783 struct numa_stat {
3784 const char *name;
3785 unsigned int lru_mask;
3786 };
3787
3788 static const struct numa_stat stats[] = {
3789 { "total", LRU_ALL },
3790 { "file", LRU_ALL_FILE },
3791 { "anon", LRU_ALL_ANON },
3792 { "unevictable", BIT(LRU_UNEVICTABLE) },
3793 };
3794 const struct numa_stat *stat;
3795 int nid;
3796 unsigned long nr;
3797 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3798
3799 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3800 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3801 seq_printf(m, "%s=%lu", stat->name, nr);
3802 for_each_node_state(nid, N_MEMORY) {
3803 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3804 stat->lru_mask);
3805 seq_printf(m, " N%d=%lu", nid, nr);
3806 }
3807 seq_putc(m, '\n');
3808 }
3809
3810 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3811 struct mem_cgroup *iter;
3812
3813 nr = 0;
3814 for_each_mem_cgroup_tree(iter, memcg)
3815 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3816 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3817 for_each_node_state(nid, N_MEMORY) {
3818 nr = 0;
3819 for_each_mem_cgroup_tree(iter, memcg)
3820 nr += mem_cgroup_node_nr_lru_pages(
3821 iter, nid, stat->lru_mask);
3822 seq_printf(m, " N%d=%lu", nid, nr);
3823 }
3824 seq_putc(m, '\n');
3825 }
3826
3827 return 0;
3828}
3829#endif
3830
3831static const unsigned int memcg1_stats[] = {
3832 MEMCG_CACHE,
3833 MEMCG_RSS,
3834 MEMCG_RSS_HUGE,
3835 NR_SHMEM,
3836 NR_FILE_MAPPED,
3837 NR_FILE_DIRTY,
3838 NR_WRITEBACK,
3839 MEMCG_SWAP,
3840};
3841
3842static const char *const memcg1_stat_names[] = {
3843 "cache",
3844 "rss",
3845 "rss_huge",
3846 "shmem",
3847 "mapped_file",
3848 "dirty",
3849 "writeback",
3850 "swap",
3851};
3852
3853
3854static const unsigned int memcg1_events[] = {
3855 PGPGIN,
3856 PGPGOUT,
3857 PGFAULT,
3858 PGMAJFAULT,
3859};
3860
3861static const char *const memcg1_event_names[] = {
3862 "pgpgin",
3863 "pgpgout",
3864 "pgfault",
3865 "pgmajfault",
3866};
3867
3868static int memcg_stat_show(struct seq_file *m, void *v)
3869{
3870 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3871 unsigned long memory, memsw;
3872 struct mem_cgroup *mi;
3873 unsigned int i;
3874
3875 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3876 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3877
3878 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3879 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3880 continue;
3881 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3882 memcg_page_state_local(memcg, memcg1_stats[i]) *
3883 PAGE_SIZE);
3884 }
3885
3886 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3887 seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3888 memcg_events_local(memcg, memcg1_events[i]));
3889
3890 for (i = 0; i < NR_LRU_LISTS; i++)
3891 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3892 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3893 PAGE_SIZE);
3894
3895
3896 memory = memsw = PAGE_COUNTER_MAX;
3897 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3898 memory = min(memory, mi->memory.max);
3899 memsw = min(memsw, mi->memsw.max);
3900 }
3901 seq_printf(m, "hierarchical_memory_limit %llu\n",
3902 (u64)memory * PAGE_SIZE);
3903 if (do_memsw_account())
3904 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3905 (u64)memsw * PAGE_SIZE);
3906
3907 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3908 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3909 continue;
3910 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3911 (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3912 PAGE_SIZE);
3913 }
3914
3915 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3916 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3917 (u64)memcg_events(memcg, memcg1_events[i]));
3918
3919 for (i = 0; i < NR_LRU_LISTS; i++)
3920 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3921 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3922 PAGE_SIZE);
3923
3924#ifdef CONFIG_DEBUG_VM
3925 {
3926 pg_data_t *pgdat;
3927 struct mem_cgroup_per_node *mz;
3928 struct zone_reclaim_stat *rstat;
3929 unsigned long recent_rotated[2] = {0, 0};
3930 unsigned long recent_scanned[2] = {0, 0};
3931
3932 for_each_online_pgdat(pgdat) {
3933 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3934 rstat = &mz->lruvec.reclaim_stat;
3935
3936 recent_rotated[0] += rstat->recent_rotated[0];
3937 recent_rotated[1] += rstat->recent_rotated[1];
3938 recent_scanned[0] += rstat->recent_scanned[0];
3939 recent_scanned[1] += rstat->recent_scanned[1];
3940 }
3941 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3942 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3943 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3944 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3945 }
3946#endif
3947
3948 return 0;
3949}
3950
3951static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3952 struct cftype *cft)
3953{
3954 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3955
3956 return mem_cgroup_swappiness(memcg);
3957}
3958
3959static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3960 struct cftype *cft, u64 val)
3961{
3962 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3963
3964 if (val > 100)
3965 return -EINVAL;
3966
3967 if (css->parent)
3968 memcg->swappiness = val;
3969 else
3970 vm_swappiness = val;
3971
3972 return 0;
3973}
3974
3975static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3976{
3977 struct mem_cgroup_threshold_ary *t;
3978 unsigned long usage;
3979 int i;
3980
3981 rcu_read_lock();
3982 if (!swap)
3983 t = rcu_dereference(memcg->thresholds.primary);
3984 else
3985 t = rcu_dereference(memcg->memsw_thresholds.primary);
3986
3987 if (!t)
3988 goto unlock;
3989
3990 usage = mem_cgroup_usage(memcg, swap);
3991
3992
3993
3994
3995
3996
3997 i = t->current_threshold;
3998
3999
4000
4001
4002
4003
4004
4005 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4006 eventfd_signal(t->entries[i].eventfd, 1);
4007
4008
4009 i++;
4010
4011
4012
4013
4014
4015
4016
4017 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4018 eventfd_signal(t->entries[i].eventfd, 1);
4019
4020
4021 t->current_threshold = i - 1;
4022unlock:
4023 rcu_read_unlock();
4024}
4025
4026static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4027{
4028 while (memcg) {
4029 __mem_cgroup_threshold(memcg, false);
4030 if (do_memsw_account())
4031 __mem_cgroup_threshold(memcg, true);
4032
4033 memcg = parent_mem_cgroup(memcg);
4034 }
4035}
4036
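/* sort() callback: order threshold entries by ascending threshold. */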
4037static int compare_thresholds(const void *a, const void *b)
4038{
4039 const struct mem_cgroup_threshold *_a = a;
4040 const struct mem_cgroup_threshold *_b = b;
4041
4042 if (_a->threshold > _b->threshold)
4043 return 1;
4044
4045 if (_a->threshold < _b->threshold)
4046 return -1;
4047
4048 return 0;
4049}
4050
4051static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4052{
4053 struct mem_cgroup_eventfd_list *ev;
4054
4055 spin_lock(&memcg_oom_lock);
4056
4057 list_for_each_entry(ev, &memcg->oom_notify, list)
4058 eventfd_signal(ev->eventfd, 1);
4059
4060 spin_unlock(&memcg_oom_lock);
4061 return 0;
4062}
4063
4064static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4065{
4066 struct mem_cgroup *iter;
4067
4068 for_each_mem_cgroup_tree(iter, memcg)
4069 mem_cgroup_oom_notify_cb(iter);
4070}
4071
4072static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4073 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4074{
4075 struct mem_cgroup_thresholds *thresholds;
4076 struct mem_cgroup_threshold_ary *new;
4077 unsigned long threshold;
4078 unsigned long usage;
4079 int i, size, ret;
4080
4081 ret = page_counter_memparse(args, "-1", &threshold);
4082 if (ret)
4083 return ret;
4084
4085 mutex_lock(&memcg->thresholds_lock);
4086
4087 if (type == _MEM) {
4088 thresholds = &memcg->thresholds;
4089 usage = mem_cgroup_usage(memcg, false);
4090 } else if (type == _MEMSWAP) {
4091 thresholds = &memcg->memsw_thresholds;
4092 usage = mem_cgroup_usage(memcg, true);
4093 } else
4094 BUG();
4095
4096
4097 if (thresholds->primary)
4098 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4099
4100 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4101
4102
4103 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4104 if (!new) {
4105 ret = -ENOMEM;
4106 goto unlock;
4107 }
4108 new->size = size;
4109
4110
4111 if (thresholds->primary) {
4112 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4113 sizeof(struct mem_cgroup_threshold));
4114 }
4115
4116
4117 new->entries[size - 1].eventfd = eventfd;
4118 new->entries[size - 1].threshold = threshold;
4119
4120
4121 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4122 compare_thresholds, NULL);
4123
4124
4125 new->current_threshold = -1;
4126 for (i = 0; i < size; i++) {
4127 if (new->entries[i].threshold <= usage) {
4128
4129
4130
4131
4132
4133 ++new->current_threshold;
4134 } else
4135 break;
4136 }
4137
4138
4139 kfree(thresholds->spare);
4140 thresholds->spare = thresholds->primary;
4141
4142 rcu_assign_pointer(thresholds->primary, new);
4143
4144
4145 synchronize_rcu();
4146
4147unlock:
4148 mutex_unlock(&memcg->thresholds_lock);
4149
4150 return ret;
4151}
4152
4153static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4154 struct eventfd_ctx *eventfd, const char *args)
4155{
4156 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4157}
4158
4159static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4160 struct eventfd_ctx *eventfd, const char *args)
4161{
4162 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4163}
4164
4165static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4166 struct eventfd_ctx *eventfd, enum res_type type)
4167{
4168 struct mem_cgroup_thresholds *thresholds;
4169 struct mem_cgroup_threshold_ary *new;
4170 unsigned long usage;
4171 int i, j, size;
4172
4173 mutex_lock(&memcg->thresholds_lock);
4174
4175 if (type == _MEM) {
4176 thresholds = &memcg->thresholds;
4177 usage = mem_cgroup_usage(memcg, false);
4178 } else if (type == _MEMSWAP) {
4179 thresholds = &memcg->memsw_thresholds;
4180 usage = mem_cgroup_usage(memcg, true);
4181 } else
4182 BUG();
4183
4184 if (!thresholds->primary)
4185 goto unlock;
4186
4187
4188 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4189
4190
4191 size = 0;
4192 for (i = 0; i < thresholds->primary->size; i++) {
4193 if (thresholds->primary->entries[i].eventfd != eventfd)
4194 size++;
4195 }
4196
4197 new = thresholds->spare;
4198
4199
4200 if (!size) {
4201 kfree(new);
4202 new = NULL;
4203 goto swap_buffers;
4204 }
4205
4206 new->size = size;
4207
4208
4209 new->current_threshold = -1;
4210 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4211 if (thresholds->primary->entries[i].eventfd == eventfd)
4212 continue;
4213
4214 new->entries[j] = thresholds->primary->entries[i];
4215 if (new->entries[j].threshold <= usage) {
4216
4217
4218
4219
4220
4221 ++new->current_threshold;
4222 }
4223 j++;
4224 }
4225
4226swap_buffers:
4227
4228 thresholds->spare = thresholds->primary;
4229
4230 rcu_assign_pointer(thresholds->primary, new);
4231
4232
4233 synchronize_rcu();
4234
4235
4236 if (!new) {
4237 kfree(thresholds->spare);
4238 thresholds->spare = NULL;
4239 }
4240unlock:
4241 mutex_unlock(&memcg->thresholds_lock);
4242}
4243
4244static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4245 struct eventfd_ctx *eventfd)
4246{
4247 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4248}
4249
4250static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4251 struct eventfd_ctx *eventfd)
4252{
4253 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4254}
4255
4256static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4257 struct eventfd_ctx *eventfd, const char *args)
4258{
4259 struct mem_cgroup_eventfd_list *event;
4260
4261 event = kmalloc(sizeof(*event), GFP_KERNEL);
4262 if (!event)
4263 return -ENOMEM;
4264
4265 spin_lock(&memcg_oom_lock);
4266
4267 event->eventfd = eventfd;
4268 list_add(&event->list, &memcg->oom_notify);
4269
4270
4271 if (memcg->under_oom)
4272 eventfd_signal(eventfd, 1);
4273 spin_unlock(&memcg_oom_lock);
4274
4275 return 0;
4276}
4277
4278static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4279 struct eventfd_ctx *eventfd)
4280{
4281 struct mem_cgroup_eventfd_list *ev, *tmp;
4282
4283 spin_lock(&memcg_oom_lock);
4284
4285 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4286 if (ev->eventfd == eventfd) {
4287 list_del(&ev->list);
4288 kfree(ev);
4289 }
4290 }
4291
4292 spin_unlock(&memcg_oom_lock);
4293}
4294
4295static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4296{
4297 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4298
4299 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4300 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4301 seq_printf(sf, "oom_kill %lu\n",
4302 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4303 return 0;
4304}
4305
4306static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4307 struct cftype *cft, u64 val)
4308{
4309 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4310
4311
4312 if (!css->parent || !((val == 0) || (val == 1)))
4313 return -EINVAL;
4314
4315 memcg->oom_kill_disable = val;
4316 if (!val)
4317 memcg_oom_recover(memcg);
4318
4319 return 0;
4320}
4321
4322#ifdef CONFIG_CGROUP_WRITEBACK
4323
4324#include <trace/events/writeback.h>
4325
4326static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4327{
4328 return wb_domain_init(&memcg->cgwb_domain, gfp);
4329}
4330
4331static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4332{
4333 wb_domain_exit(&memcg->cgwb_domain);
4334}
4335
4336static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4337{
4338 wb_domain_size_changed(&memcg->cgwb_domain);
4339}
4340
4341struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4342{
4343 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4344
4345 if (!memcg->css.parent)
4346 return NULL;
4347
4348 return &memcg->cgwb_domain;
4349}
4350
4351
4352
4353
4354
4355static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4356{
4357 long x = atomic_long_read(&memcg->vmstats[idx]);
4358 int cpu;
4359
4360 for_each_online_cpu(cpu)
4361 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4362 if (x < 0)
4363 x = 0;
4364 return x;
4365}
4366
/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors.  Note that this doesn't consider the actual amount of
 * available memory in the system.  The caller should further cap
 * *@pheadroom accordingly.
 */
4385void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4386 unsigned long *pheadroom, unsigned long *pdirty,
4387 unsigned long *pwriteback)
4388{
4389 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4390 struct mem_cgroup *parent;
4391
4392 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4393
4394
4395 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4396 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4397 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4398 *pheadroom = PAGE_COUNTER_MAX;
4399
4400 while ((parent = parent_mem_cgroup(memcg))) {
4401 unsigned long ceiling = min(memcg->memory.max, memcg->high);
4402 unsigned long used = page_counter_read(&memcg->memory);
4403
4404 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4405 memcg = parent;
4406 }
4407}
4408
/*
 * Foreign dirty flushing
 *
 * There's an inherent mismatch between memcg and writeback.  The former
 * tracks ownership per-page while the latter per-inode.  This is a
 * deliberate design decision because honoring per-page ownership in the
 * writeback path is complicated, may lead to higher CPU and IO overheads
 * and is deemed unnecessary given that write-sharing an inode across
 * different cgroups isn't a common use-case.
 *
 * Combined with inode majority-writer ownership switching, this works well
 * enough in most cases but there are some pathological cases.  For example,
 * a cgroup may keep dirtying pages of an inode owned by another cgroup
 * whose dirty limits are much higher, and end up being throttled without
 * any writeback being issued on its behalf.
 *
 * To mitigate this, each memcg remembers a small number of recently seen
 * foreign (bdi, memcg) writeback domains in cgwb_frn[]; from
 * mem_cgroup_flush_foreign() it then asks those foreign domains to write
 * back, so that the offending dirty pages are cleaned in a timely manner.
 */
4453void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4454 struct bdi_writeback *wb)
4455{
4456 struct mem_cgroup *memcg = page->mem_cgroup;
4457 struct memcg_cgwb_frn *frn;
4458 u64 now = get_jiffies_64();
4459 u64 oldest_at = now;
4460 int oldest = -1;
4461 int i;
4462
4463 trace_track_foreign_dirty(page, wb);
4464
	/*
	 * Pick the slot to use.  If there is already a slot for @wb, keep
	 * using it.  If not replace the oldest one which isn't being
	 * written out.
	 */
4470 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4471 frn = &memcg->cgwb_frn[i];
4472 if (frn->bdi_id == wb->bdi->id &&
4473 frn->memcg_id == wb->memcg_css->id)
4474 break;
4475 if (time_before64(frn->at, oldest_at) &&
4476 atomic_read(&frn->done.cnt) == 1) {
4477 oldest = i;
4478 oldest_at = frn->at;
4479 }
4480 }
4481
4482 if (i < MEMCG_CGWB_FRN_CNT) {
4483
4484
4485
4486
4487
4488
4489
4490 unsigned long update_intv =
4491 min_t(unsigned long, HZ,
4492 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4493
4494 if (time_before64(frn->at, now - update_intv))
4495 frn->at = now;
4496 } else if (oldest >= 0) {
4497
4498 frn = &memcg->cgwb_frn[oldest];
4499 frn->bdi_id = wb->bdi->id;
4500 frn->memcg_id = wb->memcg_css->id;
4501 frn->at = now;
4502 }
4503}
4504
4505
4506void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4507{
4508 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4509 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4510 u64 now = jiffies_64;
4511 int i;
4512
4513 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4514 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4515
		/*
		 * If the record is older than dirty_expire_interval,
		 * writeback on it has already started.  No need to kick it
		 * off again.  Also, don't start a new one if there's
		 * already one in flight.
		 */
4522 if (time_after64(frn->at, now - intv) &&
4523 atomic_read(&frn->done.cnt) == 1) {
4524 frn->at = 0;
4525 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4526 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4527 WB_REASON_FOREIGN_FLUSH,
4528 &frn->done);
4529 }
4530 }
4531}
4532
4533#else
4534
4535static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4536{
4537 return 0;
4538}
4539
4540static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4541{
4542}
4543
4544static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4545{
4546}
4547
4548#endif
4549
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
4568static void memcg_event_remove(struct work_struct *work)
4569{
4570 struct mem_cgroup_event *event =
4571 container_of(work, struct mem_cgroup_event, remove);
4572 struct mem_cgroup *memcg = event->memcg;
4573
4574 remove_wait_queue(event->wqh, &event->wait);
4575
4576 event->unregister_event(memcg, event->eventfd);
4577
4578
4579 eventfd_signal(event->eventfd, 1);
4580
4581 eventfd_ctx_put(event->eventfd);
4582 kfree(event);
4583 css_put(&memcg->css);
4584}
4585
/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
4591static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4592 int sync, void *key)
4593{
4594 struct mem_cgroup_event *event =
4595 container_of(wait, struct mem_cgroup_event, wait);
4596 struct mem_cgroup *memcg = event->memcg;
4597 __poll_t flags = key_to_poll(key);
4598
4599 if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
4609 spin_lock(&memcg->event_list_lock);
4610 if (!list_empty(&event->list)) {
4611 list_del_init(&event->list);
4612
4613
4614
4615
4616 schedule_work(&event->remove);
4617 }
4618 spin_unlock(&memcg->event_list_lock);
4619 }
4620
4621 return 0;
4622}
4623
4624static void memcg_event_ptable_queue_proc(struct file *file,
4625 wait_queue_head_t *wqh, poll_table *pt)
4626{
4627 struct mem_cgroup_event *event =
4628 container_of(pt, struct mem_cgroup_event, pt);
4629
4630 event->wqh = wqh;
4631 add_wait_queue(wqh, &event->wait);
4632}
4633
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
4642static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4643 char *buf, size_t nbytes, loff_t off)
4644{
4645 struct cgroup_subsys_state *css = of_css(of);
4646 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4647 struct mem_cgroup_event *event;
4648 struct cgroup_subsys_state *cfile_css;
4649 unsigned int efd, cfd;
4650 struct fd efile;
4651 struct fd cfile;
4652 const char *name;
4653 char *endp;
4654 int ret;
4655
4656 buf = strstrip(buf);
4657
4658 efd = simple_strtoul(buf, &endp, 10);
4659 if (*endp != ' ')
4660 return -EINVAL;
4661 buf = endp + 1;
4662
4663 cfd = simple_strtoul(buf, &endp, 10);
4664 if ((*endp != ' ') && (*endp != '\0'))
4665 return -EINVAL;
4666 buf = endp + 1;
4667
4668 event = kzalloc(sizeof(*event), GFP_KERNEL);
4669 if (!event)
4670 return -ENOMEM;
4671
4672 event->memcg = memcg;
4673 INIT_LIST_HEAD(&event->list);
4674 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4675 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4676 INIT_WORK(&event->remove, memcg_event_remove);
4677
4678 efile = fdget(efd);
4679 if (!efile.file) {
4680 ret = -EBADF;
4681 goto out_kfree;
4682 }
4683
4684 event->eventfd = eventfd_ctx_fileget(efile.file);
4685 if (IS_ERR(event->eventfd)) {
4686 ret = PTR_ERR(event->eventfd);
4687 goto out_put_efile;
4688 }
4689
4690 cfile = fdget(cfd);
4691 if (!cfile.file) {
4692 ret = -EBADF;
4693 goto out_put_eventfd;
4694 }
4695
4696
4697
4698 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4699 if (ret < 0)
4700 goto out_put_cfile;
4701
	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
4710 name = cfile.file->f_path.dentry->d_name.name;
4711
4712 if (!strcmp(name, "memory.usage_in_bytes")) {
4713 event->register_event = mem_cgroup_usage_register_event;
4714 event->unregister_event = mem_cgroup_usage_unregister_event;
4715 } else if (!strcmp(name, "memory.oom_control")) {
4716 event->register_event = mem_cgroup_oom_register_event;
4717 event->unregister_event = mem_cgroup_oom_unregister_event;
4718 } else if (!strcmp(name, "memory.pressure_level")) {
4719 event->register_event = vmpressure_register_event;
4720 event->unregister_event = vmpressure_unregister_event;
4721 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4722 event->register_event = memsw_cgroup_usage_register_event;
4723 event->unregister_event = memsw_cgroup_usage_unregister_event;
4724 } else {
4725 ret = -EINVAL;
4726 goto out_put_cfile;
4727 }
4728
4729
4730
4731
4732
4733
4734 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4735 &memory_cgrp_subsys);
4736 ret = -EINVAL;
4737 if (IS_ERR(cfile_css))
4738 goto out_put_cfile;
4739 if (cfile_css != css) {
4740 css_put(cfile_css);
4741 goto out_put_cfile;
4742 }
4743
4744 ret = event->register_event(memcg, event->eventfd, buf);
4745 if (ret)
4746 goto out_put_css;
4747
4748 vfs_poll(efile.file, &event->pt);
4749
4750 spin_lock(&memcg->event_list_lock);
4751 list_add(&event->list, &memcg->event_list);
4752 spin_unlock(&memcg->event_list_lock);
4753
4754 fdput(cfile);
4755 fdput(efile);
4756
4757 return nbytes;
4758
4759out_put_css:
4760 css_put(css);
4761out_put_cfile:
4762 fdput(cfile);
4763out_put_eventfd:
4764 eventfd_ctx_put(event->eventfd);
4765out_put_efile:
4766 fdput(efile);
4767out_kfree:
4768 kfree(event);
4769
4770 return ret;
4771}
4772
4773static struct cftype mem_cgroup_legacy_files[] = {
4774 {
4775 .name = "usage_in_bytes",
4776 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4777 .read_u64 = mem_cgroup_read_u64,
4778 },
4779 {
4780 .name = "max_usage_in_bytes",
4781 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4782 .write = mem_cgroup_reset,
4783 .read_u64 = mem_cgroup_read_u64,
4784 },
4785 {
4786 .name = "limit_in_bytes",
4787 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4788 .write = mem_cgroup_write,
4789 .read_u64 = mem_cgroup_read_u64,
4790 },
4791 {
4792 .name = "soft_limit_in_bytes",
4793 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4794 .write = mem_cgroup_write,
4795 .read_u64 = mem_cgroup_read_u64,
4796 },
4797 {
4798 .name = "failcnt",
4799 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4800 .write = mem_cgroup_reset,
4801 .read_u64 = mem_cgroup_read_u64,
4802 },
4803 {
4804 .name = "stat",
4805 .seq_show = memcg_stat_show,
4806 },
4807 {
4808 .name = "force_empty",
4809 .write = mem_cgroup_force_empty_write,
4810 },
4811 {
4812 .name = "use_hierarchy",
4813 .write_u64 = mem_cgroup_hierarchy_write,
4814 .read_u64 = mem_cgroup_hierarchy_read,
4815 },
4816 {
4817 .name = "cgroup.event_control",
4818 .write = memcg_write_event_control,
4819 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4820 },
4821 {
4822 .name = "swappiness",
4823 .read_u64 = mem_cgroup_swappiness_read,
4824 .write_u64 = mem_cgroup_swappiness_write,
4825 },
4826 {
4827 .name = "move_charge_at_immigrate",
4828 .read_u64 = mem_cgroup_move_charge_read,
4829 .write_u64 = mem_cgroup_move_charge_write,
4830 },
4831 {
4832 .name = "oom_control",
4833 .seq_show = mem_cgroup_oom_control_read,
4834 .write_u64 = mem_cgroup_oom_control_write,
4835 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4836 },
4837 {
4838 .name = "pressure_level",
4839 },
4840#ifdef CONFIG_NUMA
4841 {
4842 .name = "numa_stat",
4843 .seq_show = memcg_numa_stat_show,
4844 },
4845#endif
4846 {
4847 .name = "kmem.limit_in_bytes",
4848 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4849 .write = mem_cgroup_write,
4850 .read_u64 = mem_cgroup_read_u64,
4851 },
4852 {
4853 .name = "kmem.usage_in_bytes",
4854 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4855 .read_u64 = mem_cgroup_read_u64,
4856 },
4857 {
4858 .name = "kmem.failcnt",
4859 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4860 .write = mem_cgroup_reset,
4861 .read_u64 = mem_cgroup_read_u64,
4862 },
4863 {
4864 .name = "kmem.max_usage_in_bytes",
4865 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4866 .write = mem_cgroup_reset,
4867 .read_u64 = mem_cgroup_read_u64,
4868 },
4869#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4870 {
4871 .name = "kmem.slabinfo",
4872 .seq_start = memcg_slab_start,
4873 .seq_next = memcg_slab_next,
4874 .seq_stop = memcg_slab_stop,
4875 .seq_show = memcg_slab_show,
4876 },
4877#endif
4878 {
4879 .name = "kmem.tcp.limit_in_bytes",
4880 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4881 .write = mem_cgroup_write,
4882 .read_u64 = mem_cgroup_read_u64,
4883 },
4884 {
4885 .name = "kmem.tcp.usage_in_bytes",
4886 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4887 .read_u64 = mem_cgroup_read_u64,
4888 },
4889 {
4890 .name = "kmem.tcp.failcnt",
4891 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4892 .write = mem_cgroup_reset,
4893 .read_u64 = mem_cgroup_read_u64,
4894 },
4895 {
4896 .name = "kmem.tcp.max_usage_in_bytes",
4897 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4898 .write = mem_cgroup_reset,
4899 .read_u64 = mem_cgroup_read_u64,
4900 },
4901 { },
4902};
4903
/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID.  We want to keep
 * those dead CSS from occupying IDs, or we might quickly exhaust the
 * relatively small ID space and prevent the creation of new cgroups
 * even when there are much fewer than 64k cgroups - possibly none.
 *
 * Maintain a private 16-bit ID space for memcg, and allow the ID to
 * be freed and recycled when it's no longer needed, which is usually
 * when the CSS is offlined.
 *
 * The only exception to that are records of swapped out tmpfs pages
 * that need to be attributed to live ancestors on swapin.  But those
 * references are manageable from userspace.
 */
4928static DEFINE_IDR(mem_cgroup_idr);
4929
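/* Release @memcg's slot in the private ID space. */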
4930static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4931{
4932 if (memcg->id.id > 0) {
4933 idr_remove(&mem_cgroup_idr, memcg->id.id);
4934 memcg->id.id = 0;
4935 }
4936}
4937
4938static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4939{
4940 refcount_add(n, &memcg->id.ref);
4941}
4942
4943static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4944{
4945 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4946 mem_cgroup_id_remove(memcg);
4947
4948
4949 css_put(&memcg->css);
4950 }
4951}
4952
4953static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4954{
4955 mem_cgroup_id_put_many(memcg, 1);
4956}
4957
/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock().
 */
4964struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4965{
4966 WARN_ON_ONCE(!rcu_read_lock_held());
4967 return idr_find(&mem_cgroup_idr, id);
4968}
4969
4970static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4971{
4972 struct mem_cgroup_per_node *pn;
4973 int tmp = node;
4974
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined.  It's better to use memory hotplug callback
	 *       function.
	 */
4982 if (!node_state(node, N_NORMAL_MEMORY))
4983 tmp = -1;
4984 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4985 if (!pn)
4986 return 1;
4987
4988 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4989 if (!pn->lruvec_stat_local) {
4990 kfree(pn);
4991 return 1;
4992 }
4993
4994 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4995 if (!pn->lruvec_stat_cpu) {
4996 free_percpu(pn->lruvec_stat_local);
4997 kfree(pn);
4998 return 1;
4999 }
5000
5001 lruvec_init(&pn->lruvec);
5002 pn->usage_in_excess = 0;
5003 pn->on_tree = false;
5004 pn->memcg = memcg;
5005
5006 memcg->nodeinfo[node] = pn;
5007 return 0;
5008}
5009
5010static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5011{
5012 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5013
5014 if (!pn)
5015 return;
5016
5017 free_percpu(pn->lruvec_stat_cpu);
5018 free_percpu(pn->lruvec_stat_local);
5019 kfree(pn);
5020}
5021
5022static void __mem_cgroup_free(struct mem_cgroup *memcg)
5023{
5024 int node;
5025
5026 for_each_node(node)
5027 free_mem_cgroup_per_node_info(memcg, node);
5028 free_percpu(memcg->vmstats_percpu);
5029 free_percpu(memcg->vmstats_local);
5030 kfree(memcg);
5031}
5032
5033static void mem_cgroup_free(struct mem_cgroup *memcg)
5034{
5035 memcg_wb_domain_exit(memcg);
	/*
	 * Flush percpu vmstats and vmevents to guarantee the value correctness
	 * on parent's and all ancestor levels.
	 */
5040 memcg_flush_percpu_vmstats(memcg, false);
5041 memcg_flush_percpu_vmevents(memcg);
5042 __mem_cgroup_free(memcg);
5043}
5044
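/*
 * Allocate a mem_cgroup together with its per-node info, percpu statistics
 * and a slot in the private ID space.  The ID is only published (via
 * idr_replace()) once the structure is fully initialized.
 */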
5045static struct mem_cgroup *mem_cgroup_alloc(void)
5046{
5047 struct mem_cgroup *memcg;
5048 unsigned int size;
5049 int node;
5050 int __maybe_unused i;
5051
5052 size = sizeof(struct mem_cgroup);
5053 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5054
5055 memcg = kzalloc(size, GFP_KERNEL);
5056 if (!memcg)
5057 return NULL;
5058
5059 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5060 1, MEM_CGROUP_ID_MAX,
5061 GFP_KERNEL);
5062 if (memcg->id.id < 0)
5063 goto fail;
5064
5065 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
5066 if (!memcg->vmstats_local)
5067 goto fail;
5068
5069 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
5070 if (!memcg->vmstats_percpu)
5071 goto fail;
5072
5073 for_each_node(node)
5074 if (alloc_mem_cgroup_per_node_info(memcg, node))
5075 goto fail;
5076
5077 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5078 goto fail;
5079
5080 INIT_WORK(&memcg->high_work, high_work_func);
5081 memcg->last_scanned_node = MAX_NUMNODES;
5082 INIT_LIST_HEAD(&memcg->oom_notify);
5083 mutex_init(&memcg->thresholds_lock);
5084 spin_lock_init(&memcg->move_lock);
5085 vmpressure_init(&memcg->vmpressure);
5086 INIT_LIST_HEAD(&memcg->event_list);
5087 spin_lock_init(&memcg->event_list_lock);
5088 memcg->socket_pressure = jiffies;
5089#ifdef CONFIG_MEMCG_KMEM
5090 memcg->kmemcg_id = -1;
5091#endif
5092#ifdef CONFIG_CGROUP_WRITEBACK
5093 INIT_LIST_HEAD(&memcg->cgwb_list);
5094 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5095 memcg->cgwb_frn[i].done =
5096 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5097#endif
5098#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5099 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5100 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5101 memcg->deferred_split_queue.split_queue_len = 0;
5102#endif
5103 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5104 return memcg;
5105fail:
5106 mem_cgroup_id_remove(memcg);
5107 __mem_cgroup_free(memcg);
5108 return NULL;
5109}
5110
5111static struct cgroup_subsys_state * __ref
5112mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5113{
5114 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5115 struct mem_cgroup *memcg;
5116 long error = -ENOMEM;
5117
5118 memcg = mem_cgroup_alloc();
5119 if (!memcg)
5120 return ERR_PTR(error);
5121
5122 memcg->high = PAGE_COUNTER_MAX;
5123 memcg->soft_limit = PAGE_COUNTER_MAX;
5124 if (parent) {
5125 memcg->swappiness = mem_cgroup_swappiness(parent);
5126 memcg->oom_kill_disable = parent->oom_kill_disable;
5127 }
5128 if (parent && parent->use_hierarchy) {
5129 memcg->use_hierarchy = true;
5130 page_counter_init(&memcg->memory, &parent->memory);
5131 page_counter_init(&memcg->swap, &parent->swap);
5132 page_counter_init(&memcg->memsw, &parent->memsw);
5133 page_counter_init(&memcg->kmem, &parent->kmem);
5134 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5135 } else {
5136 page_counter_init(&memcg->memory, NULL);
5137 page_counter_init(&memcg->swap, NULL);
5138 page_counter_init(&memcg->memsw, NULL);
5139 page_counter_init(&memcg->kmem, NULL);
5140 page_counter_init(&memcg->tcpmem, NULL);
		/*
		 * A deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense, so let the cgroup core know about this
		 * unfortunate state in our controller.
		 */
5146 if (parent != root_mem_cgroup)
5147 memory_cgrp_subsys.broken_hierarchy = true;
5148 }
5149
5150
5151 if (!parent) {
5152#ifdef CONFIG_MEMCG_KMEM
5153 INIT_LIST_HEAD(&memcg->kmem_caches);
5154#endif
5155 root_mem_cgroup = memcg;
5156 return &memcg->css;
5157 }
5158
5159 error = memcg_online_kmem(memcg);
5160 if (error)
5161 goto fail;
5162
5163 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5164 static_branch_inc(&memcg_sockets_enabled_key);
5165
5166 return &memcg->css;
5167fail:
5168 mem_cgroup_id_remove(memcg);
5169 mem_cgroup_free(memcg);
5170 return ERR_PTR(-ENOMEM);
5171}
5172
5173static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5174{
5175 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5176
5177
5178
5179
5180
5181
5182 if (memcg_alloc_shrinker_maps(memcg)) {
5183 mem_cgroup_id_remove(memcg);
5184 return -ENOMEM;
5185 }
5186
5187
5188 refcount_set(&memcg->id.ref, 1);
5189 css_get(css);
5190 return 0;
5191}
5192
5193static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5194{
5195 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5196 struct mem_cgroup_event *event, *tmp;
5197
	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
5203 spin_lock(&memcg->event_list_lock);
5204 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5205 list_del_init(&event->list);
5206 schedule_work(&event->remove);
5207 }
5208 spin_unlock(&memcg->event_list_lock);
5209
5210 page_counter_set_min(&memcg->memory, 0);
5211 page_counter_set_low(&memcg->memory, 0);
5212
5213 memcg_offline_kmem(memcg);
5214 wb_memcg_offline(memcg);
5215
5216 drain_all_stock(memcg);
5217
5218 mem_cgroup_id_put(memcg);
5219}
5220
5221static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5222{
5223 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5224
5225 invalidate_reclaim_iterators(memcg);
5226}
5227
5228static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5229{
5230 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5231 int __maybe_unused i;
5232
5233#ifdef CONFIG_CGROUP_WRITEBACK
5234 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5235 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5236#endif
5237 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5238 static_branch_dec(&memcg_sockets_enabled_key);
5239
5240 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5241 static_branch_dec(&memcg_sockets_enabled_key);
5242
5243 vmpressure_cleanup(&memcg->vmpressure);
5244 cancel_work_sync(&memcg->high_work);
5245 mem_cgroup_remove_from_trees(memcg);
5246 memcg_free_shrinker_maps(memcg);
5247 memcg_free_kmem(memcg);
5248 mem_cgroup_free(memcg);
5249}
5250
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
5264static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5265{
5266 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5267
5268 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5269 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5270 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5271 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5272 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5273 page_counter_set_min(&memcg->memory, 0);
5274 page_counter_set_low(&memcg->memory, 0);
5275 memcg->high = PAGE_COUNTER_MAX;
5276 memcg->soft_limit = PAGE_COUNTER_MAX;
5277 memcg_wb_domain_size_changed(memcg);
5278}
5279
5280#ifdef CONFIG_MMU
5281
5282static int mem_cgroup_do_precharge(unsigned long count)
5283{
5284 int ret;
	/* Try a single bulk charge without reclaim first, kswapd may wake */
5287 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5288 if (!ret) {
5289 mc.precharge += count;
5290 return ret;
5291 }
5292
	/* Try charges one by one with reclaim, but do not retry */
5294 while (count--) {
5295 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5296 if (ret)
5297 return ret;
5298 mc.precharge++;
5299 cond_resched();
5300 }
5301 return 0;
5302}
5303
5304union mc_target {
5305 struct page *page;
5306 swp_entry_t ent;
5307};
5308
5309enum mc_target_type {
5310 MC_TARGET_NONE = 0,
5311 MC_TARGET_PAGE,
5312 MC_TARGET_SWAP,
5313 MC_TARGET_DEVICE,
5314};
5315
5316static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5317 unsigned long addr, pte_t ptent)
5318{
5319 struct page *page = vm_normal_page(vma, addr, ptent);
5320
5321 if (!page || !page_mapped(page))
5322 return NULL;
5323 if (PageAnon(page)) {
5324 if (!(mc.flags & MOVE_ANON))
5325 return NULL;
5326 } else {
5327 if (!(mc.flags & MOVE_FILE))
5328 return NULL;
5329 }
5330 if (!get_page_unless_zero(page))
5331 return NULL;
5332
5333 return page;
5334}
5335
5336#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5337static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5338 pte_t ptent, swp_entry_t *entry)
5339{
5340 struct page *page = NULL;
5341 swp_entry_t ent = pte_to_swp_entry(ptent);
5342
5343 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5344 return NULL;
5345
	/*
	 * Handle device-private entries: MEMORY_DEVICE_PRIVATE means a
	 * ZONE_DEVICE page belonging to a device; because such pages are
	 * not accessible by the CPU they are stored as special swap
	 * entry ptes.
	 */
5351 if (is_device_private_entry(ent)) {
5352 page = device_private_entry_to_page(ent);
5353
5354
5355
5356
5357 if (!page_ref_add_unless(page, 1, 1))
5358 return NULL;
5359 return page;
5360 }
5361
5362
5363
5364
5365
5366 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5367 if (do_memsw_account())
5368 entry->val = ent.val;
5369
5370 return page;
5371}
5372#else
5373static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5374 pte_t ptent, swp_entry_t *entry)
5375{
5376 return NULL;
5377}
5378#endif
5379
5380static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5381 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5382{
5383 struct page *page = NULL;
5384 struct address_space *mapping;
5385 pgoff_t pgoff;
5386
5387 if (!vma->vm_file)
5388 return NULL;
5389 if (!(mc.flags & MOVE_FILE))
5390 return NULL;
5391
5392 mapping = vma->vm_file->f_mapping;
5393 pgoff = linear_page_index(vma, addr);
5394
	/* page is moved even if it's not RSS of this task(page-faulted). */
#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
5398 if (shmem_mapping(mapping)) {
5399 page = find_get_entry(mapping, pgoff);
5400 if (xa_is_value(page)) {
5401 swp_entry_t swp = radix_to_swp_entry(page);
5402 if (do_memsw_account())
5403 *entry = swp;
5404 page = find_get_page(swap_address_space(swp),
5405 swp_offset(swp));
5406 }
5407 } else
5408 page = find_get_page(mapping, pgoff);
5409#else
5410 page = find_get_page(mapping, pgoff);
5411#endif
5412 return page;
5413}
5414
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on LRU (isolate_lru_page()
 * is useful when the page is not on LRU).
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */
5427static int mem_cgroup_move_account(struct page *page,
5428 bool compound,
5429 struct mem_cgroup *from,
5430 struct mem_cgroup *to)
5431{
5432 struct lruvec *from_vec, *to_vec;
5433 struct pglist_data *pgdat;
5434 unsigned long flags;
5435 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5436 int ret;
5437 bool anon;
5438
5439 VM_BUG_ON(from == to);
5440 VM_BUG_ON_PAGE(PageLRU(page), page);
5441 VM_BUG_ON(compound && !PageTransHuge(page));
5442
	/*
	 * Prevent mem_cgroup_migrate() from looking at
	 * page->mem_cgroup of its source page while we change it.
	 */
5447 ret = -EBUSY;
5448 if (!trylock_page(page))
5449 goto out;
5450
5451 ret = -EINVAL;
5452 if (page->mem_cgroup != from)
5453 goto out_unlock;
5454
5455 anon = PageAnon(page);
5456
5457 pgdat = page_pgdat(page);
5458 from_vec = mem_cgroup_lruvec(pgdat, from);
5459 to_vec = mem_cgroup_lruvec(pgdat, to);
5460
5461 spin_lock_irqsave(&from->move_lock, flags);
5462
5463 if (!anon && page_mapped(page)) {
5464 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5465 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5466 }
5467
5468
5469
5470
5471
5472
5473 if (!anon && PageDirty(page)) {
5474 struct address_space *mapping = page_mapping(page);
5475
5476 if (mapping_cap_account_dirty(mapping)) {
5477 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
5478 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
5479 }
5480 }
5481
5482 if (PageWriteback(page)) {
5483 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5484 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5485 }
5486
5487#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5488 if (compound && !list_empty(page_deferred_list(page))) {
5489 spin_lock(&from->deferred_split_queue.split_queue_lock);
5490 list_del_init(page_deferred_list(page));
5491 from->deferred_split_queue.split_queue_len--;
5492 spin_unlock(&from->deferred_split_queue.split_queue_lock);
5493 }
5494#endif
5495
	/*
	 * All state has been migrated, let's switch to the new memcg.
	 *
	 * It is safe to change page->mem_cgroup here because the page
	 * is referenced, charged, and isolated - we can't race with
	 * uncharging, charging, migration, or LRU putback.
	 */
5502 page->mem_cgroup = to;
5503
5504#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5505 if (compound && list_empty(page_deferred_list(page))) {
5506 spin_lock(&to->deferred_split_queue.split_queue_lock);
5507 list_add_tail(page_deferred_list(page),
5508 &to->deferred_split_queue.split_queue);
5509 to->deferred_split_queue.split_queue_len++;
5510 spin_unlock(&to->deferred_split_queue.split_queue_lock);
5511 }
5512#endif
5513
5514 spin_unlock_irqrestore(&from->move_lock, flags);
5515
5516 ret = 0;
5517
5518 local_irq_disable();
5519 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5520 memcg_check_events(to, page);
5521 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5522 memcg_check_events(from, page);
5523 local_irq_enable();
5524out_unlock:
5525 unlock_page(page);
5526out:
5527 return ret;
5528}
5529
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored (can be NULL)
 *
 * Returns
 *   MC_TARGET_NONE: if the pte is not a target for move charge.
 *   MC_TARGET_PAGE: if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers must handle it).
 *   MC_TARGET_SWAP: if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *   MC_TARGET_DEVICE: like MC_TARGET_PAGE but the page is
 *     MEMORY_DEVICE_PRIVATE (a ZONE_DEVICE page, thus not on the LRU);
 *     such a page is charged like a regular page would be, as for all
 *     intents and purposes it is just special memory taking the place
 *     of a regular page.
 *
 * Called with pte lock held.
 */
5556static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5557 unsigned long addr, pte_t ptent, union mc_target *target)
5558{
5559 struct page *page = NULL;
5560 enum mc_target_type ret = MC_TARGET_NONE;
5561 swp_entry_t ent = { .val = 0 };
5562
5563 if (pte_present(ptent))
5564 page = mc_handle_present_pte(vma, addr, ptent);
5565 else if (is_swap_pte(ptent))
5566 page = mc_handle_swap_pte(vma, ptent, &ent);
5567 else if (pte_none(ptent))
5568 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5569
5570 if (!page && !ent.val)
5571 return ret;
5572 if (page) {
		/*
		 * Do only a loose check w/o serialization.
		 * mem_cgroup_move_account() checks whether the page is
		 * valid under LRU exclusion.
		 */
5578 if (page->mem_cgroup == mc.from) {
5579 ret = MC_TARGET_PAGE;
5580 if (is_device_private_page(page))
5581 ret = MC_TARGET_DEVICE;
5582 if (target)
5583 target->page = page;
5584 }
5585 if (!ret || !target)
5586 put_page(page);
5587 }
5588
5589
5590
5591
5592 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5593 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5594 ret = MC_TARGET_SWAP;
5595 if (target)
5596 target->ent = ent;
5597 }
5598 return ret;
5599}
5600
5601#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
5607static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5608 unsigned long addr, pmd_t pmd, union mc_target *target)
5609{
5610 struct page *page = NULL;
5611 enum mc_target_type ret = MC_TARGET_NONE;
5612
5613 if (unlikely(is_swap_pmd(pmd))) {
5614 VM_BUG_ON(thp_migration_supported() &&
5615 !is_pmd_migration_entry(pmd));
5616 return ret;
5617 }
5618 page = pmd_page(pmd);
5619 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5620 if (!(mc.flags & MOVE_ANON))
5621 return ret;
5622 if (page->mem_cgroup == mc.from) {
5623 ret = MC_TARGET_PAGE;
5624 if (target) {
5625 get_page(page);
5626 target->page = page;
5627 }
5628 }
5629 return ret;
5630}
5631#else
5632static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5633 unsigned long addr, pmd_t pmd, union mc_target *target)
5634{
5635 return MC_TARGET_NONE;
5636}
5637#endif
5638
5639static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5640 unsigned long addr, unsigned long end,
5641 struct mm_walk *walk)
5642{
5643 struct vm_area_struct *vma = walk->vma;
5644 pte_t *pte;
5645 spinlock_t *ptl;
5646
5647 ptl = pmd_trans_huge_lock(pmd, vma);
5648 if (ptl) {
		/*
		 * Note there cannot be MC_TARGET_DEVICE for now, as we do
		 * not yet support transparent huge pages with
		 * MEMORY_DEVICE_PRIVATE, but this might change.
		 */
5654 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5655 mc.precharge += HPAGE_PMD_NR;
5656 spin_unlock(ptl);
5657 return 0;
5658 }
5659
5660 if (pmd_trans_unstable(pmd))
5661 return 0;
5662 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5663 for (; addr != end; pte++, addr += PAGE_SIZE)
5664 if (get_mctgt_type(vma, addr, *pte, NULL))
5665 mc.precharge++;
5666 pte_unmap_unlock(pte - 1, ptl);
5667 cond_resched();
5668
5669 return 0;
5670}
5671
5672static const struct mm_walk_ops precharge_walk_ops = {
5673 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5674};
5675
5676static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5677{
5678 unsigned long precharge;
5679
5680 down_read(&mm->mmap_sem);
5681 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5682 up_read(&mm->mmap_sem);
5683
5684 precharge = mc.precharge;
5685 mc.precharge = 0;
5686
5687 return precharge;
5688}
5689
5690static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5691{
5692 unsigned long precharge = mem_cgroup_count_precharge(mm);
5693
5694 VM_BUG_ON(mc.moving_task);
5695 mc.moving_task = current;
5696 return mem_cgroup_do_precharge(precharge);
5697}
5698
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5700static void __mem_cgroup_clear_mc(void)
5701{
5702 struct mem_cgroup *from = mc.from;
5703 struct mem_cgroup *to = mc.to;
5704
	/* we must uncharge all the leftover precharges from mc.to */
5706 if (mc.precharge) {
5707 cancel_charge(mc.to, mc.precharge);
5708 mc.precharge = 0;
5709 }
5710
	/*
	 * mem_cgroup_move_account() did not uncharge the moved pages from
	 * mc.from, so drop those leftover charges here.
	 */
5714 if (mc.moved_charge) {
5715 cancel_charge(mc.from, mc.moved_charge);
5716 mc.moved_charge = 0;
5717 }
	/* we must fixup refcnts and charges */
5719 if (mc.moved_swap) {
5720
5721 if (!mem_cgroup_is_root(mc.from))
5722 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5723
5724 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5725
		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
5730 if (!mem_cgroup_is_root(mc.to))
5731 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5732
5733 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5734 css_put_many(&mc.to->css, mc.moved_swap);
5735
5736 mc.moved_swap = 0;
5737 }
5738 memcg_oom_recover(from);
5739 memcg_oom_recover(to);
5740 wake_up_all(&mc.waitq);
5741}
5742
5743static void mem_cgroup_clear_mc(void)
5744{
5745 struct mm_struct *mm = mc.mm;
5746
	/*
	 * we must clear moving_task before waking up waiters at the end
	 * of task migration.
	 */
5751 mc.moving_task = NULL;
5752 __mem_cgroup_clear_mc();
5753 spin_lock(&mc.lock);
5754 mc.from = NULL;
5755 mc.to = NULL;
5756 mc.mm = NULL;
5757 spin_unlock(&mc.lock);
5758
5759 mmput(mm);
5760}
5761
5762static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5763{
5764 struct cgroup_subsys_state *css;
5765 struct mem_cgroup *memcg = NULL;
5766 struct mem_cgroup *from;
5767 struct task_struct *leader, *p;
5768 struct mm_struct *mm;
5769 unsigned long move_flags;
5770 int ret = 0;
5771
	/* charge immigration isn't supported on the default hierarchy */
5773 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5774 return 0;
5775
	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
5782 p = NULL;
5783 cgroup_taskset_for_each_leader(leader, css, tset) {
5784 WARN_ON_ONCE(p);
5785 p = leader;
5786 memcg = mem_cgroup_from_css(css);
5787 }
5788 if (!p)
5789 return 0;
5790
	/*
	 * We are now committed to this value whatever it is. Changes in
	 * this tunable will only affect upcoming migrations, not the
	 * current one. So we need to save it, and keep it going.
	 */
5796 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5797 if (!move_flags)
5798 return 0;
5799
5800 from = mem_cgroup_from_task(p);
5801
5802 VM_BUG_ON(from == memcg);
5803
5804 mm = get_task_mm(p);
5805 if (!mm)
5806 return 0;
5807
5808 if (mm->owner == p) {
5809 VM_BUG_ON(mc.from);
5810 VM_BUG_ON(mc.to);
5811 VM_BUG_ON(mc.precharge);
5812 VM_BUG_ON(mc.moved_charge);
5813 VM_BUG_ON(mc.moved_swap);
5814
5815 spin_lock(&mc.lock);
5816 mc.mm = mm;
5817 mc.from = from;
5818 mc.to = memcg;
5819 mc.flags = move_flags;
5820 spin_unlock(&mc.lock);
5821
5822
5823 ret = mem_cgroup_precharge_mc(mm);
5824 if (ret)
5825 mem_cgroup_clear_mc();
5826 } else {
5827 mmput(mm);
5828 }
5829 return ret;
5830}
5831
5832static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5833{
5834 if (mc.to)
5835 mem_cgroup_clear_mc();
5836}
5837
5838static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5839 unsigned long addr, unsigned long end,
5840 struct mm_walk *walk)
5841{
5842 int ret = 0;
5843 struct vm_area_struct *vma = walk->vma;
5844 pte_t *pte;
5845 spinlock_t *ptl;
5846 enum mc_target_type target_type;
5847 union mc_target target;
5848 struct page *page;
5849
5850 ptl = pmd_trans_huge_lock(pmd, vma);
5851 if (ptl) {
5852 if (mc.precharge < HPAGE_PMD_NR) {
5853 spin_unlock(ptl);
5854 return 0;
5855 }
5856 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5857 if (target_type == MC_TARGET_PAGE) {
5858 page = target.page;
5859 if (!isolate_lru_page(page)) {
5860 if (!mem_cgroup_move_account(page, true,
5861 mc.from, mc.to)) {
5862 mc.precharge -= HPAGE_PMD_NR;
5863 mc.moved_charge += HPAGE_PMD_NR;
5864 }
5865 putback_lru_page(page);
5866 }
5867 put_page(page);
5868 } else if (target_type == MC_TARGET_DEVICE) {
5869 page = target.page;
5870 if (!mem_cgroup_move_account(page, true,
5871 mc.from, mc.to)) {
5872 mc.precharge -= HPAGE_PMD_NR;
5873 mc.moved_charge += HPAGE_PMD_NR;
5874 }
5875 put_page(page);
5876 }
5877 spin_unlock(ptl);
5878 return 0;
5879 }
5880
5881 if (pmd_trans_unstable(pmd))
5882 return 0;
5883retry:
5884 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5885 for (; addr != end; addr += PAGE_SIZE) {
5886 pte_t ptent = *(pte++);
5887 bool device = false;
5888 swp_entry_t ent;
5889
5890 if (!mc.precharge)
5891 break;
5892
5893 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5894 case MC_TARGET_DEVICE:
5895 device = true;
			/* fall through */
5897 case MC_TARGET_PAGE:
5898 page = target.page;
			/*
			 * We can have a part of the split pmd here. Moving it
			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in the original
			 * memcg. There should be somebody mapping the head.
			 */
5905 if (PageTransCompound(page))
5906 goto put;
5907 if (!device && isolate_lru_page(page))
5908 goto put;
5909 if (!mem_cgroup_move_account(page, false,
5910 mc.from, mc.to)) {
5911 mc.precharge--;
				/* we uncharge from mc.from later. */
5913 mc.moved_charge++;
5914 }
5915 if (!device)
5916 putback_lru_page(page);
5917put:
5918 put_page(page);
5919 break;
5920 case MC_TARGET_SWAP:
5921 ent = target.ent;
5922 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5923 mc.precharge--;
				/* we fixup other refcnts and charges later. */
5925 mc.moved_swap++;
5926 }
5927 break;
5928 default:
5929 break;
5930 }
5931 }
5932 pte_unmap_unlock(pte - 1, ptl);
5933 cond_resched();
5934
5935 if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charging one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in the
		 * attach() phase.
		 */
5942 ret = mem_cgroup_do_precharge(1);
5943 if (!ret)
5944 goto retry;
5945 }
5946
5947 return ret;
5948}
5949
5950static const struct mm_walk_ops charge_walk_ops = {
5951 .pmd_entry = mem_cgroup_move_charge_pte_range,
5952};
5953
5954static void mem_cgroup_move_charge(void)
5955{
5956 lru_add_drain_all();
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
5962 atomic_inc(&mc.from->moving_account);
5963 synchronize_rcu();
5964retry:
5965 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
5973 __mem_cgroup_clear_mc();
5974 cond_resched();
5975 goto retry;
5976 }
5977
5978
5979
5980
5981 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
5982 NULL);
5983
5984 up_read(&mc.mm->mmap_sem);
5985 atomic_dec(&mc.from->moving_account);
5986}
5987
5988static void mem_cgroup_move_task(void)
5989{
5990 if (mc.to) {
5991 mem_cgroup_move_charge();
5992 mem_cgroup_clear_mc();
5993 }
5994}
5995#else
5996static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5997{
5998 return 0;
5999}
6000static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6001{
6002}
6003static void mem_cgroup_move_task(void)
6004{
6005}
6006#endif
6007
6008
6009
6010
6011
6012
6013static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6014{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * off won't cause unexpected behaviors on the root cgroup.
	 */
6020 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6021 root_mem_cgroup->use_hierarchy = true;
6022 else
6023 root_mem_cgroup->use_hierarchy = false;
6024}
6025
6026static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6027{
6028 if (value == PAGE_COUNTER_MAX)
6029 seq_puts(m, "max\n");
6030 else
6031 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6032
6033 return 0;
6034}
6035
6036static u64 memory_current_read(struct cgroup_subsys_state *css,
6037 struct cftype *cft)
6038{
6039 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6040
6041 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6042}
6043
6044static int memory_min_show(struct seq_file *m, void *v)
6045{
6046 return seq_puts_memcg_tunable(m,
6047 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6048}
6049
6050static ssize_t memory_min_write(struct kernfs_open_file *of,
6051 char *buf, size_t nbytes, loff_t off)
6052{
6053 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6054 unsigned long min;
6055 int err;
6056
6057 buf = strstrip(buf);
6058 err = page_counter_memparse(buf, "max", &min);
6059 if (err)
6060 return err;
6061
6062 page_counter_set_min(&memcg->memory, min);
6063
6064 return nbytes;
6065}
6066
6067static int memory_low_show(struct seq_file *m, void *v)
6068{
6069 return seq_puts_memcg_tunable(m,
6070 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6071}
6072
6073static ssize_t memory_low_write(struct kernfs_open_file *of,
6074 char *buf, size_t nbytes, loff_t off)
6075{
6076 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6077 unsigned long low;
6078 int err;
6079
6080 buf = strstrip(buf);
6081 err = page_counter_memparse(buf, "max", &low);
6082 if (err)
6083 return err;
6084
6085 page_counter_set_low(&memcg->memory, low);
6086
6087 return nbytes;
6088}
6089
6090static int memory_high_show(struct seq_file *m, void *v)
6091{
6092 return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
6093}
6094
6095static ssize_t memory_high_write(struct kernfs_open_file *of,
6096 char *buf, size_t nbytes, loff_t off)
6097{
6098 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6099 unsigned long nr_pages;
6100 unsigned long high;
6101 int err;
6102
6103 buf = strstrip(buf);
6104 err = page_counter_memparse(buf, "max", &high);
6105 if (err)
6106 return err;
6107
6108 memcg->high = high;
6109
6110 nr_pages = page_counter_read(&memcg->memory);
6111 if (nr_pages > high)
6112 try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6113 GFP_KERNEL, true);
6114
6115 memcg_wb_domain_size_changed(memcg);
6116 return nbytes;
6117}
6118
6119static int memory_max_show(struct seq_file *m, void *v)
6120{
6121 return seq_puts_memcg_tunable(m,
6122 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6123}
6124
6125static ssize_t memory_max_write(struct kernfs_open_file *of,
6126 char *buf, size_t nbytes, loff_t off)
6127{
6128 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6129 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6130 bool drained = false;
6131 unsigned long max;
6132 int err;
6133
6134 buf = strstrip(buf);
6135 err = page_counter_memparse(buf, "max", &max);
6136 if (err)
6137 return err;
6138
6139 xchg(&memcg->memory.max, max);
6140
6141 for (;;) {
6142 unsigned long nr_pages = page_counter_read(&memcg->memory);
6143
6144 if (nr_pages <= max)
6145 break;
6146
6147 if (signal_pending(current)) {
6148 err = -EINTR;
6149 break;
6150 }
6151
6152 if (!drained) {
6153 drain_all_stock(memcg);
6154 drained = true;
6155 continue;
6156 }
6157
6158 if (nr_reclaims) {
6159 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6160 GFP_KERNEL, true))
6161 nr_reclaims--;
6162 continue;
6163 }
6164
6165 memcg_memory_event(memcg, MEMCG_OOM);
6166 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6167 break;
6168 }
6169
6170 memcg_wb_domain_size_changed(memcg);
6171 return nbytes;
6172}
6173
6174static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6175{
6176 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6177 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6178 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6179 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6180 seq_printf(m, "oom_kill %lu\n",
6181 atomic_long_read(&events[MEMCG_OOM_KILL]));
6182}
6183
6184static int memory_events_show(struct seq_file *m, void *v)
6185{
6186 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6187
6188 __memory_events_show(m, memcg->memory_events);
6189 return 0;
6190}
6191
6192static int memory_events_local_show(struct seq_file *m, void *v)
6193{
6194 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6195
6196 __memory_events_show(m, memcg->memory_events_local);
6197 return 0;
6198}
6199
6200static int memory_stat_show(struct seq_file *m, void *v)
6201{
6202 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6203 char *buf;
6204
6205 buf = memory_stat_format(memcg);
6206 if (!buf)
6207 return -ENOMEM;
6208 seq_puts(m, buf);
6209 kfree(buf);
6210 return 0;
6211}
6212
6213static int memory_oom_group_show(struct seq_file *m, void *v)
6214{
6215 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6216
6217 seq_printf(m, "%d\n", memcg->oom_group);
6218
6219 return 0;
6220}
6221
6222static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6223 char *buf, size_t nbytes, loff_t off)
6224{
6225 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6226 int ret, oom_group;
6227
6228 buf = strstrip(buf);
6229 if (!buf)
6230 return -EINVAL;
6231
6232 ret = kstrtoint(buf, 0, &oom_group);
6233 if (ret)
6234 return ret;
6235
6236 if (oom_group != 0 && oom_group != 1)
6237 return -EINVAL;
6238
6239 memcg->oom_group = oom_group;
6240
6241 return nbytes;
6242}
6243
6244static struct cftype memory_files[] = {
6245 {
6246 .name = "current",
6247 .flags = CFTYPE_NOT_ON_ROOT,
6248 .read_u64 = memory_current_read,
6249 },
6250 {
6251 .name = "min",
6252 .flags = CFTYPE_NOT_ON_ROOT,
6253 .seq_show = memory_min_show,
6254 .write = memory_min_write,
6255 },
6256 {
6257 .name = "low",
6258 .flags = CFTYPE_NOT_ON_ROOT,
6259 .seq_show = memory_low_show,
6260 .write = memory_low_write,
6261 },
6262 {
6263 .name = "high",
6264 .flags = CFTYPE_NOT_ON_ROOT,
6265 .seq_show = memory_high_show,
6266 .write = memory_high_write,
6267 },
6268 {
6269 .name = "max",
6270 .flags = CFTYPE_NOT_ON_ROOT,
6271 .seq_show = memory_max_show,
6272 .write = memory_max_write,
6273 },
6274 {
6275 .name = "events",
6276 .flags = CFTYPE_NOT_ON_ROOT,
6277 .file_offset = offsetof(struct mem_cgroup, events_file),
6278 .seq_show = memory_events_show,
6279 },
6280 {
6281 .name = "events.local",
6282 .flags = CFTYPE_NOT_ON_ROOT,
6283 .file_offset = offsetof(struct mem_cgroup, events_local_file),
6284 .seq_show = memory_events_local_show,
6285 },
6286 {
6287 .name = "stat",
6288 .flags = CFTYPE_NOT_ON_ROOT,
6289 .seq_show = memory_stat_show,
6290 },
6291 {
6292 .name = "oom.group",
6293 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6294 .seq_show = memory_oom_group_show,
6295 .write = memory_oom_group_write,
6296 },
6297 { }
6298};
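/*
 * Descriptive sketch of the resulting interface (not part of the original
 * file): the table above creates memory.current, memory.min, memory.low,
 * memory.high, memory.max, memory.events, memory.events.local, memory.stat
 * and memory.oom.group in each non-root cgroup v2 directory, e.g.
 *
 *     cat /sys/fs/cgroup/mygroup/memory.current
 *     echo 1 > /sys/fs/cgroup/mygroup/memory.oom.group
 *
 * "mygroup" is a hypothetical cgroup name used only for illustration.
 */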
6299
6300struct cgroup_subsys memory_cgrp_subsys = {
6301 .css_alloc = mem_cgroup_css_alloc,
6302 .css_online = mem_cgroup_css_online,
6303 .css_offline = mem_cgroup_css_offline,
6304 .css_released = mem_cgroup_css_released,
6305 .css_free = mem_cgroup_css_free,
6306 .css_reset = mem_cgroup_css_reset,
6307 .can_attach = mem_cgroup_can_attach,
6308 .cancel_attach = mem_cgroup_cancel_attach,
6309 .post_attach = mem_cgroup_move_task,
6310 .bind = mem_cgroup_bind,
6311 .dfl_cftypes = memory_files,
6312 .legacy_cftypes = mem_cgroup_legacy_files,
6313 .early_init = 0,
6314};
6315
/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns %MEMCG_PROT_MIN if @memcg's usage is within its effective
 * memory.min protection, %MEMCG_PROT_LOW if it is within its effective
 * memory.low protection, and %MEMCG_PROT_NONE otherwise.
 *
 * @root is exclusive; it is never protected when looked at directly.
 *
 * To provide proper hierarchical behavior, effective memory.min and
 * memory.low values are used: a child's effective protection is capped
 * by its parent's effective protection, and when the children of a
 * parent collectively claim more protection than the parent has to
 * give, the parent's effective protection is distributed among them in
 * proportion to their protected usage:
 *
 *     min_usage = min(usage, memory.min)
 *     emin = min(memory.min,
 *                parent_emin * min_usage / siblings_min_usage)
 *
 * and likewise for memory.low/elow, where siblings_min_usage is the sum
 * of the protected usages of all children at the parent's level.
 */
6386enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
6387 struct mem_cgroup *memcg)
6388{
6389 struct mem_cgroup *parent;
6390 unsigned long emin, parent_emin;
6391 unsigned long elow, parent_elow;
6392 unsigned long usage;
6393
6394 if (mem_cgroup_disabled())
6395 return MEMCG_PROT_NONE;
6396
6397 if (!root)
6398 root = root_mem_cgroup;
6399 if (memcg == root)
6400 return MEMCG_PROT_NONE;
6401
6402 usage = page_counter_read(&memcg->memory);
6403 if (!usage)
6404 return MEMCG_PROT_NONE;
6405
6406 emin = memcg->memory.min;
6407 elow = memcg->memory.low;
6408
6409 parent = parent_mem_cgroup(memcg);
6410
6411 if (!parent)
6412 return MEMCG_PROT_NONE;
6413
6414 if (parent == root)
6415 goto exit;
6416
6417 parent_emin = READ_ONCE(parent->memory.emin);
6418 emin = min(emin, parent_emin);
6419 if (emin && parent_emin) {
6420 unsigned long min_usage, siblings_min_usage;
6421
6422 min_usage = min(usage, memcg->memory.min);
6423 siblings_min_usage = atomic_long_read(
6424 &parent->memory.children_min_usage);
6425
6426 if (min_usage && siblings_min_usage)
6427 emin = min(emin, parent_emin * min_usage /
6428 siblings_min_usage);
6429 }
6430
6431 parent_elow = READ_ONCE(parent->memory.elow);
6432 elow = min(elow, parent_elow);
6433 if (elow && parent_elow) {
6434 unsigned long low_usage, siblings_low_usage;
6435
6436 low_usage = min(usage, memcg->memory.low);
6437 siblings_low_usage = atomic_long_read(
6438 &parent->memory.children_low_usage);
6439
6440 if (low_usage && siblings_low_usage)
6441 elow = min(elow, parent_elow * low_usage /
6442 siblings_low_usage);
6443 }
6444
6445exit:
6446 memcg->memory.emin = emin;
6447 memcg->memory.elow = elow;
6448
6449 if (usage <= emin)
6450 return MEMCG_PROT_MIN;
6451 else if (usage <= elow)
6452 return MEMCG_PROT_LOW;
6453 else
6454 return MEMCG_PROT_NONE;
6455}
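/*
 * Illustrative sketch (not part of the original file): how a reclaim-side
 * caller is expected to consume mem_cgroup_protected() during a top-down
 * tree walk.  The function name "example_memcg_protected_skip" is
 * hypothetical; the real consumer is shrink_node() in mm/vmscan.c.
 */
static bool __maybe_unused example_memcg_protected_skip(struct mem_cgroup *root,
							 struct mem_cgroup *memcg)
{
	switch (mem_cgroup_protected(root, memcg)) {
	case MEMCG_PROT_MIN:
		/* Hard protection: skip reclaim from this group entirely. */
		return true;
	case MEMCG_PROT_LOW:
		/* Best-effort protection: only reclaim under real pressure. */
		return false;
	case MEMCG_PROT_NONE:
	default:
		return false;
	}
}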
6456
/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge().  Or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
6475int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6476 gfp_t gfp_mask, struct mem_cgroup **memcgp,
6477 bool compound)
6478{
6479 struct mem_cgroup *memcg = NULL;
6480 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6481 int ret = 0;
6482
6483 if (mem_cgroup_disabled())
6484 goto out;
6485
6486 if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
6494 VM_BUG_ON_PAGE(!PageLocked(page), page);
6495 if (compound_head(page)->mem_cgroup)
6496 goto out;
6497
6498 if (do_swap_account) {
6499 swp_entry_t ent = { .val = page_private(page), };
6500 unsigned short id = lookup_swap_cgroup_id(ent);
6501
6502 rcu_read_lock();
6503 memcg = mem_cgroup_from_id(id);
6504 if (memcg && !css_tryget_online(&memcg->css))
6505 memcg = NULL;
6506 rcu_read_unlock();
6507 }
6508 }
6509
6510 if (!memcg)
6511 memcg = get_mem_cgroup_from_mm(mm);
6512
6513 ret = try_charge(memcg, gfp_mask, nr_pages);
6514
6515 css_put(&memcg->css);
6516out:
6517 *memcgp = memcg;
6518 return ret;
6519}
6520
6521int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6522 gfp_t gfp_mask, struct mem_cgroup **memcgp,
6523 bool compound)
6524{
6525 struct mem_cgroup *memcg;
6526 int ret;
6527
6528 ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6529 memcg = *memcgp;
6530 mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6531 return ret;
6532}
6533
/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 * @compound: charge the page as compound or small page
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
6551void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6552 bool lrucare, bool compound)
6553{
6554 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6555
6556 VM_BUG_ON_PAGE(!page->mapping, page);
6557 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6558
6559 if (mem_cgroup_disabled())
6560 return;
6561
6562
6563
6564
6565
6566 if (!memcg)
6567 return;
6568
6569 commit_charge(page, memcg, lrucare);
6570
6571 local_irq_disable();
6572 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6573 memcg_check_events(memcg, page);
6574 local_irq_enable();
6575
6576 if (do_memsw_account() && PageSwapCache(page)) {
6577 swp_entry_t entry = { .val = page_private(page) };
6578
6579
6580
6581
6582
6583 mem_cgroup_uncharge_swap(entry, nr_pages);
6584 }
6585}
6586
/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @compound: charge the page as compound or small page
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
6595void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6596 bool compound)
6597{
6598 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6599
6600 if (mem_cgroup_disabled())
6601 return;
6602
6603
6604
6605
6606
6607 if (!memcg)
6608 return;
6609
6610 cancel_charge(memcg, nr_pages);
6611}
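/*
 * Illustrative sketch (not part of the original file) of the two-step charge
 * protocol formed by mem_cgroup_try_charge(), mem_cgroup_commit_charge() and
 * mem_cgroup_cancel_charge().  "example_charge_new_page" and the
 * "example_map_page" callback are hypothetical names; real callers live in
 * the fault and page cache paths (mm/memory.c, mm/filemap.c, mm/shmem.c).
 */
static int __maybe_unused example_charge_new_page(struct page *page,
						  struct mm_struct *mm,
						  gfp_t gfp,
						  int (*example_map_page)(struct page *))
{
	struct mem_cgroup *memcg;
	int ret;

	/* Reserve the charge, reclaiming according to @gfp if necessary. */
	ret = mem_cgroup_try_charge(page, mm, gfp, &memcg, false);
	if (ret)
		return ret;

	/* Instantiate the page (page tables / page cache). */
	ret = example_map_page(page);
	if (ret) {
		/* Instantiation failed: abort the charge transaction. */
		mem_cgroup_cancel_charge(page, memcg, false);
		return ret;
	}

	/* Success: finalize the charge against @memcg. */
	mem_cgroup_commit_charge(page, memcg, false, false);
	return 0;
}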
6612
6613struct uncharge_gather {
6614 struct mem_cgroup *memcg;
6615 unsigned long pgpgout;
6616 unsigned long nr_anon;
6617 unsigned long nr_file;
6618 unsigned long nr_kmem;
6619 unsigned long nr_huge;
6620 unsigned long nr_shmem;
6621 struct page *dummy_page;
6622};
6623
6624static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6625{
6626 memset(ug, 0, sizeof(*ug));
6627}
6628
6629static void uncharge_batch(const struct uncharge_gather *ug)
6630{
6631 unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
6632 unsigned long flags;
6633
6634 if (!mem_cgroup_is_root(ug->memcg)) {
6635 page_counter_uncharge(&ug->memcg->memory, nr_pages);
6636 if (do_memsw_account())
6637 page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6638 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6639 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6640 memcg_oom_recover(ug->memcg);
6641 }
6642
6643 local_irq_save(flags);
6644 __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6645 __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6646 __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6647 __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6648 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6649 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
6650 memcg_check_events(ug->memcg, ug->dummy_page);
6651 local_irq_restore(flags);
6652
6653 if (!mem_cgroup_is_root(ug->memcg))
6654 css_put_many(&ug->memcg->css, nr_pages);
6655}
6656
6657static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6658{
6659 VM_BUG_ON_PAGE(PageLRU(page), page);
6660 VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6661 !PageHWPoison(page) , page);
6662
6663 if (!page->mem_cgroup)
6664 return;
6665
6666
6667
6668
6669
6670
6671
6672 if (ug->memcg != page->mem_cgroup) {
6673 if (ug->memcg) {
6674 uncharge_batch(ug);
6675 uncharge_gather_clear(ug);
6676 }
6677 ug->memcg = page->mem_cgroup;
6678 }
6679
6680 if (!PageKmemcg(page)) {
6681 unsigned int nr_pages = 1;
6682
6683 if (PageTransHuge(page)) {
6684 nr_pages = compound_nr(page);
6685 ug->nr_huge += nr_pages;
6686 }
6687 if (PageAnon(page))
6688 ug->nr_anon += nr_pages;
6689 else {
6690 ug->nr_file += nr_pages;
6691 if (PageSwapBacked(page))
6692 ug->nr_shmem += nr_pages;
6693 }
6694 ug->pgpgout++;
6695 } else {
6696 ug->nr_kmem += compound_nr(page);
6697 __ClearPageKmemcg(page);
6698 }
6699
6700 ug->dummy_page = page;
6701 page->mem_cgroup = NULL;
6702}
6703
6704static void uncharge_list(struct list_head *page_list)
6705{
6706 struct uncharge_gather ug;
6707 struct list_head *next;
6708
6709 uncharge_gather_clear(&ug);
6710
6711
6712
6713
6714
6715 next = page_list->next;
6716 do {
6717 struct page *page;
6718
6719 page = list_entry(next, struct page, lru);
6720 next = page->lru.next;
6721
6722 uncharge_page(page, &ug);
6723 } while (next != page_list);
6724
6725 if (ug.memcg)
6726 uncharge_batch(&ug);
6727}
6728
/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
6736void mem_cgroup_uncharge(struct page *page)
6737{
6738 struct uncharge_gather ug;
6739
6740 if (mem_cgroup_disabled())
6741 return;
6742
6743
6744 if (!page->mem_cgroup)
6745 return;
6746
6747 uncharge_gather_clear(&ug);
6748 uncharge_page(page, &ug);
6749 uncharge_batch(&ug);
6750}
6751
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
6759void mem_cgroup_uncharge_list(struct list_head *page_list)
6760{
6761 if (mem_cgroup_disabled())
6762 return;
6763
6764 if (!list_empty(page_list))
6765 uncharge_list(page_list);
6766}
6767
/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
6778void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6779{
6780 struct mem_cgroup *memcg;
6781 unsigned int nr_pages;
6782 bool compound;
6783 unsigned long flags;
6784
6785 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6786 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6787 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6788 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6789 newpage);
6790
6791 if (mem_cgroup_disabled())
6792 return;
6793
6794
6795 if (newpage->mem_cgroup)
6796 return;
6797
6798
6799 memcg = oldpage->mem_cgroup;
6800 if (!memcg)
6801 return;
6802
6803
6804 compound = PageTransHuge(newpage);
6805 nr_pages = compound ? hpage_nr_pages(newpage) : 1;
6806
6807 page_counter_charge(&memcg->memory, nr_pages);
6808 if (do_memsw_account())
6809 page_counter_charge(&memcg->memsw, nr_pages);
6810 css_get_many(&memcg->css, nr_pages);
6811
6812 commit_charge(newpage, memcg, false);
6813
6814 local_irq_save(flags);
6815 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
6816 memcg_check_events(memcg, newpage);
6817 local_irq_restore(flags);
6818}
6819
6820DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6821EXPORT_SYMBOL(memcg_sockets_enabled_key);
6822
6823void mem_cgroup_sk_alloc(struct sock *sk)
6824{
6825 struct mem_cgroup *memcg;
6826
6827 if (!mem_cgroup_sockets_enabled)
6828 return;
6829
	/*
	 * Socket cloning can throw us here with sk_memcg already
	 * filled. It won't, however, necessarily happen from
	 * process context. So the test for root memcg given
	 * the current task's memcg won't help us in this case.
	 *
	 * Respecting the original socket's memcg is a better
	 * decision in this case.
	 */
6839 if (sk->sk_memcg) {
6840 css_get(&sk->sk_memcg->css);
6841 return;
6842 }
6843
6844 rcu_read_lock();
6845 memcg = mem_cgroup_from_task(current);
6846 if (memcg == root_mem_cgroup)
6847 goto out;
6848 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6849 goto out;
6850 if (css_tryget_online(&memcg->css))
6851 sk->sk_memcg = memcg;
6852out:
6853 rcu_read_unlock();
6854}
6855
6856void mem_cgroup_sk_free(struct sock *sk)
6857{
6858 if (sk->sk_memcg)
6859 css_put(&sk->sk_memcg->css);
6860}
6861
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if the charge had to be forced.
 */
6870bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6871{
6872 gfp_t gfp_mask = GFP_KERNEL;
6873
6874 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6875 struct page_counter *fail;
6876
6877 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6878 memcg->tcpmem_pressure = 0;
6879 return true;
6880 }
6881 page_counter_charge(&memcg->tcpmem, nr_pages);
6882 memcg->tcpmem_pressure = 1;
6883 return false;
6884 }
6885
	/* Don't block in the packet receive path */
6887 if (in_softirq())
6888 gfp_mask = GFP_NOWAIT;
6889
6890 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6891
6892 if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6893 return true;
6894
6895 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
6896 return false;
6897}
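/*
 * Illustrative sketch (not part of the original file): how the network stack
 * is expected to use mem_cgroup_charge_skmem() for a socket, paired with
 * mem_cgroup_uncharge_skmem() below on release.  "example_sk_charge" is a
 * hypothetical wrapper; the real callers are the sk_memory_allocated paths
 * in net/core/sock.c.
 */
static bool __maybe_unused example_sk_charge(struct sock *sk,
					     unsigned int nr_pages)
{
	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg)
		return true;

	/* Returns false when the charge had to be forced over the limit. */
	return mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages);
}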
6898
/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
6904void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6905{
6906 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6907 page_counter_uncharge(&memcg->tcpmem, nr_pages);
6908 return;
6909 }
6910
6911 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
6912
6913 refill_stock(memcg, nr_pages);
6914}
6915
6916static int __init cgroup_memory(char *s)
6917{
6918 char *token;
6919
6920 while ((token = strsep(&s, ",")) != NULL) {
6921 if (!*token)
6922 continue;
6923 if (!strcmp(token, "nosocket"))
6924 cgroup_memory_nosocket = true;
6925 if (!strcmp(token, "nokmem"))
6926 cgroup_memory_nokmem = true;
6927 }
6928 return 0;
6929}
6930__setup("cgroup.memory=", cgroup_memory);
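/*
 * Usage note (illustrative, not from the original source): booting with
 *
 *     cgroup.memory=nosocket,nokmem
 *
 * sets cgroup_memory_nosocket and cgroup_memory_nokmem above, disabling
 * socket memory accounting and kernel memory accounting respectively.
 */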
6931
/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug), but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
6940static int __init mem_cgroup_init(void)
6941{
6942 int cpu, node;
6943
6944#ifdef CONFIG_MEMCG_KMEM
	/*
	 * Kmem cache creation is mostly done with the slab_mutex held,
	 * so use a workqueue with limited concurrency to avoid stalling
	 * all worker threads in case lots of cgroups are created and
	 * destroyed simultaneously.
	 */
6951 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6952 BUG_ON(!memcg_kmem_cache_wq);
6953#endif
6954
6955 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
6956 memcg_hotplug_cpu_dead);
6957
6958 for_each_possible_cpu(cpu)
6959 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6960 drain_local_stock);
6961
6962 for_each_node(node) {
6963 struct mem_cgroup_tree_per_node *rtpn;
6964
6965 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
6966 node_online(node) ? node : NUMA_NO_NODE);
6967
6968 rtpn->rb_root = RB_ROOT;
6969 rtpn->rb_rightmost = NULL;
6970 spin_lock_init(&rtpn->lock);
6971 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6972 }
6973
6974 return 0;
6975}
6976subsys_initcall(mem_cgroup_init);
6977
6978#ifdef CONFIG_MEMCG_SWAP
6979static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
6980{
6981 while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
6986 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
6987 VM_BUG_ON(1);
6988 break;
6989 }
6990 memcg = parent_mem_cgroup(memcg);
6991 if (!memcg)
6992 memcg = root_mem_cgroup;
6993 }
6994 return memcg;
6995}
6996
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
7004void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
7005{
7006 struct mem_cgroup *memcg, *swap_memcg;
7007 unsigned int nr_entries;
7008 unsigned short oldid;
7009
7010 VM_BUG_ON_PAGE(PageLRU(page), page);
7011 VM_BUG_ON_PAGE(page_count(page), page);
7012
7013 if (!do_memsw_account())
7014 return;
7015
7016 memcg = page->mem_cgroup;
7017
7018
7019 if (!memcg)
7020 return;
7021
	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
7027 swap_memcg = mem_cgroup_id_get_online(memcg);
7028 nr_entries = hpage_nr_pages(page);
	/* Get references for the tail pages, too */
7030 if (nr_entries > 1)
7031 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7032 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7033 nr_entries);
7034 VM_BUG_ON_PAGE(oldid, page);
7035 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7036
7037 page->mem_cgroup = NULL;
7038
7039 if (!mem_cgroup_is_root(memcg))
7040 page_counter_uncharge(&memcg->memory, nr_entries);
7041
7042 if (memcg != swap_memcg) {
7043 if (!mem_cgroup_is_root(swap_memcg))
7044 page_counter_charge(&swap_memcg->memsw, nr_entries);
7045 page_counter_uncharge(&memcg->memsw, nr_entries);
7046 }
7047
	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock, which is taken with interrupts off. It is
	 * important to have interrupts disabled because it is the only
	 * synchronisation we have for updating the per-CPU variables.
	 */
7054 VM_BUG_ON(!irqs_disabled());
7055 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
7056 -nr_entries);
7057 memcg_check_events(memcg, page);
7058
7059 if (!mem_cgroup_is_root(memcg))
7060 css_put_many(&memcg->css, nr_entries);
7061}
7062
/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
7072int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7073{
7074 unsigned int nr_pages = hpage_nr_pages(page);
7075 struct page_counter *counter;
7076 struct mem_cgroup *memcg;
7077 unsigned short oldid;
7078
7079 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
7080 return 0;
7081
7082 memcg = page->mem_cgroup;
7083
7084
7085 if (!memcg)
7086 return 0;
7087
7088 if (!entry.val) {
7089 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7090 return 0;
7091 }
7092
7093 memcg = mem_cgroup_id_get_online(memcg);
7094
7095 if (!mem_cgroup_is_root(memcg) &&
7096 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7097 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7098 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7099 mem_cgroup_id_put(memcg);
7100 return -ENOMEM;
7101 }
7102
	/* Get references for the tail pages, too */
7104 if (nr_pages > 1)
7105 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7106 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7107 VM_BUG_ON_PAGE(oldid, page);
7108 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7109
7110 return 0;
7111}
7112
/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
7118void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7119{
7120 struct mem_cgroup *memcg;
7121 unsigned short id;
7122
7123 if (!do_swap_account)
7124 return;
7125
7126 id = swap_cgroup_record(entry, 0, nr_pages);
7127 rcu_read_lock();
7128 memcg = mem_cgroup_from_id(id);
7129 if (memcg) {
7130 if (!mem_cgroup_is_root(memcg)) {
7131 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7132 page_counter_uncharge(&memcg->swap, nr_pages);
7133 else
7134 page_counter_uncharge(&memcg->memsw, nr_pages);
7135 }
7136 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7137 mem_cgroup_id_put_many(memcg, nr_pages);
7138 }
7139 rcu_read_unlock();
7140}
7141
7142long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7143{
7144 long nr_swap_pages = get_nr_swap_pages();
7145
7146 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7147 return nr_swap_pages;
7148 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7149 nr_swap_pages = min_t(long, nr_swap_pages,
7150 READ_ONCE(memcg->swap.max) -
7151 page_counter_read(&memcg->swap));
7152 return nr_swap_pages;
7153}
7154
7155bool mem_cgroup_swap_full(struct page *page)
7156{
7157 struct mem_cgroup *memcg;
7158
7159 VM_BUG_ON_PAGE(!PageLocked(page), page);
7160
7161 if (vm_swap_full())
7162 return true;
7163 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7164 return false;
7165
7166 memcg = page->mem_cgroup;
7167 if (!memcg)
7168 return false;
7169
7170 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7171 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7172 return true;
7173
7174 return false;
7175}
7176
7177
7178#ifdef CONFIG_MEMCG_SWAP_ENABLED
7179static int really_do_swap_account __initdata = 1;
7180#else
7181static int really_do_swap_account __initdata;
7182#endif
7183
7184static int __init enable_swap_account(char *s)
7185{
7186 if (!strcmp(s, "1"))
7187 really_do_swap_account = 1;
7188 else if (!strcmp(s, "0"))
7189 really_do_swap_account = 0;
7190 return 1;
7191}
7192__setup("swapaccount=", enable_swap_account);
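/*
 * Usage note (illustrative, not from the original source): swap accounting
 * defaults to CONFIG_MEMCG_SWAP_ENABLED and can be overridden on the kernel
 * command line, e.g.
 *
 *     swapaccount=0
 *
 * which clears really_do_swap_account so that mem_cgroup_swap_init() below
 * neither enables do_swap_account nor registers the swap control files.
 */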
7193
7194static u64 swap_current_read(struct cgroup_subsys_state *css,
7195 struct cftype *cft)
7196{
7197 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7198
7199 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7200}
7201
7202static int swap_max_show(struct seq_file *m, void *v)
7203{
7204 return seq_puts_memcg_tunable(m,
7205 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7206}
7207
7208static ssize_t swap_max_write(struct kernfs_open_file *of,
7209 char *buf, size_t nbytes, loff_t off)
7210{
7211 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7212 unsigned long max;
7213 int err;
7214
7215 buf = strstrip(buf);
7216 err = page_counter_memparse(buf, "max", &max);
7217 if (err)
7218 return err;
7219
7220 xchg(&memcg->swap.max, max);
7221
7222 return nbytes;
7223}
7224
7225static int swap_events_show(struct seq_file *m, void *v)
7226{
7227 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7228
7229 seq_printf(m, "max %lu\n",
7230 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7231 seq_printf(m, "fail %lu\n",
7232 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7233
7234 return 0;
7235}
7236
7237static struct cftype swap_files[] = {
7238 {
7239 .name = "swap.current",
7240 .flags = CFTYPE_NOT_ON_ROOT,
7241 .read_u64 = swap_current_read,
7242 },
7243 {
7244 .name = "swap.max",
7245 .flags = CFTYPE_NOT_ON_ROOT,
7246 .seq_show = swap_max_show,
7247 .write = swap_max_write,
7248 },
7249 {
7250 .name = "swap.events",
7251 .flags = CFTYPE_NOT_ON_ROOT,
7252 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
7253 .seq_show = swap_events_show,
7254 },
7255 { }
7256};
7257
7258static struct cftype memsw_cgroup_files[] = {
7259 {
7260 .name = "memsw.usage_in_bytes",
7261 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7262 .read_u64 = mem_cgroup_read_u64,
7263 },
7264 {
7265 .name = "memsw.max_usage_in_bytes",
7266 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7267 .write = mem_cgroup_reset,
7268 .read_u64 = mem_cgroup_read_u64,
7269 },
7270 {
7271 .name = "memsw.limit_in_bytes",
7272 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7273 .write = mem_cgroup_write,
7274 .read_u64 = mem_cgroup_read_u64,
7275 },
7276 {
7277 .name = "memsw.failcnt",
7278 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7279 .write = mem_cgroup_reset,
7280 .read_u64 = mem_cgroup_read_u64,
7281 },
7282 { },
7283};
7284
7285static int __init mem_cgroup_swap_init(void)
7286{
7287 if (!mem_cgroup_disabled() && really_do_swap_account) {
7288 do_swap_account = 1;
7289 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
7290 swap_files));
7291 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
7292 memsw_cgroup_files));
7293 }
7294 return 0;
7295}
7296subsys_initcall(mem_cgroup_swap_init);
7297
7298#endif
7299