// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>
69
70struct cgroup_subsys memory_cgrp_subsys __read_mostly;
71EXPORT_SYMBOL(memory_cgrp_subsys);
72
73struct mem_cgroup *root_mem_cgroup __read_mostly;
74
75#define MEM_CGROUP_RECLAIM_RETRIES 5

/* Socket memory accounting disabled? */
78static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
81static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
84#ifdef CONFIG_MEMCG_SWAP
85int do_swap_account __read_mostly;
86#else
87#define do_swap_account 0
88#endif

/* Whether legacy memory+swap accounting is active */
91static bool do_memsw_account(void)
92{
93 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
94}
95
96static const char *const mem_cgroup_lru_names[] = {
97 "inactive_anon",
98 "active_anon",
99 "inactive_file",
100 "active_file",
101 "unevictable",
102};
103
104#define THRESHOLDS_EVENTS_TARGET 128
105#define SOFTLIMIT_EVENTS_TARGET 1024
106#define NUMAINFO_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 */
113struct mem_cgroup_tree_per_node {
114 struct rb_root rb_root;
115 struct rb_node *rb_rightmost;
116 spinlock_t lock;
117};
118
119struct mem_cgroup_tree {
120 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
121};
122
123static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
126struct mem_cgroup_eventfd_list {
127 struct list_head list;
128 struct eventfd_ctx *eventfd;
129};

/*
 * cgroup_event represents an event which userspace wants to receive.
 */
134struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
138 struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
142 struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
146 struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on the eventfd to send a notification to userspace.
	 */
152 int (*register_event)(struct mem_cgroup *memcg,
153 struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
159 void (*unregister_event)(struct mem_cgroup *memcg,
160 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
165 poll_table pt;
166 wait_queue_head_t *wqh;
167 wait_queue_entry_t wait;
168 struct work_struct remove;
169};
170
171static void mem_cgroup_threshold(struct mem_cgroup *memcg);
172static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuffs for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
178#define MOVE_ANON 0x1U
179#define MOVE_FILE 0x2U
180#define MOVE_MASK (MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
183static struct move_charge_struct {
184 spinlock_t lock;
185 struct mm_struct *mm;
186 struct mem_cgroup *from;
187 struct mem_cgroup *to;
188 unsigned long flags;
189 unsigned long precharge;
190 unsigned long moved_charge;
191 unsigned long moved_swap;
192 struct task_struct *moving_task;
193 wait_queue_head_t waitq;
194} mc = {
195 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
196 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
197};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
203#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
204#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
205
206enum charge_type {
207 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
208 MEM_CGROUP_CHARGE_TYPE_ANON,
209 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
210 MEM_CGROUP_CHARGE_TYPE_DROP,
211 NR_CHARGE_TYPE,
212};

/* for encoding cft->private value on file */
215enum res_type {
216 _MEM,
217 _MEMSWAP,
218 _OOM_TYPE,
219 _KMEM,
220 _TCP,
221};
222
223#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
224#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
225#define MEMFILE_ATTR(val) ((val) & 0xffff)
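
/*
 * Illustrative sketch (editorial addition, not in the original source):
 * MEMFILE_PRIVATE() packs a res_type into the high 16 bits of a cftype's
 * private value and an attribute into the low 16 bits; the other two
 * macros recover them.  RES_LIMIT here stands in for an attribute
 * constant defined elsewhere in this file.
 *
 *	int priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
 *
 *	MEMFILE_TYPE(priv) == _MEMSWAP;		// high 16 bits
 *	MEMFILE_ATTR(priv) == RES_LIMIT;	// low 16 bits
 */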

/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for cleanup.
 */
234#define for_each_mem_cgroup_tree(iter, root) \
235 for (iter = mem_cgroup_iter(root, NULL, NULL); \
236 iter != NULL; \
237 iter = mem_cgroup_iter(root, iter, NULL))
238
239#define for_each_mem_cgroup(iter) \
240 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
241 iter != NULL; \
242 iter = mem_cgroup_iter(NULL, iter, NULL))
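
/*
 * Illustrative sketch (editorial addition, not in the original source):
 * a walk that bails out early must pair the break with
 * mem_cgroup_iter_break() so the css reference held on the current
 * position is dropped; some_condition() is a hypothetical predicate.
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */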
243
244static inline bool should_force_charge(void)
245{
246 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
247 (current->flags & PF_EXITING);
248}

/* Some nice accessors for the vmpressure. */
251struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
252{
253 if (!memcg)
254 memcg = root_mem_cgroup;
255 return &memcg->vmpressure;
256}
257
258struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
259{
260 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
261}
262
263#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few are kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
275static DEFINE_IDA(memcg_cache_ida);
276int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
279static DECLARE_RWSEM(memcg_cache_ids_sem);
280
281void memcg_get_cache_ids(void)
282{
283 down_read(&memcg_cache_ids_sem);
284}
285
286void memcg_put_cache_ids(void)
287{
288 up_read(&memcg_cache_ids_sem);
289}

/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could
 * get this constant directly from cgroup, but it is understandable that this
 * is better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting cheaper, and we are not going to need more
 * than 3000 cgroups, so 9 bits (or 512 iterations) is more than enough.
 */
303#define MEMCG_CACHES_MIN_SIZE 4
304#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
312DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
313EXPORT_SYMBOL(memcg_kmem_enabled_key);
314
315struct workqueue_struct *memcg_kmem_cache_wq;
316
317static int memcg_shrinker_map_size;
318static DEFINE_MUTEX(memcg_shrinker_map_mutex);
319
320static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
321{
322 kvfree(container_of(head, struct memcg_shrinker_map, rcu));
323}
324
325static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
326 int size, int old_size)
327{
328 struct memcg_shrinker_map *new, *old;
329 int nid;
330
331 lockdep_assert_held(&memcg_shrinker_map_mutex);
332
333 for_each_node(nid) {
334 old = rcu_dereference_protected(
335 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
338 return 0;
339
340 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
341 if (!new)
342 return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
346 memset((void *)new->map + old_size, 0, size - old_size);
347
348 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
349 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
350 }
351
352 return 0;
353}
354
355static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
356{
357 struct mem_cgroup_per_node *pn;
358 struct memcg_shrinker_map *map;
359 int nid;
360
361 if (mem_cgroup_is_root(memcg))
362 return;
363
364 for_each_node(nid) {
365 pn = mem_cgroup_nodeinfo(memcg, nid);
366 map = rcu_dereference_protected(pn->shrinker_map, true);
367 if (map)
368 kvfree(map);
369 rcu_assign_pointer(pn->shrinker_map, NULL);
370 }
371}
372
373static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
374{
375 struct memcg_shrinker_map *map;
376 int nid, size, ret = 0;
377
378 if (mem_cgroup_is_root(memcg))
379 return 0;
380
381 mutex_lock(&memcg_shrinker_map_mutex);
382 size = memcg_shrinker_map_size;
383 for_each_node(nid) {
384 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
385 if (!map) {
386 memcg_free_shrinker_maps(memcg);
387 ret = -ENOMEM;
388 break;
389 }
390 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
391 }
392 mutex_unlock(&memcg_shrinker_map_mutex);
393
394 return ret;
395}
396
397int memcg_expand_shrinker_maps(int new_id)
398{
399 int size, old_size, ret = 0;
400 struct mem_cgroup *memcg;
401
402 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
403 old_size = memcg_shrinker_map_size;
404 if (size <= old_size)
405 return 0;
406
407 mutex_lock(&memcg_shrinker_map_mutex);
408 if (!root_mem_cgroup)
409 goto unlock;
410
411 for_each_mem_cgroup(memcg) {
412 if (mem_cgroup_is_root(memcg))
413 continue;
414 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
415 if (ret)
416 goto unlock;
417 }
418unlock:
419 if (!ret)
420 memcg_shrinker_map_size = size;
421 mutex_unlock(&memcg_shrinker_map_mutex);
422 return ret;
423}
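
/*
 * Illustrative example (editorial addition, not in the original source):
 * registering a memcg-aware shrinker that gets id 200 ends up calling
 * memcg_expand_shrinker_maps(200).  On a 64-bit kernel (BITS_PER_LONG
 * == 64, sizeof(unsigned long) == 8) the per-node bitmaps grow to
 * DIV_ROUND_UP(201, 64) * 8 = 32 bytes, i.e. 256 bits, for every memcg.
 */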
424
425void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
426{
427 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
428 struct memcg_shrinker_map *map;
429
430 rcu_read_lock();
431 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
434 set_bit(shrinker_id, map->map);
435 rcu_read_unlock();
436 }
437}
438
439#else
440static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
441{
442 return 0;
443}
444static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
445#endif

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
458struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
459{
460 struct mem_cgroup *memcg;
461
462 memcg = page->mem_cgroup;
463
464 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
465 memcg = root_mem_cgroup;
466
467 return &memcg->css;
468}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged
 * to and return its inode number, or 0 if @page is not charged to any cgroup.
 * It is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a
 * moment after page_cgroup_ino() returns, so it should only be used by
 * callers that do not care.
 */
483ino_t page_cgroup_ino(struct page *page)
484{
485 struct mem_cgroup *memcg;
486 unsigned long ino = 0;
487
488 rcu_read_lock();
489 if (PageHead(page) && PageSlab(page))
490 memcg = memcg_from_slab_page(page);
491 else
492 memcg = READ_ONCE(page->mem_cgroup);
493 while (memcg && !(memcg->css.flags & CSS_ONLINE))
494 memcg = parent_mem_cgroup(memcg);
495 if (memcg)
496 ino = cgroup_ino(memcg->css.cgroup);
497 rcu_read_unlock();
498 return ino;
499}
500
501static struct mem_cgroup_per_node *
502mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
503{
504 int nid = page_to_nid(page);
505
506 return memcg->nodeinfo[nid];
507}
508
509static struct mem_cgroup_tree_per_node *
510soft_limit_tree_node(int nid)
511{
512 return soft_limit_tree.rb_tree_per_node[nid];
513}
514
515static struct mem_cgroup_tree_per_node *
516soft_limit_tree_from_page(struct page *page)
517{
518 int nid = page_to_nid(page);
519
520 return soft_limit_tree.rb_tree_per_node[nid];
521}
522
523static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
524 struct mem_cgroup_tree_per_node *mctz,
525 unsigned long new_usage_in_excess)
526{
527 struct rb_node **p = &mctz->rb_root.rb_node;
528 struct rb_node *parent = NULL;
529 struct mem_cgroup_per_node *mz_node;
530 bool rightmost = true;
531
532 if (mz->on_tree)
533 return;
534
535 mz->usage_in_excess = new_usage_in_excess;
536 if (!mz->usage_in_excess)
537 return;
538 while (*p) {
539 parent = *p;
540 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
541 tree_node);
542 if (mz->usage_in_excess < mz_node->usage_in_excess) {
543 p = &(*p)->rb_left;
544 rightmost = false;
545 }
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount.
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
552 p = &(*p)->rb_right;
553 }
554
555 if (rightmost)
556 mctz->rb_rightmost = &mz->tree_node;
557
558 rb_link_node(&mz->tree_node, parent, p);
559 rb_insert_color(&mz->tree_node, &mctz->rb_root);
560 mz->on_tree = true;
561}
562
563static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
564 struct mem_cgroup_tree_per_node *mctz)
565{
566 if (!mz->on_tree)
567 return;
568
569 if (&mz->tree_node == mctz->rb_rightmost)
570 mctz->rb_rightmost = rb_prev(&mz->tree_node);
571
572 rb_erase(&mz->tree_node, &mctz->rb_root);
573 mz->on_tree = false;
574}
575
576static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
577 struct mem_cgroup_tree_per_node *mctz)
578{
579 unsigned long flags;
580
581 spin_lock_irqsave(&mctz->lock, flags);
582 __mem_cgroup_remove_exceeded(mz, mctz);
583 spin_unlock_irqrestore(&mctz->lock, flags);
584}
585
586static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
587{
588 unsigned long nr_pages = page_counter_read(&memcg->memory);
589 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
590 unsigned long excess = 0;
591
592 if (nr_pages > soft_limit)
593 excess = nr_pages - soft_limit;
594
595 return excess;
596}
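
/*
 * Illustrative example (editorial addition, not in the original source):
 * with usage at 1536 pages and the soft limit set to 1024 pages,
 * soft_limit_excess() returns 512; at or below the soft limit it
 * returns 0.
 */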
597
598static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
599{
600 unsigned long excess;
601 struct mem_cgroup_per_node *mz;
602 struct mem_cgroup_tree_per_node *mctz;
603
604 mctz = soft_limit_tree_from_page(page);
605 if (!mctz)
606 return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counters are not touched.
	 */
611 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
612 mz = mem_cgroup_page_nodeinfo(memcg, page);
613 excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * the memcg is over its soft limit.
		 */
618 if (excess || mz->on_tree) {
619 unsigned long flags;
620
621 spin_lock_irqsave(&mctz->lock, flags);
			/* If on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
629 __mem_cgroup_insert_exceeded(mz, mctz, excess);
630 spin_unlock_irqrestore(&mctz->lock, flags);
631 }
632 }
633}
634
635static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
636{
637 struct mem_cgroup_tree_per_node *mctz;
638 struct mem_cgroup_per_node *mz;
639 int nid;
640
641 for_each_node(nid) {
642 mz = mem_cgroup_nodeinfo(memcg, nid);
643 mctz = soft_limit_tree_node(nid);
644 if (mctz)
645 mem_cgroup_remove_exceeded(mz, mctz);
646 }
647}
648
649static struct mem_cgroup_per_node *
650__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
651{
652 struct mem_cgroup_per_node *mz;
653
654retry:
655 mz = NULL;
656 if (!mctz->rb_rightmost)
657 goto done;
658
659 mz = rb_entry(mctz->rb_rightmost,
660 struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now, but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
666 __mem_cgroup_remove_exceeded(mz, mctz);
667 if (!soft_limit_excess(mz->memcg) ||
668 !css_tryget_online(&mz->memcg->css))
669 goto retry;
670done:
671 return mz;
672}
673
674static struct mem_cgroup_per_node *
675mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
676{
677 struct mem_cgroup_per_node *mz;
678
679 spin_lock_irq(&mctz->lock);
680 mz = __mem_cgroup_largest_soft_limit_node(mctz);
681 spin_unlock_irq(&mctz->lock);
682 return mz;
683}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
691void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
692{
693 long x;
694
695 if (mem_cgroup_disabled())
696 return;
697
698 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
699 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
700 struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
706 __this_cpu_add(memcg->vmstats_local->stat[idx], x);
707 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
708 atomic_long_add(x, &mi->vmstats[idx]);
709 x = 0;
710 }
711 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
712}
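
/*
 * Illustrative example (editorial addition, not in the original source),
 * assuming MEMCG_CHARGE_BATCH is 32: thirty-two __mod_memcg_state(memcg,
 * idx, 1) calls on one CPU only touch the per-cpu counter; the
 * thirty-third sees x = 33 > 32 and flushes the whole delta into the
 * atomic counters of memcg and all of its ancestors, then resets the
 * per-cpu counter to 0.
 */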
713
714static struct mem_cgroup_per_node *
715parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
716{
717 struct mem_cgroup *parent;
718
719 parent = parent_mem_cgroup(pn->memcg);
720 if (!parent)
721 return NULL;
722 return mem_cgroup_nodeinfo(parent, nid);
723}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a change of
 * state at this level: per-node, per-cgroup, per-lruvec.
 */
735void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
736 int val)
737{
738 pg_data_t *pgdat = lruvec_pgdat(lruvec);
739 struct mem_cgroup_per_node *pn;
740 struct mem_cgroup *memcg;
741 long x;

	/* Update node */
	__mod_node_page_state(pgdat, idx, val);
745
746 if (mem_cgroup_disabled())
747 return;
748
749 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
750 memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
757
758 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
759 if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
760 struct mem_cgroup_per_node *pi;
761
762 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
763 atomic_long_add(x, &pi->lruvec_stat[idx]);
764 x = 0;
765 }
766 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
767}
768
769void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
770{
771 struct page *page = virt_to_head_page(p);
772 pg_data_t *pgdat = page_pgdat(page);
773 struct mem_cgroup *memcg;
774 struct lruvec *lruvec;
775
776 rcu_read_lock();
777 memcg = memcg_from_slab_page(page);

	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg || memcg == root_mem_cgroup) {
781 __mod_node_page_state(pgdat, idx, val);
782 } else {
783 lruvec = mem_cgroup_lruvec(pgdat, memcg);
784 __mod_lruvec_state(lruvec, idx, val);
785 }
786 rcu_read_unlock();
787}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
795void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
796 unsigned long count)
797{
798 unsigned long x;
799
800 if (mem_cgroup_disabled())
801 return;
802
803 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
804 if (unlikely(x > MEMCG_CHARGE_BATCH)) {
805 struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
811 __this_cpu_add(memcg->vmstats_local->events[idx], x);
812 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
813 atomic_long_add(x, &mi->vmevents[idx]);
814 x = 0;
815 }
816 __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
817}
818
819static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
820{
821 return atomic_long_read(&memcg->vmevents[event]);
822}
823
824static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
825{
826 long x = 0;
827 int cpu;
828
829 for_each_possible_cpu(cpu)
830 x += per_cpu(memcg->vmstats_local->events[event], cpu);
831 return x;
832}
833
834static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
835 struct page *page,
836 bool compound, int nr_pages)
837{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on the ANON LRU.
	 */
	if (PageAnon(page))
843 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
844 else {
845 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
846 if (PageSwapBacked(page))
847 __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
848 }
849
850 if (compound) {
851 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
852 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
853 }

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
857 __count_memcg_events(memcg, PGPGIN, 1);
858 else {
859 __count_memcg_events(memcg, PGPGOUT, 1);
860 nr_pages = -nr_pages;
861 }
862
863 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
864}
865
866static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
867 enum mem_cgroup_events_target target)
868{
869 unsigned long val, next;
870
871 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
872 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
873
874 if ((long)(next - val) < 0) {
875 switch (target) {
876 case MEM_CGROUP_TARGET_THRESH:
877 next = val + THRESHOLDS_EVENTS_TARGET;
878 break;
879 case MEM_CGROUP_TARGET_SOFTLIMIT:
880 next = val + SOFTLIMIT_EVENTS_TARGET;
881 break;
882 case MEM_CGROUP_TARGET_NUMAINFO:
883 next = val + NUMAINFO_EVENTS_TARGET;
884 break;
885 default:
886 break;
887 }
888 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
889 return true;
890 }
891 return false;
892}
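
/*
 * Illustrative example (editorial addition, not in the original source):
 * each target is armed a fixed number of page events ahead, e.g. 128
 * for MEM_CGROUP_TARGET_THRESH.  Once this CPU's nr_page_events passes
 * the target, one call returns true and re-arms the target 128 events
 * further out, so thresholds are only evaluated about once per 128
 * page events per CPU.
 */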

/*
 * Check events in order.
 */
898static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
899{
	/* threshold event is triggered in finer grain than soft limit */
901 if (unlikely(mem_cgroup_event_ratelimit(memcg,
902 MEM_CGROUP_TARGET_THRESH))) {
903 bool do_softlimit;
904 bool do_numainfo __maybe_unused;
905
906 do_softlimit = mem_cgroup_event_ratelimit(memcg,
907 MEM_CGROUP_TARGET_SOFTLIMIT);
908#if MAX_NUMNODES > 1
909 do_numainfo = mem_cgroup_event_ratelimit(memcg,
910 MEM_CGROUP_TARGET_NUMAINFO);
911#endif
912 mem_cgroup_threshold(memcg);
913 if (unlikely(do_softlimit))
914 mem_cgroup_update_tree(memcg, page);
915#if MAX_NUMNODES > 1
916 if (unlikely(do_numainfo))
917 atomic_inc(&memcg->numainfo_events);
918#endif
919 }
920}
921
922struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
923{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
929 if (unlikely(!p))
930 return NULL;
931
932 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
933}
934EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on a given mm_struct's memcg.
 * @mm: mm from which the memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However, if mem_cgroup is disabled, NULL is
 * returned.
 */
944struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
945{
946 struct mem_cgroup *memcg;
947
948 if (mem_cgroup_disabled())
949 return NULL;
950
951 rcu_read_lock();
952 do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
958 if (unlikely(!mm))
959 memcg = root_mem_cgroup;
960 else {
961 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
962 if (unlikely(!memcg))
963 memcg = root_mem_cgroup;
964 }
965 } while (!css_tryget_online(&memcg->css));
966 rcu_read_unlock();
967 return memcg;
968}
969EXPORT_SYMBOL(get_mem_cgroup_from_mm);
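
/*
 * Illustrative sketch (editorial addition, not in the original source):
 * callers own the returned reference and must drop it when done, as
 * mem_cgroup_handle_over_high() below does:
 *
 *	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(current->mm);
 *
 *	if (memcg) {
 *		... use memcg ...
 *		css_put(&memcg->css);
 *	}
 */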

/**
 * get_mem_cgroup_from_page: Obtain a reference on a given page's memcg.
 * @page: page from which the memcg should be extracted.
 *
 * Obtain a reference on page->mem_cgroup and return it if successful.
 * Otherwise root_mem_cgroup is returned.
 */
978struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
979{
980 struct mem_cgroup *memcg = page->mem_cgroup;
981
982 if (mem_cgroup_disabled())
983 return NULL;
984
985 rcu_read_lock();
986 if (!memcg || !css_tryget_online(&memcg->css))
987 memcg = root_mem_cgroup;
988 rcu_read_unlock();
989 return memcg;
990}
991EXPORT_SYMBOL(get_mem_cgroup_from_page);

/**
 * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
 */
996static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
997{
998 if (unlikely(current->active_memcg)) {
999 struct mem_cgroup *memcg = root_mem_cgroup;
1000
1001 rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
1003 memcg = current->active_memcg;
1004 rcu_read_unlock();
1005 return memcg;
1006 }
1007 return get_mem_cgroup_from_mm(current->mm);
1008}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
1027struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1028 struct mem_cgroup *prev,
1029 struct mem_cgroup_reclaim_cookie *reclaim)
1030{
1031 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1032 struct cgroup_subsys_state *css = NULL;
1033 struct mem_cgroup *memcg = NULL;
1034 struct mem_cgroup *pos = NULL;
1035
1036 if (mem_cgroup_disabled())
1037 return NULL;
1038
1039 if (!root)
1040 root = root_mem_cgroup;
1041
1042 if (prev && !reclaim)
1043 pos = prev;
1044
1045 if (!root->use_hierarchy && root != root_mem_cgroup) {
1046 if (prev)
1047 goto out;
1048 return root;
1049 }
1050
1051 rcu_read_lock();
1052
1053 if (reclaim) {
1054 struct mem_cgroup_per_node *mz;
1055
1056 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1057 iter = &mz->iter[reclaim->priority];
1058
1059 if (prev && reclaim->generation != iter->generation)
1060 goto out_unlock;
1061
1062 while (1) {
1063 pos = READ_ONCE(iter->position);
1064 if (!pos || css_tryget(&pos->css))
1065 break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
1074 (void)cmpxchg(&iter->position, pos, NULL);
1075 }
1076 }
1077
1078 if (pos)
1079 css = &pos->css;
1080
1081 for (;;) {
1082 css = css_next_descendant_pre(css, &root->css);
1083 if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
1090 if (!prev)
1091 continue;
1092 break;
1093 }

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
1100 memcg = mem_cgroup_from_css(css);
1101
1102 if (css == &root->css)
1103 break;
1104
1105 if (css_tryget(css))
1106 break;
1107
1108 memcg = NULL;
1109 }
1110
1111 if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
1117 (void)cmpxchg(&iter->position, pos, memcg);
1118
1119 if (pos)
1120 css_put(&pos->css);
1121
1122 if (!memcg)
1123 iter->generation++;
1124 else if (!prev)
1125 reclaim->generation = iter->generation;
1126 }
1127
1128out_unlock:
1129 rcu_read_unlock();
1130out:
1131 if (prev && prev != root)
1132 css_put(&prev->css);
1133
1134 return memcg;
1135}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
1142void mem_cgroup_iter_break(struct mem_cgroup *root,
1143 struct mem_cgroup *prev)
1144{
1145 if (!root)
1146 root = root_mem_cgroup;
1147 if (prev && prev != root)
1148 css_put(&prev->css);
1149}
1150
1151static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1152 struct mem_cgroup *dead_memcg)
1153{
1154 struct mem_cgroup_reclaim_iter *iter;
1155 struct mem_cgroup_per_node *mz;
1156 int nid;
1157 int i;
1158
1159 for_each_node(nid) {
1160 mz = mem_cgroup_nodeinfo(from, nid);
1161 for (i = 0; i <= DEF_PRIORITY; i++) {
1162 iter = &mz->iter[i];
1163 cmpxchg(&iter->position,
1164 dead_memcg, NULL);
1165 }
1166 }
1167}
1168
1169static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1170{
1171 struct mem_cgroup *memcg = dead_memcg;
1172 struct mem_cgroup *last;
1173
1174 do {
1175 __invalidate_reclaim_iterators(memcg, dead_memcg);
1176 last = memcg;
1177 } while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from the cgroup root separately.
	 */
1185 if (last != root_mem_cgroup)
1186 __invalidate_reclaim_iterators(root_mem_cgroup,
1187 dead_memcg);
1188}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
1203int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1204 int (*fn)(struct task_struct *, void *), void *arg)
1205{
1206 struct mem_cgroup *iter;
1207 int ret = 0;
1208
1209 BUG_ON(memcg == root_mem_cgroup);
1210
1211 for_each_mem_cgroup_tree(iter, memcg) {
1212 struct css_task_iter it;
1213 struct task_struct *task;
1214
1215 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1216 while (!ret && (task = css_task_iter_next(&it)))
1217 ret = fn(task, arg);
1218 css_task_iter_end(&it);
1219 if (ret) {
1220 mem_cgroup_iter_break(memcg, iter);
1221 break;
1222 }
1223 }
1224 return ret;
1225}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
1236struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1237{
1238 struct mem_cgroup_per_node *mz;
1239 struct mem_cgroup *memcg;
1240 struct lruvec *lruvec;
1241
1242 if (mem_cgroup_disabled()) {
1243 lruvec = &pgdat->lruvec;
1244 goto out;
1245 }
1246
1247 memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
1252 if (!memcg)
1253 memcg = root_mem_cgroup;
1254
1255 mz = mem_cgroup_page_nodeinfo(memcg, page);
1256 lruvec = &mz->lruvec;
1257out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
1263 if (unlikely(lruvec->pgdat != pgdat))
1264 lruvec->pgdat = pgdat;
1265 return lruvec;
1266}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
1279void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1280 int zid, int nr_pages)
1281{
1282 struct mem_cgroup_per_node *mz;
1283 unsigned long *lru_size;
1284 long size;
1285
1286 if (mem_cgroup_disabled())
1287 return;
1288
1289 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1290 lru_size = &mz->lru_zone_size[zid][lru];
1291
1292 if (nr_pages < 0)
1293 *lru_size += nr_pages;
1294
1295 size = *lru_size;
1296 if (WARN_ONCE(size < 0,
1297 "%s(%p, %d, %d): lru_size %ld\n",
1298 __func__, lruvec, lru, nr_pages, size)) {
1299 VM_BUG_ON(1);
1300 *lru_size = 0;
1301 }
1302
1303 if (nr_pages > 0)
1304 *lru_size += nr_pages;
1305}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
1314static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1315{
1316 unsigned long margin = 0;
1317 unsigned long count;
1318 unsigned long limit;
1319
1320 count = page_counter_read(&memcg->memory);
1321 limit = READ_ONCE(memcg->memory.max);
1322 if (count < limit)
1323 margin = limit - count;
1324
1325 if (do_memsw_account()) {
1326 count = page_counter_read(&memcg->memsw);
1327 limit = READ_ONCE(memcg->memsw.max);
1328 if (count <= limit)
1329 margin = min(margin, limit - count);
1330 else
1331 margin = 0;
1332 }
1333
1334 return margin;
1335}
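
/*
 * Illustrative example (editorial addition, not in the original source):
 * with memory.max at 1000 pages and usage at 900, the memory margin is
 * 100; if legacy memory+swap accounting is active with memsw.max at
 * 1100 and memsw usage at 1050, the memsw margin is only 50, so
 * mem_cgroup_margin() returns 50 - the tighter limit governs how much
 * can still be charged.
 */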

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checking whether a cgroup is mc.from, mc.to, or under the hierarchy of
 * a moving cgroup. This is for waiting at high memory pressure
 * caused by "move".
 */
1344static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1345{
1346 struct mem_cgroup *from;
1347 struct mem_cgroup *to;
1348 bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take the spinlock
	 * instead.
	 */
1353 spin_lock(&mc.lock);
1354 from = mc.from;
1355 to = mc.to;
1356 if (!from)
1357 goto unlock;
1358
1359 ret = mem_cgroup_is_descendant(from, memcg) ||
1360 mem_cgroup_is_descendant(to, memcg);
1361unlock:
1362 spin_unlock(&mc.lock);
1363 return ret;
1364}
1365
1366static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1367{
1368 if (mc.moving_task && current != mc.moving_task) {
1369 if (mem_cgroup_under_move(memcg)) {
1370 DEFINE_WAIT(wait);
1371 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1372
1373 if (mc.moving_task)
1374 schedule();
1375 finish_wait(&mc.waitq, &wait);
1376 return true;
1377 }
1378 }
1379 return false;
1380}
1381
1382static char *memory_stat_format(struct mem_cgroup *memcg)
1383{
1384 struct seq_buf s;
1385 int i;
1386
1387 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1388 if (!s.buffer)
1389 return NULL;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
1402 seq_buf_printf(&s, "anon %llu\n",
1403 (u64)memcg_page_state(memcg, MEMCG_RSS) *
1404 PAGE_SIZE);
1405 seq_buf_printf(&s, "file %llu\n",
1406 (u64)memcg_page_state(memcg, MEMCG_CACHE) *
1407 PAGE_SIZE);
1408 seq_buf_printf(&s, "kernel_stack %llu\n",
1409 (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1410 1024);
1411 seq_buf_printf(&s, "slab %llu\n",
1412 (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
1413 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
1414 PAGE_SIZE);
1415 seq_buf_printf(&s, "sock %llu\n",
1416 (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1417 PAGE_SIZE);
1418
1419 seq_buf_printf(&s, "shmem %llu\n",
1420 (u64)memcg_page_state(memcg, NR_SHMEM) *
1421 PAGE_SIZE);
1422 seq_buf_printf(&s, "file_mapped %llu\n",
1423 (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1424 PAGE_SIZE);
1425 seq_buf_printf(&s, "file_dirty %llu\n",
1426 (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1427 PAGE_SIZE);
1428 seq_buf_printf(&s, "file_writeback %llu\n",
1429 (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1430 PAGE_SIZE);

	/*
	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
	 * with the NR_ANON_THP vm counter, but right now it's a pain in the
	 * arse because it requires migrating the work out of rmap to a place
	 * where the page->mem_cgroup is set up and stable.
	 */
1438 seq_buf_printf(&s, "anon_thp %llu\n",
1439 (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
1440 PAGE_SIZE);
1441
1442 for (i = 0; i < NR_LRU_LISTS; i++)
1443 seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
1444 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1445 PAGE_SIZE);
1446
1447 seq_buf_printf(&s, "slab_reclaimable %llu\n",
1448 (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
1449 PAGE_SIZE);
1450 seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1451 (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
1452 PAGE_SIZE);

	/* Accumulated memory events */

1456 seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
1457 seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
1458
1459 seq_buf_printf(&s, "workingset_refault %lu\n",
1460 memcg_page_state(memcg, WORKINGSET_REFAULT));
1461 seq_buf_printf(&s, "workingset_activate %lu\n",
1462 memcg_page_state(memcg, WORKINGSET_ACTIVATE));
1463 seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1464 memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1465
1466 seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
1467 seq_buf_printf(&s, "pgscan %lu\n",
1468 memcg_events(memcg, PGSCAN_KSWAPD) +
1469 memcg_events(memcg, PGSCAN_DIRECT));
1470 seq_buf_printf(&s, "pgsteal %lu\n",
1471 memcg_events(memcg, PGSTEAL_KSWAPD) +
1472 memcg_events(memcg, PGSTEAL_DIRECT));
1473 seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
1474 seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
1475 seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
1476 seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
1477
1478#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1479 seq_buf_printf(&s, "thp_fault_alloc %lu\n",
1480 memcg_events(memcg, THP_FAULT_ALLOC));
1481 seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
1482 memcg_events(memcg, THP_COLLAPSE_ALLOC));
1483#endif

	/* The above should easily fit into one page */
1486 WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1487
1488 return s.buffer;
1489}
1490
1491#define K(x) ((x) << (PAGE_SHIFT-10))

/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * the memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled.
 */
1501void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1502{
1503 rcu_read_lock();
1504
1505 if (memcg) {
1506 pr_cont(",oom_memcg=");
1507 pr_cont_cgroup_path(memcg->css.cgroup);
1508 } else
1509 pr_cont(",global_oom");
1510 if (p) {
1511 pr_cont(",task_memcg=");
1512 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1513 }
1514 rcu_read_unlock();
1515}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * the memory controller.
 * @memcg: The memory cgroup that went over limit
 */
1522void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1523{
1524 char *buf;
1525
1526 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1527 K((u64)page_counter_read(&memcg->memory)),
1528 K((u64)memcg->memory.max), memcg->memory.failcnt);
1529 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1530 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1531 K((u64)page_counter_read(&memcg->swap)),
1532 K((u64)memcg->swap.max), memcg->swap.failcnt);
1533 else {
1534 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1535 K((u64)page_counter_read(&memcg->memsw)),
1536 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1537 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1538 K((u64)page_counter_read(&memcg->kmem)),
1539 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1540 }
1541
1542 pr_info("Memory cgroup stats for ");
1543 pr_cont_cgroup_path(memcg->css.cgroup);
1544 pr_cont(":");
1545 buf = memory_stat_format(memcg);
1546 if (!buf)
1547 return;
1548 pr_info("%s", buf);
1549 kfree(buf);
1550}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
1555unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1556{
1557 unsigned long max;
1558
1559 max = memcg->memory.max;
1560 if (mem_cgroup_swappiness(memcg)) {
1561 unsigned long memsw_max;
1562 unsigned long swap_max;
1563
1564 memsw_max = memcg->memsw.max;
1565 swap_max = memcg->swap.max;
1566 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1567 max = min(max + swap_max, memsw_max);
1568 }
1569 return max;
1570}
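
/*
 * Illustrative example (editorial addition, not in the original source),
 * assuming a nonzero swappiness: with memory.max at 800 pages, swap.max
 * at 300, total_swap_pages at 200 and memsw.max at 1000, the estimate
 * is min(800 + min(300, 200), 1000) = 1000 pages.
 */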
1571
1572static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1573 int order)
1574{
1575 struct oom_control oc = {
1576 .zonelist = NULL,
1577 .nodemask = NULL,
1578 .memcg = memcg,
1579 .gfp_mask = gfp_mask,
1580 .order = order,
1581 };
1582 bool ret;
1583
1584 if (mutex_lock_killable(&oom_lock))
1585 return true;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
1590 ret = should_force_charge() || out_of_memory(&oc);
1591 mutex_unlock(&oom_lock);
1592 return ret;
1593}
1594
1595#if MAX_NUMNODES > 1

/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file-only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
1607static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1608 int nid, bool noswap)
1609{
1610 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
1611
1612 if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
1613 lruvec_page_state(lruvec, NR_ACTIVE_FILE))
1614 return true;
1615 if (noswap || !total_swap_pages)
1616 return false;
1617 if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
1618 lruvec_page_state(lruvec, NR_ACTIVE_ANON))
1619 return true;
1620 return false;
1621
1622}

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely once per 10 secs.
 */
1630static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1631{
1632 int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
1637 if (!atomic_read(&memcg->numainfo_events))
1638 return;
1639 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1640 return;

	/* make a nodemask where this memcg uses memory from */
1643 memcg->scan_nodes = node_states[N_MEMORY];
1644
1645 for_each_node_mask(nid, node_states[N_MEMORY]) {
1646
1647 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1648 node_clear(nid, memcg->scan_nodes);
1649 }
1650
1651 atomic_set(&memcg->numainfo_events, 0);
1652 atomic_set(&memcg->numainfo_updating, 0);
1653}

/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is O.K. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node
 * which we'll use or we've used. So, it may make the LRU bad. And if several
 * threads hit limits, it will see contention on a node. But freeing from a
 * remote node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. Set it better.
 */
1667int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1668{
1669 int node;
1670
1671 mem_cgroup_may_update_nodemask(memcg);
1672 node = memcg->last_scanned_node;
1673
1674 node = next_node_in(node, memcg->scan_nodes);
	/*
	 * mem_cgroup_may_update_nodemask() might have seen no reclaimable
	 * pages in the nodemask, or the memcg may have no nodes at all; in
	 * that case next_node_in() returns MAX_NUMNODES, so fall back to
	 * the current node.
	 */
1680 if (unlikely(node == MAX_NUMNODES))
1681 node = numa_node_id();
1682
1683 memcg->last_scanned_node = node;
1684 return node;
1685}
1686#else
1687int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1688{
1689 return 0;
1690}
1691#endif
1692
1693static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1694 pg_data_t *pgdat,
1695 gfp_t gfp_mask,
1696 unsigned long *total_scanned)
1697{
1698 struct mem_cgroup *victim = NULL;
1699 int total = 0;
1700 int loop = 0;
1701 unsigned long excess;
1702 unsigned long nr_scanned;
1703 struct mem_cgroup_reclaim_cookie reclaim = {
1704 .pgdat = pgdat,
1705 .priority = 0,
1706 };
1707
1708 excess = soft_limit_excess(root_memcg);
1709
1710 while (1) {
1711 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1712 if (!victim) {
1713 loop++;
1714 if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
1720 if (!total)
1721 break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so as
				 * not to reclaim too much, nor too little
				 * that we keep coming back to reclaim from
				 * this cgroup.
				 */
1728 if (total >= (excess >> 2) ||
1729 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1730 break;
1731 }
1732 continue;
1733 }
1734 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1735 pgdat, &nr_scanned);
1736 *total_scanned += nr_scanned;
1737 if (!soft_limit_excess(root_memcg))
1738 break;
1739 }
1740 mem_cgroup_iter_break(root_memcg, victim);
1741 return total;
1742}
1743
1744#ifdef CONFIG_LOCKDEP
1745static struct lockdep_map memcg_oom_lock_dep_map = {
1746 .name = "memcg_oom_lock",
1747};
1748#endif
1749
1750static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
1756static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1757{
1758 struct mem_cgroup *iter, *failed = NULL;
1759
1760 spin_lock(&memcg_oom_lock);
1761
1762 for_each_mem_cgroup_tree(iter, memcg) {
1763 if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot give a lock.
			 */
1768 failed = iter;
1769 mem_cgroup_iter_break(memcg, iter);
1770 break;
1771 } else
1772 iter->oom_lock = true;
1773 }
1774
1775 if (failed) {
		/*
		 * OK, we failed to lock the whole subtree, so we have
		 * to clean up what we set up in the failing subtree.
		 */
1780 for_each_mem_cgroup_tree(iter, memcg) {
1781 if (iter == failed) {
1782 mem_cgroup_iter_break(memcg, iter);
1783 break;
1784 }
1785 iter->oom_lock = false;
1786 }
1787 } else
1788 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1789
1790 spin_unlock(&memcg_oom_lock);
1791
1792 return !failed;
1793}
1794
1795static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1796{
1797 struct mem_cgroup *iter;
1798
1799 spin_lock(&memcg_oom_lock);
1800 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1801 for_each_mem_cgroup_tree(iter, memcg)
1802 iter->oom_lock = false;
1803 spin_unlock(&memcg_oom_lock);
1804}
1805
1806static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1807{
1808 struct mem_cgroup *iter;
1809
1810 spin_lock(&memcg_oom_lock);
1811 for_each_mem_cgroup_tree(iter, memcg)
1812 iter->under_oom++;
1813 spin_unlock(&memcg_oom_lock);
1814}
1815
1816static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1817{
1818 struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows, because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom().
	 */
1824 spin_lock(&memcg_oom_lock);
1825 for_each_mem_cgroup_tree(iter, memcg)
1826 if (iter->under_oom > 0)
1827 iter->under_oom--;
1828 spin_unlock(&memcg_oom_lock);
1829}
1830
1831static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1832
1833struct oom_wait_info {
1834 struct mem_cgroup *memcg;
1835 wait_queue_entry_t wait;
1836};
1837
1838static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1839 unsigned mode, int sync, void *arg)
1840{
1841 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1842 struct mem_cgroup *oom_wait_memcg;
1843 struct oom_wait_info *oom_wait_info;
1844
1845 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1846 oom_wait_memcg = oom_wait_info->memcg;
1847
1848 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1849 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1850 return 0;
1851 return autoremove_wake_function(wait, mode, sync, arg);
1852}
1853
1854static void memcg_oom_recover(struct mem_cgroup *memcg)
1855{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM.  This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
1864 if (memcg && memcg->under_oom)
1865 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1866}
1867
1868enum oom_status {
1869 OOM_SUCCESS,
1870 OOM_FAILED,
1871 OOM_ASYNC,
1872 OOM_SKIPPED
1873};
1874
1875static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1876{
1877 enum oom_status ret;
1878 bool locked;
1879
1880 if (order > PAGE_ALLOC_COSTLY_ORDER)
1881 return OOM_SKIPPED;
1882
1883 memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and
	 * put the task to sleep at the end of the page fault when all locks
	 * are released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async
	 * victim memory reclaim (oom_reaper), and that means that we are
	 * not solely relying on the OOM victim to make forward progress,
	 * so we can invoke the OOM killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
1903 if (memcg->oom_kill_disable) {
1904 if (!current->in_user_fault)
1905 return OOM_SKIPPED;
1906 css_get(&memcg->css);
1907 current->memcg_in_oom = memcg;
1908 current->memcg_oom_gfp_mask = mask;
1909 current->memcg_oom_order = order;
1910
1911 return OOM_ASYNC;
1912 }
1913
1914 mem_cgroup_mark_under_oom(memcg);
1915
1916 locked = mem_cgroup_oom_trylock(memcg);
1917
1918 if (locked)
1919 mem_cgroup_oom_notify(memcg);
1920
1921 mem_cgroup_unmark_under_oom(memcg);
1922 if (mem_cgroup_out_of_memory(memcg, mask, order))
1923 ret = OOM_SUCCESS;
1924 else
1925 ret = OOM_FAILED;
1926
1927 if (locked)
1928 mem_cgroup_oom_unlock(memcg);
1929
1930 return ret;
1931}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea; instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
1950bool mem_cgroup_oom_synchronize(bool handle)
1951{
1952 struct mem_cgroup *memcg = current->memcg_in_oom;
1953 struct oom_wait_info owait;
1954 bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
1958 return false;
1959
1960 if (!handle)
1961 goto cleanup;
1962
1963 owait.memcg = memcg;
1964 owait.wait.flags = 0;
1965 owait.wait.func = memcg_oom_wake_function;
1966 owait.wait.private = current;
1967 INIT_LIST_HEAD(&owait.wait.entry);
1968
1969 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1970 mem_cgroup_mark_under_oom(memcg);
1971
1972 locked = mem_cgroup_oom_trylock(memcg);
1973
1974 if (locked)
1975 mem_cgroup_oom_notify(memcg);
1976
1977 if (locked && !memcg->oom_kill_disable) {
1978 mem_cgroup_unmark_under_oom(memcg);
1979 finish_wait(&memcg_oom_waitq, &owait.wait);
1980 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1981 current->memcg_oom_order);
1982 } else {
1983 schedule();
1984 mem_cgroup_unmark_under_oom(memcg);
1985 finish_wait(&memcg_oom_waitq, &owait.wait);
1986 }
1987
1988 if (locked) {
1989 mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
		 */
1995 memcg_oom_recover(memcg);
1996 }
1997cleanup:
1998 current->memcg_in_oom = NULL;
1999 css_put(&memcg->css);
2000 return true;
2001}

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
2013struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2014 struct mem_cgroup *oom_domain)
2015{
2016 struct mem_cgroup *oom_group = NULL;
2017 struct mem_cgroup *memcg;
2018
2019 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2020 return NULL;
2021
2022 if (!oom_domain)
2023 oom_domain = root_mem_cgroup;
2024
2025 rcu_read_lock();
2026
2027 memcg = mem_cgroup_from_task(victim);
2028 if (memcg == root_mem_cgroup)
2029 goto out;

	/*
	 * Traverse the memory cgroup hierarchy from the victim task's
	 * cgroup up to the OOMing cgroup (or root) to find the
	 * highest-level memory cgroup with oom.group set.
	 */
2036 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2037 if (memcg->oom_group)
2038 oom_group = memcg;
2039
2040 if (memcg == oom_domain)
2041 break;
2042 }
2043
2044 if (oom_group)
2045 css_get(&oom_group->css);
2046out:
2047 rcu_read_unlock();
2048
2049 return oom_group;
2050}
2051
2052void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2053{
2054 pr_info("Tasks in ");
2055 pr_cont_cgroup_path(memcg->css.cgroup);
2056 pr_cont(" are going to be killed due to memory.oom.group set\n");
2057}

/**
 * lock_page_memcg - lock a page->mem_cgroup binding
 * @page: the page
 *
 * This function protects unlocked LRU pages from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the returned memcg. The caller is responsible
 * for the lifetime of the page; __unlock_page_memcg() is available
 * when @page might get freed inside the locked section.
 */
2070struct mem_cgroup *lock_page_memcg(struct page *page)
2071{
2072 struct mem_cgroup *memcg;
2073 unsigned long flags;
2074

	/*
	 * The RCU lock is held throughout the transaction.  The fast
	 * path can get away without acquiring the memcg->move_lock
	 * because page moving starts with an RCU grace period.
	 *
	 * The RCU lock also protects the memcg from being freed while
	 * the page state that is going to change is the only thing
	 * preventing the page itself from being freed. E.g. writeback
	 * doesn't hold a page reference and relies on PG_writeback to
	 * keep off truncation, migration and so forth.
	 */
2086 rcu_read_lock();
2087
2088 if (mem_cgroup_disabled())
2089 return NULL;
2090again:
2091 memcg = page->mem_cgroup;
2092 if (unlikely(!memcg))
2093 return NULL;
2094
2095 if (atomic_read(&memcg->moving_account) <= 0)
2096 return memcg;
2097
2098 spin_lock_irqsave(&memcg->move_lock, flags);
2099 if (memcg != page->mem_cgroup) {
2100 spin_unlock_irqrestore(&memcg->move_lock, flags);
2101 goto again;
2102 }

	/*
	 * When charge migration first begins, we can have locked and
	 * unlocked page stat updates happening concurrently.  Track
	 * the task who has the lock for unlock_page_memcg().
	 */
2109 memcg->move_lock_task = current;
2110 memcg->move_lock_flags = flags;
2111
2112 return memcg;
2113}
2114EXPORT_SYMBOL(lock_page_memcg);

/**
 * __unlock_page_memcg - unlock and unpin a memcg
 * @memcg: the memcg
 *
 * Unlock and unpin a memcg returned by lock_page_memcg().
 */
2122void __unlock_page_memcg(struct mem_cgroup *memcg)
2123{
2124 if (memcg && memcg->move_lock_task == current) {
2125 unsigned long flags = memcg->move_lock_flags;
2126
2127 memcg->move_lock_task = NULL;
2128 memcg->move_lock_flags = 0;
2129
2130 spin_unlock_irqrestore(&memcg->move_lock, flags);
2131 }
2132
2133 rcu_read_unlock();
2134}

/**
 * unlock_page_memcg - unlock a page->mem_cgroup binding
 * @page: the page
 */
2140void unlock_page_memcg(struct page *page)
2141{
2142 __unlock_page_memcg(page->mem_cgroup);
2143}
2144EXPORT_SYMBOL(unlock_page_memcg);
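
/*
 * Illustrative sketch (editorial addition, not in the original source):
 * the typical pattern for updating page state that is accounted to a
 * memcg.
 *
 *	struct mem_cgroup *memcg = lock_page_memcg(page);
 *
 *	... modify page state accounted via __mod_memcg_state() ...
 *
 *	unlock_page_memcg(page);
 */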
2145
2146struct memcg_stock_pcp {
2147 struct mem_cgroup *cached;
2148 unsigned int nr_pages;
2149 struct work_struct work;
2150 unsigned long flags;
2151#define FLUSHING_CACHED_CHARGE 0
2152};
2153static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2154static DEFINE_MUTEX(percpu_charge_mutex);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
2167static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2168{
2169 struct memcg_stock_pcp *stock;
2170 unsigned long flags;
2171 bool ret = false;
2172
2173 if (nr_pages > MEMCG_CHARGE_BATCH)
2174 return ret;
2175
2176 local_irq_save(flags);
2177
2178 stock = this_cpu_ptr(&memcg_stock);
2179 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2180 stock->nr_pages -= nr_pages;
2181 ret = true;
2182 }
2183
2184 local_irq_restore(flags);
2185
2186 return ret;
2187}

/*
 * Returns stocks cached in percpu and resets cached information.
 */
2192static void drain_stock(struct memcg_stock_pcp *stock)
2193{
2194 struct mem_cgroup *old = stock->cached;
2195
2196 if (stock->nr_pages) {
2197 page_counter_uncharge(&old->memory, stock->nr_pages);
2198 if (do_memsw_account())
2199 page_counter_uncharge(&old->memsw, stock->nr_pages);
2200 css_put_many(&old->css, stock->nr_pages);
2201 stock->nr_pages = 0;
2202 }
2203 stock->cached = NULL;
2204}
2205
2206static void drain_local_stock(struct work_struct *dummy)
2207{
2208 struct memcg_stock_pcp *stock;
2209 unsigned long flags;

	/*
	 * The only protection from memory hotplug vs. drain_stock races is
	 * that we always operate on local CPU stock here with IRQ disabled.
	 */
2215 local_irq_save(flags);
2216
2217 stock = this_cpu_ptr(&memcg_stock);
2218 drain_stock(stock);
2219 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2220
2221 local_irq_restore(flags);
2222}

/*
 * Cache charges(val) to the local per_cpu area.
 * This will be consumed by the consume_stock() function, later.
 */
2228static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2229{
2230 struct memcg_stock_pcp *stock;
2231 unsigned long flags;
2232
2233 local_irq_save(flags);
2234
2235 stock = this_cpu_ptr(&memcg_stock);
2236 if (stock->cached != memcg) {
2237 drain_stock(stock);
2238 stock->cached = memcg;
2239 }
2240 stock->nr_pages += nr_pages;
2241
2242 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2243 drain_stock(stock);
2244
2245 local_irq_restore(flags);
2246}
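
/*
 * Illustrative example (editorial addition, not in the original source),
 * assuming MEMCG_CHARGE_BATCH is 32: a single-page try_charge() charges
 * a whole batch of 32 pages to the page_counter and parks the 31 spare
 * pages here via refill_stock(); the next 31 single-page charges on
 * this CPU are then served from the stock by consume_stock() without
 * touching the shared counters.
 */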

/*
 * Drains all per-CPU charge caches for the given root_memcg resp. the
 * subtree of the hierarchy under it.
 */
2252static void drain_all_stock(struct mem_cgroup *root_memcg)
2253{
2254 int cpu, curcpu;
2255
	/* If someone's already draining, avoid adding more workers. */
2257 if (!mutex_trylock(&percpu_charge_mutex))
2258 return;
	/*
	 * Notify other cpus that system-wide "drain" is running.
	 * We do not care about races with the cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
	 */
2265 curcpu = get_cpu();
2266 for_each_online_cpu(cpu) {
2267 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2268 struct mem_cgroup *memcg;
2269
2270 memcg = stock->cached;
2271 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2272 continue;
2273 if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2274 css_put(&memcg->css);
2275 continue;
2276 }
2277 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2278 if (cpu == curcpu)
2279 drain_local_stock(&stock->work);
2280 else
2281 schedule_work_on(cpu, &stock->work);
2282 }
2283 css_put(&memcg->css);
2284 }
2285 put_cpu();
2286 mutex_unlock(&percpu_charge_mutex);
2287}
2288
2289static int memcg_hotplug_cpu_dead(unsigned int cpu)
2290{
2291 struct memcg_stock_pcp *stock;
2292 struct mem_cgroup *memcg, *mi;
2293
2294 stock = &per_cpu(memcg_stock, cpu);
2295 drain_stock(stock);
2296
2297 for_each_mem_cgroup(memcg) {
2298 int i;
2299
2300 for (i = 0; i < MEMCG_NR_STAT; i++) {
2301 int nid;
2302 long x;
2303
2304 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2305 if (x)
2306 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2307 atomic_long_add(x, &memcg->vmstats[i]);
2308
2309 if (i >= NR_VM_NODE_STAT_ITEMS)
2310 continue;
2311
2312 for_each_node(nid) {
2313 struct mem_cgroup_per_node *pn;
2314
2315 pn = mem_cgroup_nodeinfo(memcg, nid);
2316 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2317 if (x)
2318 do {
2319 atomic_long_add(x, &pn->lruvec_stat[i]);
2320 } while ((pn = parent_nodeinfo(pn, nid)));
2321 }
2322 }
2323
2324 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2325 long x;
2326
2327 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2328 if (x)
2329 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2330 atomic_long_add(x, &memcg->vmevents[i]);
2331 }
2332 }
2333
2334 return 0;
2335}
2336
2337static void reclaim_high(struct mem_cgroup *memcg,
2338 unsigned int nr_pages,
2339 gfp_t gfp_mask)
2340{
2341 do {
2342 if (page_counter_read(&memcg->memory) <= memcg->high)
2343 continue;
2344 memcg_memory_event(memcg, MEMCG_HIGH);
2345 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2346 } while ((memcg = parent_mem_cgroup(memcg)));
2347}
2348
2349static void high_work_func(struct work_struct *work)
2350{
2351 struct mem_cgroup *memcg;
2352
2353 memcg = container_of(work, struct mem_cgroup, high_work);
2354 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2355}

/*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 */
2361void mem_cgroup_handle_over_high(void)
2362{
2363 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2364 struct mem_cgroup *memcg;
2365
2366 if (likely(!nr_pages))
2367 return;
2368
2369 memcg = get_mem_cgroup_from_mm(current->mm);
2370 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2371 css_put(&memcg->css);
2372 current->memcg_nr_pages_over_high = 0;
2373}
2374
2375static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2376 unsigned int nr_pages)
2377{
2378 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2379 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2380 struct mem_cgroup *mem_over_limit;
2381 struct page_counter *counter;
2382 unsigned long nr_reclaimed;
2383 bool may_swap = true;
2384 bool drained = false;
2385 enum oom_status oom_status;
2386
2387 if (mem_cgroup_is_root(memcg))
2388 return 0;
2389retry:
2390 if (consume_stock(memcg, nr_pages))
2391 return 0;
2392
2393 if (!do_memsw_account() ||
2394 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2395 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2396 goto done_restock;
2397 if (do_memsw_account())
2398 page_counter_uncharge(&memcg->memsw, batch);
2399 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2400 } else {
2401 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2402 may_swap = false;
2403 }
2404
2405 if (batch > nr_pages) {
2406 batch = nr_pages;
2407 goto retry;
2408 }

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage.  Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
2416 if (unlikely(should_force_charge()))
2417 goto force;

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
	 */
2425 if (unlikely(current->flags & PF_MEMALLOC))
2426 goto force;
2427
2428 if (unlikely(task_in_memcg_oom(current)))
2429 goto nomem;
2430
2431 if (!gfpflags_allow_blocking(gfp_mask))
2432 goto nomem;
2433
2434 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2435
2436 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2437 gfp_mask, may_swap);
2438
2439 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2440 goto retry;
2441
2442 if (!drained) {
2443 drain_all_stock(mem_over_limit);
2444 drained = true;
2445 goto retry;
2446 }
2447
2448 if (gfp_mask & __GFP_NORETRY)
2449 goto nomem;

	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
2459 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2460 goto retry;

	/*
	 * At task move, charges can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
2465 if (mem_cgroup_wait_acct_move(mem_over_limit))
2466 goto retry;
2467
2468 if (nr_retries--)
2469 goto retry;
2470
2471 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2472 goto nomem;
2473
2474 if (gfp_mask & __GFP_NOFAIL)
2475 goto force;
2476
2477 if (fatal_signal_pending(current))
2478 goto force;

	/*
	 * Keep retrying as long as the memcg OOM killer is able to make
	 * forward progress, or bypass the charge if the OOM killer
	 * couldn't make any progress.
	 */
2485 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2486 get_order(nr_pages * PAGE_SIZE));
2487 switch (oom_status) {
2488 case OOM_SUCCESS:
2489 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2490 goto retry;
2491 case OOM_FAILED:
2492 goto force;
2493 default:
2494 goto nomem;
2495 }
2496nomem:
2497 if (!(gfp_mask & __GFP_NOFAIL))
2498 return -ENOMEM;
2499force:
	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon.  Allow memory usage to go over the limit
	 * temporarily by force charging it.
	 */
2505 page_counter_charge(&memcg->memory, nr_pages);
2506 if (do_memsw_account())
2507 page_counter_charge(&memcg->memsw, nr_pages);
2508 css_get_many(&memcg->css, nr_pages);
2509
2510 return 0;
2511
2512done_restock:
2513 css_get_many(&memcg->css, batch);
2514 if (batch > nr_pages)
2515 refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland.  We can perform reclaim here
	 * if __GFP_RECLAIM, but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime.  As the high limit is checked again before
	 * reclaim, the cost of mismatch is negligible.
	 */
2526 do {
2527 if (page_counter_read(&memcg->memory) > memcg->high) {
			/* Don't bother a random interrupted task */
2529 if (in_interrupt()) {
2530 schedule_work(&memcg->high_work);
2531 break;
2532 }
2533 current->memcg_nr_pages_over_high += batch;
2534 set_notify_resume(current);
2535 break;
2536 }
2537 } while ((memcg = parent_mem_cgroup(memcg)));
2538
2539 return 0;
2540}
2541
2542static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2543{
2544 if (mem_cgroup_is_root(memcg))
2545 return;
2546
2547 page_counter_uncharge(&memcg->memory, nr_pages);
2548 if (do_memsw_account())
2549 page_counter_uncharge(&memcg->memsw, nr_pages);
2550
2551 css_put_many(&memcg->css, nr_pages);
2552}
2553
2554static void lock_page_lru(struct page *page, int *isolated)
2555{
2556 pg_data_t *pgdat = page_pgdat(page);
2557
2558 spin_lock_irq(&pgdat->lru_lock);
2559 if (PageLRU(page)) {
2560 struct lruvec *lruvec;
2561
2562 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2563 ClearPageLRU(page);
2564 del_page_from_lru_list(page, lruvec, page_lru(page));
2565 *isolated = 1;
2566 } else
2567 *isolated = 0;
2568}
2569
2570static void unlock_page_lru(struct page *page, int isolated)
2571{
2572 pg_data_t *pgdat = page_pgdat(page);
2573
2574 if (isolated) {
2575 struct lruvec *lruvec;
2576
2577 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2578 VM_BUG_ON_PAGE(PageLRU(page), page);
2579 SetPageLRU(page);
2580 add_page_to_lru_list(page, lruvec, page_lru(page));
2581 }
2582 spin_unlock_irq(&pgdat->lru_lock);
2583}
2584
2585static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2586 bool lrucare)
2587{
2588 int isolated;
2589
2590 VM_BUG_ON_PAGE(page->mem_cgroup, page);

	/*
	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
	 * may already be on some other mem_cgroup's LRU.  Take care of it.
	 */
2596 if (lrucare)
2597 lock_page_lru(page, &isolated);

	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point:
	 *
	 * - the page is uncharged
	 *
	 * - the page is off-LRU
	 *
	 * - an anonymous fault has exclusive page access, except for
	 *   a locked page table
	 *
	 * - a page cache insertion, a swapin fault, or a migration
	 *   have the page locked
	 */
2613 page->mem_cgroup = memcg;
2614
2615 if (lrucare)
2616 unlock_page_lru(page, isolated);
2617}
2618
2619#ifdef CONFIG_MEMCG_KMEM
2620static int memcg_alloc_cache_id(void)
2621{
2622 int id, size;
2623 int err;
2624
2625 id = ida_simple_get(&memcg_cache_ida,
2626 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2627 if (id < 0)
2628 return id;
2629
2630 if (id < memcg_nr_cache_ids)
2631 return id;

	/*
	 * There's no space for the new id in the memcg_caches arrays,
	 * so we have to grow them.
	 */
2637 down_write(&memcg_cache_ids_sem);
2638
2639 size = 2 * (id + 1);
2640 if (size < MEMCG_CACHES_MIN_SIZE)
2641 size = MEMCG_CACHES_MIN_SIZE;
2642 else if (size > MEMCG_CACHES_MAX_SIZE)
2643 size = MEMCG_CACHES_MAX_SIZE;
2644
2645 err = memcg_update_all_caches(size);
2646 if (!err)
2647 err = memcg_update_all_list_lrus(size);
2648 if (!err)
2649 memcg_nr_cache_ids = size;
2650
2651 up_write(&memcg_cache_ids_sem);
2652
2653 if (err) {
2654 ida_simple_remove(&memcg_cache_ida, id);
2655 return err;
2656 }
2657 return id;
2658}
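
/*
 * Illustrative example (editorial addition, not in the original source):
 * the arrays are sized to 2 * (id + 1), clamped to the range
 * [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE].  Allocating id 0
 * therefore sizes them to MEMCG_CACHES_MIN_SIZE (4), and allocating
 * id 5 on a full array grows them to 12; ids below memcg_nr_cache_ids
 * never trigger a resize.
 */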
2659
2660static void memcg_free_cache_id(int id)
2661{
2662 ida_simple_remove(&memcg_cache_ida, id);
2663}
2664
2665struct memcg_kmem_cache_create_work {
2666 struct mem_cgroup *memcg;
2667 struct kmem_cache *cachep;
2668 struct work_struct work;
2669};
2670
2671static void memcg_kmem_cache_create_func(struct work_struct *w)
2672{
2673 struct memcg_kmem_cache_create_work *cw =
2674 container_of(w, struct memcg_kmem_cache_create_work, work);
2675 struct mem_cgroup *memcg = cw->memcg;
2676 struct kmem_cache *cachep = cw->cachep;
2677
2678 memcg_create_kmem_cache(memcg, cachep);
2679
2680 css_put(&memcg->css);
2681 kfree(cw);
2682}

/*
 * Enqueue the creation of a per-memcg kmem_cache.
 */
2687static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2688 struct kmem_cache *cachep)
2689{
2690 struct memcg_kmem_cache_create_work *cw;
2691
2692 if (!css_tryget_online(&memcg->css))
2693 return;
2694
2695 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2696 if (!cw)
2697 return;
2698
2699 cw->memcg = memcg;
2700 cw->cachep = cachep;
2701 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2702
2703 queue_work(memcg_kmem_cache_wq, &cw->work);
2704}
2705
2706static inline bool memcg_kmem_bypass(void)
2707{
2708 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2709 return true;
2710 return false;
2711}

/**
 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache
 *
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, and we are the first user of it, we
 * create it asynchronously in a workqueue and let the current allocation
 * go through with the original cache.
 *
 * This function takes a reference to the cache it returns to assure it
 * won't get destroyed while we are working with it. Once the caller is
 * done with it, memcg_kmem_put_cache() must be called to release the
 * reference.
 */
2729struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2730{
2731 struct mem_cgroup *memcg;
2732 struct kmem_cache *memcg_cachep;
2733 struct memcg_cache_array *arr;
2734 int kmemcg_id;
2735
2736 VM_BUG_ON(!is_root_cache(cachep));
2737
2738 if (memcg_kmem_bypass())
2739 return cachep;
2740
2741 rcu_read_lock();
2742
2743 if (unlikely(current->active_memcg))
2744 memcg = current->active_memcg;
2745 else
2746 memcg = mem_cgroup_from_task(current);
2747
2748 if (!memcg || memcg == root_mem_cgroup)
2749 goto out_unlock;
2750
2751 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2752 if (kmemcg_id < 0)
2753 goto out_unlock;
2754
2755 arr = rcu_dereference(cachep->memcg_params.memcg_caches);
2756
2757 /*
2758 * Make sure we will access the up-to-date value. The code updating
2759 * memcg_caches issues a write barrier to match this (see
2760 * memcg_create_kmem_cache()).
2761 */
2762 memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
2763
2764 /*
2765 * If we are in a safe context (can wait, and not in interrupt
2766 * context), we could be predictable and return right away.
2767 * This would guarantee that the allocation being performed
2768 * already belongs in the new cache.
2769 *
2770 * However, there are some clashes that can arrive from locking.
2771 * For instance, because we acquire the slab_mutex while doing
2772 * memcg_create_kmem_cache, this means no further allocation
2773 * could happen with the slab_mutex held. So it's better to
2774 * defer everything.
2775 *
2776 * If the memcg is dying or the cache is about to be released,
2777 * don't bother creating new kmem_caches. Because memcg_cachep
2778 * is zeroed as the first step of kmem offlining, we don't need
2779 * percpu_ref_tryget_live() here: the css_tryget_online() check
2780 * in memcg_schedule_kmem_cache_create() prevents a new cache
2781 * creation from being scheduled once the css is going away.
2782 */
2783 if (unlikely(!memcg_cachep))
2784 memcg_schedule_kmem_cache_create(memcg, cachep);
2785 else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
2786 cachep = memcg_cachep;
2787out_unlock:
2788 rcu_read_unlock();
2789 return cachep;
2790}
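
/*
 * Example pairing (illustrative sketch only, not part of this file):
 * a slab allocation path is expected to bracket the actual allocation
 * with get/put so the memcg cache cannot be destroyed mid-allocation:
 *
 *	s = memcg_kmem_get_cache(cachep);	// may return cachep itself
 *	obj = <allocate an object from s>;	// slab-internal fast path
 *	memcg_kmem_put_cache(s);		// drop the ref taken by get
 *
 * If the per-memcg cache is still being created, @s is the root cache
 * and the allocation is simply not accounted this time around.
 */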
2791
2792/**
2793 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2794 * @cachep: the cache returned by memcg_kmem_get_cache
2795 */
2796void memcg_kmem_put_cache(struct kmem_cache *cachep)
2797{
2798 if (!is_root_cache(cachep))
2799 percpu_ref_put(&cachep->memcg_params.refcnt);
2800}
2801
2802/**
2803 * __memcg_kmem_charge_memcg: charge a kmem page
2804 * @page: page to charge
2805 * @gfp: reclaim mode
2806 * @order: allocation order
2807 * @memcg: memory cgroup to charge
2808 *
2809 * Returns 0 on success, an error code on failure.
2810 */
2811int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2812 struct mem_cgroup *memcg)
2813{
2814 unsigned int nr_pages = 1 << order;
2815 struct page_counter *counter;
2816 int ret;
2817
2818 ret = try_charge(memcg, gfp, nr_pages);
2819 if (ret)
2820 return ret;
2821
2822 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2823 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2824 cancel_charge(memcg, nr_pages);
2825 return -ENOMEM;
2826 }
2827 return 0;
2828}
2829
2830/**
2831 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
2832 * @page: page to charge
2833 * @gfp: reclaim mode
2834 * @order: allocation order
2835 *
2836 * Returns 0 on success, an error code on failure.
2837 */
2838int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2839{
2840 struct mem_cgroup *memcg;
2841 int ret = 0;
2842
2843 if (memcg_kmem_bypass())
2844 return 0;
2845
2846 memcg = get_mem_cgroup_from_current();
2847 if (!mem_cgroup_is_root(memcg)) {
2848 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2849 if (!ret) {
2850 page->mem_cgroup = memcg;
2851 __SetPageKmemcg(page);
2852 }
2853 }
2854 css_put(&memcg->css);
2855 return ret;
2856}
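
/*
 * Illustrative call site (sketch, abridged from the page allocator):
 * __GFP_ACCOUNT allocations are charged roughly like this, backing the
 * allocation out if the charge fails:
 *
 *	page = alloc_pages(gfp, order);
 *	if (page && memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) &&
 *	    unlikely(__memcg_kmem_charge(page, gfp, order) != 0)) {
 *		__free_pages(page, order);
 *		page = NULL;
 *	}
 */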
2857
2858/**
2859 * __memcg_kmem_uncharge_memcg: uncharge a kmem page
2860 * @memcg: memcg to uncharge
2861 * @nr_pages: number of pages to uncharge
2862 */
2863void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
2864 unsigned int nr_pages)
2865{
2866 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2867 page_counter_uncharge(&memcg->kmem, nr_pages);
2868
2869 page_counter_uncharge(&memcg->memory, nr_pages);
2870 if (do_memsw_account())
2871 page_counter_uncharge(&memcg->memsw, nr_pages);
2872}
2873
2874/* __memcg_kmem_uncharge: uncharge a kmem page.
2875 * @page: page to uncharge
2876 * @order: allocation order
2877 */
2878void __memcg_kmem_uncharge(struct page *page, int order)
2879{
2880 struct mem_cgroup *memcg = page->mem_cgroup;
2881 unsigned int nr_pages = 1 << order;
2882
2883 if (!memcg)
2884 return;
2885
2886 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2887 __memcg_kmem_uncharge_memcg(memcg, nr_pages);
2888 page->mem_cgroup = NULL;
2889
2890 /* slab pages do not have PageKmemcg flag set */
2891 if (PageKmemcg(page))
2892 __ClearPageKmemcg(page);
2893
2894 css_put_many(&memcg->css, nr_pages);
2895}
2896#endif
2897
2898#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2899
2900/*
2901 * Because tail pages are not marked as "used", set it. We're under
2902 * pgdat->lru_lock, and migration entries are set up in all page mappings.
2903 */
2904void mem_cgroup_split_huge_fixup(struct page *head)
2905{
2906 int i;
2907
2908 if (mem_cgroup_disabled())
2909 return;
2910
2911 for (i = 1; i < HPAGE_PMD_NR; i++)
2912 head[i].mem_cgroup = head->mem_cgroup;
2913
2914 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2915}
2916#endif
2917
2918#ifdef CONFIG_MEMCG_SWAP
2919/**
2920 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2921 * @entry: swap entry to be moved
2922 * @from: mem_cgroup which the entry is moved from
2923 * @to: mem_cgroup which the entry is moved to
2924 *
2925 * It succeeds only when the swap_cgroup's record for this entry is the
2926 * same as the mem_cgroup's id of @from.
2927 *
2928 * Returns 0 on success, -EINVAL on failure.
2929 *
2930 * The caller must have charged to @to, IOW, called page_counter_charge()
2931 * about both res and memsw, and called css_put().
2932 */
2933static int mem_cgroup_move_swap_account(swp_entry_t entry,
2934 struct mem_cgroup *from, struct mem_cgroup *to)
2935{
2936 unsigned short old_id, new_id;
2937
2938 old_id = mem_cgroup_id(from);
2939 new_id = mem_cgroup_id(to);
2940
2941 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2942 mod_memcg_state(from, MEMCG_SWAP, -1);
2943 mod_memcg_state(to, MEMCG_SWAP, 1);
2944 return 0;
2945 }
2946 return -EINVAL;
2947}
2948#else
2949static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2950 struct mem_cgroup *from, struct mem_cgroup *to)
2951{
2952 return -EINVAL;
2953}
2954#endif
2955
2956static DEFINE_MUTEX(memcg_max_mutex);
2957
2958static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2959 unsigned long max, bool memsw)
2960{
2961 bool enlarge = false;
2962 bool drained = false;
2963 int ret;
2964 bool limits_invariant;
2965 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2966
2967 do {
2968 if (signal_pending(current)) {
2969 ret = -EINTR;
2970 break;
2971 }
2972
2973 mutex_lock(&memcg_max_mutex);
2974
2975 /* Make sure that the new limit (memsw or memory limit) doesn't
2976 * break our basic invariant rule memory.max <= memsw.max.
2977 */
2978 limits_invariant = memsw ? max >= memcg->memory.max :
2979 max <= memcg->memsw.max;
2980 if (!limits_invariant) {
2981 mutex_unlock(&memcg_max_mutex);
2982 ret = -EINVAL;
2983 break;
2984 }
2985 if (max > counter->max)
2986 enlarge = true;
2987 ret = page_counter_set_max(counter, max);
2988 mutex_unlock(&memcg_max_mutex);
2989
2990 if (!ret)
2991 break;
2992
2993 if (!drained) {
2994 drain_all_stock(memcg);
2995 drained = true;
2996 continue;
2997 }
2998
2999 if (!try_to_free_mem_cgroup_pages(memcg, 1,
3000 GFP_KERNEL, !memsw)) {
3001 ret = -EBUSY;
3002 break;
3003 }
3004 } while (true);
3005
3006 if (!ret && enlarge)
3007 memcg_oom_recover(memcg);
3008
3009 return ret;
3010}
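
/*
 * Userspace view (sketch; cgroup mount path hypothetical): shrinking the
 * limit below current usage makes the loop above drain stocks and reclaim;
 * if reclaim cannot get usage below the new value, the write fails:
 *
 *	int fd = open("/sys/fs/cgroup/memory/grp/memory.limit_in_bytes",
 *		      O_WRONLY);
 *	if (write(fd, "67108864", 8) < 0)	// 64M; may fail with EBUSY
 *		perror("limit_in_bytes");
 *	close(fd);
 */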
3011
3012unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3013 gfp_t gfp_mask,
3014 unsigned long *total_scanned)
3015{
3016 unsigned long nr_reclaimed = 0;
3017 struct mem_cgroup_per_node *mz, *next_mz = NULL;
3018 unsigned long reclaimed;
3019 int loop = 0;
3020 struct mem_cgroup_tree_per_node *mctz;
3021 unsigned long excess;
3022 unsigned long nr_scanned;
3023
3024 if (order > 0)
3025 return 0;
3026
3027 mctz = soft_limit_tree_node(pgdat->node_id);
3028
3029 /*
3030 * Do not even bother to check the largest node if the root
3031 * is empty. Do it lockless to prevent lock bouncing. Might
3032 * be expensive even if there is nothing to reclaim.
3033 */
3034 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3035 return 0;
3036
3037 /*
3038 * This loop can run for a while, especially if mem_cgroups
3039 * continuously keep exceeding their soft limit and putting
3040 * the system under pressure.
3041 */
3042 do {
3043 if (next_mz)
3044 mz = next_mz;
3045 else
3046 mz = mem_cgroup_largest_soft_limit_node(mctz);
3047 if (!mz)
3048 break;
3049
3050 nr_scanned = 0;
3051 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3052 gfp_mask, &nr_scanned);
3053 nr_reclaimed += reclaimed;
3054 *total_scanned += nr_scanned;
3055 spin_lock_irq(&mctz->lock);
3056 __mem_cgroup_remove_exceeded(mz, mctz);
3057
3058 /*
3059 * If we failed to reclaim anything from this memory cgroup
3060 * it is time to move on to the next cgroup.
3061 */
3062 next_mz = NULL;
3063 if (!reclaimed)
3064 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3065
3066 excess = soft_limit_excess(mz->memcg);
3067
3068 /*
3069 * One school of thought says that we should not add
3070 * back the node to the tree if reclaim returns 0.
3071 * But our reclaim could return 0, simply because due
3072 * to priority we are exposing a smaller subset of
3073 * memory to reclaim from. Consider this as a longer
3074 * term TODO.  If excess == 0, no tree ops are done.
3075 */
3076 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3077 spin_unlock_irq(&mctz->lock);
3078 css_put(&mz->memcg->css);
3079 loop++;
3080
3081 /*
3082 * Could not reclaim anything, and there are no more mem
3083 * cgroups to try or we seem to be looping without progress.
3084 */
3085 if (!nr_reclaimed &&
3086 (next_mz == NULL ||
3087 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3088 break;
3089 } while (!nr_reclaimed);
3090 if (next_mz)
3091 css_put(&next_mz->memcg->css);
3092 return nr_reclaimed;
3093}
3094
3095/*
3096 * Test whether @memcg has children, dead or alive.  Note that this
3097 * function doesn't care whether @memcg has use_hierarchy enabled and
3098 * returns %true if there are child csses according to the cgroup
3099 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
3100 */
3101static inline bool memcg_has_children(struct mem_cgroup *memcg)
3102{
3103 bool ret;
3104
3105 rcu_read_lock();
3106 ret = css_next_child(NULL, &memcg->css);
3107 rcu_read_unlock();
3108 return ret;
3109}
3110
3111/*
3112 * Reclaims as many pages from the given memcg as possible.
3113 *
3114 * Caller is responsible for holding css reference for memcg.
3115 */
3116static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3117{
3118 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3119
3120 /* we call try-to-free pages to make this cgroup empty */
3121 lru_add_drain_all();
3122
3123 drain_all_stock(memcg);
3124
3125 /* try to free all pages in this cgroup */
3126 while (nr_retries && page_counter_read(&memcg->memory)) {
3127 int progress;
3128
3129 if (signal_pending(current))
3130 return -EINTR;
3131
3132 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3133 GFP_KERNEL, true);
3134 if (!progress) {
3135 nr_retries--;
3136 /* maybe some writeback is necessary */
3137 congestion_wait(BLK_RW_ASYNC, HZ/10);
3138 }
3139
3140 }
3141
3142 return 0;
3143}
3144
3145static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3146 char *buf, size_t nbytes,
3147 loff_t off)
3148{
3149 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3150
3151 if (mem_cgroup_is_root(memcg))
3152 return -EINVAL;
3153 return mem_cgroup_force_empty(memcg) ?: nbytes;
3154}
3155
3156static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3157 struct cftype *cft)
3158{
3159 return mem_cgroup_from_css(css)->use_hierarchy;
3160}
3161
3162static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3163 struct cftype *cft, u64 val)
3164{
3165 int retval = 0;
3166 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3167 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3168
3169 if (memcg->use_hierarchy == val)
3170 return 0;
3171
3172 /*
3173 * If parent's use_hierarchy is set, we can't make any modifications
3174 * in the child subtrees. If it is unset, then the change can
3175 * occur, provided the current cgroup has no children.
3176 *
3177 * For the root cgroup, parent_mem is NULL, we allow value to be
3178 * set if there are no children.
3179 */
3180 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3181 (val == 1 || val == 0)) {
3182 if (!memcg_has_children(memcg))
3183 memcg->use_hierarchy = val;
3184 else
3185 retval = -EBUSY;
3186 } else
3187 retval = -EINVAL;
3188
3189 return retval;
3190}
3191
3192static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3193{
3194 unsigned long val;
3195
3196 if (mem_cgroup_is_root(memcg)) {
3197 val = memcg_page_state(memcg, MEMCG_CACHE) +
3198 memcg_page_state(memcg, MEMCG_RSS);
3199 if (swap)
3200 val += memcg_page_state(memcg, MEMCG_SWAP);
3201 } else {
3202 if (!swap)
3203 val = page_counter_read(&memcg->memory);
3204 else
3205 val = page_counter_read(&memcg->memsw);
3206 }
3207 return val;
3208}
3209
3210enum {
3211 RES_USAGE,
3212 RES_LIMIT,
3213 RES_MAX_USAGE,
3214 RES_FAILCNT,
3215 RES_SOFT_LIMIT,
3216};
3217
3218static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3219 struct cftype *cft)
3220{
3221 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3222 struct page_counter *counter;
3223
3224 switch (MEMFILE_TYPE(cft->private)) {
3225 case _MEM:
3226 counter = &memcg->memory;
3227 break;
3228 case _MEMSWAP:
3229 counter = &memcg->memsw;
3230 break;
3231 case _KMEM:
3232 counter = &memcg->kmem;
3233 break;
3234 case _TCP:
3235 counter = &memcg->tcpmem;
3236 break;
3237 default:
3238 BUG();
3239 }
3240
3241 switch (MEMFILE_ATTR(cft->private)) {
3242 case RES_USAGE:
3243 if (counter == &memcg->memory)
3244 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3245 if (counter == &memcg->memsw)
3246 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3247 return (u64)page_counter_read(counter) * PAGE_SIZE;
3248 case RES_LIMIT:
3249 return (u64)counter->max * PAGE_SIZE;
3250 case RES_MAX_USAGE:
3251 return (u64)counter->watermark * PAGE_SIZE;
3252 case RES_FAILCNT:
3253 return counter->failcnt;
3254 case RES_SOFT_LIMIT:
3255 return (u64)memcg->soft_limit * PAGE_SIZE;
3256 default:
3257 BUG();
3258 }
3259}
3260
3261static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
3262{
3263 unsigned long stat[MEMCG_NR_STAT];
3264 struct mem_cgroup *mi;
3265 int node, cpu, i;
3266 int min_idx, max_idx;
3267
3268 if (slab_only) {
3269 min_idx = NR_SLAB_RECLAIMABLE;
3270 max_idx = NR_SLAB_UNRECLAIMABLE;
3271 } else {
3272 min_idx = 0;
3273 max_idx = MEMCG_NR_STAT;
3274 }
3275
3276 for (i = min_idx; i < max_idx; i++)
3277 stat[i] = 0;
3278
3279 for_each_online_cpu(cpu)
3280 for (i = min_idx; i < max_idx; i++)
3281 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3282
3283 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3284 for (i = min_idx; i < max_idx; i++)
3285 atomic_long_add(stat[i], &mi->vmstats[i]);
3286
3287 if (!slab_only)
3288 max_idx = NR_VM_NODE_STAT_ITEMS;
3289
3290 for_each_node(node) {
3291 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3292 struct mem_cgroup_per_node *pi;
3293
3294 for (i = min_idx; i < max_idx; i++)
3295 stat[i] = 0;
3296
3297 for_each_online_cpu(cpu)
3298 for (i = min_idx; i < max_idx; i++)
3299 stat[i] += per_cpu(
3300 pn->lruvec_stat_cpu->count[i], cpu);
3301
3302 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3303 for (i = min_idx; i < max_idx; i++)
3304 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3305 }
3306}
3307
3308static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3309{
3310 unsigned long events[NR_VM_EVENT_ITEMS];
3311 struct mem_cgroup *mi;
3312 int cpu, i;
3313
3314 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3315 events[i] = 0;
3316
3317 for_each_online_cpu(cpu)
3318 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3319 events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3320 cpu);
3321
3322 for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3323 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3324 atomic_long_add(events[i], &mi->vmevents[i]);
3325}
3326
3327#ifdef CONFIG_MEMCG_KMEM
3328static int memcg_online_kmem(struct mem_cgroup *memcg)
3329{
3330 int memcg_id;
3331
3332 if (cgroup_memory_nokmem)
3333 return 0;
3334
3335 BUG_ON(memcg->kmemcg_id >= 0);
3336 BUG_ON(memcg->kmem_state);
3337
3338 memcg_id = memcg_alloc_cache_id();
3339 if (memcg_id < 0)
3340 return memcg_id;
3341
3342 static_branch_inc(&memcg_kmem_enabled_key);
3343
3344 /*
3345 * A memory cgroup is considered kmem-online as soon as it gets
3346 * kmemcg_id. Setting the id after enabling static branching
3347 * guarantees no one starts accounting before all call sites are patched.
3348 */
3349 memcg->kmemcg_id = memcg_id;
3350 memcg->kmem_state = KMEM_ONLINE;
3351 INIT_LIST_HEAD(&memcg->kmem_caches);
3352
3353 return 0;
3354}
3355
3356static void memcg_offline_kmem(struct mem_cgroup *memcg)
3357{
3358 struct cgroup_subsys_state *css;
3359 struct mem_cgroup *parent, *child;
3360 int kmemcg_id;
3361
3362 if (memcg->kmem_state != KMEM_ONLINE)
3363 return;
3364
3365
3366 /*
3367 * Clear the online state before clearing memcg_caches arrays
3368 * entries, so that no new kmem caches get created for this cgroup.
3369 */
3370 memcg->kmem_state = KMEM_ALLOCATED;
3371
3372 parent = parent_mem_cgroup(memcg);
3373 if (!parent)
3374 parent = root_mem_cgroup;
3375
3376
3377 /*
3378 * Deactivate and reparent kmem_caches, then flush percpu slab
3379 * statistics so that the parent and all ancestors have precise
3380 * values after the reparenting.
3381 */
3382 memcg_deactivate_kmem_caches(memcg, parent);
3383 memcg_flush_percpu_vmstats(memcg, true);
3384
3385 kmemcg_id = memcg->kmemcg_id;
3386 BUG_ON(kmemcg_id < 0);
3387
3388 /*
3389 * Change kmemcg_id of this cgroup and all its descendants to the
3390 * parent's id, and then move all entries from this cgroup's
3391 * list_lrus to ones of the parent. After we have finished, all
3392 * list_lrus corresponding to this cgroup are guaranteed to remain
3393 * empty. The ordering is imposed by list_lru_node->lock taken by
3394 * memcg_drain_all_list_lrus().
3395 */
3396 rcu_read_lock();
3397 css_for_each_descendant_pre(css, &memcg->css) {
3398 child = mem_cgroup_from_css(css);
3399 BUG_ON(child->kmemcg_id != kmemcg_id);
3400 child->kmemcg_id = parent->kmemcg_id;
3401 if (!memcg->use_hierarchy)
3402 break;
3403 }
3404 rcu_read_unlock();
3405
3406 memcg_drain_all_list_lrus(kmemcg_id, parent);
3407
3408 memcg_free_cache_id(kmemcg_id);
3409}
3410
3411static void memcg_free_kmem(struct mem_cgroup *memcg)
3412{
3413 /* css_alloc() failed, offlining didn't happen */
3414 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3415 memcg_offline_kmem(memcg);
3416
3417 if (memcg->kmem_state == KMEM_ALLOCATED) {
3418 WARN_ON(!list_empty(&memcg->kmem_caches));
3419 static_branch_dec(&memcg_kmem_enabled_key);
3420 }
3421}
3422#else
3423static int memcg_online_kmem(struct mem_cgroup *memcg)
3424{
3425 return 0;
3426}
3427static void memcg_offline_kmem(struct mem_cgroup *memcg)
3428{
3429}
3430static void memcg_free_kmem(struct mem_cgroup *memcg)
3431{
3432}
3433#endif
3434
3435static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3436 unsigned long max)
3437{
3438 int ret;
3439
3440 mutex_lock(&memcg_max_mutex);
3441 ret = page_counter_set_max(&memcg->kmem, max);
3442 mutex_unlock(&memcg_max_mutex);
3443 return ret;
3444}
3445
3446static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3447{
3448 int ret;
3449
3450 mutex_lock(&memcg_max_mutex);
3451
3452 ret = page_counter_set_max(&memcg->tcpmem, max);
3453 if (ret)
3454 goto out;
3455
3456 if (!memcg->tcpmem_active) {
3457 /*
3458 * The active flag needs to be written after the static_key
3459 * update. This is what guarantees that the socket activation
3460 * function is the last one to run. See mem_cgroup_sk_alloc()
3461 * for details, and note that we don't mark any socket as
3462 * belonging to this memcg until that flag is up.
3463 *
3464 * We need to do this, because static_keys will span multiple
3465 * sites, but we can't control their order. If we mark a socket
3466 * as accounted, but the accounting functions are not patched in
3467 * yet, we'll lose accounting.
3468 *
3469 * We never race with the readers in mem_cgroup_sk_alloc(),
3470 * because when this value changes, the code to process it is
3471 * not patched in yet.
3472 */
3473 static_branch_inc(&memcg_sockets_enabled_key);
3474 memcg->tcpmem_active = true;
3475 }
3476out:
3477 mutex_unlock(&memcg_max_mutex);
3478 return ret;
3479}
3480
3481/*
3482 * Writes to the RES_LIMIT control files (memory, memsw, kmem, tcp)
3483 * and to memory.soft_limit_in_bytes (RES_SOFT_LIMIT) end up here.
3484 */
3485static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3486 char *buf, size_t nbytes, loff_t off)
3487{
3488 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3489 unsigned long nr_pages;
3490 int ret;
3491
3492 buf = strstrip(buf);
3493 ret = page_counter_memparse(buf, "-1", &nr_pages);
3494 if (ret)
3495 return ret;
3496
3497 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3498 case RES_LIMIT:
3499 if (mem_cgroup_is_root(memcg)) {
3500 ret = -EINVAL;
3501 break;
3502 }
3503 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3504 case _MEM:
3505 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3506 break;
3507 case _MEMSWAP:
3508 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3509 break;
3510 case _KMEM:
3511 ret = memcg_update_kmem_max(memcg, nr_pages);
3512 break;
3513 case _TCP:
3514 ret = memcg_update_tcp_max(memcg, nr_pages);
3515 break;
3516 }
3517 break;
3518 case RES_SOFT_LIMIT:
3519 memcg->soft_limit = nr_pages;
3520 ret = 0;
3521 break;
3522 }
3523 return ret ?: nbytes;
3524}
3525
3526static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3527 size_t nbytes, loff_t off)
3528{
3529 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3530 struct page_counter *counter;
3531
3532 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3533 case _MEM:
3534 counter = &memcg->memory;
3535 break;
3536 case _MEMSWAP:
3537 counter = &memcg->memsw;
3538 break;
3539 case _KMEM:
3540 counter = &memcg->kmem;
3541 break;
3542 case _TCP:
3543 counter = &memcg->tcpmem;
3544 break;
3545 default:
3546 BUG();
3547 }
3548
3549 switch (MEMFILE_ATTR(of_cft(of)->private)) {
3550 case RES_MAX_USAGE:
3551 page_counter_reset_watermark(counter);
3552 break;
3553 case RES_FAILCNT:
3554 counter->failcnt = 0;
3555 break;
3556 default:
3557 BUG();
3558 }
3559
3560 return nbytes;
3561}
3562
3563static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3564 struct cftype *cft)
3565{
3566 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3567}
3568
3569#ifdef CONFIG_MMU
3570static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3571 struct cftype *cft, u64 val)
3572{
3573 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3574
3575 if (val & ~MOVE_MASK)
3576 return -EINVAL;
3577
3578 /*
3579 * No kind of locking is needed in here, because ->can_attach() will
3580 * check this value once in the beginning of the process, and then carry
3581 * on with stale data. This means that changes to this value will only
3582 * affect task migrations starting after the change.
3583 */
3584 memcg->move_charge_at_immigrate = val;
3585 return 0;
3586}
3587#else
3588static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3589 struct cftype *cft, u64 val)
3590{
3591 return -ENOSYS;
3592}
3593#endif
3594
3595#ifdef CONFIG_NUMA
3596
3597#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3598#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3599#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3600
3601static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3602 int nid, unsigned int lru_mask)
3603{
3604 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
3605 unsigned long nr = 0;
3606 enum lru_list lru;
3607
3608 VM_BUG_ON((unsigned)nid >= nr_node_ids);
3609
3610 for_each_lru(lru) {
3611 if (!(BIT(lru) & lru_mask))
3612 continue;
3613 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3614 }
3615 return nr;
3616}
3617
3618static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3619 unsigned int lru_mask)
3620{
3621 unsigned long nr = 0;
3622 enum lru_list lru;
3623
3624 for_each_lru(lru) {
3625 if (!(BIT(lru) & lru_mask))
3626 continue;
3627 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3628 }
3629 return nr;
3630}
3631
3632static int memcg_numa_stat_show(struct seq_file *m, void *v)
3633{
3634 struct numa_stat {
3635 const char *name;
3636 unsigned int lru_mask;
3637 };
3638
3639 static const struct numa_stat stats[] = {
3640 { "total", LRU_ALL },
3641 { "file", LRU_ALL_FILE },
3642 { "anon", LRU_ALL_ANON },
3643 { "unevictable", BIT(LRU_UNEVICTABLE) },
3644 };
3645 const struct numa_stat *stat;
3646 int nid;
3647 unsigned long nr;
3648 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3649
3650 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3651 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3652 seq_printf(m, "%s=%lu", stat->name, nr);
3653 for_each_node_state(nid, N_MEMORY) {
3654 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3655 stat->lru_mask);
3656 seq_printf(m, " N%d=%lu", nid, nr);
3657 }
3658 seq_putc(m, '\n');
3659 }
3660
3661 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3662 struct mem_cgroup *iter;
3663
3664 nr = 0;
3665 for_each_mem_cgroup_tree(iter, memcg)
3666 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3667 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3668 for_each_node_state(nid, N_MEMORY) {
3669 nr = 0;
3670 for_each_mem_cgroup_tree(iter, memcg)
3671 nr += mem_cgroup_node_nr_lru_pages(
3672 iter, nid, stat->lru_mask);
3673 seq_printf(m, " N%d=%lu", nid, nr);
3674 }
3675 seq_putc(m, '\n');
3676 }
3677
3678 return 0;
3679}
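
/*
 * Illustrative memory.numa_stat output on a hypothetical two-node box
 * (values invented for the example; units are pages):
 *
 *	total=2048 N0=1024 N1=1024
 *	file=1536 N0=768 N1=768
 *	anon=512 N0=256 N1=256
 *	unevictable=0 N0=0 N1=0
 *	hierarchical_total=4096 N0=2048 N1=2048
 *	...
 */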
3680#endif
3681
3682static const unsigned int memcg1_stats[] = {
3683 MEMCG_CACHE,
3684 MEMCG_RSS,
3685 MEMCG_RSS_HUGE,
3686 NR_SHMEM,
3687 NR_FILE_MAPPED,
3688 NR_FILE_DIRTY,
3689 NR_WRITEBACK,
3690 MEMCG_SWAP,
3691};
3692
3693static const char *const memcg1_stat_names[] = {
3694 "cache",
3695 "rss",
3696 "rss_huge",
3697 "shmem",
3698 "mapped_file",
3699 "dirty",
3700 "writeback",
3701 "swap",
3702};
3703
3704/* Universal VM events cgroup1 shows, original sort order */
3705static const unsigned int memcg1_events[] = {
3706 PGPGIN,
3707 PGPGOUT,
3708 PGFAULT,
3709 PGMAJFAULT,
3710};
3711
3712static const char *const memcg1_event_names[] = {
3713 "pgpgin",
3714 "pgpgout",
3715 "pgfault",
3716 "pgmajfault",
3717};
3718
3719static int memcg_stat_show(struct seq_file *m, void *v)
3720{
3721 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3722 unsigned long memory, memsw;
3723 struct mem_cgroup *mi;
3724 unsigned int i;
3725
3726 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3727 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3728
3729 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3730 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3731 continue;
3732 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3733 memcg_page_state_local(memcg, memcg1_stats[i]) *
3734 PAGE_SIZE);
3735 }
3736
3737 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3738 seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3739 memcg_events_local(memcg, memcg1_events[i]));
3740
3741 for (i = 0; i < NR_LRU_LISTS; i++)
3742 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3743 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3744 PAGE_SIZE);
3745
3746 /* Hierarchical information */
3747 memory = memsw = PAGE_COUNTER_MAX;
3748 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3749 memory = min(memory, mi->memory.max);
3750 memsw = min(memsw, mi->memsw.max);
3751 }
3752 seq_printf(m, "hierarchical_memory_limit %llu\n",
3753 (u64)memory * PAGE_SIZE);
3754 if (do_memsw_account())
3755 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3756 (u64)memsw * PAGE_SIZE);
3757
3758 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3759 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3760 continue;
3761 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3762 (u64)memcg_page_state(memcg, memcg1_stats[i]) *
3763 PAGE_SIZE);
3764 }
3765
3766 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3767 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3768 (u64)memcg_events(memcg, memcg1_events[i]));
3769
3770 for (i = 0; i < NR_LRU_LISTS; i++)
3771 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3772 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3773 PAGE_SIZE);
3774
3775#ifdef CONFIG_DEBUG_VM
3776 {
3777 pg_data_t *pgdat;
3778 struct mem_cgroup_per_node *mz;
3779 struct zone_reclaim_stat *rstat;
3780 unsigned long recent_rotated[2] = {0, 0};
3781 unsigned long recent_scanned[2] = {0, 0};
3782
3783 for_each_online_pgdat(pgdat) {
3784 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3785 rstat = &mz->lruvec.reclaim_stat;
3786
3787 recent_rotated[0] += rstat->recent_rotated[0];
3788 recent_rotated[1] += rstat->recent_rotated[1];
3789 recent_scanned[0] += rstat->recent_scanned[0];
3790 recent_scanned[1] += rstat->recent_scanned[1];
3791 }
3792 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3793 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3794 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3795 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3796 }
3797#endif
3798
3799 return 0;
3800}
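
/*
 * The resulting memory.stat therefore looks like (illustrative values,
 * bytes for the state counters):
 *
 *	cache 1234567168
 *	rss 360448
 *	...
 *	hierarchical_memory_limit 9223372036854771712
 *	total_cache 1234567168
 *	...
 *	recent_rotated_anon 0	(the recent_* lines only with CONFIG_DEBUG_VM)
 */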
3801
3802static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3803 struct cftype *cft)
3804{
3805 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3806
3807 return mem_cgroup_swappiness(memcg);
3808}
3809
3810static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3811 struct cftype *cft, u64 val)
3812{
3813 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3814
3815 if (val > 100)
3816 return -EINVAL;
3817
3818 if (css->parent)
3819 memcg->swappiness = val;
3820 else
3821 vm_swappiness = val;
3822
3823 return 0;
3824}
3825
3826static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3827{
3828 struct mem_cgroup_threshold_ary *t;
3829 unsigned long usage;
3830 int i;
3831
3832 rcu_read_lock();
3833 if (!swap)
3834 t = rcu_dereference(memcg->thresholds.primary);
3835 else
3836 t = rcu_dereference(memcg->memsw_thresholds.primary);
3837
3838 if (!t)
3839 goto unlock;
3840
3841 usage = mem_cgroup_usage(memcg, swap);
3842
3843 /*
3844 * current_threshold points to threshold just below or equal to usage.
3845 * If it's not true, a threshold was crossed after last
3846 * call of __mem_cgroup_threshold().
3847 */
3848 i = t->current_threshold;
3849
3850 /*
3851 * Iterate backward over array of thresholds starting from
3852 * current_threshold and check if a threshold is crossed.
3853 * If none of thresholds below usage is crossed, we read
3854 * only one element of the array here.
3855 */
3856 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3857 eventfd_signal(t->entries[i].eventfd, 1);
3858
3859 /* i = current_threshold + 1 */
3860 i++;
3861
3862 /*
3863 * Iterate forward over array of thresholds starting from
3864 * current_threshold+1 and check if a threshold is crossed.
3865 * If none of thresholds above usage is crossed, we read
3866 * only one element of the array here.
3867 */
3868 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3869 eventfd_signal(t->entries[i].eventfd, 1);
3870
3871 /* Update current_threshold */
3872 t->current_threshold = i - 1;
3873unlock:
3874 rcu_read_unlock();
3875}
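
/*
 * Worked example (hypothetical thresholds): entries = {4M, 8M, 16M},
 * current_threshold == 0 (usage was last seen between 4M and 8M).
 * If usage is now 17M, the backward loop fires nothing (4M <= 17M),
 * the forward loop signals the 8M and 16M eventfds, and
 * current_threshold ends up at 2, pointing at the 16M entry.
 */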
3876
3877static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3878{
3879 while (memcg) {
3880 __mem_cgroup_threshold(memcg, false);
3881 if (do_memsw_account())
3882 __mem_cgroup_threshold(memcg, true);
3883
3884 memcg = parent_mem_cgroup(memcg);
3885 }
3886}
3887
3888static int compare_thresholds(const void *a, const void *b)
3889{
3890 const struct mem_cgroup_threshold *_a = a;
3891 const struct mem_cgroup_threshold *_b = b;
3892
3893 if (_a->threshold > _b->threshold)
3894 return 1;
3895
3896 if (_a->threshold < _b->threshold)
3897 return -1;
3898
3899 return 0;
3900}
3901
3902static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3903{
3904 struct mem_cgroup_eventfd_list *ev;
3905
3906 spin_lock(&memcg_oom_lock);
3907
3908 list_for_each_entry(ev, &memcg->oom_notify, list)
3909 eventfd_signal(ev->eventfd, 1);
3910
3911 spin_unlock(&memcg_oom_lock);
3912 return 0;
3913}
3914
3915static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3916{
3917 struct mem_cgroup *iter;
3918
3919 for_each_mem_cgroup_tree(iter, memcg)
3920 mem_cgroup_oom_notify_cb(iter);
3921}
3922
3923static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3924 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3925{
3926 struct mem_cgroup_thresholds *thresholds;
3927 struct mem_cgroup_threshold_ary *new;
3928 unsigned long threshold;
3929 unsigned long usage;
3930 int i, size, ret;
3931
3932 ret = page_counter_memparse(args, "-1", &threshold);
3933 if (ret)
3934 return ret;
3935
3936 mutex_lock(&memcg->thresholds_lock);
3937
3938 if (type == _MEM) {
3939 thresholds = &memcg->thresholds;
3940 usage = mem_cgroup_usage(memcg, false);
3941 } else if (type == _MEMSWAP) {
3942 thresholds = &memcg->memsw_thresholds;
3943 usage = mem_cgroup_usage(memcg, true);
3944 } else
3945 BUG();
3946
3947 /* Check if a threshold crossed before adding a new one */
3948 if (thresholds->primary)
3949 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3950
3951 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3952
3953 /* Allocate memory for new array of thresholds */
3954 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
3955 if (!new) {
3956 ret = -ENOMEM;
3957 goto unlock;
3958 }
3959 new->size = size;
3960
3961 /* Copy thresholds (if any) to new array */
3962 if (thresholds->primary) {
3963 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3964 sizeof(struct mem_cgroup_threshold));
3965 }
3966
3967 /* Add new threshold */
3968 new->entries[size - 1].eventfd = eventfd;
3969 new->entries[size - 1].threshold = threshold;
3970
3971 /* Sort thresholds. Registering of new threshold isn't time-critical */
3972 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3973 compare_thresholds, NULL);
3974
3975 /* Find current threshold */
3976 new->current_threshold = -1;
3977 for (i = 0; i < size; i++) {
3978 if (new->entries[i].threshold <= usage) {
3979 /*
3980 * new->current_threshold will not be used until
3981 * rcu_assign_pointer(), so it's safe to increment
3982 * it here.
3983 */
3984 ++new->current_threshold;
3985 } else
3986 break;
3987 }
3988
3989 /* Free old spare buffer and save old primary buffer as spare */
3990 kfree(thresholds->spare);
3991 thresholds->spare = thresholds->primary;
3992
3993 rcu_assign_pointer(thresholds->primary, new);
3994
3995 /* To be sure that nobody uses thresholds */
3996 synchronize_rcu();
3997
3998unlock:
3999 mutex_unlock(&memcg->thresholds_lock);
4000
4001 return ret;
4002}
4003
4004static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4005 struct eventfd_ctx *eventfd, const char *args)
4006{
4007 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4008}
4009
4010static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4011 struct eventfd_ctx *eventfd, const char *args)
4012{
4013 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4014}
4015
4016static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4017 struct eventfd_ctx *eventfd, enum res_type type)
4018{
4019 struct mem_cgroup_thresholds *thresholds;
4020 struct mem_cgroup_threshold_ary *new;
4021 unsigned long usage;
4022 int i, j, size;
4023
4024 mutex_lock(&memcg->thresholds_lock);
4025
4026 if (type == _MEM) {
4027 thresholds = &memcg->thresholds;
4028 usage = mem_cgroup_usage(memcg, false);
4029 } else if (type == _MEMSWAP) {
4030 thresholds = &memcg->memsw_thresholds;
4031 usage = mem_cgroup_usage(memcg, true);
4032 } else
4033 BUG();
4034
4035 if (!thresholds->primary)
4036 goto unlock;
4037
4038 /* Check if a threshold crossed before removing */
4039 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4040
4041 /* Calculate new number of thresholds */
4042 size = 0;
4043 for (i = 0; i < thresholds->primary->size; i++) {
4044 if (thresholds->primary->entries[i].eventfd != eventfd)
4045 size++;
4046 }
4047
4048 new = thresholds->spare;
4049
4050 /* Set thresholds array to NULL if we don't have thresholds */
4051 if (!size) {
4052 kfree(new);
4053 new = NULL;
4054 goto swap_buffers;
4055 }
4056
4057 new->size = size;
4058
4059 /* Copy thresholds and find current threshold */
4060 new->current_threshold = -1;
4061 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4062 if (thresholds->primary->entries[i].eventfd == eventfd)
4063 continue;
4064
4065 new->entries[j] = thresholds->primary->entries[i];
4066 if (new->entries[j].threshold <= usage) {
4067 /*
4068 * new->current_threshold will not be used
4069 * until rcu_assign_pointer(), so it's safe to
4070 * increment it here.
4071 */
4072 ++new->current_threshold;
4073 }
4074 j++;
4075 }
4076
4077swap_buffers:
4078 /* Swap primary and spare array */
4079 thresholds->spare = thresholds->primary;
4080
4081 rcu_assign_pointer(thresholds->primary, new);
4082
4083 /* To be sure that nobody uses thresholds */
4084 synchronize_rcu();
4085
4086 /* If all events are unregistered, free the spare array */
4087 if (!new) {
4088 kfree(thresholds->spare);
4089 thresholds->spare = NULL;
4090 }
4091unlock:
4092 mutex_unlock(&memcg->thresholds_lock);
4093}
4094
4095static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4096 struct eventfd_ctx *eventfd)
4097{
4098 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4099}
4100
4101static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4102 struct eventfd_ctx *eventfd)
4103{
4104 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4105}
4106
4107static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4108 struct eventfd_ctx *eventfd, const char *args)
4109{
4110 struct mem_cgroup_eventfd_list *event;
4111
4112 event = kmalloc(sizeof(*event), GFP_KERNEL);
4113 if (!event)
4114 return -ENOMEM;
4115
4116 spin_lock(&memcg_oom_lock);
4117
4118 event->eventfd = eventfd;
4119 list_add(&event->list, &memcg->oom_notify);
4120
4121 /* already in OOM ? */
4122 if (memcg->under_oom)
4123 eventfd_signal(eventfd, 1);
4124 spin_unlock(&memcg_oom_lock);
4125
4126 return 0;
4127}
4128
4129static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4130 struct eventfd_ctx *eventfd)
4131{
4132 struct mem_cgroup_eventfd_list *ev, *tmp;
4133
4134 spin_lock(&memcg_oom_lock);
4135
4136 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4137 if (ev->eventfd == eventfd) {
4138 list_del(&ev->list);
4139 kfree(ev);
4140 }
4141 }
4142
4143 spin_unlock(&memcg_oom_lock);
4144}
4145
4146static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4147{
4148 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4149
4150 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4151 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4152 seq_printf(sf, "oom_kill %lu\n",
4153 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4154 return 0;
4155}
4156
4157static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4158 struct cftype *cft, u64 val)
4159{
4160 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4161
4162 /* cannot set to root cgroup and only 0 and 1 are allowed */
4163 if (!css->parent || !((val == 0) || (val == 1)))
4164 return -EINVAL;
4165
4166 memcg->oom_kill_disable = val;
4167 if (!val)
4168 memcg_oom_recover(memcg);
4169
4170 return 0;
4171}
4172
4173#ifdef CONFIG_CGROUP_WRITEBACK
4174
4175static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4176{
4177 return wb_domain_init(&memcg->cgwb_domain, gfp);
4178}
4179
4180static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4181{
4182 wb_domain_exit(&memcg->cgwb_domain);
4183}
4184
4185static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4186{
4187 wb_domain_size_changed(&memcg->cgwb_domain);
4188}
4189
4190struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4191{
4192 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4193
4194 if (!memcg->css.parent)
4195 return NULL;
4196
4197 return &memcg->cgwb_domain;
4198}
4199
4200/*
4201 * idx can be of type enum memcg_stat_item or node_stat_item.
4202 * Keep in sync with memcg_exact_page().
4203 */
4204static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4205{
4206 long x = atomic_long_read(&memcg->vmstats[idx]);
4207 int cpu;
4208
4209 for_each_online_cpu(cpu)
4210 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4211 if (x < 0)
4212 x = 0;
4213 return x;
4214}
4215
4216/**
4217 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4218 * @wb: bdi_writeback in question
4219 * @pfilepages: out parameter for number of file pages
4220 * @pheadroom: out parameter for number of allocatable pages according to memcg
4221 * @pdirty: out parameter for number of dirty pages
4222 * @pwriteback: out parameter for number of pages under writeback
4223 *
4224 * Determine the numbers of file, headroom, dirty, and writeback pages in
4225 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4226 * is a bit more involved.
4227 *
4228 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4229 * headroom is calculated as the lowest headroom of itself and the
4230 * ancestors.  Note that this doesn't consider the actual amount of
4231 * available memory in the system.  The caller should further cap
4232 * *@pheadroom accordingly.
4233 */
4234void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4235 unsigned long *pheadroom, unsigned long *pdirty,
4236 unsigned long *pwriteback)
4237{
4238 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4239 struct mem_cgroup *parent;
4240
4241 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4242
4243 /* this should eventually include NR_UNSTABLE_NFS */
4244 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4245 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4246 memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4247 *pheadroom = PAGE_COUNTER_MAX;
4248
4249 while ((parent = parent_mem_cgroup(memcg))) {
4250 unsigned long ceiling = min(memcg->memory.max, memcg->high);
4251 unsigned long used = page_counter_read(&memcg->memory);
4252
4253 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4254 memcg = parent;
4255 }
4256}
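
/*
 * Headroom walk, worked example (invented numbers): memcg A has
 * max = 200M, high = max, used = 50M; its parent B has max = 100M,
 * high = max, used = 80M.  The loop computes
 * min(200M - 50M, 100M - 80M) = 20M, i.e. writeback sees the tightest
 * limit anywhere up the hierarchy, not just the wb's own memcg.
 */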
4257
4258#else
4259
4260static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4261{
4262 return 0;
4263}
4264
4265static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4266{
4267}
4268
4269static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4270{
4271}
4272
4273#endif
4274
4275/*
4276 * DO NOT USE IN NEW FILES.
4277 *
4278 * "cgroup.event_control" implementation.
4279 *
4280 * This is way over-engineered.  It tries to support fully configurable
4281 * events for each user.  Such level of flexibility is completely
4282 * unnecessary especially in the light of the planned unified hierarchy.
4283 *
4284 * Please deprecate this and replace with something simpler if at all
4285 * possible.
4286 */
4287
4288/*
4289 * Unregister event and free resources.
4290 *
4291 * Gets called from workqueue.
4292 */
4293static void memcg_event_remove(struct work_struct *work)
4294{
4295 struct mem_cgroup_event *event =
4296 container_of(work, struct mem_cgroup_event, remove);
4297 struct mem_cgroup *memcg = event->memcg;
4298
4299 remove_wait_queue(event->wqh, &event->wait);
4300
4301 event->unregister_event(memcg, event->eventfd);
4302
4303
4304 eventfd_signal(event->eventfd, 1);
4305
4306 eventfd_ctx_put(event->eventfd);
4307 kfree(event);
4308 css_put(&memcg->css);
4309}
4310
4311/*
4312 * Gets called on EPOLLHUP on eventfd when user closes it.
4313 *
4314 * Called with wqh->lock held and interrupts disabled.
4315 */
4316static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4317 int sync, void *key)
4318{
4319 struct mem_cgroup_event *event =
4320 container_of(wait, struct mem_cgroup_event, wait);
4321 struct mem_cgroup *memcg = event->memcg;
4322 __poll_t flags = key_to_poll(key);
4323
4324 if (flags & EPOLLHUP) {
4325 /*
4326 * If the event has been detached at cgroup removal, we
4327 * can simply return knowing the other side will cleanup
4328 * for us.
4329 *
4330 * We can't race against event freeing since the other
4331 * side will require wqh->lock via remove_wait_queue(),
4332 * which we hold.
4333 */
4334 spin_lock(&memcg->event_list_lock);
4335 if (!list_empty(&event->list)) {
4336 list_del_init(&event->list);
4337 /*
4338 * We are in atomic context, but cgroup_event_remove()
4339 * may sleep, so we have to call it in workqueue.
4340 */
4341 schedule_work(&event->remove);
4342 }
4343 spin_unlock(&memcg->event_list_lock);
4344 }
4345
4346 return 0;
4347}
4348
4349static void memcg_event_ptable_queue_proc(struct file *file,
4350 wait_queue_head_t *wqh, poll_table *pt)
4351{
4352 struct mem_cgroup_event *event =
4353 container_of(pt, struct mem_cgroup_event, pt);
4354
4355 event->wqh = wqh;
4356 add_wait_queue(wqh, &event->wait);
4357}
4358
4359/*
4360 * DO NOT USE IN NEW FILES.
4361 *
4362 * Parse input and register new cgroup event handler.
4363 *
4364 * Input must be in format '<event_fd> <control_fd> <args>'.
4365 * Interpretation of args is defined by control file implementation.
4366 */
4367static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4368 char *buf, size_t nbytes, loff_t off)
4369{
4370 struct cgroup_subsys_state *css = of_css(of);
4371 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4372 struct mem_cgroup_event *event;
4373 struct cgroup_subsys_state *cfile_css;
4374 unsigned int efd, cfd;
4375 struct fd efile;
4376 struct fd cfile;
4377 const char *name;
4378 char *endp;
4379 int ret;
4380
4381 buf = strstrip(buf);
4382
4383 efd = simple_strtoul(buf, &endp, 10);
4384 if (*endp != ' ')
4385 return -EINVAL;
4386 buf = endp + 1;
4387
4388 cfd = simple_strtoul(buf, &endp, 10);
4389 if ((*endp != ' ') && (*endp != '\0'))
4390 return -EINVAL;
4391 buf = endp + 1;
4392
4393 event = kzalloc(sizeof(*event), GFP_KERNEL);
4394 if (!event)
4395 return -ENOMEM;
4396
4397 event->memcg = memcg;
4398 INIT_LIST_HEAD(&event->list);
4399 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4400 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4401 INIT_WORK(&event->remove, memcg_event_remove);
4402
4403 efile = fdget(efd);
4404 if (!efile.file) {
4405 ret = -EBADF;
4406 goto out_kfree;
4407 }
4408
4409 event->eventfd = eventfd_ctx_fileget(efile.file);
4410 if (IS_ERR(event->eventfd)) {
4411 ret = PTR_ERR(event->eventfd);
4412 goto out_put_efile;
4413 }
4414
4415 cfile = fdget(cfd);
4416 if (!cfile.file) {
4417 ret = -EBADF;
4418 goto out_put_eventfd;
4419 }
4420
4421 /* the process needs read permission on control file */
4422 /* AV: shouldn't we check that it's been opened for read instead? */
4423 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4424 if (ret < 0)
4425 goto out_put_cfile;
4426
4427 /*
4428 * Determine the event callbacks and set them in @event.  This used
4429 * to be done via struct cftype but cgroup core no longer knows
4430 * about these events.  The following is crude but the whole thing
4431 * is for compatibility anyway.
4432 *
4433 * DO NOT ADD NEW FILES.
4434 */
4435 name = cfile.file->f_path.dentry->d_name.name;
4436
4437 if (!strcmp(name, "memory.usage_in_bytes")) {
4438 event->register_event = mem_cgroup_usage_register_event;
4439 event->unregister_event = mem_cgroup_usage_unregister_event;
4440 } else if (!strcmp(name, "memory.oom_control")) {
4441 event->register_event = mem_cgroup_oom_register_event;
4442 event->unregister_event = mem_cgroup_oom_unregister_event;
4443 } else if (!strcmp(name, "memory.pressure_level")) {
4444 event->register_event = vmpressure_register_event;
4445 event->unregister_event = vmpressure_unregister_event;
4446 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4447 event->register_event = memsw_cgroup_usage_register_event;
4448 event->unregister_event = memsw_cgroup_usage_unregister_event;
4449 } else {
4450 ret = -EINVAL;
4451 goto out_put_cfile;
4452 }
4453
4454 /*
4455 * Verify @cfile should belong to @css.  Also, remaining events are
4456 * automatically removed on cgroup destruction but the removal is
4457 * asynchronous, so take an extra ref on @css.
4458 */
4459 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4460 &memory_cgrp_subsys);
4461 ret = -EINVAL;
4462 if (IS_ERR(cfile_css))
4463 goto out_put_cfile;
4464 if (cfile_css != css) {
4465 css_put(cfile_css);
4466 goto out_put_cfile;
4467 }
4468
4469 ret = event->register_event(memcg, event->eventfd, buf);
4470 if (ret)
4471 goto out_put_css;
4472
4473 vfs_poll(efile.file, &event->pt);
4474
4475 spin_lock(&memcg->event_list_lock);
4476 list_add(&event->list, &memcg->event_list);
4477 spin_unlock(&memcg->event_list_lock);
4478
4479 fdput(cfile);
4480 fdput(efile);
4481
4482 return nbytes;
4483
4484out_put_css:
4485 css_put(css);
4486out_put_cfile:
4487 fdput(cfile);
4488out_put_eventfd:
4489 eventfd_ctx_put(event->eventfd);
4490out_put_efile:
4491 fdput(efile);
4492out_kfree:
4493 kfree(event);
4494
4495 return ret;
4496}
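
/*
 * Userspace registration sketch (cgroup v1 only; mount path hypothetical):
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ctl = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
 *		       O_WRONLY);
 *	char buf[64];
 *	int len = snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
 *			   64ULL << 20);	// notify when usage crosses 64M
 *	write(ctl, buf, len);
 *	// read(efd, ...) now blocks until the threshold is crossed
 */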
4497
4498static struct cftype mem_cgroup_legacy_files[] = {
4499 {
4500 .name = "usage_in_bytes",
4501 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4502 .read_u64 = mem_cgroup_read_u64,
4503 },
4504 {
4505 .name = "max_usage_in_bytes",
4506 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4507 .write = mem_cgroup_reset,
4508 .read_u64 = mem_cgroup_read_u64,
4509 },
4510 {
4511 .name = "limit_in_bytes",
4512 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4513 .write = mem_cgroup_write,
4514 .read_u64 = mem_cgroup_read_u64,
4515 },
4516 {
4517 .name = "soft_limit_in_bytes",
4518 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4519 .write = mem_cgroup_write,
4520 .read_u64 = mem_cgroup_read_u64,
4521 },
4522 {
4523 .name = "failcnt",
4524 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4525 .write = mem_cgroup_reset,
4526 .read_u64 = mem_cgroup_read_u64,
4527 },
4528 {
4529 .name = "stat",
4530 .seq_show = memcg_stat_show,
4531 },
4532 {
4533 .name = "force_empty",
4534 .write = mem_cgroup_force_empty_write,
4535 },
4536 {
4537 .name = "use_hierarchy",
4538 .write_u64 = mem_cgroup_hierarchy_write,
4539 .read_u64 = mem_cgroup_hierarchy_read,
4540 },
4541 {
4542 .name = "cgroup.event_control",
4543 .write = memcg_write_event_control,
4544 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4545 },
4546 {
4547 .name = "swappiness",
4548 .read_u64 = mem_cgroup_swappiness_read,
4549 .write_u64 = mem_cgroup_swappiness_write,
4550 },
4551 {
4552 .name = "move_charge_at_immigrate",
4553 .read_u64 = mem_cgroup_move_charge_read,
4554 .write_u64 = mem_cgroup_move_charge_write,
4555 },
4556 {
4557 .name = "oom_control",
4558 .seq_show = mem_cgroup_oom_control_read,
4559 .write_u64 = mem_cgroup_oom_control_write,
4560 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4561 },
4562 {
4563 .name = "pressure_level",
4564 },
4565#ifdef CONFIG_NUMA
4566 {
4567 .name = "numa_stat",
4568 .seq_show = memcg_numa_stat_show,
4569 },
4570#endif
4571 {
4572 .name = "kmem.limit_in_bytes",
4573 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4574 .write = mem_cgroup_write,
4575 .read_u64 = mem_cgroup_read_u64,
4576 },
4577 {
4578 .name = "kmem.usage_in_bytes",
4579 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4580 .read_u64 = mem_cgroup_read_u64,
4581 },
4582 {
4583 .name = "kmem.failcnt",
4584 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4585 .write = mem_cgroup_reset,
4586 .read_u64 = mem_cgroup_read_u64,
4587 },
4588 {
4589 .name = "kmem.max_usage_in_bytes",
4590 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4591 .write = mem_cgroup_reset,
4592 .read_u64 = mem_cgroup_read_u64,
4593 },
4594#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4595 {
4596 .name = "kmem.slabinfo",
4597 .seq_start = memcg_slab_start,
4598 .seq_next = memcg_slab_next,
4599 .seq_stop = memcg_slab_stop,
4600 .seq_show = memcg_slab_show,
4601 },
4602#endif
4603 {
4604 .name = "kmem.tcp.limit_in_bytes",
4605 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4606 .write = mem_cgroup_write,
4607 .read_u64 = mem_cgroup_read_u64,
4608 },
4609 {
4610 .name = "kmem.tcp.usage_in_bytes",
4611 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4612 .read_u64 = mem_cgroup_read_u64,
4613 },
4614 {
4615 .name = "kmem.tcp.failcnt",
4616 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4617 .write = mem_cgroup_reset,
4618 .read_u64 = mem_cgroup_read_u64,
4619 },
4620 {
4621 .name = "kmem.tcp.max_usage_in_bytes",
4622 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4623 .write = mem_cgroup_reset,
4624 .read_u64 = mem_cgroup_read_u64,
4625 },
4626 { },
4627};
4628
4629/*
4630 * Private memory cgroup IDR
4631 *
4632 * Swap-out records and page cache shadow entries need to store memcg
4633 * references in constrained space, so we maintain an ID space that is
4634 * limited to 16 bit (MEM_CGROUP_ID_SHIFT), limiting the total number of
4635 * memory-controlled cgroups to 64k.
4636 *
4637 * However, there usually are many references to the offline CSS after
4638 * the cgroup has been destroyed, such as page cache or reclaimable
4639 * slab objects, that don't need to hang on to the ID. We want to keep
4640 * those dead CSS from occupying IDs, or we might quickly exhaust the
4641 * limited space.
4642 *
4643 * That's why the IDs are refcounted separately from the CSS: the ID is
4644 * pinned by the online state (mem_cgroup_css_online() sets the initial
4645 * reference) and by every swap-out record pointing at the group
4646 * (mem_cgroup_id_get_many()). Once the last reference is dropped via
4647 * mem_cgroup_id_put_many(), the ID is released and the CSS reference
4648 * that the ID was holding is put as well, so the ID can be recycled
4649 * for a new cgroup while stale references merely see a dead,
4650 * unreachable object.
4651 */
4652
4653static DEFINE_IDR(mem_cgroup_idr);
4654
4655static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4656{
4657 if (memcg->id.id > 0) {
4658 idr_remove(&mem_cgroup_idr, memcg->id.id);
4659 memcg->id.id = 0;
4660 }
4661}
4662
4663static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4664{
4665 refcount_add(n, &memcg->id.ref);
4666}
4667
4668static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4669{
4670 if (refcount_sub_and_test(n, &memcg->id.ref)) {
4671 mem_cgroup_id_remove(memcg);
4672
4673 /* Memcg ID pins CSS */
4674 css_put(&memcg->css);
4675 }
4676}
4677
4678static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4679{
4680 mem_cgroup_id_get_many(memcg, 1);
4681}
4682
4683static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4684{
4685 mem_cgroup_id_put_many(memcg, 1);
4686}
4687
4688/**
4689 * mem_cgroup_from_id - look up a memcg from a memcg id
4690 * @id: the memcg id to look up
4691 *
4692 * Caller must hold rcu_read_lock().
4693 */
4694struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4695{
4696 WARN_ON_ONCE(!rcu_read_lock_held());
4697 return idr_find(&mem_cgroup_idr, id);
4698}
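
/*
 * Typical lookup pattern (sketch): the id does not pin the css online,
 * so callers that need a usable memcg combine the lookup with a tryget:
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();
 */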
4699
4700static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4701{
4702 struct mem_cgroup_per_node *pn;
4703 int tmp = node;
4704
4705 /*
4706 * This routine is called against possible nodes. But it's BUG to
4707 * call kmalloc() against an offline node.
4708 *
4709 * TODO: this routine can waste much memory for nodes which will
4710 * never be onlined; better to use a memory-hotplug callback.
4711 */
4712 if (!node_state(node, N_NORMAL_MEMORY))
4713 tmp = -1;
4714 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4715 if (!pn)
4716 return 1;
4717
4718 pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
4719 if (!pn->lruvec_stat_local) {
4720 kfree(pn);
4721 return 1;
4722 }
4723
4724 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4725 if (!pn->lruvec_stat_cpu) {
4726 free_percpu(pn->lruvec_stat_local);
4727 kfree(pn);
4728 return 1;
4729 }
4730
4731 lruvec_init(&pn->lruvec);
4732 pn->usage_in_excess = 0;
4733 pn->on_tree = false;
4734 pn->memcg = memcg;
4735
4736 memcg->nodeinfo[node] = pn;
4737 return 0;
4738}
4739
4740static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4741{
4742 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4743
4744 if (!pn)
4745 return;
4746
4747 free_percpu(pn->lruvec_stat_cpu);
4748 free_percpu(pn->lruvec_stat_local);
4749 kfree(pn);
4750}
4751
4752static void __mem_cgroup_free(struct mem_cgroup *memcg)
4753{
4754 int node;
4755
4756
4757 /* Flush percpu vmstats and vmevents to guarantee the value
4758 * correctness on parent's and all ancestor levels.
4759 */
4760 memcg_flush_percpu_vmstats(memcg, false);
4761 memcg_flush_percpu_vmevents(memcg);
4762 for_each_node(node)
4763 free_mem_cgroup_per_node_info(memcg, node);
4764 free_percpu(memcg->vmstats_percpu);
4765 free_percpu(memcg->vmstats_local);
4766 kfree(memcg);
4767}
4768
4769static void mem_cgroup_free(struct mem_cgroup *memcg)
4770{
4771 memcg_wb_domain_exit(memcg);
4772 __mem_cgroup_free(memcg);
4773}
4774
4775static struct mem_cgroup *mem_cgroup_alloc(void)
4776{
4777 struct mem_cgroup *memcg;
4778 unsigned int size;
4779 int node;
4780
4781 size = sizeof(struct mem_cgroup);
4782 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4783
4784 memcg = kzalloc(size, GFP_KERNEL);
4785 if (!memcg)
4786 return NULL;
4787
4788 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4789 1, MEM_CGROUP_ID_MAX,
4790 GFP_KERNEL);
4791 if (memcg->id.id < 0)
4792 goto fail;
4793
4794 memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
4795 if (!memcg->vmstats_local)
4796 goto fail;
4797
4798 memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
4799 if (!memcg->vmstats_percpu)
4800 goto fail;
4801
4802 for_each_node(node)
4803 if (alloc_mem_cgroup_per_node_info(memcg, node))
4804 goto fail;
4805
4806 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4807 goto fail;
4808
4809 INIT_WORK(&memcg->high_work, high_work_func);
4810 memcg->last_scanned_node = MAX_NUMNODES;
4811 INIT_LIST_HEAD(&memcg->oom_notify);
4812 mutex_init(&memcg->thresholds_lock);
4813 spin_lock_init(&memcg->move_lock);
4814 vmpressure_init(&memcg->vmpressure);
4815 INIT_LIST_HEAD(&memcg->event_list);
4816 spin_lock_init(&memcg->event_list_lock);
4817 memcg->socket_pressure = jiffies;
4818#ifdef CONFIG_MEMCG_KMEM
4819 memcg->kmemcg_id = -1;
4820#endif
4821#ifdef CONFIG_CGROUP_WRITEBACK
4822 INIT_LIST_HEAD(&memcg->cgwb_list);
4823#endif
4824 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4825 return memcg;
4826fail:
4827 mem_cgroup_id_remove(memcg);
4828 __mem_cgroup_free(memcg);
4829 return NULL;
4830}
4831
4832static struct cgroup_subsys_state * __ref
4833mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4834{
4835 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4836 struct mem_cgroup *memcg;
4837 long error = -ENOMEM;
4838
4839 memcg = mem_cgroup_alloc();
4840 if (!memcg)
4841 return ERR_PTR(error);
4842
4843 memcg->high = PAGE_COUNTER_MAX;
4844 memcg->soft_limit = PAGE_COUNTER_MAX;
4845 if (parent) {
4846 memcg->swappiness = mem_cgroup_swappiness(parent);
4847 memcg->oom_kill_disable = parent->oom_kill_disable;
4848 }
4849 if (parent && parent->use_hierarchy) {
4850 memcg->use_hierarchy = true;
4851 page_counter_init(&memcg->memory, &parent->memory);
4852 page_counter_init(&memcg->swap, &parent->swap);
4853 page_counter_init(&memcg->memsw, &parent->memsw);
4854 page_counter_init(&memcg->kmem, &parent->kmem);
4855 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4856 } else {
4857 page_counter_init(&memcg->memory, NULL);
4858 page_counter_init(&memcg->swap, NULL);
4859 page_counter_init(&memcg->memsw, NULL);
4860 page_counter_init(&memcg->kmem, NULL);
4861 page_counter_init(&memcg->tcpmem, NULL);
4862
4863 /*
4864 * Deeper hierarchy with use_hierarchy == false doesn't make much
4865 * sense, so let the cgroup subsystem know about this breakage.
4866 */
4867 if (parent != root_mem_cgroup)
4868 memory_cgrp_subsys.broken_hierarchy = true;
4869 }
4870
4871 /* The following stuff does not apply to the root */
4872 if (!parent) {
4873#ifdef CONFIG_MEMCG_KMEM
4874 INIT_LIST_HEAD(&memcg->kmem_caches);
4875#endif
4876 root_mem_cgroup = memcg;
4877 return &memcg->css;
4878 }
4879
4880 error = memcg_online_kmem(memcg);
4881 if (error)
4882 goto fail;
4883
4884 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4885 static_branch_inc(&memcg_sockets_enabled_key);
4886
4887 return &memcg->css;
4888fail:
4889 mem_cgroup_id_remove(memcg);
4890 mem_cgroup_free(memcg);
4891 return ERR_PTR(-ENOMEM);
4892}
4893
4894static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4895{
4896 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4897
4898 /*
4899 * A memcg must be visible for memcg_expand_shrinker_maps()
4900 * by the time the maps are allocated. So, we allocate maps
4901 * here, when for_each_mem_cgroup() can't skip it.
4902 */
4903 if (memcg_alloc_shrinker_maps(memcg)) {
4904 mem_cgroup_id_remove(memcg);
4905 return -ENOMEM;
4906 }
4907
4908 /* Online state pins memcg ID, memcg ID pins CSS */
4909 refcount_set(&memcg->id.ref, 1);
4910 css_get(css);
4911 return 0;
4912}
4913
4914static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4915{
4916 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4917 struct mem_cgroup_event *event, *tmp;
4918
4919 /*
4920 * Unregister events and notify userspace.
4921 * Notify userspace about cgroup removing only after rmdir of cgroup
4922 * directory to avoid race between userspace and kernelspace.
4923 */
4924 spin_lock(&memcg->event_list_lock);
4925 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4926 list_del_init(&event->list);
4927 schedule_work(&event->remove);
4928 }
4929 spin_unlock(&memcg->event_list_lock);
4930
4931 page_counter_set_min(&memcg->memory, 0);
4932 page_counter_set_low(&memcg->memory, 0);
4933
4934 memcg_offline_kmem(memcg);
4935 wb_memcg_offline(memcg);
4936
4937 drain_all_stock(memcg);
4938
4939 mem_cgroup_id_put(memcg);
4940}
4941
4942static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4943{
4944 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4945
4946 invalidate_reclaim_iterators(memcg);
4947}
4948
4949static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4950{
4951 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4952
4953 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4954 static_branch_dec(&memcg_sockets_enabled_key);
4955
4956 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4957 static_branch_dec(&memcg_sockets_enabled_key);
4958
4959 vmpressure_cleanup(&memcg->vmpressure);
4960 cancel_work_sync(&memcg->high_work);
4961 mem_cgroup_remove_from_trees(memcg);
4962 memcg_free_shrinker_maps(memcg);
4963 memcg_free_kmem(memcg);
4964 mem_cgroup_free(memcg);
4965}
4966
4967/**
4968 * mem_cgroup_css_reset - reset the states of a mem_cgroup
4969 * @css: the target css
4970 *
4971 * Reset the states of the mem_cgroup associated with @css.  This is
4972 * invoked when the userland requests disabling on the default hierarchy
4973 * but the memcg is pinned through dependency.  The memcg should stop
4974 * applying policies and should revert to the vanilla state as it may be
4975 * made visible again.
4976 *
4977 * The current implementation only resets the essential configurations.
4978 * This needs to be expanded to cover all the visible parts.
4979 */
4980static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4981{
4982 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4983
4984 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
4985 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4986 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
4987 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
4988 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
4989 page_counter_set_min(&memcg->memory, 0);
4990 page_counter_set_low(&memcg->memory, 0);
4991 memcg->high = PAGE_COUNTER_MAX;
4992 memcg->soft_limit = PAGE_COUNTER_MAX;
4993 memcg_wb_domain_size_changed(memcg);
4994}
4995
4996#ifdef CONFIG_MMU
4997/* Handlers for move charge at task migration. */
4998static int mem_cgroup_do_precharge(unsigned long count)
4999{
5000 int ret;
5001
5002 /* Try a single bulk charge without reclaim first, kswapd may wake */
5003 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5004 if (!ret) {
5005 mc.precharge += count;
5006 return ret;
5007 }
5008
5009 /* Try charges one by one with reclaim, but do not retry */
5010 while (count--) {
5011 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5012 if (ret)
5013 return ret;
5014 mc.precharge++;
5015 cond_resched();
5016 }
5017 return 0;
5018}
5019
5020union mc_target {
5021 struct page *page;
5022 swp_entry_t ent;
5023};
5024
5025enum mc_target_type {
5026 MC_TARGET_NONE = 0,
5027 MC_TARGET_PAGE,
5028 MC_TARGET_SWAP,
5029 MC_TARGET_DEVICE,
5030};
5031
5032static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5033 unsigned long addr, pte_t ptent)
5034{
5035 struct page *page = vm_normal_page(vma, addr, ptent);
5036
5037 if (!page || !page_mapped(page))
5038 return NULL;
5039 if (PageAnon(page)) {
5040 if (!(mc.flags & MOVE_ANON))
5041 return NULL;
5042 } else {
5043 if (!(mc.flags & MOVE_FILE))
5044 return NULL;
5045 }
5046 if (!get_page_unless_zero(page))
5047 return NULL;
5048
5049 return page;
5050}
5051
5052#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5053static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5054 pte_t ptent, swp_entry_t *entry)
5055{
5056 struct page *page = NULL;
5057 swp_entry_t ent = pte_to_swp_entry(ptent);
5058
5059 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5060 return NULL;
5061
5062 /*
5063 * Handle MEMORY_DEVICE_PRIVATE pages: they belong to a device and,
5064 * because they are not accessible by the CPU, they are stored as
5065 * special swap entries in the CPU page table.
5066 */
5067 if (is_device_private_entry(ent)) {
5068 page = device_private_entry_to_page(ent);
5069 /*
5070 * MEMORY_DEVICE_PRIVATE pages have a refcount of 1 when free
5071 * (unlike normal pages), hence the raw page_ref_add_unless().
5072 */
5073 if (!page_ref_add_unless(page, 1, 1))
5074 return NULL;
5075 return page;
5076 }
5077
5078 /*
5079 * Because lookup_swap_cache() updates some statistics counter,
5080 * we call find_get_page() with swapper_space directly.
5081 */
5082 page = find_get_page(swap_address_space(ent), swp_offset(ent));
5083 if (do_memsw_account())
5084 entry->val = ent.val;
5085
5086 return page;
5087}
5088#else
5089static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5090 pte_t ptent, swp_entry_t *entry)
5091{
5092 return NULL;
5093}
5094#endif
5095
5096static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5097 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5098{
5099 struct page *page = NULL;
5100 struct address_space *mapping;
5101 pgoff_t pgoff;
5102
5103 if (!vma->vm_file)
5104 return NULL;
5105 if (!(mc.flags & MOVE_FILE))
5106 return NULL;
5107
5108 mapping = vma->vm_file->f_mapping;
5109 pgoff = linear_page_index(vma, addr);
5110
5111 /* page is moved even if it's not RSS of this task (page-faulted). */
5112#ifdef CONFIG_SWAP
5113 /* shmem/tmpfs may report page out on swap: account for that too. */
5114 if (shmem_mapping(mapping)) {
5115 page = find_get_entry(mapping, pgoff);
5116 if (xa_is_value(page)) {
5117 swp_entry_t swp = radix_to_swp_entry(page);
5118 if (do_memsw_account())
5119 *entry = swp;
5120 page = find_get_page(swap_address_space(swp),
5121 swp_offset(swp));
5122 }
5123 } else
5124 page = find_get_page(mapping, pgoff);
5125#else
5126 page = find_get_page(mapping, pgoff);
5127#endif
5128 return page;
5129}
5130
5131/**
5132 * mem_cgroup_move_account - move account of the page
5133 * @page: the page
5134 * @compound: charge the page as compound or small page
5135 * @from: mem_cgroup which the page is moved from.
5136 * @to: mem_cgroup which the page is moved to. @from != @to.
5137 *
5138 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
5139 *
5140 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
5141 * from old cgroup.
5142 */
5143static int mem_cgroup_move_account(struct page *page,
5144 bool compound,
5145 struct mem_cgroup *from,
5146 struct mem_cgroup *to)
5147{
5148 unsigned long flags;
5149 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5150 int ret;
5151 bool anon;
5152
5153 VM_BUG_ON(from == to);
5154 VM_BUG_ON_PAGE(PageLRU(page), page);
5155 VM_BUG_ON(compound && !PageTransHuge(page));
5156
5157 /*
5158 * Prevent mem_cgroup_migrate() from looking at
5159 * page->mem_cgroup of its source page while we change it.
5160 */
5161 ret = -EBUSY;
5162 if (!trylock_page(page))
5163 goto out;
5164
5165 ret = -EINVAL;
5166 if (page->mem_cgroup != from)
5167 goto out_unlock;
5168
5169 anon = PageAnon(page);
5170
5171 spin_lock_irqsave(&from->move_lock, flags);
5172
5173 if (!anon && page_mapped(page)) {
5174 __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
5175 __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
5176 }
5177
5178 /*
5179 * move_lock grabbed above and caller set from->moved_charge(),
5180 * so mod_memcg_page_state will serialize updates to PageDirty.
5181 * So mapping should be stable for dirty pages.
5182 */
5183 if (!anon && PageDirty(page)) {
5184 struct address_space *mapping = page_mapping(page);
5185
5186 if (mapping_cap_account_dirty(mapping)) {
5187 __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
5188 __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5189 }
5190 }
5191
5192 if (PageWriteback(page)) {
5193 __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
5194 __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5195 }
5196
5197 /*
5198 * It is safe to change page->mem_cgroup here because the page
5199 * is referenced, charged, and isolated - we can't race with
5200 * uncharging, charging, migration, or LRU putback.
5201 */
5202
5203 /* caller should have done css_get */
5204 page->mem_cgroup = to;
5205 spin_unlock_irqrestore(&from->move_lock, flags);
5206
5207 ret = 0;
5208
5209 local_irq_disable();
5210 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5211 memcg_check_events(to, page);
5212 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5213 memcg_check_events(from, page);
5214 local_irq_enable();
5215out_unlock:
5216 unlock_page(page);
5217out:
5218 return ret;
5219}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap entry will be stored in
 *          (can be NULL)
 *
 * Returns:
 *   MC_TARGET_NONE   - the pte is not a target for move charge.
 *   MC_TARGET_PAGE   - the page corresponding to this pte is a target for
 *                      move charge. If @target is not NULL, the page is
 *                      stored in target->page with an extra refcount taken
 *                      (callers should handle it).
 *   MC_TARGET_SWAP   - the swap entry corresponding to this pte is a
 *                      target for charge migration. If @target is not NULL,
 *                      the entry is stored in target->ent.
 *   MC_TARGET_DEVICE - like MC_TARGET_PAGE, but the page is
 *                      MEMORY_DEVICE_PRIVATE (a ZONE_DEVICE page, never on
 *                      the LRU).
 *
 * Called with pte lock held.
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, ptent, &ent);
	else if (pte_none(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only a loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
		if (page->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (is_device_private_page(page))
				ret = MC_TARGET_DEVICE;
			if (target)
				target->page = page;
		}
		if (!ret || !target)
			put_page(page);
	}
	/*
	 * There is a swap entry and a page doesn't exist or isn't charged.
	 * But we cannot move a tail-page in a THP.
	 */
	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD-mapped swapping or file-mapped pages because THP
 * does not support them for now.
 * The caller should make sure that pmd_trans_huge(pmd) is true.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmd));
		return ret;
	}
	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page->mem_cgroup == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			get_page(page);
			target->page = page;
		}
	}
	return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note that there cannot be MC_TARGET_DEVICE for now, as we
		 * do not support transparent huge pages with
		 * MEMORY_DEVICE_PRIVATE, but this might change.
		 */
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;

	struct mm_walk mem_cgroup_count_precharge_walk = {
		.pmd_entry = mem_cgroup_count_precharge_pte_range,
		.mm = mm,
	};
	down_read(&mm->mmap_sem);
	walk_page_range(0, mm->highest_vm_end,
			&mem_cgroup_count_precharge_walk);
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fix up refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge the swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

		mem_cgroup_id_put_many(mc.from, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		mem_cgroup_id_get_many(mc.to, mc.moved_swap);
		css_put_many(&mc.to->css, mc.moved_swap);

		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	struct mm_struct *mm = mc.mm;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	mc.mm = NULL;
	spin_unlock(&mc.lock);

	mmput(mm);
}
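
/*
 * Rough lifecycle of a charge move: mem_cgroup_can_attach() fills in
 * "mc" and precharges mc.to for every movable pte it finds;
 * mem_cgroup_move_task() (the post_attach callback) walks the address
 * space and moves the individual charges; mem_cgroup_clear_mc() then
 * cancels whatever precharge is left and fixes up swap references.
 * mem_cgroup_cancel_attach() unwinds the whole transaction the same way.
 */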

static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *from;
	struct task_struct *leader, *p;
	struct mm_struct *mm;
	unsigned long move_flags;
	int ret = 0;

	/* charge immigration isn't supported on the default hierarchy */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return 0;

	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
	p = NULL;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		WARN_ON_ONCE(p);
		p = leader;
		memcg = mem_cgroup_from_css(css);
	}
	if (!p)
		return 0;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (!move_flags)
		return 0;

	from = mem_cgroup_from_task(p);

	VM_BUG_ON(from == memcg);

	mm = get_task_mm(p);
	if (!mm)
		return 0;
	/* We move charges only when we move an owner of the mm */
	if (mm->owner == p) {
		VM_BUG_ON(mc.from);
		VM_BUG_ON(mc.to);
		VM_BUG_ON(mc.precharge);
		VM_BUG_ON(mc.moved_charge);
		VM_BUG_ON(mc.moved_swap);

		spin_lock(&mc.lock);
		mc.mm = mm;
		mc.from = from;
		mc.to = memcg;
		mc.flags = move_flags;
		spin_unlock(&mc.lock);
		/* We set mc.moving_task later */

		ret = mem_cgroup_precharge_mc(mm);
		if (ret)
			mem_cgroup_clear_mc();
	} else {
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	if (mc.to)
		mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				if (!mem_cgroup_move_account(page, true,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		} else if (target_type == MC_TARGET_DEVICE) {
			page = target.page;
			if (!mem_cgroup_move_account(page, true,
						     mc.from, mc.to)) {
				mc.precharge -= HPAGE_PMD_NR;
				mc.moved_charge += HPAGE_PMD_NR;
			}
			put_page(page);
		}
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		bool device = false;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_DEVICE:
			device = true;
			/* fall through */
		case MC_TARGET_PAGE:
			page = target.page;
			/*
			 * We can have a part of the split pmd here. Moving it
			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in the original
			 * memcg. There should be somebody mapping the head.
			 */
			if (PageTransCompound(page))
				goto put;
			if (!device && isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fix up other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in the
		 * attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(void)
{
	struct mm_walk mem_cgroup_move_charge_walk = {
		.pmd_entry = mem_cgroup_move_charge_pte_range,
		.mm = mc.mm,
	};

	lru_add_drain_all();
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
		/*
		 * Someone who is holding the mmap_sem might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);

	up_read(&mc.mm->mmap_sem);
	atomic_dec(&mc.from->moving_account);
}

static void mem_cgroup_move_task(void)
{
	if (mc.to) {
		mem_cgroup_move_charge();
		mem_cgroup_clear_mc();
	}
}
#else	/* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(void)
{
}
#endif

/*
 * Called when the memory controller is (re)bound to a hierarchy. The root
 * memcg persists across [un]mount cycles, so its hierarchy mode has to be
 * kept in sync with the type of hierarchy it is bound to.
 */
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		root_mem_cgroup->use_hierarchy = true;
	else
		root_mem_cgroup->use_hierarchy = false;
}

static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
	if (value == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);

	return 0;
}

static u64 memory_current_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}

static int memory_min_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}

static ssize_t memory_min_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long min;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &min);
	if (err)
		return err;

	page_counter_set_min(&memcg->memory, min);

	return nbytes;
}

static int memory_low_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}

static ssize_t memory_low_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long low;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &low);
	if (err)
		return err;

	page_counter_set_low(&memcg->memory, low);

	return nbytes;
}

static int memory_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}

static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	memcg->high = high;

	nr_pages = page_counter_read(&memcg->memory);
	if (nr_pages > high)
		try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
					     GFP_KERNEL, true);

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}
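
/*
 * Illustrative use of the "high" tunable above (shell session, assuming
 * a conventional cgroup2 mount at /sys/fs/cgroup):
 *
 *	# echo 512M > /sys/fs/cgroup/<group>/memory.high
 *
 * page_counter_memparse() accepts a byte count (with an optional K/M/G
 * suffix) or the literal "max" (PAGE_COUNTER_MAX). If usage already
 * exceeds the new high, the write reclaims the excess once; memory.high
 * is otherwise enforced through reclaim pressure, never by OOM killing.
 */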

static int memory_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}

static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->memory.max, max);

	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);

		if (nr_pages <= max)
			break;

		if (signal_pending(current))
			return -EINTR;

		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (nr_reclaims) {
			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
							  GFP_KERNEL, true))
				nr_reclaims--;
			continue;
		}

		memcg_memory_event(memcg, MEMCG_OOM);
		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
			break;
	}

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}
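
/*
 * Illustrative use of the "max" tunable above (shell session, assuming
 * a conventional cgroup2 mount):
 *
 *	# echo 1G > /sys/fs/cgroup/<group>/memory.max
 *	# echo max > /sys/fs/cgroup/<group>/memory.max	# remove the limit
 *
 * Unlike memory.high, shrinking memory.max below current usage will,
 * after draining the per-cpu stocks and MEM_CGROUP_RECLAIM_RETRIES
 * failed rounds of reclaim, invoke the OOM killer on the group.
 */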

static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
{
	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
	seq_printf(m, "oom_kill %lu\n",
		   atomic_long_read(&events[MEMCG_OOM_KILL]));
}

static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events);
	return 0;
}

static int memory_events_local_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events_local);
	return 0;
}

static int memory_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	char *buf;

	buf = memory_stat_format(memcg);
	if (!buf)
		return -ENOMEM;
	seq_puts(m, buf);
	kfree(buf);
	return 0;
}

static int memory_oom_group_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "%d\n", memcg->oom_group);

	return 0;
}

static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ret, oom_group;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	ret = kstrtoint(buf, 0, &oom_group);
	if (ret)
		return ret;

	if (oom_group != 0 && oom_group != 1)
		return -EINVAL;

	memcg->oom_group = oom_group;

	return nbytes;
}

static struct cftype memory_files[] = {
	{
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		.name = "min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_min_show,
		.write = memory_min_write,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		.name = "events.local",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_local_file),
		.seq_show = memory_events_local_show,
	},
	{
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_stat_show,
	},
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
		.seq_show = memory_oom_group_show,
		.write = memory_oom_group_write,
	},
	{ }	/* terminate */
};
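
/*
 * The table above creates the cgroup2 interface files memory.current,
 * memory.min, memory.low, memory.high, memory.max, memory.events,
 * memory.events.local, memory.stat and memory.oom.group under each
 * non-root group, e.g. /sys/fs/cgroup/<group>/memory.max (path assumes
 * the conventional cgroup2 mount point).
 */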

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

/**
 * mem_cgroup_protected - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 *
 * Returns one of the following:
 *   MEMCG_PROT_NONE: cgroup memory is not protected
 *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is an
 *     unprotected supply of reclaimable memory from other cgroups.
 *   MEMCG_PROT_MIN: cgroup memory is protected
 *
 * @root is exclusive; it is never protected when looked at directly.
 *
 * To provide proper hierarchical behavior, effective memory.min/low values
 * are used. The effective memory.low is always equal to or less than the
 * original memory.low. If there is no memory.low overcommitment (which is
 * always true for top-level memory cgroups), the two values are equal.
 * Otherwise, it's a part of the parent's effective memory.low, calculated
 * as the cgroup's memory.low usage divided by the sum of the siblings'
 * memory.low usages, where memory.low usage is the size of actually
 * protected memory:
 *
 *                                              low_usage
 *	elow = min( memory.low, parent->elow * ------------------ )
 *                                             siblings_low_usage
 *
 *	low_usage = min(memory.low, memory.current)
 *
 * The effective memory.min is calculated in the same way from memory.min
 * and parent->emin.
 */
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
						struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;
	unsigned long emin, parent_emin;
	unsigned long elow, parent_elow;
	unsigned long usage;

	if (mem_cgroup_disabled())
		return MEMCG_PROT_NONE;

	if (!root)
		root = root_mem_cgroup;
	if (memcg == root)
		return MEMCG_PROT_NONE;

	usage = page_counter_read(&memcg->memory);
	if (!usage)
		return MEMCG_PROT_NONE;

	emin = memcg->memory.min;
	elow = memcg->memory.low;

	parent = parent_mem_cgroup(memcg);
	/* No parent means a non-hierarchical mode on v1 memcg */
	if (!parent)
		return MEMCG_PROT_NONE;

	if (parent == root)
		goto exit;

	parent_emin = READ_ONCE(parent->memory.emin);
	emin = min(emin, parent_emin);
	if (emin && parent_emin) {
		unsigned long min_usage, siblings_min_usage;

		min_usage = min(usage, memcg->memory.min);
		siblings_min_usage = atomic_long_read(
			&parent->memory.children_min_usage);

		if (min_usage && siblings_min_usage)
			emin = min(emin, parent_emin * min_usage /
				   siblings_min_usage);
	}

	parent_elow = READ_ONCE(parent->memory.elow);
	elow = min(elow, parent_elow);
	if (elow && parent_elow) {
		unsigned long low_usage, siblings_low_usage;

		low_usage = min(usage, memcg->memory.low);
		siblings_low_usage = atomic_long_read(
			&parent->memory.children_low_usage);

		if (low_usage && siblings_low_usage)
			elow = min(elow, parent_elow * low_usage /
				   siblings_low_usage);
	}

exit:
	memcg->memory.emin = emin;
	memcg->memory.elow = elow;

	if (usage <= emin)
		return MEMCG_PROT_MIN;
	else if (usage <= elow)
		return MEMCG_PROT_LOW;
	else
		return MEMCG_PROT_NONE;
}
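
/*
 * Worked example for the proportional distribution above (illustrative
 * numbers, not from the source): a parent with elow = 8G has two
 * children, A (low = 10G, usage = 6G) and B (low = 10G, usage = 2G).
 * low_usage is min(usage, low), so siblings_low_usage = 6G + 2G = 8G,
 * and the children end up with
 *
 *	elow(A) = min(10G, 8G * 6G / 8G) = 6G
 *	elow(B) = min(10G, 8G * 2G / 8G) = 2G
 *
 * i.e. an overcommitted parent protection is split between siblings in
 * proportion to how much of it each sibling actually uses.
 */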

/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 * @compound: charge the page as compound or small page
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge(), or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
			  bool compound)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		if (compound_head(page)->mem_cgroup)
			goto out;

		if (do_swap_account) {
			swp_entry_t ent = { .val = page_private(page), };
			unsigned short id = lookup_swap_cgroup_id(ent);

			rcu_read_lock();
			memcg = mem_cgroup_from_id(id);
			if (memcg && !css_tryget_online(&memcg->css))
				memcg = NULL;
			rcu_read_unlock();
		}
	}

	if (!memcg)
		memcg = get_mem_cgroup_from_mm(mm);

	ret = try_charge(memcg, gfp_mask, nr_pages);

	css_put(&memcg->css);
out:
	*memcgp = memcg;
	return ret;
}

int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, struct mem_cgroup **memcgp,
				bool compound)
{
	struct mem_cgroup *memcg;
	int ret;

	ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
	memcg = *memcgp;
	mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
	return ret;
}

/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 * @compound: charge the page as compound or small page
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration. If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare, bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_PAGE(!page->mapping, page);
	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	commit_charge(page, memcg, lrucare);

	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
	memcg_check_events(memcg, page);
	local_irq_enable();

	if (do_memsw_account() && PageSwapCache(page)) {
		swp_entry_t entry = { .val = page_private(page) };
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry, nr_pages);
	}
}

/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @compound: charge the page as compound or small page
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
			      bool compound)
{
	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	cancel_charge(memcg, nr_pages);
}

struct uncharge_gather {
	struct mem_cgroup *memcg;
	unsigned long pgpgout;
	unsigned long nr_anon;
	unsigned long nr_file;
	unsigned long nr_kmem;
	unsigned long nr_huge;
	unsigned long nr_shmem;
	struct page *dummy_page;
};

static inline void uncharge_gather_clear(struct uncharge_gather *ug)
{
	memset(ug, 0, sizeof(*ug));
}

static void uncharge_batch(const struct uncharge_gather *ug)
{
	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
	unsigned long flags;

	if (!mem_cgroup_is_root(ug->memcg)) {
		page_counter_uncharge(&ug->memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
		memcg_oom_recover(ug->memcg);
	}

	local_irq_save(flags);
	__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
	__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
	memcg_check_events(ug->memcg, ug->dummy_page);
	local_irq_restore(flags);

	if (!mem_cgroup_is_root(ug->memcg))
		css_put_many(&ug->memcg->css, nr_pages);
}

static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
			!PageHWPoison(page), page);

	if (!page->mem_cgroup)
		return;

	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point, we have fully
	 * exclusive access to the page.
	 */
	if (ug->memcg != page->mem_cgroup) {
		if (ug->memcg) {
			uncharge_batch(ug);
			uncharge_gather_clear(ug);
		}
		ug->memcg = page->mem_cgroup;
	}

	if (!PageKmemcg(page)) {
		unsigned int nr_pages = 1;

		if (PageTransHuge(page)) {
			nr_pages <<= compound_order(page);
			ug->nr_huge += nr_pages;
		}
		if (PageAnon(page))
			ug->nr_anon += nr_pages;
		else {
			ug->nr_file += nr_pages;
			if (PageSwapBacked(page))
				ug->nr_shmem += nr_pages;
		}
		ug->pgpgout++;
	} else {
		ug->nr_kmem += 1 << compound_order(page);
		__ClearPageKmemcg(page);
	}

	ug->dummy_page = page;
	page->mem_cgroup = NULL;
}

static void uncharge_list(struct list_head *page_list)
{
	struct uncharge_gather ug;
	struct list_head *next;

	uncharge_gather_clear(&ug);

	/*
	 * Note that the list can be a single page->lru; hence the
	 * do-while loop instead of a simple list_for_each_entry().
	 */
	next = page_list->next;
	do {
		struct page *page;

		page = list_entry(next, struct page, lru);
		next = page->lru.next;

		uncharge_page(page, &ug);
	} while (next != page_list);

	if (ug.memcg)
		uncharge_batch(&ug);
}

/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
void mem_cgroup_uncharge(struct page *page)
{
	struct uncharge_gather ug;

	if (mem_cgroup_disabled())
		return;

	/* Don't touch page->lru of any random page, pre-check: */
	if (!page->mem_cgroup)
		return;

	uncharge_gather_clear(&ug);
	uncharge_page(page, &ug);
	uncharge_batch(&ug);
}

/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled())
		return;

	if (!list_empty(page_list))
		uncharge_list(page_list);
}

/**
 * mem_cgroup_migrate - charge a page's replacement
 * @oldpage: currently circulating page
 * @newpage: replacement page
 *
 * Charge @newpage as a replacement page for @oldpage. @oldpage will
 * be uncharged upon free.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
	struct mem_cgroup *memcg;
	unsigned int nr_pages;
	bool compound;
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
		       newpage);

	if (mem_cgroup_disabled())
		return;

	/* Page cache replacement: new page already charged? */
	if (newpage->mem_cgroup)
		return;

	/* Swapcache readahead pages can get replaced before being charged */
	memcg = oldpage->mem_cgroup;
	if (!memcg)
		return;

	/* Force-charge the new page. The old one will be freed soon */
	compound = PageTransHuge(newpage);
	nr_pages = compound ? hpage_nr_pages(newpage) : 1;

	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);
	css_get_many(&memcg->css, nr_pages);

	commit_charge(newpage, memcg, false);

	local_irq_save(flags);
	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
	memcg_check_events(memcg, newpage);
	local_irq_restore(flags);
}

DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);

void mem_cgroup_sk_alloc(struct sock *sk)
{
	struct mem_cgroup *memcg;

	if (!mem_cgroup_sockets_enabled)
		return;

	/*
	 * Socket cloning can throw us here with sk_memcg already
	 * filled. It won't, however, necessarily happen from
	 * process context. So the test for root memcg given
	 * the current task's memcg won't help us in this case.
	 *
	 * Respecting the original socket's memcg is a better
	 * decision in this case.
	 */
	if (sk->sk_memcg) {
		css_get(&sk->sk_memcg->css);
		return;
	}

	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (memcg == root_mem_cgroup)
		goto out;
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
		goto out;
	if (css_tryget_online(&memcg->css))
		sk->sk_memcg = memcg;
out:
	rcu_read_unlock();
}

void mem_cgroup_sk_free(struct sock *sk)
{
	if (sk->sk_memcg)
		css_put(&sk->sk_memcg->css);
}

/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if the charge had to be forced.
 */
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	gfp_t gfp_mask = GFP_KERNEL;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		struct page_counter *fail;

		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
			memcg->tcpmem_pressure = 0;
			return true;
		}
		page_counter_charge(&memcg->tcpmem, nr_pages);
		memcg->tcpmem_pressure = 1;
		return false;
	}

	/* Don't block in the packet receive path */
	if (in_softirq())
		gfp_mask = GFP_NOWAIT;

	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);

	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
		return true;

	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
	return false;
}

/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		page_counter_uncharge(&memcg->tcpmem, nr_pages);
		return;
	}

	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);

	refill_stock(memcg, nr_pages);
}
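
/*
 * The two socket-accounting schemes above, in short: on cgroup1, skmem
 * is charged to the dedicated tcpmem counter and a failed charge is
 * still forced through (with tcpmem_pressure raised); on cgroup2, it is
 * charged to the unified memory counter via try_charge(), falling back
 * to __GFP_NOFAIL on failure. Either way the caller can rely on the
 * charge existing; the return value only reports whether it fit.
 */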

static int __init cgroup_memory(char *s)
{
	char *token;

	while ((token = strsep(&s, ",")) != NULL) {
		if (!*token)
			continue;
		if (!strcmp(token, "nosocket"))
			cgroup_memory_nosocket = true;
		if (!strcmp(token, "nokmem"))
			cgroup_memory_nokmem = true;
	}
	return 0;
}
__setup("cgroup.memory=", cgroup_memory);
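
/*
 * Example kernel command line (illustrative):
 *
 *	cgroup.memory=nosocket,nokmem
 *
 * disables socket memory accounting and kernel memory accounting for the
 * memory controller; tokens are comma-separated and unknown tokens are
 * silently ignored.
 */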

/*
 * subsys_initcall() for the memory controller.
 *
 * Some parts like hotcpu_notifier() have to be initialized from this context
 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
 * everything that doesn't depend on a specific mem_cgroup structure should
 * be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu, node;

#ifdef CONFIG_MEMCG_KMEM
	/*
	 * Kmem cache creation is mostly done with the slab_mutex held,
	 * so use a workqueue with limited concurrency to avoid stalling
	 * all worker threads in case lots of cgroups are created and
	 * destroyed simultaneously.
	 */
	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
	BUG_ON(!memcg_kmem_cache_wq);
#endif

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
				    node_online(node) ? node : NUMA_NO_NODE);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(mem_cgroup_init);

#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}

/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned int nr_entries;
	unsigned short oldid;

	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);

	if (!do_memsw_account())
		return;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	nr_entries = hpage_nr_pages(page);
	/* Get references for the tail pages, too */
	if (nr_entries > 1)
		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
				   nr_entries);
	VM_BUG_ON_PAGE(oldid, page);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	page->mem_cgroup = NULL;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	VM_BUG_ON(!irqs_disabled());
	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
				     -nr_entries);
	memcg_check_events(memcg, page);

	if (!mem_cgroup_is_root(memcg))
		css_put_many(&memcg->css, nr_entries);
}
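
/*
 * Note: mem_cgroup_swapout() above serves the cgroup1 "memsw" scheme,
 * where memory and swap share one counter and swapping a page out only
 * transfers its charge to the swap record. mem_cgroup_try_charge_swap()
 * below serves cgroup2, where swap is a separate counter with its own
 * memory.swap.max limit.
 */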

/**
 * mem_cgroup_try_charge_swap - try charging swap space for a page
 * @page: page being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @page's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
	unsigned int nr_pages = hpage_nr_pages(page);
	struct page_counter *counter;
	struct mem_cgroup *memcg;
	unsigned short oldid;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
		return 0;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return 0;

	if (!entry.val) {
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		return 0;
	}

	memcg = mem_cgroup_id_get_online(memcg);

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		mem_cgroup_id_put(memcg);
		return -ENOMEM;
	}

	/* Get references for the tail pages, too */
	if (nr_pages > 1)
		mem_cgroup_id_get_many(memcg, nr_pages - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
	VM_BUG_ON_PAGE(oldid, page);
	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);

	return 0;
}

/**
 * mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(entry, 0, nr_pages);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg)) {
			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
				page_counter_uncharge(&memcg->swap, nr_pages);
			else
				page_counter_uncharge(&memcg->memsw, nr_pages);
		}
		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
		mem_cgroup_id_put_many(memcg, nr_pages);
	}
	rcu_read_unlock();
}

long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return nr_swap_pages;
	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.max) -
				      page_counter_read(&memcg->swap));
	return nr_swap_pages;
}

bool mem_cgroup_swap_full(struct page *page)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (vm_swap_full())
		return true;
	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return false;

	memcg = page->mem_cgroup;
	if (!memcg)
		return false;

	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
			return true;

	return false;
}
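
/*
 * Illustrative check above: with memory.swap.max = 1G on some ancestor,
 * mem_cgroup_swap_full() starts returning true once that group's swap
 * usage reaches 512M, mirroring vm_swap_full()'s global "half full"
 * heuristic at cgroup scope.
 */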

/* remember the boot-time swapaccount= option for mem_cgroup_swap_init() */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);
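
/*
 * Example kernel command line (illustrative):
 *
 *	swapaccount=0
 *
 * overrides the CONFIG_MEMCG_SWAP_ENABLED default and disables swap
 * accounting; "swapaccount=1" force-enables it. Any other value leaves
 * the compiled-in default in place.
 */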

static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}

static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}

static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static int __init mem_cgroup_swap_init(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account) {
		do_swap_account = 1;
		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
					       swap_files));
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
						  memsw_cgroup_files));
	}
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */