1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * Prefix the format string of this file's pr_*() messages (e.g. the
 * pr_warn() in rebind_subsystems()) with the module name and ": ".
 * Defined ahead of the includes below.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include "cgroup-internal.h"
32
33#include <linux/cred.h>
34#include <linux/errno.h>
35#include <linux/init_task.h>
36#include <linux/kernel.h>
37#include <linux/magic.h>
38#include <linux/mutex.h>
39#include <linux/mount.h>
40#include <linux/pagemap.h>
41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
43#include <linux/sched.h>
44#include <linux/sched/task.h>
45#include <linux/slab.h>
46#include <linux/spinlock.h>
47#include <linux/percpu-rwsem.h>
48#include <linux/string.h>
49#include <linux/hashtable.h>
50#include <linux/idr.h>
51#include <linux/kthread.h>
52#include <linux/atomic.h>
53#include <linux/cpuset.h>
54#include <linux/proc_ns.h>
55#include <linux/nsproxy.h>
56#include <linux/file.h>
57#include <linux/fs_parser.h>
58#include <linux/sched/cputime.h>
59#include <linux/psi.h>
60#include <net/sock.h>
61
62#define CREATE_TRACE_POINTS
63#include <trace/events/cgroup.h>
64
/*
 * Size of a control-file name buffer.  Used as the bound passed to
 * snprintf()/strscpy() in cgroup_file_name() and as the size of the
 * on-stack array in cgroup_rm_file().
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/*
 * HZ / 100 rounded up, i.e. 10ms expressed in jiffies and never zero.
 * NOTE(review): presumably the minimum interval between file
 * notifications; the user is outside this part of the file - confirm.
 */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
69
70
71
72
73
74
75
76
77
78
79
/*
 * cgroup_mutex: asserted held (lockdep_assert_held()) by the functions in
 * this file that modify the hierarchy, e.g. find_css_set(),
 * rebind_subsystems() and cgroup_rm_file().
 *
 * css_set_lock: asserted held by the css_set helpers below
 * (css_set_populated(), css_set_move_task(), put_css_set_locked(), ...)
 * and taken with spin_lock_irq() by their callers in this file.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

/* Both locks are exported only when CONFIG_PROVE_RCU is enabled. */
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Path buffer of TRACE_CGROUP_PATH_LEN bytes plus a spinlock.
 * NOTE(review): presumably the lock serialises use of the buffer by the
 * TRACE_CGROUP_PATH() helper, which is defined outside this file - confirm.
 */
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;

/*
 * Taken with spin_lock_bh() by cgroup_idr_alloc(), cgroup_idr_replace()
 * and cgroup_idr_remove() around the underlying idr_*() calls.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Taken by cgroup_rm_file() while it clears cgroup_file->kn.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/* Per-cpu rw-semaphore; no user is visible in this part of the file. */
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
105
/*
 * Lockdep check: warn unless the caller is inside an RCU read-side
 * critical section or holds cgroup_mutex.
 *
 * The definition deliberately carries no trailing semicolon; the call
 * site supplies it.  A semicolon inside the macro would make
 * "cgroup_assert_mutex_or_rcu_locked();" expand to two statements, which
 * breaks when the macro is used as the body of an unbraced if/else.
 */
#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required")
110
111
112
113
114
115
116
/*
 * Workqueue handle; it is neither created nor used in this part of the
 * file.  NOTE(review): judging by the name it carries cgroup destruction
 * work - confirm against the code that queues onto it.
 */
static struct workqueue_struct *cgroup_destroy_wq;
118
119
/*
 * The tables below are generated by redefining SUBSYS() and re-including
 * <linux/cgroup_subsys.h> each time (presumably a list of SUBSYS(name)
 * invocations, one per configured controller).  Every table is indexed
 * by the subsystem id <name>_cgrp_id.
 */

/* subsystem id -> pointer to the subsystem descriptor <name>_cgrp_subsys */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* subsystem id -> subsystem name as a string (the stringified SUBSYS arg) */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Define and export two static keys per subsystem, both initially true:
 * <name>_cgrp_subsys_enabled_key and <name>_cgrp_subsys_on_dfl_key.
 */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/* subsystem id -> its "enabled" static key; read by cgroup_ssid_enabled() */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * subsystem id -> its "on default hierarchy" static key; switched on/off
 * by rebind_subsystems() depending on the destination root.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
153
/* Per-cpu rstat storage wired into the default root's cgroup below. */
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/*
 * The default hierarchy root.  cgroup_on_dfl() identifies cgroups on the
 * default hierarchy by comparing their ->root against this object.
 */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * NOTE(review): not referenced in this part of the file; presumably
 * records whether the default hierarchy has been made visible - confirm.
 */
static bool cgrp_dfl_visible;

/*
 * Subsystem-id bitmask removed from the default root's control mask by
 * cgroup_control().
 */
static u16 cgrp_dfl_inhibit_ss_mask;

/*
 * Subsystem-id bitmask that is also removed from the default root's
 * control mask by cgroup_control(), but is always OR-ed into the result
 * of cgroup_calc_subtree_ss_mask().
 */
static u16 cgrp_dfl_implicit_ss_mask;

/*
 * Subsystem-id bitmask that cgroup_control() and cgroup_ss_mask() apply
 * as a filter for threaded cgroups.
 */
static u16 cgrp_dfl_threaded_ss_mask;

/*
 * List of hierarchy roots and its element count.  The count is
 * decremented by cgroup_destroy_root() and is the number of links
 * find_css_set() allocates for a new css_set.
 */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* Hierarchy-id allocator; see cgroup_init_root_id()/cgroup_exit_root_id(). */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Next css serial number, starting at 1.  Not used in this part of the
 * file.
 */
static u64 css_serial_nr_next = 1;

/*
 * NOTE(review): not used in this part of the file; presumably
 * subsystem-id bitmasks of the controllers that implement the
 * corresponding callback - confirm where they are set.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/*
 * The initial cgroup namespace: owned by init_user_ns, rooted at
 * init_css_set, with an initial reference count of 2.
 */
struct cgroup_namespace init_cgroup_ns = {
	.count		= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};
208
/*
 * Forward declarations of objects and functions that are referenced
 * below before their definitions (e.g. cgroup_base_files in
 * css_clear_dir(), cgroup_apply_control() in rebind_subsystems(),
 * css_task_iter_skip() in css_set_skip_task_iters()).
 */
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
224
225
226
227
228
229
230
231
232
233bool cgroup_ssid_enabled(int ssid)
234{
235 if (CGROUP_SUBSYS_COUNT == 0)
236 return false;
237
238 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
239}
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291bool cgroup_on_dfl(const struct cgroup *cgrp)
292{
293 return cgrp->root == &cgrp_dfl_root;
294}
295
296
297static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
298 gfp_t gfp_mask)
299{
300 int ret;
301
302 idr_preload(gfp_mask);
303 spin_lock_bh(&cgroup_idr_lock);
304 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
305 spin_unlock_bh(&cgroup_idr_lock);
306 idr_preload_end();
307 return ret;
308}
309
310static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
311{
312 void *ret;
313
314 spin_lock_bh(&cgroup_idr_lock);
315 ret = idr_replace(idr, ptr, id);
316 spin_unlock_bh(&cgroup_idr_lock);
317 return ret;
318}
319
320static void cgroup_idr_remove(struct idr *idr, int id)
321{
322 spin_lock_bh(&cgroup_idr_lock);
323 idr_remove(idr, id);
324 spin_unlock_bh(&cgroup_idr_lock);
325}
326
327static bool cgroup_has_tasks(struct cgroup *cgrp)
328{
329 return cgrp->nr_populated_csets;
330}
331
332bool cgroup_is_threaded(struct cgroup *cgrp)
333{
334 return cgrp->dom_cgrp != cgrp;
335}
336
337
338static bool cgroup_is_mixable(struct cgroup *cgrp)
339{
340
341
342
343
344
345 return !cgroup_parent(cgrp);
346}
347
348
349static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
350{
351
352 if (cgroup_is_mixable(cgrp))
353 return true;
354
355
356 if (cgroup_is_threaded(cgrp))
357 return false;
358
359
360 if (cgrp->nr_populated_domain_children)
361 return false;
362
363
364 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
365 return false;
366
367 return true;
368}
369
370
371bool cgroup_is_thread_root(struct cgroup *cgrp)
372{
373
374 if (cgroup_is_threaded(cgrp))
375 return false;
376
377
378 if (cgrp->nr_threaded_children)
379 return true;
380
381
382
383
384
385 if (cgroup_has_tasks(cgrp) &&
386 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
387 return true;
388
389 return false;
390}
391
392
393static bool cgroup_is_valid_domain(struct cgroup *cgrp)
394{
395
396 if (cgroup_is_threaded(cgrp))
397 return false;
398
399
400 while ((cgrp = cgroup_parent(cgrp))) {
401 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
402 return false;
403 if (cgroup_is_threaded(cgrp))
404 return false;
405 }
406
407 return true;
408}
409
410
411static u16 cgroup_control(struct cgroup *cgrp)
412{
413 struct cgroup *parent = cgroup_parent(cgrp);
414 u16 root_ss_mask = cgrp->root->subsys_mask;
415
416 if (parent) {
417 u16 ss_mask = parent->subtree_control;
418
419
420 if (cgroup_is_threaded(cgrp))
421 ss_mask &= cgrp_dfl_threaded_ss_mask;
422 return ss_mask;
423 }
424
425 if (cgroup_on_dfl(cgrp))
426 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
427 cgrp_dfl_implicit_ss_mask);
428 return root_ss_mask;
429}
430
431
432static u16 cgroup_ss_mask(struct cgroup *cgrp)
433{
434 struct cgroup *parent = cgroup_parent(cgrp);
435
436 if (parent) {
437 u16 ss_mask = parent->subtree_ss_mask;
438
439
440 if (cgroup_is_threaded(cgrp))
441 ss_mask &= cgrp_dfl_threaded_ss_mask;
442 return ss_mask;
443 }
444
445 return cgrp->root->subsys_mask;
446}
447
448
449
450
451
452
453
454
455
456
457
458
459static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
460 struct cgroup_subsys *ss)
461{
462 if (ss)
463 return rcu_dereference_check(cgrp->subsys[ss->id],
464 lockdep_is_held(&cgroup_mutex));
465 else
466 return &cgrp->self;
467}
468
469
470
471
472
473
474
475
476
477static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
478 struct cgroup_subsys *ss)
479{
480 struct cgroup_subsys_state *css;
481
482 rcu_read_lock();
483 css = cgroup_css(cgrp, ss);
484 if (css && !css_tryget_online(css))
485 css = NULL;
486 rcu_read_unlock();
487
488 return css;
489}
490
491
492
493
494
495
496
497
498
499
500
501static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
502 struct cgroup_subsys *ss)
503{
504 lockdep_assert_held(&cgroup_mutex);
505
506 if (!ss)
507 return &cgrp->self;
508
509
510
511
512
513 while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
514 cgrp = cgroup_parent(cgrp);
515 if (!cgrp)
516 return NULL;
517 }
518
519 return cgroup_css(cgrp, ss);
520}
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
536 struct cgroup_subsys *ss)
537{
538 struct cgroup_subsys_state *css;
539
540 do {
541 css = cgroup_css(cgrp, ss);
542
543 if (css)
544 return css;
545 cgrp = cgroup_parent(cgrp);
546 } while (cgrp);
547
548 return init_css_set.subsys[ss->id];
549}
550
551
552
553
554
555
556
557
558
559
560
561
562struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
563 struct cgroup_subsys *ss)
564{
565 struct cgroup_subsys_state *css;
566
567 rcu_read_lock();
568
569 do {
570 css = cgroup_css(cgrp, ss);
571
572 if (css && css_tryget_online(css))
573 goto out_unlock;
574 cgrp = cgroup_parent(cgrp);
575 } while (cgrp);
576
577 css = init_css_set.subsys[ss->id];
578 css_get(css);
579out_unlock:
580 rcu_read_unlock();
581 return css;
582}
583
584static void cgroup_get_live(struct cgroup *cgrp)
585{
586 WARN_ON_ONCE(cgroup_is_dead(cgrp));
587 css_get(&cgrp->self);
588}
589
590
591
592
593
594
595int __cgroup_task_count(const struct cgroup *cgrp)
596{
597 int count = 0;
598 struct cgrp_cset_link *link;
599
600 lockdep_assert_held(&css_set_lock);
601
602 list_for_each_entry(link, &cgrp->cset_links, cset_link)
603 count += link->cset->nr_tasks;
604
605 return count;
606}
607
608
609
610
611
612int cgroup_task_count(const struct cgroup *cgrp)
613{
614 int count;
615
616 spin_lock_irq(&css_set_lock);
617 count = __cgroup_task_count(cgrp);
618 spin_unlock_irq(&css_set_lock);
619
620 return count;
621}
622
623struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
624{
625 struct cgroup *cgrp = of->kn->parent->priv;
626 struct cftype *cft = of_cft(of);
627
628
629
630
631
632
633
634
635
636 if (cft->ss)
637 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
638 else
639 return &cgrp->self;
640}
641EXPORT_SYMBOL_GPL(of_css);
642
643
644
645
646
647
648
649
650
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Visits every subsystem id and runs the loop body only for ids where
 * @cgrp->subsys[] is non-NULL.  The pointer is read with
 * rcu_dereference_check(), so the caller needs the RCU read lock or
 * cgroup_mutex.  The trailing "else" makes the statement following the
 * macro the loop body.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
657
658
659
660
661
662
663
664
665
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Like for_each_css() but resolves each subsystem through
 * cgroup_e_css_by_mask(), which asserts that cgroup_mutex is held.
 * Subsystems for which that returns NULL are skipped.
 */
#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else
672
673
674
675
676
677
678
679
680
681
/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block opened by this macro runs once for every subsystem whose bit
 * is set in @ss_mask and MUST be closed with while_each_subsys_mask().
 * @ss_mask is evaluated once, into a local copy.  When no subsystems are
 * configured the body is skipped entirely and @ssid is set to 0.
 *
 * Note that the whole construct is wrapped in do { } while (false), so a
 * "break" in the body only leaves the inner for_each_set_bit() loop and
 * a "return" in the body leaves the enclosing function.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) {					\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

/* Closes the block opened by do_each_subsys_mask(). */
#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
696
697
/*
 * Iterate over the children of @cgrp, skipping those for which
 * cgroup_is_dead() is true.  Asserts on every step that cgroup_mutex is
 * held.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
704
705
/*
 * Walk the descendants of @cgrp's self css with
 * css_for_each_descendant_pre(), storing each css in @d_css and its
 * cgroup in @dsct, and skipping dead cgroups.  Asserts on every step
 * that cgroup_mutex is held.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
713
714
/*
 * Same as cgroup_for_each_live_descendant_pre() but built on
 * css_for_each_descendant_post().
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
722
723
724
725
726
727
728
729
/*
 * The statically allocated initial css_set.  It starts with one
 * reference, is its own domain css_set, has all of its lists empty and
 * points at the default hierarchy's root cgroup.  It is also the root
 * css_set of init_cgroup_ns and the fallback used by cgroup_e_css() and
 * cgroup_get_e_css().
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};

/*
 * Number of css_sets; starts at 1 to account for init_css_set.
 * Incremented in find_css_set() and decremented in put_css_set_locked().
 */
static int css_set_count	= 1;
752
753static bool css_set_threaded(struct css_set *cset)
754{
755 return cset->dom_cset != cset;
756}
757
758
759
760
761
762
763
764
765
766
767static bool css_set_populated(struct css_set *cset)
768{
769 lockdep_assert_held(&css_set_lock);
770
771 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
772}
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
792{
793 struct cgroup *child = NULL;
794 int adj = populated ? 1 : -1;
795
796 lockdep_assert_held(&css_set_lock);
797
798 do {
799 bool was_populated = cgroup_is_populated(cgrp);
800
801 if (!child) {
802 cgrp->nr_populated_csets += adj;
803 } else {
804 if (cgroup_is_threaded(child))
805 cgrp->nr_populated_threaded_children += adj;
806 else
807 cgrp->nr_populated_domain_children += adj;
808 }
809
810 if (was_populated == cgroup_is_populated(cgrp))
811 break;
812
813 cgroup1_check_for_release(cgrp);
814 TRACE_CGROUP_PATH(notify_populated, cgrp,
815 cgroup_is_populated(cgrp));
816 cgroup_file_notify(&cgrp->events_file);
817
818 child = cgrp;
819 cgrp = cgroup_parent(cgrp);
820 } while (cgrp);
821}
822
823
824
825
826
827
828
829
830
831static void css_set_update_populated(struct css_set *cset, bool populated)
832{
833 struct cgrp_cset_link *link;
834
835 lockdep_assert_held(&css_set_lock);
836
837 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
838 cgroup_update_populated(link->cgrp, populated);
839}
840
841
842
843
844
845
846
847static void css_set_skip_task_iters(struct css_set *cset,
848 struct task_struct *task)
849{
850 struct css_task_iter *it, *pos;
851
852 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
853 css_task_iter_skip(it, task);
854}
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871static void css_set_move_task(struct task_struct *task,
872 struct css_set *from_cset, struct css_set *to_cset,
873 bool use_mg_tasks)
874{
875 lockdep_assert_held(&css_set_lock);
876
877 if (to_cset && !css_set_populated(to_cset))
878 css_set_update_populated(to_cset, true);
879
880 if (from_cset) {
881 WARN_ON_ONCE(list_empty(&task->cg_list));
882
883 css_set_skip_task_iters(from_cset, task);
884 list_del_init(&task->cg_list);
885 if (!css_set_populated(from_cset))
886 css_set_update_populated(from_cset, false);
887 } else {
888 WARN_ON_ONCE(!list_empty(&task->cg_list));
889 }
890
891 if (to_cset) {
892
893
894
895
896
897 WARN_ON_ONCE(task->flags & PF_EXITING);
898
899 cgroup_move_task(task, to_cset);
900 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
901 &to_cset->tasks);
902 }
903}
904
905
906
907
908
909
/*
 * Hash table of css_sets with 2^CSS_SET_HASH_BITS buckets.  Entries are
 * keyed by css_set_hash() of their subsys[] array; see find_css_set()
 * (insert), find_existing_css_set() (lookup) and put_css_set_locked()
 * (removal).
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
912
913static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
914{
915 unsigned long key = 0UL;
916 struct cgroup_subsys *ss;
917 int i;
918
919 for_each_subsys(ss, i)
920 key += (unsigned long)css[i];
921 key = (key >> 16) ^ key;
922
923 return key;
924}
925
926void put_css_set_locked(struct css_set *cset)
927{
928 struct cgrp_cset_link *link, *tmp_link;
929 struct cgroup_subsys *ss;
930 int ssid;
931
932 lockdep_assert_held(&css_set_lock);
933
934 if (!refcount_dec_and_test(&cset->refcount))
935 return;
936
937 WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
938
939
940 for_each_subsys(ss, ssid) {
941 list_del(&cset->e_cset_node[ssid]);
942 css_put(cset->subsys[ssid]);
943 }
944 hash_del(&cset->hlist);
945 css_set_count--;
946
947 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
948 list_del(&link->cset_link);
949 list_del(&link->cgrp_link);
950 if (cgroup_parent(link->cgrp))
951 cgroup_put(link->cgrp);
952 kfree(link);
953 }
954
955 if (css_set_threaded(cset)) {
956 list_del(&cset->threaded_csets_node);
957 put_css_set_locked(cset->dom_cset);
958 }
959
960 kfree_rcu(cset, rcu_head);
961}
962
963
964
965
966
967
968
969
970
971
972
973static bool compare_css_sets(struct css_set *cset,
974 struct css_set *old_cset,
975 struct cgroup *new_cgrp,
976 struct cgroup_subsys_state *template[])
977{
978 struct cgroup *new_dfl_cgrp;
979 struct list_head *l1, *l2;
980
981
982
983
984
985
986 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
987 return false;
988
989
990
991 if (cgroup_on_dfl(new_cgrp))
992 new_dfl_cgrp = new_cgrp;
993 else
994 new_dfl_cgrp = old_cset->dfl_cgrp;
995
996 if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
997 return false;
998
999
1000
1001
1002
1003
1004
1005 l1 = &cset->cgrp_links;
1006 l2 = &old_cset->cgrp_links;
1007 while (1) {
1008 struct cgrp_cset_link *link1, *link2;
1009 struct cgroup *cgrp1, *cgrp2;
1010
1011 l1 = l1->next;
1012 l2 = l2->next;
1013
1014 if (l1 == &cset->cgrp_links) {
1015 BUG_ON(l2 != &old_cset->cgrp_links);
1016 break;
1017 } else {
1018 BUG_ON(l2 == &old_cset->cgrp_links);
1019 }
1020
1021 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
1022 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
1023 cgrp1 = link1->cgrp;
1024 cgrp2 = link2->cgrp;
1025
1026 BUG_ON(cgrp1->root != cgrp2->root);
1027
1028
1029
1030
1031
1032
1033
1034
1035 if (cgrp1->root == new_cgrp->root) {
1036 if (cgrp1 != new_cgrp)
1037 return false;
1038 } else {
1039 if (cgrp1 != cgrp2)
1040 return false;
1041 }
1042 }
1043 return true;
1044}
1045
1046
1047
1048
1049
1050
1051
1052static struct css_set *find_existing_css_set(struct css_set *old_cset,
1053 struct cgroup *cgrp,
1054 struct cgroup_subsys_state *template[])
1055{
1056 struct cgroup_root *root = cgrp->root;
1057 struct cgroup_subsys *ss;
1058 struct css_set *cset;
1059 unsigned long key;
1060 int i;
1061
1062
1063
1064
1065
1066
1067 for_each_subsys(ss, i) {
1068 if (root->subsys_mask & (1UL << i)) {
1069
1070
1071
1072
1073 template[i] = cgroup_e_css_by_mask(cgrp, ss);
1074 } else {
1075
1076
1077
1078
1079 template[i] = old_cset->subsys[i];
1080 }
1081 }
1082
1083 key = css_set_hash(template);
1084 hash_for_each_possible(css_set_table, cset, hlist, key) {
1085 if (!compare_css_sets(cset, old_cset, cgrp, template))
1086 continue;
1087
1088
1089 return cset;
1090 }
1091
1092
1093 return NULL;
1094}
1095
1096static void free_cgrp_cset_links(struct list_head *links_to_free)
1097{
1098 struct cgrp_cset_link *link, *tmp_link;
1099
1100 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
1101 list_del(&link->cset_link);
1102 kfree(link);
1103 }
1104}
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1115{
1116 struct cgrp_cset_link *link;
1117 int i;
1118
1119 INIT_LIST_HEAD(tmp_links);
1120
1121 for (i = 0; i < count; i++) {
1122 link = kzalloc(sizeof(*link), GFP_KERNEL);
1123 if (!link) {
1124 free_cgrp_cset_links(tmp_links);
1125 return -ENOMEM;
1126 }
1127 list_add(&link->cset_link, tmp_links);
1128 }
1129 return 0;
1130}
1131
1132
1133
1134
1135
1136
1137
1138static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1139 struct cgroup *cgrp)
1140{
1141 struct cgrp_cset_link *link;
1142
1143 BUG_ON(list_empty(tmp_links));
1144
1145 if (cgroup_on_dfl(cgrp))
1146 cset->dfl_cgrp = cgrp;
1147
1148 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1149 link->cset = cset;
1150 link->cgrp = cgrp;
1151
1152
1153
1154
1155
1156 list_move_tail(&link->cset_link, &cgrp->cset_links);
1157 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1158
1159 if (cgroup_parent(cgrp))
1160 cgroup_get_live(cgrp);
1161}
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171static struct css_set *find_css_set(struct css_set *old_cset,
1172 struct cgroup *cgrp)
1173{
1174 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1175 struct css_set *cset;
1176 struct list_head tmp_links;
1177 struct cgrp_cset_link *link;
1178 struct cgroup_subsys *ss;
1179 unsigned long key;
1180 int ssid;
1181
1182 lockdep_assert_held(&cgroup_mutex);
1183
1184
1185
1186 spin_lock_irq(&css_set_lock);
1187 cset = find_existing_css_set(old_cset, cgrp, template);
1188 if (cset)
1189 get_css_set(cset);
1190 spin_unlock_irq(&css_set_lock);
1191
1192 if (cset)
1193 return cset;
1194
1195 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1196 if (!cset)
1197 return NULL;
1198
1199
1200 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1201 kfree(cset);
1202 return NULL;
1203 }
1204
1205 refcount_set(&cset->refcount, 1);
1206 cset->dom_cset = cset;
1207 INIT_LIST_HEAD(&cset->tasks);
1208 INIT_LIST_HEAD(&cset->mg_tasks);
1209 INIT_LIST_HEAD(&cset->dying_tasks);
1210 INIT_LIST_HEAD(&cset->task_iters);
1211 INIT_LIST_HEAD(&cset->threaded_csets);
1212 INIT_HLIST_NODE(&cset->hlist);
1213 INIT_LIST_HEAD(&cset->cgrp_links);
1214 INIT_LIST_HEAD(&cset->mg_preload_node);
1215 INIT_LIST_HEAD(&cset->mg_node);
1216
1217
1218
1219 memcpy(cset->subsys, template, sizeof(cset->subsys));
1220
1221 spin_lock_irq(&css_set_lock);
1222
1223 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1224 struct cgroup *c = link->cgrp;
1225
1226 if (c->root == cgrp->root)
1227 c = cgrp;
1228 link_css_set(&tmp_links, cset, c);
1229 }
1230
1231 BUG_ON(!list_empty(&tmp_links));
1232
1233 css_set_count++;
1234
1235
1236 key = css_set_hash(cset->subsys);
1237 hash_add(css_set_table, &cset->hlist, key);
1238
1239 for_each_subsys(ss, ssid) {
1240 struct cgroup_subsys_state *css = cset->subsys[ssid];
1241
1242 list_add_tail(&cset->e_cset_node[ssid],
1243 &css->cgroup->e_csets[ssid]);
1244 css_get(css);
1245 }
1246
1247 spin_unlock_irq(&css_set_lock);
1248
1249
1250
1251
1252
1253
1254
1255 if (cgroup_is_threaded(cset->dfl_cgrp)) {
1256 struct css_set *dcset;
1257
1258 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1259 if (!dcset) {
1260 put_css_set(cset);
1261 return NULL;
1262 }
1263
1264 spin_lock_irq(&css_set_lock);
1265 cset->dom_cset = dcset;
1266 list_add_tail(&cset->threaded_csets_node,
1267 &dcset->threaded_csets);
1268 spin_unlock_irq(&css_set_lock);
1269 }
1270
1271 return cset;
1272}
1273
1274struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1275{
1276 struct cgroup *root_cgrp = kf_root->kn->priv;
1277
1278 return root_cgrp->root;
1279}
1280
1281static int cgroup_init_root_id(struct cgroup_root *root)
1282{
1283 int id;
1284
1285 lockdep_assert_held(&cgroup_mutex);
1286
1287 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1288 if (id < 0)
1289 return id;
1290
1291 root->hierarchy_id = id;
1292 return 0;
1293}
1294
1295static void cgroup_exit_root_id(struct cgroup_root *root)
1296{
1297 lockdep_assert_held(&cgroup_mutex);
1298
1299 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1300}
1301
/* Free the memory of @root with kfree(); nothing else is released here. */
void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}
1306
1307static void cgroup_destroy_root(struct cgroup_root *root)
1308{
1309 struct cgroup *cgrp = &root->cgrp;
1310 struct cgrp_cset_link *link, *tmp_link;
1311
1312 trace_cgroup_destroy_root(root);
1313
1314 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1315
1316 BUG_ON(atomic_read(&root->nr_cgrps));
1317 BUG_ON(!list_empty(&cgrp->self.children));
1318
1319
1320 WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1321
1322
1323
1324
1325
1326 spin_lock_irq(&css_set_lock);
1327
1328 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1329 list_del(&link->cset_link);
1330 list_del(&link->cgrp_link);
1331 kfree(link);
1332 }
1333
1334 spin_unlock_irq(&css_set_lock);
1335
1336 if (!list_empty(&root->root_list)) {
1337 list_del(&root->root_list);
1338 cgroup_root_count--;
1339 }
1340
1341 cgroup_exit_root_id(root);
1342
1343 mutex_unlock(&cgroup_mutex);
1344
1345 kernfs_destroy_root(root->kf_root);
1346 cgroup_free_root(root);
1347}
1348
1349
1350
1351
1352
1353static struct cgroup *
1354current_cgns_cgroup_from_root(struct cgroup_root *root)
1355{
1356 struct cgroup *res = NULL;
1357 struct css_set *cset;
1358
1359 lockdep_assert_held(&css_set_lock);
1360
1361 rcu_read_lock();
1362
1363 cset = current->nsproxy->cgroup_ns->root_cset;
1364 if (cset == &init_css_set) {
1365 res = &root->cgrp;
1366 } else if (root == &cgrp_dfl_root) {
1367 res = cset->dfl_cgrp;
1368 } else {
1369 struct cgrp_cset_link *link;
1370
1371 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1372 struct cgroup *c = link->cgrp;
1373
1374 if (c->root == root) {
1375 res = c;
1376 break;
1377 }
1378 }
1379 }
1380 rcu_read_unlock();
1381
1382 BUG_ON(!res);
1383 return res;
1384}
1385
1386
1387static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1388 struct cgroup_root *root)
1389{
1390 struct cgroup *res = NULL;
1391
1392 lockdep_assert_held(&cgroup_mutex);
1393 lockdep_assert_held(&css_set_lock);
1394
1395 if (cset == &init_css_set) {
1396 res = &root->cgrp;
1397 } else if (root == &cgrp_dfl_root) {
1398 res = cset->dfl_cgrp;
1399 } else {
1400 struct cgrp_cset_link *link;
1401
1402 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1403 struct cgroup *c = link->cgrp;
1404
1405 if (c->root == root) {
1406 res = c;
1407 break;
1408 }
1409 }
1410 }
1411
1412 BUG_ON(!res);
1413 return res;
1414}
1415
1416
1417
1418
1419
/*
 * Return the cgroup for @task in the hierarchy @root, resolved through
 * the task's css_set.  The locking requirements are those of
 * cset_cgroup_from_root(): cgroup_mutex and css_set_lock must be held.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	struct css_set *cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
/*
 * Tentative definition of the kernfs syscall ops table; the initialised
 * definition is not in this part of the file.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1457
1458static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1459 char *buf)
1460{
1461 struct cgroup_subsys *ss = cft->ss;
1462
1463 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1464 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1465 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1466
1467 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1468 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1469 cft->name);
1470 } else {
1471 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1472 }
1473 return buf;
1474}
1475
1476
1477
1478
1479
1480
1481
1482static umode_t cgroup_file_mode(const struct cftype *cft)
1483{
1484 umode_t mode = 0;
1485
1486 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1487 mode |= S_IRUGO;
1488
1489 if (cft->write_u64 || cft->write_s64 || cft->write) {
1490 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1491 mode |= S_IWUGO;
1492 else
1493 mode |= S_IWUSR;
1494 }
1495
1496 return mode;
1497}
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1512{
1513 u16 cur_ss_mask = subtree_control;
1514 struct cgroup_subsys *ss;
1515 int ssid;
1516
1517 lockdep_assert_held(&cgroup_mutex);
1518
1519 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1520
1521 while (true) {
1522 u16 new_ss_mask = cur_ss_mask;
1523
1524 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1525 new_ss_mask |= ss->depends_on;
1526 } while_each_subsys_mask();
1527
1528
1529
1530
1531
1532
1533 new_ss_mask &= this_ss_mask;
1534
1535 if (new_ss_mask == cur_ss_mask)
1536 break;
1537 cur_ss_mask = new_ss_mask;
1538 }
1539
1540 return cur_ss_mask;
1541}
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553void cgroup_kn_unlock(struct kernfs_node *kn)
1554{
1555 struct cgroup *cgrp;
1556
1557 if (kernfs_type(kn) == KERNFS_DIR)
1558 cgrp = kn->priv;
1559 else
1560 cgrp = kn->parent->priv;
1561
1562 mutex_unlock(&cgroup_mutex);
1563
1564 kernfs_unbreak_active_protection(kn);
1565 cgroup_put(cgrp);
1566}
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1586{
1587 struct cgroup *cgrp;
1588
1589 if (kernfs_type(kn) == KERNFS_DIR)
1590 cgrp = kn->priv;
1591 else
1592 cgrp = kn->parent->priv;
1593
1594
1595
1596
1597
1598
1599
1600 if (!cgroup_tryget(cgrp))
1601 return NULL;
1602 kernfs_break_active_protection(kn);
1603
1604 if (drain_offline)
1605 cgroup_lock_and_drain_offline(cgrp);
1606 else
1607 mutex_lock(&cgroup_mutex);
1608
1609 if (!cgroup_is_dead(cgrp))
1610 return cgrp;
1611
1612 cgroup_kn_unlock(kn);
1613 return NULL;
1614}
1615
1616static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1617{
1618 char name[CGROUP_FILE_NAME_MAX];
1619
1620 lockdep_assert_held(&cgroup_mutex);
1621
1622 if (cft->file_offset) {
1623 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1624 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1625
1626 spin_lock_irq(&cgroup_file_kn_lock);
1627 cfile->kn = NULL;
1628 spin_unlock_irq(&cgroup_file_kn_lock);
1629
1630 del_timer_sync(&cfile->notify_timer);
1631 }
1632
1633 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1634}
1635
1636
1637
1638
1639
1640static void css_clear_dir(struct cgroup_subsys_state *css)
1641{
1642 struct cgroup *cgrp = css->cgroup;
1643 struct cftype *cfts;
1644
1645 if (!(css->flags & CSS_VISIBLE))
1646 return;
1647
1648 css->flags &= ~CSS_VISIBLE;
1649
1650 if (!css->ss) {
1651 if (cgroup_on_dfl(cgrp))
1652 cfts = cgroup_base_files;
1653 else
1654 cfts = cgroup1_base_files;
1655
1656 cgroup_addrm_files(css, cgrp, cfts, false);
1657 } else {
1658 list_for_each_entry(cfts, &css->ss->cfts, node)
1659 cgroup_addrm_files(css, cgrp, cfts, false);
1660 }
1661}
1662
1663
1664
1665
1666
1667
1668
1669static int css_populate_dir(struct cgroup_subsys_state *css)
1670{
1671 struct cgroup *cgrp = css->cgroup;
1672 struct cftype *cfts, *failed_cfts;
1673 int ret;
1674
1675 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1676 return 0;
1677
1678 if (!css->ss) {
1679 if (cgroup_on_dfl(cgrp))
1680 cfts = cgroup_base_files;
1681 else
1682 cfts = cgroup1_base_files;
1683
1684 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1685 if (ret < 0)
1686 return ret;
1687 } else {
1688 list_for_each_entry(cfts, &css->ss->cfts, node) {
1689 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1690 if (ret < 0) {
1691 failed_cfts = cfts;
1692 goto err;
1693 }
1694 }
1695 }
1696
1697 css->flags |= CSS_VISIBLE;
1698
1699 return 0;
1700err:
1701 list_for_each_entry(cfts, &css->ss->cfts, node) {
1702 if (cfts == failed_cfts)
1703 break;
1704 cgroup_addrm_files(css, cgrp, cfts, false);
1705 }
1706 return ret;
1707}
1708
/**
 * rebind_subsystems - move subsystems to another hierarchy
 * @dst_root: the hierarchy the subsystems are moved to
 * @ss_mask: subsystem-id bitmask of the subsystems to move
 *
 * Works in two passes.  The first pass only validates and returns
 * -EBUSY without having changed anything; the second pass performs the
 * move and does not fail (a failure to apply the new control masks on
 * the destination is only logged).
 *
 * Must be called with cgroup_mutex held.  Returns 0 on success.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	/* Pass 1: validate every requested subsystem before touching any. */
	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * Refuse if the subsystem's css on its current root has
		 * any child css, unless the subsystem is marked
		 * implicit_on_dfl.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* Either the source or the destination must be the default root. */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	/* Pass 2: move the subsystems one by one. */
	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		/* The source must have the css and the destination must not. */
		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* Disable the subsystem on the source and apply that. */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* Re-home the root css from the source to the destination. */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		/*
		 * Move every css_set in the hash table onto the destination
		 * root cgroup's effective css_set list for this subsystem.
		 */
		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/*
		 * Enable the subsystem on the destination.  On a
		 * non-default destination it is also switched on in the
		 * root's subtree_control; the on_dfl static key follows the
		 * kind of destination.
		 */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		/* A failure here is logged but does not abort the rebind. */
		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		/* Let the subsystem know that its root css has moved. */
		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}
1778
1779int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1780 struct kernfs_root *kf_root)
1781{
1782 int len = 0;
1783 char *buf = NULL;
1784 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1785 struct cgroup *ns_cgroup;
1786
1787 buf = kmalloc(PATH_MAX, GFP_KERNEL);
1788 if (!buf)
1789 return -ENOMEM;
1790
1791 spin_lock_irq(&css_set_lock);
1792 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1793 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1794 spin_unlock_irq(&css_set_lock);
1795
1796 if (len >= PATH_MAX)
1797 len = -ERANGE;
1798 else if (len > 0) {
1799 seq_escape(sf, buf, " \t\n\\");
1800 len = 0;
1801 }
1802 kfree(buf);
1803 return len;
1804}
1805
/* Identifiers of the mount parameters listed in cgroup2_fs_parameters. */
enum cgroup2_param {
	Opt_nsdelegate = 0,		/* "nsdelegate" */
	Opt_memory_localevents,		/* "memory_localevents" */
	Opt_memory_recursiveprot,	/* "memory_recursiveprot" */
	nr__cgroup2_params		/* number of parameters above */
};
1812
1813static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
1814 fsparam_flag("nsdelegate", Opt_nsdelegate),
1815 fsparam_flag("memory_localevents", Opt_memory_localevents),
1816 fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
1817 {}
1818};
1819
1820static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1821{
1822 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1823 struct fs_parse_result result;
1824 int opt;
1825
1826 opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1827 if (opt < 0)
1828 return opt;
1829
1830 switch (opt) {
1831 case Opt_nsdelegate:
1832 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1833 return 0;
1834 case Opt_memory_localevents:
1835 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1836 return 0;
1837 case Opt_memory_recursiveprot:
1838 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1839 return 0;
1840 }
1841 return -EINVAL;
1842}
1843
1844static void apply_cgroup_root_flags(unsigned int root_flags)
1845{
1846 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1847 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1848 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1849 else
1850 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1851
1852 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1853 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1854 else
1855 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1856
1857 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1858 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1859 else
1860 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1861 }
1862}
1863
1864static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1865{
1866 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1867 seq_puts(seq, ",nsdelegate");
1868 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1869 seq_puts(seq, ",memory_localevents");
1870 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1871 seq_puts(seq, ",memory_recursiveprot");
1872 return 0;
1873}
1874
1875static int cgroup_reconfigure(struct fs_context *fc)
1876{
1877 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1878
1879 apply_cgroup_root_flags(ctx->flags);
1880 return 0;
1881}
1882
1883static void init_cgroup_housekeeping(struct cgroup *cgrp)
1884{
1885 struct cgroup_subsys *ss;
1886 int ssid;
1887
1888 INIT_LIST_HEAD(&cgrp->self.sibling);
1889 INIT_LIST_HEAD(&cgrp->self.children);
1890 INIT_LIST_HEAD(&cgrp->cset_links);
1891 INIT_LIST_HEAD(&cgrp->pidlists);
1892 mutex_init(&cgrp->pidlist_mutex);
1893 cgrp->self.cgroup = cgrp;
1894 cgrp->self.flags |= CSS_ONLINE;
1895 cgrp->dom_cgrp = cgrp;
1896 cgrp->max_descendants = INT_MAX;
1897 cgrp->max_depth = INT_MAX;
1898 INIT_LIST_HEAD(&cgrp->rstat_css_list);
1899 prev_cputime_init(&cgrp->prev_cputime);
1900
1901 for_each_subsys(ss, ssid)
1902 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1903
1904 init_waitqueue_head(&cgrp->offline_waitq);
1905 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1906}
1907
1908void init_cgroup_root(struct cgroup_fs_context *ctx)
1909{
1910 struct cgroup_root *root = ctx->root;
1911 struct cgroup *cgrp = &root->cgrp;
1912
1913 INIT_LIST_HEAD(&root->root_list);
1914 atomic_set(&root->nr_cgrps, 1);
1915 cgrp->root = root;
1916 init_cgroup_housekeeping(cgrp);
1917
1918 root->flags = ctx->flags;
1919 if (ctx->release_agent)
1920 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1921 if (ctx->name)
1922 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1923 if (ctx->cpuset_clone_children)
1924 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1925}
1926
1927int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1928{
1929 LIST_HEAD(tmp_links);
1930 struct cgroup *root_cgrp = &root->cgrp;
1931 struct kernfs_syscall_ops *kf_sops;
1932 struct css_set *cset;
1933 int i, ret;
1934
1935 lockdep_assert_held(&cgroup_mutex);
1936
1937 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1938 0, GFP_KERNEL);
1939 if (ret)
1940 goto out;
1941
1942
1943
1944
1945
1946
1947
1948
1949 ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
1950 if (ret)
1951 goto cancel_ref;
1952
1953 ret = cgroup_init_root_id(root);
1954 if (ret)
1955 goto cancel_ref;
1956
1957 kf_sops = root == &cgrp_dfl_root ?
1958 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
1959
1960 root->kf_root = kernfs_create_root(kf_sops,
1961 KERNFS_ROOT_CREATE_DEACTIVATED |
1962 KERNFS_ROOT_SUPPORT_EXPORTOP |
1963 KERNFS_ROOT_SUPPORT_USER_XATTR,
1964 root_cgrp);
1965 if (IS_ERR(root->kf_root)) {
1966 ret = PTR_ERR(root->kf_root);
1967 goto exit_root_id;
1968 }
1969 root_cgrp->kn = root->kf_root->kn;
1970 WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
1971 root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
1972
1973 ret = css_populate_dir(&root_cgrp->self);
1974 if (ret)
1975 goto destroy_root;
1976
1977 ret = rebind_subsystems(root, ss_mask);
1978 if (ret)
1979 goto destroy_root;
1980
1981 ret = cgroup_bpf_inherit(root_cgrp);
1982 WARN_ON_ONCE(ret);
1983
1984 trace_cgroup_setup_root(root);
1985
1986
1987
1988
1989
1990
1991 list_add(&root->root_list, &cgroup_roots);
1992 cgroup_root_count++;
1993
1994
1995
1996
1997
1998 spin_lock_irq(&css_set_lock);
1999 hash_for_each(css_set_table, i, cset, hlist) {
2000 link_css_set(&tmp_links, cset, root_cgrp);
2001 if (css_set_populated(cset))
2002 cgroup_update_populated(root_cgrp, true);
2003 }
2004 spin_unlock_irq(&css_set_lock);
2005
2006 BUG_ON(!list_empty(&root_cgrp->self.children));
2007 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
2008
2009 kernfs_activate(root_cgrp->kn);
2010 ret = 0;
2011 goto out;
2012
2013destroy_root:
2014 kernfs_destroy_root(root->kf_root);
2015 root->kf_root = NULL;
2016exit_root_id:
2017 cgroup_exit_root_id(root);
2018cancel_ref:
2019 percpu_ref_exit(&root_cgrp->self.refcnt);
2020out:
2021 free_cgrp_cset_links(&tmp_links);
2022 return ret;
2023}
2024
2025int cgroup_do_get_tree(struct fs_context *fc)
2026{
2027 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2028 int ret;
2029
2030 ctx->kfc.root = ctx->root->kf_root;
2031 if (fc->fs_type == &cgroup2_fs_type)
2032 ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
2033 else
2034 ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2035 ret = kernfs_get_tree(fc);
2036
2037
2038
2039
2040
2041 if (!ret && ctx->ns != &init_cgroup_ns) {
2042 struct dentry *nsdentry;
2043 struct super_block *sb = fc->root->d_sb;
2044 struct cgroup *cgrp;
2045
2046 mutex_lock(&cgroup_mutex);
2047 spin_lock_irq(&css_set_lock);
2048
2049 cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
2050
2051 spin_unlock_irq(&css_set_lock);
2052 mutex_unlock(&cgroup_mutex);
2053
2054 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2055 dput(fc->root);
2056 if (IS_ERR(nsdentry)) {
2057 deactivate_locked_super(sb);
2058 ret = PTR_ERR(nsdentry);
2059 nsdentry = NULL;
2060 }
2061 fc->root = nsdentry;
2062 }
2063
2064 if (!ctx->kfc.new_sb_created)
2065 cgroup_put(&ctx->root->cgrp);
2066
2067 return ret;
2068}
2069
2070
2071
2072
2073static void cgroup_fs_context_free(struct fs_context *fc)
2074{
2075 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2076
2077 kfree(ctx->name);
2078 kfree(ctx->release_agent);
2079 put_cgroup_ns(ctx->ns);
2080 kernfs_free_fs_context(fc);
2081 kfree(ctx);
2082}
2083
2084static int cgroup_get_tree(struct fs_context *fc)
2085{
2086 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2087 int ret;
2088
2089 cgrp_dfl_visible = true;
2090 cgroup_get_live(&cgrp_dfl_root.cgrp);
2091 ctx->root = &cgrp_dfl_root;
2092
2093 ret = cgroup_do_get_tree(fc);
2094 if (!ret)
2095 apply_cgroup_root_flags(ctx->flags);
2096 return ret;
2097}
2098
2099static const struct fs_context_operations cgroup_fs_context_ops = {
2100 .free = cgroup_fs_context_free,
2101 .parse_param = cgroup2_parse_param,
2102 .get_tree = cgroup_get_tree,
2103 .reconfigure = cgroup_reconfigure,
2104};
2105
2106static const struct fs_context_operations cgroup1_fs_context_ops = {
2107 .free = cgroup_fs_context_free,
2108 .parse_param = cgroup1_parse_param,
2109 .get_tree = cgroup1_get_tree,
2110 .reconfigure = cgroup1_reconfigure,
2111};
2112
2113
2114
2115
2116
2117static int cgroup_init_fs_context(struct fs_context *fc)
2118{
2119 struct cgroup_fs_context *ctx;
2120
2121 ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2122 if (!ctx)
2123 return -ENOMEM;
2124
2125 ctx->ns = current->nsproxy->cgroup_ns;
2126 get_cgroup_ns(ctx->ns);
2127 fc->fs_private = &ctx->kfc;
2128 if (fc->fs_type == &cgroup2_fs_type)
2129 fc->ops = &cgroup_fs_context_ops;
2130 else
2131 fc->ops = &cgroup1_fs_context_ops;
2132 put_user_ns(fc->user_ns);
2133 fc->user_ns = get_user_ns(ctx->ns->user_ns);
2134 fc->global = true;
2135 return 0;
2136}
2137
2138static void cgroup_kill_sb(struct super_block *sb)
2139{
2140 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2141 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2142
2143
2144
2145
2146
2147
2148
2149
2150 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2151 !percpu_ref_is_dying(&root->cgrp.self.refcnt))
2152 percpu_ref_kill(&root->cgrp.self.refcnt);
2153 cgroup_put(&root->cgrp);
2154 kernfs_kill_sb(sb);
2155}
2156
2157struct file_system_type cgroup_fs_type = {
2158 .name = "cgroup",
2159 .init_fs_context = cgroup_init_fs_context,
2160 .parameters = cgroup1_fs_parameters,
2161 .kill_sb = cgroup_kill_sb,
2162 .fs_flags = FS_USERNS_MOUNT,
2163};
2164
2165static struct file_system_type cgroup2_fs_type = {
2166 .name = "cgroup2",
2167 .init_fs_context = cgroup_init_fs_context,
2168 .parameters = cgroup2_fs_parameters,
2169 .kill_sb = cgroup_kill_sb,
2170 .fs_flags = FS_USERNS_MOUNT,
2171};
2172
2173#ifdef CONFIG_CPUSETS
2174static const struct fs_context_operations cpuset_fs_context_ops = {
2175 .get_tree = cgroup1_get_tree,
2176 .free = cgroup_fs_context_free,
2177};
2178
2179
2180
2181
2182
2183
2184static int cpuset_init_fs_context(struct fs_context *fc)
2185{
2186 char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2187 struct cgroup_fs_context *ctx;
2188 int err;
2189
2190 err = cgroup_init_fs_context(fc);
2191 if (err) {
2192 kfree(agent);
2193 return err;
2194 }
2195
2196 fc->ops = &cpuset_fs_context_ops;
2197
2198 ctx = cgroup_fc2context(fc);
2199 ctx->subsys_mask = 1 << cpuset_cgrp_id;
2200 ctx->flags |= CGRP_ROOT_NOPREFIX;
2201 ctx->release_agent = agent;
2202
2203 get_filesystem(&cgroup_fs_type);
2204 put_filesystem(fc->fs_type);
2205 fc->fs_type = &cgroup_fs_type;
2206
2207 return 0;
2208}
2209
2210static struct file_system_type cpuset_fs_type = {
2211 .name = "cpuset",
2212 .init_fs_context = cpuset_init_fs_context,
2213 .fs_flags = FS_USERNS_MOUNT,
2214};
2215#endif
2216
2217int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2218 struct cgroup_namespace *ns)
2219{
2220 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2221
2222 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2223}
2224
2225int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2226 struct cgroup_namespace *ns)
2227{
2228 int ret;
2229
2230 mutex_lock(&cgroup_mutex);
2231 spin_lock_irq(&css_set_lock);
2232
2233 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2234
2235 spin_unlock_irq(&css_set_lock);
2236 mutex_unlock(&cgroup_mutex);
2237
2238 return ret;
2239}
2240EXPORT_SYMBOL_GPL(cgroup_path_ns);
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2256{
2257 struct cgroup_root *root;
2258 struct cgroup *cgrp;
2259 int hierarchy_id = 1;
2260 int ret;
2261
2262 mutex_lock(&cgroup_mutex);
2263 spin_lock_irq(&css_set_lock);
2264
2265 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2266
2267 if (root) {
2268 cgrp = task_cgroup_from_root(task, root);
2269 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2270 } else {
2271
2272 ret = strlcpy(buf, "/", buflen);
2273 }
2274
2275 spin_unlock_irq(&css_set_lock);
2276 mutex_unlock(&cgroup_mutex);
2277 return ret;
2278}
2279EXPORT_SYMBOL_GPL(task_cgroup_path);
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291static void cgroup_migrate_add_task(struct task_struct *task,
2292 struct cgroup_mgctx *mgctx)
2293{
2294 struct css_set *cset;
2295
2296 lockdep_assert_held(&css_set_lock);
2297
2298
2299 if (task->flags & PF_EXITING)
2300 return;
2301
2302
2303 WARN_ON_ONCE(list_empty(&task->cg_list));
2304
2305 cset = task_css_set(task);
2306 if (!cset->mg_src_cgrp)
2307 return;
2308
2309 mgctx->tset.nr_tasks++;
2310
2311 list_move_tail(&task->cg_list, &cset->mg_tasks);
2312 if (list_empty(&cset->mg_node))
2313 list_add_tail(&cset->mg_node,
2314 &mgctx->tset.src_csets);
2315 if (list_empty(&cset->mg_dst_cset->mg_node))
2316 list_add_tail(&cset->mg_dst_cset->mg_node,
2317 &mgctx->tset.dst_csets);
2318}
2319
2320
2321
2322
2323
2324
2325
2326
2327struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2328 struct cgroup_subsys_state **dst_cssp)
2329{
2330 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2331 tset->cur_task = NULL;
2332
2333 return cgroup_taskset_next(tset, dst_cssp);
2334}
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2345 struct cgroup_subsys_state **dst_cssp)
2346{
2347 struct css_set *cset = tset->cur_cset;
2348 struct task_struct *task = tset->cur_task;
2349
2350 while (&cset->mg_node != tset->csets) {
2351 if (!task)
2352 task = list_first_entry(&cset->mg_tasks,
2353 struct task_struct, cg_list);
2354 else
2355 task = list_next_entry(task, cg_list);
2356
2357 if (&task->cg_list != &cset->mg_tasks) {
2358 tset->cur_cset = cset;
2359 tset->cur_task = task;
2360
2361
2362
2363
2364
2365
2366
2367 if (cset->mg_dst_cset)
2368 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2369 else
2370 *dst_cssp = cset->subsys[tset->ssid];
2371
2372 return task;
2373 }
2374
2375 cset = list_next_entry(cset, mg_node);
2376 task = NULL;
2377 }
2378
2379 return NULL;
2380}
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2392{
2393 struct cgroup_taskset *tset = &mgctx->tset;
2394 struct cgroup_subsys *ss;
2395 struct task_struct *task, *tmp_task;
2396 struct css_set *cset, *tmp_cset;
2397 int ssid, failed_ssid, ret;
2398
2399
2400 if (tset->nr_tasks) {
2401 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2402 if (ss->can_attach) {
2403 tset->ssid = ssid;
2404 ret = ss->can_attach(tset);
2405 if (ret) {
2406 failed_ssid = ssid;
2407 goto out_cancel_attach;
2408 }
2409 }
2410 } while_each_subsys_mask();
2411 }
2412
2413
2414
2415
2416
2417
2418 spin_lock_irq(&css_set_lock);
2419 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2420 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2421 struct css_set *from_cset = task_css_set(task);
2422 struct css_set *to_cset = cset->mg_dst_cset;
2423
2424 get_css_set(to_cset);
2425 to_cset->nr_tasks++;
2426 css_set_move_task(task, from_cset, to_cset, true);
2427 from_cset->nr_tasks--;
2428
2429
2430
2431
2432 cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
2433 to_cset->dfl_cgrp);
2434 put_css_set_locked(from_cset);
2435
2436 }
2437 }
2438 spin_unlock_irq(&css_set_lock);
2439
2440
2441
2442
2443
2444
2445 tset->csets = &tset->dst_csets;
2446
2447 if (tset->nr_tasks) {
2448 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2449 if (ss->attach) {
2450 tset->ssid = ssid;
2451 ss->attach(tset);
2452 }
2453 } while_each_subsys_mask();
2454 }
2455
2456 ret = 0;
2457 goto out_release_tset;
2458
2459out_cancel_attach:
2460 if (tset->nr_tasks) {
2461 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2462 if (ssid == failed_ssid)
2463 break;
2464 if (ss->cancel_attach) {
2465 tset->ssid = ssid;
2466 ss->cancel_attach(tset);
2467 }
2468 } while_each_subsys_mask();
2469 }
2470out_release_tset:
2471 spin_lock_irq(&css_set_lock);
2472 list_splice_init(&tset->dst_csets, &tset->src_csets);
2473 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2474 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2475 list_del_init(&cset->mg_node);
2476 }
2477 spin_unlock_irq(&css_set_lock);
2478
2479
2480
2481
2482
2483
2484 tset->nr_tasks = 0;
2485 tset->csets = &tset->src_csets;
2486 return ret;
2487}
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2499{
2500
2501 if (!cgroup_on_dfl(dst_cgrp))
2502 return 0;
2503
2504
2505 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2506 return -EOPNOTSUPP;
2507
2508
2509 if (cgroup_is_mixable(dst_cgrp))
2510 return 0;
2511
2512
2513
2514
2515
2516 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2517 return 0;
2518
2519
2520 if (dst_cgrp->subtree_control)
2521 return -EBUSY;
2522
2523 return 0;
2524}
2525
2526
2527
2528
2529
2530
2531
2532
2533void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2534{
2535 LIST_HEAD(preloaded);
2536 struct css_set *cset, *tmp_cset;
2537
2538 lockdep_assert_held(&cgroup_mutex);
2539
2540 spin_lock_irq(&css_set_lock);
2541
2542 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2543 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2544
2545 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2546 cset->mg_src_cgrp = NULL;
2547 cset->mg_dst_cgrp = NULL;
2548 cset->mg_dst_cset = NULL;
2549 list_del_init(&cset->mg_preload_node);
2550 put_css_set_locked(cset);
2551 }
2552
2553 spin_unlock_irq(&css_set_lock);
2554}
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572void cgroup_migrate_add_src(struct css_set *src_cset,
2573 struct cgroup *dst_cgrp,
2574 struct cgroup_mgctx *mgctx)
2575{
2576 struct cgroup *src_cgrp;
2577
2578 lockdep_assert_held(&cgroup_mutex);
2579 lockdep_assert_held(&css_set_lock);
2580
2581
2582
2583
2584
2585
2586 if (src_cset->dead)
2587 return;
2588
2589 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2590
2591 if (!list_empty(&src_cset->mg_preload_node))
2592 return;
2593
2594 WARN_ON(src_cset->mg_src_cgrp);
2595 WARN_ON(src_cset->mg_dst_cgrp);
2596 WARN_ON(!list_empty(&src_cset->mg_tasks));
2597 WARN_ON(!list_empty(&src_cset->mg_node));
2598
2599 src_cset->mg_src_cgrp = src_cgrp;
2600 src_cset->mg_dst_cgrp = dst_cgrp;
2601 get_css_set(src_cset);
2602 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2603}
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2620{
2621 struct css_set *src_cset, *tmp_cset;
2622
2623 lockdep_assert_held(&cgroup_mutex);
2624
2625
2626 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2627 mg_preload_node) {
2628 struct css_set *dst_cset;
2629 struct cgroup_subsys *ss;
2630 int ssid;
2631
2632 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2633 if (!dst_cset)
2634 return -ENOMEM;
2635
2636 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2637
2638
2639
2640
2641
2642
2643 if (src_cset == dst_cset) {
2644 src_cset->mg_src_cgrp = NULL;
2645 src_cset->mg_dst_cgrp = NULL;
2646 list_del_init(&src_cset->mg_preload_node);
2647 put_css_set(src_cset);
2648 put_css_set(dst_cset);
2649 continue;
2650 }
2651
2652 src_cset->mg_dst_cset = dst_cset;
2653
2654 if (list_empty(&dst_cset->mg_preload_node))
2655 list_add_tail(&dst_cset->mg_preload_node,
2656 &mgctx->preloaded_dst_csets);
2657 else
2658 put_css_set(dst_cset);
2659
2660 for_each_subsys(ss, ssid)
2661 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2662 mgctx->ss_mask |= 1 << ssid;
2663 }
2664
2665 return 0;
2666}
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2687 struct cgroup_mgctx *mgctx)
2688{
2689 struct task_struct *task;
2690
2691
2692
2693
2694
2695
2696 spin_lock_irq(&css_set_lock);
2697 rcu_read_lock();
2698 task = leader;
2699 do {
2700 cgroup_migrate_add_task(task, mgctx);
2701 if (!threadgroup)
2702 break;
2703 } while_each_thread(leader, task);
2704 rcu_read_unlock();
2705 spin_unlock_irq(&css_set_lock);
2706
2707 return cgroup_migrate_execute(mgctx);
2708}
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2719 bool threadgroup)
2720{
2721 DEFINE_CGROUP_MGCTX(mgctx);
2722 struct task_struct *task;
2723 int ret = 0;
2724
2725
2726 spin_lock_irq(&css_set_lock);
2727 rcu_read_lock();
2728 task = leader;
2729 do {
2730 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2731 if (!threadgroup)
2732 break;
2733 } while_each_thread(leader, task);
2734 rcu_read_unlock();
2735 spin_unlock_irq(&css_set_lock);
2736
2737
2738 ret = cgroup_migrate_prepare_dst(&mgctx);
2739 if (!ret)
2740 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2741
2742 cgroup_migrate_finish(&mgctx);
2743
2744 if (!ret)
2745 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2746
2747 return ret;
2748}
2749
2750struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2751 bool *locked)
2752 __acquires(&cgroup_threadgroup_rwsem)
2753{
2754 struct task_struct *tsk;
2755 pid_t pid;
2756
2757 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2758 return ERR_PTR(-EINVAL);
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768 lockdep_assert_held(&cgroup_mutex);
2769 if (pid || threadgroup) {
2770 percpu_down_write(&cgroup_threadgroup_rwsem);
2771 *locked = true;
2772 } else {
2773 *locked = false;
2774 }
2775
2776 rcu_read_lock();
2777 if (pid) {
2778 tsk = find_task_by_vpid(pid);
2779 if (!tsk) {
2780 tsk = ERR_PTR(-ESRCH);
2781 goto out_unlock_threadgroup;
2782 }
2783 } else {
2784 tsk = current;
2785 }
2786
2787 if (threadgroup)
2788 tsk = tsk->group_leader;
2789
2790
2791
2792
2793
2794
2795
2796 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2797 tsk = ERR_PTR(-EINVAL);
2798 goto out_unlock_threadgroup;
2799 }
2800
2801 get_task_struct(tsk);
2802 goto out_unlock_rcu;
2803
2804out_unlock_threadgroup:
2805 if (*locked) {
2806 percpu_up_write(&cgroup_threadgroup_rwsem);
2807 *locked = false;
2808 }
2809out_unlock_rcu:
2810 rcu_read_unlock();
2811 return tsk;
2812}
2813
2814void cgroup_procs_write_finish(struct task_struct *task, bool locked)
2815 __releases(&cgroup_threadgroup_rwsem)
2816{
2817 struct cgroup_subsys *ss;
2818 int ssid;
2819
2820
2821 put_task_struct(task);
2822
2823 if (locked)
2824 percpu_up_write(&cgroup_threadgroup_rwsem);
2825 for_each_subsys(ss, ssid)
2826 if (ss->post_attach)
2827 ss->post_attach();
2828}
2829
2830static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2831{
2832 struct cgroup_subsys *ss;
2833 bool printed = false;
2834 int ssid;
2835
2836 do_each_subsys_mask(ss, ssid, ss_mask) {
2837 if (printed)
2838 seq_putc(seq, ' ');
2839 seq_puts(seq, ss->name);
2840 printed = true;
2841 } while_each_subsys_mask();
2842 if (printed)
2843 seq_putc(seq, '\n');
2844}
2845
2846
2847static int cgroup_controllers_show(struct seq_file *seq, void *v)
2848{
2849 struct cgroup *cgrp = seq_css(seq)->cgroup;
2850
2851 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2852 return 0;
2853}
2854
2855
2856static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2857{
2858 struct cgroup *cgrp = seq_css(seq)->cgroup;
2859
2860 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2861 return 0;
2862}
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2874{
2875 DEFINE_CGROUP_MGCTX(mgctx);
2876 struct cgroup_subsys_state *d_css;
2877 struct cgroup *dsct;
2878 struct css_set *src_cset;
2879 int ret;
2880
2881 lockdep_assert_held(&cgroup_mutex);
2882
2883 percpu_down_write(&cgroup_threadgroup_rwsem);
2884
2885
2886 spin_lock_irq(&css_set_lock);
2887 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2888 struct cgrp_cset_link *link;
2889
2890 list_for_each_entry(link, &dsct->cset_links, cset_link)
2891 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
2892 }
2893 spin_unlock_irq(&css_set_lock);
2894
2895
2896 ret = cgroup_migrate_prepare_dst(&mgctx);
2897 if (ret)
2898 goto out_finish;
2899
2900 spin_lock_irq(&css_set_lock);
2901 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2902 struct task_struct *task, *ntask;
2903
2904
2905 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2906 cgroup_migrate_add_task(task, &mgctx);
2907 }
2908 spin_unlock_irq(&css_set_lock);
2909
2910 ret = cgroup_migrate_execute(&mgctx);
2911out_finish:
2912 cgroup_migrate_finish(&mgctx);
2913 percpu_up_write(&cgroup_threadgroup_rwsem);
2914 return ret;
2915}
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
2926 __acquires(&cgroup_mutex)
2927{
2928 struct cgroup *dsct;
2929 struct cgroup_subsys_state *d_css;
2930 struct cgroup_subsys *ss;
2931 int ssid;
2932
2933restart:
2934 mutex_lock(&cgroup_mutex);
2935
2936 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2937 for_each_subsys(ss, ssid) {
2938 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2939 DEFINE_WAIT(wait);
2940
2941 if (!css || !percpu_ref_is_dying(&css->refcnt))
2942 continue;
2943
2944 cgroup_get_live(dsct);
2945 prepare_to_wait(&dsct->offline_waitq, &wait,
2946 TASK_UNINTERRUPTIBLE);
2947
2948 mutex_unlock(&cgroup_mutex);
2949 schedule();
2950 finish_wait(&dsct->offline_waitq, &wait);
2951
2952 cgroup_put(dsct);
2953 goto restart;
2954 }
2955 }
2956}
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966static void cgroup_save_control(struct cgroup *cgrp)
2967{
2968 struct cgroup *dsct;
2969 struct cgroup_subsys_state *d_css;
2970
2971 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2972 dsct->old_subtree_control = dsct->subtree_control;
2973 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
2974 dsct->old_dom_cgrp = dsct->dom_cgrp;
2975 }
2976}
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986static void cgroup_propagate_control(struct cgroup *cgrp)
2987{
2988 struct cgroup *dsct;
2989 struct cgroup_subsys_state *d_css;
2990
2991 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2992 dsct->subtree_control &= cgroup_control(dsct);
2993 dsct->subtree_ss_mask =
2994 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
2995 cgroup_ss_mask(dsct));
2996 }
2997}
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007static void cgroup_restore_control(struct cgroup *cgrp)
3008{
3009 struct cgroup *dsct;
3010 struct cgroup_subsys_state *d_css;
3011
3012 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3013 dsct->subtree_control = dsct->old_subtree_control;
3014 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3015 dsct->dom_cgrp = dsct->old_dom_cgrp;
3016 }
3017}
3018
3019static bool css_visible(struct cgroup_subsys_state *css)
3020{
3021 struct cgroup_subsys *ss = css->ss;
3022 struct cgroup *cgrp = css->cgroup;
3023
3024 if (cgroup_control(cgrp) & (1 << ss->id))
3025 return true;
3026 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3027 return false;
3028 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3029}
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044static int cgroup_apply_control_enable(struct cgroup *cgrp)
3045{
3046 struct cgroup *dsct;
3047 struct cgroup_subsys_state *d_css;
3048 struct cgroup_subsys *ss;
3049 int ssid, ret;
3050
3051 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3052 for_each_subsys(ss, ssid) {
3053 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3054
3055 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3056 continue;
3057
3058 if (!css) {
3059 css = css_create(dsct, ss);
3060 if (IS_ERR(css))
3061 return PTR_ERR(css);
3062 }
3063
3064 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3065
3066 if (css_visible(css)) {
3067 ret = css_populate_dir(css);
3068 if (ret)
3069 return ret;
3070 }
3071 }
3072 }
3073
3074 return 0;
3075}
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090static void cgroup_apply_control_disable(struct cgroup *cgrp)
3091{
3092 struct cgroup *dsct;
3093 struct cgroup_subsys_state *d_css;
3094 struct cgroup_subsys *ss;
3095 int ssid;
3096
3097 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3098 for_each_subsys(ss, ssid) {
3099 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3100
3101 if (!css)
3102 continue;
3103
3104 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3105
3106 if (css->parent &&
3107 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3108 kill_css(css);
3109 } else if (!css_visible(css)) {
3110 css_clear_dir(css);
3111 if (ss->css_reset)
3112 ss->css_reset(css);
3113 }
3114 }
3115 }
3116}
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
/**
 * cgroup_apply_control - apply the enabling half of a control change
 * @cgrp: root of the subtree
 *
 * Propagates the control masks down the subtree, creates/exposes the csses
 * that are now required and then re-migrates the tasks of the subtree so
 * their css_sets match.  Disabling is left to cgroup_finalize_control().
 *
 * Returns 0 on success or the first error met.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	cgroup_propagate_control(cgrp);

	err = cgroup_apply_control_enable(cgrp);
	if (err)
		return err;

	/*
	 * The set of csses may have changed above; update the css
	 * associations of all tasks in the subtree accordingly.
	 */
	return cgroup_update_dfl_csses(cgrp);
}
3156
3157
3158
3159
3160
3161
3162
3163
/**
 * cgroup_finalize_control - finish a control change
 * @cgrp: root of the subtree
 * @ret: outcome of the preceding steps, 0 meaning success
 *
 * If @ret signals failure, the saved control state is restored and
 * propagated again.  In either case csses that are no longer wanted are
 * then killed or hidden.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret != 0) {
		/* roll the masks back to what cgroup_save_control() recorded */
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}
3173
3174static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3175{
3176 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3177
3178
3179 if (!enable)
3180 return 0;
3181
3182
3183 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3184 return -EOPNOTSUPP;
3185
3186
3187 if (cgroup_is_mixable(cgrp))
3188 return 0;
3189
3190 if (domain_enable) {
3191
3192 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3193 return -EOPNOTSUPP;
3194 } else {
3195
3196
3197
3198
3199
3200 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3201 return 0;
3202 }
3203
3204
3205
3206
3207
3208 if (cgroup_has_tasks(cgrp))
3209 return -EBUSY;
3210
3211 return 0;
3212}
3213
3214
3215static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3216 char *buf, size_t nbytes,
3217 loff_t off)
3218{
3219 u16 enable = 0, disable = 0;
3220 struct cgroup *cgrp, *child;
3221 struct cgroup_subsys *ss;
3222 char *tok;
3223 int ssid, ret;
3224
3225
3226
3227
3228
3229 buf = strstrip(buf);
3230 while ((tok = strsep(&buf, " "))) {
3231 if (tok[0] == '\0')
3232 continue;
3233 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3234 if (!cgroup_ssid_enabled(ssid) ||
3235 strcmp(tok + 1, ss->name))
3236 continue;
3237
3238 if (*tok == '+') {
3239 enable |= 1 << ssid;
3240 disable &= ~(1 << ssid);
3241 } else if (*tok == '-') {
3242 disable |= 1 << ssid;
3243 enable &= ~(1 << ssid);
3244 } else {
3245 return -EINVAL;
3246 }
3247 break;
3248 } while_each_subsys_mask();
3249 if (ssid == CGROUP_SUBSYS_COUNT)
3250 return -EINVAL;
3251 }
3252
3253 cgrp = cgroup_kn_lock_live(of->kn, true);
3254 if (!cgrp)
3255 return -ENODEV;
3256
3257 for_each_subsys(ss, ssid) {
3258 if (enable & (1 << ssid)) {
3259 if (cgrp->subtree_control & (1 << ssid)) {
3260 enable &= ~(1 << ssid);
3261 continue;
3262 }
3263
3264 if (!(cgroup_control(cgrp) & (1 << ssid))) {
3265 ret = -ENOENT;
3266 goto out_unlock;
3267 }
3268 } else if (disable & (1 << ssid)) {
3269 if (!(cgrp->subtree_control & (1 << ssid))) {
3270 disable &= ~(1 << ssid);
3271 continue;
3272 }
3273
3274
3275 cgroup_for_each_live_child(child, cgrp) {
3276 if (child->subtree_control & (1 << ssid)) {
3277 ret = -EBUSY;
3278 goto out_unlock;
3279 }
3280 }
3281 }
3282 }
3283
3284 if (!enable && !disable) {
3285 ret = 0;
3286 goto out_unlock;
3287 }
3288
3289 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3290 if (ret)
3291 goto out_unlock;
3292
3293
3294 cgroup_save_control(cgrp);
3295
3296 cgrp->subtree_control |= enable;
3297 cgrp->subtree_control &= ~disable;
3298
3299 ret = cgroup_apply_control(cgrp);
3300 cgroup_finalize_control(cgrp, ret);
3301 if (ret)
3302 goto out_unlock;
3303
3304 kernfs_activate(cgrp->kn);
3305out_unlock:
3306 cgroup_kn_unlock(of->kn);
3307 return ret ?: nbytes;
3308}
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319static int cgroup_enable_threaded(struct cgroup *cgrp)
3320{
3321 struct cgroup *parent = cgroup_parent(cgrp);
3322 struct cgroup *dom_cgrp = parent->dom_cgrp;
3323 struct cgroup *dsct;
3324 struct cgroup_subsys_state *d_css;
3325 int ret;
3326
3327 lockdep_assert_held(&cgroup_mutex);
3328
3329
3330 if (cgroup_is_threaded(cgrp))
3331 return 0;
3332
3333
3334
3335
3336
3337
3338
3339 if (cgroup_is_populated(cgrp) ||
3340 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3341 return -EOPNOTSUPP;
3342
3343
3344 if (!cgroup_is_valid_domain(dom_cgrp) ||
3345 !cgroup_can_be_thread_root(dom_cgrp))
3346 return -EOPNOTSUPP;
3347
3348
3349
3350
3351
3352 cgroup_save_control(cgrp);
3353
3354 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3355 if (dsct == cgrp || cgroup_is_threaded(dsct))
3356 dsct->dom_cgrp = dom_cgrp;
3357
3358 ret = cgroup_apply_control(cgrp);
3359 if (!ret)
3360 parent->nr_threaded_children++;
3361
3362 cgroup_finalize_control(cgrp, ret);
3363 return ret;
3364}
3365
3366static int cgroup_type_show(struct seq_file *seq, void *v)
3367{
3368 struct cgroup *cgrp = seq_css(seq)->cgroup;
3369
3370 if (cgroup_is_threaded(cgrp))
3371 seq_puts(seq, "threaded\n");
3372 else if (!cgroup_is_valid_domain(cgrp))
3373 seq_puts(seq, "domain invalid\n");
3374 else if (cgroup_is_thread_root(cgrp))
3375 seq_puts(seq, "domain threaded\n");
3376 else
3377 seq_puts(seq, "domain\n");
3378
3379 return 0;
3380}
3381
3382static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3383 size_t nbytes, loff_t off)
3384{
3385 struct cgroup *cgrp;
3386 int ret;
3387
3388
3389 if (strcmp(strstrip(buf), "threaded"))
3390 return -EINVAL;
3391
3392
3393 cgrp = cgroup_kn_lock_live(of->kn, true);
3394 if (!cgrp)
3395 return -ENOENT;
3396
3397
3398 ret = cgroup_enable_threaded(cgrp);
3399
3400 cgroup_kn_unlock(of->kn);
3401 return ret ?: nbytes;
3402}
3403
3404static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3405{
3406 struct cgroup *cgrp = seq_css(seq)->cgroup;
3407 int descendants = READ_ONCE(cgrp->max_descendants);
3408
3409 if (descendants == INT_MAX)
3410 seq_puts(seq, "max\n");
3411 else
3412 seq_printf(seq, "%d\n", descendants);
3413
3414 return 0;
3415}
3416
3417static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3418 char *buf, size_t nbytes, loff_t off)
3419{
3420 struct cgroup *cgrp;
3421 int descendants;
3422 ssize_t ret;
3423
3424 buf = strstrip(buf);
3425 if (!strcmp(buf, "max")) {
3426 descendants = INT_MAX;
3427 } else {
3428 ret = kstrtoint(buf, 0, &descendants);
3429 if (ret)
3430 return ret;
3431 }
3432
3433 if (descendants < 0)
3434 return -ERANGE;
3435
3436 cgrp = cgroup_kn_lock_live(of->kn, false);
3437 if (!cgrp)
3438 return -ENOENT;
3439
3440 cgrp->max_descendants = descendants;
3441
3442 cgroup_kn_unlock(of->kn);
3443
3444 return nbytes;
3445}
3446
3447static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3448{
3449 struct cgroup *cgrp = seq_css(seq)->cgroup;
3450 int depth = READ_ONCE(cgrp->max_depth);
3451
3452 if (depth == INT_MAX)
3453 seq_puts(seq, "max\n");
3454 else
3455 seq_printf(seq, "%d\n", depth);
3456
3457 return 0;
3458}
3459
3460static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3461 char *buf, size_t nbytes, loff_t off)
3462{
3463 struct cgroup *cgrp;
3464 ssize_t ret;
3465 int depth;
3466
3467 buf = strstrip(buf);
3468 if (!strcmp(buf, "max")) {
3469 depth = INT_MAX;
3470 } else {
3471 ret = kstrtoint(buf, 0, &depth);
3472 if (ret)
3473 return ret;
3474 }
3475
3476 if (depth < 0)
3477 return -ERANGE;
3478
3479 cgrp = cgroup_kn_lock_live(of->kn, false);
3480 if (!cgrp)
3481 return -ENOENT;
3482
3483 cgrp->max_depth = depth;
3484
3485 cgroup_kn_unlock(of->kn);
3486
3487 return nbytes;
3488}
3489
3490static int cgroup_events_show(struct seq_file *seq, void *v)
3491{
3492 struct cgroup *cgrp = seq_css(seq)->cgroup;
3493
3494 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3495 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3496
3497 return 0;
3498}
3499
3500static int cgroup_stat_show(struct seq_file *seq, void *v)
3501{
3502 struct cgroup *cgroup = seq_css(seq)->cgroup;
3503
3504 seq_printf(seq, "nr_descendants %d\n",
3505 cgroup->nr_descendants);
3506 seq_printf(seq, "nr_dying_descendants %d\n",
3507 cgroup->nr_dying_descendants);
3508
3509 return 0;
3510}
3511
3512static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3513 struct cgroup *cgrp, int ssid)
3514{
3515 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3516 struct cgroup_subsys_state *css;
3517 int ret;
3518
3519 if (!ss->css_extra_stat_show)
3520 return 0;
3521
3522 css = cgroup_tryget_css(cgrp, ss);
3523 if (!css)
3524 return 0;
3525
3526 ret = ss->css_extra_stat_show(seq, css);
3527 css_put(css);
3528 return ret;
3529}
3530
3531static int cpu_stat_show(struct seq_file *seq, void *v)
3532{
3533 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3534 int ret = 0;
3535
3536 cgroup_base_stat_cputime_show(seq);
3537#ifdef CONFIG_CGROUP_SCHED
3538 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3539#endif
3540 return ret;
3541}
3542
3543#ifdef CONFIG_PSI
3544static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3545{
3546 struct cgroup *cgrp = seq_css(seq)->cgroup;
3547 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3548
3549 return psi_show(seq, psi, PSI_IO);
3550}
3551static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3552{
3553 struct cgroup *cgrp = seq_css(seq)->cgroup;
3554 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3555
3556 return psi_show(seq, psi, PSI_MEM);
3557}
3558static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3559{
3560 struct cgroup *cgrp = seq_css(seq)->cgroup;
3561 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3562
3563 return psi_show(seq, psi, PSI_CPU);
3564}
3565
3566static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3567 size_t nbytes, enum psi_res res)
3568{
3569 struct psi_trigger *new;
3570 struct cgroup *cgrp;
3571
3572 cgrp = cgroup_kn_lock_live(of->kn, false);
3573 if (!cgrp)
3574 return -ENODEV;
3575
3576 cgroup_get(cgrp);
3577 cgroup_kn_unlock(of->kn);
3578
3579 new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3580 if (IS_ERR(new)) {
3581 cgroup_put(cgrp);
3582 return PTR_ERR(new);
3583 }
3584
3585 psi_trigger_replace(&of->priv, new);
3586
3587 cgroup_put(cgrp);
3588
3589 return nbytes;
3590}
3591
3592static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3593 char *buf, size_t nbytes,
3594 loff_t off)
3595{
3596 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3597}
3598
3599static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3600 char *buf, size_t nbytes,
3601 loff_t off)
3602{
3603 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3604}
3605
3606static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3607 char *buf, size_t nbytes,
3608 loff_t off)
3609{
3610 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3611}
3612
3613static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3614 poll_table *pt)
3615{
3616 return psi_trigger_poll(&of->priv, of->file, pt);
3617}
3618
3619static void cgroup_pressure_release(struct kernfs_open_file *of)
3620{
3621 psi_trigger_replace(&of->priv, NULL);
3622}
3623#endif
3624
3625static int cgroup_freeze_show(struct seq_file *seq, void *v)
3626{
3627 struct cgroup *cgrp = seq_css(seq)->cgroup;
3628
3629 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3630
3631 return 0;
3632}
3633
3634static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3635 char *buf, size_t nbytes, loff_t off)
3636{
3637 struct cgroup *cgrp;
3638 ssize_t ret;
3639 int freeze;
3640
3641 ret = kstrtoint(strstrip(buf), 0, &freeze);
3642 if (ret)
3643 return ret;
3644
3645 if (freeze < 0 || freeze > 1)
3646 return -ERANGE;
3647
3648 cgrp = cgroup_kn_lock_live(of->kn, false);
3649 if (!cgrp)
3650 return -ENOENT;
3651
3652 cgroup_freeze(cgrp, freeze);
3653
3654 cgroup_kn_unlock(of->kn);
3655
3656 return nbytes;
3657}
3658
3659static int cgroup_file_open(struct kernfs_open_file *of)
3660{
3661 struct cftype *cft = of->kn->priv;
3662
3663 if (cft->open)
3664 return cft->open(of);
3665 return 0;
3666}
3667
3668static void cgroup_file_release(struct kernfs_open_file *of)
3669{
3670 struct cftype *cft = of->kn->priv;
3671
3672 if (cft->release)
3673 cft->release(of);
3674}
3675
3676static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3677 size_t nbytes, loff_t off)
3678{
3679 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3680 struct cgroup *cgrp = of->kn->parent->priv;
3681 struct cftype *cft = of->kn->priv;
3682 struct cgroup_subsys_state *css;
3683 int ret;
3684
3685
3686
3687
3688
3689
3690
3691 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3692 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3693 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3694 return -EPERM;
3695
3696 if (cft->write)
3697 return cft->write(of, buf, nbytes, off);
3698
3699
3700
3701
3702
3703
3704
3705 rcu_read_lock();
3706 css = cgroup_css(cgrp, cft->ss);
3707 rcu_read_unlock();
3708
3709 if (cft->write_u64) {
3710 unsigned long long v;
3711 ret = kstrtoull(buf, 0, &v);
3712 if (!ret)
3713 ret = cft->write_u64(css, cft, v);
3714 } else if (cft->write_s64) {
3715 long long v;
3716 ret = kstrtoll(buf, 0, &v);
3717 if (!ret)
3718 ret = cft->write_s64(css, cft, v);
3719 } else {
3720 ret = -EINVAL;
3721 }
3722
3723 return ret ?: nbytes;
3724}
3725
3726static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3727{
3728 struct cftype *cft = of->kn->priv;
3729
3730 if (cft->poll)
3731 return cft->poll(of, pt);
3732
3733 return kernfs_generic_poll(of, pt);
3734}
3735
3736static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3737{
3738 return seq_cft(seq)->seq_start(seq, ppos);
3739}
3740
3741static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3742{
3743 return seq_cft(seq)->seq_next(seq, v, ppos);
3744}
3745
3746static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3747{
3748 if (seq_cft(seq)->seq_stop)
3749 seq_cft(seq)->seq_stop(seq, v);
3750}
3751
3752static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3753{
3754 struct cftype *cft = seq_cft(m);
3755 struct cgroup_subsys_state *css = seq_css(m);
3756
3757 if (cft->seq_show)
3758 return cft->seq_show(m, arg);
3759
3760 if (cft->read_u64)
3761 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3762 else if (cft->read_s64)
3763 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3764 else
3765 return -EINVAL;
3766 return 0;
3767}
3768
3769static struct kernfs_ops cgroup_kf_single_ops = {
3770 .atomic_write_len = PAGE_SIZE,
3771 .open = cgroup_file_open,
3772 .release = cgroup_file_release,
3773 .write = cgroup_file_write,
3774 .poll = cgroup_file_poll,
3775 .seq_show = cgroup_seqfile_show,
3776};
3777
3778static struct kernfs_ops cgroup_kf_ops = {
3779 .atomic_write_len = PAGE_SIZE,
3780 .open = cgroup_file_open,
3781 .release = cgroup_file_release,
3782 .write = cgroup_file_write,
3783 .poll = cgroup_file_poll,
3784 .seq_start = cgroup_seqfile_start,
3785 .seq_next = cgroup_seqfile_next,
3786 .seq_stop = cgroup_seqfile_stop,
3787 .seq_show = cgroup_seqfile_show,
3788};
3789
3790
3791static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3792{
3793 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3794 .ia_uid = current_fsuid(),
3795 .ia_gid = current_fsgid(), };
3796
3797 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3798 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3799 return 0;
3800
3801 return kernfs_setattr(kn, &iattr);
3802}
3803
3804static void cgroup_file_notify_timer(struct timer_list *timer)
3805{
3806 cgroup_file_notify(container_of(timer, struct cgroup_file,
3807 notify_timer));
3808}
3809
3810static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3811 struct cftype *cft)
3812{
3813 char name[CGROUP_FILE_NAME_MAX];
3814 struct kernfs_node *kn;
3815 struct lock_class_key *key = NULL;
3816 int ret;
3817
3818#ifdef CONFIG_DEBUG_LOCK_ALLOC
3819 key = &cft->lockdep_key;
3820#endif
3821 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3822 cgroup_file_mode(cft),
3823 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
3824 0, cft->kf_ops, cft,
3825 NULL, key);
3826 if (IS_ERR(kn))
3827 return PTR_ERR(kn);
3828
3829 ret = cgroup_kn_set_ugid(kn);
3830 if (ret) {
3831 kernfs_remove(kn);
3832 return ret;
3833 }
3834
3835 if (cft->file_offset) {
3836 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3837
3838 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3839
3840 spin_lock_irq(&cgroup_file_kn_lock);
3841 cfile->kn = kn;
3842 spin_unlock_irq(&cgroup_file_kn_lock);
3843 }
3844
3845 return 0;
3846}
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3859 struct cgroup *cgrp, struct cftype cfts[],
3860 bool is_add)
3861{
3862 struct cftype *cft, *cft_end = NULL;
3863 int ret = 0;
3864
3865 lockdep_assert_held(&cgroup_mutex);
3866
3867restart:
3868 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3869
3870 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3871 continue;
3872 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3873 continue;
3874 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3875 continue;
3876 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3877 continue;
3878 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
3879 continue;
3880 if (is_add) {
3881 ret = cgroup_add_file(css, cgrp, cft);
3882 if (ret) {
3883 pr_warn("%s: failed to add %s, err=%d\n",
3884 __func__, cft->name, ret);
3885 cft_end = cft;
3886 is_add = false;
3887 goto restart;
3888 }
3889 } else {
3890 cgroup_rm_file(cgrp, cft);
3891 }
3892 }
3893 return ret;
3894}
3895
3896static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3897{
3898 struct cgroup_subsys *ss = cfts[0].ss;
3899 struct cgroup *root = &ss->root->cgrp;
3900 struct cgroup_subsys_state *css;
3901 int ret = 0;
3902
3903 lockdep_assert_held(&cgroup_mutex);
3904
3905
3906 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3907 struct cgroup *cgrp = css->cgroup;
3908
3909 if (!(css->flags & CSS_VISIBLE))
3910 continue;
3911
3912 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3913 if (ret)
3914 break;
3915 }
3916
3917 if (is_add && !ret)
3918 kernfs_activate(root->kn);
3919 return ret;
3920}
3921
3922static void cgroup_exit_cftypes(struct cftype *cfts)
3923{
3924 struct cftype *cft;
3925
3926 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3927
3928 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3929 kfree(cft->kf_ops);
3930 cft->kf_ops = NULL;
3931 cft->ss = NULL;
3932
3933
3934 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3935 }
3936}
3937
3938static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3939{
3940 struct cftype *cft;
3941
3942 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3943 struct kernfs_ops *kf_ops;
3944
3945 WARN_ON(cft->ss || cft->kf_ops);
3946
3947 if (cft->seq_start)
3948 kf_ops = &cgroup_kf_ops;
3949 else
3950 kf_ops = &cgroup_kf_single_ops;
3951
3952
3953
3954
3955
3956 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3957 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3958 if (!kf_ops) {
3959 cgroup_exit_cftypes(cfts);
3960 return -ENOMEM;
3961 }
3962 kf_ops->atomic_write_len = cft->max_write_len;
3963 }
3964
3965 cft->kf_ops = kf_ops;
3966 cft->ss = ss;
3967 }
3968
3969 return 0;
3970}
3971
3972static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3973{
3974 lockdep_assert_held(&cgroup_mutex);
3975
3976 if (!cfts || !cfts[0].ss)
3977 return -ENOENT;
3978
3979 list_del(&cfts->node);
3980 cgroup_apply_cftypes(cfts, false);
3981 cgroup_exit_cftypes(cfts);
3982 return 0;
3983}
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996int cgroup_rm_cftypes(struct cftype *cfts)
3997{
3998 int ret;
3999
4000 mutex_lock(&cgroup_mutex);
4001 ret = cgroup_rm_cftypes_locked(cfts);
4002 mutex_unlock(&cgroup_mutex);
4003 return ret;
4004}
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4021{
4022 int ret;
4023
4024 if (!cgroup_ssid_enabled(ss->id))
4025 return 0;
4026
4027 if (!cfts || cfts[0].name[0] == '\0')
4028 return 0;
4029
4030 ret = cgroup_init_cftypes(ss, cfts);
4031 if (ret)
4032 return ret;
4033
4034 mutex_lock(&cgroup_mutex);
4035
4036 list_add_tail(&cfts->node, &ss->cfts);
4037 ret = cgroup_apply_cftypes(cfts, true);
4038 if (ret)
4039 cgroup_rm_cftypes_locked(cfts);
4040
4041 mutex_unlock(&cgroup_mutex);
4042 return ret;
4043}
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4054{
4055 struct cftype *cft;
4056
4057 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4058 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4059 return cgroup_add_cftypes(ss, cfts);
4060}
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4071{
4072 struct cftype *cft;
4073
4074 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4075 cft->flags |= __CFTYPE_NOT_ON_DFL;
4076 return cgroup_add_cftypes(ss, cfts);
4077}
4078
4079
4080
4081
4082
4083
4084
4085void cgroup_file_notify(struct cgroup_file *cfile)
4086{
4087 unsigned long flags;
4088
4089 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4090 if (cfile->kn) {
4091 unsigned long last = cfile->notified_at;
4092 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4093
4094 if (time_in_range(jiffies, last, next)) {
4095 timer_reduce(&cfile->notify_timer, next);
4096 } else {
4097 kernfs_notify(cfile->kn);
4098 cfile->notified_at = jiffies;
4099 }
4100 }
4101 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4102}
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4122 struct cgroup_subsys_state *parent)
4123{
4124 struct cgroup_subsys_state *next;
4125
4126 cgroup_assert_mutex_or_rcu_locked();
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148 if (!pos) {
4149 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4150 } else if (likely(!(pos->flags & CSS_RELEASED))) {
4151 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
4152 } else {
4153 list_for_each_entry_rcu(next, &parent->children, sibling,
4154 lockdep_is_held(&cgroup_mutex))
4155 if (next->serial_nr > pos->serial_nr)
4156 break;
4157 }
4158
4159
4160
4161
4162
4163 if (&next->sibling != &parent->children)
4164 return next;
4165 return NULL;
4166}
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189struct cgroup_subsys_state *
4190css_next_descendant_pre(struct cgroup_subsys_state *pos,
4191 struct cgroup_subsys_state *root)
4192{
4193 struct cgroup_subsys_state *next;
4194
4195 cgroup_assert_mutex_or_rcu_locked();
4196
4197
4198 if (!pos)
4199 return root;
4200
4201
4202 next = css_next_child(NULL, pos);
4203 if (next)
4204 return next;
4205
4206
4207 while (pos != root) {
4208 next = css_next_child(pos, pos->parent);
4209 if (next)
4210 return next;
4211 pos = pos->parent;
4212 }
4213
4214 return NULL;
4215}
4216EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231struct cgroup_subsys_state *
4232css_rightmost_descendant(struct cgroup_subsys_state *pos)
4233{
4234 struct cgroup_subsys_state *last, *tmp;
4235
4236 cgroup_assert_mutex_or_rcu_locked();
4237
4238 do {
4239 last = pos;
4240
4241 pos = NULL;
4242 css_for_each_child(tmp, last)
4243 pos = tmp;
4244 } while (pos);
4245
4246 return last;
4247}
4248
4249static struct cgroup_subsys_state *
4250css_leftmost_descendant(struct cgroup_subsys_state *pos)
4251{
4252 struct cgroup_subsys_state *last;
4253
4254 do {
4255 last = pos;
4256 pos = css_next_child(NULL, pos);
4257 } while (pos);
4258
4259 return last;
4260}
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284struct cgroup_subsys_state *
4285css_next_descendant_post(struct cgroup_subsys_state *pos,
4286 struct cgroup_subsys_state *root)
4287{
4288 struct cgroup_subsys_state *next;
4289
4290 cgroup_assert_mutex_or_rcu_locked();
4291
4292
4293 if (!pos)
4294 return css_leftmost_descendant(root);
4295
4296
4297 if (pos == root)
4298 return NULL;
4299
4300
4301 next = css_next_child(pos, pos->parent);
4302 if (next)
4303 return css_leftmost_descendant(next);
4304
4305
4306 return pos->parent;
4307}
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317bool css_has_online_children(struct cgroup_subsys_state *css)
4318{
4319 struct cgroup_subsys_state *child;
4320 bool ret = false;
4321
4322 rcu_read_lock();
4323 css_for_each_child(child, css) {
4324 if (child->flags & CSS_ONLINE) {
4325 ret = true;
4326 break;
4327 }
4328 }
4329 rcu_read_unlock();
4330 return ret;
4331}
4332
4333static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4334{
4335 struct list_head *l;
4336 struct cgrp_cset_link *link;
4337 struct css_set *cset;
4338
4339 lockdep_assert_held(&css_set_lock);
4340
4341
4342 if (it->tcset_pos) {
4343 l = it->tcset_pos->next;
4344
4345 if (l != it->tcset_head) {
4346 it->tcset_pos = l;
4347 return container_of(l, struct css_set,
4348 threaded_csets_node);
4349 }
4350
4351 it->tcset_pos = NULL;
4352 }
4353
4354
4355 l = it->cset_pos;
4356 l = l->next;
4357 if (l == it->cset_head) {
4358 it->cset_pos = NULL;
4359 return NULL;
4360 }
4361
4362 if (it->ss) {
4363 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4364 } else {
4365 link = list_entry(l, struct cgrp_cset_link, cset_link);
4366 cset = link->cset;
4367 }
4368
4369 it->cset_pos = l;
4370
4371
4372 if (it->flags & CSS_TASK_ITER_THREADED) {
4373 if (it->cur_dcset)
4374 put_css_set_locked(it->cur_dcset);
4375 it->cur_dcset = cset;
4376 get_css_set(cset);
4377
4378 it->tcset_head = &cset->threaded_csets;
4379 it->tcset_pos = &cset->threaded_csets;
4380 }
4381
4382 return cset;
4383}
4384
4385
4386
4387
4388
4389
4390
4391static void css_task_iter_advance_css_set(struct css_task_iter *it)
4392{
4393 struct css_set *cset;
4394
4395 lockdep_assert_held(&css_set_lock);
4396
4397
4398 while ((cset = css_task_iter_next_css_set(it))) {
4399 if (!list_empty(&cset->tasks)) {
4400 it->cur_tasks_head = &cset->tasks;
4401 break;
4402 } else if (!list_empty(&cset->mg_tasks)) {
4403 it->cur_tasks_head = &cset->mg_tasks;
4404 break;
4405 } else if (!list_empty(&cset->dying_tasks)) {
4406 it->cur_tasks_head = &cset->dying_tasks;
4407 break;
4408 }
4409 }
4410 if (!cset) {
4411 it->task_pos = NULL;
4412 return;
4413 }
4414 it->task_pos = it->cur_tasks_head->next;
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431 if (it->cur_cset) {
4432 list_del(&it->iters_node);
4433 put_css_set_locked(it->cur_cset);
4434 }
4435 get_css_set(cset);
4436 it->cur_cset = cset;
4437 list_add(&it->iters_node, &cset->task_iters);
4438}
4439
4440static void css_task_iter_skip(struct css_task_iter *it,
4441 struct task_struct *task)
4442{
4443 lockdep_assert_held(&css_set_lock);
4444
4445 if (it->task_pos == &task->cg_list) {
4446 it->task_pos = it->task_pos->next;
4447 it->flags |= CSS_TASK_ITER_SKIPPED;
4448 }
4449}
4450
4451static void css_task_iter_advance(struct css_task_iter *it)
4452{
4453 struct task_struct *task;
4454
4455 lockdep_assert_held(&css_set_lock);
4456repeat:
4457 if (it->task_pos) {
4458
4459
4460
4461
4462
4463 if (it->flags & CSS_TASK_ITER_SKIPPED)
4464 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4465 else
4466 it->task_pos = it->task_pos->next;
4467
4468 if (it->task_pos == &it->cur_cset->tasks) {
4469 it->cur_tasks_head = &it->cur_cset->mg_tasks;
4470 it->task_pos = it->cur_tasks_head->next;
4471 }
4472 if (it->task_pos == &it->cur_cset->mg_tasks) {
4473 it->cur_tasks_head = &it->cur_cset->dying_tasks;
4474 it->task_pos = it->cur_tasks_head->next;
4475 }
4476 if (it->task_pos == &it->cur_cset->dying_tasks)
4477 css_task_iter_advance_css_set(it);
4478 } else {
4479
4480 css_task_iter_advance_css_set(it);
4481 }
4482
4483 if (!it->task_pos)
4484 return;
4485
4486 task = list_entry(it->task_pos, struct task_struct, cg_list);
4487
4488 if (it->flags & CSS_TASK_ITER_PROCS) {
4489
4490 if (!thread_group_leader(task))
4491 goto repeat;
4492
4493
4494 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
4495 !atomic_read(&task->signal->live))
4496 goto repeat;
4497 } else {
4498
4499 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
4500 goto repeat;
4501 }
4502}
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4516 struct css_task_iter *it)
4517{
4518 memset(it, 0, sizeof(*it));
4519
4520 spin_lock_irq(&css_set_lock);
4521
4522 it->ss = css->ss;
4523 it->flags = flags;
4524
4525 if (it->ss)
4526 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4527 else
4528 it->cset_pos = &css->cgroup->cset_links;
4529
4530 it->cset_head = it->cset_pos;
4531
4532 css_task_iter_advance(it);
4533
4534 spin_unlock_irq(&css_set_lock);
4535}
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545struct task_struct *css_task_iter_next(struct css_task_iter *it)
4546{
4547 if (it->cur_task) {
4548 put_task_struct(it->cur_task);
4549 it->cur_task = NULL;
4550 }
4551
4552 spin_lock_irq(&css_set_lock);
4553
4554
4555 if (it->flags & CSS_TASK_ITER_SKIPPED)
4556 css_task_iter_advance(it);
4557
4558 if (it->task_pos) {
4559 it->cur_task = list_entry(it->task_pos, struct task_struct,
4560 cg_list);
4561 get_task_struct(it->cur_task);
4562 css_task_iter_advance(it);
4563 }
4564
4565 spin_unlock_irq(&css_set_lock);
4566
4567 return it->cur_task;
4568}
4569
4570
4571
4572
4573
4574
4575
4576void css_task_iter_end(struct css_task_iter *it)
4577{
4578 if (it->cur_cset) {
4579 spin_lock_irq(&css_set_lock);
4580 list_del(&it->iters_node);
4581 put_css_set_locked(it->cur_cset);
4582 spin_unlock_irq(&css_set_lock);
4583 }
4584
4585 if (it->cur_dcset)
4586 put_css_set(it->cur_dcset);
4587
4588 if (it->cur_task)
4589 put_task_struct(it->cur_task);
4590}
4591
4592static void cgroup_procs_release(struct kernfs_open_file *of)
4593{
4594 if (of->priv) {
4595 css_task_iter_end(of->priv);
4596 kfree(of->priv);
4597 }
4598}
4599
4600static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4601{
4602 struct kernfs_open_file *of = s->private;
4603 struct css_task_iter *it = of->priv;
4604
4605 if (pos)
4606 (*pos)++;
4607
4608 return css_task_iter_next(it);
4609}
4610
4611static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4612 unsigned int iter_flags)
4613{
4614 struct kernfs_open_file *of = s->private;
4615 struct cgroup *cgrp = seq_css(s)->cgroup;
4616 struct css_task_iter *it = of->priv;
4617
4618
4619
4620
4621
4622 if (!it) {
4623 if (WARN_ON_ONCE((*pos)))
4624 return ERR_PTR(-EINVAL);
4625
4626 it = kzalloc(sizeof(*it), GFP_KERNEL);
4627 if (!it)
4628 return ERR_PTR(-ENOMEM);
4629 of->priv = it;
4630 css_task_iter_start(&cgrp->self, iter_flags, it);
4631 } else if (!(*pos)) {
4632 css_task_iter_end(it);
4633 css_task_iter_start(&cgrp->self, iter_flags, it);
4634 } else
4635 return it->cur_task;
4636
4637 return cgroup_procs_next(s, NULL, NULL);
4638}
4639
4640static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4641{
4642 struct cgroup *cgrp = seq_css(s)->cgroup;
4643
4644
4645
4646
4647
4648
4649
4650 if (cgroup_is_threaded(cgrp))
4651 return ERR_PTR(-EOPNOTSUPP);
4652
4653 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4654 CSS_TASK_ITER_THREADED);
4655}
4656
4657static int cgroup_procs_show(struct seq_file *s, void *v)
4658{
4659 seq_printf(s, "%d\n", task_pid_vnr(v));
4660 return 0;
4661}
4662
4663static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4664{
4665 int ret;
4666 struct inode *inode;
4667
4668 lockdep_assert_held(&cgroup_mutex);
4669
4670 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4671 if (!inode)
4672 return -ENOMEM;
4673
4674 ret = inode_permission(inode, MAY_WRITE);
4675 iput(inode);
4676 return ret;
4677}
4678
4679static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4680 struct cgroup *dst_cgrp,
4681 struct super_block *sb)
4682{
4683 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4684 struct cgroup *com_cgrp = src_cgrp;
4685 int ret;
4686
4687 lockdep_assert_held(&cgroup_mutex);
4688
4689
4690 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4691 com_cgrp = cgroup_parent(com_cgrp);
4692
4693
4694 ret = cgroup_may_write(com_cgrp, sb);
4695 if (ret)
4696 return ret;
4697
4698
4699
4700
4701
4702 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4703 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4704 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4705 return -ENOENT;
4706
4707 return 0;
4708}
4709
4710static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4711 struct cgroup *dst_cgrp,
4712 struct super_block *sb, bool threadgroup)
4713{
4714 int ret = 0;
4715
4716 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
4717 if (ret)
4718 return ret;
4719
4720 ret = cgroup_migrate_vet_dst(dst_cgrp);
4721 if (ret)
4722 return ret;
4723
4724 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4725 ret = -EOPNOTSUPP;
4726
4727 return ret;
4728}
4729
4730static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4731 char *buf, size_t nbytes, loff_t off)
4732{
4733 struct cgroup *src_cgrp, *dst_cgrp;
4734 struct task_struct *task;
4735 ssize_t ret;
4736 bool locked;
4737
4738 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4739 if (!dst_cgrp)
4740 return -ENODEV;
4741
4742 task = cgroup_procs_write_start(buf, true, &locked);
4743 ret = PTR_ERR_OR_ZERO(task);
4744 if (ret)
4745 goto out_unlock;
4746
4747
4748 spin_lock_irq(&css_set_lock);
4749 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4750 spin_unlock_irq(&css_set_lock);
4751
4752 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4753 of->file->f_path.dentry->d_sb, true);
4754 if (ret)
4755 goto out_finish;
4756
4757 ret = cgroup_attach_task(dst_cgrp, task, true);
4758
4759out_finish:
4760 cgroup_procs_write_finish(task, locked);
4761out_unlock:
4762 cgroup_kn_unlock(of->kn);
4763
4764 return ret ?: nbytes;
4765}
4766
4767static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4768{
4769 return __cgroup_procs_start(s, pos, 0);
4770}
4771
4772static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4773 char *buf, size_t nbytes, loff_t off)
4774{
4775 struct cgroup *src_cgrp, *dst_cgrp;
4776 struct task_struct *task;
4777 ssize_t ret;
4778 bool locked;
4779
4780 buf = strstrip(buf);
4781
4782 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4783 if (!dst_cgrp)
4784 return -ENODEV;
4785
4786 task = cgroup_procs_write_start(buf, false, &locked);
4787 ret = PTR_ERR_OR_ZERO(task);
4788 if (ret)
4789 goto out_unlock;
4790
4791
4792 spin_lock_irq(&css_set_lock);
4793 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4794 spin_unlock_irq(&css_set_lock);
4795
4796
4797 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4798 of->file->f_path.dentry->d_sb, false);
4799 if (ret)
4800 goto out_finish;
4801
4802 ret = cgroup_attach_task(dst_cgrp, task, false);
4803
4804out_finish:
4805 cgroup_procs_write_finish(task, locked);
4806out_unlock:
4807 cgroup_kn_unlock(of->kn);
4808
4809 return ret ?: nbytes;
4810}
4811
4812
4813static struct cftype cgroup_base_files[] = {
4814 {
4815 .name = "cgroup.type",
4816 .flags = CFTYPE_NOT_ON_ROOT,
4817 .seq_show = cgroup_type_show,
4818 .write = cgroup_type_write,
4819 },
4820 {
4821 .name = "cgroup.procs",
4822 .flags = CFTYPE_NS_DELEGATABLE,
4823 .file_offset = offsetof(struct cgroup, procs_file),
4824 .release = cgroup_procs_release,
4825 .seq_start = cgroup_procs_start,
4826 .seq_next = cgroup_procs_next,
4827 .seq_show = cgroup_procs_show,
4828 .write = cgroup_procs_write,
4829 },
4830 {
4831 .name = "cgroup.threads",
4832 .flags = CFTYPE_NS_DELEGATABLE,
4833 .release = cgroup_procs_release,
4834 .seq_start = cgroup_threads_start,
4835 .seq_next = cgroup_procs_next,
4836 .seq_show = cgroup_procs_show,
4837 .write = cgroup_threads_write,
4838 },
4839 {
4840 .name = "cgroup.controllers",
4841 .seq_show = cgroup_controllers_show,
4842 },
4843 {
4844 .name = "cgroup.subtree_control",
4845 .flags = CFTYPE_NS_DELEGATABLE,
4846 .seq_show = cgroup_subtree_control_show,
4847 .write = cgroup_subtree_control_write,
4848 },
4849 {
4850 .name = "cgroup.events",
4851 .flags = CFTYPE_NOT_ON_ROOT,
4852 .file_offset = offsetof(struct cgroup, events_file),
4853 .seq_show = cgroup_events_show,
4854 },
4855 {
4856 .name = "cgroup.max.descendants",
4857 .seq_show = cgroup_max_descendants_show,
4858 .write = cgroup_max_descendants_write,
4859 },
4860 {
4861 .name = "cgroup.max.depth",
4862 .seq_show = cgroup_max_depth_show,
4863 .write = cgroup_max_depth_write,
4864 },
4865 {
4866 .name = "cgroup.stat",
4867 .seq_show = cgroup_stat_show,
4868 },
4869 {
4870 .name = "cgroup.freeze",
4871 .flags = CFTYPE_NOT_ON_ROOT,
4872 .seq_show = cgroup_freeze_show,
4873 .write = cgroup_freeze_write,
4874 },
4875 {
4876 .name = "cpu.stat",
4877 .seq_show = cpu_stat_show,
4878 },
4879#ifdef CONFIG_PSI
4880 {
4881 .name = "io.pressure",
4882 .seq_show = cgroup_io_pressure_show,
4883 .write = cgroup_io_pressure_write,
4884 .poll = cgroup_pressure_poll,
4885 .release = cgroup_pressure_release,
4886 },
4887 {
4888 .name = "memory.pressure",
4889 .seq_show = cgroup_memory_pressure_show,
4890 .write = cgroup_memory_pressure_write,
4891 .poll = cgroup_pressure_poll,
4892 .release = cgroup_pressure_release,
4893 },
4894 {
4895 .name = "cpu.pressure",
4896 .seq_show = cgroup_cpu_pressure_show,
4897 .write = cgroup_cpu_pressure_write,
4898 .poll = cgroup_pressure_poll,
4899 .release = cgroup_pressure_release,
4900 },
4901#endif
4902 { }
4903};
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927static void css_free_rwork_fn(struct work_struct *work)
4928{
4929 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
4930 struct cgroup_subsys_state, destroy_rwork);
4931 struct cgroup_subsys *ss = css->ss;
4932 struct cgroup *cgrp = css->cgroup;
4933
4934 percpu_ref_exit(&css->refcnt);
4935
4936 if (ss) {
4937
4938 struct cgroup_subsys_state *parent = css->parent;
4939 int id = css->id;
4940
4941 ss->css_free(css);
4942 cgroup_idr_remove(&ss->css_idr, id);
4943 cgroup_put(cgrp);
4944
4945 if (parent)
4946 css_put(parent);
4947 } else {
4948
4949 atomic_dec(&cgrp->root->nr_cgrps);
4950 cgroup1_pidlist_destroy_all(cgrp);
4951 cancel_work_sync(&cgrp->release_agent_work);
4952
4953 if (cgroup_parent(cgrp)) {
4954
4955
4956
4957
4958
4959
4960 cgroup_put(cgroup_parent(cgrp));
4961 kernfs_put(cgrp->kn);
4962 psi_cgroup_free(cgrp);
4963 if (cgroup_on_dfl(cgrp))
4964 cgroup_rstat_exit(cgrp);
4965 kfree(cgrp);
4966 } else {
4967
4968
4969
4970
4971
4972 cgroup_destroy_root(cgrp->root);
4973 }
4974 }
4975}
4976
4977static void css_release_work_fn(struct work_struct *work)
4978{
4979 struct cgroup_subsys_state *css =
4980 container_of(work, struct cgroup_subsys_state, destroy_work);
4981 struct cgroup_subsys *ss = css->ss;
4982 struct cgroup *cgrp = css->cgroup;
4983
4984 mutex_lock(&cgroup_mutex);
4985
4986 css->flags |= CSS_RELEASED;
4987 list_del_rcu(&css->sibling);
4988
4989 if (ss) {
4990
4991 if (!list_empty(&css->rstat_css_node)) {
4992 cgroup_rstat_flush(cgrp);
4993 list_del_rcu(&css->rstat_css_node);
4994 }
4995
4996 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4997 if (ss->css_released)
4998 ss->css_released(css);
4999 } else {
5000 struct cgroup *tcgrp;
5001
5002
5003 TRACE_CGROUP_PATH(release, cgrp);
5004
5005 if (cgroup_on_dfl(cgrp))
5006 cgroup_rstat_flush(cgrp);
5007
5008 spin_lock_irq(&css_set_lock);
5009 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5010 tcgrp = cgroup_parent(tcgrp))
5011 tcgrp->nr_dying_descendants--;
5012 spin_unlock_irq(&css_set_lock);
5013
5014
5015
5016
5017
5018
5019
5020
5021 if (cgrp->kn)
5022 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5023 NULL);
5024 }
5025
5026 mutex_unlock(&cgroup_mutex);
5027
5028 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5029 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5030}
5031
5032static void css_release(struct percpu_ref *ref)
5033{
5034 struct cgroup_subsys_state *css =
5035 container_of(ref, struct cgroup_subsys_state, refcnt);
5036
5037 INIT_WORK(&css->destroy_work, css_release_work_fn);
5038 queue_work(cgroup_destroy_wq, &css->destroy_work);
5039}
5040
5041static void init_and_link_css(struct cgroup_subsys_state *css,
5042 struct cgroup_subsys *ss, struct cgroup *cgrp)
5043{
5044 lockdep_assert_held(&cgroup_mutex);
5045
5046 cgroup_get_live(cgrp);
5047
5048 memset(css, 0, sizeof(*css));
5049 css->cgroup = cgrp;
5050 css->ss = ss;
5051 css->id = -1;
5052 INIT_LIST_HEAD(&css->sibling);
5053 INIT_LIST_HEAD(&css->children);
5054 INIT_LIST_HEAD(&css->rstat_css_node);
5055 css->serial_nr = css_serial_nr_next++;
5056 atomic_set(&css->online_cnt, 0);
5057
5058 if (cgroup_parent(cgrp)) {
5059 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5060 css_get(css->parent);
5061 }
5062
5063 if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
5064 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5065
5066 BUG_ON(cgroup_css(cgrp, ss));
5067}
5068
5069
5070static int online_css(struct cgroup_subsys_state *css)
5071{
5072 struct cgroup_subsys *ss = css->ss;
5073 int ret = 0;
5074
5075 lockdep_assert_held(&cgroup_mutex);
5076
5077 if (ss->css_online)
5078 ret = ss->css_online(css);
5079 if (!ret) {
5080 css->flags |= CSS_ONLINE;
5081 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5082
5083 atomic_inc(&css->online_cnt);
5084 if (css->parent)
5085 atomic_inc(&css->parent->online_cnt);
5086 }
5087 return ret;
5088}
5089
5090
5091static void offline_css(struct cgroup_subsys_state *css)
5092{
5093 struct cgroup_subsys *ss = css->ss;
5094
5095 lockdep_assert_held(&cgroup_mutex);
5096
5097 if (!(css->flags & CSS_ONLINE))
5098 return;
5099
5100 if (ss->css_offline)
5101 ss->css_offline(css);
5102
5103 css->flags &= ~CSS_ONLINE;
5104 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5105
5106 wake_up_all(&css->cgroup->offline_waitq);
5107}
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5119 struct cgroup_subsys *ss)
5120{
5121 struct cgroup *parent = cgroup_parent(cgrp);
5122 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5123 struct cgroup_subsys_state *css;
5124 int err;
5125
5126 lockdep_assert_held(&cgroup_mutex);
5127
5128 css = ss->css_alloc(parent_css);
5129 if (!css)
5130 css = ERR_PTR(-ENOMEM);
5131 if (IS_ERR(css))
5132 return css;
5133
5134 init_and_link_css(css, ss, cgrp);
5135
5136 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5137 if (err)
5138 goto err_free_css;
5139
5140 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5141 if (err < 0)
5142 goto err_free_css;
5143 css->id = err;
5144
5145
5146 list_add_tail_rcu(&css->sibling, &parent_css->children);
5147 cgroup_idr_replace(&ss->css_idr, css, css->id);
5148
5149 err = online_css(css);
5150 if (err)
5151 goto err_list_del;
5152
5153 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
5154 cgroup_parent(parent)) {
5155 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
5156 current->comm, current->pid, ss->name);
5157 if (!strcmp(ss->name, "memory"))
5158 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
5159 ss->warned_broken_hierarchy = true;
5160 }
5161
5162 return css;
5163
5164err_list_del:
5165 list_del_rcu(&css->sibling);
5166err_free_css:
5167 list_del_rcu(&css->rstat_css_node);
5168 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5169 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5170 return ERR_PTR(err);
5171}
5172
5173
5174
5175
5176
5177
5178static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5179 umode_t mode)
5180{
5181 struct cgroup_root *root = parent->root;
5182 struct cgroup *cgrp, *tcgrp;
5183 struct kernfs_node *kn;
5184 int level = parent->level + 1;
5185 int ret;
5186
5187
5188 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5189 GFP_KERNEL);
5190 if (!cgrp)
5191 return ERR_PTR(-ENOMEM);
5192
5193 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5194 if (ret)
5195 goto out_free_cgrp;
5196
5197 if (cgroup_on_dfl(parent)) {
5198 ret = cgroup_rstat_init(cgrp);
5199 if (ret)
5200 goto out_cancel_ref;
5201 }
5202
5203
5204 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5205 if (IS_ERR(kn)) {
5206 ret = PTR_ERR(kn);
5207 goto out_stat_exit;
5208 }
5209 cgrp->kn = kn;
5210
5211 init_cgroup_housekeeping(cgrp);
5212
5213 cgrp->self.parent = &parent->self;
5214 cgrp->root = root;
5215 cgrp->level = level;
5216
5217 ret = psi_cgroup_alloc(cgrp);
5218 if (ret)
5219 goto out_kernfs_remove;
5220
5221 ret = cgroup_bpf_inherit(cgrp);
5222 if (ret)
5223 goto out_psi_free;
5224
5225
5226
5227
5228
5229 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5230 if (cgrp->freezer.e_freeze) {
5231
5232
5233
5234
5235
5236
5237 set_bit(CGRP_FREEZE, &cgrp->flags);
5238 set_bit(CGRP_FROZEN, &cgrp->flags);
5239 }
5240
5241 spin_lock_irq(&css_set_lock);
5242 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5243 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
5244
5245 if (tcgrp != cgrp) {
5246 tcgrp->nr_descendants++;
5247
5248
5249
5250
5251
5252
5253 if (cgrp->freezer.e_freeze)
5254 tcgrp->freezer.nr_frozen_descendants++;
5255 }
5256 }
5257 spin_unlock_irq(&css_set_lock);
5258
5259 if (notify_on_release(parent))
5260 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5261
5262 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5263 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5264
5265 cgrp->self.serial_nr = css_serial_nr_next++;
5266
5267
5268 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5269 atomic_inc(&root->nr_cgrps);
5270 cgroup_get_live(parent);
5271
5272
5273
5274
5275
5276 if (!cgroup_on_dfl(cgrp))
5277 cgrp->subtree_control = cgroup_control(cgrp);
5278
5279 cgroup_propagate_control(cgrp);
5280
5281 return cgrp;
5282
5283out_psi_free:
5284 psi_cgroup_free(cgrp);
5285out_kernfs_remove:
5286 kernfs_remove(cgrp->kn);
5287out_stat_exit:
5288 if (cgroup_on_dfl(parent))
5289 cgroup_rstat_exit(cgrp);
5290out_cancel_ref:
5291 percpu_ref_exit(&cgrp->self.refcnt);
5292out_free_cgrp:
5293 kfree(cgrp);
5294 return ERR_PTR(ret);
5295}
5296
5297static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5298{
5299 struct cgroup *cgroup;
5300 int ret = false;
5301 int level = 1;
5302
5303 lockdep_assert_held(&cgroup_mutex);
5304
5305 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5306 if (cgroup->nr_descendants >= cgroup->max_descendants)
5307 goto fail;
5308
5309 if (level > cgroup->max_depth)
5310 goto fail;
5311
5312 level++;
5313 }
5314
5315 ret = true;
5316fail:
5317 return ret;
5318}
5319
5320int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5321{
5322 struct cgroup *parent, *cgrp;
5323 int ret;
5324
5325
5326 if (strchr(name, '\n'))
5327 return -EINVAL;
5328
5329 parent = cgroup_kn_lock_live(parent_kn, false);
5330 if (!parent)
5331 return -ENODEV;
5332
5333 if (!cgroup_check_hierarchy_limits(parent)) {
5334 ret = -EAGAIN;
5335 goto out_unlock;
5336 }
5337
5338 cgrp = cgroup_create(parent, name, mode);
5339 if (IS_ERR(cgrp)) {
5340 ret = PTR_ERR(cgrp);
5341 goto out_unlock;
5342 }
5343
5344
5345
5346
5347
5348 kernfs_get(cgrp->kn);
5349
5350 ret = cgroup_kn_set_ugid(cgrp->kn);
5351 if (ret)
5352 goto out_destroy;
5353
5354 ret = css_populate_dir(&cgrp->self);
5355 if (ret)
5356 goto out_destroy;
5357
5358 ret = cgroup_apply_control_enable(cgrp);
5359 if (ret)
5360 goto out_destroy;
5361
5362 TRACE_CGROUP_PATH(mkdir, cgrp);
5363
5364
5365 kernfs_activate(cgrp->kn);
5366
5367 ret = 0;
5368 goto out_unlock;
5369
5370out_destroy:
5371 cgroup_destroy_locked(cgrp);
5372out_unlock:
5373 cgroup_kn_unlock(parent_kn);
5374 return ret;
5375}
5376
5377
5378
5379
5380
5381
5382static void css_killed_work_fn(struct work_struct *work)
5383{
5384 struct cgroup_subsys_state *css =
5385 container_of(work, struct cgroup_subsys_state, destroy_work);
5386
5387 mutex_lock(&cgroup_mutex);
5388
5389 do {
5390 offline_css(css);
5391 css_put(css);
5392
5393 css = css->parent;
5394 } while (css && atomic_dec_and_test(&css->online_cnt));
5395
5396 mutex_unlock(&cgroup_mutex);
5397}
5398
5399
5400static void css_killed_ref_fn(struct percpu_ref *ref)
5401{
5402 struct cgroup_subsys_state *css =
5403 container_of(ref, struct cgroup_subsys_state, refcnt);
5404
5405 if (atomic_dec_and_test(&css->online_cnt)) {
5406 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5407 queue_work(cgroup_destroy_wq, &css->destroy_work);
5408 }
5409}
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420static void kill_css(struct cgroup_subsys_state *css)
5421{
5422 lockdep_assert_held(&cgroup_mutex);
5423
5424 if (css->flags & CSS_DYING)
5425 return;
5426
5427 css->flags |= CSS_DYING;
5428
5429
5430
5431
5432
5433 css_clear_dir(css);
5434
5435
5436
5437
5438
5439 css_get(css);
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5452}
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478static int cgroup_destroy_locked(struct cgroup *cgrp)
5479 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5480{
5481 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5482 struct cgroup_subsys_state *css;
5483 struct cgrp_cset_link *link;
5484 int ssid;
5485
5486 lockdep_assert_held(&cgroup_mutex);
5487
5488
5489
5490
5491
5492 if (cgroup_is_populated(cgrp))
5493 return -EBUSY;
5494
5495
5496
5497
5498
5499
5500 if (css_has_online_children(&cgrp->self))
5501 return -EBUSY;
5502
5503
5504
5505
5506
5507
5508
5509 cgrp->self.flags &= ~CSS_ONLINE;
5510
5511 spin_lock_irq(&css_set_lock);
5512 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5513 link->cset->dead = true;
5514 spin_unlock_irq(&css_set_lock);
5515
5516
5517 for_each_css(css, ssid, cgrp)
5518 kill_css(css);
5519
5520
5521 css_clear_dir(&cgrp->self);
5522 kernfs_remove(cgrp->kn);
5523
5524 if (parent && cgroup_is_threaded(cgrp))
5525 parent->nr_threaded_children--;
5526
5527 spin_lock_irq(&css_set_lock);
5528 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5529 tcgrp->nr_descendants--;
5530 tcgrp->nr_dying_descendants++;
5531
5532
5533
5534
5535 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5536 tcgrp->freezer.nr_frozen_descendants--;
5537 }
5538 spin_unlock_irq(&css_set_lock);
5539
5540 cgroup1_check_for_release(parent);
5541
5542 cgroup_bpf_offline(cgrp);
5543
5544
5545 percpu_ref_kill(&cgrp->self.refcnt);
5546
5547 return 0;
5548};
5549
5550int cgroup_rmdir(struct kernfs_node *kn)
5551{
5552 struct cgroup *cgrp;
5553 int ret = 0;
5554
5555 cgrp = cgroup_kn_lock_live(kn, false);
5556 if (!cgrp)
5557 return 0;
5558
5559 ret = cgroup_destroy_locked(cgrp);
5560 if (!ret)
5561 TRACE_CGROUP_PATH(rmdir, cgrp);
5562
5563 cgroup_kn_unlock(kn);
5564 return ret;
5565}
5566
5567static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5568 .show_options = cgroup_show_options,
5569 .mkdir = cgroup_mkdir,
5570 .rmdir = cgroup_rmdir,
5571 .show_path = cgroup_show_path,
5572};
5573
5574static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5575{
5576 struct cgroup_subsys_state *css;
5577
5578 pr_debug("Initializing cgroup subsys %s\n", ss->name);
5579
5580 mutex_lock(&cgroup_mutex);
5581
5582 idr_init(&ss->css_idr);
5583 INIT_LIST_HEAD(&ss->cfts);
5584
5585
5586 ss->root = &cgrp_dfl_root;
5587 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5588
5589 BUG_ON(IS_ERR(css));
5590 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5591
5592
5593
5594
5595
5596 css->flags |= CSS_NO_REF;
5597
5598 if (early) {
5599
5600 css->id = 1;
5601 } else {
5602 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5603 BUG_ON(css->id < 0);
5604 }
5605
5606
5607
5608
5609
5610 init_css_set.subsys[ss->id] = css;
5611
5612 have_fork_callback |= (bool)ss->fork << ss->id;
5613 have_exit_callback |= (bool)ss->exit << ss->id;
5614 have_release_callback |= (bool)ss->release << ss->id;
5615 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5616
5617
5618
5619
5620 BUG_ON(!list_empty(&init_task.tasks));
5621
5622 BUG_ON(online_css(css));
5623
5624 mutex_unlock(&cgroup_mutex);
5625}
5626
5627
5628
5629
5630
5631
5632
5633int __init cgroup_init_early(void)
5634{
5635 static struct cgroup_fs_context __initdata ctx;
5636 struct cgroup_subsys *ss;
5637 int i;
5638
5639 ctx.root = &cgrp_dfl_root;
5640 init_cgroup_root(&ctx);
5641 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5642
5643 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5644
5645 for_each_subsys(ss, i) {
5646 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5647 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5648 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5649 ss->id, ss->name);
5650 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5651 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5652
5653 ss->id = i;
5654 ss->name = cgroup_subsys_name[i];
5655 if (!ss->legacy_name)
5656 ss->legacy_name = cgroup_subsys_name[i];
5657
5658 if (ss->early_init)
5659 cgroup_init_subsys(ss, true);
5660 }
5661 return 0;
5662}
5663
5664static u16 cgroup_disable_mask __initdata;
5665
5666
5667
5668
5669
5670
5671
5672int __init cgroup_init(void)
5673{
5674 struct cgroup_subsys *ss;
5675 int ssid;
5676
5677 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5678 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5679 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5680
5681 cgroup_rstat_boot();
5682
5683
5684
5685
5686
5687 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
5688
5689 get_user_ns(init_cgroup_ns.user_ns);
5690
5691 mutex_lock(&cgroup_mutex);
5692
5693
5694
5695
5696
5697 hash_add(css_set_table, &init_css_set.hlist,
5698 css_set_hash(init_css_set.subsys));
5699
5700 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5701
5702 mutex_unlock(&cgroup_mutex);
5703
5704 for_each_subsys(ss, ssid) {
5705 if (ss->early_init) {
5706 struct cgroup_subsys_state *css =
5707 init_css_set.subsys[ss->id];
5708
5709 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5710 GFP_KERNEL);
5711 BUG_ON(css->id < 0);
5712 } else {
5713 cgroup_init_subsys(ss, false);
5714 }
5715
5716 list_add_tail(&init_css_set.e_cset_node[ssid],
5717 &cgrp_dfl_root.cgrp.e_csets[ssid]);
5718
5719
5720
5721
5722
5723
5724 if (cgroup_disable_mask & (1 << ssid)) {
5725 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5726 printk(KERN_INFO "Disabling %s control group subsystem\n",
5727 ss->name);
5728 continue;
5729 }
5730
5731 if (cgroup1_ssid_disabled(ssid))
5732 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5733 ss->name);
5734
5735 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5736
5737
5738 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5739
5740 if (ss->implicit_on_dfl)
5741 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
5742 else if (!ss->dfl_cftypes)
5743 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5744
5745 if (ss->threaded)
5746 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5747
5748 if (ss->dfl_cftypes == ss->legacy_cftypes) {
5749 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5750 } else {
5751 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5752 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5753 }
5754
5755 if (ss->bind)
5756 ss->bind(init_css_set.subsys[ssid]);
5757
5758 mutex_lock(&cgroup_mutex);
5759 css_populate_dir(init_css_set.subsys[ssid]);
5760 mutex_unlock(&cgroup_mutex);
5761 }
5762
5763
5764 hash_del(&init_css_set.hlist);
5765 hash_add(css_set_table, &init_css_set.hlist,
5766 css_set_hash(init_css_set.subsys));
5767
5768 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5769 WARN_ON(register_filesystem(&cgroup_fs_type));
5770 WARN_ON(register_filesystem(&cgroup2_fs_type));
5771 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
5772#ifdef CONFIG_CPUSETS
5773 WARN_ON(register_filesystem(&cpuset_fs_type));
5774#endif
5775
5776 return 0;
5777}
5778
5779static int __init cgroup_wq_init(void)
5780{
5781
5782
5783
5784
5785
5786
5787
5788
5789 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5790 BUG_ON(!cgroup_destroy_wq);
5791 return 0;
5792}
5793core_initcall(cgroup_wq_init);
5794
5795void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
5796{
5797 struct kernfs_node *kn;
5798
5799 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
5800 if (!kn)
5801 return;
5802 kernfs_path(kn, buf, buflen);
5803 kernfs_put(kn);
5804}
5805
5806
5807
5808
5809
5810
5811int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5812 struct pid *pid, struct task_struct *tsk)
5813{
5814 char *buf;
5815 int retval;
5816 struct cgroup_root *root;
5817
5818 retval = -ENOMEM;
5819 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5820 if (!buf)
5821 goto out;
5822
5823 mutex_lock(&cgroup_mutex);
5824 spin_lock_irq(&css_set_lock);
5825
5826 for_each_root(root) {
5827 struct cgroup_subsys *ss;
5828 struct cgroup *cgrp;
5829 int ssid, count = 0;
5830
5831 if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
5832 continue;
5833
5834 seq_printf(m, "%d:", root->hierarchy_id);
5835 if (root != &cgrp_dfl_root)
5836 for_each_subsys(ss, ssid)
5837 if (root->subsys_mask & (1 << ssid))
5838 seq_printf(m, "%s%s", count++ ? "," : "",
5839 ss->legacy_name);
5840 if (strlen(root->name))
5841 seq_printf(m, "%sname=%s", count ? "," : "",
5842 root->name);
5843 seq_putc(m, ':');
5844
5845 cgrp = task_cgroup_from_root(tsk, root);
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5857 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
5858 current->nsproxy->cgroup_ns);
5859 if (retval >= PATH_MAX)
5860 retval = -ENAMETOOLONG;
5861 if (retval < 0)
5862 goto out_unlock;
5863
5864 seq_puts(m, buf);
5865 } else {
5866 seq_puts(m, "/");
5867 }
5868
5869 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5870 seq_puts(m, " (deleted)\n");
5871 else
5872 seq_putc(m, '\n');
5873 }
5874
5875 retval = 0;
5876out_unlock:
5877 spin_unlock_irq(&css_set_lock);
5878 mutex_unlock(&cgroup_mutex);
5879 kfree(buf);
5880out:
5881 return retval;
5882}
5883
5884
5885
5886
5887
5888
5889
5890
5891void cgroup_fork(struct task_struct *child)
5892{
5893 RCU_INIT_POINTER(child->cgroups, &init_css_set);
5894 INIT_LIST_HEAD(&child->cg_list);
5895}
5896
5897static struct cgroup *cgroup_get_from_file(struct file *f)
5898{
5899 struct cgroup_subsys_state *css;
5900 struct cgroup *cgrp;
5901
5902 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
5903 if (IS_ERR(css))
5904 return ERR_CAST(css);
5905
5906 cgrp = css->cgroup;
5907 if (!cgroup_on_dfl(cgrp)) {
5908 cgroup_put(cgrp);
5909 return ERR_PTR(-EBADF);
5910 }
5911
5912 return cgrp;
5913}
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
5932 __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
5933{
5934 int ret;
5935 struct cgroup *dst_cgrp = NULL;
5936 struct css_set *cset;
5937 struct super_block *sb;
5938 struct file *f;
5939
5940 if (kargs->flags & CLONE_INTO_CGROUP)
5941 mutex_lock(&cgroup_mutex);
5942
5943 cgroup_threadgroup_change_begin(current);
5944
5945 spin_lock_irq(&css_set_lock);
5946 cset = task_css_set(current);
5947 get_css_set(cset);
5948 spin_unlock_irq(&css_set_lock);
5949
5950 if (!(kargs->flags & CLONE_INTO_CGROUP)) {
5951 kargs->cset = cset;
5952 return 0;
5953 }
5954
5955 f = fget_raw(kargs->cgroup);
5956 if (!f) {
5957 ret = -EBADF;
5958 goto err;
5959 }
5960 sb = f->f_path.dentry->d_sb;
5961
5962 dst_cgrp = cgroup_get_from_file(f);
5963 if (IS_ERR(dst_cgrp)) {
5964 ret = PTR_ERR(dst_cgrp);
5965 dst_cgrp = NULL;
5966 goto err;
5967 }
5968
5969 if (cgroup_is_dead(dst_cgrp)) {
5970 ret = -ENODEV;
5971 goto err;
5972 }
5973
5974
5975
5976
5977
5978
5979 ret = cgroup_may_write(dst_cgrp, sb);
5980 if (ret)
5981 goto err;
5982
5983 ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
5984 !(kargs->flags & CLONE_THREAD));
5985 if (ret)
5986 goto err;
5987
5988 kargs->cset = find_css_set(cset, dst_cgrp);
5989 if (!kargs->cset) {
5990 ret = -ENOMEM;
5991 goto err;
5992 }
5993
5994 put_css_set(cset);
5995 fput(f);
5996 kargs->cgrp = dst_cgrp;
5997 return ret;
5998
5999err:
6000 cgroup_threadgroup_change_end(current);
6001 mutex_unlock(&cgroup_mutex);
6002 if (f)
6003 fput(f);
6004 if (dst_cgrp)
6005 cgroup_put(dst_cgrp);
6006 put_css_set(cset);
6007 if (kargs->cset)
6008 put_css_set(kargs->cset);
6009 return ret;
6010}
6011
6012
6013
6014
6015
6016
6017
6018
6019static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6020 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6021{
6022 cgroup_threadgroup_change_end(current);
6023
6024 if (kargs->flags & CLONE_INTO_CGROUP) {
6025 struct cgroup *cgrp = kargs->cgrp;
6026 struct css_set *cset = kargs->cset;
6027
6028 mutex_unlock(&cgroup_mutex);
6029
6030 if (cset) {
6031 put_css_set(cset);
6032 kargs->cset = NULL;
6033 }
6034
6035 if (cgrp) {
6036 cgroup_put(cgrp);
6037 kargs->cgrp = NULL;
6038 }
6039 }
6040}
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
6053{
6054 struct cgroup_subsys *ss;
6055 int i, j, ret;
6056
6057 ret = cgroup_css_set_fork(kargs);
6058 if (ret)
6059 return ret;
6060
6061 do_each_subsys_mask(ss, i, have_canfork_callback) {
6062 ret = ss->can_fork(child, kargs->cset);
6063 if (ret)
6064 goto out_revert;
6065 } while_each_subsys_mask();
6066
6067 return 0;
6068
6069out_revert:
6070 for_each_subsys(ss, j) {
6071 if (j >= i)
6072 break;
6073 if (ss->cancel_fork)
6074 ss->cancel_fork(child, kargs->cset);
6075 }
6076
6077 cgroup_css_set_put_fork(kargs);
6078
6079 return ret;
6080}
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091void cgroup_cancel_fork(struct task_struct *child,
6092 struct kernel_clone_args *kargs)
6093{
6094 struct cgroup_subsys *ss;
6095 int i;
6096
6097 for_each_subsys(ss, i)
6098 if (ss->cancel_fork)
6099 ss->cancel_fork(child, kargs->cset);
6100
6101 cgroup_css_set_put_fork(kargs);
6102}
6103
6104
6105
6106
6107
6108
6109
6110
6111void cgroup_post_fork(struct task_struct *child,
6112 struct kernel_clone_args *kargs)
6113 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6114{
6115 struct cgroup_subsys *ss;
6116 struct css_set *cset;
6117 int i;
6118
6119 cset = kargs->cset;
6120 kargs->cset = NULL;
6121
6122 spin_lock_irq(&css_set_lock);
6123
6124
6125 if (likely(child->pid)) {
6126 WARN_ON_ONCE(!list_empty(&child->cg_list));
6127 cset->nr_tasks++;
6128 css_set_move_task(child, NULL, cset, false);
6129 } else {
6130 put_css_set(cset);
6131 cset = NULL;
6132 }
6133
6134
6135
6136
6137
6138
6139 if (unlikely(cgroup_task_freeze(child))) {
6140 spin_lock(&child->sighand->siglock);
6141 WARN_ON_ONCE(child->frozen);
6142 child->jobctl |= JOBCTL_TRAP_FREEZE;
6143 spin_unlock(&child->sighand->siglock);
6144
6145
6146
6147
6148
6149
6150
6151 }
6152
6153 spin_unlock_irq(&css_set_lock);
6154
6155
6156
6157
6158
6159
6160 do_each_subsys_mask(ss, i, have_fork_callback) {
6161 ss->fork(child);
6162 } while_each_subsys_mask();
6163
6164
6165 if (kargs->flags & CLONE_NEWCGROUP) {
6166 struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6167
6168 get_css_set(cset);
6169 child->nsproxy->cgroup_ns->root_cset = cset;
6170 put_css_set(rcset);
6171 }
6172
6173 cgroup_css_set_put_fork(kargs);
6174}
6175
6176
6177
6178
6179
6180
6181
6182
6183void cgroup_exit(struct task_struct *tsk)
6184{
6185 struct cgroup_subsys *ss;
6186 struct css_set *cset;
6187 int i;
6188
6189 spin_lock_irq(&css_set_lock);
6190
6191 WARN_ON_ONCE(list_empty(&tsk->cg_list));
6192 cset = task_css_set(tsk);
6193 css_set_move_task(tsk, cset, NULL, false);
6194 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6195 cset->nr_tasks--;
6196
6197 WARN_ON_ONCE(cgroup_task_frozen(tsk));
6198 if (unlikely(cgroup_task_freeze(tsk)))
6199 cgroup_update_frozen(task_dfl_cgroup(tsk));
6200
6201 spin_unlock_irq(&css_set_lock);
6202
6203
6204 do_each_subsys_mask(ss, i, have_exit_callback) {
6205 ss->exit(tsk);
6206 } while_each_subsys_mask();
6207}
6208
6209void cgroup_release(struct task_struct *task)
6210{
6211 struct cgroup_subsys *ss;
6212 int ssid;
6213
6214 do_each_subsys_mask(ss, ssid, have_release_callback) {
6215 ss->release(task);
6216 } while_each_subsys_mask();
6217
6218 spin_lock_irq(&css_set_lock);
6219 css_set_skip_task_iters(task_css_set(task), task);
6220 list_del_init(&task->cg_list);
6221 spin_unlock_irq(&css_set_lock);
6222}
6223
/**
 * cgroup_free - drop the css_set reference of a task
 * @task: the task whose css_set is put
 *
 * Calls put_css_set() on the css_set @task currently points to.
 */
void cgroup_free(struct task_struct *task)
{
	put_css_set(task_css_set(task));
}
6229
6230static int __init cgroup_disable(char *str)
6231{
6232 struct cgroup_subsys *ss;
6233 char *token;
6234 int i;
6235
6236 while ((token = strsep(&str, ",")) != NULL) {
6237 if (!*token)
6238 continue;
6239
6240 for_each_subsys(ss, i) {
6241 if (strcmp(token, ss->name) &&
6242 strcmp(token, ss->legacy_name))
6243 continue;
6244 cgroup_disable_mask |= 1 << i;
6245 }
6246 }
6247 return 1;
6248}
6249__setup("cgroup_disable=", cgroup_disable);
6250
6251void __init __weak enable_debug_cgroup(void) { }
6252
6253static int __init enable_cgroup_debug(char *str)
6254{
6255 cgroup_debug = true;
6256 enable_debug_cgroup();
6257 return 1;
6258}
6259__setup("cgroup_debug", enable_cgroup_debug);
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6271 struct cgroup_subsys *ss)
6272{
6273 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6274 struct file_system_type *s_type = dentry->d_sb->s_type;
6275 struct cgroup_subsys_state *css = NULL;
6276 struct cgroup *cgrp;
6277
6278
6279 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6280 !kn || kernfs_type(kn) != KERNFS_DIR)
6281 return ERR_PTR(-EBADF);
6282
6283 rcu_read_lock();
6284
6285
6286
6287
6288
6289
6290 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6291 if (cgrp)
6292 css = cgroup_css(cgrp, ss);
6293
6294 if (!css || !css_tryget_online(css))
6295 css = ERR_PTR(-ENOENT);
6296
6297 rcu_read_unlock();
6298 return css;
6299}
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6310{
6311 WARN_ON_ONCE(!rcu_read_lock_held());
6312 return idr_find(&ss->css_idr, id);
6313}
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324struct cgroup *cgroup_get_from_path(const char *path)
6325{
6326 struct kernfs_node *kn;
6327 struct cgroup *cgrp;
6328
6329 mutex_lock(&cgroup_mutex);
6330
6331 kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
6332 if (kn) {
6333 if (kernfs_type(kn) == KERNFS_DIR) {
6334 cgrp = kn->priv;
6335 cgroup_get_live(cgrp);
6336 } else {
6337 cgrp = ERR_PTR(-ENOTDIR);
6338 }
6339 kernfs_put(kn);
6340 } else {
6341 cgrp = ERR_PTR(-ENOENT);
6342 }
6343
6344 mutex_unlock(&cgroup_mutex);
6345 return cgrp;
6346}
6347EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358struct cgroup *cgroup_get_from_fd(int fd)
6359{
6360 struct cgroup *cgrp;
6361 struct file *f;
6362
6363 f = fget_raw(fd);
6364 if (!f)
6365 return ERR_PTR(-EBADF);
6366
6367 cgrp = cgroup_get_from_file(f);
6368 fput(f);
6369 return cgrp;
6370}
6371EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6372
6373static u64 power_of_ten(int power)
6374{
6375 u64 v = 1;
6376 while (power--)
6377 v *= 10;
6378 return v;
6379}
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6396{
6397 s64 whole, frac = 0;
6398 int fstart = 0, fend = 0, flen;
6399
6400 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6401 return -EINVAL;
6402 if (frac < 0)
6403 return -EINVAL;
6404
6405 flen = fend > fstart ? fend - fstart : 0;
6406 if (flen < dec_shift)
6407 frac *= power_of_ten(dec_shift - flen);
6408 else
6409 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6410
6411 *v = whole * power_of_ten(dec_shift) + frac;
6412 return 0;
6413}
6414
6415
6416
6417
6418
6419#ifdef CONFIG_SOCK_CGROUP_DATA
6420
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);

/* Latched to true by cgroup_sk_alloc_disable(). */
static bool cgroup_sk_alloc_disabled __read_mostly;

/**
 * cgroup_sk_alloc_disable - turn off cgroup2 socket association
 *
 * Sets cgroup_sk_alloc_disabled and logs a message the first time it is
 * called; later calls do nothing.
 */
void cgroup_sk_alloc_disable(void)
{
	if (!cgroup_sk_alloc_disabled) {
		pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
		cgroup_sk_alloc_disabled = true;
	}
}

#else	/* neither CONFIG_CGROUP_NET_PRIO nor CONFIG_CGROUP_NET_CLASSID */

/* Without net_prio/net_cls the association can never be disabled. */
#define cgroup_sk_alloc_disabled	false

#endif
6439
6440void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6441{
6442 if (cgroup_sk_alloc_disabled) {
6443 skcd->no_refcnt = 1;
6444 return;
6445 }
6446
6447
6448 if (in_interrupt())
6449 return;
6450
6451 rcu_read_lock();
6452
6453 while (true) {
6454 struct css_set *cset;
6455
6456 cset = task_css_set(current);
6457 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6458 skcd->val = (unsigned long)cset->dfl_cgrp;
6459 cgroup_bpf_get(cset->dfl_cgrp);
6460 break;
6461 }
6462 cpu_relax();
6463 }
6464
6465 rcu_read_unlock();
6466}
6467
6468void cgroup_sk_clone(struct sock_cgroup_data *skcd)
6469{
6470 if (skcd->val) {
6471 if (skcd->no_refcnt)
6472 return;
6473
6474
6475
6476
6477
6478 cgroup_get(sock_cgroup_ptr(skcd));
6479 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6480 }
6481}
6482
6483void cgroup_sk_free(struct sock_cgroup_data *skcd)
6484{
6485 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6486
6487 if (skcd->no_refcnt)
6488 return;
6489 cgroup_bpf_put(cgrp);
6490 cgroup_put(cgrp);
6491}
6492
6493#endif
6494
6495#ifdef CONFIG_CGROUP_BPF
6496int cgroup_bpf_attach(struct cgroup *cgrp,
6497 struct bpf_prog *prog, struct bpf_prog *replace_prog,
6498 struct bpf_cgroup_link *link,
6499 enum bpf_attach_type type,
6500 u32 flags)
6501{
6502 int ret;
6503
6504 mutex_lock(&cgroup_mutex);
6505 ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
6506 mutex_unlock(&cgroup_mutex);
6507 return ret;
6508}
6509
6510int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
6511 enum bpf_attach_type type)
6512{
6513 int ret;
6514
6515 mutex_lock(&cgroup_mutex);
6516 ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
6517 mutex_unlock(&cgroup_mutex);
6518 return ret;
6519}
6520
6521int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
6522 union bpf_attr __user *uattr)
6523{
6524 int ret;
6525
6526 mutex_lock(&cgroup_mutex);
6527 ret = __cgroup_bpf_query(cgrp, attr, uattr);
6528 mutex_unlock(&cgroup_mutex);
6529 return ret;
6530}
6531#endif
6532
6533#ifdef CONFIG_SYSFS
6534static ssize_t show_delegatable_files(struct cftype *files, char *buf,
6535 ssize_t size, const char *prefix)
6536{
6537 struct cftype *cft;
6538 ssize_t ret = 0;
6539
6540 for (cft = files; cft && cft->name[0] != '\0'; cft++) {
6541 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
6542 continue;
6543
6544 if (prefix)
6545 ret += snprintf(buf + ret, size - ret, "%s.", prefix);
6546
6547 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
6548
6549 if (WARN_ON(ret >= size))
6550 break;
6551 }
6552
6553 return ret;
6554}
6555
6556static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
6557 char *buf)
6558{
6559 struct cgroup_subsys *ss;
6560 int ssid;
6561 ssize_t ret = 0;
6562
6563 ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
6564 NULL);
6565
6566 for_each_subsys(ss, ssid)
6567 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
6568 PAGE_SIZE - ret,
6569 cgroup_subsys_name[ssid]);
6570
6571 return ret;
6572}
6573static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6574
6575static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6576 char *buf)
6577{
6578 return snprintf(buf, PAGE_SIZE,
6579 "nsdelegate\n"
6580 "memory_localevents\n"
6581 "memory_recursiveprot\n");
6582}
6583static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6584
6585static struct attribute *cgroup_sysfs_attrs[] = {
6586 &cgroup_delegate_attr.attr,
6587 &cgroup_features_attr.attr,
6588 NULL,
6589};
6590
6591static const struct attribute_group cgroup_sysfs_attr_group = {
6592 .attrs = cgroup_sysfs_attrs,
6593 .name = "cgroup",
6594};
6595
6596static int __init cgroup_sysfs_init(void)
6597{
6598 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6599}
6600subsys_initcall(cgroup_sysfs_init);
6601
6602#endif
6603