#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX	(MAX_CGROUP_TYPE_NAMELEN +	\
				 MAX_CFTYPE_NAME + 2)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * Both locks are exported if CONFIG_PROVE_RCU so that accessors can use
 * them in lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn so that cgroup_file_notify() can operate on
 * a file which isn't being removed.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a
 * lot of concurrent destructions.  Use a separate workqueue so that
 * cgroup destruction work items don't end up filling up max_active of
 * system_wq, which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)							\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);	\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);		\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);		\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported on the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * calling into a subsystem which doesn't need the hook.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_ssid_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is a slower static_key_enabled() based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on it.  A number of behaviors
 * differ between the two hierarchies - which interface files exist, how
 * controller enabling propagates, and the restrictions applied to task
 * migration among them - and callers use this test to pick the right
 * behavior.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *parent_css = cgrp->self.parent;

	if (parent_css)
		return container_of(parent_css, struct cgroup, self);
	return NULL;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent)
		return parent->subtree_control;

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent)
		return parent->subtree_ss_mask;

	return cgrp->root->subsys_mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock()
 * and the caller is responsible for pinning the returned css if it wants
 * to keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}
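
/*
 * Illustrative sketch (not part of the original source): reading a
 * controller's css for a cgroup from within this file.  The RCU read
 * section keeps the returned pointer stable; pin the css if it must
 * outlive the section.  memory_cgrp_subsys assumes CONFIG_MEMCG.
 *
 *	struct cgroup_subsys_state *css;
 *
 *	rcu_read_lock();
 *	css = cgroup_css(cgrp, &memory_cgrp_subsys);
 *	if (css)
 *		... use css under RCU ...
 *	rcu_read_unlock();
 */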

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
						struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly.  Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static void __maybe_unused cgroup_get(struct cgroup *cgrp)
{
	css_get(&cgrp->self);
}

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

static bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which takes
	 * an active reference of the file.  This ensures that the css stays
	 * alive for the duration of the operation, so a plain
	 * rcu_dereference_raw() is safe here.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else
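
/*
 * Illustrative sketch (not in the original source): walking the
 * effective csses of a cgroup.  cgroup_e_css() asserts cgroup_mutex,
 * so the walk must run with the mutex held.
 *
 *	struct cgroup_subsys_state *css;
 *	int ssid;
 *
 *	mutex_lock(&cgroup_mutex);
 *	for_each_e_css(css, ssid, cgrp)
 *		pr_info("effective css for %s\n", cgroup_subsys_name[ssid]);
 *	mutex_unlock(&cgroup_mutex);
 */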

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block between do_each_subsys_mask() and while_each_subsys_mask()
 * only runs for cases where the ssid-th bit (1 << ssid) of @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
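
/*
 * Illustrative sketch (not in the original source): the do/while pair
 * expands to a loop over the bits set in a mask, e.g. printing every
 * controller enabled in a cgroup's subtree control:
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, cgrp->subtree_control) {
 *		pr_info("enabled: %s\n", ss->name);
 *	} while_each_subsys_mask();
 */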

/* iterate over live children of a cgroup, cgroup_mutex held throughout */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.  Also used to anchor the list of css_sets.  Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
};

static int css_set_count	= 1;	/* 1 for init_css_set */

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 * count is propagated towards root so that a given cgroup's populated_cnt
 * is zero iff the cgroup and all its descendants are empty.
 *
 * When @cgrp->populated_cnt changes from or to zero, userland is notified
 * that the content of the "cgroup.events" interface file has changed.
 * This can be used to detect when @cgrp and its descendants become
 * populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	lockdep_assert_held(&css_set_lock);

	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

		cgroup1_check_for_release(cgrp);
		cgroup_file_notify(&cgrp->events_file);

		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * ->populated_cnt of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated_cnt updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		struct css_task_iter *it, *pos;

		WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving; advance task iterators which are
		 * pointing at it so that they can resume at the next
		 * position.  Advancing an iterator might remove it from
		 * the list, so use the safe variant of the walk.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		rcu_assign_pointer(task->cgroups, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

/*
 * hash table for css_sets.  This improves the performance of finding an
 * existing css_set given a template of subsystem states.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead.  Unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in
	 * the new css_set.  While subsystems can change globally, the
	 * entries here won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing css_set matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get_live(cgrp);
}

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* First see if we already have a css_set that matches the
	 * desired set */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/* Add reference counts and links from the new css_set */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

void cgroup_free_root(struct cgroup_root *root)
{
	if (root) {
		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}

static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

/*
 * look up cgroup associated with current task's cgroup namespace on the
 * specified hierarchy
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

/*
 * Return the cgroup for "task" from the given hierarchy.  Must be
 * called with cgroup_mutex and css_set_lock held.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}

/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	else
		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
}
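
/*
 * Examples of the resulting names (illustrative):
 *
 *	cft->ss == memory, cft->name == "stat"        -> "memory.stat"
 *	cft->ss == NULL,   cft->name == "cgroup.procs" -> "cgroup.procs"
 *
 * A v1 mount with the "noprefix" option (CGRP_ROOT_NOPREFIX) drops the
 * "memory." style prefix entirely.
 */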

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		/*
		 * Mask out subsystems which aren't available.  This can
		 * only happen if subsys_mask was bogus and shouldn't
		 * happen normally, but let's be resilient.
		 */
		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}
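
/*
 * Worked example (illustrative): suppose a hypothetical controller "foo"
 * declares .depends_on naming the memory controller.  Enabling only foo
 * in @subtree_control would pull memory into the result as well, and the
 * loop reruns until the mask stops growing, i.e. until the transitive
 * dependency closure is reached.  Dependencies on controllers missing
 * from @this_ss_mask are silently dropped by the masking step above.
 */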

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns, the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time.  If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}

/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true,
 * the cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  cgroup liveliness check alone provides enough
	 * protection against removal.  Ensure @cgrp stays accessible and
	 * break the active_ref protection.
	 */
	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	list_for_each_entry(cfts, &css->ss->cfts, node)
		cgroup_addrm_files(css, cgrp, cfts, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
	}

	list_for_each_entry(cfts, &css->ss->cfts, node) {
		ret = cgroup_addrm_files(css, cgrp, cfts, true);
		if (ret < 0) {
			failed_cfts = cfts;
			goto err;
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}

int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, it can't be
		 * moved.  Implicit controllers are exempt as they're
		 * always enabled on the default hierarchy.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* rebind */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
{
	char *token;

	*root_flags = 0;

	if (!data)
		return 0;

	while ((token = strsep(&data, ",")) != NULL) {
		if (!strcmp(token, "nsdelegate")) {
			*root_flags |= CGRP_ROOT_NS_DELEGATE;
			continue;
		}

		pr_err("cgroup2: unknown option \"%s\"\n", token);
		return -EINVAL;
	}

	return 0;
}
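
/*
 * The only v2 mount option recognized here is "nsdelegate", set from
 * userspace at mount time (illustrative):
 *
 *	# mount -t cgroup2 -o nsdelegate none /sys/fs/cgroup
 *
 * Anything else in the option string is rejected with -EINVAL.
 */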

static void apply_cgroup_root_flags(unsigned int root_flags)
{
	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
		if (root_flags & CGRP_ROOT_NS_DELEGATE)
			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
	}
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
		seq_puts(seq, ",nsdelegate");
	return 0;
}

static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	unsigned int root_flags;
	int ret;

	ret = parse_cgroup_root_flags(data, &root_flags);
	if (ret)
		return ret;

	apply_cgroup_root_flags(root_flags);
	return 0;
}

/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in
 * other words after the first mount.
 */
static bool use_task_css_set_links __read_mostly;

static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;

	spin_lock_irq(&css_set_lock);

	if (use_task_css_set_links)
		goto out_unlock;

	use_task_css_set_links = true;

	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread().  Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);

		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 * Do it while holding siglock so that we don't end up
		 * racing against cgroup_exit().
		 *
		 * Interrupts were already disabled while acquiring
		 * the css_set_lock, so we do not need to disable them
		 * again when acquiring the sighand->siglock here.
		 */
		spin_lock(&p->sighand->siglock);
		if (!(p->flags & PF_EXITING)) {
			struct css_set *cset = task_css_set(p);

			if (!css_set_populated(cset))
				css_set_update_populated(cset, true);
			list_add_tail(&p->cg_list, &cset->tasks);
			get_css_set(cset);
			cset->nr_tasks++;
		}
		spin_unlock(&p->sighand->siglock);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
out_unlock:
	spin_unlock_irq(&css_set_lock);
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ssid;

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	cgrp->self.cgroup = cgrp;
	cgrp->self.flags |= CSS_ONLINE;

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
{
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);
	idr_init(&root->cgroup_idr);

	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;
	root_cgrp->ancestor_ids[0] = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      ref_flags, GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us.  Later rebinding may disable
	 * controllers on the default hierarchy and thus create new csets,
	 * which can't be more than the existing ones.  Allocate 2x.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	trace_cgroup_setup_root(root);

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
			       struct cgroup_root *root, unsigned long magic,
			       struct cgroup_namespace *ns)
{
	struct dentry *dentry;
	bool new_sb;

	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);

	/*
	 * In non-init cgroup namespace, instead of root cgroup's dentry,
	 * we return the dentry corresponding to the cgroupns->root_cgrp.
	 */
	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ns->root_cset, root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
		dput(dentry);
		dentry = nsdentry;
	}

	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);

	return dentry;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct dentry *dentry;
	int ret;

	get_cgroup_ns(ns);

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
		put_cgroup_ns(ns);
		return ERR_PTR(-EPERM);
	}

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	if (fs_type == &cgroup2_fs_type) {
		unsigned int root_flags;

		ret = parse_cgroup_root_flags(data, &root_flags);
		if (ret) {
			put_cgroup_ns(ns);
			return ERR_PTR(ret);
		}

		cgrp_dfl_visible = true;
		cgroup_get_live(&cgrp_dfl_root.cgrp);

		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
					 CGROUP2_SUPER_MAGIC, ns);
		if (!IS_ERR(dentry))
			apply_cgroup_root_flags(root_flags);
	} else {
		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
				       CGROUP_SUPER_MAGIC, ns);
	}

	put_cgroup_ns(ns);
	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	/*
	 * If @root doesn't have any children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 * cgroup_mount() may wait for @root's release.
	 *
	 * And don't kill the default root.
	 */
	if (!list_empty(&root->cgrp.self.children) ||
	    root == &cgrp_dfl_root)
		cgroup_put(&root->cgrp);
	else
		percpu_ref_kill(&root->cgrp.self.refcnt);

	kernfs_kill_sb(sb);
}

struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name = "cgroup2",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
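
/*
 * Illustrative sketch (not in the original source): formatting the path
 * of a cgroup as seen from the init cgroup namespace.  The return value
 * follows kernfs_path_from_node(); a result >= @buflen indicates
 * truncation.
 *
 *	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
 *
 *	if (buf && cgroup_path_ns(cgrp, buf, PATH_MAX,
 *				  &init_cgroup_ns) > 0)
 *		pr_info("cgroup at %s\n", buf);
 *	kfree(buf);
 */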

/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
		ret = strlcpy(buf, "/", buflen);
	}

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* leave @task alone if post_fork() hasn't linked it up yet */
	if (list_empty(&task->cg_list))
		return;

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	mgctx->tset.nr_tasks++;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_migrate_execute().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}
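
/*
 * Illustrative sketch (not in the original source): a controller's
 * ->attach() callback typically walks the taskset with the
 * cgroup_taskset_for_each() wrapper from cgroup.h, which is built on
 * the two functions above.  "foo" is a hypothetical controller.
 *
 *	static void foo_attach(struct cgroup_taskset *tset)
 *	{
 *		struct task_struct *task;
 *		struct cgroup_subsys_state *css;
 *
 *		cgroup_taskset_for_each(task, css, tset)
 *			... apply css's configuration to task ...
 *	}
 */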

/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by the migration preparation
 * functions.  This function fails iff one of the ->can_attach callbacks
 * fails and guarantees that either all or none of the tasks in @mgctx are
 * migrated.  @mgctx is consumed regardless of success or failure.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* check that we can legitimately attach to the cgroup */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
				if (ret) {
					failed_ssid = ssid;
					goto out_cancel_attach;
				}
			}
		} while_each_subsys_mask();
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			put_css_set_locked(from_cset);
			from_cset->nr_tasks--;
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->attach) {
				tset->ssid = ssid;
				ss->attach(tset);
			}
		} while_each_subsys_mask();
	}

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ssid == failed_ssid)
				break;
			if (ss->cancel_attach) {
				tset->ssid = ssid;
				ss->cancel_attach(tset);
			}
		} while_each_subsys_mask();
	}
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);
	return ret;
}

/**
 * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the root, subtree_control must be
 * zero for migration destination cgroups with tasks so that child cgroups
 * don't compete against tasks.
 */
bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
		!dst_cgrp->subtree_control;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
	LIST_HEAD(preloaded);
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);

	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);

	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}

	spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->preloaded_src_csets, which should later
 * be put by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
			    struct cgroup *dst_cgrp,
			    struct cgroup_mgctx *mgctx)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	/*
	 * If ->dead, @src_set is associated with one or more dead cgroups
	 * and doesn't contain any migratable tasks.  Ignore it early so
	 * that the rest of migration path doesn't get confused by it.
	 */
	if (src_cset->dead)
		return;

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(src_cset->mg_dst_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends
 * them to @mgctx->preloaded_dst_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/* look up the dst cset for each src cset and link it to the src */
	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				 mg_preload_node) {
		struct css_set *dst_cset;
		struct cgroup_subsys *ss;
		int ssid;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			goto err;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add_tail(&dst_cset->mg_preload_node,
				      &mgctx->preloaded_dst_csets);
		else
			put_css_set(dst_cset);

		for_each_subsys(ss, ssid)
			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				mgctx->ss_mask |= 1 << ssid;
	}

	return 0;
err:
	cgroup_migrate_finish(mgctx);
	return -ENOMEM;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx)
{
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot.  Tasks that
	 * are already PF_EXITING could be freed from underneath us unless
	 * we take an rcu_read_lock.
	 */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_task(task, mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct task_struct *task;
	int ret;

	if (!cgroup_may_migrate_to(dst_cgrp))
		return -EBUSY;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, &mgctx);

	cgroup_migrate_finish(&mgctx);

	if (!ret)
		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);

	return ret;
}

static int cgroup_procs_write_permission(struct task_struct *task,
					 struct cgroup *dst_cgrp,
					 struct kernfs_open_file *of)
{
	struct super_block *sb = of->file->f_path.dentry->d_sb;
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
	struct cgroup *src_cgrp, *com_cgrp;
	struct inode *inode;
	int ret;

	if (!cgroup_on_dfl(dst_cgrp)) {
		const struct cred *cred = current_cred();
		const struct cred *tcred = get_task_cred(task);

		/*
		 * even if we're attaching all tasks in the thread group,
		 * we only need to check permissions on one of them.
		 */
		if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
		    uid_eq(cred->euid, tcred->uid) ||
		    uid_eq(cred->euid, tcred->suid))
			ret = 0;
		else
			ret = -EACCES;

		put_cred(tcred);
		return ret;
	}

	/* find the source cgroup */
	spin_lock_irq(&css_set_lock);
	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
	spin_unlock_irq(&css_set_lock);

	/* and the common ancestor */
	com_cgrp = src_cgrp;
	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
		com_cgrp = cgroup_parent(com_cgrp);

	/* %current should be authorized to migrate to the common ancestor */
	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
	if (!inode)
		return -ENOMEM;

	ret = inode_permission(inode, MAY_WRITE);
	iput(inode);
	if (ret)
		return ret;

	/*
	 * If namespaces are delegation boundaries, %current must be able
	 * to see both source and destination cgroups from its namespace.
	 */
	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
	    (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
	     !cgroup_is_descendant(dst_cgrp, root_cgrp)))
		return -ENOENT;

	return 0;
}

/*
 * Find the task_struct of the task to attach by vpid and pass it along to
 * the function to attach either it or all tasks in its threadgroup.
 */
ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off, bool threadgroup)
{
	struct task_struct *tsk;
	struct cgroup_subsys *ss;
	struct cgroup *cgrp;
	pid_t pid;
	int ssid, ret;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return -EINVAL;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	percpu_down_write(&cgroup_threadgroup_rwsem);
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			ret = -ESRCH;
			goto out_unlock_rcu;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset, or RT kthread may be born in a
	 * cgroup with no rt_runtime allocated.  Just say no.
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out_unlock_rcu;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	ret = cgroup_procs_write_permission(tsk, cgrp, of);
	if (!ret)
		ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	put_task_struct(tsk);
	goto out_unlock_threadgroup;

out_unlock_rcu:
	rcu_read_unlock();
out_unlock_threadgroup:
	percpu_up_write(&cgroup_threadgroup_rwsem);
	for_each_subsys(ss, ssid)
		if (ss->post_attach)
			ss->post_attach();
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
			   loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, true);
}
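
/*
 * From userspace this is reached by writing a PID to "cgroup.procs"
 * (illustrative, "mygroup" is a placeholder):
 *
 *	# echo $$ > /sys/fs/cgroup/mygroup/cgroup.procs
 *
 * which moves the writing shell's whole thread group into "mygroup".
 */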

static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (printed)
			seq_putc(seq, ' ');
		seq_printf(seq, "%s", ss->name);
		printed = true;
	} while_each_subsys_mask();
	if (printed)
		seq_putc(seq, '\n');
}

/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
	return 0;
}

/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->subtree_control);
	return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* look up all csses currently attached to @cgrp's subtree */
	spin_lock_irq(&css_set_lock);
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &dsct->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	/* look up and pin dst csets for each src cset */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_finish;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
		struct task_struct *task, *ntask;

		/* all tasks in src_csets need to be migrated */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_migrate_add_task(task, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_execute(&mgctx);
out_finish:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}

/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

restart:
	mutex_lock(&cgroup_mutex);

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

			if (!css || !percpu_ref_is_dying(&css->refcnt))
				continue;

			cgroup_get_live(dsct);
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
			goto restart;
		}
	}
}

/**
 * cgroup_save_control - save control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control and ->subtree_ss_mask to the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->old_subtree_control = dsct->subtree_control;
		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
	}
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->subtree_control &= cgroup_control(dsct);
		dsct->subtree_ss_mask =
			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
						    cgroup_ss_mask(dsct));
	}
}

/**
 * cgroup_restore_control - restore control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		dsct->subtree_control = dsct->old_subtree_control;
		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
	}
}

static bool css_visible(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	if (cgroup_control(cgrp) & (1 << ss->id))
		return true;
	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
		return false;
	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}

/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid, ret;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				continue;

			if (!css) {
				css = css_create(dsct, ss);
				if (IS_ERR(css))
					return PTR_ERR(css);
			}

			if (css_visible(css)) {
				ret = css_populate_dir(css);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and css_visible().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and be in the vanilla state if it's made visible again later.
 * Controllers which may be depended upon should provide ->css_reset() for
 * this purpose.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

			if (!css)
				continue;

			if (css->parent &&
			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
				kill_css(css);
			} else if (!css_visible(css)) {
				css_clear_dir(css);
				if (ss->css_reset)
					ss->css_reset(css);
			}
		}
	}
}
2837
/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 */
2855static int cgroup_apply_control(struct cgroup *cgrp)
2856{
2857 int ret;
2858
2859 cgroup_propagate_control(cgrp);
2860
2861 ret = cgroup_apply_control_enable(cgrp);
2862 if (ret)
2863 return ret;
2864
	/*
	 * At this point, cgroup_e_css() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree.
	 */
2870 ret = cgroup_update_dfl_csses(cgrp);
2871 if (ret)
2872 return ret;
2873
2874 return 0;
2875}
2876
/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */
2884static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
2885{
2886 if (ret) {
2887 cgroup_restore_control(cgrp);
2888 cgroup_propagate_control(cgrp);
2889 }
2890
2891 cgroup_apply_control_disable(cgrp);
2892}
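
/*
 * Taken together, the helpers above implement the save / apply / finalize
 * sequence for control mask updates.  The following sketch mirrors what
 * cgroup_subtree_control_write() below actually does and is illustrative
 * only:
 *
 *	cgroup_save_control(cgrp);
 *
 *	cgrp->subtree_control |= enable;
 *	cgrp->subtree_control &= ~disable;
 *
 *	ret = cgroup_apply_control(cgrp);
 *	cgroup_finalize_control(cgrp, ret);
 *
 * On failure, cgroup_finalize_control() restores the saved masks and
 * re-propagates them before the now-excess csses are killed or hidden.
 */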
2893
2894
2895static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2896 char *buf, size_t nbytes,
2897 loff_t off)
2898{
2899 u16 enable = 0, disable = 0;
2900 struct cgroup *cgrp, *child;
2901 struct cgroup_subsys *ss;
2902 char *tok;
2903 int ssid, ret;
2904
	/*
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -.
	 */
2909 buf = strstrip(buf);
2910 while ((tok = strsep(&buf, " "))) {
2911 if (tok[0] == '\0')
2912 continue;
2913 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
2914 if (!cgroup_ssid_enabled(ssid) ||
2915 strcmp(tok + 1, ss->name))
2916 continue;
2917
2918 if (*tok == '+') {
2919 enable |= 1 << ssid;
2920 disable &= ~(1 << ssid);
2921 } else if (*tok == '-') {
2922 disable |= 1 << ssid;
2923 enable &= ~(1 << ssid);
2924 } else {
2925 return -EINVAL;
2926 }
2927 break;
2928 } while_each_subsys_mask();
2929 if (ssid == CGROUP_SUBSYS_COUNT)
2930 return -EINVAL;
2931 }
2932
2933 cgrp = cgroup_kn_lock_live(of->kn, true);
2934 if (!cgrp)
2935 return -ENODEV;
2936
2937 for_each_subsys(ss, ssid) {
2938 if (enable & (1 << ssid)) {
2939 if (cgrp->subtree_control & (1 << ssid)) {
2940 enable &= ~(1 << ssid);
2941 continue;
2942 }
2943
2944 if (!(cgroup_control(cgrp) & (1 << ssid))) {
2945 ret = -ENOENT;
2946 goto out_unlock;
2947 }
2948 } else if (disable & (1 << ssid)) {
2949 if (!(cgrp->subtree_control & (1 << ssid))) {
2950 disable &= ~(1 << ssid);
2951 continue;
2952 }
2953
2954
2955 cgroup_for_each_live_child(child, cgrp) {
2956 if (child->subtree_control & (1 << ssid)) {
2957 ret = -EBUSY;
2958 goto out_unlock;
2959 }
2960 }
2961 }
2962 }
2963
2964 if (!enable && !disable) {
2965 ret = 0;
2966 goto out_unlock;
2967 }
2968
	/*
	 * Except for the root, subtree_control must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
2973 if (enable && cgroup_parent(cgrp)) {
2974 struct cgrp_cset_link *link;
2975
		/*
		 * Because namespaces pin the roots, we can't use
		 * cgroup_is_populated() directly.  Check whether any of
		 * @cgrp's csets is populated instead.
		 */
2981 spin_lock_irq(&css_set_lock);
2982
2983 ret = 0;
2984 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
2985 if (css_set_populated(link->cset)) {
2986 ret = -EBUSY;
2987 break;
2988 }
2989 }
2990
2991 spin_unlock_irq(&css_set_lock);
2992
2993 if (ret)
2994 goto out_unlock;
2995 }
2996
2997
2998 cgroup_save_control(cgrp);
2999
3000 cgrp->subtree_control |= enable;
3001 cgrp->subtree_control &= ~disable;
3002
3003 ret = cgroup_apply_control(cgrp);
3004 cgroup_finalize_control(cgrp, ret);
3005 if (ret)
3006 goto out_unlock;
3007
3008 kernfs_activate(cgrp->kn);
3009out_unlock:
3010 cgroup_kn_unlock(of->kn);
3011 return ret ?: nbytes;
3012}
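
/*
 * For reference, "cgroup.subtree_control" takes a space separated list of
 * controller names, each prefixed with '+' (enable) or '-' (disable),
 * e.g. (a hypothetical session; the path is illustrative):
 *
 *	# echo "+memory -io" > /sys/fs/cgroup/parent/cgroup.subtree_control
 *
 * -EBUSY is returned if a controller being disabled is still enabled in a
 * child's subtree_control, or if a controller is being enabled on a
 * non-root cgroup which still has member tasks (the "no internal
 * processes" rule of the default hierarchy).
 */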
3013
3014static int cgroup_events_show(struct seq_file *seq, void *v)
3015{
3016 seq_printf(seq, "populated %d\n",
3017 cgroup_is_populated(seq_css(seq)->cgroup));
3018 return 0;
3019}
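
/*
 * For reference, reading "cgroup.events" produces key-value pairs, one per
 * line.  There is currently a single key (the output below is an
 * illustrative example):
 *
 *	populated 1
 *
 * where the value is 1 iff the cgroup or any of its descendants has live
 * member tasks.  Changes are flagged to pollers through the events_file
 * machinery - see cgroup_file_notify() below.
 */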
3020
3021static int cgroup_file_open(struct kernfs_open_file *of)
3022{
3023 struct cftype *cft = of->kn->priv;
3024
3025 if (cft->open)
3026 return cft->open(of);
3027 return 0;
3028}
3029
3030static void cgroup_file_release(struct kernfs_open_file *of)
3031{
3032 struct cftype *cft = of->kn->priv;
3033
3034 if (cft->release)
3035 cft->release(of);
3036}
3037
3038static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3039 size_t nbytes, loff_t off)
3040{
3041 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3042 struct cgroup *cgrp = of->kn->parent->priv;
3043 struct cftype *cft = of->kn->priv;
3044 struct cgroup_subsys_state *css;
3045 int ret;
3046
	/*
	 * If namespaces are delegation boundaries, disallow writes to
	 * files in an non-init namespace root from inside the namespace
	 * except for the files explicitly marked delegatable -
	 * "cgroup.procs" and "cgroup.subtree_control".
	 */
3053 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3054 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3055 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3056 return -EPERM;
3057
3058 if (cft->write)
3059 return cft->write(of, buf, nbytes, off);
3060
	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
3067 rcu_read_lock();
3068 css = cgroup_css(cgrp, cft->ss);
3069 rcu_read_unlock();
3070
3071 if (cft->write_u64) {
3072 unsigned long long v;
3073 ret = kstrtoull(buf, 0, &v);
3074 if (!ret)
3075 ret = cft->write_u64(css, cft, v);
3076 } else if (cft->write_s64) {
3077 long long v;
3078 ret = kstrtoll(buf, 0, &v);
3079 if (!ret)
3080 ret = cft->write_s64(css, cft, v);
3081 } else {
3082 ret = -EINVAL;
3083 }
3084
3085 return ret ?: nbytes;
3086}
3087
3088static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3089{
3090 return seq_cft(seq)->seq_start(seq, ppos);
3091}
3092
3093static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3094{
3095 return seq_cft(seq)->seq_next(seq, v, ppos);
3096}
3097
3098static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3099{
3100 if (seq_cft(seq)->seq_stop)
3101 seq_cft(seq)->seq_stop(seq, v);
3102}
3103
3104static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3105{
3106 struct cftype *cft = seq_cft(m);
3107 struct cgroup_subsys_state *css = seq_css(m);
3108
3109 if (cft->seq_show)
3110 return cft->seq_show(m, arg);
3111
3112 if (cft->read_u64)
3113 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3114 else if (cft->read_s64)
3115 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3116 else
3117 return -EINVAL;
3118 return 0;
3119}
3120
3121static struct kernfs_ops cgroup_kf_single_ops = {
3122 .atomic_write_len = PAGE_SIZE,
3123 .open = cgroup_file_open,
3124 .release = cgroup_file_release,
3125 .write = cgroup_file_write,
3126 .seq_show = cgroup_seqfile_show,
3127};
3128
3129static struct kernfs_ops cgroup_kf_ops = {
3130 .atomic_write_len = PAGE_SIZE,
3131 .open = cgroup_file_open,
3132 .release = cgroup_file_release,
3133 .write = cgroup_file_write,
3134 .seq_start = cgroup_seqfile_start,
3135 .seq_next = cgroup_seqfile_next,
3136 .seq_stop = cgroup_seqfile_stop,
3137 .seq_show = cgroup_seqfile_show,
3138};
3139
/* set uid and gid of cgroup dirs and files to that of the creator */
3141static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3142{
3143 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3144 .ia_uid = current_fsuid(),
3145 .ia_gid = current_fsgid(), };
3146
3147 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3148 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3149 return 0;
3150
3151 return kernfs_setattr(kn, &iattr);
3152}
3153
3154static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3155 struct cftype *cft)
3156{
3157 char name[CGROUP_FILE_NAME_MAX];
3158 struct kernfs_node *kn;
3159 struct lock_class_key *key = NULL;
3160 int ret;
3161
3162#ifdef CONFIG_DEBUG_LOCK_ALLOC
3163 key = &cft->lockdep_key;
3164#endif
3165 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3166 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3167 NULL, key);
3168 if (IS_ERR(kn))
3169 return PTR_ERR(kn);
3170
3171 ret = cgroup_kn_set_ugid(kn);
3172 if (ret) {
3173 kernfs_remove(kn);
3174 return ret;
3175 }
3176
3177 if (cft->file_offset) {
3178 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3179
3180 spin_lock_irq(&cgroup_file_kn_lock);
3181 cfile->kn = kn;
3182 spin_unlock_irq(&cgroup_file_kn_lock);
3183 }
3184
3185 return 0;
3186}
3187
/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */
3198static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3199 struct cgroup *cgrp, struct cftype cfts[],
3200 bool is_add)
3201{
3202 struct cftype *cft, *cft_end = NULL;
3203 int ret = 0;
3204
3205 lockdep_assert_held(&cgroup_mutex);
3206
3207restart:
3208 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3209
3210 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3211 continue;
3212 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3213 continue;
3214 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3215 continue;
3216 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3217 continue;
3218
3219 if (is_add) {
3220 ret = cgroup_add_file(css, cgrp, cft);
3221 if (ret) {
3222 pr_warn("%s: failed to add %s, err=%d\n",
3223 __func__, cft->name, ret);
3224 cft_end = cft;
3225 is_add = false;
3226 goto restart;
3227 }
3228 } else {
3229 cgroup_rm_file(cgrp, cft);
3230 }
3231 }
3232 return ret;
3233}
3234
3235static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3236{
3237 LIST_HEAD(pending);
3238 struct cgroup_subsys *ss = cfts[0].ss;
3239 struct cgroup *root = &ss->root->cgrp;
3240 struct cgroup_subsys_state *css;
3241 int ret = 0;
3242
3243 lockdep_assert_held(&cgroup_mutex);
3244
3245
3246 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3247 struct cgroup *cgrp = css->cgroup;
3248
3249 if (!(css->flags & CSS_VISIBLE))
3250 continue;
3251
3252 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3253 if (ret)
3254 break;
3255 }
3256
3257 if (is_add && !ret)
3258 kernfs_activate(root->kn);
3259 return ret;
3260}
3261
3262static void cgroup_exit_cftypes(struct cftype *cfts)
3263{
3264 struct cftype *cft;
3265
3266 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3267
3268 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3269 kfree(cft->kf_ops);
3270 cft->kf_ops = NULL;
3271 cft->ss = NULL;
3272
3273
3274 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3275 }
3276}
3277
3278static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3279{
3280 struct cftype *cft;
3281
3282 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3283 struct kernfs_ops *kf_ops;
3284
3285 WARN_ON(cft->ss || cft->kf_ops);
3286
3287 if (cft->seq_start)
3288 kf_ops = &cgroup_kf_ops;
3289 else
3290 kf_ops = &cgroup_kf_single_ops;
3291
		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
3296 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3297 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3298 if (!kf_ops) {
3299 cgroup_exit_cftypes(cfts);
3300 return -ENOMEM;
3301 }
3302 kf_ops->atomic_write_len = cft->max_write_len;
3303 }
3304
3305 cft->kf_ops = kf_ops;
3306 cft->ss = ss;
3307 }
3308
3309 return 0;
3310}
3311
3312static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3313{
3314 lockdep_assert_held(&cgroup_mutex);
3315
3316 if (!cfts || !cfts[0].ss)
3317 return -ENOENT;
3318
3319 list_del(&cfts->node);
3320 cgroup_apply_cftypes(cfts, false);
3321 cgroup_exit_cftypes(cfts);
3322 return 0;
3323}
3324
/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */
3336int cgroup_rm_cftypes(struct cftype *cfts)
3337{
3338 int ret;
3339
3340 mutex_lock(&cgroup_mutex);
3341 ret = cgroup_rm_cftypes_locked(cfts);
3342 mutex_unlock(&cgroup_mutex);
3343 return ret;
3344}
3345
/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */
3360static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3361{
3362 int ret;
3363
3364 if (!cgroup_ssid_enabled(ss->id))
3365 return 0;
3366
3367 if (!cfts || cfts[0].name[0] == '\0')
3368 return 0;
3369
3370 ret = cgroup_init_cftypes(ss, cfts);
3371 if (ret)
3372 return ret;
3373
3374 mutex_lock(&cgroup_mutex);
3375
3376 list_add_tail(&cfts->node, &ss->cfts);
3377 ret = cgroup_apply_cftypes(cfts, true);
3378 if (ret)
3379 cgroup_rm_cftypes_locked(cfts);
3380
3381 mutex_unlock(&cgroup_mutex);
3382 return ret;
3383}
3384
/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the default hierarchy.
 */
3393int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3394{
3395 struct cftype *cft;
3396
3397 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3398 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3399 return cgroup_add_cftypes(ss, cfts);
3400}
3401
/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the legacy hierarchies.
 */
3410int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3411{
3412 struct cftype *cft;
3413
3414 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3415 cft->flags |= __CFTYPE_NOT_ON_DFL;
3416 return cgroup_add_cftypes(ss, cfts);
3417}
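
/*
 * A registration sketch (hypothetical controller; the "foo" names are
 * illustrative, not from this file).  A controller declares an array of
 * cftypes terminated by a zero-length name and registers it with one of
 * the helpers above:
 *
 *	static struct cftype foo_files[] = {
 *		{
 *			.name = "foo.weight",
 *			.read_u64 = foo_weight_read,
 *			.write_u64 = foo_weight_write,
 *		},
 *		{ }	(zero-length name terminates the array)
 *	};
 *
 *	ret = cgroup_add_dfl_cftypes(&foo_cgrp_subsys, foo_files);
 *
 * Most controllers don't call these directly; they set ->dfl_cftypes
 * and/or ->legacy_cftypes in their cgroup_subsys and let cgroup_init()
 * below do the registration.
 */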
3418
/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 */
3425void cgroup_file_notify(struct cgroup_file *cfile)
3426{
3427 unsigned long flags;
3428
3429 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3430 if (cfile->kn)
3431 kernfs_notify(cfile->kn);
3432 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3433}
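
/*
 * A minimal usage sketch (hypothetical controller; all "foo" names are
 * illustrative, not from this file): a controller embeds a struct
 * cgroup_file next to its css, points cftype->file_offset at it and pokes
 * pollers when the file's content changes:
 *
 *	struct foo_css {
 *		struct cgroup_subsys_state css;
 *		struct cgroup_file events_file;
 *	};
 *
 *	static struct cftype foo_files[] = {
 *		{
 *			.name = "foo.events",
 *			.file_offset = offsetof(struct foo_css, events_file),
 *			.seq_show = foo_events_show,
 *		},
 *		{ }
 *	};
 *
 *	static void foo_signal_event(struct foo_css *foo)
 *	{
 *		cgroup_file_notify(&foo->events_file);
 *	}
 *
 * This is the same mechanism cgroup_base_files[] below uses for
 * "cgroup.events" via cgroup->events_file.
 */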
3434
/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */
3452struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3453 struct cgroup_subsys_state *parent)
3454{
3455 struct cgroup_subsys_state *next;
3456
3457 cgroup_assert_mutex_or_rcu_locked();
3458
	/*
	 * @pos could already have been unlinked from the sibling list.
	 * Once a cgroup is removed, its ->sibling.next is no longer
	 * updated when its next sibling changes.  CSS_RELEASED is set when
	 * @pos is taken off list, at which time its next pointer is valid,
	 * and, as releases are serialized, the one pointed to by the next
	 * pointer is guaranteed to not have started release yet.  This
	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
	 * critical section, the one pointed to by its next pointer is
	 * guaranteed to not have finished its RCU grace period even if we
	 * have dropped rcu_read_lock() in-between iterations.
	 *
	 * If @pos has CSS_RELEASED set, its next pointer can't be
	 * dereferenced; however, as each css is given a monotonically
	 * increasing unique serial number and always appended to the
	 * sibling list, the next one can be found by walking the parent's
	 * children until the first css with higher serial number than
	 * @pos's.  While this path can be slower, it happens iff iteration
	 * races against release and the race window is very small.
	 */
3479 if (!pos) {
3480 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3481 } else if (likely(!(pos->flags & CSS_RELEASED))) {
3482 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3483 } else {
3484 list_for_each_entry_rcu(next, &parent->children, sibling)
3485 if (next->serial_nr > pos->serial_nr)
3486 break;
3487 }
3488
	/*
	 * @next, if not pointing to the head, can be dereferenced and is
	 * the next sibling.
	 */
3493 if (&next->sibling != &parent->children)
3494 return next;
3495 return NULL;
3496}
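
/*
 * css_next_child() is normally consumed through the css_for_each_child()
 * wrapper.  A typical RCU-protected walk looks like the following sketch,
 * where do_something() is a placeholder:
 *
 *	struct cgroup_subsys_state *child;
 *
 *	rcu_read_lock();
 *	css_for_each_child(child, parent_css)
 *		do_something(child);
 *	rcu_read_unlock();
 */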
3497
/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */
3519struct cgroup_subsys_state *
3520css_next_descendant_pre(struct cgroup_subsys_state *pos,
3521 struct cgroup_subsys_state *root)
3522{
3523 struct cgroup_subsys_state *next;
3524
3525 cgroup_assert_mutex_or_rcu_locked();
3526
3527
3528 if (!pos)
3529 return root;
3530
3531
3532 next = css_next_child(NULL, pos);
3533 if (next)
3534 return next;
3535
3536
3537 while (pos != root) {
3538 next = css_next_child(pos, pos->parent);
3539 if (next)
3540 return next;
3541 pos = pos->parent;
3542 }
3543
3544 return NULL;
3545}
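
/*
 * Typical pre-order usage goes through css_for_each_descendant_pre().
 * Sketch only; process_css() is a placeholder.  If the loop body must
 * sleep, the usual pattern is to pin @pos with css_tryget_online(), drop
 * the RCU lock, and resume the walk from the pinned position afterwards:
 *
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_pre(pos, root_css)
 *		process_css(pos);
 *	rcu_read_unlock();
 */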
3546
/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
 */
3560struct cgroup_subsys_state *
3561css_rightmost_descendant(struct cgroup_subsys_state *pos)
3562{
3563 struct cgroup_subsys_state *last, *tmp;
3564
3565 cgroup_assert_mutex_or_rcu_locked();
3566
3567 do {
3568 last = pos;
3569
3570 pos = NULL;
3571 css_for_each_child(tmp, last)
3572 pos = tmp;
3573 } while (pos);
3574
3575 return last;
3576}
3577
3578static struct cgroup_subsys_state *
3579css_leftmost_descendant(struct cgroup_subsys_state *pos)
3580{
3581 struct cgroup_subsys_state *last;
3582
3583 do {
3584 last = pos;
3585 pos = css_next_child(NULL, pos);
3586 } while (pos);
3587
3588 return last;
3589}
3590
/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */
3613struct cgroup_subsys_state *
3614css_next_descendant_post(struct cgroup_subsys_state *pos,
3615 struct cgroup_subsys_state *root)
3616{
3617 struct cgroup_subsys_state *next;
3618
3619 cgroup_assert_mutex_or_rcu_locked();
3620
3621
3622 if (!pos)
3623 return css_leftmost_descendant(root);
3624
3625
3626 if (pos == root)
3627 return NULL;
3628
3629
3630 next = css_next_child(pos, pos->parent);
3631 if (next)
3632 return css_leftmost_descendant(next);
3633
3634
3635 return pos->parent;
3636}
3637
/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */
3646bool css_has_online_children(struct cgroup_subsys_state *css)
3647{
3648 struct cgroup_subsys_state *child;
3649 bool ret = false;
3650
3651 rcu_read_lock();
3652 css_for_each_child(child, css) {
3653 if (child->flags & CSS_ONLINE) {
3654 ret = true;
3655 break;
3656 }
3657 }
3658 rcu_read_unlock();
3659 return ret;
3660}
3661
/**
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
 */
3668static void css_task_iter_advance_css_set(struct css_task_iter *it)
3669{
3670 struct list_head *l = it->cset_pos;
3671 struct cgrp_cset_link *link;
3672 struct css_set *cset;
3673
3674 lockdep_assert_held(&css_set_lock);
3675
	/* Advance to the next non-empty css_set */
3677 do {
3678 l = l->next;
3679 if (l == it->cset_head) {
3680 it->cset_pos = NULL;
3681 it->task_pos = NULL;
3682 return;
3683 }
3684
3685 if (it->ss) {
3686 cset = container_of(l, struct css_set,
3687 e_cset_node[it->ss->id]);
3688 } else {
3689 link = list_entry(l, struct cgrp_cset_link, cset_link);
3690 cset = link->cset;
3691 }
3692 } while (!css_set_populated(cset));
3693
3694 it->cset_pos = l;
3695
3696 if (!list_empty(&cset->tasks))
3697 it->task_pos = cset->tasks.next;
3698 else
3699 it->task_pos = cset->mg_tasks.next;
3700
3701 it->tasks_head = &cset->tasks;
3702 it->mg_tasks_head = &cset->mg_tasks;
3703
	/*
	 * We don't keep css_sets locked across iteration steps and thus
	 * need to take steps to ensure that iteration can be resumed after
	 * the lock is re-acquired.  Iteration is performed at two levels -
	 * css_sets and tasks in them.
	 *
	 * Once created, a css_set never leaves its cgroup lists, so a
	 * pinned css_set is guaranteed to stay put and we can resume
	 * iteration afterwards.
	 *
	 * Tasks may leave @cset across iteration steps.  This is resolved
	 * by registering each iterator with the css_set currently being
	 * walked and making css_set_move_task() advance iterators whose
	 * next task is leaving.
	 */
3719 if (it->cur_cset) {
3720 list_del(&it->iters_node);
3721 put_css_set_locked(it->cur_cset);
3722 }
3723 get_css_set(cset);
3724 it->cur_cset = cset;
3725 list_add(&it->iters_node, &cset->task_iters);
3726}
3727
3728static void css_task_iter_advance(struct css_task_iter *it)
3729{
3730 struct list_head *l = it->task_pos;
3731
3732 lockdep_assert_held(&css_set_lock);
3733 WARN_ON_ONCE(!l);
3734
	/*
	 * Advance iterator to find next entry.  cset->tasks is consumed
	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
	 * next cset.
	 */
3740 l = l->next;
3741
3742 if (l == it->tasks_head)
3743 l = it->mg_tasks_head->next;
3744
3745 if (l == it->mg_tasks_head)
3746 css_task_iter_advance_css_set(it);
3747 else
3748 it->task_pos = l;
3749}
3750
/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 */
3761void css_task_iter_start(struct cgroup_subsys_state *css,
3762 struct css_task_iter *it)
3763{
	/* no one should try to iterate before mounting cgroups */
3765 WARN_ON_ONCE(!use_task_css_set_links);
3766
3767 memset(it, 0, sizeof(*it));
3768
3769 spin_lock_irq(&css_set_lock);
3770
3771 it->ss = css->ss;
3772
3773 if (it->ss)
3774 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3775 else
3776 it->cset_pos = &css->cgroup->cset_links;
3777
3778 it->cset_head = it->cset_pos;
3779
3780 css_task_iter_advance_css_set(it);
3781
3782 spin_unlock_irq(&css_set_lock);
3783}
3784
/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */
3793struct task_struct *css_task_iter_next(struct css_task_iter *it)
3794{
3795 if (it->cur_task) {
3796 put_task_struct(it->cur_task);
3797 it->cur_task = NULL;
3798 }
3799
3800 spin_lock_irq(&css_set_lock);
3801
3802 if (it->task_pos) {
3803 it->cur_task = list_entry(it->task_pos, struct task_struct,
3804 cg_list);
3805 get_task_struct(it->cur_task);
3806 css_task_iter_advance(it);
3807 }
3808
3809 spin_unlock_irq(&css_set_lock);
3810
3811 return it->cur_task;
3812}
3813
/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */
3820void css_task_iter_end(struct css_task_iter *it)
3821{
3822 if (it->cur_cset) {
3823 spin_lock_irq(&css_set_lock);
3824 list_del(&it->iters_node);
3825 put_css_set_locked(it->cur_cset);
3826 spin_unlock_irq(&css_set_lock);
3827 }
3828
3829 if (it->cur_task)
3830 put_task_struct(it->cur_task);
3831}
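
/*
 * The three functions above form the task iteration API.  Canonical usage
 * (a sketch; handle_task() is a placeholder).  css_task_iter_next()
 * returns tasks with a reference held, all locking is handled internally,
 * and the caller may sleep between calls:
 *
 *	struct css_task_iter it;
 *	struct task_struct *task;
 *
 *	css_task_iter_start(css, &it);
 *	while ((task = css_task_iter_next(&it)))
 *		handle_task(task);
 *	css_task_iter_end(&it);
 */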
3832
3833static void cgroup_procs_release(struct kernfs_open_file *of)
3834{
3835 if (of->priv) {
3836 css_task_iter_end(of->priv);
3837 kfree(of->priv);
3838 }
3839}
3840
3841static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
3842{
3843 struct kernfs_open_file *of = s->private;
3844 struct css_task_iter *it = of->priv;
3845 struct task_struct *task;
3846
3847 do {
3848 task = css_task_iter_next(it);
3849 } while (task && !thread_group_leader(task));
3850
3851 return task;
3852}
3853
3854static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
3855{
3856 struct kernfs_open_file *of = s->private;
3857 struct cgroup *cgrp = seq_css(s)->cgroup;
3858 struct css_task_iter *it = of->priv;
3859
	/*
	 * When a seq_file is seeked, it's always traversed sequentially
	 * from position 0, so we can simply keep iterating on !0 *pos.
	 */
3864 if (!it) {
3865 if (WARN_ON_ONCE((*pos)++))
3866 return ERR_PTR(-EINVAL);
3867
3868 it = kzalloc(sizeof(*it), GFP_KERNEL);
3869 if (!it)
3870 return ERR_PTR(-ENOMEM);
3871 of->priv = it;
3872 css_task_iter_start(&cgrp->self, it);
3873 } else if (!(*pos)++) {
3874 css_task_iter_end(it);
3875 css_task_iter_start(&cgrp->self, it);
3876 }
3877
3878 return cgroup_procs_next(s, NULL, NULL);
3879}
3880
3881static int cgroup_procs_show(struct seq_file *s, void *v)
3882{
3883 seq_printf(s, "%d\n", task_tgid_vnr(v));
3884 return 0;
3885}
3886
/* cgroup core interface files for the default hierarchy */
3888static struct cftype cgroup_base_files[] = {
3889 {
3890 .name = "cgroup.procs",
3891 .flags = CFTYPE_NS_DELEGATABLE,
3892 .file_offset = offsetof(struct cgroup, procs_file),
3893 .release = cgroup_procs_release,
3894 .seq_start = cgroup_procs_start,
3895 .seq_next = cgroup_procs_next,
3896 .seq_show = cgroup_procs_show,
3897 .write = cgroup_procs_write,
3898 },
3899 {
3900 .name = "cgroup.controllers",
3901 .seq_show = cgroup_controllers_show,
3902 },
3903 {
3904 .name = "cgroup.subtree_control",
3905 .flags = CFTYPE_NS_DELEGATABLE,
3906 .seq_show = cgroup_subtree_control_show,
3907 .write = cgroup_subtree_control_write,
3908 },
3909 {
3910 .name = "cgroup.events",
3911 .flags = CFTYPE_NOT_ON_ROOT,
3912 .file_offset = offsetof(struct cgroup, events_file),
3913 .seq_show = cgroup_events_show,
3914 },
3915 { }
3916};
3917
/*
 * css destruction is a four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_work_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex destruction path.
 */
3940static void css_free_work_fn(struct work_struct *work)
3941{
3942 struct cgroup_subsys_state *css =
3943 container_of(work, struct cgroup_subsys_state, destroy_work);
3944 struct cgroup_subsys *ss = css->ss;
3945 struct cgroup *cgrp = css->cgroup;
3946
3947 percpu_ref_exit(&css->refcnt);
3948
3949 if (ss) {
		/* css free path */
3951 struct cgroup_subsys_state *parent = css->parent;
3952 int id = css->id;
3953
3954 ss->css_free(css);
3955 cgroup_idr_remove(&ss->css_idr, id);
3956 cgroup_put(cgrp);
3957
3958 if (parent)
3959 css_put(parent);
3960 } else {
		/* cgroup free path */
3962 atomic_dec(&cgrp->root->nr_cgrps);
3963 cgroup1_pidlist_destroy_all(cgrp);
3964 cancel_work_sync(&cgrp->release_agent_work);
3965
3966 if (cgroup_parent(cgrp)) {
			/*
			 * We get a ref to the parent, and put the ref when
			 * this cgroup is being freed, so it's guaranteed
			 * that the parent won't be destroyed before its
			 * children.
			 */
3973 cgroup_put(cgroup_parent(cgrp));
3974 kernfs_put(cgrp->kn);
3975 kfree(cgrp);
3976 } else {
			/*
			 * This is root cgroup's refcnt reaching zero,
			 * which indicates that the root should be
			 * released.
			 */
3982 cgroup_destroy_root(cgrp->root);
3983 }
3984 }
3985}
3986
3987static void css_free_rcu_fn(struct rcu_head *rcu_head)
3988{
3989 struct cgroup_subsys_state *css =
3990 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
3991
3992 INIT_WORK(&css->destroy_work, css_free_work_fn);
3993 queue_work(cgroup_destroy_wq, &css->destroy_work);
3994}
3995
3996static void css_release_work_fn(struct work_struct *work)
3997{
3998 struct cgroup_subsys_state *css =
3999 container_of(work, struct cgroup_subsys_state, destroy_work);
4000 struct cgroup_subsys *ss = css->ss;
4001 struct cgroup *cgrp = css->cgroup;
4002
4003 mutex_lock(&cgroup_mutex);
4004
4005 css->flags |= CSS_RELEASED;
4006 list_del_rcu(&css->sibling);
4007
4008 if (ss) {
		/* css release path */
4010 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4011 if (ss->css_released)
4012 ss->css_released(css);
4013 } else {
		/* cgroup release path */
4015 trace_cgroup_release(cgrp);
4016
4017 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4018 cgrp->id = -1;
4019
		/*
		 * There are two control paths which try to determine
		 * cgroup from dentry without going through kernfs -
		 * cgroupstats_build() and css_tryget_online_from_dir().
		 * Those are supported by RCU protecting clearing of
		 * cgrp->kn->priv backpointer.
		 */
4027 if (cgrp->kn)
4028 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4029 NULL);
4030
4031 cgroup_bpf_put(cgrp);
4032 }
4033
4034 mutex_unlock(&cgroup_mutex);
4035
4036 call_rcu(&css->rcu_head, css_free_rcu_fn);
4037}
4038
4039static void css_release(struct percpu_ref *ref)
4040{
4041 struct cgroup_subsys_state *css =
4042 container_of(ref, struct cgroup_subsys_state, refcnt);
4043
4044 INIT_WORK(&css->destroy_work, css_release_work_fn);
4045 queue_work(cgroup_destroy_wq, &css->destroy_work);
4046}
4047
4048static void init_and_link_css(struct cgroup_subsys_state *css,
4049 struct cgroup_subsys *ss, struct cgroup *cgrp)
4050{
4051 lockdep_assert_held(&cgroup_mutex);
4052
4053 cgroup_get_live(cgrp);
4054
4055 memset(css, 0, sizeof(*css));
4056 css->cgroup = cgrp;
4057 css->ss = ss;
4058 css->id = -1;
4059 INIT_LIST_HEAD(&css->sibling);
4060 INIT_LIST_HEAD(&css->children);
4061 css->serial_nr = css_serial_nr_next++;
4062 atomic_set(&css->online_cnt, 0);
4063
4064 if (cgroup_parent(cgrp)) {
4065 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4066 css_get(css->parent);
4067 }
4068
4069 BUG_ON(cgroup_css(cgrp, ss));
4070}
4071
4072
4073static int online_css(struct cgroup_subsys_state *css)
4074{
4075 struct cgroup_subsys *ss = css->ss;
4076 int ret = 0;
4077
4078 lockdep_assert_held(&cgroup_mutex);
4079
4080 if (ss->css_online)
4081 ret = ss->css_online(css);
4082 if (!ret) {
4083 css->flags |= CSS_ONLINE;
4084 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4085
4086 atomic_inc(&css->online_cnt);
4087 if (css->parent)
4088 atomic_inc(&css->parent->online_cnt);
4089 }
4090 return ret;
4091}
4092
4093
4094static void offline_css(struct cgroup_subsys_state *css)
4095{
4096 struct cgroup_subsys *ss = css->ss;
4097
4098 lockdep_assert_held(&cgroup_mutex);
4099
4100 if (!(css->flags & CSS_ONLINE))
4101 return;
4102
4103 if (ss->css_reset)
4104 ss->css_reset(css);
4105
4106 if (ss->css_offline)
4107 ss->css_offline(css);
4108
4109 css->flags &= ~CSS_ONLINE;
4110 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4111
4112 wake_up_all(&css->cgroup->offline_waitq);
4113}
4114
/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success and ERR_PTR(-errno) on
 * failure.
 */
4124static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
4125 struct cgroup_subsys *ss)
4126{
4127 struct cgroup *parent = cgroup_parent(cgrp);
4128 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4129 struct cgroup_subsys_state *css;
4130 int err;
4131
4132 lockdep_assert_held(&cgroup_mutex);
4133
4134 css = ss->css_alloc(parent_css);
4135 if (!css)
4136 css = ERR_PTR(-ENOMEM);
4137 if (IS_ERR(css))
4138 return css;
4139
4140 init_and_link_css(css, ss, cgrp);
4141
4142 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4143 if (err)
4144 goto err_free_css;
4145
4146 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4147 if (err < 0)
4148 goto err_free_css;
4149 css->id = err;
4150
4151
4152 list_add_tail_rcu(&css->sibling, &parent_css->children);
4153 cgroup_idr_replace(&ss->css_idr, css, css->id);
4154
4155 err = online_css(css);
4156 if (err)
4157 goto err_list_del;
4158
4159 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4160 cgroup_parent(parent)) {
4161 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4162 current->comm, current->pid, ss->name);
4163 if (!strcmp(ss->name, "memory"))
4164 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4165 ss->warned_broken_hierarchy = true;
4166 }
4167
4168 return css;
4169
4170err_list_del:
4171 list_del_rcu(&css->sibling);
4172err_free_css:
4173 call_rcu(&css->rcu_head, css_free_rcu_fn);
4174 return ERR_PTR(err);
4175}
4176
/*
 * The returned cgroup is fully initialized including its control mask, but
 * it isn't associated with its kernfs_node and doesn't have the control
 * mask applied.
 */
4182static struct cgroup *cgroup_create(struct cgroup *parent)
4183{
4184 struct cgroup_root *root = parent->root;
4185 struct cgroup *cgrp, *tcgrp;
4186 int level = parent->level + 1;
4187 int ret;
4188
4189
4190 cgrp = kzalloc(sizeof(*cgrp) +
4191 sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
4192 if (!cgrp)
4193 return ERR_PTR(-ENOMEM);
4194
4195 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4196 if (ret)
4197 goto out_free_cgrp;
4198
	/*
	 * Temporarily set the pointer to NULL, so idr_find() won't return
	 * a half-baked cgroup.
	 */
4203 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4204 if (cgrp->id < 0) {
4205 ret = -ENOMEM;
4206 goto out_cancel_ref;
4207 }
4208
4209 init_cgroup_housekeeping(cgrp);
4210
4211 cgrp->self.parent = &parent->self;
4212 cgrp->root = root;
4213 cgrp->level = level;
4214
4215 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
4216 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4217
4218 if (notify_on_release(parent))
4219 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4220
4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223
4224 cgrp->self.serial_nr = css_serial_nr_next++;
4225
4226
4227 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4228 atomic_inc(&root->nr_cgrps);
4229 cgroup_get_live(parent);
4230
	/*
	 * @cgrp is now fully operational.  If something fails after this
	 * point, it'll be released via the normal destruction path.
	 */
4235 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4236
	/*
	 * On the default hierarchy, a child doesn't automatically inherit
	 * subtree_control from the parent.  Each is configured manually.
	 */
4241 if (!cgroup_on_dfl(cgrp))
4242 cgrp->subtree_control = cgroup_control(cgrp);
4243
4244 if (parent)
4245 cgroup_bpf_inherit(cgrp, parent);
4246
4247 cgroup_propagate_control(cgrp);
4248
4249 return cgrp;
4250
4251out_cancel_ref:
4252 percpu_ref_exit(&cgrp->self.refcnt);
4253out_free_cgrp:
4254 kfree(cgrp);
4255 return ERR_PTR(ret);
4256}
4257
4258int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
4259{
4260 struct cgroup *parent, *cgrp;
4261 struct kernfs_node *kn;
4262 int ret;
4263
4264
4265 if (strchr(name, '\n'))
4266 return -EINVAL;
4267
4268 parent = cgroup_kn_lock_live(parent_kn, false);
4269 if (!parent)
4270 return -ENODEV;
4271
4272 cgrp = cgroup_create(parent);
4273 if (IS_ERR(cgrp)) {
4274 ret = PTR_ERR(cgrp);
4275 goto out_unlock;
4276 }
4277
4278
4279 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4280 if (IS_ERR(kn)) {
4281 ret = PTR_ERR(kn);
4282 goto out_destroy;
4283 }
4284 cgrp->kn = kn;
4285
	/*
	 * This extra ref will be put in cgroup_free_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */
4290 kernfs_get(kn);
4291
4292 ret = cgroup_kn_set_ugid(kn);
4293 if (ret)
4294 goto out_destroy;
4295
4296 ret = css_populate_dir(&cgrp->self);
4297 if (ret)
4298 goto out_destroy;
4299
4300 ret = cgroup_apply_control_enable(cgrp);
4301 if (ret)
4302 goto out_destroy;
4303
4304 trace_cgroup_mkdir(cgrp);
4305
4306
4307 kernfs_activate(kn);
4308
4309 ret = 0;
4310 goto out_unlock;
4311
4312out_destroy:
4313 cgroup_destroy_locked(cgrp);
4314out_unlock:
4315 cgroup_kn_unlock(parent_kn);
4316 return ret;
4317}
4318
/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 */
4324static void css_killed_work_fn(struct work_struct *work)
4325{
4326 struct cgroup_subsys_state *css =
4327 container_of(work, struct cgroup_subsys_state, destroy_work);
4328
4329 mutex_lock(&cgroup_mutex);
4330
4331 do {
4332 offline_css(css);
4333 css_put(css);
4334
4335 css = css->parent;
4336 } while (css && atomic_dec_and_test(&css->online_cnt));
4337
4338 mutex_unlock(&cgroup_mutex);
4339}
4340
/* css kill confirmation processing requires process context, bounce */
4342static void css_killed_ref_fn(struct percpu_ref *ref)
4343{
4344 struct cgroup_subsys_state *css =
4345 container_of(ref, struct cgroup_subsys_state, refcnt);
4346
4347 if (atomic_dec_and_test(&css->online_cnt)) {
4348 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4349 queue_work(cgroup_destroy_wq, &css->destroy_work);
4350 }
4351}
4352
/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */
4362static void kill_css(struct cgroup_subsys_state *css)
4363{
4364 lockdep_assert_held(&cgroup_mutex);
4365
4366 if (css->flags & CSS_DYING)
4367 return;
4368
4369 css->flags |= CSS_DYING;

	/*
	 * This must happen before css is disassociated with its cgroup.
	 * See seq_css() for details.
	 */
4375 css_clear_dir(css);
4376
	/*
	 * Killing would put the base ref, but we need to keep it alive
	 * until after ->css_offline().
	 */
4381 css_get(css);
4382
	/*
	 * cgroup core guarantees that, by the time ->css_offline() is
	 * invoked, no new css reference will be given out via
	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
	 * proceed to offlining css's because percpu_ref_kill() doesn't
	 * guarantee that the ref is seen as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.
	 */
4393 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4394}
4395
/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  The cgroup can only be destroyed if it
 * doesn't contain any tasks and doesn't have any live children.
 */
4420static int cgroup_destroy_locked(struct cgroup *cgrp)
4421 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4422{
4423 struct cgroup_subsys_state *css;
4424 struct cgrp_cset_link *link;
4425 int ssid;
4426
4427 lockdep_assert_held(&cgroup_mutex);
4428
	/*
	 * Only migration can raise populated from zero and we're already
	 * holding cgroup_mutex.
	 */
4433 if (cgroup_is_populated(cgrp))
4434 return -EBUSY;
4435
	/*
	 * Make sure there's no live children.  We can't test emptiness of
	 * ->self.children as dead children linger on it while being
	 * drained; otherwise, "rmdir parent/child parent" may fail.
	 */
4441 if (css_has_online_children(&cgrp->self))
4442 return -EBUSY;
4443
	/*
	 * Mark @cgrp and the associated csets dead.  The former prevents
	 * further task migration and child creation; the latter makes the
	 * migration path skip the csets.
	 */
4450 cgrp->self.flags &= ~CSS_ONLINE;
4451
4452 spin_lock_irq(&css_set_lock);
4453 list_for_each_entry(link, &cgrp->cset_links, cset_link)
4454 link->cset->dead = true;
4455 spin_unlock_irq(&css_set_lock);
4456
4457
4458 for_each_css(css, ssid, cgrp)
4459 kill_css(css);
4460
	/*
	 * Remove @cgrp directory along with the base files.  @cgrp has an
	 * extra ref on its kn.
	 */
4465 kernfs_remove(cgrp->kn);
4466
4467 cgroup1_check_for_release(cgroup_parent(cgrp));
4468
	/* put the base reference */
4470 percpu_ref_kill(&cgrp->self.refcnt);
4471
4472 return 0;
}
4474
4475int cgroup_rmdir(struct kernfs_node *kn)
4476{
4477 struct cgroup *cgrp;
4478 int ret = 0;
4479
4480 cgrp = cgroup_kn_lock_live(kn, false);
4481 if (!cgrp)
4482 return 0;
4483
4484 ret = cgroup_destroy_locked(cgrp);
4485
4486 if (!ret)
4487 trace_cgroup_rmdir(cgrp);
4488
4489 cgroup_kn_unlock(kn);
4490 return ret;
4491}
4492
4493static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4494 .show_options = cgroup_show_options,
4495 .remount_fs = cgroup_remount,
4496 .mkdir = cgroup_mkdir,
4497 .rmdir = cgroup_rmdir,
4498 .show_path = cgroup_show_path,
4499};
4500
4501static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4502{
4503 struct cgroup_subsys_state *css;
4504
4505 pr_debug("Initializing cgroup subsys %s\n", ss->name);
4506
4507 mutex_lock(&cgroup_mutex);
4508
4509 idr_init(&ss->css_idr);
4510 INIT_LIST_HEAD(&ss->cfts);
4511
4512
4513 ss->root = &cgrp_dfl_root;
4514 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4515
4516 BUG_ON(IS_ERR(css));
4517 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4518
4519
4520
4521
4522
4523 css->flags |= CSS_NO_REF;
4524
4525 if (early) {
4526
4527 css->id = 1;
4528 } else {
4529 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4530 BUG_ON(css->id < 0);
4531 }
4532
4533
4534
4535
4536
4537 init_css_set.subsys[ss->id] = css;
4538
4539 have_fork_callback |= (bool)ss->fork << ss->id;
4540 have_exit_callback |= (bool)ss->exit << ss->id;
4541 have_free_callback |= (bool)ss->free << ss->id;
4542 have_canfork_callback |= (bool)ss->can_fork << ss->id;
4543
4544
4545
4546
4547 BUG_ON(!list_empty(&init_task.tasks));
4548
4549 BUG_ON(online_css(css));
4550
4551 mutex_unlock(&cgroup_mutex);
4552}
4553
/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
4560int __init cgroup_init_early(void)
4561{
4562 static struct cgroup_sb_opts __initdata opts;
4563 struct cgroup_subsys *ss;
4564 int i;
4565
4566 init_cgroup_root(&cgrp_dfl_root, &opts);
4567 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4568
4569 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4570
4571 for_each_subsys(ss, i) {
4572 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4573 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
4574 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4575 ss->id, ss->name);
4576 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4577 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4578
4579 ss->id = i;
4580 ss->name = cgroup_subsys_name[i];
4581 if (!ss->legacy_name)
4582 ss->legacy_name = cgroup_subsys_name[i];
4583
4584 if (ss->early_init)
4585 cgroup_init_subsys(ss, true);
4586 }
4587 return 0;
4588}
4589
4590static u16 cgroup_disable_mask __initdata;
4591
/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
4598int __init cgroup_init(void)
4599{
4600 struct cgroup_subsys *ss;
4601 int ssid;
4602
4603 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
4604 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
4605 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4606 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
4607
4608
4609
4610
4611
4612 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
4613
4614 get_user_ns(init_cgroup_ns.user_ns);
4615
4616 mutex_lock(&cgroup_mutex);
4617
4618
4619
4620
4621
4622 hash_add(css_set_table, &init_css_set.hlist,
4623 css_set_hash(init_css_set.subsys));
4624
4625 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
4626
4627 mutex_unlock(&cgroup_mutex);
4628
4629 for_each_subsys(ss, ssid) {
4630 if (ss->early_init) {
4631 struct cgroup_subsys_state *css =
4632 init_css_set.subsys[ss->id];
4633
4634 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4635 GFP_KERNEL);
4636 BUG_ON(css->id < 0);
4637 } else {
4638 cgroup_init_subsys(ss, false);
4639 }
4640
4641 list_add_tail(&init_css_set.e_cset_node[ssid],
4642 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4643
4644
4645
4646
4647
4648
4649 if (cgroup_disable_mask & (1 << ssid)) {
4650 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
4651 printk(KERN_INFO "Disabling %s control group subsystem\n",
4652 ss->name);
4653 continue;
4654 }
4655
4656 if (cgroup1_ssid_disabled(ssid))
4657 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
4658 ss->name);
4659
4660 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4661
4662 if (ss->implicit_on_dfl)
4663 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
4664 else if (!ss->dfl_cftypes)
4665 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
4666
4667 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4668 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4669 } else {
4670 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
4671 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
4672 }
4673
4674 if (ss->bind)
4675 ss->bind(init_css_set.subsys[ssid]);
4676
4677 mutex_lock(&cgroup_mutex);
4678 css_populate_dir(init_css_set.subsys[ssid]);
4679 mutex_unlock(&cgroup_mutex);
4680 }
4681
4682
4683 hash_del(&init_css_set.hlist);
4684 hash_add(css_set_table, &init_css_set.hlist,
4685 css_set_hash(init_css_set.subsys));
4686
4687 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
4688 WARN_ON(register_filesystem(&cgroup_fs_type));
4689 WARN_ON(register_filesystem(&cgroup2_fs_type));
4690 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
4691
4692 return 0;
4693}
4694
4695static int __init cgroup_wq_init(void)
4696{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 * Use 1 for @max_active.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
4705 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4706 BUG_ON(!cgroup_destroy_wq);
4707 return 0;
4708}
4709core_initcall(cgroup_wq_init);
4710
/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
4716int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
4717 struct pid *pid, struct task_struct *tsk)
4718{
4719 char *buf;
4720 int retval;
4721 struct cgroup_root *root;
4722
4723 retval = -ENOMEM;
4724 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4725 if (!buf)
4726 goto out;
4727
4728 mutex_lock(&cgroup_mutex);
4729 spin_lock_irq(&css_set_lock);
4730
4731 for_each_root(root) {
4732 struct cgroup_subsys *ss;
4733 struct cgroup *cgrp;
4734 int ssid, count = 0;
4735
4736 if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
4737 continue;
4738
4739 seq_printf(m, "%d:", root->hierarchy_id);
4740 if (root != &cgrp_dfl_root)
4741 for_each_subsys(ss, ssid)
4742 if (root->subsys_mask & (1 << ssid))
4743 seq_printf(m, "%s%s", count++ ? "," : "",
4744 ss->legacy_name);
4745 if (strlen(root->name))
4746 seq_printf(m, "%sname=%s", count ? "," : "",
4747 root->name);
4748 seq_putc(m, ':');
4749
4750 cgrp = task_cgroup_from_root(tsk, root);
4751
		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the path.
		 */
4761 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
4762 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
4763 current->nsproxy->cgroup_ns);
4764 if (retval >= PATH_MAX)
4765 retval = -ENAMETOOLONG;
4766 if (retval < 0)
4767 goto out_unlock;
4768
4769 seq_puts(m, buf);
4770 } else {
4771 seq_puts(m, "/");
4772 }
4773
4774 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
4775 seq_puts(m, " (deleted)\n");
4776 else
4777 seq_putc(m, '\n');
4778 }
4779
4780 retval = 0;
4781out_unlock:
4782 spin_unlock_irq(&css_set_lock);
4783 mutex_unlock(&cgroup_mutex);
4784 kfree(buf);
4785out:
4786 return retval;
4787}
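
/*
 * For reference, each hierarchy contributes one
 * "hierarchy-id:controller-list:path" line, so the output might look like
 * this illustrative example:
 *
 *	8:memory:/user.slice
 *	1:name=systemd:/user.slice/user-1000.slice
 *	0::/user.slice
 *
 * The default hierarchy always has ID 0 and an empty controller list.
 */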
4788
/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the parent's css_set.  Empty cg_list indicates that
 * @child isn't holding a reference to its css_set.
 */
4797void cgroup_fork(struct task_struct *child)
4798{
4799 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4800 INIT_LIST_HEAD(&child->cg_list);
4801}
4802
/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the task in question.
 *
 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
 * returns an error, the fork aborts with that error code. This allows for
 * a cgroup subsystem to conditionally allow or deny new forks.
 */
4811int cgroup_can_fork(struct task_struct *child)
4812{
4813 struct cgroup_subsys *ss;
4814 int i, j, ret;
4815
4816 do_each_subsys_mask(ss, i, have_canfork_callback) {
4817 ret = ss->can_fork(child);
4818 if (ret)
4819 goto out_revert;
4820 } while_each_subsys_mask();
4821
4822 return 0;
4823
4824out_revert:
4825 for_each_subsys(ss, j) {
4826 if (j >= i)
4827 break;
4828 if (ss->cancel_fork)
4829 ss->cancel_fork(child);
4830 }
4831
4832 return ret;
4833}
4834
/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the task in question
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded.
 */
4842void cgroup_cancel_fork(struct task_struct *child)
4843{
4844 struct cgroup_subsys *ss;
4845 int i;
4846
4847 for_each_subsys(ss, i)
4848 if (ss->cancel_fork)
4849 ss->cancel_fork(child);
4850}
4851
/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary and
 * call the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
4862void cgroup_post_fork(struct task_struct *child)
4863{
4864 struct cgroup_subsys *ss;
4865 int i;
4866
	/*
	 * This may race against cgroup_enable_task_cg_lists().  As that
	 * function sets use_task_css_set_links before grabbing
	 * tasklist_lock and we just went through tasklist_lock to add
	 * @child, it's guaranteed that either we see the set
	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
	 * @child during its iteration.
	 *
	 * If we won the race, @child is associated with %current's
	 * css_set.  Grabbing css_set_lock guarantees both that the
	 * association is stable, and, on completion of the parent's
	 * migration, @child is visible in the source of migration or
	 * already in the destination cgroup.  This guarantee is necessary
	 * when implementing operations which need to migrate all tasks of
	 * a cgroup to another.
	 *
	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
	 * will remain in init_css_set.  This is safe because all tasks are
	 * in the init_css_set before cg_links is enabled and there's no
	 * operation which transfers all tasks out of init_css_set.
	 */
4888 if (use_task_css_set_links) {
4889 struct css_set *cset;
4890
4891 spin_lock_irq(&css_set_lock);
4892 cset = task_css_set(current);
4893 if (list_empty(&child->cg_list)) {
4894 get_css_set(cset);
4895 cset->nr_tasks++;
4896 css_set_move_task(child, NULL, cset, false);
4897 }
4898 spin_unlock_irq(&css_set_lock);
4899 }
4900
	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
4906 do_each_subsys_mask(ss, i, have_fork_callback) {
4907 ss->fork(child);
4908 } while_each_subsys_mask();
4909}
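
/*
 * For context, copy_process() invokes the hooks above in the following
 * order (heavily abbreviated sketch):
 *
 *	ret = cgroup_can_fork(child);	(failure aborts the fork)
 *	...
 *	(point of no return, child becomes visible)
 *	cgroup_post_fork(child);	(links child to its css_set)
 *
 * and, if the fork fails after cgroup_can_fork() has succeeded:
 *
 *	cgroup_cancel_fork(child);
 */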
4910
/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 *
 * An exiting task is unlinked from its css_set's task list, but the
 * css_set itself stays pinned until cgroup_free() so that subsystem
 * ->exit() and ->free() callbacks can still reach the task's csses.
 * No extra synchronization is needed here - PF_EXITING is visible to
 * the migration path, so migration can't race with us.
 */
4930void cgroup_exit(struct task_struct *tsk)
4931{
4932 struct cgroup_subsys *ss;
4933 struct css_set *cset;
4934 int i;
4935
	/*
	 * Unlink @tsk from its css_set.  As the migration path can't race
	 * with us, we can check css_set and cg_list without synchronization.
	 */
4940 cset = task_css_set(tsk);
4941
4942 if (!list_empty(&tsk->cg_list)) {
4943 spin_lock_irq(&css_set_lock);
4944 css_set_move_task(tsk, cset, NULL, false);
4945 cset->nr_tasks--;
4946 spin_unlock_irq(&css_set_lock);
4947 } else {
4948 get_css_set(cset);
4949 }
4950
4951
4952 do_each_subsys_mask(ss, i, have_exit_callback) {
4953 ss->exit(tsk);
4954 } while_each_subsys_mask();
4955}
4956
4957void cgroup_free(struct task_struct *task)
4958{
4959 struct css_set *cset = task_css_set(task);
4960 struct cgroup_subsys *ss;
4961 int ssid;
4962
4963 do_each_subsys_mask(ss, ssid, have_free_callback) {
4964 ss->free(task);
4965 } while_each_subsys_mask();
4966
4967 put_css_set(cset);
4968}
4969
4970static int __init cgroup_disable(char *str)
4971{
4972 struct cgroup_subsys *ss;
4973 char *token;
4974 int i;
4975
4976 while ((token = strsep(&str, ",")) != NULL) {
4977 if (!*token)
4978 continue;
4979
4980 for_each_subsys(ss, i) {
4981 if (strcmp(token, ss->name) &&
4982 strcmp(token, ss->legacy_name))
4983 continue;
4984 cgroup_disable_mask |= 1 << i;
4985 }
4986 }
4987 return 1;
4988}
4989__setup("cgroup_disable=", cgroup_disable);
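
/*
 * Example boot command line usage of the parameter registered above
 * (illustrative):
 *
 *	cgroup_disable=memory,io
 *
 * Tokens are matched against both ss->name and ss->legacy_name, and a
 * disabled subsystem stays disabled on every hierarchy, v1 and v2 alike.
 */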
4990
/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
5000struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5001 struct cgroup_subsys *ss)
5002{
5003 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5004 struct file_system_type *s_type = dentry->d_sb->s_type;
5005 struct cgroup_subsys_state *css = NULL;
5006 struct cgroup *cgrp;
5007
	/* is @dentry a cgroup dir? */
5009 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
5010 !kn || kernfs_type(kn) != KERNFS_DIR)
5011 return ERR_PTR(-EBADF);
5012
5013 rcu_read_lock();
5014
	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
5020 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
5021 if (cgrp)
5022 css = cgroup_css(cgrp, ss);
5023
5024 if (!css || !css_tryget_online(css))
5025 css = ERR_PTR(-ENOENT);
5026
5027 rcu_read_unlock();
5028 return css;
5029}
5030
/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's a valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
5039struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5040{
5041 WARN_ON_ONCE(!rcu_read_lock_held());
5042 return idr_find(&ss->css_idr, id);
5043}
5044
/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it.  Returns ERR_PTR(-ENOENT) if @path
 * doesn't exist and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
 */
5054struct cgroup *cgroup_get_from_path(const char *path)
5055{
5056 struct kernfs_node *kn;
5057 struct cgroup *cgrp;
5058
5059 mutex_lock(&cgroup_mutex);
5060
5061 kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
5062 if (kn) {
5063 if (kernfs_type(kn) == KERNFS_DIR) {
5064 cgrp = kn->priv;
5065 cgroup_get_live(cgrp);
5066 } else {
5067 cgrp = ERR_PTR(-ENOTDIR);
5068 }
5069 kernfs_put(kn);
5070 } else {
5071 cgrp = ERR_PTR(-ENOENT);
5072 }
5073
5074 mutex_unlock(&cgroup_mutex);
5075 return cgrp;
5076}
5077EXPORT_SYMBOL_GPL(cgroup_get_from_path);
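
/*
 * Usage sketch (the path is illustrative).  The returned cgroup holds a
 * reference which the caller must drop with cgroup_put():
 *
 *	struct cgroup *cgrp;
 *
 *	cgrp = cgroup_get_from_path("/my/service");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */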
5078
/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by opening a cgroup2 directory
 *
 * Find the cgroup from a fd which should be obtained by opening a cgroup
 * directory.  Returns a pointer to the cgroup on success.  ERR_PTR is
 * returned if the cgroup cannot be found.
 */
5088struct cgroup *cgroup_get_from_fd(int fd)
5089{
5090 struct cgroup_subsys_state *css;
5091 struct cgroup *cgrp;
5092 struct file *f;
5093
5094 f = fget_raw(fd);
5095 if (!f)
5096 return ERR_PTR(-EBADF);
5097
5098 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
5099 fput(f);
5100 if (IS_ERR(css))
5101 return ERR_CAST(css);
5102
5103 cgrp = css->cgroup;
5104 if (!cgroup_on_dfl(cgrp)) {
5105 cgroup_put(cgrp);
5106 return ERR_PTR(-EBADF);
5107 }
5108
5109 return cgrp;
5110}
5111EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
5112
/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
5117#ifdef CONFIG_SOCK_CGROUP_DATA
5118
5119#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
5120
5121DEFINE_SPINLOCK(cgroup_sk_update_lock);
5122static bool cgroup_sk_alloc_disabled __read_mostly;
5123
5124void cgroup_sk_alloc_disable(void)
5125{
5126 if (cgroup_sk_alloc_disabled)
5127 return;
5128 pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
5129 cgroup_sk_alloc_disabled = true;
5130}
5131
5132#else
5133
5134#define cgroup_sk_alloc_disabled false
5135
5136#endif
5137
5138void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
5139{
5140 if (cgroup_sk_alloc_disabled)
5141 return;
5142
	/* Socket clone path */
5144 if (skcd->val) {
		/*
		 * We might be cloning a socket which is left in an empty
		 * cgroup and the cgroup might have already been rmdir'd.
		 * Don't use cgroup_get_live().
		 */
5150 cgroup_get(sock_cgroup_ptr(skcd));
5151 return;
5152 }
5153
5154 rcu_read_lock();
5155
5156 while (true) {
5157 struct css_set *cset;
5158
5159 cset = task_css_set(current);
5160 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
5161 skcd->val = (unsigned long)cset->dfl_cgrp;
5162 break;
5163 }
5164 cpu_relax();
5165 }
5166
5167 rcu_read_unlock();
5168}
5169
5170void cgroup_sk_free(struct sock_cgroup_data *skcd)
5171{
5172 cgroup_put(sock_cgroup_ptr(skcd));
5173}
5174
5175#endif
5176
5177#ifdef CONFIG_CGROUP_BPF
5178int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
5179 enum bpf_attach_type type, bool overridable)
5180{
5181 struct cgroup *parent = cgroup_parent(cgrp);
5182 int ret;
5183
5184 mutex_lock(&cgroup_mutex);
5185 ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
5186 mutex_unlock(&cgroup_mutex);
5187 return ret;
5188}
5189#endif
5190