/*
 * Generic process-grouping system (cgroup core).
 *
 * Based originally on the cpuset system.  This file implements the common
 * cgroup machinery shared by cgroup v1 and v2: hierarchy roots, css_sets,
 * task migration and the cgroup filesystem interface.
 */

29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include "cgroup-internal.h"
32
33#include <linux/cred.h>
34#include <linux/errno.h>
35#include <linux/init_task.h>
36#include <linux/kernel.h>
37#include <linux/magic.h>
38#include <linux/mutex.h>
39#include <linux/mount.h>
40#include <linux/pagemap.h>
41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
43#include <linux/sched.h>
44#include <linux/sched/task.h>
45#include <linux/slab.h>
46#include <linux/spinlock.h>
47#include <linux/percpu-rwsem.h>
48#include <linux/string.h>
49#include <linux/hashtable.h>
50#include <linux/idr.h>
51#include <linux/kthread.h>
52#include <linux/atomic.h>
53#include <linux/cpuset.h>
54#include <linux/proc_ns.h>
55#include <linux/nsproxy.h>
56#include <linux/file.h>
57#include <net/sock.h>
58
59#define CREATE_TRACE_POINTS
60#include <trace/events/cgroup.h>
61
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)
64
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects and the tasks chained off each css_set.
 *
 * Both locks are exported when CONFIG_PROVE_RCU is set so that accessors
 * in cgroup.h can use them for lockdep annotations.
 */
75DEFINE_MUTEX(cgroup_mutex);
76DEFINE_SPINLOCK(css_set_lock);
77
78#ifdef CONFIG_PROVE_RCU
79EXPORT_SYMBOL_GPL(cgroup_mutex);
80EXPORT_SYMBOL_GPL(css_set_lock);
81#endif
82
/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
87static DEFINE_SPINLOCK(cgroup_idr_lock);
88
/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal.
 */
93static DEFINE_SPINLOCK(cgroup_file_kn_lock);
94
95struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
96
#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");
101
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
108static struct workqueue_struct *cgroup_destroy_wq;
109
/* generate an array of cgroup subsystem pointers */
111#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
112struct cgroup_subsys *cgroup_subsys[] = {
113#include <linux/cgroup_subsys.h>
114};
115#undef SUBSYS
116
/* array of cgroup subsystem names */
118#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
119static const char *cgroup_subsys_name[] = {
120#include <linux/cgroup_subsys.h>
121};
122#undef SUBSYS
123
124
125#define SUBSYS(_x) \
126 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
127 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
128 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
129 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
130#include <linux/cgroup_subsys.h>
131#undef SUBSYS
132
133#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
134static struct static_key_true *cgroup_subsys_enabled_key[] = {
135#include <linux/cgroup_subsys.h>
136};
137#undef SUBSYS
138
139#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
140static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
141#include <linux/cgroup_subsys.h>
142};
143#undef SUBSYS
144
145static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
152struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
153EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
159static bool cgrp_dfl_visible;

/* some controllers are not supported on the default hierarchy */
162static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
165static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
168static u16 cgrp_dfl_threaded_ss_mask;

/* the list of hierarchy roots */
171LIST_HEAD(cgroup_roots);
172static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
175static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
184static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features so that the
 * fork/exit/free/can-fork paths only call into subsystems which actually
 * implement the corresponding hooks.
 */
190static u16 have_fork_callback __read_mostly;
191static u16 have_exit_callback __read_mostly;
192static u16 have_free_callback __read_mostly;
193static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
196struct cgroup_namespace init_cgroup_ns = {
197 .count = REFCOUNT_INIT(2),
198 .user_ns = &init_user_ns,
199 .ns.ops = &cgroupns_operations,
200 .ns.inum = PROC_CGROUP_INIT_INO,
201 .root_cset = &init_css_set,
202};
203
204static struct file_system_type cgroup2_fs_type;
205static struct cftype cgroup_base_files[];
206
207static int cgroup_apply_control(struct cgroup *cgrp);
208static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
209static void css_task_iter_advance(struct css_task_iter *it);
210static int cgroup_destroy_locked(struct cgroup *cgrp);
211static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
212 struct cgroup_subsys *ss);
213static void css_release(struct percpu_ref *ref);
214static void kill_css(struct cgroup_subsys_state *css);
215static int cgroup_addrm_files(struct cgroup_subsys_state *css,
216 struct cgroup *cgrp, struct cftype cfts[],
217 bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * Returns %true if the subsystem identified by @ssid is enabled, i.e. its
 * static enable key hasn't been switched off (e.g. via the cgroup_disable=
 * boot option).
 */
227bool cgroup_ssid_enabled(int ssid)
228{
229 if (CGROUP_SUBSYS_COUNT == 0)
230 return false;
231
232 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
233}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the cgroup v2 interface.  A number of interface
 * files and behaviors exist only there, so controllers and core code use
 * this test to decide which set of semantics applies to @cgrp.
 */
288bool cgroup_on_dfl(const struct cgroup *cgrp)
289{
290 return cgrp->root == &cgrp_dfl_root;
291}

/* IDR wrappers which synchronize using cgroup_idr_lock */
294static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
295 gfp_t gfp_mask)
296{
297 int ret;
298
299 idr_preload(gfp_mask);
300 spin_lock_bh(&cgroup_idr_lock);
301 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
302 spin_unlock_bh(&cgroup_idr_lock);
303 idr_preload_end();
304 return ret;
305}
306
307static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
308{
309 void *ret;
310
311 spin_lock_bh(&cgroup_idr_lock);
312 ret = idr_replace(idr, ptr, id);
313 spin_unlock_bh(&cgroup_idr_lock);
314 return ret;
315}
316
317static void cgroup_idr_remove(struct idr *idr, int id)
318{
319 spin_lock_bh(&cgroup_idr_lock);
320 idr_remove(idr, id);
321 spin_unlock_bh(&cgroup_idr_lock);
322}
323
324static bool cgroup_has_tasks(struct cgroup *cgrp)
325{
326 return cgrp->nr_populated_csets;
327}
328
329bool cgroup_is_threaded(struct cgroup *cgrp)
330{
331 return cgrp->dom_cgrp != cgrp;
332}
333
/* can @cgrp host both domain and threaded children? */
335static bool cgroup_is_mixable(struct cgroup *cgrp)
336{
	/*
	 * Root isn't under domain level resource control exempting it from
	 * the no-internal-process constraint, so it can serve as a thread
	 * root and a parent of resource domains at the same time.
	 */
342 return !cgroup_parent(cgrp);
343}

/* can @cgrp become a thread root?  Should always be true for the root cgroup */
346static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
347{
348
349 if (cgroup_is_mixable(cgrp))
350 return true;
351
352
353 if (cgroup_is_threaded(cgrp))
354 return false;
355
356
357 if (cgrp->nr_populated_domain_children)
358 return false;
359
360
361 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
362 return false;
363
364 return true;
365}

/* is @cgrp root of a threaded subtree? */
368bool cgroup_is_thread_root(struct cgroup *cgrp)
369{
370
371 if (cgroup_is_threaded(cgrp))
372 return false;
373
374
375 if (cgrp->nr_threaded_children)
376 return true;
377
378
379
380
381
382 if (cgroup_has_tasks(cgrp) &&
383 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
384 return true;
385
386 return false;
387}
388
389
390static bool cgroup_is_valid_domain(struct cgroup *cgrp)
391{
392
393 if (cgroup_is_threaded(cgrp))
394 return false;
395
396
397 while ((cgrp = cgroup_parent(cgrp))) {
398 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
399 return false;
400 if (cgroup_is_threaded(cgrp))
401 return false;
402 }
403
404 return true;
405}

/* subsystems visibly enabled on a cgroup */
408static u16 cgroup_control(struct cgroup *cgrp)
409{
410 struct cgroup *parent = cgroup_parent(cgrp);
411 u16 root_ss_mask = cgrp->root->subsys_mask;
412
413 if (parent) {
414 u16 ss_mask = parent->subtree_control;
415
416
417 if (cgroup_is_threaded(cgrp))
418 ss_mask &= cgrp_dfl_threaded_ss_mask;
419 return ss_mask;
420 }
421
422 if (cgroup_on_dfl(cgrp))
423 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
424 cgrp_dfl_implicit_ss_mask);
425 return root_ss_mask;
426}

/* subsystems enabled on a cgroup */
429static u16 cgroup_ss_mask(struct cgroup *cgrp)
430{
431 struct cgroup *parent = cgroup_parent(cgrp);
432
433 if (parent) {
434 u16 ss_mask = parent->subtree_ss_mask;
435
436
437 if (cgroup_is_threaded(cgrp))
438 ss_mask &= cgrp_dfl_threaded_ss_mask;
439 return ss_mask;
440 }
441
442 return cgrp->root->subsys_mask;
443}
444
/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
456static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
457 struct cgroup_subsys *ss)
458{
459 if (ss)
460 return rcu_dereference_check(cgrp->subsys[ss->id],
461 lockdep_is_held(&cgroup_mutex));
462 else
463 return &cgrp->self;
464}
465
/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
 * or can't be pinned because it's going offline, %NULL is returned.
 */
474static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
475 struct cgroup_subsys *ss)
476{
477 struct cgroup_subsys_state *css;
478
479 rcu_read_lock();
480 css = cgroup_css(cgrp, ss);
481 if (!css || !css_tryget_online(css))
482 css = NULL;
483 rcu_read_unlock();
484
485 return css;
486}
487
/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
498static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
499 struct cgroup_subsys *ss)
500{
501 lockdep_assert_held(&cgroup_mutex);
502
503 if (!ss)
504 return &cgrp->self;
505
506
507
508
509
510 while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
511 cgrp = cgroup_parent(cgrp);
512 if (!cgrp)
513 return NULL;
514 }
515
516 return cgroup_css(cgrp, ss);
517}
518
/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If no ancestor has @ss enabled, the css of the
 * init_css_set is returned.  The returned css must be put using css_put().
 */
530struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
531 struct cgroup_subsys *ss)
532{
533 struct cgroup_subsys_state *css;
534
535 rcu_read_lock();
536
537 do {
538 css = cgroup_css(cgrp, ss);
539
540 if (css && css_tryget_online(css))
541 goto out_unlock;
542 cgrp = cgroup_parent(cgrp);
543 } while (cgrp);
544
545 css = init_css_set.subsys[ss->id];
546 css_get(css);
547out_unlock:
548 rcu_read_unlock();
549 return css;
550}
551
552static void cgroup_get_live(struct cgroup *cgrp)
553{
554 WARN_ON_ONCE(cgroup_is_dead(cgrp));
555 css_get(&cgrp->self);
556}
557
558struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
559{
560 struct cgroup *cgrp = of->kn->parent->priv;
561 struct cftype *cft = of_cft(of);
562
563
564
565
566
567
568
569
570
571 if (cft->ss)
572 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
573 else
574 return &cgrp->self;
575}
576EXPORT_SYMBOL_GPL(of_css);
577
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 */
586#define for_each_css(css, ssid, cgrp) \
587 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
588 if (!((css) = rcu_dereference_check( \
589 (cgrp)->subsys[(ssid)], \
590 lockdep_is_held(&cgroup_mutex)))) { } \
591 else
592
593
594
595
596
597
598
599
600
601#define for_each_e_css(css, ssid, cgrp) \
602 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
603 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
604 ; \
605 else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.  Must be paired with while_each_subsys_mask().
 */
616#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
617 unsigned long __ss_mask = (ss_mask); \
618 if (!CGROUP_SUBSYS_COUNT) { \
619 (ssid) = 0; \
620 break; \
621 } \
622 for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
623 (ss) = cgroup_subsys[ssid]; \
624 {
625
626#define while_each_subsys_mask() \
627 } \
628 } \
629} while (false)
630
631
632#define cgroup_for_each_live_child(child, cgrp) \
633 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
634 if (({ lockdep_assert_held(&cgroup_mutex); \
635 cgroup_is_dead(child); })) \
636 ; \
637 else
638
639
640#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
641 css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
642 if (({ lockdep_assert_held(&cgroup_mutex); \
643 (dsct) = (d_css)->cgroup; \
644 cgroup_is_dead(dsct); })) \
645 ; \
646 else
647
648
649#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
650 css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
651 if (({ lockdep_assert_held(&cgroup_mutex); \
652 (dsct) = (d_css)->cgroup; \
653 cgroup_is_dead(dsct); })) \
654 ; \
655 else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.  It is also used to anchor the list of css_sets
 * and starts out with a reference so that it is never freed.
 */
664struct css_set init_css_set = {
665 .refcount = REFCOUNT_INIT(1),
666 .dom_cset = &init_css_set,
667 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
668 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
669 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
670 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
671 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
672 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
673 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * The following field is re-initialized when this cset gets linked
	 * in cgroup_init().  However, let's initialize the field
	 * statically too so that the default cgroup can be accessed safely
	 * early during boot.
	 */
681 .dfl_cgrp = &cgrp_dfl_root.cgrp,
682};
683
684static int css_set_count = 1;
685
686static bool css_set_threaded(struct css_set *cset)
687{
688 return cset->dom_cset != cset;
689}
690
/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * A css_set is populated if it has tasks on either its ->tasks or
 * ->mg_tasks list.
 */
700static bool css_set_populated(struct css_set *cset)
701{
702 lockdep_assert_held(&css_set_lock);
703
704 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
705}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 * counters are propagated towards root so that a given cgroup's
 * nr_populated_children is zero iff none of its descendants contain any
 * tasks.
 *
 * Whenever the populated state of @cgrp flips, cgroup1's notify-on-release
 * machinery is checked and a notification is generated on the
 * "cgroup.events" interface file.
 */
724static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
725{
726 struct cgroup *child = NULL;
727 int adj = populated ? 1 : -1;
728
729 lockdep_assert_held(&css_set_lock);
730
731 do {
732 bool was_populated = cgroup_is_populated(cgrp);
733
734 if (!child) {
735 cgrp->nr_populated_csets += adj;
736 } else {
737 if (cgroup_is_threaded(child))
738 cgrp->nr_populated_threaded_children += adj;
739 else
740 cgrp->nr_populated_domain_children += adj;
741 }
742
743 if (was_populated == cgroup_is_populated(cgrp))
744 break;
745
746 cgroup1_check_for_release(cgrp);
747 cgroup_file_notify(&cgrp->events_file);
748
749 child = cgrp;
750 cgrp = cgroup_parent(cgrp);
751 } while (cgrp);
752}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * populated counters of all associated cgroups accordingly.
 */
762static void css_set_update_populated(struct css_set *cset, bool populated)
763{
764 struct cgrp_cset_link *link;
765
766 lockdep_assert_held(&css_set_lock);
767
768 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
769 cgroup_update_populated(link->cgrp, populated);
770}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be %NULL)
 * @to_cset: new css_set @task is being moved to (may be %NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be %NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be %NULL.
 *
 * This function automatically handles populated counter updates and task
 * iterator adjustments, but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
787static void css_set_move_task(struct task_struct *task,
788 struct css_set *from_cset, struct css_set *to_cset,
789 bool use_mg_tasks)
790{
791 lockdep_assert_held(&css_set_lock);
792
793 if (to_cset && !css_set_populated(to_cset))
794 css_set_update_populated(to_cset, true);
795
796 if (from_cset) {
797 struct css_task_iter *it, *pos;
798
799 WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position.  Advancing an iterator might remove it from
		 * the list, use safe walk.  See css_task_iter_advance*()
		 * for details.
		 */
808 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
809 iters_node)
810 if (it->task_pos == &task->cg_list)
811 css_task_iter_advance(it);
812
813 list_del_init(&task->cg_list);
814 if (!css_set_populated(from_cset))
815 css_set_update_populated(from_cset, false);
816 } else {
817 WARN_ON_ONCE(!list_empty(&task->cg_list));
818 }
819
820 if (to_cset) {
821
822
823
824
825
826
827 WARN_ON_ONCE(task->flags & PF_EXITING);
828
829 rcu_assign_pointer(task->cgroups, to_cset);
830 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
831 &to_cset->tasks);
832 }
833}

/*
 * Hash table for css_sets.  This speeds up finding an existing css_set
 * which matches a given set of csses in find_existing_css_set().
 */
840#define CSS_SET_HASH_BITS 7
841static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
842
843static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
844{
845 unsigned long key = 0UL;
846 struct cgroup_subsys *ss;
847 int i;
848
849 for_each_subsys(ss, i)
850 key += (unsigned long)css[i];
851 key = (key >> 16) ^ key;
852
853 return key;
854}
855
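/*
 * Drop a reference on @cset.  Must be called with css_set_lock held; when
 * the last reference is dropped, the css_set is unhashed, unlinked from
 * all cgroups and threaded-cset lists, its css and cgroup references are
 * released and the css_set itself is freed via RCU.
 */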
856void put_css_set_locked(struct css_set *cset)
857{
858 struct cgrp_cset_link *link, *tmp_link;
859 struct cgroup_subsys *ss;
860 int ssid;
861
862 lockdep_assert_held(&css_set_lock);
863
864 if (!refcount_dec_and_test(&cset->refcount))
865 return;
866
867 WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
868
869
870 for_each_subsys(ss, ssid) {
871 list_del(&cset->e_cset_node[ssid]);
872 css_put(cset->subsys[ssid]);
873 }
874 hash_del(&cset->hlist);
875 css_set_count--;
876
877 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
878 list_del(&link->cset_link);
879 list_del(&link->cgrp_link);
880 if (cgroup_parent(link->cgrp))
881 cgroup_put(link->cgrp);
882 kfree(link);
883 }
884
885 if (css_set_threaded(cset)) {
886 list_del(&cset->threaded_csets_node);
887 put_css_set_locked(cset->dom_cset);
888 }
889
890 kfree_rcu(cset, rcu_head);
891}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
903static bool compare_css_sets(struct css_set *cset,
904 struct css_set *old_cset,
905 struct cgroup *new_cgrp,
906 struct cgroup_subsys_state *template[])
907{
908 struct cgroup *new_dfl_cgrp;
909 struct list_head *l1, *l2;
910
911
912
913
914
915
916 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
917 return false;
918
919
920
921 if (cgroup_on_dfl(new_cgrp))
922 new_dfl_cgrp = new_cgrp;
923 else
924 new_dfl_cgrp = old_cset->dfl_cgrp;
925
926 if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
927 return false;
928
929
930
931
932
933
934
935 l1 = &cset->cgrp_links;
936 l2 = &old_cset->cgrp_links;
937 while (1) {
938 struct cgrp_cset_link *link1, *link2;
939 struct cgroup *cgrp1, *cgrp2;
940
941 l1 = l1->next;
942 l2 = l2->next;
943
944 if (l1 == &cset->cgrp_links) {
945 BUG_ON(l2 != &old_cset->cgrp_links);
946 break;
947 } else {
948 BUG_ON(l2 == &old_cset->cgrp_links);
949 }
950
951 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
952 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
953 cgrp1 = link1->cgrp;
954 cgrp2 = link2->cgrp;
955
956 BUG_ON(cgrp1->root != cgrp2->root);
957
958
959
960
961
962
963
964
965 if (cgrp1->root == new_cgrp->root) {
966 if (cgrp1 != new_cgrp)
967 return false;
968 } else {
969 if (cgrp1 != cgrp2)
970 return false;
971 }
972 }
973 return true;
974}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
982static struct css_set *find_existing_css_set(struct css_set *old_cset,
983 struct cgroup *cgrp,
984 struct cgroup_subsys_state *template[])
985{
986 struct cgroup_root *root = cgrp->root;
987 struct cgroup_subsys *ss;
988 struct css_set *cset;
989 unsigned long key;
990 int i;
991
992
993
994
995
996
997 for_each_subsys(ss, i) {
998 if (root->subsys_mask & (1UL << i)) {
999
1000
1001
1002
1003 template[i] = cgroup_e_css(cgrp, ss);
1004 } else {
1005
1006
1007
1008
1009 template[i] = old_cset->subsys[i];
1010 }
1011 }
1012
1013 key = css_set_hash(template);
1014 hash_for_each_possible(css_set_table, cset, hlist, key) {
1015 if (!compare_css_sets(cset, old_cset, cgrp, template))
1016 continue;
1017
1018
1019 return cset;
1020 }
1021
1022
1023 return NULL;
1024}
1025
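/* free a list of cgrp_cset_links allocated by allocate_cgrp_cset_links() */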
1026static void free_cgrp_cset_links(struct list_head *links_to_free)
1027{
1028 struct cgrp_cset_link *link, *tmp_link;
1029
1030 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
1031 list_del(&link->cset_link);
1032 kfree(link);
1033 }
1034}
1035
/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
1044static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1045{
1046 struct cgrp_cset_link *link;
1047 int i;
1048
1049 INIT_LIST_HEAD(tmp_links);
1050
1051 for (i = 0; i < count; i++) {
1052 link = kzalloc(sizeof(*link), GFP_KERNEL);
1053 if (!link) {
1054 free_cgrp_cset_links(tmp_links);
1055 return -ENOMEM;
1056 }
1057 list_add(&link->cset_link, tmp_links);
1058 }
1059 return 0;
1060}
1061
1062
1063
1064
1065
1066
1067
1068static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1069 struct cgroup *cgrp)
1070{
1071 struct cgrp_cset_link *link;
1072
1073 BUG_ON(list_empty(tmp_links));
1074
1075 if (cgroup_on_dfl(cgrp))
1076 cset->dfl_cgrp = cgrp;
1077
1078 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1079 link->cset = cset;
1080 link->cgrp = cgrp;
1081
1082
1083
1084
1085
1086 list_move_tail(&link->cset_link, &cgrp->cset_links);
1087 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1088
1089 if (cgroup_parent(cgrp))
1090 cgroup_get_live(cgrp);
1091}
1092
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */
1101static struct css_set *find_css_set(struct css_set *old_cset,
1102 struct cgroup *cgrp)
1103{
1104 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1105 struct css_set *cset;
1106 struct list_head tmp_links;
1107 struct cgrp_cset_link *link;
1108 struct cgroup_subsys *ss;
1109 unsigned long key;
1110 int ssid;
1111
1112 lockdep_assert_held(&cgroup_mutex);
1113
1114
1115
1116 spin_lock_irq(&css_set_lock);
1117 cset = find_existing_css_set(old_cset, cgrp, template);
1118 if (cset)
1119 get_css_set(cset);
1120 spin_unlock_irq(&css_set_lock);
1121
1122 if (cset)
1123 return cset;
1124
1125 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1126 if (!cset)
1127 return NULL;
1128
1129
1130 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1131 kfree(cset);
1132 return NULL;
1133 }
1134
1135 refcount_set(&cset->refcount, 1);
1136 cset->dom_cset = cset;
1137 INIT_LIST_HEAD(&cset->tasks);
1138 INIT_LIST_HEAD(&cset->mg_tasks);
1139 INIT_LIST_HEAD(&cset->task_iters);
1140 INIT_LIST_HEAD(&cset->threaded_csets);
1141 INIT_HLIST_NODE(&cset->hlist);
1142 INIT_LIST_HEAD(&cset->cgrp_links);
1143 INIT_LIST_HEAD(&cset->mg_preload_node);
1144 INIT_LIST_HEAD(&cset->mg_node);
1145
1146
1147
1148 memcpy(cset->subsys, template, sizeof(cset->subsys));
1149
1150 spin_lock_irq(&css_set_lock);
1151
1152 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1153 struct cgroup *c = link->cgrp;
1154
1155 if (c->root == cgrp->root)
1156 c = cgrp;
1157 link_css_set(&tmp_links, cset, c);
1158 }
1159
1160 BUG_ON(!list_empty(&tmp_links));
1161
1162 css_set_count++;
1163
1164
1165 key = css_set_hash(cset->subsys);
1166 hash_add(css_set_table, &cset->hlist, key);
1167
1168 for_each_subsys(ss, ssid) {
1169 struct cgroup_subsys_state *css = cset->subsys[ssid];
1170
1171 list_add_tail(&cset->e_cset_node[ssid],
1172 &css->cgroup->e_csets[ssid]);
1173 css_get(css);
1174 }
1175
1176 spin_unlock_irq(&css_set_lock);
1177
1178
1179
1180
1181
1182
1183
1184 if (cgroup_is_threaded(cset->dfl_cgrp)) {
1185 struct css_set *dcset;
1186
1187 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1188 if (!dcset) {
1189 put_css_set(cset);
1190 return NULL;
1191 }
1192
1193 spin_lock_irq(&css_set_lock);
1194 cset->dom_cset = dcset;
1195 list_add_tail(&cset->threaded_csets_node,
1196 &dcset->threaded_csets);
1197 spin_unlock_irq(&css_set_lock);
1198 }
1199
1200 return cset;
1201}
1202
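/* look up the cgroup_root that a kernfs_root belongs to */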
1203struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1204{
1205 struct cgroup *root_cgrp = kf_root->kn->priv;
1206
1207 return root_cgrp->root;
1208}
1209
1210static int cgroup_init_root_id(struct cgroup_root *root)
1211{
1212 int id;
1213
1214 lockdep_assert_held(&cgroup_mutex);
1215
1216 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1217 if (id < 0)
1218 return id;
1219
1220 root->hierarchy_id = id;
1221 return 0;
1222}
1223
1224static void cgroup_exit_root_id(struct cgroup_root *root)
1225{
1226 lockdep_assert_held(&cgroup_mutex);
1227
1228 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1229}
1230
1231void cgroup_free_root(struct cgroup_root *root)
1232{
1233 if (root) {
1234 idr_destroy(&root->cgroup_idr);
1235 kfree(root);
1236 }
1237}
1238
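/*
 * Tear down @root: rebind its controllers back to the default hierarchy,
 * unlink all css_set links, release its hierarchy ID and destroy the
 * kernfs root.  Grabs cgroup_mutex via cgroup_lock_and_drain_offline()
 * and drops it before returning.
 */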
1239static void cgroup_destroy_root(struct cgroup_root *root)
1240{
1241 struct cgroup *cgrp = &root->cgrp;
1242 struct cgrp_cset_link *link, *tmp_link;
1243
1244 trace_cgroup_destroy_root(root);
1245
1246 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1247
1248 BUG_ON(atomic_read(&root->nr_cgrps));
1249 BUG_ON(!list_empty(&cgrp->self.children));
1250
1251
1252 WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1253
1254
1255
1256
1257
1258 spin_lock_irq(&css_set_lock);
1259
1260 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1261 list_del(&link->cset_link);
1262 list_del(&link->cgrp_link);
1263 kfree(link);
1264 }
1265
1266 spin_unlock_irq(&css_set_lock);
1267
1268 if (!list_empty(&root->root_list)) {
1269 list_del(&root->root_list);
1270 cgroup_root_count--;
1271 }
1272
1273 cgroup_exit_root_id(root);
1274
1275 mutex_unlock(&cgroup_mutex);
1276
1277 kernfs_destroy_root(root->kf_root);
1278 cgroup_free_root(root);
1279}
1280
/*
 * look up cgroup associated with current task's cgroup namespace on the
 * specified hierarchy
 */
1285static struct cgroup *
1286current_cgns_cgroup_from_root(struct cgroup_root *root)
1287{
1288 struct cgroup *res = NULL;
1289 struct css_set *cset;
1290
1291 lockdep_assert_held(&css_set_lock);
1292
1293 rcu_read_lock();
1294
1295 cset = current->nsproxy->cgroup_ns->root_cset;
1296 if (cset == &init_css_set) {
1297 res = &root->cgrp;
1298 } else {
1299 struct cgrp_cset_link *link;
1300
1301 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1302 struct cgroup *c = link->cgrp;
1303
1304 if (c->root == root) {
1305 res = c;
1306 break;
1307 }
1308 }
1309 }
1310 rcu_read_unlock();
1311
1312 BUG_ON(!res);
1313 return res;
1314}
1315
/* look up cgroup associated with given css_set on the specified hierarchy */
1317static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1318 struct cgroup_root *root)
1319{
1320 struct cgroup *res = NULL;
1321
1322 lockdep_assert_held(&cgroup_mutex);
1323 lockdep_assert_held(&css_set_lock);
1324
1325 if (cset == &init_css_set) {
1326 res = &root->cgrp;
1327 } else if (root == &cgrp_dfl_root) {
1328 res = cset->dfl_cgrp;
1329 } else {
1330 struct cgrp_cset_link *link;
1331
1332 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1333 struct cgroup *c = link->cgrp;
1334
1335 if (c->root == root) {
1336 res = c;
1337 break;
1338 }
1339 }
1340 }
1341
1342 BUG_ON(!res);
1343 return res;
1344}
1345

/*
 * Return the cgroup for "task" from the given hierarchy.  Must be
 * called with cgroup_mutex and css_set_lock held.
 */
1350struct cgroup *task_cgroup_from_root(struct task_struct *task,
1351 struct cgroup_root *root)
1352{
1353
1354
1355
1356
1357
1358 return cset_cgroup_from_root(task_css_set(task), root);
1359}
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1388
1389static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1390 char *buf)
1391{
1392 struct cgroup_subsys *ss = cft->ss;
1393
1394 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1395 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1396 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1398 cft->name);
1399 else
1400 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1401 return buf;
1402}
1403
1404
1405
1406
1407
1408
1409
1410static umode_t cgroup_file_mode(const struct cftype *cft)
1411{
1412 umode_t mode = 0;
1413
1414 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1415 mode |= S_IRUGO;
1416
1417 if (cft->write_u64 || cft->write_s64 || cft->write) {
1418 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1419 mode |= S_IWUGO;
1420 else
1421 mode |= S_IWUSR;
1422 }
1423
1424 return mode;
1425}
1426
/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
1439static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1440{
1441 u16 cur_ss_mask = subtree_control;
1442 struct cgroup_subsys *ss;
1443 int ssid;
1444
1445 lockdep_assert_held(&cgroup_mutex);
1446
1447 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1448
1449 while (true) {
1450 u16 new_ss_mask = cur_ss_mask;
1451
1452 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1453 new_ss_mask |= ss->depends_on;
1454 } while_each_subsys_mask();
1455
1456
1457
1458
1459
1460
1461 new_ss_mask &= this_ss_mask;
1462
1463 if (new_ss_mask == cur_ss_mask)
1464 break;
1465 cur_ss_mask = new_ss_mask;
1466 }
1467
1468 return cur_ss_mask;
1469}
1470
/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time.  If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
1481void cgroup_kn_unlock(struct kernfs_node *kn)
1482{
1483 struct cgroup *cgrp;
1484
1485 if (kernfs_type(kn) == KERNFS_DIR)
1486 cgrp = kn->priv;
1487 else
1488 cgrp = kn->parent->priv;
1489
1490 mutex_unlock(&cgroup_mutex);
1491
1492 kernfs_unbreak_active_protection(kn);
1493 cgroup_put(cgrp);
1494}
1495
/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection which allows all kernfs
 * operations including self-removal.
 */
1513struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1514{
1515 struct cgroup *cgrp;
1516
1517 if (kernfs_type(kn) == KERNFS_DIR)
1518 cgrp = kn->priv;
1519 else
1520 cgrp = kn->parent->priv;
1521
1522
1523
1524
1525
1526
1527
1528 if (!cgroup_tryget(cgrp))
1529 return NULL;
1530 kernfs_break_active_protection(kn);
1531
1532 if (drain_offline)
1533 cgroup_lock_and_drain_offline(cgrp);
1534 else
1535 mutex_lock(&cgroup_mutex);
1536
1537 if (!cgroup_is_dead(cgrp))
1538 return cgrp;
1539
1540 cgroup_kn_unlock(kn);
1541 return NULL;
1542}
1543
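/* remove @cft's interface file from @cgrp and clear its cgroup_file */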
1544static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1545{
1546 char name[CGROUP_FILE_NAME_MAX];
1547
1548 lockdep_assert_held(&cgroup_mutex);
1549
1550 if (cft->file_offset) {
1551 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1552 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1553
1554 spin_lock_irq(&cgroup_file_kn_lock);
1555 cfile->kn = NULL;
1556 spin_unlock_irq(&cgroup_file_kn_lock);
1557 }
1558
1559 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1560}
1561
1562
1563
1564
1565
1566static void css_clear_dir(struct cgroup_subsys_state *css)
1567{
1568 struct cgroup *cgrp = css->cgroup;
1569 struct cftype *cfts;
1570
1571 if (!(css->flags & CSS_VISIBLE))
1572 return;
1573
1574 css->flags &= ~CSS_VISIBLE;
1575
1576 list_for_each_entry(cfts, &css->ss->cfts, node)
1577 cgroup_addrm_files(css, cgrp, cfts, false);
1578}
1579
/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
1586static int css_populate_dir(struct cgroup_subsys_state *css)
1587{
1588 struct cgroup *cgrp = css->cgroup;
1589 struct cftype *cfts, *failed_cfts;
1590 int ret;
1591
1592 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1593 return 0;
1594
1595 if (!css->ss) {
1596 if (cgroup_on_dfl(cgrp))
1597 cfts = cgroup_base_files;
1598 else
1599 cfts = cgroup1_base_files;
1600
1601 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1602 }
1603
1604 list_for_each_entry(cfts, &css->ss->cfts, node) {
1605 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1606 if (ret < 0) {
1607 failed_cfts = cfts;
1608 goto err;
1609 }
1610 }
1611
1612 css->flags |= CSS_VISIBLE;
1613
1614 return 0;
1615err:
1616 list_for_each_entry(cfts, &css->ss->cfts, node) {
1617 if (cfts == failed_cfts)
1618 break;
1619 cgroup_addrm_files(css, cgrp, cfts, false);
1620 }
1621 return ret;
1622}
1623
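/*
 * rebind_subsystems - move the controllers in @ss_mask to @dst_root
 *
 * Each controller must either be on the default hierarchy or moving to it
 * and, unless implicit, must have no non-root csses.  On success the css
 * pointers, subsys masks and on-dfl static branches are updated and the
 * destination kernfs tree is activated.
 */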
1624int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1625{
1626 struct cgroup *dcgrp = &dst_root->cgrp;
1627 struct cgroup_subsys *ss;
1628 int ssid, i, ret;
1629
1630 lockdep_assert_held(&cgroup_mutex);
1631
1632 do_each_subsys_mask(ss, ssid, ss_mask) {
1633
1634
1635
1636
1637
1638 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
1639 !ss->implicit_on_dfl)
1640 return -EBUSY;
1641
1642
1643 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1644 return -EBUSY;
1645 } while_each_subsys_mask();
1646
1647 do_each_subsys_mask(ss, ssid, ss_mask) {
1648 struct cgroup_root *src_root = ss->root;
1649 struct cgroup *scgrp = &src_root->cgrp;
1650 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1651 struct css_set *cset;
1652
1653 WARN_ON(!css || cgroup_css(dcgrp, ss));
1654
1655
1656 src_root->subsys_mask &= ~(1 << ssid);
1657 WARN_ON(cgroup_apply_control(scgrp));
1658 cgroup_finalize_control(scgrp, 0);
1659
1660
1661 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1662 rcu_assign_pointer(dcgrp->subsys[ssid], css);
1663 ss->root = dst_root;
1664 css->cgroup = dcgrp;
1665
1666 spin_lock_irq(&css_set_lock);
1667 hash_for_each(css_set_table, i, cset, hlist)
1668 list_move_tail(&cset->e_cset_node[ss->id],
1669 &dcgrp->e_csets[ss->id]);
1670 spin_unlock_irq(&css_set_lock);
1671
1672
1673 dst_root->subsys_mask |= 1 << ssid;
1674 if (dst_root == &cgrp_dfl_root) {
1675 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1676 } else {
1677 dcgrp->subtree_control |= 1 << ssid;
1678 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1679 }
1680
1681 ret = cgroup_apply_control(dcgrp);
1682 if (ret)
1683 pr_warn("partial failure to rebind %s controller (err=%d)\n",
1684 ss->name, ret);
1685
1686 if (ss->bind)
1687 ss->bind(css);
1688 } while_each_subsys_mask();
1689
1690 kernfs_activate(dcgrp->kn);
1691 return 0;
1692}
1693
1694int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1695 struct kernfs_root *kf_root)
1696{
1697 int len = 0;
1698 char *buf = NULL;
1699 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1700 struct cgroup *ns_cgroup;
1701
1702 buf = kmalloc(PATH_MAX, GFP_KERNEL);
1703 if (!buf)
1704 return -ENOMEM;
1705
1706 spin_lock_irq(&css_set_lock);
1707 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1708 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1709 spin_unlock_irq(&css_set_lock);
1710
1711 if (len >= PATH_MAX)
1712 len = -ERANGE;
1713 else if (len > 0) {
1714 seq_escape(sf, buf, " \t\n\\");
1715 len = 0;
1716 }
1717 kfree(buf);
1718 return len;
1719}
1720
1721static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1722{
1723 char *token;
1724
1725 *root_flags = 0;
1726
1727 if (!data)
1728 return 0;
1729
1730 while ((token = strsep(&data, ",")) != NULL) {
1731 if (!strcmp(token, "nsdelegate")) {
1732 *root_flags |= CGRP_ROOT_NS_DELEGATE;
1733 continue;
1734 }
1735
1736 pr_err("cgroup2: unknown option \"%s\"\n", token);
1737 return -EINVAL;
1738 }
1739
1740 return 0;
1741}
1742
1743static void apply_cgroup_root_flags(unsigned int root_flags)
1744{
1745 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1746 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1747 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1748 else
1749 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1750 }
1751}
1752
1753static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1754{
1755 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1756 seq_puts(seq, ",nsdelegate");
1757 return 0;
1758}
1759
1760static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1761{
1762 unsigned int root_flags;
1763 int ret;
1764
1765 ret = parse_cgroup_root_flags(data, &root_flags);
1766 if (ret)
1767 return ret;
1768
1769 apply_cgroup_root_flags(root_flags);
1770 return 0;
1771}
1772
/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 */
1779static bool use_task_css_set_links __read_mostly;
1780
1781static void cgroup_enable_task_cg_lists(void)
1782{
1783 struct task_struct *p, *g;
1784
1785 spin_lock_irq(&css_set_lock);
1786
1787 if (use_task_css_set_links)
1788 goto out_unlock;
1789
1790 use_task_css_set_links = true;
1791
1792
1793
1794
1795
1796
1797
1798
1799 read_lock(&tasklist_lock);
1800 do_each_thread(g, p) {
1801 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1802 task_css_set(p) != &init_css_set);
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815 spin_lock(&p->sighand->siglock);
1816 if (!(p->flags & PF_EXITING)) {
1817 struct css_set *cset = task_css_set(p);
1818
1819 if (!css_set_populated(cset))
1820 css_set_update_populated(cset, true);
1821 list_add_tail(&p->cg_list, &cset->tasks);
1822 get_css_set(cset);
1823 cset->nr_tasks++;
1824 }
1825 spin_unlock(&p->sighand->siglock);
1826 } while_each_thread(g, p);
1827 read_unlock(&tasklist_lock);
1828out_unlock:
1829 spin_unlock_irq(&css_set_lock);
1830}
1831
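/* initialize the embedded self css and bookkeeping fields of a new cgroup */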
1832static void init_cgroup_housekeeping(struct cgroup *cgrp)
1833{
1834 struct cgroup_subsys *ss;
1835 int ssid;
1836
1837 INIT_LIST_HEAD(&cgrp->self.sibling);
1838 INIT_LIST_HEAD(&cgrp->self.children);
1839 INIT_LIST_HEAD(&cgrp->cset_links);
1840 INIT_LIST_HEAD(&cgrp->pidlists);
1841 mutex_init(&cgrp->pidlist_mutex);
1842 cgrp->self.cgroup = cgrp;
1843 cgrp->self.flags |= CSS_ONLINE;
1844 cgrp->dom_cgrp = cgrp;
1845 cgrp->max_descendants = INT_MAX;
1846 cgrp->max_depth = INT_MAX;
1847
1848 for_each_subsys(ss, ssid)
1849 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1850
1851 init_waitqueue_head(&cgrp->offline_waitq);
1852 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1853}
1854
1855void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1856{
1857 struct cgroup *cgrp = &root->cgrp;
1858
1859 INIT_LIST_HEAD(&root->root_list);
1860 atomic_set(&root->nr_cgrps, 1);
1861 cgrp->root = root;
1862 init_cgroup_housekeeping(cgrp);
1863 idr_init(&root->cgroup_idr);
1864
1865 root->flags = opts->flags;
1866 if (opts->release_agent)
1867 strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1868 if (opts->name)
1869 strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1870 if (opts->cpuset_clone_children)
1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1872}
1873
1874int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
1875{
1876 LIST_HEAD(tmp_links);
1877 struct cgroup *root_cgrp = &root->cgrp;
1878 struct kernfs_syscall_ops *kf_sops;
1879 struct css_set *cset;
1880 int i, ret;
1881
1882 lockdep_assert_held(&cgroup_mutex);
1883
1884 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1885 if (ret < 0)
1886 goto out;
1887 root_cgrp->id = ret;
1888 root_cgrp->ancestor_ids[0] = ret;
1889
1890 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1891 ref_flags, GFP_KERNEL);
1892 if (ret)
1893 goto out;
1894
1895
1896
1897
1898
1899
1900
1901
1902 ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
1903 if (ret)
1904 goto cancel_ref;
1905
1906 ret = cgroup_init_root_id(root);
1907 if (ret)
1908 goto cancel_ref;
1909
1910 kf_sops = root == &cgrp_dfl_root ?
1911 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
1912
1913 root->kf_root = kernfs_create_root(kf_sops,
1914 KERNFS_ROOT_CREATE_DEACTIVATED |
1915 KERNFS_ROOT_SUPPORT_EXPORTOP,
1916 root_cgrp);
1917 if (IS_ERR(root->kf_root)) {
1918 ret = PTR_ERR(root->kf_root);
1919 goto exit_root_id;
1920 }
1921 root_cgrp->kn = root->kf_root->kn;
1922
1923 ret = css_populate_dir(&root_cgrp->self);
1924 if (ret)
1925 goto destroy_root;
1926
1927 ret = rebind_subsystems(root, ss_mask);
1928 if (ret)
1929 goto destroy_root;
1930
1931 ret = cgroup_bpf_inherit(root_cgrp);
1932 WARN_ON_ONCE(ret);
1933
1934 trace_cgroup_setup_root(root);
1935
1936
1937
1938
1939
1940
1941 list_add(&root->root_list, &cgroup_roots);
1942 cgroup_root_count++;
1943
1944
1945
1946
1947
1948 spin_lock_irq(&css_set_lock);
1949 hash_for_each(css_set_table, i, cset, hlist) {
1950 link_css_set(&tmp_links, cset, root_cgrp);
1951 if (css_set_populated(cset))
1952 cgroup_update_populated(root_cgrp, true);
1953 }
1954 spin_unlock_irq(&css_set_lock);
1955
1956 BUG_ON(!list_empty(&root_cgrp->self.children));
1957 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1958
1959 kernfs_activate(root_cgrp->kn);
1960 ret = 0;
1961 goto out;
1962
1963destroy_root:
1964 kernfs_destroy_root(root->kf_root);
1965 root->kf_root = NULL;
1966exit_root_id:
1967 cgroup_exit_root_id(root);
1968cancel_ref:
1969 percpu_ref_exit(&root_cgrp->self.refcnt);
1970out:
1971 free_cgrp_cset_links(&tmp_links);
1972 return ret;
1973}
1974
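/*
 * Mount helper shared by cgroup v1 and v2: mounts @root's kernfs tree and,
 * when called from a non-init cgroup namespace, pivots the returned dentry
 * to the directory of the namespace's root cgroup on @root.
 */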
1975struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
1976 struct cgroup_root *root, unsigned long magic,
1977 struct cgroup_namespace *ns)
1978{
1979 struct dentry *dentry;
1980 bool new_sb;
1981
1982 dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
1983
1984
1985
1986
1987
1988 if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
1989 struct dentry *nsdentry;
1990 struct cgroup *cgrp;
1991
1992 mutex_lock(&cgroup_mutex);
1993 spin_lock_irq(&css_set_lock);
1994
1995 cgrp = cset_cgroup_from_root(ns->root_cset, root);
1996
1997 spin_unlock_irq(&css_set_lock);
1998 mutex_unlock(&cgroup_mutex);
1999
2000 nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
2001 dput(dentry);
2002 dentry = nsdentry;
2003 }
2004
2005 if (IS_ERR(dentry) || !new_sb)
2006 cgroup_put(&root->cgrp);
2007
2008 return dentry;
2009}
2010
2011static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2012 int flags, const char *unused_dev_name,
2013 void *data)
2014{
2015 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2016 struct dentry *dentry;
2017 int ret;
2018
2019 get_cgroup_ns(ns);
2020
2021
2022 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2023 put_cgroup_ns(ns);
2024 return ERR_PTR(-EPERM);
2025 }
2026
2027
2028
2029
2030
2031 if (!use_task_css_set_links)
2032 cgroup_enable_task_cg_lists();
2033
2034 if (fs_type == &cgroup2_fs_type) {
2035 unsigned int root_flags;
2036
2037 ret = parse_cgroup_root_flags(data, &root_flags);
2038 if (ret) {
2039 put_cgroup_ns(ns);
2040 return ERR_PTR(ret);
2041 }
2042
2043 cgrp_dfl_visible = true;
2044 cgroup_get_live(&cgrp_dfl_root.cgrp);
2045
2046 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
2047 CGROUP2_SUPER_MAGIC, ns);
2048 if (!IS_ERR(dentry))
2049 apply_cgroup_root_flags(root_flags);
2050 } else {
2051 dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
2052 CGROUP_SUPER_MAGIC, ns);
2053 }
2054
2055 put_cgroup_ns(ns);
2056 return dentry;
2057}
2058
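/*
 * If the hierarchy has no children, killing the root cgroup's percpu ref
 * here starts its destruction and prevents new mounts; otherwise (and for
 * the default hierarchy, which is never destroyed) only the reference
 * taken at mount time is dropped.
 */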
2059static void cgroup_kill_sb(struct super_block *sb)
2060{
2061 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2062 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2063
2064
2065
2066
2067
2068
2069
2070
2071 if (!list_empty(&root->cgrp.self.children) ||
2072 root == &cgrp_dfl_root)
2073 cgroup_put(&root->cgrp);
2074 else
2075 percpu_ref_kill(&root->cgrp.self.refcnt);
2076
2077 kernfs_kill_sb(sb);
2078}
2079
2080struct file_system_type cgroup_fs_type = {
2081 .name = "cgroup",
2082 .mount = cgroup_mount,
2083 .kill_sb = cgroup_kill_sb,
2084 .fs_flags = FS_USERNS_MOUNT,
2085};
2086
2087static struct file_system_type cgroup2_fs_type = {
2088 .name = "cgroup2",
2089 .mount = cgroup_mount,
2090 .kill_sb = cgroup_kill_sb,
2091 .fs_flags = FS_USERNS_MOUNT,
2092};
2093
2094int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2095 struct cgroup_namespace *ns)
2096{
2097 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2098
2099 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2100}
2101
2102int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2103 struct cgroup_namespace *ns)
2104{
2105 int ret;
2106
2107 mutex_lock(&cgroup_mutex);
2108 spin_lock_irq(&css_set_lock);
2109
2110 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2111
2112 spin_unlock_irq(&css_set_lock);
2113 mutex_unlock(&cgroup_mutex);
2114
2115 return ret;
2116}
2117EXPORT_SYMBOL_GPL(cgroup_path_ns);
2118
/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy ID) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */
2132int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2133{
2134 struct cgroup_root *root;
2135 struct cgroup *cgrp;
2136 int hierarchy_id = 1;
2137 int ret;
2138
2139 mutex_lock(&cgroup_mutex);
2140 spin_lock_irq(&css_set_lock);
2141
2142 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2143
2144 if (root) {
2145 cgrp = task_cgroup_from_root(task, root);
2146 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2147 } else {
2148
2149 ret = strlcpy(buf, "/", buflen);
2150 }
2151
2152 spin_unlock_irq(&css_set_lock);
2153 mutex_unlock(&cgroup_mutex);
2154 return ret;
2155}
2156EXPORT_SYMBOL_GPL(task_cgroup_path);
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168static void cgroup_migrate_add_task(struct task_struct *task,
2169 struct cgroup_mgctx *mgctx)
2170{
2171 struct css_set *cset;
2172
2173 lockdep_assert_held(&css_set_lock);
2174
2175
2176 if (task->flags & PF_EXITING)
2177 return;
2178
2179
2180 if (list_empty(&task->cg_list))
2181 return;
2182
2183 cset = task_css_set(task);
2184 if (!cset->mg_src_cgrp)
2185 return;
2186
2187 mgctx->tset.nr_tasks++;
2188
2189 list_move_tail(&task->cg_list, &cset->mg_tasks);
2190 if (list_empty(&cset->mg_node))
2191 list_add_tail(&cset->mg_node,
2192 &mgctx->tset.src_csets);
2193 if (list_empty(&cset->mg_dst_cset->mg_node))
2194 list_add_tail(&cset->mg_dst_cset->mg_node,
2195 &mgctx->tset.dst_csets);
2196}
2197
2198
2199
2200
2201
2202
2203
2204
2205struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2206 struct cgroup_subsys_state **dst_cssp)
2207{
2208 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2209 tset->cur_task = NULL;
2210
2211 return cgroup_taskset_next(tset, dst_cssp);
2212}
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2223 struct cgroup_subsys_state **dst_cssp)
2224{
2225 struct css_set *cset = tset->cur_cset;
2226 struct task_struct *task = tset->cur_task;
2227
2228 while (&cset->mg_node != tset->csets) {
2229 if (!task)
2230 task = list_first_entry(&cset->mg_tasks,
2231 struct task_struct, cg_list);
2232 else
2233 task = list_next_entry(task, cg_list);
2234
2235 if (&task->cg_list != &cset->mg_tasks) {
2236 tset->cur_cset = cset;
2237 tset->cur_task = task;
2238
2239
2240
2241
2242
2243
2244
2245 if (cset->mg_dst_cset)
2246 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2247 else
2248 *dst_cssp = cset->subsys[tset->ssid];
2249
2250 return task;
2251 }
2252
2253 cset = list_next_entry(cset, mg_node);
2254 task = NULL;
2255 }
2256
2257 return NULL;
2258}
2259
/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 */
2269static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2270{
2271 struct cgroup_taskset *tset = &mgctx->tset;
2272 struct cgroup_subsys *ss;
2273 struct task_struct *task, *tmp_task;
2274 struct css_set *cset, *tmp_cset;
2275 int ssid, failed_ssid, ret;
2276
2277
2278 if (tset->nr_tasks) {
2279 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2280 if (ss->can_attach) {
2281 tset->ssid = ssid;
2282 ret = ss->can_attach(tset);
2283 if (ret) {
2284 failed_ssid = ssid;
2285 goto out_cancel_attach;
2286 }
2287 }
2288 } while_each_subsys_mask();
2289 }
2290
2291
2292
2293
2294
2295
2296 spin_lock_irq(&css_set_lock);
2297 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2298 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2299 struct css_set *from_cset = task_css_set(task);
2300 struct css_set *to_cset = cset->mg_dst_cset;
2301
2302 get_css_set(to_cset);
2303 to_cset->nr_tasks++;
2304 css_set_move_task(task, from_cset, to_cset, true);
2305 put_css_set_locked(from_cset);
2306 from_cset->nr_tasks--;
2307 }
2308 }
2309 spin_unlock_irq(&css_set_lock);
2310
2311
2312
2313
2314
2315
2316 tset->csets = &tset->dst_csets;
2317
2318 if (tset->nr_tasks) {
2319 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2320 if (ss->attach) {
2321 tset->ssid = ssid;
2322 ss->attach(tset);
2323 }
2324 } while_each_subsys_mask();
2325 }
2326
2327 ret = 0;
2328 goto out_release_tset;
2329
2330out_cancel_attach:
2331 if (tset->nr_tasks) {
2332 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2333 if (ssid == failed_ssid)
2334 break;
2335 if (ss->cancel_attach) {
2336 tset->ssid = ssid;
2337 ss->cancel_attach(tset);
2338 }
2339 } while_each_subsys_mask();
2340 }
2341out_release_tset:
2342 spin_lock_irq(&css_set_lock);
2343 list_splice_init(&tset->dst_csets, &tset->src_csets);
2344 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2345 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2346 list_del_init(&cset->mg_node);
2347 }
2348 spin_unlock_irq(&css_set_lock);
2349
2350
2351
2352
2353
2354
2355 tset->nr_tasks = 0;
2356 tset->csets = &tset->src_csets;
2357 return ret;
2358}
2359
/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 */
2369int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2370{
2371
2372 if (!cgroup_on_dfl(dst_cgrp))
2373 return 0;
2374
2375
2376 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2377 return -EOPNOTSUPP;
2378
2379
2380 if (cgroup_is_mixable(dst_cgrp))
2381 return 0;
2382
2383
2384
2385
2386
2387 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2388 return 0;
2389
2390
2391 if (dst_cgrp->subtree_control)
2392 return -EBUSY;
2393
2394 return 0;
2395}
2396
2397
2398
2399
2400
2401
2402
2403
2404void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2405{
2406 LIST_HEAD(preloaded);
2407 struct css_set *cset, *tmp_cset;
2408
2409 lockdep_assert_held(&cgroup_mutex);
2410
2411 spin_lock_irq(&css_set_lock);
2412
2413 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2414 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2415
2416 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2417 cset->mg_src_cgrp = NULL;
2418 cset->mg_dst_cgrp = NULL;
2419 cset->mg_dst_cset = NULL;
2420 list_del_init(&cset->mg_preload_node);
2421 put_css_set_locked(cset);
2422 }
2423
2424 spin_unlock_irq(&css_set_lock);
2425}
2426
/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->preloaded_src_csets, which should later
 * be cleaned up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
2443void cgroup_migrate_add_src(struct css_set *src_cset,
2444 struct cgroup *dst_cgrp,
2445 struct cgroup_mgctx *mgctx)
2446{
2447 struct cgroup *src_cgrp;
2448
2449 lockdep_assert_held(&cgroup_mutex);
2450 lockdep_assert_held(&css_set_lock);
2451
2452
2453
2454
2455
2456
2457 if (src_cset->dead)
2458 return;
2459
2460 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2461
2462 if (!list_empty(&src_cset->mg_preload_node))
2463 return;
2464
2465 WARN_ON(src_cset->mg_src_cgrp);
2466 WARN_ON(src_cset->mg_dst_cgrp);
2467 WARN_ON(!list_empty(&src_cset->mg_tasks));
2468 WARN_ON(!list_empty(&src_cset->mg_node));
2469
2470 src_cset->mg_src_cgrp = src_cgrp;
2471 src_cset->mg_dst_cgrp = dst_cgrp;
2472 get_css_set(src_cset);
2473 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2474}
2475
/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source and appends them
 * to @mgctx->preloaded_dst_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 */
2490int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2491{
2492 struct css_set *src_cset, *tmp_cset;
2493
2494 lockdep_assert_held(&cgroup_mutex);
2495
2496
2497 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2498 mg_preload_node) {
2499 struct css_set *dst_cset;
2500 struct cgroup_subsys *ss;
2501 int ssid;
2502
2503 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2504 if (!dst_cset)
2505 goto err;
2506
2507 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2508
2509
2510
2511
2512
2513
2514 if (src_cset == dst_cset) {
2515 src_cset->mg_src_cgrp = NULL;
2516 src_cset->mg_dst_cgrp = NULL;
2517 list_del_init(&src_cset->mg_preload_node);
2518 put_css_set(src_cset);
2519 put_css_set(dst_cset);
2520 continue;
2521 }
2522
2523 src_cset->mg_dst_cset = dst_cset;
2524
2525 if (list_empty(&dst_cset->mg_preload_node))
2526 list_add_tail(&dst_cset->mg_preload_node,
2527 &mgctx->preloaded_dst_csets);
2528 else
2529 put_css_set(dst_cset);
2530
2531 for_each_subsys(ss, ssid)
2532 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2533 mgctx->ss_mask |= 1 << ssid;
2534 }
2535
2536 return 0;
2537err:
2538 cgroup_migrate_finish(mgctx);
2539 return -ENOMEM;
2540}
2541
/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 */
2560int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2561 struct cgroup_mgctx *mgctx)
2562{
2563 struct task_struct *task;
2564
2565
2566
2567
2568
2569
2570 spin_lock_irq(&css_set_lock);
2571 rcu_read_lock();
2572 task = leader;
2573 do {
2574 cgroup_migrate_add_task(task, mgctx);
2575 if (!threadgroup)
2576 break;
2577 } while_each_thread(leader, task);
2578 rcu_read_unlock();
2579 spin_unlock_irq(&css_set_lock);
2580
2581 return cgroup_migrate_execute(mgctx);
2582}
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2593 bool threadgroup)
2594{
2595 DEFINE_CGROUP_MGCTX(mgctx);
2596 struct task_struct *task;
2597 int ret;
2598
2599 ret = cgroup_migrate_vet_dst(dst_cgrp);
2600 if (ret)
2601 return ret;
2602
2603
2604 spin_lock_irq(&css_set_lock);
2605 rcu_read_lock();
2606 task = leader;
2607 do {
2608 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2609 if (!threadgroup)
2610 break;
2611 } while_each_thread(leader, task);
2612 rcu_read_unlock();
2613 spin_unlock_irq(&css_set_lock);
2614
2615
2616 ret = cgroup_migrate_prepare_dst(&mgctx);
2617 if (!ret)
2618 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2619
2620 cgroup_migrate_finish(&mgctx);
2621
2622 if (!ret)
2623 trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
2624
2625 return ret;
2626}
2627
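/*
 * Parse a PID written to a "cgroup.procs"-style interface file, grab
 * cgroup_threadgroup_rwsem for writing and return the referenced target
 * task (the thread-group leader when @threadgroup is set).  Must be paired
 * with cgroup_procs_write_finish().
 */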
2628struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
2629 __acquires(&cgroup_threadgroup_rwsem)
2630{
2631 struct task_struct *tsk;
2632 pid_t pid;
2633
2634 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2635 return ERR_PTR(-EINVAL);
2636
2637 percpu_down_write(&cgroup_threadgroup_rwsem);
2638
2639 rcu_read_lock();
2640 if (pid) {
2641 tsk = find_task_by_vpid(pid);
2642 if (!tsk) {
2643 tsk = ERR_PTR(-ESRCH);
2644 goto out_unlock_threadgroup;
2645 }
2646 } else {
2647 tsk = current;
2648 }
2649
2650 if (threadgroup)
2651 tsk = tsk->group_leader;
2652
2653
2654
2655
2656
2657
2658
2659 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2660 tsk = ERR_PTR(-EINVAL);
2661 goto out_unlock_threadgroup;
2662 }
2663
2664 get_task_struct(tsk);
2665 goto out_unlock_rcu;
2666
2667out_unlock_threadgroup:
2668 percpu_up_write(&cgroup_threadgroup_rwsem);
2669out_unlock_rcu:
2670 rcu_read_unlock();
2671 return tsk;
2672}
2673
2674void cgroup_procs_write_finish(struct task_struct *task)
2675 __releases(&cgroup_threadgroup_rwsem)
2676{
2677 struct cgroup_subsys *ss;
2678 int ssid;
2679
2680
2681 put_task_struct(task);
2682
2683 percpu_up_write(&cgroup_threadgroup_rwsem);
2684 for_each_subsys(ss, ssid)
2685 if (ss->post_attach)
2686 ss->post_attach();
2687}
2688
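/* print space-separated names of the controllers set in @ss_mask */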
2689static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2690{
2691 struct cgroup_subsys *ss;
2692 bool printed = false;
2693 int ssid;
2694
2695 do_each_subsys_mask(ss, ssid, ss_mask) {
2696 if (printed)
2697 seq_putc(seq, ' ');
2698 seq_printf(seq, "%s", ss->name);
2699 printed = true;
2700 } while_each_subsys_mask();
2701 if (printed)
2702 seq_putc(seq, '\n');
2703}
2704
2705
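/* show controllers which are enabled from the parent */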
2706static int cgroup_controllers_show(struct seq_file *seq, void *v)
2707{
2708 struct cgroup *cgrp = seq_css(seq)->cgroup;
2709
2710 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2711 return 0;
2712}
2713
2714
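/* show controllers which are enabled for a given cgroup's children */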
2715static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2716{
2717 struct cgroup *cgrp = seq_css(seq)->cgroup;
2718
2719 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2720 return 0;
2721}
2722
/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */
2732static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2733{
2734 DEFINE_CGROUP_MGCTX(mgctx);
2735 struct cgroup_subsys_state *d_css;
2736 struct cgroup *dsct;
2737 struct css_set *src_cset;
2738 int ret;
2739
2740 lockdep_assert_held(&cgroup_mutex);
2741
2742 percpu_down_write(&cgroup_threadgroup_rwsem);
2743
2744
2745 spin_lock_irq(&css_set_lock);
2746 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2747 struct cgrp_cset_link *link;
2748
2749 list_for_each_entry(link, &dsct->cset_links, cset_link)
2750 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
2751 }
2752 spin_unlock_irq(&css_set_lock);
2753
2754
2755 ret = cgroup_migrate_prepare_dst(&mgctx);
2756 if (ret)
2757 goto out_finish;
2758
2759 spin_lock_irq(&css_set_lock);
2760 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2761 struct task_struct *task, *ntask;
2762
2763
2764 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2765 cgroup_migrate_add_task(task, &mgctx);
2766 }
2767 spin_unlock_irq(&css_set_lock);
2768
2769 ret = cgroup_migrate_execute(&mgctx);
2770out_finish:
2771 cgroup_migrate_finish(&mgctx);
2772 percpu_up_write(&cgroup_threadgroup_rwsem);
2773 return ret;
2774}
2775
/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */
2784void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
2785 __acquires(&cgroup_mutex)
2786{
2787 struct cgroup *dsct;
2788 struct cgroup_subsys_state *d_css;
2789 struct cgroup_subsys *ss;
2790 int ssid;
2791
2792restart:
2793 mutex_lock(&cgroup_mutex);
2794
2795 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2796 for_each_subsys(ss, ssid) {
2797 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2798 DEFINE_WAIT(wait);
2799
2800 if (!css || !percpu_ref_is_dying(&css->refcnt))
2801 continue;
2802
2803 cgroup_get_live(dsct);
2804 prepare_to_wait(&dsct->offline_waitq, &wait,
2805 TASK_UNINTERRUPTIBLE);
2806
2807 mutex_unlock(&cgroup_mutex);
2808 schedule();
2809 finish_wait(&dsct->offline_waitq, &wait);
2810
2811 cgroup_put(dsct);
2812 goto restart;
2813 }
2814 }
2815}
2816
2817
2818
2819
2820
2821
2822
2823
2824static void cgroup_save_control(struct cgroup *cgrp)
2825{
2826 struct cgroup *dsct;
2827 struct cgroup_subsys_state *d_css;
2828
2829 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2830 dsct->old_subtree_control = dsct->subtree_control;
2831 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
2832 }
2833}
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843static void cgroup_propagate_control(struct cgroup *cgrp)
2844{
2845 struct cgroup *dsct;
2846 struct cgroup_subsys_state *d_css;
2847
2848 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2849 dsct->subtree_control &= cgroup_control(dsct);
2850 dsct->subtree_ss_mask =
2851 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
2852 cgroup_ss_mask(dsct));
2853 }
2854}
2855
2856
2857
2858
2859
2860
2861
2862
2863static void cgroup_restore_control(struct cgroup *cgrp)
2864{
2865 struct cgroup *dsct;
2866 struct cgroup_subsys_state *d_css;
2867
2868 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2869 dsct->subtree_control = dsct->old_subtree_control;
2870 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
2871 }
2872}
2873
2874static bool css_visible(struct cgroup_subsys_state *css)
2875{
2876 struct cgroup_subsys *ss = css->ss;
2877 struct cgroup *cgrp = css->cgroup;
2878
2879 if (cgroup_control(cgrp) & (1 << ss->id))
2880 return true;
2881 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
2882 return false;
2883 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
2884}
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
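/**
 * cgroup_apply_control_enable - create or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create the csses which are now enabled but
 * missing, and populate the interface files of csses which are visible
 * according to css_visible().
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * already been processed are left as-is; the caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */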
2899static int cgroup_apply_control_enable(struct cgroup *cgrp)
2900{
2901 struct cgroup *dsct;
2902 struct cgroup_subsys_state *d_css;
2903 struct cgroup_subsys *ss;
2904 int ssid, ret;
2905
2906 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2907 for_each_subsys(ss, ssid) {
2908 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2909
2910 WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
2911
2912 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
2913 continue;
2914
2915 if (!css) {
2916 css = css_create(dsct, ss);
2917 if (IS_ERR(css))
2918 return PTR_ERR(css);
2919 }
2920
2921 if (css_visible(css)) {
2922 ret = css_populate_dir(css);
2923 if (ret)
2924 return ret;
2925 }
2926 }
2927 }
2928
2929 return 0;
2930}
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
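/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill csses which are no longer enabled through
 * the subsystem masks, and clear the interface files of csses which should
 * no longer be visible, invoking ->css_reset() on the latter when the
 * subsystem provides it.
 */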
2945static void cgroup_apply_control_disable(struct cgroup *cgrp)
2946{
2947 struct cgroup *dsct;
2948 struct cgroup_subsys_state *d_css;
2949 struct cgroup_subsys *ss;
2950 int ssid;
2951
2952 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2953 for_each_subsys(ss, ssid) {
2954 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2955
2956 WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
2957
2958 if (!css)
2959 continue;
2960
2961 if (css->parent &&
2962 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
2963 kill_css(css);
2964 } else if (!css_visible(css)) {
2965 css_clear_dir(css);
2966 if (ss->css_reset)
2967 ss->css_reset(css);
2968 }
2969 }
2970 }
2971}
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
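/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Controllers are enabled and disabled in a subtree with the sequence
 * cgroup_save_control(), update of the ->subtree_control masks,
 * cgroup_apply_control(), any related follow-up work, and finally
 * cgroup_finalize_control(), which restores the saved state on failure.
 * This function propagates the new masks, creates or shows the required
 * csses and then migrates the tasks in the subtree onto the updated
 * css_sets.
 */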
2990static int cgroup_apply_control(struct cgroup *cgrp)
2991{
2992 int ret;
2993
2994 cgroup_propagate_control(cgrp);
2995
2996 ret = cgroup_apply_control_enable(cgrp);
2997 if (ret)
2998 return ret;
2999
3000
3001
3002
3003
3004
3005 ret = cgroup_update_dfl_csses(cgrp);
3006 if (ret)
3007 return ret;
3008
3009 return 0;
3010}
3011
3012
3013
3014
3015
3016
3017
3018
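/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize the control mask update started with cgroup_save_control() and
 * cgroup_apply_control().  On failure, the saved masks are restored and
 * re-propagated before any superfluous csses are killed or hidden.
 */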
3019static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
3020{
3021 if (ret) {
3022 cgroup_restore_control(cgrp);
3023 cgroup_propagate_control(cgrp);
3024 }
3025
3026 cgroup_apply_control_disable(cgrp);
3027}
3028
3029static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3030{
3031 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3032
3033
3034 if (!enable)
3035 return 0;
3036
3037
3038 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3039 return -EOPNOTSUPP;
3040
3041
3042 if (cgroup_is_mixable(cgrp))
3043 return 0;
3044
3045 if (domain_enable) {
3046
3047 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3048 return -EOPNOTSUPP;
3049 } else {
3050
3051
3052
3053
3054
3055 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3056 return 0;
3057 }
3058
3059
3060
3061
3062
3063 if (cgroup_has_tasks(cgrp))
3064 return -EBUSY;
3065
3066 return 0;
3067}
3068
3069
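/*
 * Handler for writes to "cgroup.subtree_control".  The buffer is a
 * whitespace-separated list of "+<controller>" and "-<controller>" tokens;
 * '+' enables and '-' disables a controller in this cgroup's children,
 * subject to the checks in cgroup_vet_subtree_control_enable().  An
 * illustrative (not normative) invocation from userspace:
 *
 *	echo "+cpu -memory" > cgroup.subtree_control
 */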
3070static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3071 char *buf, size_t nbytes,
3072 loff_t off)
3073{
3074 u16 enable = 0, disable = 0;
3075 struct cgroup *cgrp, *child;
3076 struct cgroup_subsys *ss;
3077 char *tok;
3078 int ssid, ret;
3079
3080
3081
3082
3083
3084 buf = strstrip(buf);
3085 while ((tok = strsep(&buf, " "))) {
3086 if (tok[0] == '\0')
3087 continue;
3088 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3089 if (!cgroup_ssid_enabled(ssid) ||
3090 strcmp(tok + 1, ss->name))
3091 continue;
3092
3093 if (*tok == '+') {
3094 enable |= 1 << ssid;
3095 disable &= ~(1 << ssid);
3096 } else if (*tok == '-') {
3097 disable |= 1 << ssid;
3098 enable &= ~(1 << ssid);
3099 } else {
3100 return -EINVAL;
3101 }
3102 break;
3103 } while_each_subsys_mask();
3104 if (ssid == CGROUP_SUBSYS_COUNT)
3105 return -EINVAL;
3106 }
3107
3108 cgrp = cgroup_kn_lock_live(of->kn, true);
3109 if (!cgrp)
3110 return -ENODEV;
3111
3112 for_each_subsys(ss, ssid) {
3113 if (enable & (1 << ssid)) {
3114 if (cgrp->subtree_control & (1 << ssid)) {
3115 enable &= ~(1 << ssid);
3116 continue;
3117 }
3118
3119 if (!(cgroup_control(cgrp) & (1 << ssid))) {
3120 ret = -ENOENT;
3121 goto out_unlock;
3122 }
3123 } else if (disable & (1 << ssid)) {
3124 if (!(cgrp->subtree_control & (1 << ssid))) {
3125 disable &= ~(1 << ssid);
3126 continue;
3127 }
3128
3129
3130 cgroup_for_each_live_child(child, cgrp) {
3131 if (child->subtree_control & (1 << ssid)) {
3132 ret = -EBUSY;
3133 goto out_unlock;
3134 }
3135 }
3136 }
3137 }
3138
3139 if (!enable && !disable) {
3140 ret = 0;
3141 goto out_unlock;
3142 }
3143
3144 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3145 if (ret)
3146 goto out_unlock;
3147
3148
3149 cgroup_save_control(cgrp);
3150
3151 cgrp->subtree_control |= enable;
3152 cgrp->subtree_control &= ~disable;
3153
3154 ret = cgroup_apply_control(cgrp);
3155 cgroup_finalize_control(cgrp, ret);
3156 if (ret)
3157 goto out_unlock;
3158
3159 kernfs_activate(cgrp->kn);
3160out_unlock:
3161 cgroup_kn_unlock(of->kn);
3162 return ret ?: nbytes;
3163}
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
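/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to cgroup.type.  Makes @cgrp threaded
 * by pointing its dom_cgrp at the parent's resource domain and reapplying
 * the control masks.  Fails with -EOPNOTSUPP if @cgrp is already populated,
 * has non-threaded controllers enabled, or the would-be domain cannot
 * serve as a thread root.  cgroup.type does not exist on the root cgroup,
 * so this is never called on it.
 */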
3174static int cgroup_enable_threaded(struct cgroup *cgrp)
3175{
3176 struct cgroup *parent = cgroup_parent(cgrp);
3177 struct cgroup *dom_cgrp = parent->dom_cgrp;
3178 int ret;
3179
3180 lockdep_assert_held(&cgroup_mutex);
3181
3182
3183 if (cgroup_is_threaded(cgrp))
3184 return 0;
3185
3186
3187
3188
3189
3190
3191
3192 if (cgroup_is_populated(cgrp) ||
3193 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3194 return -EOPNOTSUPP;
3195
3196
3197 if (!cgroup_is_valid_domain(dom_cgrp) ||
3198 !cgroup_can_be_thread_root(dom_cgrp))
3199 return -EOPNOTSUPP;
3200
3201
3202
3203
3204
3205 cgroup_save_control(cgrp);
3206
3207 cgrp->dom_cgrp = dom_cgrp;
3208 ret = cgroup_apply_control(cgrp);
3209 if (!ret)
3210 parent->nr_threaded_children++;
3211 else
3212 cgrp->dom_cgrp = cgrp;
3213
3214 cgroup_finalize_control(cgrp, ret);
3215 return ret;
3216}
3217
3218static int cgroup_type_show(struct seq_file *seq, void *v)
3219{
3220 struct cgroup *cgrp = seq_css(seq)->cgroup;
3221
3222 if (cgroup_is_threaded(cgrp))
3223 seq_puts(seq, "threaded\n");
3224 else if (!cgroup_is_valid_domain(cgrp))
3225 seq_puts(seq, "domain invalid\n");
3226 else if (cgroup_is_thread_root(cgrp))
3227 seq_puts(seq, "domain threaded\n");
3228 else
3229 seq_puts(seq, "domain\n");
3230
3231 return 0;
3232}
3233
3234static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3235 size_t nbytes, loff_t off)
3236{
3237 struct cgroup *cgrp;
3238 int ret;
3239
3240
3241 if (strcmp(strstrip(buf), "threaded"))
3242 return -EINVAL;
3243
3244 cgrp = cgroup_kn_lock_live(of->kn, false);
3245 if (!cgrp)
3246 return -ENOENT;
3247
3248
3249 ret = cgroup_enable_threaded(cgrp);
3250
3251 cgroup_kn_unlock(of->kn);
3252 return ret ?: nbytes;
3253}
3254
3255static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3256{
3257 struct cgroup *cgrp = seq_css(seq)->cgroup;
3258 int descendants = READ_ONCE(cgrp->max_descendants);
3259
3260 if (descendants == INT_MAX)
3261 seq_puts(seq, "max\n");
3262 else
3263 seq_printf(seq, "%d\n", descendants);
3264
3265 return 0;
3266}
3267
3268static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3269 char *buf, size_t nbytes, loff_t off)
3270{
3271 struct cgroup *cgrp;
3272 int descendants;
3273 ssize_t ret;
3274
3275 buf = strstrip(buf);
3276 if (!strcmp(buf, "max")) {
3277 descendants = INT_MAX;
3278 } else {
3279 ret = kstrtoint(buf, 0, &descendants);
3280 if (ret)
3281 return ret;
3282 }
3283
3284 if (descendants < 0)
3285 return -ERANGE;
3286
3287 cgrp = cgroup_kn_lock_live(of->kn, false);
3288 if (!cgrp)
3289 return -ENOENT;
3290
3291 cgrp->max_descendants = descendants;
3292
3293 cgroup_kn_unlock(of->kn);
3294
3295 return nbytes;
3296}
3297
3298static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3299{
3300 struct cgroup *cgrp = seq_css(seq)->cgroup;
3301 int depth = READ_ONCE(cgrp->max_depth);
3302
3303 if (depth == INT_MAX)
3304 seq_puts(seq, "max\n");
3305 else
3306 seq_printf(seq, "%d\n", depth);
3307
3308 return 0;
3309}
3310
3311static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3312 char *buf, size_t nbytes, loff_t off)
3313{
3314 struct cgroup *cgrp;
3315 ssize_t ret;
3316 int depth;
3317
3318 buf = strstrip(buf);
3319 if (!strcmp(buf, "max")) {
3320 depth = INT_MAX;
3321 } else {
3322 ret = kstrtoint(buf, 0, &depth);
3323 if (ret)
3324 return ret;
3325 }
3326
3327 if (depth < 0)
3328 return -ERANGE;
3329
3330 cgrp = cgroup_kn_lock_live(of->kn, false);
3331 if (!cgrp)
3332 return -ENOENT;
3333
3334 cgrp->max_depth = depth;
3335
3336 cgroup_kn_unlock(of->kn);
3337
3338 return nbytes;
3339}
3340
3341static int cgroup_events_show(struct seq_file *seq, void *v)
3342{
3343 seq_printf(seq, "populated %d\n",
3344 cgroup_is_populated(seq_css(seq)->cgroup));
3345 return 0;
3346}
3347
3348static int cgroup_stat_show(struct seq_file *seq, void *v)
3349{
3350 struct cgroup *cgroup = seq_css(seq)->cgroup;
3351
3352 seq_printf(seq, "nr_descendants %d\n",
3353 cgroup->nr_descendants);
3354 seq_printf(seq, "nr_dying_descendants %d\n",
3355 cgroup->nr_dying_descendants);
3356
3357 return 0;
3358}
3359
3360static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3361 struct cgroup *cgrp, int ssid)
3362{
3363 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3364 struct cgroup_subsys_state *css;
3365 int ret;
3366
3367 if (!ss->css_extra_stat_show)
3368 return 0;
3369
3370 css = cgroup_tryget_css(cgrp, ss);
3371 if (!css)
3372 return 0;
3373
3374 ret = ss->css_extra_stat_show(seq, css);
3375 css_put(css);
3376 return ret;
3377}
3378
3379static int cpu_stat_show(struct seq_file *seq, void *v)
3380{
3381 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3382 int ret = 0;
3383
3384 cgroup_stat_show_cputime(seq);
3385#ifdef CONFIG_CGROUP_SCHED
3386 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3387#endif
3388 return ret;
3389}
3390
3391static int cgroup_file_open(struct kernfs_open_file *of)
3392{
3393 struct cftype *cft = of->kn->priv;
3394
3395 if (cft->open)
3396 return cft->open(of);
3397 return 0;
3398}
3399
3400static void cgroup_file_release(struct kernfs_open_file *of)
3401{
3402 struct cftype *cft = of->kn->priv;
3403
3404 if (cft->release)
3405 cft->release(of);
3406}
3407
3408static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3409 size_t nbytes, loff_t off)
3410{
3411 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3412 struct cgroup *cgrp = of->kn->parent->priv;
3413 struct cftype *cft = of->kn->priv;
3414 struct cgroup_subsys_state *css;
3415 int ret;
3416
3417
3418
3419
3420
3421
3422
3423 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3424 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3425 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3426 return -EPERM;
3427
3428 if (cft->write)
3429 return cft->write(of, buf, nbytes, off);
3430
3431
3432
3433
3434
3435
3436
3437 rcu_read_lock();
3438 css = cgroup_css(cgrp, cft->ss);
3439 rcu_read_unlock();
3440
3441 if (cft->write_u64) {
3442 unsigned long long v;
3443 ret = kstrtoull(buf, 0, &v);
3444 if (!ret)
3445 ret = cft->write_u64(css, cft, v);
3446 } else if (cft->write_s64) {
3447 long long v;
3448 ret = kstrtoll(buf, 0, &v);
3449 if (!ret)
3450 ret = cft->write_s64(css, cft, v);
3451 } else {
3452 ret = -EINVAL;
3453 }
3454
3455 return ret ?: nbytes;
3456}
3457
3458static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3459{
3460 return seq_cft(seq)->seq_start(seq, ppos);
3461}
3462
3463static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3464{
3465 return seq_cft(seq)->seq_next(seq, v, ppos);
3466}
3467
3468static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3469{
3470 if (seq_cft(seq)->seq_stop)
3471 seq_cft(seq)->seq_stop(seq, v);
3472}
3473
3474static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3475{
3476 struct cftype *cft = seq_cft(m);
3477 struct cgroup_subsys_state *css = seq_css(m);
3478
3479 if (cft->seq_show)
3480 return cft->seq_show(m, arg);
3481
3482 if (cft->read_u64)
3483 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3484 else if (cft->read_s64)
3485 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3486 else
3487 return -EINVAL;
3488 return 0;
3489}
3490
3491static struct kernfs_ops cgroup_kf_single_ops = {
3492 .atomic_write_len = PAGE_SIZE,
3493 .open = cgroup_file_open,
3494 .release = cgroup_file_release,
3495 .write = cgroup_file_write,
3496 .seq_show = cgroup_seqfile_show,
3497};
3498
3499static struct kernfs_ops cgroup_kf_ops = {
3500 .atomic_write_len = PAGE_SIZE,
3501 .open = cgroup_file_open,
3502 .release = cgroup_file_release,
3503 .write = cgroup_file_write,
3504 .seq_start = cgroup_seqfile_start,
3505 .seq_next = cgroup_seqfile_next,
3506 .seq_stop = cgroup_seqfile_stop,
3507 .seq_show = cgroup_seqfile_show,
3508};
3509
3510
3511static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3512{
3513 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3514 .ia_uid = current_fsuid(),
3515 .ia_gid = current_fsgid(), };
3516
3517 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3518 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3519 return 0;
3520
3521 return kernfs_setattr(kn, &iattr);
3522}
3523
3524static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3525 struct cftype *cft)
3526{
3527 char name[CGROUP_FILE_NAME_MAX];
3528 struct kernfs_node *kn;
3529 struct lock_class_key *key = NULL;
3530 int ret;
3531
3532#ifdef CONFIG_DEBUG_LOCK_ALLOC
3533 key = &cft->lockdep_key;
3534#endif
3535 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3536 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3537 NULL, key);
3538 if (IS_ERR(kn))
3539 return PTR_ERR(kn);
3540
3541 ret = cgroup_kn_set_ugid(kn);
3542 if (ret) {
3543 kernfs_remove(kn);
3544 return ret;
3545 }
3546
3547 if (cft->file_offset) {
3548 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3549
3550 spin_lock_irq(&cgroup_file_kn_lock);
3551 cfile->kn = kn;
3552 spin_unlock_irq(&cgroup_file_kn_lock);
3553 }
3554
3555 return 0;
3556}
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3569 struct cgroup *cgrp, struct cftype cfts[],
3570 bool is_add)
3571{
3572 struct cftype *cft, *cft_end = NULL;
3573 int ret = 0;
3574
3575 lockdep_assert_held(&cgroup_mutex);
3576
3577restart:
3578 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3579
3580 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3581 continue;
3582 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3583 continue;
3584 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3585 continue;
3586 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3587 continue;
3588
3589 if (is_add) {
3590 ret = cgroup_add_file(css, cgrp, cft);
3591 if (ret) {
3592 pr_warn("%s: failed to add %s, err=%d\n",
3593 __func__, cft->name, ret);
3594 cft_end = cft;
3595 is_add = false;
3596 goto restart;
3597 }
3598 } else {
3599 cgroup_rm_file(cgrp, cft);
3600 }
3601 }
3602 return ret;
3603}
3604
3605static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3606{
3607 struct cgroup_subsys *ss = cfts[0].ss;
3608 struct cgroup *root = &ss->root->cgrp;
3609 struct cgroup_subsys_state *css;
3610 int ret = 0;
3611
3612 lockdep_assert_held(&cgroup_mutex);
3613
3614
3615 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3616 struct cgroup *cgrp = css->cgroup;
3617
3618 if (!(css->flags & CSS_VISIBLE))
3619 continue;
3620
3621 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3622 if (ret)
3623 break;
3624 }
3625
3626 if (is_add && !ret)
3627 kernfs_activate(root->kn);
3628 return ret;
3629}
3630
3631static void cgroup_exit_cftypes(struct cftype *cfts)
3632{
3633 struct cftype *cft;
3634
3635 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3636
3637 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3638 kfree(cft->kf_ops);
3639 cft->kf_ops = NULL;
3640 cft->ss = NULL;
3641
3642
3643 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3644 }
3645}
3646
3647static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3648{
3649 struct cftype *cft;
3650
3651 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3652 struct kernfs_ops *kf_ops;
3653
3654 WARN_ON(cft->ss || cft->kf_ops);
3655
3656 if (cft->seq_start)
3657 kf_ops = &cgroup_kf_ops;
3658 else
3659 kf_ops = &cgroup_kf_single_ops;
3660
3661
3662
3663
3664
3665 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3666 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3667 if (!kf_ops) {
3668 cgroup_exit_cftypes(cfts);
3669 return -ENOMEM;
3670 }
3671 kf_ops->atomic_write_len = cft->max_write_len;
3672 }
3673
3674 cft->kf_ops = kf_ops;
3675 cft->ss = ss;
3676 }
3677
3678 return 0;
3679}
3680
3681static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3682{
3683 lockdep_assert_held(&cgroup_mutex);
3684
3685 if (!cfts || !cfts[0].ss)
3686 return -ENOENT;
3687
3688 list_del(&cfts->node);
3689 cgroup_apply_cftypes(cfts, false);
3690 cgroup_exit_cftypes(cfts);
3691 return 0;
3692}
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705int cgroup_rm_cftypes(struct cftype *cfts)
3706{
3707 int ret;
3708
3709 mutex_lock(&cgroup_mutex);
3710 ret = cgroup_rm_cftypes_locked(cfts);
3711 mutex_unlock(&cgroup_mutex);
3712 return ret;
3713}
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3730{
3731 int ret;
3732
3733 if (!cgroup_ssid_enabled(ss->id))
3734 return 0;
3735
3736 if (!cfts || cfts[0].name[0] == '\0')
3737 return 0;
3738
3739 ret = cgroup_init_cftypes(ss, cfts);
3740 if (ret)
3741 return ret;
3742
3743 mutex_lock(&cgroup_mutex);
3744
3745 list_add_tail(&cfts->node, &ss->cfts);
3746 ret = cgroup_apply_cftypes(cfts, true);
3747 if (ret)
3748 cgroup_rm_cftypes_locked(cfts);
3749
3750 mutex_unlock(&cgroup_mutex);
3751 return ret;
3752}
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3763{
3764 struct cftype *cft;
3765
3766 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3767 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3768 return cgroup_add_cftypes(ss, cfts);
3769}
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3780{
3781 struct cftype *cft;
3782
3783 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3784 cft->flags |= __CFTYPE_NOT_ON_DFL;
3785 return cgroup_add_cftypes(ss, cfts);
3786}
3787
3788
3789
3790
3791
3792
3793
3794void cgroup_file_notify(struct cgroup_file *cfile)
3795{
3796 unsigned long flags;
3797
3798 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3799 if (cfile->kn)
3800 kernfs_notify(cfile->kn);
3801 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3802}
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
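/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * Return the next child of @parent, or %NULL if there is none.  Must be
 * called under either cgroup_mutex or the RCU read lock; the only
 * requirement is that @parent and @pos are accessible.  If @pos has been
 * removed since it was fetched, the next sibling is found by comparing
 * css serial numbers.
 */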
3821struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3822 struct cgroup_subsys_state *parent)
3823{
3824 struct cgroup_subsys_state *next;
3825
3826 cgroup_assert_mutex_or_rcu_locked();
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848 if (!pos) {
3849 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3850 } else if (likely(!(pos->flags & CSS_RELEASED))) {
3851 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3852 } else {
3853 list_for_each_entry_rcu(next, &parent->children, sibling)
3854 if (next->serial_nr > pos->serial_nr)
3855 break;
3856 }
3857
3858
3859
3860
3861
3862 if (&next->sibling != &parent->children)
3863 return next;
3864 return NULL;
3865}
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
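/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * Return the next descendant of @root in pre-order, or %NULL when the walk
 * is complete.  @root itself is included in the iteration and is the first
 * node visited.  Must be called under cgroup_mutex or the RCU read lock.
 */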
3888struct cgroup_subsys_state *
3889css_next_descendant_pre(struct cgroup_subsys_state *pos,
3890 struct cgroup_subsys_state *root)
3891{
3892 struct cgroup_subsys_state *next;
3893
3894 cgroup_assert_mutex_or_rcu_locked();
3895
3896
3897 if (!pos)
3898 return root;
3899
3900
3901 next = css_next_child(NULL, pos);
3902 if (next)
3903 return next;
3904
3905
3906 while (pos != root) {
3907 next = css_next_child(pos, pos->parent);
3908 if (next)
3909 return next;
3910 pos = pos->parent;
3911 }
3912
3913 return NULL;
3914}
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929struct cgroup_subsys_state *
3930css_rightmost_descendant(struct cgroup_subsys_state *pos)
3931{
3932 struct cgroup_subsys_state *last, *tmp;
3933
3934 cgroup_assert_mutex_or_rcu_locked();
3935
3936 do {
3937 last = pos;
3938
3939 pos = NULL;
3940 css_for_each_child(tmp, last)
3941 pos = tmp;
3942 } while (pos);
3943
3944 return last;
3945}
3946
3947static struct cgroup_subsys_state *
3948css_leftmost_descendant(struct cgroup_subsys_state *pos)
3949{
3950 struct cgroup_subsys_state *last;
3951
3952 do {
3953 last = pos;
3954 pos = css_next_child(NULL, pos);
3955 } while (pos);
3956
3957 return last;
3958}
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
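/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * Return the next descendant of @root in post-order, or %NULL when the walk
 * is complete.  @root itself is included in the iteration and is the last
 * node visited.  Must be called under cgroup_mutex or the RCU read lock.
 */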
3982struct cgroup_subsys_state *
3983css_next_descendant_post(struct cgroup_subsys_state *pos,
3984 struct cgroup_subsys_state *root)
3985{
3986 struct cgroup_subsys_state *next;
3987
3988 cgroup_assert_mutex_or_rcu_locked();
3989
3990
3991 if (!pos)
3992 return css_leftmost_descendant(root);
3993
3994
3995 if (pos == root)
3996 return NULL;
3997
3998
3999 next = css_next_child(pos, pos->parent);
4000 if (next)
4001 return css_leftmost_descendant(next);
4002
4003
4004 return pos->parent;
4005}
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015bool css_has_online_children(struct cgroup_subsys_state *css)
4016{
4017 struct cgroup_subsys_state *child;
4018 bool ret = false;
4019
4020 rcu_read_lock();
4021 css_for_each_child(child, css) {
4022 if (child->flags & CSS_ONLINE) {
4023 ret = true;
4024 break;
4025 }
4026 }
4027 rcu_read_unlock();
4028 return ret;
4029}
4030
4031static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4032{
4033 struct list_head *l;
4034 struct cgrp_cset_link *link;
4035 struct css_set *cset;
4036
4037 lockdep_assert_held(&css_set_lock);
4038
4039
4040 if (it->tcset_pos) {
4041 l = it->tcset_pos->next;
4042
4043 if (l != it->tcset_head) {
4044 it->tcset_pos = l;
4045 return container_of(l, struct css_set,
4046 threaded_csets_node);
4047 }
4048
4049 it->tcset_pos = NULL;
4050 }
4051
4052
4053 l = it->cset_pos;
4054 l = l->next;
4055 if (l == it->cset_head) {
4056 it->cset_pos = NULL;
4057 return NULL;
4058 }
4059
4060 if (it->ss) {
4061 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4062 } else {
4063 link = list_entry(l, struct cgrp_cset_link, cset_link);
4064 cset = link->cset;
4065 }
4066
4067 it->cset_pos = l;
4068
4069
4070 if (it->flags & CSS_TASK_ITER_THREADED) {
4071 if (it->cur_dcset)
4072 put_css_set_locked(it->cur_dcset);
4073 it->cur_dcset = cset;
4074 get_css_set(cset);
4075
4076 it->tcset_head = &cset->threaded_csets;
4077 it->tcset_pos = &cset->threaded_csets;
4078 }
4079
4080 return cset;
4081}
4082
4083
4084
4085
4086
4087
4088
4089static void css_task_iter_advance_css_set(struct css_task_iter *it)
4090{
4091 struct css_set *cset;
4092
4093 lockdep_assert_held(&css_set_lock);
4094
4095
4096 do {
4097 cset = css_task_iter_next_css_set(it);
4098 if (!cset) {
4099 it->task_pos = NULL;
4100 return;
4101 }
4102 } while (!css_set_populated(cset));
4103
4104 if (!list_empty(&cset->tasks))
4105 it->task_pos = cset->tasks.next;
4106 else
4107 it->task_pos = cset->mg_tasks.next;
4108
4109 it->tasks_head = &cset->tasks;
4110 it->mg_tasks_head = &cset->mg_tasks;
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127 if (it->cur_cset) {
4128 list_del(&it->iters_node);
4129 put_css_set_locked(it->cur_cset);
4130 }
4131 get_css_set(cset);
4132 it->cur_cset = cset;
4133 list_add(&it->iters_node, &cset->task_iters);
4134}
4135
4136static void css_task_iter_advance(struct css_task_iter *it)
4137{
4138 struct list_head *next;
4139
4140 lockdep_assert_held(&css_set_lock);
4141repeat:
4142
4143
4144
4145
4146
4147 next = it->task_pos->next;
4148
4149 if (next == it->tasks_head)
4150 next = it->mg_tasks_head->next;
4151
4152 if (next == it->mg_tasks_head)
4153 css_task_iter_advance_css_set(it);
4154 else
4155 it->task_pos = next;
4156
4157
4158 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
4159 !thread_group_leader(list_entry(it->task_pos, struct task_struct,
4160 cg_list)))
4161 goto repeat;
4162}
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
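/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller may call
 * css_task_iter_next() repeatedly until it returns NULL, and must finish
 * with css_task_iter_end() to release the iterator's references.
 */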
4175void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4176 struct css_task_iter *it)
4177{
4178
4179 WARN_ON_ONCE(!use_task_css_set_links);
4180
4181 memset(it, 0, sizeof(*it));
4182
4183 spin_lock_irq(&css_set_lock);
4184
4185 it->ss = css->ss;
4186 it->flags = flags;
4187
4188 if (it->ss)
4189 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4190 else
4191 it->cset_pos = &css->cgroup->cset_links;
4192
4193 it->cset_head = it->cset_pos;
4194
4195 css_task_iter_advance_css_set(it);
4196
4197 spin_unlock_irq(&css_set_lock);
4198}
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208struct task_struct *css_task_iter_next(struct css_task_iter *it)
4209{
4210 if (it->cur_task) {
4211 put_task_struct(it->cur_task);
4212 it->cur_task = NULL;
4213 }
4214
4215 spin_lock_irq(&css_set_lock);
4216
4217 if (it->task_pos) {
4218 it->cur_task = list_entry(it->task_pos, struct task_struct,
4219 cg_list);
4220 get_task_struct(it->cur_task);
4221 css_task_iter_advance(it);
4222 }
4223
4224 spin_unlock_irq(&css_set_lock);
4225
4226 return it->cur_task;
4227}
4228
4229
4230
4231
4232
4233
4234
4235void css_task_iter_end(struct css_task_iter *it)
4236{
4237 if (it->cur_cset) {
4238 spin_lock_irq(&css_set_lock);
4239 list_del(&it->iters_node);
4240 put_css_set_locked(it->cur_cset);
4241 spin_unlock_irq(&css_set_lock);
4242 }
4243
4244 if (it->cur_dcset)
4245 put_css_set(it->cur_dcset);
4246
4247 if (it->cur_task)
4248 put_task_struct(it->cur_task);
4249}
4250
4251static void cgroup_procs_release(struct kernfs_open_file *of)
4252{
4253 if (of->priv) {
4254 css_task_iter_end(of->priv);
4255 kfree(of->priv);
4256 }
4257}
4258
4259static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4260{
4261 struct kernfs_open_file *of = s->private;
4262 struct css_task_iter *it = of->priv;
4263
4264 return css_task_iter_next(it);
4265}
4266
4267static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4268 unsigned int iter_flags)
4269{
4270 struct kernfs_open_file *of = s->private;
4271 struct cgroup *cgrp = seq_css(s)->cgroup;
4272 struct css_task_iter *it = of->priv;
4273
4274
4275
4276
4277
4278 if (!it) {
4279 if (WARN_ON_ONCE((*pos)++))
4280 return ERR_PTR(-EINVAL);
4281
4282 it = kzalloc(sizeof(*it), GFP_KERNEL);
4283 if (!it)
4284 return ERR_PTR(-ENOMEM);
4285 of->priv = it;
4286 css_task_iter_start(&cgrp->self, iter_flags, it);
4287 } else if (!(*pos)++) {
4288 css_task_iter_end(it);
4289 css_task_iter_start(&cgrp->self, iter_flags, it);
4290 }
4291
4292 return cgroup_procs_next(s, NULL, NULL);
4293}
4294
4295static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4296{
4297 struct cgroup *cgrp = seq_css(s)->cgroup;
4298
4299
4300
4301
4302
4303
4304
4305 if (cgroup_is_threaded(cgrp))
4306 return ERR_PTR(-EOPNOTSUPP);
4307
4308 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4309 CSS_TASK_ITER_THREADED);
4310}
4311
4312static int cgroup_procs_show(struct seq_file *s, void *v)
4313{
4314 seq_printf(s, "%d\n", task_pid_vnr(v));
4315 return 0;
4316}
4317
4318static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4319 struct cgroup *dst_cgrp,
4320 struct super_block *sb)
4321{
4322 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4323 struct cgroup *com_cgrp = src_cgrp;
4324 struct inode *inode;
4325 int ret;
4326
4327 lockdep_assert_held(&cgroup_mutex);
4328
4329
4330 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4331 com_cgrp = cgroup_parent(com_cgrp);
4332
4333
4334 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
4335 if (!inode)
4336 return -ENOMEM;
4337
4338 ret = inode_permission(inode, MAY_WRITE);
4339 iput(inode);
4340 if (ret)
4341 return ret;
4342
4343
4344
4345
4346
4347 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4348 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4349 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4350 return -ENOENT;
4351
4352 return 0;
4353}
4354
4355static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4356 char *buf, size_t nbytes, loff_t off)
4357{
4358 struct cgroup *src_cgrp, *dst_cgrp;
4359 struct task_struct *task;
4360 ssize_t ret;
4361
4362 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4363 if (!dst_cgrp)
4364 return -ENODEV;
4365
4366 task = cgroup_procs_write_start(buf, true);
4367 ret = PTR_ERR_OR_ZERO(task);
4368 if (ret)
4369 goto out_unlock;
4370
4371
4372 spin_lock_irq(&css_set_lock);
4373 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4374 spin_unlock_irq(&css_set_lock);
4375
4376 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4377 of->file->f_path.dentry->d_sb);
4378 if (ret)
4379 goto out_finish;
4380
4381 ret = cgroup_attach_task(dst_cgrp, task, true);
4382
4383out_finish:
4384 cgroup_procs_write_finish(task);
4385out_unlock:
4386 cgroup_kn_unlock(of->kn);
4387
4388 return ret ?: nbytes;
4389}
4390
4391static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4392{
4393 return __cgroup_procs_start(s, pos, 0);
4394}
4395
4396static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4397 char *buf, size_t nbytes, loff_t off)
4398{
4399 struct cgroup *src_cgrp, *dst_cgrp;
4400 struct task_struct *task;
4401 ssize_t ret;
4402
4403 buf = strstrip(buf);
4404
4405 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4406 if (!dst_cgrp)
4407 return -ENODEV;
4408
4409 task = cgroup_procs_write_start(buf, false);
4410 ret = PTR_ERR_OR_ZERO(task);
4411 if (ret)
4412 goto out_unlock;
4413
4414
4415 spin_lock_irq(&css_set_lock);
4416 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4417 spin_unlock_irq(&css_set_lock);
4418
4419
4420 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4421 of->file->f_path.dentry->d_sb);
4422 if (ret)
4423 goto out_finish;
4424
4425
4426 ret = -EOPNOTSUPP;
4427 if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
4428 goto out_finish;
4429
4430 ret = cgroup_attach_task(dst_cgrp, task, false);
4431
4432out_finish:
4433 cgroup_procs_write_finish(task);
4434out_unlock:
4435 cgroup_kn_unlock(of->kn);
4436
4437 return ret ?: nbytes;
4438}
4439
4440
4441static struct cftype cgroup_base_files[] = {
4442 {
4443 .name = "cgroup.type",
4444 .flags = CFTYPE_NOT_ON_ROOT,
4445 .seq_show = cgroup_type_show,
4446 .write = cgroup_type_write,
4447 },
4448 {
4449 .name = "cgroup.procs",
4450 .flags = CFTYPE_NS_DELEGATABLE,
4451 .file_offset = offsetof(struct cgroup, procs_file),
4452 .release = cgroup_procs_release,
4453 .seq_start = cgroup_procs_start,
4454 .seq_next = cgroup_procs_next,
4455 .seq_show = cgroup_procs_show,
4456 .write = cgroup_procs_write,
4457 },
4458 {
4459 .name = "cgroup.threads",
4460 .flags = CFTYPE_NS_DELEGATABLE,
4461 .release = cgroup_procs_release,
4462 .seq_start = cgroup_threads_start,
4463 .seq_next = cgroup_procs_next,
4464 .seq_show = cgroup_procs_show,
4465 .write = cgroup_threads_write,
4466 },
4467 {
4468 .name = "cgroup.controllers",
4469 .seq_show = cgroup_controllers_show,
4470 },
4471 {
4472 .name = "cgroup.subtree_control",
4473 .flags = CFTYPE_NS_DELEGATABLE,
4474 .seq_show = cgroup_subtree_control_show,
4475 .write = cgroup_subtree_control_write,
4476 },
4477 {
4478 .name = "cgroup.events",
4479 .flags = CFTYPE_NOT_ON_ROOT,
4480 .file_offset = offsetof(struct cgroup, events_file),
4481 .seq_show = cgroup_events_show,
4482 },
4483 {
4484 .name = "cgroup.max.descendants",
4485 .seq_show = cgroup_max_descendants_show,
4486 .write = cgroup_max_descendants_write,
4487 },
4488 {
4489 .name = "cgroup.max.depth",
4490 .seq_show = cgroup_max_depth_show,
4491 .write = cgroup_max_depth_write,
4492 },
4493 {
4494 .name = "cgroup.stat",
4495 .seq_show = cgroup_stat_show,
4496 },
4497 {
4498 .name = "cpu.stat",
4499 .flags = CFTYPE_NOT_ON_ROOT,
4500 .seq_show = cpu_stat_show,
4501 },
4502 { }
4503};
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527static void css_free_rwork_fn(struct work_struct *work)
4528{
4529 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
4530 struct cgroup_subsys_state, destroy_rwork);
4531 struct cgroup_subsys *ss = css->ss;
4532 struct cgroup *cgrp = css->cgroup;
4533
4534 percpu_ref_exit(&css->refcnt);
4535
4536 if (ss) {
4537
4538 struct cgroup_subsys_state *parent = css->parent;
4539 int id = css->id;
4540
4541 ss->css_free(css);
4542 cgroup_idr_remove(&ss->css_idr, id);
4543 cgroup_put(cgrp);
4544
4545 if (parent)
4546 css_put(parent);
4547 } else {
4548
4549 atomic_dec(&cgrp->root->nr_cgrps);
4550 cgroup1_pidlist_destroy_all(cgrp);
4551 cancel_work_sync(&cgrp->release_agent_work);
4552
4553 if (cgroup_parent(cgrp)) {
4554
4555
4556
4557
4558
4559
4560 cgroup_put(cgroup_parent(cgrp));
4561 kernfs_put(cgrp->kn);
4562 if (cgroup_on_dfl(cgrp))
4563 cgroup_stat_exit(cgrp);
4564 kfree(cgrp);
4565 } else {
4566
4567
4568
4569
4570
4571 cgroup_destroy_root(cgrp->root);
4572 }
4573 }
4574}
4575
4576static void css_release_work_fn(struct work_struct *work)
4577{
4578 struct cgroup_subsys_state *css =
4579 container_of(work, struct cgroup_subsys_state, destroy_work);
4580 struct cgroup_subsys *ss = css->ss;
4581 struct cgroup *cgrp = css->cgroup;
4582
4583 mutex_lock(&cgroup_mutex);
4584
4585 css->flags |= CSS_RELEASED;
4586 list_del_rcu(&css->sibling);
4587
4588 if (ss) {
4589
4590 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4591 if (ss->css_released)
4592 ss->css_released(css);
4593 } else {
4594 struct cgroup *tcgrp;
4595
4596
4597 trace_cgroup_release(cgrp);
4598
4599 if (cgroup_on_dfl(cgrp))
4600 cgroup_stat_flush(cgrp);
4601
4602 for (tcgrp = cgroup_parent(cgrp); tcgrp;
4603 tcgrp = cgroup_parent(tcgrp))
4604 tcgrp->nr_dying_descendants--;
4605
4606 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4607 cgrp->id = -1;
4608
4609
4610
4611
4612
4613
4614
4615
4616 if (cgrp->kn)
4617 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4618 NULL);
4619
4620 cgroup_bpf_put(cgrp);
4621 }
4622
4623 mutex_unlock(&cgroup_mutex);
4624
4625 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
4626 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
4627}
4628
4629static void css_release(struct percpu_ref *ref)
4630{
4631 struct cgroup_subsys_state *css =
4632 container_of(ref, struct cgroup_subsys_state, refcnt);
4633
4634 INIT_WORK(&css->destroy_work, css_release_work_fn);
4635 queue_work(cgroup_destroy_wq, &css->destroy_work);
4636}
4637
4638static void init_and_link_css(struct cgroup_subsys_state *css,
4639 struct cgroup_subsys *ss, struct cgroup *cgrp)
4640{
4641 lockdep_assert_held(&cgroup_mutex);
4642
4643 cgroup_get_live(cgrp);
4644
4645 memset(css, 0, sizeof(*css));
4646 css->cgroup = cgrp;
4647 css->ss = ss;
4648 css->id = -1;
4649 INIT_LIST_HEAD(&css->sibling);
4650 INIT_LIST_HEAD(&css->children);
4651 css->serial_nr = css_serial_nr_next++;
4652 atomic_set(&css->online_cnt, 0);
4653
4654 if (cgroup_parent(cgrp)) {
4655 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4656 css_get(css->parent);
4657 }
4658
4659 BUG_ON(cgroup_css(cgrp, ss));
4660}
4661
4662
4663static int online_css(struct cgroup_subsys_state *css)
4664{
4665 struct cgroup_subsys *ss = css->ss;
4666 int ret = 0;
4667
4668 lockdep_assert_held(&cgroup_mutex);
4669
4670 if (ss->css_online)
4671 ret = ss->css_online(css);
4672 if (!ret) {
4673 css->flags |= CSS_ONLINE;
4674 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4675
4676 atomic_inc(&css->online_cnt);
4677 if (css->parent)
4678 atomic_inc(&css->parent->online_cnt);
4679 }
4680 return ret;
4681}
4682
4683
4684static void offline_css(struct cgroup_subsys_state *css)
4685{
4686 struct cgroup_subsys *ss = css->ss;
4687
4688 lockdep_assert_held(&cgroup_mutex);
4689
4690 if (!(css->flags & CSS_ONLINE))
4691 return;
4692
4693 if (ss->css_offline)
4694 ss->css_offline(css);
4695
4696 css->flags &= ~CSS_ONLINE;
4697 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4698
4699 wake_up_all(&css->cgroup->offline_waitq);
4700}
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
4712 struct cgroup_subsys *ss)
4713{
4714 struct cgroup *parent = cgroup_parent(cgrp);
4715 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4716 struct cgroup_subsys_state *css;
4717 int err;
4718
4719 lockdep_assert_held(&cgroup_mutex);
4720
4721 css = ss->css_alloc(parent_css);
4722 if (!css)
4723 css = ERR_PTR(-ENOMEM);
4724 if (IS_ERR(css))
4725 return css;
4726
4727 init_and_link_css(css, ss, cgrp);
4728
4729 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4730 if (err)
4731 goto err_free_css;
4732
4733 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4734 if (err < 0)
4735 goto err_free_css;
4736 css->id = err;
4737
4738
4739 list_add_tail_rcu(&css->sibling, &parent_css->children);
4740 cgroup_idr_replace(&ss->css_idr, css, css->id);
4741
4742 err = online_css(css);
4743 if (err)
4744 goto err_list_del;
4745
4746 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4747 cgroup_parent(parent)) {
4748 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4749 current->comm, current->pid, ss->name);
4750 if (!strcmp(ss->name, "memory"))
4751 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4752 ss->warned_broken_hierarchy = true;
4753 }
4754
4755 return css;
4756
4757err_list_del:
4758 list_del_rcu(&css->sibling);
4759err_free_css:
4760 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
4761 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
4762 return ERR_PTR(err);
4763}
4764
4765
4766
4767
4768
4769
4770static struct cgroup *cgroup_create(struct cgroup *parent)
4771{
4772 struct cgroup_root *root = parent->root;
4773 struct cgroup *cgrp, *tcgrp;
4774 int level = parent->level + 1;
4775 int ret;
4776
4777
4778 cgrp = kzalloc(sizeof(*cgrp) +
4779 sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
4780 if (!cgrp)
4781 return ERR_PTR(-ENOMEM);
4782
4783 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4784 if (ret)
4785 goto out_free_cgrp;
4786
4787 if (cgroup_on_dfl(parent)) {
4788 ret = cgroup_stat_init(cgrp);
4789 if (ret)
4790 goto out_cancel_ref;
4791 }
4792
4793
4794
4795
4796
4797 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4798 if (cgrp->id < 0) {
4799 ret = -ENOMEM;
4800 goto out_stat_exit;
4801 }
4802
4803 init_cgroup_housekeeping(cgrp);
4804
4805 cgrp->self.parent = &parent->self;
4806 cgrp->root = root;
4807 cgrp->level = level;
4808 ret = cgroup_bpf_inherit(cgrp);
4809 if (ret)
4810 goto out_idr_free;
4811
4812 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
4813 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4814
4815 if (tcgrp != cgrp)
4816 tcgrp->nr_descendants++;
4817 }
4818
4819 if (notify_on_release(parent))
4820 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4821
4822 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4823 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4824
4825 cgrp->self.serial_nr = css_serial_nr_next++;
4826
4827
4828 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4829 atomic_inc(&root->nr_cgrps);
4830 cgroup_get_live(parent);
4831
4832
4833
4834
4835
4836 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4837
4838
4839
4840
4841
4842 if (!cgroup_on_dfl(cgrp))
4843 cgrp->subtree_control = cgroup_control(cgrp);
4844
4845 cgroup_propagate_control(cgrp);
4846
4847 return cgrp;
4848
4849out_idr_free:
4850 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4851out_stat_exit:
4852 if (cgroup_on_dfl(parent))
4853 cgroup_stat_exit(cgrp);
4854out_cancel_ref:
4855 percpu_ref_exit(&cgrp->self.refcnt);
4856out_free_cgrp:
4857 kfree(cgrp);
4858 return ERR_PTR(ret);
4859}
4860
4861static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
4862{
4863 struct cgroup *cgroup;
4864 int ret = false;
4865 int level = 1;
4866
4867 lockdep_assert_held(&cgroup_mutex);
4868
4869 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
4870 if (cgroup->nr_descendants >= cgroup->max_descendants)
4871 goto fail;
4872
4873 if (level > cgroup->max_depth)
4874 goto fail;
4875
4876 level++;
4877 }
4878
4879 ret = true;
4880fail:
4881 return ret;
4882}
4883
4884int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
4885{
4886 struct cgroup *parent, *cgrp;
4887 struct kernfs_node *kn;
4888 int ret;
4889
4890
4891 if (strchr(name, '\n'))
4892 return -EINVAL;
4893
4894 parent = cgroup_kn_lock_live(parent_kn, false);
4895 if (!parent)
4896 return -ENODEV;
4897
4898 if (!cgroup_check_hierarchy_limits(parent)) {
4899 ret = -EAGAIN;
4900 goto out_unlock;
4901 }
4902
4903 cgrp = cgroup_create(parent);
4904 if (IS_ERR(cgrp)) {
4905 ret = PTR_ERR(cgrp);
4906 goto out_unlock;
4907 }
4908
4909
4910 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4911 if (IS_ERR(kn)) {
4912 ret = PTR_ERR(kn);
4913 goto out_destroy;
4914 }
4915 cgrp->kn = kn;
4916
4917
4918
4919
4920
4921 kernfs_get(kn);
4922
4923 ret = cgroup_kn_set_ugid(kn);
4924 if (ret)
4925 goto out_destroy;
4926
4927 ret = css_populate_dir(&cgrp->self);
4928 if (ret)
4929 goto out_destroy;
4930
4931 ret = cgroup_apply_control_enable(cgrp);
4932 if (ret)
4933 goto out_destroy;
4934
4935 trace_cgroup_mkdir(cgrp);
4936
4937
4938 kernfs_activate(kn);
4939
4940 ret = 0;
4941 goto out_unlock;
4942
4943out_destroy:
4944 cgroup_destroy_locked(cgrp);
4945out_unlock:
4946 cgroup_kn_unlock(parent_kn);
4947 return ret;
4948}
4949
4950
4951
4952
4953
4954
4955static void css_killed_work_fn(struct work_struct *work)
4956{
4957 struct cgroup_subsys_state *css =
4958 container_of(work, struct cgroup_subsys_state, destroy_work);
4959
4960 mutex_lock(&cgroup_mutex);
4961
4962 do {
4963 offline_css(css);
4964 css_put(css);
4965
4966 css = css->parent;
4967 } while (css && atomic_dec_and_test(&css->online_cnt));
4968
4969 mutex_unlock(&cgroup_mutex);
4970}
4971
4972
4973static void css_killed_ref_fn(struct percpu_ref *ref)
4974{
4975 struct cgroup_subsys_state *css =
4976 container_of(ref, struct cgroup_subsys_state, refcnt);
4977
4978 if (atomic_dec_and_test(&css->online_cnt)) {
4979 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4980 queue_work(cgroup_destroy_wq, &css->destroy_work);
4981 }
4982}
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
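/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * Initiates destruction of @css by removing its interface files and
 * killing its percpu reference count.  The actual offlining and freeing
 * happen asynchronously from the destruction workqueue once the reference
 * count drains.
 */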
4993static void kill_css(struct cgroup_subsys_state *css)
4994{
4995 lockdep_assert_held(&cgroup_mutex);
4996
4997 if (css->flags & CSS_DYING)
4998 return;
4999
5000 css->flags |= CSS_DYING;
5001
5002
5003
5004
5005
5006 css_clear_dir(css);
5007
5008
5009
5010
5011
5012 css_get(css);
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5025}
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
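/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * Mark @cgrp dead and initiate destruction of all of its csses.  Fails
 * with -EBUSY if the cgroup is still populated or has online children.
 * The cgroup itself is released asynchronously once every css reference
 * is dropped; see css_release_work_fn() and css_free_rwork_fn().  Must be
 * called with cgroup_mutex held.
 */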
5051static int cgroup_destroy_locked(struct cgroup *cgrp)
5052 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5053{
5054 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5055 struct cgroup_subsys_state *css;
5056 struct cgrp_cset_link *link;
5057 int ssid;
5058
5059 lockdep_assert_held(&cgroup_mutex);
5060
5061
5062
5063
5064
5065 if (cgroup_is_populated(cgrp))
5066 return -EBUSY;
5067
5068
5069
5070
5071
5072
5073 if (css_has_online_children(&cgrp->self))
5074 return -EBUSY;
5075
5076
5077
5078
5079
5080
5081
5082 cgrp->self.flags &= ~CSS_ONLINE;
5083
5084 spin_lock_irq(&css_set_lock);
5085 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5086 link->cset->dead = true;
5087 spin_unlock_irq(&css_set_lock);
5088
5089
5090 for_each_css(css, ssid, cgrp)
5091 kill_css(css);
5092
5093
5094
5095
5096
5097 kernfs_remove(cgrp->kn);
5098
5099 if (parent && cgroup_is_threaded(cgrp))
5100 parent->nr_threaded_children--;
5101
5102 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5103 tcgrp->nr_descendants--;
5104 tcgrp->nr_dying_descendants++;
5105 }
5106
5107 cgroup1_check_for_release(parent);
5108
5109
5110 percpu_ref_kill(&cgrp->self.refcnt);
5111
5112 return 0;
}
5114
5115int cgroup_rmdir(struct kernfs_node *kn)
5116{
5117 struct cgroup *cgrp;
5118 int ret = 0;
5119
5120 cgrp = cgroup_kn_lock_live(kn, false);
5121 if (!cgrp)
5122 return 0;
5123
5124 ret = cgroup_destroy_locked(cgrp);
5125
5126 if (!ret)
5127 trace_cgroup_rmdir(cgrp);
5128
5129 cgroup_kn_unlock(kn);
5130 return ret;
5131}
5132
5133static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5134 .show_options = cgroup_show_options,
5135 .remount_fs = cgroup_remount,
5136 .mkdir = cgroup_mkdir,
5137 .rmdir = cgroup_rmdir,
5138 .show_path = cgroup_show_path,
5139};
5140
5141static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5142{
5143 struct cgroup_subsys_state *css;
5144
5145 pr_debug("Initializing cgroup subsys %s\n", ss->name);
5146
5147 mutex_lock(&cgroup_mutex);
5148
5149 idr_init(&ss->css_idr);
5150 INIT_LIST_HEAD(&ss->cfts);
5151
5152
5153 ss->root = &cgrp_dfl_root;
5154 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5155
5156 BUG_ON(IS_ERR(css));
5157 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5158
5159
5160
5161
5162
5163 css->flags |= CSS_NO_REF;
5164
5165 if (early) {
5166
5167 css->id = 1;
5168 } else {
5169 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5170 BUG_ON(css->id < 0);
5171 }
5172
5173
5174
5175
5176
5177 init_css_set.subsys[ss->id] = css;
5178
5179 have_fork_callback |= (bool)ss->fork << ss->id;
5180 have_exit_callback |= (bool)ss->exit << ss->id;
5181 have_free_callback |= (bool)ss->free << ss->id;
5182 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5183
5184
5185
5186
5187 BUG_ON(!list_empty(&init_task.tasks));
5188
5189 BUG_ON(online_css(css));
5190
5191 mutex_unlock(&cgroup_mutex);
5192}
5193
5194
5195
5196
5197
5198
5199
5200int __init cgroup_init_early(void)
5201{
5202 static struct cgroup_sb_opts __initdata opts;
5203 struct cgroup_subsys *ss;
5204 int i;
5205
5206 init_cgroup_root(&cgrp_dfl_root, &opts);
5207 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5208
5209 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5210
5211 for_each_subsys(ss, i) {
5212 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5213 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5214 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5215 ss->id, ss->name);
5216 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5217 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5218
5219 ss->id = i;
5220 ss->name = cgroup_subsys_name[i];
5221 if (!ss->legacy_name)
5222 ss->legacy_name = cgroup_subsys_name[i];
5223
5224 if (ss->early_init)
5225 cgroup_init_subsys(ss, true);
5226 }
5227 return 0;
5228}
5229
5230static u16 cgroup_disable_mask __initdata;
5231
5232
5233
5234
5235
5236
5237
5238int __init cgroup_init(void)
5239{
5240 struct cgroup_subsys *ss;
5241 int ssid;
5242
5243 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5244 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5245 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5246 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5247
5248 cgroup_stat_boot();
5249
5250
5251
5252
5253
5254 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
5255
5256 get_user_ns(init_cgroup_ns.user_ns);
5257
5258 mutex_lock(&cgroup_mutex);
5259
5260
5261
5262
5263
5264 hash_add(css_set_table, &init_css_set.hlist,
5265 css_set_hash(init_css_set.subsys));
5266
5267 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
5268
5269 mutex_unlock(&cgroup_mutex);
5270
5271 for_each_subsys(ss, ssid) {
5272 if (ss->early_init) {
5273 struct cgroup_subsys_state *css =
5274 init_css_set.subsys[ss->id];
5275
5276 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5277 GFP_KERNEL);
5278 BUG_ON(css->id < 0);
5279 } else {
5280 cgroup_init_subsys(ss, false);
5281 }
5282
5283 list_add_tail(&init_css_set.e_cset_node[ssid],
5284 &cgrp_dfl_root.cgrp.e_csets[ssid]);
5285
5286
5287
5288
5289
5290
5291 if (cgroup_disable_mask & (1 << ssid)) {
5292 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5293 printk(KERN_INFO "Disabling %s control group subsystem\n",
5294 ss->name);
5295 continue;
5296 }
5297
5298 if (cgroup1_ssid_disabled(ssid))
5299 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5300 ss->name);
5301
5302 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5303
5304
5305 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5306
5307 if (ss->implicit_on_dfl)
5308 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
5309 else if (!ss->dfl_cftypes)
5310 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5311
5312 if (ss->threaded)
5313 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5314
5315 if (ss->dfl_cftypes == ss->legacy_cftypes) {
5316 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5317 } else {
5318 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5319 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5320 }
5321
5322 if (ss->bind)
5323 ss->bind(init_css_set.subsys[ssid]);
5324
5325 mutex_lock(&cgroup_mutex);
5326 css_populate_dir(init_css_set.subsys[ssid]);
5327 mutex_unlock(&cgroup_mutex);
5328 }
5329
5330
5331 hash_del(&init_css_set.hlist);
5332 hash_add(css_set_table, &init_css_set.hlist,
5333 css_set_hash(init_css_set.subsys));
5334
5335 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5336 WARN_ON(register_filesystem(&cgroup_fs_type));
5337 WARN_ON(register_filesystem(&cgroup2_fs_type));
5338 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5339
5340 return 0;
5341}
5342
5343static int __init cgroup_wq_init(void)
5344{
5345
5346
5347
5348
5349
5350
5351
5352
5353 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5354 BUG_ON(!cgroup_destroy_wq);
5355 return 0;
5356}
5357core_initcall(cgroup_wq_init);
5358
5359void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
5360 char *buf, size_t buflen)
5361{
5362 struct kernfs_node *kn;
5363
5364 kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
5365 if (!kn)
5366 return;
5367 kernfs_path(kn, buf, buflen);
5368 kernfs_put(kn);
5369}
5370
5371
5372
5373
5374
5375
5376int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5377 struct pid *pid, struct task_struct *tsk)
5378{
5379 char *buf;
5380 int retval;
5381 struct cgroup_root *root;
5382
5383 retval = -ENOMEM;
5384 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5385 if (!buf)
5386 goto out;
5387
5388 mutex_lock(&cgroup_mutex);
5389 spin_lock_irq(&css_set_lock);
5390
5391 for_each_root(root) {
5392 struct cgroup_subsys *ss;
5393 struct cgroup *cgrp;
5394 int ssid, count = 0;
5395
5396 if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
5397 continue;
5398
5399 seq_printf(m, "%d:", root->hierarchy_id);
5400 if (root != &cgrp_dfl_root)
5401 for_each_subsys(ss, ssid)
5402 if (root->subsys_mask & (1 << ssid))
5403 seq_printf(m, "%s%s", count++ ? "," : "",
5404 ss->legacy_name);
5405 if (strlen(root->name))
5406 seq_printf(m, "%sname=%s", count ? "," : "",
5407 root->name);
5408 seq_putc(m, ':');
5409
5410 cgrp = task_cgroup_from_root(tsk, root);
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5422 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
5423 current->nsproxy->cgroup_ns);
5424 if (retval >= PATH_MAX)
5425 retval = -ENAMETOOLONG;
5426 if (retval < 0)
5427 goto out_unlock;
5428
5429 seq_puts(m, buf);
5430 } else {
5431 seq_puts(m, "/");
5432 }
5433
5434 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5435 seq_puts(m, " (deleted)\n");
5436 else
5437 seq_putc(m, '\n');
5438 }
5439
5440 retval = 0;
5441out_unlock:
5442 spin_unlock_irq(&css_set_lock);
5443 mutex_unlock(&cgroup_mutex);
5444 kfree(buf);
5445out:
5446 return retval;
5447}
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457void cgroup_fork(struct task_struct *child)
5458{
5459 RCU_INIT_POINTER(child->cgroups, &init_css_set);
5460 INIT_LIST_HEAD(&child->cg_list);
5461}
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471int cgroup_can_fork(struct task_struct *child)
5472{
5473 struct cgroup_subsys *ss;
5474 int i, j, ret;
5475
5476 do_each_subsys_mask(ss, i, have_canfork_callback) {
5477 ret = ss->can_fork(child);
5478 if (ret)
5479 goto out_revert;
5480 } while_each_subsys_mask();
5481
5482 return 0;
5483
5484out_revert:
5485 for_each_subsys(ss, j) {
5486 if (j >= i)
5487 break;
5488 if (ss->cancel_fork)
5489 ss->cancel_fork(child);
5490 }
5491
5492 return ret;
5493}
5494
5495
5496
5497
5498
5499
5500
5501
5502void cgroup_cancel_fork(struct task_struct *child)
5503{
5504 struct cgroup_subsys *ss;
5505 int i;
5506
5507 for_each_subsys(ss, i)
5508 if (ss->cancel_fork)
5509 ss->cancel_fork(child);
5510}
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522void cgroup_post_fork(struct task_struct *child)
5523{
5524 struct cgroup_subsys *ss;
5525 int i;
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548 if (use_task_css_set_links) {
5549 struct css_set *cset;
5550
5551 spin_lock_irq(&css_set_lock);
5552 cset = task_css_set(current);
5553 if (list_empty(&child->cg_list)) {
5554 get_css_set(cset);
5555 cset->nr_tasks++;
5556 css_set_move_task(child, NULL, cset, false);
5557 }
5558 spin_unlock_irq(&css_set_lock);
5559 }
5560
5561
5562
5563
5564
5565
5566 do_each_subsys_mask(ss, i, have_fork_callback) {
5567 ss->fork(child);
5568 } while_each_subsys_mask();
5569}
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590void cgroup_exit(struct task_struct *tsk)
5591{
5592 struct cgroup_subsys *ss;
5593 struct css_set *cset;
5594 int i;
5595
5596
5597
5598
5599
5600 cset = task_css_set(tsk);
5601
5602 if (!list_empty(&tsk->cg_list)) {
5603 spin_lock_irq(&css_set_lock);
5604 css_set_move_task(tsk, cset, NULL, false);
5605 cset->nr_tasks--;
5606 spin_unlock_irq(&css_set_lock);
5607 } else {
5608 get_css_set(cset);
5609 }
5610
5611
5612 do_each_subsys_mask(ss, i, have_exit_callback) {
5613 ss->exit(tsk);
5614 } while_each_subsys_mask();
5615}
5616
5617void cgroup_free(struct task_struct *task)
5618{
5619 struct css_set *cset = task_css_set(task);
5620 struct cgroup_subsys *ss;
5621 int ssid;
5622
5623 do_each_subsys_mask(ss, ssid, have_free_callback) {
5624 ss->free(task);
5625 } while_each_subsys_mask();
5626
5627 put_css_set(cset);
5628}
5629
5630static int __init cgroup_disable(char *str)
5631{
5632 struct cgroup_subsys *ss;
5633 char *token;
5634 int i;
5635
5636 while ((token = strsep(&str, ",")) != NULL) {
5637 if (!*token)
5638 continue;
5639
5640 for_each_subsys(ss, i) {
5641 if (strcmp(token, ss->name) &&
5642 strcmp(token, ss->legacy_name))
5643 continue;
5644 cgroup_disable_mask |= 1 << i;
5645 }
5646 }
5647 return 1;
5648}
5649__setup("cgroup_disable=", cgroup_disable);

/**
 * css_tryget_online_from_dir - get the corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such a css doesn't
 * exist or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been removed or replaced.  Dereference @kn->priv inside
	 * the RCU critical section and verify it points at a live css.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}
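
/*
 * Illustrative sketch, not part of this file: a typical caller resolves
 * the css of its own subsystem from a cgroup directory dentry and drops
 * the reference with css_put() when done.  The memory controller is used
 * below purely as an example subsystem.
 *
 *	struct cgroup_subsys_state *css;
 *
 *	css = css_tryget_online_from_dir(dentry, &memory_cgrp_subsys);
 *	if (IS_ERR(css))
 *		return PTR_ERR(css);
 *	...
 *	css_put(css);
 */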

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the matching css if one exists with @id, otherwise returns
 * NULL.  Must be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}
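
/*
 * Illustrative sketch, not part of this file: css_from_id() returns a
 * pointer that is only guaranteed valid under RCU; a caller that wants to
 * keep using the css afterwards must also pin it, e.g.:
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 */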

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, grab a reference and
 * return it.  Returns ERR_PTR(-ENOENT) if @path doesn't exist and
 * ERR_PTR(-ENOTDIR) if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get_live(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
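
/*
 * Illustrative sketch, not part of this file: the path is interpreted
 * relative to the root of the default (v2) hierarchy and the returned
 * reference must be dropped with cgroup_put().  "/my/group" below is a
 * hypothetical path.
 *
 *	struct cgroup *cgrp;
 *
 *	cgrp = cgroup_get_from_path("/my/group");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */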

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by opening a cgroup directory
 *
 * Find the cgroup referred to by @fd, grab a reference and return it.
 * Returns an ERR_PTR if @fd does not refer to a cgroup2 directory.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	fput(f);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
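
/*
 * Illustrative sketch, not part of this file: the fd must refer to a
 * directory on the v2 hierarchy, typically obtained by user space with
 * open() on a cgroup2 directory (commonly mounted under /sys/fs/cgroup)
 * and handed to the kernel through a syscall.  "target_fd" below is a
 * hypothetical variable name.
 *
 *	struct cgroup *cgrp;
 *
 *	cgrp = cgroup_get_from_fd(target_fd);
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */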

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled false

#endif

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled)
		return;

	/* Socket clone path */
	if (skcd->val) {
		/*
		 * We might be cloning a socket which is left in an empty
		 * cgroup and the cgroup might have already been rmdir'd.
		 * Don't use cgroup_get_live().
		 */
		cgroup_get(sock_cgroup_ptr(skcd));
		return;
	}

	rcu_read_lock();

	/* keep retrying until the current task's default cgroup is pinned */
	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	cgroup_put(sock_cgroup_ptr(skcd));
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type, u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type, u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		     union bpf_attr __user *uattr)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_query(cgrp, attr, uattr);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
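
/*
 * Illustrative sketch, not part of this file: these wrappers are the
 * cgroup_mutex-taking entry points used when user space drives the
 * BPF_PROG_ATTACH / BPF_PROG_DETACH / BPF_PROG_QUERY commands of bpf(2).
 * Roughly, an attach from user space looks like the following, where
 * cgroup_dir_fd is an open cgroup2 directory and prog_fd a loaded program
 * of a cgroup attach type (both names are hypothetical):
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd = cgroup_dir_fd;
 *	attr.attach_bpf_fd = prog_fd;
 *	attr.attach_type = BPF_CGROUP_INET_INGRESS;
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */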
#endif	/* CONFIG_CGROUP_BPF */

#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
				      ssize_t size, const char *prefix)
{
	struct cftype *cft;
	ssize_t ret = 0;

	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

		if (unlikely(ret >= size)) {
			WARN_ON(1);
			break;
		}
	}

	return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	struct cgroup_subsys *ss;
	int ssid;
	ssize_t ret = 0;

	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
				     NULL);

	for_each_subsys(ss, ssid)
		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
					      PAGE_SIZE - ret,
					      cgroup_subsys_name[ssid]);

	return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};

static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};

static int __init cgroup_sysfs_init(void)
{
	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);
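
/*
 * Illustrative example, not generated at build time: with the attribute
 * group above registered, reading the resulting files is expected to look
 * roughly like
 *
 *	# cat /sys/kernel/cgroup/features
 *	nsdelegate
 *	# cat /sys/kernel/cgroup/delegate
 *	cgroup.procs
 *	cgroup.subtree_control
 *	...
 *
 * The exact delegate list depends on which cftypes carry
 * CFTYPE_NS_DELEGATABLE in the running kernel.
 */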
#endif	/* CONFIG_SYSFS */
