#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <net/sock.h>

/* pidlists linger the following amount before being destroyed */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* room for "<subsys-name>.<cft-name>" plus the '.' and the terminating NUL */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/*
 * cgroup_mutex and css_set_lock protect most cgroup state.  Expose them
 * (and export them for lockdep) when CONFIG_PROVE_RCU is set so that
 * RCU-protected accessors can verify them with lockdep_is_held().
 */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#else
static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_SPINLOCK(css_set_lock);
#endif

/*
 * Protects cgroup ID allocation so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn so that notifications stay synchronized
 * against file removal.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/* protects cgroup_root->release_agent_path */
static DEFINE_SPINLOCK(release_agent_path_lock);

struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

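/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */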
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)							\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);	\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);		\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);		\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

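/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */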
struct cgroup_root cgrp_dfl_root;
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static unsigned long cgrp_dfl_implicit_ss_mask;

/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

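/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */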
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * calling cgroup_subsys ops unnecessarily in the fork/exit paths.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= { .counter = 2, },
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static u16 have_canfork_callback __read_mostly;

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];

static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_ssid_enabled() can only be called after cgroup_init().
 */
static bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

static bool cgroup_ssid_no_v1(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}

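/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup.  This function can
 * be used to test whether a cgroup is on it.
 */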
static bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *parent_css = cgrp->self.parent;

	if (parent_css)
		return container_of(parent_css, struct cgroup, self);
	return NULL;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent)
		return parent->subtree_control;

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent)
		return parent->subtree_ss_mask;

	return cgrp->root->subsys_mask;
}

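/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */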
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
						struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly.  Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

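/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */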
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return !(cgrp->self.flags & CSS_ONLINE);
}

static void cgroup_get(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

static bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file.  Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else

/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) {					\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)

/* iterate across the hierarchies */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

static void cgroup_release_agent(struct work_struct *work);
static void check_for_release(struct cgroup *cgrp);

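/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */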
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.  Also used to anchor the list of css_sets.  Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
};

static int css_set_count = 1;	/* 1 for init_css_set */

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 * count is propagated towards root so that a given cgroup's count is zero
 * iff the cgroup and all its descendants are empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	lockdep_assert_held(&css_set_lock);

	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

		check_for_release(cgrp);
		cgroup_file_notify(&cgrp->events_file);

		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * ->populated_cnt of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be %NULL)
 * @to_cset: new css_set @task is being moved to (may be %NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be %NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be %NULL too.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		struct css_task_iter *it, *pos;

		WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		rcu_assign_pointer(task->cgroups, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

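/*
 * hash table for cgroup groups.  This improves the performance to find
 * an existing css_set.  This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */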
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

static void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!atomic_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead.  Unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	kfree_rcu(cset, rcu_head);
}

static void put_css_set(struct css_set *cset)
{
	unsigned long flags;

	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it.  Dropping from a count greater than one doesn't
	 * need the lock.
	 */
	if (atomic_add_unless(&cset->refcount, -1, 1))
		return;

	spin_lock_irqsave(&css_set_lock, flags);
	put_css_set_locked(cset);
	spin_unlock_irqrestore(&css_set_lock, flags);
}

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cset)
{
	atomic_inc(&cset->refcount);
}

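/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */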
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;

		/* See if we reached the end - both lists are equal length */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}

		/* Locate the cgroups associated with these links */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;

		/* Hierarchies should be linked in the same order */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in
	 * the new css_set.  While subsystems can change globally, the
	 * entries here won't change once we are holding cgroup_mutex.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get(cgrp);
}

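/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */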
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	atomic_set(&cset->refcount, 1);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_HLIST_NODE(&cset->hlist);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/* Add reference counts and links from the new css_set */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	return cset;
}

static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

static void cgroup_free_root(struct cgroup_root *root)
{
	if (root) {
		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}

static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

/*
 * look up cgroup associated with current task's cgroup namespace on the
 * specified hierarchy
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

/*
 * Return the cgroup for "task" from the given hierarchy.  Must be
 * called with cgroup_mutex and css_set_lock held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
					    struct cgroup_root *root)
{
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	else
		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		/*
		 * Mask out subsystems which aren't available.  This can
		 * happen only if some depended-upon subsystems were bound
		 * to non-default hierarchies.
		 */
		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.
 */
static void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}

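/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
 */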
static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
					  bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  cgroup liveliness check alone provides enough
	 * protection against removal.  Ensure @cgrp stays accessible and
	 * break the active_ref protection.
	 */
	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	list_for_each_entry(cfts, &css->ss->cfts, node)
		cgroup_addrm_files(css, cgrp, cfts, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_dfl_base_files;
		else
			cfts = cgroup_legacy_base_files;

		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
	}

	list_for_each_entry(cfts, &css->ss->cfts, node) {
		ret = cgroup_addrm_files(css, cgrp, cfts, true);
		if (ret < 0) {
			failed_cfts = cfts;
			goto err;
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}

static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, can't move.
		 * If @ss is an implicit controller, it is exempt from this
		 * rule and can be stolen.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* rebind */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
			    struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

static int cgroup_show_options(struct seq_file *seq,
			       struct kernfs_root *kf_root)
{
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_subsys *ss;
	int ssid;

	if (root != &cgrp_dfl_root)
		for_each_subsys(ss, ssid)
			if (root->subsys_mask & (1 << ssid))
				seq_show_option(seq, ss->legacy_name, NULL);
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");

	spin_lock(&release_agent_path_lock);
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	spin_unlock(&release_agent_path_lock);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	return 0;
}

struct cgroup_sb_opts {
	u16 subsys_mask;
	unsigned int flags;
	char *release_agent;
	bool cpuset_clone_children;
	char *name;
	/* User explicitly requested empty subsystem */
	bool none;
};

static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
	u16 mask = U16_MAX;
	struct cgroup_subsys *ss;
	int nr_opts = 0;
	int i;

#ifdef CONFIG_CPUSETS
	mask = ~((u16)1 << cpuset_cgrp_id);
#endif

	memset(opts, 0, sizeof(*opts));

	while ((token = strsep(&o, ",")) != NULL) {
		nr_opts++;

		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "none")) {
			/* Explicitly have no subsystems */
			opts->none = true;
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
		if (!strcmp(token, "noprefix")) {
			opts->flags |= CGRP_ROOT_NOPREFIX;
			continue;
		}
		if (!strcmp(token, "clone_children")) {
			opts->cpuset_clone_children = true;
			continue;
		}
		if (!strcmp(token, "xattr")) {
			opts->flags |= CGRP_ROOT_XATTR;
			continue;
		}
		if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent =
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
			const char *name = token + 5;

			if (!strlen(name))
				return -EINVAL;

			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}

			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
					      MAX_CGROUP_ROOT_NAMELEN - 1,
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;

			continue;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->legacy_name))
				continue;
			if (!cgroup_ssid_enabled(i))
				continue;
			if (cgroup_ssid_no_v1(i))
				continue;

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
			opts->subsys_mask |= (1 << i);
			one_ss = true;

			break;
		}
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

	/*
	 * If the 'all' option was specified select all the subsystems,
	 * otherwise if 'none', 'name=' and a subsystem name options were
	 * not specified, let's default to 'all'
	 */
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
				opts->subsys_mask |= (1 << i);

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!opts->subsys_mask && !opts->name)
		return -EINVAL;

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
		return -EINVAL;

	/* Can't specify "none" and some subsystems */
	if (opts->subsys_mask && opts->none)
		return -EINVAL;

	return 0;
}

static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	int ret = 0;
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_sb_opts opts;
	u16 added_mask, removed_mask;

	if (root == &cgrp_dfl_root) {
		pr_err("remount is not allowed\n");
		return -EINVAL;
	}

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);

	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;

	/* Don't allow flags or name to change at remount */
	if ((opts.flags ^ root->flags) ||
	    (opts.name && strcmp(opts.name, root->name))) {
		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
		       opts.flags, opts.name ?: "", root->flags, root->name);
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask);
	if (ret)
		goto out_unlock;

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));

	if (opts.release_agent) {
		spin_lock(&release_agent_path_lock);
		strcpy(root->release_agent_path, opts.release_agent);
		spin_unlock(&release_agent_path_lock);
	}
 out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

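/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 */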
static bool use_task_css_set_links __read_mostly;

static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;

	spin_lock_irq(&css_set_lock);

	if (use_task_css_set_links)
		goto out_unlock;

	use_task_css_set_links = true;

	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			     task_css_set(p) != &init_css_set);

		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 * Do it while holding siglock so that we don't end up
		 * racing against cgroup_exit().
		 */
		spin_lock(&p->sighand->siglock);
		if (!(p->flags & PF_EXITING)) {
			struct css_set *cset = task_css_set(p);

			if (!css_set_populated(cset))
				css_set_update_populated(cset, true);
			list_add_tail(&p->cg_list, &cset->tasks);
			get_css_set(cset);
		}
		spin_unlock(&p->sighand->siglock);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
out_unlock:
	spin_unlock_irq(&css_set_lock);
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ssid;

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	cgrp->self.cgroup = cgrp;
	cgrp->self.flags |= CSS_ONLINE;

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
}

static void init_cgroup_root(struct cgroup_root *root,
			     struct cgroup_sb_opts *opts)
{
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);
	idr_init(&root->cgroup_idr);

	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;
	root_cgrp->ancestor_ids[0] = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
			      GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock
	 * here, but that's OK - it can only be increased by someone
	 * holding cgroup_lock, and that's us.  Later rebinding may
	 * disable controllers on the default hierarchy and thus create
	 * new csets which can't be more than the existing ones.
	 * Allocate 2x.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
				   int flags, const char *unused_dev_name,
				   void *data)
{
	bool is_v2 = fs_type == &cgroup2_fs_type;
	struct super_block *pinned_sb = NULL;
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup_subsys *ss;
	struct cgroup_root *root;
	struct cgroup_sb_opts opts;
	struct dentry *dentry;
	int ret;
	int i;
	bool new_sb;

	get_cgroup_ns(ns);

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
		put_cgroup_ns(ns);
		return ERR_PTR(-EPERM);
	}

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	if (is_v2) {
		if (data) {
			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
			put_cgroup_ns(ns);
			return ERR_PTR(-EINVAL);
		}
		cgrp_dfl_visible = true;
		root = &cgrp_dfl_root;
		cgroup_get(&root->cgrp);
		goto out_mount;
	}

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		/*
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
		 */
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ns != &init_cgroup_ns) {
		ret = -EPERM;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret) {
		put_cgroup_ns(ns);
		return ERR_PTR(ret);
	}
out_mount:
	dentry = kernfs_mount(fs_type, flags, root->kf_root,
			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
			      &new_sb);

	/*
	 * In non-init cgroup namespace, instead of root cgroup's dentry,
	 * we return the dentry corresponding to the cgroupns->root_cgrp.
	 */
	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ns->root_cset, root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
		dput(dentry);
		dentry = nsdentry;
	}

	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb) {
		WARN_ON(new_sb);
		deactivate_super(pinned_sb);
	}

	put_cgroup_ns(ns);
	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	/*
	 * If @root doesn't have any mounts or children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 * cgroup_mount() may wait for @root's release.
	 *
	 * And don't kill the default root.
	 */
	if (!list_empty(&root->cgrp.self.children) ||
	    root == &cgrp_dfl_root)
		cgroup_put(&root->cgrp);
	else
		percpu_ref_kill(&root->cgrp.self.refcnt);

	kernfs_kill_sb(sb);
}

static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name = "cgroup2",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
				   struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
	int ret;

	ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
	if (ret < 0 || ret >= buflen)
		return NULL;
	return buf;
}

char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		     struct cgroup_namespace *ns)
{
	char *ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

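/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */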
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	char *path = NULL;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
		if (strlcpy(buf, "/", buflen) < buflen)
			path = buf;
	}

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	return path;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

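/* used to track tasks and other necessary states during migration */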
struct cgroup_taskset {
	/* the src and dst cset list running through cset->mg_node */
	struct list_head	src_csets;
	struct list_head	dst_csets;

	/* the subsys currently being processed */
	int			ssid;

	/*
	 * Fields for cgroup_taskset_*() iteration.
	 *
	 * Before migration is committed, the target migration tasks are on
	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
	 * or ->dst_csets depending on whether migration is committed.
	 *
	 * ->cur_cset and ->cur_task point to the current position during
	 * iteration.
	 */
	struct list_head	*csets;
	struct css_set		*cur_cset;
	struct task_struct	*cur_task;
};

#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
	.csets			= &tset.src_csets,			\
}

/**
 * cgroup_taskset_add - try to add a migration target task to a taskset
 * @task: target task
 * @tset: target taskset
 *
 * Add @task, which is a migration target, to @tset.  This function becomes
 * noop if @task doesn't need to be migrated.  @task's css_set should have
 * been added as a migration source and @task->cg_list will be moved from
 * the css_set's tasks list to mg_tasks one.
 */
static void cgroup_taskset_add(struct task_struct *task,
			       struct cgroup_taskset *tset)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* leave @task alone if post_fork() hasn't linked it yet */
	if (list_empty(&task->cg_list))
		return;

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node, &tset->src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_move_tail(&cset->mg_dst_cset->mg_node,
			       &tset->dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_taskset_migrate().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}

/**
 * cgroup_taskset_migrate - migrate a taskset
 * @tset: target taskset
 * @root: cgroup root the migration is taking place on
 *
 * Migrate tasks in @tset as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @tset are migrated.
 * @tset is consumed regardless of success.
 */
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
				  struct cgroup_root *root)
{
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* methods shouldn't be called if no task is actually migrating */
	if (list_empty(&tset->src_csets))
		return 0;

	/* check that we can legitimately attach to the cgroup */
	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
		if (ss->can_attach) {
			tset->ssid = ssid;
			ret = ss->can_attach(tset);
			if (ret) {
				failed_ssid = ssid;
				goto out_cancel_attach;
			}
		}
	} while_each_subsys_mask();

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			css_set_move_task(task, from_cset, to_cset, true);
			put_css_set_locked(from_cset);
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
		if (ss->attach) {
			tset->ssid = ssid;
			ss->attach(tset);
		}
	} while_each_subsys_mask();

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
		if (ssid == failed_ssid)
			break;
		if (ss->cancel_attach) {
			tset->ssid = ssid;
			ss->cancel_attach(tset);
		}
	} while_each_subsys_mask();
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);
	return ret;
}

/**
 * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the root, subtree_control must be
 * zero for migration destination cgroups with tasks so that child cgroups
 * don't compete against tasks.
 */
static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
		!dst_cgrp->subtree_control;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @preloaded_csets: list of preloaded css_sets
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);
	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}
	spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @preloaded_csets: list of preloaded css_sets
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @preloaded_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
static void cgroup_migrate_add_src(struct css_set *src_cset,
				   struct cgroup *dst_cgrp,
				   struct list_head *preloaded_csets)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	/*
	 * If ->dead, @src_set is associated with one or more dead cgroups
	 * and doesn't contain any migratable tasks.  Ignore it early so
	 * that the rest of migration path doesn't get confused by it.
	 */
	if (src_cset->dead)
		return;

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(src_cset->mg_dst_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add(&src_cset->mg_preload_node, preloaded_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @preloaded_csets: list of preloaded source css_sets
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @preloaded_csets.  This function looks up and pins all
 * destination css_sets, links each to its source, and appends them to
 * @preloaded_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @preloaded_csets.
 */
static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
{
	LIST_HEAD(csets);
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/* look up the dst cset for each src cset and link it to src */
	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
		struct css_set *dst_cset;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			goto err;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add(&dst_cset->mg_preload_node, &csets);
		else
			put_css_set(dst_cset);
	}

	list_splice_tail(&csets, preloaded_csets);
	return 0;
err:
	cgroup_migrate_finish(&csets);
	return -ENOMEM;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @root: cgroup root migration is taking place on
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 */
static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			  struct cgroup_root *root)
{
	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot.  Tasks that
	 * are already PF_EXITING could be freed from underneath us unless
	 * we take an rcu_read_lock.
	 */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_taskset_add(task, &tset);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_taskset_migrate(&tset, root);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;

	if (!cgroup_may_migrate_to(dst_cgrp))
		return -EBUSY;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);

	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}

static int cgroup_procs_write_permission(struct task_struct *task,
					 struct cgroup *dst_cgrp,
					 struct kernfs_open_file *of)
{
	const struct cred *cred = current_cred();
	const struct cred *tcred = get_task_cred(task);
	int ret = 0;

	/*
	 * even if we're attaching all tasks in the thread group, we only
	 * need to check permissions on one of them.
	 */
	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	    !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->euid, tcred->suid))
		ret = -EACCES;

	if (!ret && cgroup_on_dfl(dst_cgrp)) {
		struct super_block *sb = of->file->f_path.dentry->d_sb;
		struct cgroup *cgrp;
		struct inode *inode;

		spin_lock_irq(&css_set_lock);
		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
		spin_unlock_irq(&css_set_lock);

		while (!cgroup_is_descendant(dst_cgrp, cgrp))
			cgrp = cgroup_parent(cgrp);

		ret = -ENOMEM;
		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
		if (inode) {
			ret = inode_permission(inode, MAY_WRITE);
			iput(inode);
		}
	}

	put_cred(tcred);
	return ret;
}

2877
2878
2879
2880
2881
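/*
 * Find the task_struct of the task to attach by vpid and pass it along to
 * the function to attach either it or all tasks in its threadgroup.  Takes
 * cgroup_mutex (via cgroup_kn_lock_live()) and cgroup_threadgroup_rwsem.
 */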
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    size_t nbytes, loff_t off, bool threadgroup)
{
	struct task_struct *tsk;
	struct cgroup_subsys *ss;
	struct cgroup *cgrp;
	pid_t pid;
	int ssid, ret;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return -EINVAL;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	percpu_down_write(&cgroup_threadgroup_rwsem);
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			ret = -ESRCH;
			goto out_unlock_rcu;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

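	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset, or RT kthread may be born in a
	 * cgroup with no rt_runtime allocated.  Just say no.
	 */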
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out_unlock_rcu;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	ret = cgroup_procs_write_permission(tsk, cgrp, of);
	if (!ret)
		ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	put_task_struct(tsk);
	goto out_unlock_threadgroup;

out_unlock_rcu:
	rcu_read_unlock();
out_unlock_threadgroup:
	percpu_up_write(&cgroup_threadgroup_rwsem);
	for_each_subsys(ss, ssid)
		if (ss->post_attach)
			ss->post_attach();
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

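/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */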
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		if (root == &cgrp_dfl_root)
			continue;

		spin_lock_irq(&css_set_lock);
		from_cgrp = task_cgroup_from_root(from, root);
		spin_unlock_irq(&css_set_lock);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, false);
}

static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, true);
}

static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}

static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (printed)
			seq_putc(seq, ' ');
		seq_printf(seq, "%s", ss->name);
		printed = true;
	} while_each_subsys_mask();
	if (printed)
		seq_putc(seq, '\n');
}

/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
	return 0;
}

/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->subtree_control);
	return 0;
}

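/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated
 * css_sets and migrates the tasks to the new ones.
 */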
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	LIST_HEAD(preloaded_csets);
	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* look up all csses currently attached to @cgrp's subtree */
	spin_lock_irq(&css_set_lock);
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &dsct->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, dsct,
					       &preloaded_csets);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
	if (ret)
		goto out_finish;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
		struct task_struct *task, *ntask;

		/* src_csets precede dst_csets, break on the first dst_cset */
		if (!src_cset->mg_src_cgrp)
			break;

		/* all tasks in src_csets need to be migrated */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_taskset_add(task, &tset);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_taskset_migrate(&tset, cgrp->root);
out_finish:
	cgroup_migrate_finish(&preloaded_csets);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}

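/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */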
static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

restart:
	mutex_lock(&cgroup_mutex);

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

			if (!css || !percpu_ref_is_dying(&css->refcnt))
				continue;

			cgroup_get(dsct);
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
			goto restart;
		}
	}
}

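/**
 * cgroup_save_control - save control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control and ->subtree_ss_mask to the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */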
static void cgroup_save_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->old_subtree_control = dsct->subtree_control;
		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
	}
}

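/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */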
static void cgroup_propagate_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->subtree_control &= cgroup_control(dsct);
		dsct->subtree_ss_mask =
			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
						    cgroup_ss_mask(dsct));
	}
}

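/**
 * cgroup_restore_control - restore control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
 * prefixed fields for @cgrp's subtree including @cgrp itself.
 */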
static void cgroup_restore_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		dsct->subtree_control = dsct->old_subtree_control;
		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
	}
}

static bool css_visible(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	if (cgroup_control(cgrp) & (1 << ss->id))
		return true;
	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
		return false;
	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}

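/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */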
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid, ret;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				continue;

			if (!css) {
				css = css_create(dsct, ss);
				if (IS_ERR(css))
					return PTR_ERR(css);
			}

			if (css_visible(css)) {
				ret = css_populate_dir(css);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}

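/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and css_visible().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and be in the vanilla state if it's made visible again later.
 * Controllers which may be depended upon should provide ->css_reset() for
 * this purpose.
 */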
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));

			if (!css)
				continue;

			if (css->parent &&
			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
				kill_css(css);
			} else if (!css_visible(css)) {
				css_clear_dir(css);
				if (ss->css_reset)
					ss->css_reset(css);
			}
		}
	}
}

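/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 */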
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int ret;

	cgroup_propagate_control(cgrp);

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		return ret;

	/*
	 * At this point, cgroup_e_css() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree.
	 */
	ret = cgroup_update_dfl_csses(cgrp);
	if (ret)
		return ret;

	return 0;
}

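/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */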
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret) {
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}

/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	u16 enable = 0, disable = 0;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
	int ssid, ret;

	/*
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -.
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
		if (tok[0] == '\0')
			continue;
		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
			} else if (*tok == '-') {
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
			} else {
				return -EINVAL;
			}
			break;
		} while_each_subsys_mask();
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENODEV;

	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
			if (cgrp->subtree_control & (1 << ssid)) {
				enable &= ~(1 << ssid);
				continue;
			}

			if (!(cgroup_control(cgrp) & (1 << ssid))) {
				ret = -ENOENT;
				goto out_unlock;
			}
		} else if (disable & (1 << ssid)) {
			if (!(cgrp->subtree_control & (1 << ssid))) {
				disable &= ~(1 << ssid);
				continue;
			}

			/* a child has it enabled? */
			cgroup_for_each_live_child(child, cgrp) {
				if (child->subtree_control & (1 << ssid)) {
					ret = -EBUSY;
					goto out_unlock;
				}
			}
		}
	}

	if (!enable && !disable) {
		ret = 0;
		goto out_unlock;
	}

	/*
	 * Except for the root, subtree_control must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
	if (enable && cgroup_parent(cgrp)) {
		struct cgrp_cset_link *link;

		/*
		 * Walk the csets linked to @cgrp and fail if any of them
		 * is populated with tasks.
		 */
		spin_lock_irq(&css_set_lock);

		ret = 0;
		list_for_each_entry(link, &cgrp->cset_links, cset_link) {
			if (css_set_populated(link->cset)) {
				ret = -EBUSY;
				break;
			}
		}

		spin_unlock_irq(&css_set_lock);

		if (ret)
			goto out_unlock;
	}

	/* save and update control masks and prepare csses */
	cgroup_save_control(cgrp);

	cgrp->subtree_control |= enable;
	cgrp->subtree_control &= ~disable;

	ret = cgroup_apply_control(cgrp);

	cgroup_finalize_control(cgrp, ret);

	kernfs_activate(cgrp->kn);
	ret = 0;
out_unlock:
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

static int cgroup_events_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "populated %d\n",
		   cgroup_is_populated(seq_css(seq)->cgroup));
	return 0;
}

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of->kn->priv;
	struct cgroup_subsys_state *css;
	int ret;

	if (cft->write)
		return cft->write(of, buf, nbytes, off);

	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();

	if (cft->write_u64) {
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
	} else {
		ret = -EINVAL;
	}

	return ret ?: nbytes;
}

static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
	return seq_cft(seq)->seq_start(seq, ppos);
}

static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	return seq_cft(seq)->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
	seq_cft(seq)->seq_stop(seq, v);
}

static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cftype *cft = seq_cft(m);
	struct cgroup_subsys_state *css = seq_css(m);

	if (cft->seq_show)
		return cft->seq_show(m, arg);

	if (cft->read_u64)
		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
	else if (cft->read_s64)
		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
	else
		return -EINVAL;
	return 0;
}

static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_show		= cgroup_seqfile_show,
};

static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};

static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			 const char *new_name_str)
{
	struct cgroup *cgrp = kn->priv;
	int ret;

	if (kernfs_type(kn) != KERNFS_DIR)
		return -ENOTDIR;
	if (kn->parent != new_parent)
		return -EIO;

	/*
	 * This isn't a proper migration and its usefulness is very
	 * limited.  Disallow on the default hierarchy.
	 */
	if (cgroup_on_dfl(cgrp))
		return -EPERM;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  kernfs_rename() doesn't require active_ref
	 * protection.  Break them before grabbing cgroup_mutex.
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
	return ret;
}

/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
			       .ia_uid = current_fsuid(),
			       .ia_gid = current_fsgid(), };

	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
		return 0;

	return kernfs_setattr(kn, &iattr);
}

static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
			   struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];
	struct kernfs_node *kn;
	struct lock_class_key *key = NULL;
	int ret;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	key = &cft->lockdep_key;
#endif
	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
				  NULL, key);
	if (IS_ERR(kn))
		return PTR_ERR(kn);

	ret = cgroup_kn_set_ugid(kn);
	if (ret) {
		kernfs_remove(kn);
		return ret;
	}

	if (cft->file_offset) {
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = kn;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	return 0;
}

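/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */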
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
{
	struct cftype *cft, *cft_end = NULL;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

restart:
	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
			continue;

		if (is_add) {
			ret = cgroup_add_file(css, cgrp, cft);
			if (ret) {
				pr_warn("%s: failed to add %s, err=%d\n",
					__func__, cft->name, ret);
				cft_end = cft;
				is_add = false;
				goto restart;
			}
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return ret;
}

static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
	LIST_HEAD(pending);
	struct cgroup_subsys *ss = cfts[0].ss;
	struct cgroup *root = &ss->root->cgrp;
	struct cgroup_subsys_state *css;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	/* add/rm files for all cgroups created before */
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
		struct cgroup *cgrp = css->cgroup;

		if (!(css->flags & CSS_VISIBLE))
			continue;

		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
		if (ret)
			break;
	}

	if (is_add && !ret)
		kernfs_activate(root->kn);
	return ret;
}

static void cgroup_exit_cftypes(struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* free copy for custom atomic_write_len, see init_cftypes() */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
			kfree(cft->kf_ops);
		cft->kf_ops = NULL;
		cft->ss = NULL;

		/* revert flags set by cgroup core while adding @cfts */
		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
	}
}

static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

		WARN_ON(cft->ss || cft->kf_ops);

		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * If @cft wants a custom max_write_len, we need to make a
		 * copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}

		cft->kf_ops = kf_ops;
		cft->ss = ss;
	}

	return 0;
}

static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!cfts || !cfts[0].ss)
		return -ENOENT;

	list_del(&cfts->node);
	cgroup_apply_cftypes(cfts, false);
	cgroup_exit_cftypes(cfts);
	return 0;
}

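/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */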
int cgroup_rm_cftypes(struct cftype *cfts)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = cgroup_rm_cftypes_locked(cfts);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

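/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */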
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	int ret;

	if (!cgroup_ssid_enabled(ss->id))
		return 0;

	if (!cfts || cfts[0].name[0] == '\0')
		return 0;

	ret = cgroup_init_cftypes(ss, cfts);
	if (ret)
		return ret;

	mutex_lock(&cgroup_mutex);

	list_add_tail(&cfts->node, &ss->cfts);
	ret = cgroup_apply_cftypes(cfts, true);
	if (ret)
		cgroup_rm_cftypes_locked(cfts);

	mutex_unlock(&cgroup_mutex);
	return ret;
}

/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the default hierarchy.
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_ONLY_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the legacy hierarchies.
 */
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_NOT_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}

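/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 */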
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long flags;

	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
	if (cfile->kn)
		kernfs_notify(cfile->kn);
	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}

/*
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup.  The count is derived from
 * css_set refcounts and may be higher than the actual number of tasks.
 */
static int cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += atomic_read(&link->cset->refcount);
	spin_unlock_irq(&css_set_lock);
	return count;
}

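/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible; the next sibling is returned
 * regardless of their states.
 */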
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
					   struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/*
	 * @pos could already have been unlinked from the sibling list.
	 * Once a css is removed, its ->sibling.next is no longer updated
	 * when its next sibling changes.  CSS_RELEASED is set when @pos is
	 * taken off the list and, as CSS_RELEASED is set under
	 * cgroup_mutex, walking @parent's children for the first css with
	 * a higher serial number is guaranteed to find the live successor
	 * of @pos.
	 */
	if (!pos) {
		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
	} else if (likely(!(pos->flags & CSS_RELEASED))) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
	} else {
		list_for_each_entry_rcu(next, &parent->children, sibling)
			if (next->serial_nr > pos->serial_nr)
				break;
	}

	/*
	 * @next, if not pointing to the head, can be dereferenced and is
	 * the next sibling.
	 */
	if (&next->sibling != &parent->children)
		return next;
	return NULL;
}

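/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.
 */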
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
			struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/* if first iteration, visit @root */
	if (!pos)
		return root;

	/* visit the first child if exists */
	next = css_next_child(NULL, pos);
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
	while (pos != root) {
		next = css_next_child(pos, pos->parent);
		if (next)
			return next;
		pos = pos->parent;
	}

	return NULL;
}

/*
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip the
 * subtree rooted at @pos.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last, *tmp;

	cgroup_assert_mutex_or_rcu_locked();

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
		css_for_each_child(tmp, last)
			pos = tmp;
	} while (pos);

	return last;
}

static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last;

	do {
		last = pos;
		pos = css_next_child(NULL, pos);
	} while (pos);

	return last;
}

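/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 */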
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/* if first iteration, visit leftmost descendant which may be @root */
	if (!pos)
		return css_leftmost_descendant(root);

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
	next = css_next_child(pos, pos->parent);
	if (next)
		return css_leftmost_descendant(next);

	/* no sibling left, visit parent */
	return pos->parent;
}

/*
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *child;
	bool ret = false;

	rcu_read_lock();
	css_for_each_child(child, css) {
		if (child->flags & CSS_ONLINE) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

/*
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
	struct list_head *l = it->cset_pos;
	struct cgrp_cset_link *link;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* advance to the next cset, skipping empty ones */
	do {
		l = l->next;
		if (l == it->cset_head) {
			it->cset_pos = NULL;
			it->task_pos = NULL;
			return;
		}

		if (it->ss) {
			cset = container_of(l, struct css_set,
					    e_cset_node[it->ss->id]);
		} else {
			link = list_entry(l, struct cgrp_cset_link, cset_link);
			cset = link->cset;
		}
	} while (!css_set_populated(cset));

	it->cset_pos = l;

	if (!list_empty(&cset->tasks))
		it->task_pos = cset->tasks.next;
	else
		it->task_pos = cset->mg_tasks.next;

	it->tasks_head = &cset->tasks;
	it->mg_tasks_head = &cset->mg_tasks;

	/*
	 * We don't keep css_set_lock across iteration steps and thus need
	 * to take steps to ensure that iteration can be resumed after the
	 * lock is re-acquired.  Registering @it on @cset->task_iters pins
	 * the cset and lets the migration path fix up iterators whose
	 * current position is being moved away.
	 */
	if (it->cur_cset) {
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
	}
	get_css_set(cset);
	it->cur_cset = cset;
	list_add(&it->iters_node, &cset->task_iters);
}

static void css_task_iter_advance(struct css_task_iter *it)
{
	struct list_head *l = it->task_pos;

	lockdep_assert_held(&css_set_lock);
	WARN_ON_ONCE(!l);

	/*
	 * Advance iterator to find next entry.  cset->tasks is consumed
	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
	 * next cset.
	 */
	l = l->next;

	if (l == it->tasks_head)
		l = it->mg_tasks_head->next;

	if (l == it->mg_tasks_head)
		css_task_iter_advance_css_set(it);
	else
		it->task_pos = l;
}

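/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 */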
void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it)
{
	/* no one should try to iterate before mounting cgroups */
	WARN_ON_ONCE(!use_task_css_set_links);

	memset(it, 0, sizeof(*it));

	spin_lock_irq(&css_set_lock);

	it->ss = css->ss;

	if (it->ss)
		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
	else
		it->cset_pos = &css->cgroup->cset_links;

	it->cset_head = it->cset_pos;

	css_task_iter_advance_css_set(it);

	spin_unlock_irq(&css_set_lock);
}

/*
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
	if (it->cur_task) {
		put_task_struct(it->cur_task);
		it->cur_task = NULL;
	}

	spin_lock_irq(&css_set_lock);

	if (it->task_pos) {
		it->cur_task = list_entry(it->task_pos, struct task_struct,
					  cg_list);
		get_task_struct(it->cur_task);
		css_task_iter_advance(it);
	}

	spin_unlock_irq(&css_set_lock);

	return it->cur_task;
}

/*
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */
void css_task_iter_end(struct css_task_iter *it)
{
	if (it->cur_cset) {
		spin_lock_irq(&css_set_lock);
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
		spin_unlock_irq(&css_set_lock);
	}

	if (it->cur_task)
		put_task_struct(it->cur_task);
}

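/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either in @from or in @to afterwards.
 */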
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	LIST_HEAD(preloaded_csets);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	if (!cgroup_may_migrate_to(to))
		return -EBUSY;

	mutex_lock(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		css_task_iter_start(&from->self, &it);
		task = css_task_iter_next(&it);
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, to->root);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	cgroup_migrate_finish(&preloaded_csets);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

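/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks.  So it may need several calls to read(), but
 * we cannot guarantee that the information we produce is correct unless we
 * produce it entirely atomically.
 */

/* which pidlist file are we talking about? */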
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of pids (or tgids) */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
};

/*
 * The following two functions "fix" the issue where there are more pids
 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
 */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
	if (PIDLIST_TOO_LARGE(count))
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}

static void pidlist_free(void *p)
{
	kvfree(p);
}

/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so i starts at 1. trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

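/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble sort order if on the
 * default hierarchy so that no such expectation exists in the new
 * interface.
 *
 * Scrambling is done by swapping every two consecutive bits, which is
 * non-trivial to detect yet relatively cheap.
 */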
static pid_t pid_fry(pid_t pid)
{
	unsigned a = pid & 0x55555555;
	unsigned b = pid & 0xAAAAAAAA;

	return (a << 1) | (b >> 1);
}

static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
	if (cgroup_on_dfl(cgrp))
		return pid_fry(pid);
	else
		return pid;
}

static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

static int fried_cmppid(const void *a, const void *b)
{
	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

/*
 * find the appropriate pidlist for our purpose (given procs vs tasks),
 * creating a new one if necessary.  Returns NULL if we're out of memory.
 */
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

	/* entry not found; create a new one */
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l)
		return l;

	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
	l->key.type = type;
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	css_task_iter_start(&cgrp->self, &it);
	while ((tsk = css_task_iter_next(&it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	if (cgroup_on_dfl(cgrp))
		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
	else
		sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);

	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}

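/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 */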
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	mutex_lock(&cgroup_mutex);

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
	 */
	rcu_read_lock();
	cgrp = rcu_dereference(kn->priv);
	if (!cgrp || cgroup_is_dead(cgrp)) {
		rcu_read_unlock();
		mutex_unlock(&cgroup_mutex);
		return -ENOENT;
	}
	rcu_read_unlock();

	css_task_iter_start(&cgrp->self, &it);
	while ((tsk = css_task_iter_next(&it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

/*
 * seq_file methods for the tasks/procs files.  The seq_file position is
 * the next pid to display; the iterator is a pointer to the pid in the
 * cgroup's pidlist array.
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open.  If the matching pidlist is around, we can use that.
	 * Look for it.  Note that @of->priv can't be used directly.  It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
				index = mid;
				break;
			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = cgroup_pid_fry(cgrp, *iter);
	return iter;
}

static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;

	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}

static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;
	pid_t *p = v;
	pid_t *end = l->list + l->length;
	/*
	 * Advance to the next pid in the array. If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
		return p;
	}
}

static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);

	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}

static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	return 0;
}

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}

static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}

/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_dfl_base_files[] = {
	{
		.name = "cgroup.procs",
		.file_offset = offsetof(struct cgroup, procs_file),
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.controllers",
		.seq_show = cgroup_controllers_show,
	},
	{
		.name = "cgroup.subtree_control",
		.seq_show = cgroup_subtree_control_show,
		.write = cgroup_subtree_control_write,
	},
	{
		.name = "cgroup.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cgroup, events_file),
		.seq_show = cgroup_events_show,
	},
	{ }	/* terminate */
};

/* cgroup core interface files for the legacy hierarchies */
static struct cftype cgroup_legacy_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup_tasks_write,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};

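/*
 * css destruction is a multi-stage process.
 *
 * 1. kill_css() removes the css's interface files and initiates killing
 *    of its percpu_ref, putting the base reference.  The css is no longer
 *    visible to userland.
 *
 * 2. When the percpu_ref is confirmed to be seen as killed on all CPUs,
 *    css_killed_ref_fn() bounces to css_killed_work_fn() which invokes
 *    ->css_offline() and puts the reference taken in kill_css().
 *
 * 3. When the reference count reaches zero, css_release() schedules
 *    css_release_work_fn() which unlinks the css and, after an RCU grace
 *    period, css_free_rcu_fn() / css_free_work_fn() free it.
 */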
static void css_free_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	percpu_ref_exit(&css->refcnt);

	if (ss) {
		/* css free path */
		struct cgroup_subsys_state *parent = css->parent;
		int id = css->id;

		ss->css_free(css);
		cgroup_idr_remove(&ss->css_idr, id);
		cgroup_put(cgrp);

		if (parent)
			css_put(parent);
	} else {
		/* cgroup free path */
		atomic_dec(&cgrp->root->nr_cgrps);
		cgroup_pidlist_destroy_all(cgrp);
		cancel_work_sync(&cgrp->release_agent_work);

		if (cgroup_parent(cgrp)) {
			/*
			 * We get a ref to the parent, and put the ref when
			 * this cgroup is being freed, so it's guaranteed
			 * that the parent won't be destroyed before its
			 * children.
			 */
			cgroup_put(cgroup_parent(cgrp));
			kernfs_put(cgrp->kn);
			kfree(cgrp);
		} else {
			/*
			 * This is root cgroup's refcnt reaching zero,
			 * which indicates that the root should be
			 * released.
			 */
			cgroup_destroy_root(cgrp->root);
		}
	}
}

static void css_free_rcu_fn(struct rcu_head *rcu_head)
{
	struct cgroup_subsys_state *css =
		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);

	INIT_WORK(&css->destroy_work, css_free_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

static void css_release_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	mutex_lock(&cgroup_mutex);

	css->flags |= CSS_RELEASED;
	list_del_rcu(&css->sibling);

	if (ss) {
		/* css release path */
		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
		if (ss->css_released)
			ss->css_released(css);
	} else {
		/* cgroup release path */
		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
		cgrp->id = -1;

		/*
		 * There are two control paths which try to determine
		 * cgroup from dentry without going through kernfs -
		 * cgroupstats_build() and css_tryget_online_from_dir().
		 * Those are supported by RCU protecting clearing of
		 * cgrp->kn->priv backpointer.
		 */
		if (cgrp->kn)
			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
					 NULL);
	}

	mutex_unlock(&cgroup_mutex);

	call_rcu(&css->rcu_head, css_free_rcu_fn);
}

static void css_release(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	INIT_WORK(&css->destroy_work, css_release_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

static void init_and_link_css(struct cgroup_subsys_state *css,
			      struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	lockdep_assert_held(&cgroup_mutex);

	cgroup_get(cgrp);

	memset(css, 0, sizeof(*css));
	css->cgroup = cgrp;
	css->ss = ss;
	css->id = -1;
	INIT_LIST_HEAD(&css->sibling);
	INIT_LIST_HEAD(&css->children);
	css->serial_nr = css_serial_nr_next++;
	atomic_set(&css->online_cnt, 0);

	if (cgroup_parent(cgrp)) {
		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
		css_get(css->parent);
	}

	BUG_ON(cgroup_css(cgrp, ss));
}

/* invoke ->css_online() on a new css and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online)
		ret = ss->css_online(css);
	if (!ret) {
		css->flags |= CSS_ONLINE;
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

		atomic_inc(&css->online_cnt);
		if (css->parent)
			atomic_inc(&css->parent->online_cnt);
	}
	return ret;
}

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;

	lockdep_assert_held(&cgroup_mutex);

	if (!(css->flags & CSS_ONLINE))
		return;

	if (ss->css_reset)
		ss->css_reset(css);

	if (ss->css_offline)
		ss->css_offline(css);

	css->flags &= ~CSS_ONLINE;
	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

	wake_up_all(&css->cgroup->offline_waitq);
}

/*
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success or an ERR_PTR value on
 * failure.
 */
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
	struct cgroup_subsys_state *css;
	int err;

	lockdep_assert_held(&cgroup_mutex);

	css = ss->css_alloc(parent_css);
	if (!css)
		css = ERR_PTR(-ENOMEM);
	if (IS_ERR(css))
		return css;

	init_and_link_css(css, ss, cgrp);

	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
	if (err)
		goto err_free_css;

	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
	if (err < 0)
		goto err_free_css;
	css->id = err;

	/* @css is ready to be brought online now, make it visible */
	list_add_tail_rcu(&css->sibling, &parent_css->children);
	cgroup_idr_replace(&ss->css_idr, css, css->id);

	err = online_css(css);
	if (err)
		goto err_list_del;

	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
	    cgroup_parent(parent)) {
		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
			current->comm, current->pid, ss->name);
		if (!strcmp(ss->name, "memory"))
			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
		ss->warned_broken_hierarchy = true;
	}

	return css;

err_list_del:
	list_del_rcu(&css->sibling);
err_free_css:
	call_rcu(&css->rcu_head, css_free_rcu_fn);
	return ERR_PTR(err);
}

static struct cgroup *cgroup_create(struct cgroup *parent)
{
	struct cgroup_root *root = parent->root;
	struct cgroup *cgrp, *tcgrp;
	int level = parent->level + 1;
	int ret;

	/* allocate the cgroup and its ID, 0 is reserved for the root */
	cgrp = kzalloc(sizeof(*cgrp) +
		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
	if (!cgrp)
		return ERR_PTR(-ENOMEM);

	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
	if (ret)
		goto out_free_cgrp;

	/*
	 * Temporarily set the pointer to NULL, so idr_find() won't return
	 * a half-baked cgroup.
	 */
	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
	if (cgrp->id < 0) {
		ret = -ENOMEM;
		goto out_cancel_ref;
	}

	init_cgroup_housekeeping(cgrp);

	cgrp->self.parent = &parent->self;
	cgrp->root = root;
	cgrp->level = level;

	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

	cgrp->self.serial_nr = css_serial_nr_next++;

	/* allocation complete, commit to creation */
	list_add_tail_rcu(&cgrp->self.sibling,
			  &cgroup_parent(cgrp)->self.children);
	atomic_inc(&root->nr_cgrps);
	cgroup_get(parent);

	/*
	 * @cgrp is now fully operational.  If something fails after this
	 * point, it'll be released via the normal destruction path.
	 */
	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);

	/*
	 * On the default hierarchy, a child doesn't automatically inherit
	 * subtree_control from the parent.  Each is configured manually.
	 */
	if (!cgroup_on_dfl(cgrp))
		cgrp->subtree_control = cgroup_control(cgrp);

	cgroup_propagate_control(cgrp);

	/* @cgrp doesn't have dir yet so the following will only create csses */
	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		goto out_destroy;

	return cgrp;

out_cancel_ref:
	percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
	kfree(cgrp);
	return ERR_PTR(ret);
out_destroy:
	cgroup_destroy_locked(cgrp);
	return ERR_PTR(ret);
}

static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
			umode_t mode)
{
	struct cgroup *parent, *cgrp;
	struct kernfs_node *kn;
	int ret;

	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
	if (strchr(name, '\n'))
		return -EINVAL;

	parent = cgroup_kn_lock_live(parent_kn, false);
	if (!parent)
		return -ENODEV;

	cgrp = cgroup_create(parent);
	if (IS_ERR(cgrp)) {
		ret = PTR_ERR(cgrp);
		goto out_unlock;
	}

	/* create the directory */
	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
	if (IS_ERR(kn)) {
		ret = PTR_ERR(kn);
		goto out_destroy;
	}
	cgrp->kn = kn;

	/*
	 * This extra ref will be put in css_free_work_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */
	kernfs_get(kn);

	ret = cgroup_kn_set_ugid(kn);
	if (ret)
		goto out_destroy;

	ret = css_populate_dir(&cgrp->self);
	if (ret)
		goto out_destroy;

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		goto out_destroy;

	/* everything is set up, let userland see it */
	kernfs_activate(kn);

	ret = 0;
	goto out_unlock;

out_destroy:
	cgroup_destroy_locked(cgrp);
out_unlock:
	cgroup_kn_unlock(parent_kn);
	return ret;
}

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 */
static void css_killed_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);

	mutex_lock(&cgroup_mutex);

	do {
		offline_css(css);
		css_put(css);
		/* @css can't go away while we're holding cgroup_mutex */
		css = css->parent;
	} while (css && atomic_dec_and_test(&css->online_cnt));

	mutex_unlock(&cgroup_mutex);
}

/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_destroy_wq, &css->destroy_work);
	}
}

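/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */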
static void kill_css(struct cgroup_subsys_state *css)
{
	lockdep_assert_held(&cgroup_mutex);

	/*
	 * This must happen before css is disassociated with its cgroup.
	 * See seq_css() for details.
	 */
	css_clear_dir(css);

	/*
	 * Killing would put the base ref, but we need to keep it alive
	 * until after ->css_offline().
	 */
	css_get(css);

	/*
	 * cgroup core guarantees that, by the time ->css_offline() is
	 * invoked, no new css reference will be given out via
	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
	 * proceed to offlining css's because percpu_ref_kill() doesn't
	 * guarantee that the ref is seen as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

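/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */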
5444static int cgroup_destroy_locked(struct cgroup *cgrp)
5445 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5446{
5447 struct cgroup_subsys_state *css;
5448 struct cgrp_cset_link *link;
5449 int ssid;
5450
5451 lockdep_assert_held(&cgroup_mutex);
5452
5453
5454
5455
5456
5457 if (cgroup_is_populated(cgrp))
5458 return -EBUSY;
5459
5460
5461
5462
5463
5464
5465 if (css_has_online_children(&cgrp->self))
5466 return -EBUSY;
5467
5468
5469
5470
5471
5472
5473
5474 cgrp->self.flags &= ~CSS_ONLINE;
5475
5476 spin_lock_irq(&css_set_lock);
5477 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5478 link->cset->dead = true;
5479 spin_unlock_irq(&css_set_lock);
5480
5481
5482 for_each_css(css, ssid, cgrp)
5483 kill_css(css);
5484
5485
5486
5487
5488
5489 kernfs_remove(cgrp->kn);
5490
5491 check_for_release(cgroup_parent(cgrp));
5492
5493
5494 percpu_ref_kill(&cgrp->self.refcnt);
5495
5496 return 0;
5497};
5498
5499static int cgroup_rmdir(struct kernfs_node *kn)
5500{
5501 struct cgroup *cgrp;
5502 int ret = 0;
5503
5504 cgrp = cgroup_kn_lock_live(kn, false);
5505 if (!cgrp)
5506 return 0;
5507
5508 ret = cgroup_destroy_locked(cgrp);
5509
5510 cgroup_kn_unlock(kn);
5511 return ret;
5512}
5513
5514static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5515 .remount_fs = cgroup_remount,
5516 .show_options = cgroup_show_options,
5517 .mkdir = cgroup_mkdir,
5518 .rmdir = cgroup_rmdir,
5519 .rename = cgroup_rename,
5520 .show_path = cgroup_show_path,
5521};
5522
5523static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5524{
5525 struct cgroup_subsys_state *css;
5526
5527 pr_debug("Initializing cgroup subsys %s\n", ss->name);
5528
5529 mutex_lock(&cgroup_mutex);
5530
5531 idr_init(&ss->css_idr);
5532 INIT_LIST_HEAD(&ss->cfts);
5533
5534
5535 ss->root = &cgrp_dfl_root;
5536 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5537
5538 BUG_ON(IS_ERR(css));
5539 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5540
5541
5542
5543
5544
5545 css->flags |= CSS_NO_REF;
5546
5547 if (early) {
5548
5549 css->id = 1;
5550 } else {
5551 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5552 BUG_ON(css->id < 0);
5553 }
5554
5555
5556
5557
5558
5559 init_css_set.subsys[ss->id] = css;
5560
5561 have_fork_callback |= (bool)ss->fork << ss->id;
5562 have_exit_callback |= (bool)ss->exit << ss->id;
5563 have_free_callback |= (bool)ss->free << ss->id;
5564 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5565
5566
5567
5568
5569 BUG_ON(!list_empty(&init_task.tasks));
5570
5571 BUG_ON(online_css(css));
5572
5573 mutex_unlock(&cgroup_mutex);
5574}
5575
5576
5577
5578
5579
5580
5581
5582int __init cgroup_init_early(void)
5583{
5584 static struct cgroup_sb_opts __initdata opts;
5585 struct cgroup_subsys *ss;
5586 int i;
5587
5588 init_cgroup_root(&cgrp_dfl_root, &opts);
5589 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5590
5591 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5592
5593 for_each_subsys(ss, i) {
5594 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5595 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5596 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5597 ss->id, ss->name);
5598 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5599 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5600
5601 ss->id = i;
5602 ss->name = cgroup_subsys_name[i];
5603 if (!ss->legacy_name)
5604 ss->legacy_name = cgroup_subsys_name[i];
5605
5606 if (ss->early_init)
5607 cgroup_init_subsys(ss, true);
5608 }
5609 return 0;
5610}
5611
static u16 cgroup_disable_mask __initdata;

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (cgroup_disable_mask & (1 << ssid)) {
			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
			printk(KERN_INFO "Disabling %s control group subsystem\n",
			       ss->name);
			continue;
		}

		if (cgroup_ssid_no_v1(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));

	return 0;
}
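
/*
 * Note: whether a controller takes the early or the late branch in
 * cgroup_init() above is decided by the early_init flag in its
 * cgroup_subsys.  A hypothetical controller definition (illustrative
 * sketch only, not a controller defined in this file) would opt in
 * like this:
 *
 *	struct cgroup_subsys example_cgrp_subsys = {
 *		.css_alloc	= example_css_alloc,
 *		.css_free	= example_css_free,
 *		.early_init	= true,
 *	};
 *
 * Early-init controllers get a provisional css->id of 1 in
 * cgroup_init_subsys() and have the real ID allocated in cgroup_init(),
 * once kmalloc and the IDR are usable.
 */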
5706
static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 * Use 1 for @max_active.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(), so we have to do it here.
	 */
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
	BUG_ON(!cgroup_destroy_wq);

	/*
	 * Used to destroy pidlists and separate to serve as flush
	 * domain.  Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);

	return 0;
}
core_initcall(cgroup_wq_init);
5731
/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf, *path;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						     current->nsproxy->cgroup_ns);
			if (!path) {
				retval = -ENAMETOOLONG;
				goto out_unlock;
			}
		} else {
			path = "/";
		}

		seq_puts(m, path);

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}
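
/*
 * Illustration of the format emitted above (hypothetical values): each
 * line is "hierarchy-id:controller-list:path", so a task might show
 *
 *	5:cpuacct,cpu:/user/1000
 *	1:name=systemd:/user.slice/user-1000.slice
 *	0::/init.scope
 *
 * where "0::" is the default (v2) hierarchy, which prints no controller
 * names, and "name=" identifies a named v1 hierarchy.
 */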
5809
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * all cgroup roots' hierarchy_ids and cgroup counts.
	 */
	mutex_lock(&cgroup_mutex);

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->legacy_name, ss->root->hierarchy_id,
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));

	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

static const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
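
/*
 * The file above backs /proc/cgroups.  Sample output (hypothetical
 * numbers, one tab-separated row per subsystem):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset		2		1		1
 *	cpu		3		4		1
 *	memory		0		55		1
 *
 * "hierarchy" is 0 for subsystems bound to the default hierarchy (its
 * hierarchy_id), and "enabled" reflects cgroup_ssid_enabled().
 */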
5845
/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the parent's css_set.  Empty cg_list indicates that
 * @child isn't holding a reference to its css_set yet.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}
5859
/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the task in question.
 *
 * This calls the subsystem can_fork() callbacks.  If the can_fork()
 * callback returns an error, the fork aborts with that error code.
 * This allows for a cgroup subsystem to conditionally allow or deny
 * new process creation.
 */
int cgroup_can_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	/* undo only the subsystems whose can_fork() already succeeded */
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child);
	}

	return ret;
}
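
/*
 * For orientation, the fork path invokes these hooks in a fixed order.
 * A condensed sketch of the caller in copy_process() (kernel/fork.c;
 * simplified, error labels illustrative):
 *
 *	cgroup_fork(p);
 *	...
 *	retval = cgroup_can_fork(p);
 *	if (retval)
 *		goto bad_fork;			// nothing left to undo here
 *	...					// task becomes visible
 *	cgroup_post_fork(p);
 *	...
 *	// on a late failure after can_fork() succeeded:
 *	cgroup_cancel_fork(p);
 *
 * so can_fork() is the only hook that may veto the fork.
 */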
5891
/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the task in question
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded.
 */
void cgroup_cancel_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child);
}
5908
/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary and
 * call the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	/*
	 * This may race against cgroup_enable_task_cg_lists().  As that
	 * function sets use_task_css_set_links before grabbing
	 * css_set_lock and links tasks while holding it, the following
	 * holds: if we don't see use_task_css_set_links set here, the
	 * enabling walk hasn't started yet and will find @child on the
	 * task list; if we do see it set, we grab css_set_lock and link
	 * @child ourselves unless the walk already did, which the
	 * list_empty() test below detects.
	 */
	if (use_task_css_set_links) {
		struct css_set *cset;

		spin_lock_irq(&css_set_lock);
		cset = task_css_set(current);
		if (list_empty(&child->cg_list)) {
			get_css_set(cset);
			css_set_move_task(child, NULL, cset, false);
		}
		spin_unlock_irq(&css_set_lock);
	}

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();
}
5966
/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * We set the exiting task's cgroup to the root cgroup.  We call the
 * subsystem exit() callbacks while the task is still competent to
 * handle them, then leave the task attached to the root cgroup in each
 * hierarchy for the remainder of its exit.  No need to bother with
 * init_css_set refcnting: init_css_set never goes away and we can't
 * race with the migration path - PF_EXITING is visible to it.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Unlink @tsk from its css_set.  As the migration path can't
	 * race with us, we can check cg_list without synchronization.
	 */
	cset = task_css_set(tsk);

	if (!list_empty(&tsk->cg_list)) {
		spin_lock_irq(&css_set_lock);
		css_set_move_task(tsk, cset, NULL, false);
		spin_unlock_irq(&css_set_lock);
	} else {
		get_css_set(cset);
	}

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}
6011
void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);
	struct cgroup_subsys *ss;
	int ssid;

	do_each_subsys_mask(ss, ssid, have_free_callback) {
		ss->free(task);
	} while_each_subsys_mask();

	put_css_set(cset);
}

static void check_for_release(struct cgroup *cgrp)
{
	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
		schedule_work(&cgrp->release_agent_work);
}
6031
/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf = NULL, *agentbuf = NULL, *path;
	char *argv[3], *envp[3];

	mutex_lock(&cgroup_mutex);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out;

	spin_lock_irq(&css_set_lock);
	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	if (!path)
		goto out;

	argv[0] = agentbuf;
	argv[1] = path;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	mutex_unlock(&cgroup_mutex);
	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
	goto out_free;
out:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(agentbuf);
	kfree(pathbuf);
}
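
/*
 * Userspace side, for reference (hypothetical paths): a v1 hierarchy
 * opts into this notification with
 *
 *	echo /sbin/my_release_agent > /sys/fs/cgroup/<hier>/release_agent
 *	echo 1 > /sys/fs/cgroup/<hier>/<cgroup>/notify_on_release
 *
 * after which the agent is invoked as "/sbin/my_release_agent <path>"
 * with <path> relative to the hierarchy root, matching argv[] above.
 */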
6093
static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;
			cgroup_disable_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
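
/*
 * Example (hypothetical command line): booting with
 *
 *	cgroup_disable=memory,perf_event
 *
 * sets the corresponding bits in cgroup_disable_mask; cgroup_init()
 * then turns off those subsystems' static enabled keys entirely.
 * Either the v2 name or the legacy name of a controller is accepted.
 */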
6114
static int __init cgroup_no_v1(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		if (!strcmp(token, "all")) {
			cgroup_no_v1_mask = U16_MAX;
			break;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			cgroup_no_v1_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
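
/*
 * Example (hypothetical command line): "cgroup_no_v1=all" keeps every
 * controller off v1 hierarchies so they all stay available to the
 * default hierarchy, while e.g. "cgroup_no_v1=memory" restricts only
 * the memory controller.  Unlike cgroup_disable=, the controllers stay
 * enabled; they just refuse to bind to v1 mounts.
 */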
6141
/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for
	 * details.
	 */
	cgrp = rcu_dereference(kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}
6181
/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's a valid one with the given id, otherwise
 * returns NULL.  Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}
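
/*
 * A minimal caller sketch (hypothetical, not part of this file): the
 * returned css is only RCU-protected, so pin it before leaving the
 * read-side critical section:
 *
 *	struct cgroup_subsys_state *css;
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 *	if (css) {
 *		...
 *		css_put(css);
 *	}
 */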
6195
/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it.  Returns ERR_PTR(-ENOENT) if @path
 * doesn't exist and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
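
/*
 * A minimal caller sketch (hypothetical path): @path is relative to the
 * cgroup2 mount point, not an absolute VFS path, and the reference must
 * be dropped with cgroup_put():
 *
 *	struct cgroup *cgrp;
 *
 *	cgrp = cgroup_get_from_path("/my_service");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */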
6229
/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * Find the cgroup from a fd which should be obtained by opening a cgroup
 * directory.  Returns a pointer to the cgroup on success.  ERR_PTR is
 * returned if the cgroup cannot be found.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	fput(f);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		/* only default hierarchy cgroups are accepted */
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
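
/*
 * A minimal caller sketch (hypothetical, not part of this file): @fd is
 * typically handed in by userspace, e.g. the result of
 * open("/sys/fs/cgroup/my_service", O_RDONLY) passed through a syscall
 * argument; the kernel side resolves and pins the cgroup:
 *
 *	struct cgroup *cgrp = cgroup_get_from_fd(fd);
 *
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 *
 * Note the v2-only check above: fds referring to v1 hierarchy
 * directories are rejected with -EBADF.
 */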
6263
/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled	false

#endif
6288
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled)
		return;

	/* socket clone path: skcd was copied from the parent socket */
	if (skcd->val) {
		cgroup_get(sock_cgroup_ptr(skcd));
		return;
	}

	rcu_read_lock();

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	cgroup_put(sock_cgroup_ptr(skcd));
}
6320
#endif	/* CONFIG_SOCK_CGROUP_DATA */

/* cgroup namespaces */

static struct cgroup_namespace *alloc_cgroup_ns(void)
{
	struct cgroup_namespace *new_ns;
	int ret;

	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	ret = ns_alloc_inum(&new_ns->ns);
	if (ret) {
		kfree(new_ns);
		return ERR_PTR(ret);
	}
	atomic_set(&new_ns->count, 1);
	new_ns->ns.ops = &cgroupns_operations;
	return new_ns;
}
6342
void free_cgroup_ns(struct cgroup_namespace *ns)
{
	put_css_set(ns->root_cset);
	put_user_ns(ns->user_ns);
	ns_free_inum(&ns->ns);
	kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
					struct user_namespace *user_ns,
					struct cgroup_namespace *old_ns)
{
	struct cgroup_namespace *new_ns;
	struct css_set *cset;

	BUG_ON(!old_ns);

	if (!(flags & CLONE_NEWCGROUP)) {
		get_cgroup_ns(old_ns);
		return old_ns;
	}

	/* Allow only sysadmin to create cgroup namespace. */
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	/* It is not safe to take cgroup_mutex here */
	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	new_ns = alloc_cgroup_ns();
	if (IS_ERR(new_ns)) {
		put_css_set(cset);
		return new_ns;
	}

	new_ns->user_ns = get_user_ns(user_ns);
	new_ns->root_cset = cset;

	return new_ns;
}
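
/*
 * Userspace reaches this through clone(2)/unshare(2) with
 * CLONE_NEWCGROUP.  A hypothetical illustration:
 *
 *	unshare(CLONE_NEWCGROUP);
 *
 * captures the caller's current css_set as root_cset, so the caller's
 * cgroup at that moment becomes "/" in the new namespace's view of
 * /proc/self/cgroup and of cgroupfs mounts.
 */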
6387
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
	return container_of(ns, struct cgroup_namespace, ns);
}

static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);

	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/* Don't need to do anything if we are attaching to our own cgroupns. */
	if (cgroup_ns == nsproxy->cgroup_ns)
		return 0;

	get_cgroup_ns(cgroup_ns);
	put_cgroup_ns(nsproxy->cgroup_ns);
	nsproxy->cgroup_ns = cgroup_ns;

	return 0;
}

static struct ns_common *cgroupns_get(struct task_struct *task)
{
	struct cgroup_namespace *ns = NULL;
	struct nsproxy *nsproxy;

	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy) {
		ns = nsproxy->cgroup_ns;
		get_cgroup_ns(ns);
	}
	task_unlock(task);

	return ns ? &ns->ns : NULL;
}

static void cgroupns_put(struct ns_common *ns)
{
	put_cgroup_ns(to_cg_ns(ns));
}

const struct proc_ns_operations cgroupns_operations = {
	.name		= "cgroup",
	.type		= CLONE_NEWCGROUP,
	.get		= cgroupns_get,
	.put		= cgroupns_put,
	.install	= cgroupns_install,
};

static __init int cgroup_namespaces_init(void)
{
	return 0;
}
subsys_initcall(cgroup_namespaces_init);
6446
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return cgroup_task_count(css->cgroup);
}

static u64 current_css_set_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&task_css_set(current)->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;
	char *name_buf;

	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name_buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	cset = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		cgroup_name(c, name_buf, NAME_MAX + 1);
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name_buf);
	}
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);
	kfree(name_buf);
	return 0;
}
6512
#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
	struct cgroup_subsys_state *css = seq_css(seq);
	struct cgrp_cset_link *link;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
		struct css_set *cset = link->cset;
		struct task_struct *task;
		int count = 0;

		seq_printf(seq, "css_set %p\n", cset);

		list_for_each_entry(task, &cset->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}

		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}
		continue;
	overflow:
		seq_puts(seq, "  ...\n");
	}
	spin_unlock_irq(&css_set_lock);
	return 0;
}
6545
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return (!cgroup_is_populated(css->cgroup) &&
		!css_has_online_children(&css->cgroup->self));
}

static struct cftype debug_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.seq_show = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }
};

struct cgroup_subsys debug_cgrp_subsys = {
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.legacy_cftypes = debug_files,
};
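
/*
 * Usage note (hypothetical mount point): the debug controller only
 * registers legacy cftypes, so its files appear on a v1 mount such as
 *
 *	mount -t cgroup -o debug none /tmp/dbg
 *
 * and expose internal state (css_set pointers, refcounts, link lists)
 * for testing, which is why it's gated behind CONFIG_CGROUP_DEBUG.
 */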
#endif	/* CONFIG_CGROUP_DEBUG */