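/*
 * cgroup core: hierarchy and css_set management, mounting, and task
 * migration.  (Descriptive header; the original file comment was lost.)
 */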
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX	(MAX_CGROUP_TYPE_NAMELEN + \
				 MAX_CFTYPE_NAME + 2)

#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)

#define CGROUP_HAS_SUBSYS_CONFIG	(CGROUP_SUBSYS_COUNT > 0)
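/*
 * cgroup_mutex protects cgroup hierarchy and controller state;
 * css_set_lock protects task->cgroups pointers and css_set membership
 * lists.  Both are exported under CONFIG_PROVE_RCU so that lockdep
 * annotations elsewhere can reference them.
 */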
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;

static DEFINE_SPINLOCK(cgroup_idr_lock);

static DEFINE_SPINLOCK(cgroup_file_kn_lock);

DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

#define cgroup_assert_mutex_or_rcu_locked() \
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
			 !lockdep_is_held(&cgroup_mutex), \
			 "cgroup_mutex or RCU read lock required");
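/*
 * Workqueue used for css destruction work items so that destruction
 * can be flushed independently of system workqueues.
 */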
static struct workqueue_struct *cgroup_destroy_wq;

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) \
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

static bool cgrp_dfl_visible;

static u16 cgrp_dfl_inhibit_ss_mask;

static u16 cgrp_dfl_implicit_ss_mask;

static u16 cgrp_dfl_threaded_ss_mask;

LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

static DEFINE_IDR(cgroup_hierarchy_idr);

static u64 css_serial_nr_next = 1;

static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

struct cgroup_namespace init_cgroup_ns = {
	.ns.count = REFCOUNT_INIT(2),
	.user_ns = &init_user_ns,
	.ns.ops = &cgroupns_operations,
	.ns.inum = PROC_CGROUP_INIT_INO,
	.root_cset = &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

enum cgroup_opt_features {
#ifdef CONFIG_PSI
	OPT_FEATURE_PRESSURE,
#endif
	OPT_FEATURE_COUNT
};

static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
	"pressure",
#endif
};

static u16 cgroup_feature_disable_mask __read_mostly;

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
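/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * Returns %true if the subsystem's enabled static key is set (it is
 * cleared when the controller is disabled, e.g. on the kernel command
 * line).
 */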
bool cgroup_ssid_enabled(int ssid)
{
	if (!CGROUP_HAS_SUBSYS_CONFIG)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
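/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy (cgroup2) behaves differently from the legacy
 * hierarchies in several ways; callers use this test to branch between
 * the two.
 */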
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static bool cgroup_has_tasks(struct cgroup *cgrp)
{
	return cgrp->nr_populated_csets;
}

bool cgroup_is_threaded(struct cgroup *cgrp)
{
	return cgrp->dom_cgrp != cgrp;
}

static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	return !cgroup_parent(cgrp);
}

static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	if (cgroup_is_mixable(cgrp))
		return true;

	if (cgroup_is_threaded(cgrp))
		return false;

	if (cgrp->nr_populated_domain_children)
		return false;

	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}

bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	if (cgroup_is_threaded(cgrp))
		return false;

	if (cgrp->nr_threaded_children)
		return true;

	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}

static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
	if (cgroup_is_threaded(cgrp))
		return false;

	while ((cgrp = cgroup_parent(cgrp))) {
		if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
			return false;
		if (cgroup_is_threaded(cgrp))
			return false;
	}

	return true;
}

static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent) {
		u16 ss_mask = parent->subtree_control;

		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent) {
		u16 ss_mask = parent->subtree_ss_mask;

		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	return cgrp->root->subsys_mask;
}
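/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Synchronization is through RCU or cgroup_mutex; the caller is
 * responsible for pinning the returned css if it needs to be accessed
 * outside the critical section.
 */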
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (CGROUP_HAS_SUBSYS_CONFIG && ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}

static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
							struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}
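/**
 * cgroup_e_css - obtain a cgroup's "effective" css for a subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Walks up the hierarchy until a cgroup with @ss's css is found and
 * returns it; falls back to the css of init_css_set.  Unlike
 * cgroup_e_css_by_mask(), this may be called without cgroup_mutex.
 */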
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
					 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	if (!CGROUP_HAS_SUBSYS_CONFIG)
		return NULL;

	do {
		css = cgroup_css(cgrp, ss);

		if (css)
			return css;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	return init_css_set.subsys[ss->id];
}

struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	if (!CGROUP_HAS_SUBSYS_CONFIG)
		return NULL;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}
EXPORT_SYMBOL_GPL(cgroup_get_e_css);

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

int __cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += link->cset->nr_tasks;

	return count;
}

int cgroup_task_count(const struct cgroup *cgrp)
{
	int count;

	spin_lock_irq(&css_set_lock);
	count = __cgroup_task_count(cgrp);
	spin_unlock_irq(&css_set_lock);

	return count;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);
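/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex (or an RCU read-side critical
 * section).
 */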
#define for_each_css(css, ssid, cgrp) \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
		if (!((css) = rcu_dereference_check( \
				(cgrp)->subsys[(ssid)], \
				lockdep_is_held(&cgroup_mutex)))) { } \
		else

#define for_each_e_css(css, ssid, cgrp) \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
		if (!((css) = cgroup_e_css_by_mask(cgrp, \
						   cgroup_subsys[(ssid)]))) \
			; \
		else

#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
	unsigned long __ss_mask = (ss_mask); \
	if (!CGROUP_HAS_SUBSYS_CONFIG) { \
		(ssid) = 0; \
		break; \
	} \
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
		(ss) = cgroup_subsys[ssid]; \
		{

#define while_each_subsys_mask() \
		} \
	} \
} while (false)

#define cgroup_for_each_live_child(child, cgrp) \
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex); \
		       cgroup_is_dead(child); })) \
			; \
		else

#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex); \
		       (dsct) = (d_css)->cgroup; \
		       cgroup_is_dead(dsct); })) \
			; \
		else

#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex); \
		       (dsct) = (d_css)->cgroup; \
		       cgroup_is_dead(dsct); })) \
			; \
		else
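/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.
 */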
struct css_set init_css_set = {
	.refcount = REFCOUNT_INIT(1),
	.dom_cset = &init_css_set,
	.tasks = LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
	.dfl_cgrp = &cgrp_dfl_root.cgrp,
};

static int css_set_count = 1;

static bool css_set_threaded(struct css_set *cset)
{
	return cset->dom_cset != cset;
}

static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}
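/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly and
 * propagate upwards, generating "cgroup.events" modification events when
 * the populated state of a cgroup changes.
 */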
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			cgrp->nr_populated_csets += adj;
		} else {
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
				  cgroup_is_populated(cgrp));
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

static void css_set_skip_task_iters(struct css_set *cset,
				    struct task_struct *task)
{
	struct css_task_iter *it, *pos;

	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
		css_task_iter_skip(it, task);
}
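/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be %NULL)
 * @to_cset: new css_set @task is being moved to (may be %NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset, keeping the populated counters
 * of all involved cgroups in sync.
 */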
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}
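/*
 * Hash table for css_sets so that an existing matching css_set can be
 * found quickly when a task migrates.
 */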
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}
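/*
 * Drop a reference on @cset.  When the refcount reaches zero, unlink the
 * css_set from its cgroups and subsystem states and free it.  Caller
 * must hold css_set_lock.
 */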
void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}
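/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */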
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;

		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}

		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;

		BUG_ON(cgrp1->root != cgrp2->root);

		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			template[i] = cgroup_e_css_by_mask(cgrp, ss);
		} else {
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		return cset;
	}

	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get_live(cgrp);
}
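/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */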
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);

	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	if (cgroup_is_threaded(cset->dfl_cgrp)) {
		struct css_set *dcset;

		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
		if (!dcset) {
			put_css_set(cset);
			return NULL;
		}

		spin_lock_irq(&css_set_lock);
		cset->dom_cset = dcset;
		list_add_tail(&cset->threaded_csets_node,
			      &dcset->threaded_csets);
		spin_unlock_irq(&css_set_lock);
	}

	return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}

static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	cgroup_rstat_exit(cgrp);
	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else if (root == &cgrp_dfl_root) {
		res = cset->dfl_cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else if (root == &cgrp_dfl_root) {
		res = cset->dfl_cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	return cset_cgroup_from_root(task_css_set(task), root);
}
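/*
 * cftype file naming: on hierarchies without CGRP_ROOT_NOPREFIX, file
 * names are prefixed with the owning subsystem's name, e.g.
 * "memory.<name>".
 */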
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
		const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";

		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
			 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	} else {
		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	}
	return buf;
}

static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}
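/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to
 * be enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */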
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}
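/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.
 */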
void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}
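/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.
 */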
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);

		del_timer_sync(&cfile->notify_timer);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		cgroup_addrm_files(css, cgrp, cfts, false);
	} else {
		list_for_each_entry(cfts, &css->ss->cfts, node)
			cgroup_addrm_files(css, cgrp, cfts, false);
	}
}

static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
		if (ret < 0)
			return ret;
	} else {
		list_for_each_entry(cfts, &css->ss->cfts, node) {
			ret = cgroup_addrm_files(css, cgrp, cfts, true);
			if (ret < 0) {
				failed_cfts = cfts;
				goto err;
			}
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}
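/*
 * rebind_subsystems - move the subsystems in @ss_mask to @dst_root.
 * Each subsystem must currently be unused (no csses below its root
 * cgroup) unless it is implicitly enabled on the default hierarchy,
 * and at least one side of the move must be the default root.
 */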
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		if (ss->css_rstat_flush) {
			list_del_rcu(&css->rstat_css_node);
			list_add_rcu(&css->rstat_css_node,
				     &dcgrp->rstat_css_list);
		}

		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

enum cgroup2_param {
	Opt_nsdelegate,
	Opt_memory_localevents,
	Opt_memory_recursiveprot,
	nr__cgroup2_params
};

static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
	fsparam_flag("nsdelegate", Opt_nsdelegate),
	fsparam_flag("memory_localevents", Opt_memory_localevents),
	fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
	{}
};

static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_nsdelegate:
		ctx->flags |= CGRP_ROOT_NS_DELEGATE;
		return 0;
	case Opt_memory_localevents:
		ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
		return 0;
	case Opt_memory_recursiveprot:
		ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
		return 0;
	}
	return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
{
	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
		if (root_flags & CGRP_ROOT_NS_DELEGATE)
			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

		if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;

		if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
	}
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
		seq_puts(seq, ",nsdelegate");
	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
		seq_puts(seq, ",memory_localevents");
	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
		seq_puts(seq, ",memory_recursiveprot");
	return 0;
}

static int cgroup_reconfigure(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

	apply_cgroup_root_flags(ctx->flags);
	return 0;
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ssid;

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	cgrp->self.cgroup = cgrp;
	cgrp->self.flags |= CSS_ONLINE;
	cgrp->dom_cgrp = cgrp;
	cgrp->max_descendants = INT_MAX;
	cgrp->max_depth = INT_MAX;
	INIT_LIST_HEAD(&cgrp->rstat_css_list);
	prev_cputime_init(&cgrp->prev_cputime);

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

void init_cgroup_root(struct cgroup_fs_context *ctx)
{
	struct cgroup_root *root = ctx->root;
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);

	root->flags = ctx->flags;
	if (ctx->release_agent)
		strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
	if (ctx->name)
		strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
	if (ctx->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
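/*
 * Set up @root: allocate the hierarchy id, create the kernfs root,
 * populate the base files, bind the subsystems in @ss_mask and link
 * every existing css_set to the new root cgroup.
 */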
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      0, GFP_KERNEL);
	if (ret)
		goto out;

	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED |
					   KERNFS_ROOT_SUPPORT_EXPORTOP |
					   KERNFS_ROOT_SUPPORT_USER_XATTR,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;
	WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
	root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = cgroup_rstat_init(root_cgrp);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto exit_stats;

	ret = cgroup_bpf_inherit(root_cgrp);
	WARN_ON_ONCE(ret);

	trace_cgroup_setup_root(root);

	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	ret = 0;
	goto out;

exit_stats:
	cgroup_rstat_exit(root_cgrp);
destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}
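/*
 * Mount the tree for @fc.  When mounting from a non-init cgroup
 * namespace, re-root the returned dentry at the namespace's view of the
 * hierarchy.
 */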
int cgroup_do_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	ctx->kfc.root = ctx->root->kf_root;
	if (fc->fs_type == &cgroup2_fs_type)
		ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
	else
		ctx->kfc.magic = CGROUP_SUPER_MAGIC;
	ret = kernfs_get_tree(fc);

	if (!ret && ctx->ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct super_block *sb = fc->root->d_sb;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, sb);
		dput(fc->root);
		if (IS_ERR(nsdentry)) {
			deactivate_locked_super(sb);
			ret = PTR_ERR(nsdentry);
			nsdentry = NULL;
		}
		fc->root = nsdentry;
	}

	if (!ctx->kfc.new_sb_created)
		cgroup_put(&ctx->root->cgrp);

	return ret;
}

static void cgroup_fs_context_free(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

	kfree(ctx->name);
	kfree(ctx->release_agent);
	put_cgroup_ns(ctx->ns);
	kernfs_free_fs_context(fc);
	kfree(ctx);
}

static int cgroup_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	cgrp_dfl_visible = true;
	cgroup_get_live(&cgrp_dfl_root.cgrp);
	ctx->root = &cgrp_dfl_root;

	ret = cgroup_do_get_tree(fc);
	if (!ret)
		apply_cgroup_root_flags(ctx->flags);
	return ret;
}

static const struct fs_context_operations cgroup_fs_context_ops = {
	.free = cgroup_fs_context_free,
	.parse_param = cgroup2_parse_param,
	.get_tree = cgroup_get_tree,
	.reconfigure = cgroup_reconfigure,
};

static const struct fs_context_operations cgroup1_fs_context_ops = {
	.free = cgroup_fs_context_free,
	.parse_param = cgroup1_parse_param,
	.get_tree = cgroup1_get_tree,
	.reconfigure = cgroup1_reconfigure,
};

static int cgroup_init_fs_context(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx;

	ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->ns = current->nsproxy->cgroup_ns;
	get_cgroup_ns(ctx->ns);
	fc->fs_private = &ctx->kfc;
	if (fc->fs_type == &cgroup2_fs_type)
		fc->ops = &cgroup_fs_context_ops;
	else
		fc->ops = &cgroup1_fs_context_ops;
	put_user_ns(fc->user_ns);
	fc->user_ns = get_user_ns(ctx->ns->user_ns);
	fc->global = true;
	return 0;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
	    !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
		cgroup_bpf_offline(&root->cgrp);
		percpu_ref_kill(&root->cgrp.self.refcnt);
	}
	cgroup_put(&root->cgrp);
	kernfs_kill_sb(sb);
}

struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.init_fs_context = cgroup_init_fs_context,
	.parameters = cgroup1_fs_parameters,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name = "cgroup2",
	.init_fs_context = cgroup_init_fs_context,
	.parameters = cgroup2_fs_parameters,
	.kill_sb = cgroup_kill_sb,
	.fs_flags = FS_USERNS_MOUNT,
};

#ifdef CONFIG_CPUSETS
static const struct fs_context_operations cpuset_fs_context_ops = {
	.get_tree = cgroup1_get_tree,
	.free = cgroup_fs_context_free,
};

static int cpuset_init_fs_context(struct fs_context *fc)
{
	char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
	struct cgroup_fs_context *ctx;
	int err;

	err = cgroup_init_fs_context(fc);
	if (err) {
		kfree(agent);
		return err;
	}

	fc->ops = &cpuset_fs_context_ops;

	ctx = cgroup_fc2context(fc);
	ctx->subsys_mask = 1 << cpuset_cgrp_id;
	ctx->flags |= CGRP_ROOT_NOPREFIX;
	ctx->release_agent = agent;

	get_filesystem(&cgroup_fs_type);
	put_filesystem(fc->fs_type);
	fc->fs_type = &cgroup_fs_type;

	return 0;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.init_fs_context = cpuset_init_fs_context,
	.fs_flags = FS_USERNS_MOUNT,
};
#endif
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
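/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 */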
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		ret = strlcpy(buf, "/", buflen);
	}

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
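/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 */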
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	if (task->flags & PF_EXITING)
		return;

	WARN_ON_ONCE(list_empty(&task->cg_list));

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	mgctx->tset.nr_tasks++;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}
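/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */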
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}
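/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */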
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}
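/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by the migration preparation
 * functions.  This function fails iff one of the ->can_attach callbacks
 * fails and guarantees that either all or none of the tasks in @mgctx are
 * migrated.  @mgctx is consumed regardless of success.
 */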
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
				if (ret) {
					failed_ssid = ssid;
					goto out_cancel_attach;
				}
			}
		} while_each_subsys_mask();
	}

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			from_cset->nr_tasks--;

			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
						    to_cset->dfl_cgrp);
			put_css_set_locked(from_cset);
		}
	}
	spin_unlock_irq(&css_set_lock);

	tset->csets = &tset->dst_csets;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->attach) {
				tset->ssid = ssid;
				ss->attach(tset);
			}
		} while_each_subsys_mask();
	}

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ssid == failed_ssid)
				break;
			if (ss->cancel_attach) {
				tset->ssid = ssid;
				ss->cancel_attach(tset);
			}
		} while_each_subsys_mask();
	}
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);

	tset->nr_tasks = 0;
	tset->csets = &tset->src_csets;
	return ret;
}
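/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 */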
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
	if (!cgroup_on_dfl(dst_cgrp))
		return 0;

	if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
		return -EOPNOTSUPP;

	if (cgroup_is_mixable(dst_cgrp))
		return 0;

	if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
		return 0;

	if (dst_cgrp->subtree_control)
		return -EBUSY;

	return 0;
}
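/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo whatever cgroup_migrate_add_src() and cgroup_migrate_prepare_dst()
 * have prepared.
 */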
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
	LIST_HEAD(preloaded);
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);

	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);

	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}

	spin_unlock_irq(&css_set_lock);
}
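/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.
 * Pin @src_cset and record it in @mgctx, which should then be passed to
 * cgroup_migrate_prepare_dst() on the preloaded css_sets and then to
 * cgroup_migrate().
 */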
void cgroup_migrate_add_src(struct css_set *src_cset,
			    struct cgroup *dst_cgrp,
			    struct cgroup_mgctx *mgctx)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (src_cset->dead)
		return;

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(src_cset->mg_dst_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
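/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links them to the source css_sets and
 * adds them to @mgctx->preloaded_dst_csets.
 */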
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				 mg_preload_node) {
		struct css_set *dst_cset;
		struct cgroup_subsys *ss;
		int ssid;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			return -ENOMEM;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add_tail(&dst_cset->mg_preload_node,
				      &mgctx->preloaded_dst_csets);
		else
			put_css_set(dst_cset);

		for_each_subsys(ss, ssid)
			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				mgctx->ss_mask |= 1 << ssid;
	}

	return 0;
}
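/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  The caller is
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before calling this
 * function and following up with cgroup_migrate_finish().
 */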
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx)
{
	struct task_struct *task;

	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_task(task, mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_migrate_execute(mgctx);
}
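/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */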
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct task_struct *task;
	int ret = 0;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, &mgctx);

	cgroup_migrate_finish(&mgctx);

	if (!ret)
		TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);

	return ret;
}

struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
					     bool *locked)
	__acquires(&cgroup_threadgroup_rwsem)
{
	struct task_struct *tsk;
	pid_t pid;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return ERR_PTR(-EINVAL);

	/*
	 * The threadgroup rwsem is only needed when migrating a whole
	 * threadgroup or when a specific pid is written.  Writing "0" to
	 * move only %current doesn't require threadgroup stability.
	 */
	lockdep_assert_held(&cgroup_mutex);
	if (pid || threadgroup) {
		percpu_down_write(&cgroup_threadgroup_rwsem);
		*locked = true;
	} else {
		*locked = false;
	}

	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			tsk = ERR_PTR(-ESRCH);
			goto out_unlock_threadgroup;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset.  Just say no.
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		tsk = ERR_PTR(-EINVAL);
		goto out_unlock_threadgroup;
	}

	get_task_struct(tsk);
	goto out_unlock_rcu;

out_unlock_threadgroup:
	if (*locked) {
		percpu_up_write(&cgroup_threadgroup_rwsem);
		*locked = false;
	}
out_unlock_rcu:
	rcu_read_unlock();
	return tsk;
}

void cgroup_procs_write_finish(struct task_struct *task, bool locked)
	__releases(&cgroup_threadgroup_rwsem)
{
	struct cgroup_subsys *ss;
	int ssid;

	/* release reference from cgroup_procs_write_start() */
	put_task_struct(task);

	if (locked)
		percpu_up_write(&cgroup_threadgroup_rwsem);
	for_each_subsys(ss, ssid)
		if (ss->post_attach)
			ss->post_attach();
}

static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (printed)
			seq_putc(seq, ' ');
		seq_puts(seq, ss->name);
		printed = true;
	} while_each_subsys_mask();
	if (printed)
		seq_putc(seq, '\n');
}

/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
	return 0;
}

/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->subtree_control);
	return 0;
}
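/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */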
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* look up all csses currently attached to @cgrp's subtree */
	spin_lock_irq(&css_set_lock);
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &dsct->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_finish;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
		struct task_struct *task, *ntask;

		/* all tasks in src_csets need to be migrated */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_migrate_add_task(task, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_execute(&mgctx);
out_finish:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}
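/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree,
 * dropping and re-acquiring the mutex while waiting.
 */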
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

restart:
	mutex_lock(&cgroup_mutex);

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

			if (!css || !percpu_ref_is_dying(&css->refcnt))
				continue;

			cgroup_get_live(dsct);
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
			goto restart;
		}
	}
}
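/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */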
static void cgroup_save_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->old_subtree_control = dsct->subtree_control;
		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
		dsct->old_dom_cgrp = dsct->dom_cgrp;
	}
}
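/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability down the
 * subtree so that descendants don't have unavailable controllers enabled.
 */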
static void cgroup_propagate_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		dsct->subtree_control &= cgroup_control(dsct);
		dsct->subtree_ss_mask =
			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
						    cgroup_ss_mask(dsct));
	}
}
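/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */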
static void cgroup_restore_control(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		dsct->subtree_control = dsct->old_subtree_control;
		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
		dsct->dom_cgrp = dsct->old_dom_cgrp;
	}
}

static bool css_visible(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	if (cgroup_control(cgrp) & (1 << ss->id))
		return true;
	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
		return false;
	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}
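/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */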
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid, ret;

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				continue;

			if (!css) {
				css = css_create(dsct, ss);
				if (IS_ERR(css))
					return PTR_ERR(css);
			}

			WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

			if (css_visible(css)) {
				ret = css_populate_dir(css);
				if (ret)
					return ret;
			}
		}
	}

	return 0;
}
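/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and css_visible().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not be destroyed
 * until later.
 */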
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);

			if (!css)
				continue;

			WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

			if (css->parent &&
			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
				kill_css(css);
			} else if (!css_visible(css)) {
				css_clear_dir(css);
				if (ss->css_reset)
					ss->css_reset(css);
			}
		}
	}
}
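/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 */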
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int ret;

	cgroup_propagate_control(cgrp);

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		return ret;

	/*
	 * At this point, cgroup_e_css_by_mask() results reflect the new
	 * csses and the css associations of all affected css_sets need
	 * updating.
	 */
	ret = cgroup_update_dfl_csses(cgrp);
	if (ret)
		return ret;

	return 0;
}
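/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */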
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret) {
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}

static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
	u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;

	/* if nothing is getting enabled, nothing to worry about */
	if (!enable)
		return 0;

	/* can @cgrp host any resource domains? */
	if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
		return -EOPNOTSUPP;

	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return 0;

	if (domain_enable) {
		/* can't enable domain controllers inside a thread subtree */
		if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
			return -EOPNOTSUPP;
	} else {
		/*
		 * Threaded controllers can handle internal competitions
		 * and are always allowed inside a (prospective) thread
		 * subtree.
		 */
		if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
			return 0;
	}

	/*
	 * Controllers can't be enabled for a cgroup with tasks to avoid
	 * child cgroups competing against tasks.
	 */
	if (cgroup_has_tasks(cgrp))
		return -EBUSY;

	return 0;
}

/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	u16 enable = 0, disable = 0;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
	int ssid, ret;

	/*
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -.
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
		if (tok[0] == '\0')
			continue;
		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
			} else if (*tok == '-') {
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
			} else {
				return -EINVAL;
			}
			break;
		} while_each_subsys_mask();
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENODEV;

	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
			if (cgrp->subtree_control & (1 << ssid)) {
				enable &= ~(1 << ssid);
				continue;
			}

			if (!(cgroup_control(cgrp) & (1 << ssid))) {
				ret = -ENOENT;
				goto out_unlock;
			}
		} else if (disable & (1 << ssid)) {
			if (!(cgrp->subtree_control & (1 << ssid))) {
				disable &= ~(1 << ssid);
				continue;
			}

			/* a child has it enabled? */
			cgroup_for_each_live_child(child, cgrp) {
				if (child->subtree_control & (1 << ssid)) {
					ret = -EBUSY;
					goto out_unlock;
				}
			}
		}
	}

	if (!enable && !disable) {
		ret = 0;
		goto out_unlock;
	}

	ret = cgroup_vet_subtree_control_enable(cgrp, enable);
	if (ret)
		goto out_unlock;

	/* save and update control masks and prepare csses */
	cgroup_save_control(cgrp);

	cgrp->subtree_control |= enable;
	cgrp->subtree_control &= ~disable;

	ret = cgroup_apply_control(cgrp);
	cgroup_finalize_control(cgrp, ret);
	if (ret)
		goto out_unlock;

	kernfs_activate(cgrp->kn);
out_unlock:
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}
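/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to "cgroup.type".  If @cgrp is already
 * threaded, this is a noop.  Otherwise, @cgrp is made threaded provided it
 * is eligible: it must be empty of processes and domain controllers, and
 * its parent's domain must be able to serve as the thread root.
 */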
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup *dom_cgrp = parent->dom_cgrp;
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	/* noop if already threaded */
	if (cgroup_is_threaded(cgrp))
		return 0;

	/*
	 * If @cgroup is populated or has domain controllers enabled, it
	 * can't be switched.  While the below cgroup_can_be_thread_root()
	 * test can catch the same conditions, that's only when @parent is
	 * not mixable, so let's check it explicitly.
	 */
	if (cgroup_is_populated(cgrp) ||
	    cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return -EOPNOTSUPP;

	/* we're joining the parent's domain, ensure its validity */
	if (!cgroup_is_valid_domain(dom_cgrp) ||
	    !cgroup_can_be_thread_root(dom_cgrp))
		return -EOPNOTSUPP;

	/*
	 * The following shouldn't cause actual migrations and should
	 * always succeed.
	 */
	cgroup_save_control(cgrp);

	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
		if (dsct == cgrp || cgroup_is_threaded(dsct))
			dsct->dom_cgrp = dom_cgrp;

	ret = cgroup_apply_control(cgrp);
	if (!ret)
		parent->nr_threaded_children++;

	cgroup_finalize_control(cgrp, ret);
	return ret;
}

static int cgroup_type_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	if (cgroup_is_threaded(cgrp))
		seq_puts(seq, "threaded\n");
	else if (!cgroup_is_valid_domain(cgrp))
		seq_puts(seq, "domain invalid\n");
	else if (cgroup_is_thread_root(cgrp))
		seq_puts(seq, "domain threaded\n");
	else
		seq_puts(seq, "domain\n");

	return 0;
}

static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	int ret;

	/* only switching to threaded mode is allowed */
	if (strcmp(strstrip(buf), "threaded"))
		return -EINVAL;

	/* drain dying csses before we re-apply (threaded) subtree control */
	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENOENT;

	ret = cgroup_enable_threaded(cgrp);

	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	int descendants = READ_ONCE(cgrp->max_descendants);

	if (descendants == INT_MAX)
		seq_puts(seq, "max\n");
	else
		seq_printf(seq, "%d\n", descendants);

	return 0;
}

static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	int descendants;
	ssize_t ret;

	buf = strstrip(buf);
	if (!strcmp(buf, "max")) {
		descendants = INT_MAX;
	} else {
		ret = kstrtoint(buf, 0, &descendants);
		if (ret)
			return ret;
	}

	if (descendants < 0)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	cgrp->max_descendants = descendants;

	cgroup_kn_unlock(of->kn);

	return nbytes;
}

static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	int depth = READ_ONCE(cgrp->max_depth);

	if (depth == INT_MAX)
		seq_puts(seq, "max\n");
	else
		seq_printf(seq, "%d\n", depth);

	return 0;
}

static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	ssize_t ret;
	int depth;

	buf = strstrip(buf);
	if (!strcmp(buf, "max")) {
		depth = INT_MAX;
	} else {
		ret = kstrtoint(buf, 0, &depth);
		if (ret)
			return ret;
	}

	if (depth < 0)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	cgrp->max_depth = depth;

	cgroup_kn_unlock(of->kn);

	return nbytes;
}

static int cgroup_events_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
	seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));

	return 0;
}

static int cgroup_stat_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgroup = seq_css(seq)->cgroup;

	seq_printf(seq, "nr_descendants %d\n",
		   cgroup->nr_descendants);
	seq_printf(seq, "nr_dying_descendants %d\n",
		   cgroup->nr_dying_descendants);

	return 0;
}

static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
						 struct cgroup *cgrp, int ssid)
{
	struct cgroup_subsys *ss = cgroup_subsys[ssid];
	struct cgroup_subsys_state *css;
	int ret;

	if (!ss->css_extra_stat_show)
		return 0;

	css = cgroup_tryget_css(cgrp, ss);
	if (!css)
		return 0;

	ret = ss->css_extra_stat_show(seq, css);
	css_put(css);
	return ret;
}

static int cpu_stat_show(struct seq_file *seq, void *v)
{
	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
	int ret = 0;

	cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
	return ret;
}
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;

	return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;

	return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;

	return psi_show(seq, psi, PSI_CPU);
}

static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, enum psi_res res)
{
	struct psi_trigger *new;
	struct cgroup *cgrp;
	struct psi_group *psi;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	cgroup_get(cgrp);
	cgroup_kn_unlock(of->kn);

	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
	new = psi_trigger_create(psi, buf, nbytes, res);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
		return PTR_ERR(new);
	}

	psi_trigger_replace(&of->priv, new);

	cgroup_put(cgrp);

	return nbytes;
}

static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
					char *buf, size_t nbytes,
					loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
}

static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
}

static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
					 char *buf, size_t nbytes,
					 loff_t off)
{
	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
}

static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
				     poll_table *pt)
{
	return psi_trigger_poll(&of->priv, of->file, pt);
}

static void cgroup_pressure_release(struct kernfs_open_file *of)
{
	psi_trigger_replace(&of->priv, NULL);
}

bool cgroup_psi_enabled(void)
{
	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}

#else /* CONFIG_PSI */
bool cgroup_psi_enabled(void)
{
	return false;
}

#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	seq_printf(seq, "%d\n", cgrp->freezer.freeze);

	return 0;
}

static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	ssize_t ret;
	int freeze;

	ret = kstrtoint(strstrip(buf), 0, &freeze);
	if (ret)
		return ret;

	if (freeze < 0 || freeze > 1)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	cgroup_freeze(cgrp, freeze);

	cgroup_kn_unlock(of->kn);

	return nbytes;
}

static void __cgroup_kill(struct cgroup *cgrp)
{
	struct css_task_iter it;
	struct task_struct *task;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);
	set_bit(CGRP_KILL, &cgrp->flags);
	spin_unlock_irq(&css_set_lock);

	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
	while ((task = css_task_iter_next(&it))) {
		/* ignore kernel threads here */
		if (task->flags & PF_KTHREAD)
			continue;

		/* skip tasks which already have a pending fatal signal */
		if (__fatal_signal_pending(task))
			continue;

		send_sig(SIGKILL, task, 0);
	}
	css_task_iter_end(&it);

	spin_lock_irq(&css_set_lock);
	clear_bit(CGRP_KILL, &cgrp->flags);
	spin_unlock_irq(&css_set_lock);
}

static void cgroup_kill(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	struct cgroup *dsct;

	lockdep_assert_held(&cgroup_mutex);

	cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
		__cgroup_kill(dsct);
}

static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	ssize_t ret = 0;
	int kill;
	struct cgroup *cgrp;

	ret = kstrtoint(strstrip(buf), 0, &kill);
	if (ret)
		return ret;

	if (kill != 1)
		return -ERANGE;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENOENT;

	/*
	 * Killing is a process directed operation, i.e. the whole
	 * thread-group is taken down so act like we do for cgroup.procs
	 * and only make this writable in non-threaded cgroups.
	 */
	if (cgroup_is_threaded(cgrp))
		ret = -EOPNOTSUPP;
	else
		cgroup_kill(cgrp);

	cgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}

static int cgroup_file_open(struct kernfs_open_file *of)
{
	struct cftype *cft = of_cft(of);

	if (cft->open)
		return cft->open(of);
	return 0;
}

static void cgroup_file_release(struct kernfs_open_file *of)
{
	struct cftype *cft = of_cft(of);

	if (cft->release)
		cft->release(of);
}

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);
	struct cgroup_subsys_state *css;
	int ret;

	if (!nbytes)
		return 0;

	/*
	 * If namespaces are delegation boundaries, disallow writes to
	 * files in a non-init namespace root from inside the namespace
	 * except for the files explicitly marked delegatable -
	 * cgroup.procs and cgroup.subtree_control.
	 */
	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
		return -EPERM;

	if (cft->write)
		return cft->write(of, buf, nbytes, off);

	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();

	if (cft->write_u64) {
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
	} else {
		ret = -EINVAL;
	}

	return ret ?: nbytes;
}
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
	struct cftype *cft = of_cft(of);

	if (cft->poll)
		return cft->poll(of, pt);

	return kernfs_generic_poll(of, pt);
}

static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
	return seq_cft(seq)->seq_start(seq, ppos);
}

static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	return seq_cft(seq)->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
	if (seq_cft(seq)->seq_stop)
		seq_cft(seq)->seq_stop(seq, v);
}

static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cftype *cft = seq_cft(m);
	struct cgroup_subsys_state *css = seq_css(m);

	if (cft->seq_show)
		return cft->seq_show(m, arg);

	if (cft->read_u64)
		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
	else if (cft->read_s64)
		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
	else
		return -EINVAL;
	return 0;
}

static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.open			= cgroup_file_open,
	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.poll			= cgroup_file_poll,
	.seq_show		= cgroup_seqfile_show,
};

static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.open			= cgroup_file_open,
	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.poll			= cgroup_file_poll,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};

/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
			       .ia_uid = current_fsuid(),
			       .ia_gid = current_fsgid(), };

	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
		return 0;

	return kernfs_setattr(kn, &iattr);
}

static void cgroup_file_notify_timer(struct timer_list *timer)
{
	cgroup_file_notify(container_of(timer, struct cgroup_file,
					notify_timer));
}

static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
			   struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];
	struct kernfs_node *kn;
	struct lock_class_key *key = NULL;
	int ret;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	key = &cft->lockdep_key;
#endif
	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft),
				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
				  0, cft->kf_ops, cft,
				  NULL, key);
	if (IS_ERR(kn))
		return PTR_ERR(kn);

	ret = cgroup_kn_set_ugid(kn);
	if (ret) {
		kernfs_remove(kn);
		return ret;
	}

	if (cft->file_offset) {
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = kn;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	return 0;
}
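/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */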
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
{
	struct cftype *cft, *cft_end = NULL;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

restart:
	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
			continue;
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
			continue;
		if (is_add) {
			ret = cgroup_add_file(css, cgrp, cft);
			if (ret) {
				pr_warn("%s: failed to add %s, err=%d\n",
					__func__, cft->name, ret);
				cft_end = cft;
				is_add = false;
				goto restart;
			}
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return ret;
}

static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
	struct cgroup_subsys *ss = cfts[0].ss;
	struct cgroup *root = &ss->root->cgrp;
	struct cgroup_subsys_state *css;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	/* add/rm files for all cgroups created before */
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
		struct cgroup *cgrp = css->cgroup;

		if (!(css->flags & CSS_VISIBLE))
			continue;

		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
		if (ret)
			break;
	}

	if (is_add && !ret)
		kernfs_activate(root->kn);
	return ret;
}

static void cgroup_exit_cftypes(struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* free copy for custom atomic_write_len, see init_cftypes() */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
			kfree(cft->kf_ops);
		cft->kf_ops = NULL;
		cft->ss = NULL;

		/* revert flags set by cgroup core while adding @cfts */
		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
	}
}

static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

		WARN_ON(cft->ss || cft->kf_ops);

		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
			continue;

		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}

		cft->kf_ops = kf_ops;
		cft->ss = ss;
	}

	return 0;
}

static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!cfts || !cfts[0].ss)
		return -ENOENT;

	list_del(&cfts->node);
	cgroup_apply_cftypes(cfts, false);
	cgroup_exit_cftypes(cfts);
	return 0;
}
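/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */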
int cgroup_rm_cftypes(struct cftype *cfts)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = cgroup_rm_cftypes_locked(cfts);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
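/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */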
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	int ret;

	if (!cgroup_ssid_enabled(ss->id))
		return 0;

	if (!cfts || cfts[0].name[0] == '\0')
		return 0;

	ret = cgroup_init_cftypes(ss, cfts);
	if (ret)
		return ret;

	mutex_lock(&cgroup_mutex);

	list_add_tail(&cfts->node, &ss->cfts);
	ret = cgroup_apply_cftypes(cfts, true);
	if (ret)
		cgroup_rm_cftypes_locked(cfts);

	mutex_unlock(&cgroup_mutex);
	return ret;
}

/*
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the default hierarchy.
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_ONLY_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}

/*
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the legacy hierarchies.
 */
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
		cft->flags |= __CFTYPE_NOT_ON_DFL;
	return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.  Events
 * are rate-limited to one per CGROUP_FILE_NOTIFY_MIN_INTV; notifications
 * arriving faster than that are deferred to a timer.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long flags;

	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
	if (cfile->kn) {
		unsigned long last = cfile->notified_at;
		unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;

		if (time_in_range(jiffies, last, next)) {
			timer_reduce(&cfile->notify_timer, next);
		} else {
			kernfs_notify(cfile->kn);
			cfile->notified_at = jiffies;
		}
	}
	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
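/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible; the next sibling is returned
 * regardless of their states.
 */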
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
					   struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/*
	 * @pos could already have been unlinked from the sibling list.
	 * Once a cgroup is removed, its ->sibling.next is no longer
	 * updated when its next sibling changes.  CSS_RELEASED is set when
	 * @pos is taken off list, at which time its next pointer is valid,
	 * and, as releases are serialized, the one pointed to by the next
	 * pointer is guaranteed to not have started release yet.  This
	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
	 * critical section, the one pointed to by its next pointer is
	 * guaranteed to not have finished its RCU grace period even if we
	 * have dropped rcu_read_lock() in-between iterations.
	 *
	 * If @pos has CSS_RELEASED set, its next pointer can't be
	 * dereferenced; however, as each css is given a monotonically
	 * increasing unique serial number and always appended to the
	 * sibling list, the next one can be found by walking the parent's
	 * children until the first css with higher serial number than
	 * @pos's.  While this path can be slower, it happens iff iteration
	 * races against release and the race window is very small.
	 */
	if (!pos) {
		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
	} else if (likely(!(pos->flags & CSS_RELEASED))) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
	} else {
		list_for_each_entry_rcu(next, &parent->children, sibling,
					lockdep_is_held(&cgroup_mutex))
			if (next->serial_nr > pos->serial_nr)
				break;
	}

	/*
	 * @next, if not pointing to the head, can be dereferenced and is
	 * the next sibling.
	 */
	if (&next->sibling != &parent->children)
		return next;
	return NULL;
}
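/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be called under either cgroup_mutex or the RCU read lock.  Returns
 * the next descendant of @root in pre-order, or NULL once the walk is
 * complete.  @root itself is visited first.
 */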
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
			struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/* if first iteration, visit @root */
	if (!pos)
		return root;

	/* visit the first child if exists */
	next = css_next_child(NULL, pos);
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
	while (pos != root) {
		next = css_next_child(pos, pos->parent);
		if (next)
			return next;
		pos = pos->parent;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);

/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip the
 * subtree of @pos.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last, *tmp;

	cgroup_assert_mutex_or_rcu_locked();

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
		css_for_each_child(tmp, last)
			pos = tmp;
	} while (pos);

	return last;
}

static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last;

	do {
		last = pos;
		pos = css_next_child(NULL, pos);
	} while (pos);

	return last;
}
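/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be called under either cgroup_mutex or the RCU read lock.  Returns
 * the next descendant of @root in post-order, or NULL once the walk is
 * complete.  @root is visited last.
 */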
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/* if first iteration, visit leftmost descendant which may be @root */
	if (!pos)
		return css_leftmost_descendant(root);

	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
	next = css_next_child(pos, pos->parent);
	if (next)
		return css_leftmost_descendant(next);

	/* no sibling left, visit parent */
	return pos->parent;
}

/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *child;
	bool ret = false;

	rcu_read_lock();
	css_for_each_child(child, css) {
		if (child->flags & CSS_ONLINE) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
{
	struct list_head *l;
	struct cgrp_cset_link *link;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* advance the threaded cset iterator first */
	if (it->tcset_pos) {
		l = it->tcset_pos->next;

		if (l != it->tcset_head) {
			it->tcset_pos = l;
			return container_of(l, struct css_set,
					    threaded_csets_node);
		}

		it->tcset_pos = NULL;
	}

	/* find the next cset */
	l = it->cset_pos;
	l = l->next;
	if (l == it->cset_head) {
		it->cset_pos = NULL;
		return NULL;
	}

	if (it->ss) {
		cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
	} else {
		link = list_entry(l, struct cgrp_cset_link, cset_link);
		cset = link->cset;
	}

	it->cset_pos = l;

	/* initialize threaded css_set walking */
	if (it->flags & CSS_TASK_ITER_THREADED) {
		if (it->cur_dcset)
			put_css_set_locked(it->cur_dcset);
		it->cur_dcset = cset;
		get_css_set(cset);

		it->tcset_head = &cset->threaded_csets;
		it->tcset_pos = &cset->threaded_csets;
	}

	return cset;
}

/**
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* advance to the next cset with at least one task queued */
	while ((cset = css_task_iter_next_css_set(it))) {
		if (!list_empty(&cset->tasks)) {
			it->cur_tasks_head = &cset->tasks;
			break;
		} else if (!list_empty(&cset->mg_tasks)) {
			it->cur_tasks_head = &cset->mg_tasks;
			break;
		} else if (!list_empty(&cset->dying_tasks)) {
			it->cur_tasks_head = &cset->dying_tasks;
			break;
		}
	}
	if (!cset) {
		it->task_pos = NULL;
		return;
	}
	it->task_pos = it->cur_tasks_head->next;

	/*
	 * We don't keep css_sets locked across iteration steps and thus
	 * can't hold onto tasks.  Instead, register @it to the css_set's
	 * list of iterators so that task removals can fix up @it's
	 * position via css_task_iter_skip() when the task it points to
	 * leaves the css_set.
	 */
	if (it->cur_cset) {
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
	}
	get_css_set(cset);
	it->cur_cset = cset;
	list_add(&it->iters_node, &cset->task_iters);
}

static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task)
{
	lockdep_assert_held(&css_set_lock);

	if (it->task_pos == &task->cg_list) {
		it->task_pos = it->task_pos->next;
		it->flags |= CSS_TASK_ITER_SKIPPED;
	}
}

static void css_task_iter_advance(struct css_task_iter *it)
{
	struct task_struct *task;

	lockdep_assert_held(&css_set_lock);
repeat:
	if (it->task_pos) {
		/*
		 * Advance iterator to find the next entry.  We go through
		 * cset->tasks, cset->mg_tasks and cset->dying_tasks in
		 * order and move onto the next cset when all are consumed.
		 */
		if (it->flags & CSS_TASK_ITER_SKIPPED)
			it->flags &= ~CSS_TASK_ITER_SKIPPED;
		else
			it->task_pos = it->task_pos->next;

		if (it->task_pos == &it->cur_cset->tasks) {
			it->cur_tasks_head = &it->cur_cset->mg_tasks;
			it->task_pos = it->cur_tasks_head->next;
		}
		if (it->task_pos == &it->cur_cset->mg_tasks) {
			it->cur_tasks_head = &it->cur_cset->dying_tasks;
			it->task_pos = it->cur_tasks_head->next;
		}
		if (it->task_pos == &it->cur_cset->dying_tasks)
			css_task_iter_advance_css_set(it);
	} else {
		/* called from start, proceed to the first cset */
		css_task_iter_advance_css_set(it);
	}

	if (!it->task_pos)
		return;

	task = list_entry(it->task_pos, struct task_struct, cg_list);

	if (it->flags & CSS_TASK_ITER_PROCS) {
		/* if PROCS, skip over tasks which aren't group leaders */
		if (!thread_group_leader(task))
			goto repeat;

		/* and dying leaders w/o live member threads */
		if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
		    !atomic_read(&task->signal->live))
			goto repeat;
	} else {
		/* skip all dying ones */
		if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
			goto repeat;
	}
}
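/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 */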
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
			 struct css_task_iter *it)
{
	memset(it, 0, sizeof(*it));

	spin_lock_irq(&css_set_lock);

	it->ss = css->ss;
	it->flags = flags;

	if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
	else
		it->cset_pos = &css->cgroup->cset_links;

	it->cset_head = it->cset_pos;

	css_task_iter_advance(it);

	spin_unlock_irq(&css_set_lock);
}

/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
	if (it->cur_task) {
		put_task_struct(it->cur_task);
		it->cur_task = NULL;
	}

	spin_lock_irq(&css_set_lock);

	/* @it may be half-advanced by skips, finish advancing */
	if (it->flags & CSS_TASK_ITER_SKIPPED)
		css_task_iter_advance(it);

	if (it->task_pos) {
		it->cur_task = list_entry(it->task_pos, struct task_struct,
					  cg_list);
		get_task_struct(it->cur_task);
		css_task_iter_advance(it);
	}

	spin_unlock_irq(&css_set_lock);

	return it->cur_task;
}

/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */
void css_task_iter_end(struct css_task_iter *it)
{
	if (it->cur_cset) {
		spin_lock_irq(&css_set_lock);
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
		spin_unlock_irq(&css_set_lock);
	}

	if (it->cur_dcset)
		put_css_set(it->cur_dcset);

	if (it->cur_task)
		put_task_struct(it->cur_task);
}

static void cgroup_procs_release(struct kernfs_open_file *of)
{
	if (of->priv) {
		css_task_iter_end(of->priv);
		kfree(of->priv);
	}
}

static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct css_task_iter *it = of->priv;

	if (pos)
		(*pos)++;

	return css_task_iter_next(it);
}

static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
				  unsigned int iter_flags)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct css_task_iter *it = of->priv;

	/*
	 * When a seq_file is seeked, it's always traversed sequentially
	 * from position 0, so we can simply keep iterating on !0 *pos.
	 */
	if (!it) {
		if (WARN_ON_ONCE((*pos)))
			return ERR_PTR(-EINVAL);

		it = kzalloc(sizeof(*it), GFP_KERNEL);
		if (!it)
			return ERR_PTR(-ENOMEM);
		of->priv = it;
		css_task_iter_start(&cgrp->self, iter_flags, it);
	} else if (!(*pos)) {
		css_task_iter_end(it);
		css_task_iter_start(&cgrp->self, iter_flags, it);
	} else
		return it->cur_task;

	return cgroup_procs_next(s, NULL, NULL);
}

static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
	struct cgroup *cgrp = seq_css(s)->cgroup;

	/*
	 * All processes of a threaded subtree belong to the domain cgroup
	 * of the subtree.  Only threads can be distributed across the
	 * subtree.  Reject reads on cgroup.procs in the subtree proper.
	 * They're always empty anyway.
	 */
	if (cgroup_is_threaded(cgrp))
		return ERR_PTR(-EOPNOTSUPP);

	return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
					    CSS_TASK_ITER_THREADED);
}

static int cgroup_procs_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", task_pid_vnr(v));
	return 0;
}

static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
{
	int ret;
	struct inode *inode;

	lockdep_assert_held(&cgroup_mutex);

	inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
	if (!inode)
		return -ENOMEM;

	ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
	iput(inode);
	return ret;
}

static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
					 struct cgroup *dst_cgrp,
					 struct super_block *sb)
{
	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
	struct cgroup *com_cgrp = src_cgrp;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	/* find the common ancestor */
	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
		com_cgrp = cgroup_parent(com_cgrp);

	/* %current should be authorized to migrate to the common ancestor */
	ret = cgroup_may_write(com_cgrp, sb);
	if (ret)
		return ret;

	/*
	 * If namespaces are delegation boundaries, %current must be able
	 * to see both source and destination cgroups from its namespace.
	 */
	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
	    (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
	     !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
		return -ENOENT;

	return 0;
}

static int cgroup_attach_permissions(struct cgroup *src_cgrp,
				     struct cgroup *dst_cgrp,
				     struct super_block *sb, bool threadgroup)
{
	int ret = 0;

	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
	if (ret)
		return ret;

	ret = cgroup_migrate_vet_dst(dst_cgrp);
	if (ret)
		return ret;

	if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
		ret = -EOPNOTSUPP;

	return ret;
}

static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    bool threadgroup)
{
	struct cgroup *src_cgrp, *dst_cgrp;
	struct task_struct *task;
	ssize_t ret;
	bool locked;

	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!dst_cgrp)
		return -ENODEV;

	task = cgroup_procs_write_start(buf, threadgroup, &locked);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

	/* find the source cgroup */
	spin_lock_irq(&css_set_lock);
	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
	spin_unlock_irq(&css_set_lock);

	/* process and thread migrations follow same delegation rule */
	ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
					of->file->f_path.dentry->d_sb, threadgroup);
	if (ret)
		goto out_finish;

	ret = cgroup_attach_task(dst_cgrp, task, threadgroup);

out_finish:
	cgroup_procs_write_finish(task, locked);
out_unlock:
	cgroup_kn_unlock(of->kn);

	return ret;
}

static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, true) ?: nbytes;
}

static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
{
	return __cgroup_procs_start(s, pos, 0);
}

static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
	{
		.name = "cgroup.type",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cgroup_type_show,
		.write = cgroup_type_write,
	},
	{
		.name = "cgroup.procs",
		.flags = CFTYPE_NS_DELEGATABLE,
		.file_offset = offsetof(struct cgroup, procs_file),
		.release = cgroup_procs_release,
		.seq_start = cgroup_procs_start,
		.seq_next = cgroup_procs_next,
		.seq_show = cgroup_procs_show,
		.write = cgroup_procs_write,
	},
	{
		.name = "cgroup.threads",
		.flags = CFTYPE_NS_DELEGATABLE,
		.release = cgroup_procs_release,
		.seq_start = cgroup_threads_start,
		.seq_next = cgroup_procs_next,
		.seq_show = cgroup_procs_show,
		.write = cgroup_threads_write,
	},
	{
		.name = "cgroup.controllers",
		.seq_show = cgroup_controllers_show,
	},
	{
		.name = "cgroup.subtree_control",
		.flags = CFTYPE_NS_DELEGATABLE,
		.seq_show = cgroup_subtree_control_show,
		.write = cgroup_subtree_control_write,
	},
	{
		.name = "cgroup.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cgroup, events_file),
		.seq_show = cgroup_events_show,
	},
	{
		.name = "cgroup.max.descendants",
		.seq_show = cgroup_max_descendants_show,
		.write = cgroup_max_descendants_write,
	},
	{
		.name = "cgroup.max.depth",
		.seq_show = cgroup_max_depth_show,
		.write = cgroup_max_depth_write,
	},
	{
		.name = "cgroup.stat",
		.seq_show = cgroup_stat_show,
	},
	{
		.name = "cgroup.freeze",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cgroup_freeze_show,
		.write = cgroup_freeze_write,
	},
	{
		.name = "cgroup.kill",
		.flags = CFTYPE_NOT_ON_ROOT,
		.write = cgroup_kill_write,
	},
	{
		.name = "cpu.stat",
		.seq_show = cpu_stat_show,
	},
#ifdef CONFIG_PSI
	{
		.name = "io.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_io_pressure_show,
		.write = cgroup_io_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
	{
		.name = "memory.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_memory_pressure_show,
		.write = cgroup_memory_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
	{
		.name = "cpu.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_cpu_pressure_show,
		.write = cgroup_cpu_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
#endif /* CONFIG_PSI */
	{ }	/* terminate */
};

/*
 * css destruction is a four-stage process, as implemented by the functions
 * below.
 *
 * 1. Destruction starts.  kill_css() initiates killing of the css's
 *    percpu reference and removes its interface files.
 *
 * 2. When the percpu ref is confirmed to be seen as killed on all CPUs,
 *    css_killed_ref_fn() schedules css_killed_work_fn(), which offlines
 *    the css and puts the reference taken by kill_css().
 *
 * 3. When the percpu ref reaches zero, css_release() schedules
 *    css_release_work_fn(), which unlinks the css and queues the
 *    RCU-delayed final free.
 *
 * 4. After an RCU grace period, css_free_rwork_fn() invokes ->css_free()
 *    and releases the remaining resources; for a cgroup's self css, it
 *    frees the cgroup itself.
 */
static void css_free_rwork_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
				struct cgroup_subsys_state, destroy_rwork);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	percpu_ref_exit(&css->refcnt);

	if (ss) {
		/* css free path */
		struct cgroup_subsys_state *parent = css->parent;
		int id = css->id;

		ss->css_free(css);
		cgroup_idr_remove(&ss->css_idr, id);
		cgroup_put(cgrp);

		if (parent)
			css_put(parent);
	} else {
		/* cgroup free path */
		atomic_dec(&cgrp->root->nr_cgrps);
		cgroup1_pidlist_destroy_all(cgrp);
		cancel_work_sync(&cgrp->release_agent_work);

		if (cgroup_parent(cgrp)) {
			/*
			 * We get a ref to the parent, and put the ref when
			 * this cgroup is being freed, so it's guaranteed
			 * that the parent won't be destroyed before its
			 * children.
			 */
			cgroup_put(cgroup_parent(cgrp));
			kernfs_put(cgrp->kn);
			psi_cgroup_free(cgrp);
			cgroup_rstat_exit(cgrp);
			kfree(cgrp);
		} else {
			/*
			 * This is the root cgroup's refcnt reaching zero,
			 * which indicates that the root should be
			 * released.
			 */
			cgroup_destroy_root(cgrp->root);
		}
	}
}

static void css_release_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;

	mutex_lock(&cgroup_mutex);

	css->flags |= CSS_RELEASED;
	list_del_rcu(&css->sibling);

	if (ss) {
		/* css release path */
		if (!list_empty(&css->rstat_css_node)) {
			cgroup_rstat_flush(cgrp);
			list_del_rcu(&css->rstat_css_node);
		}

		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
		if (ss->css_released)
			ss->css_released(css);
	} else {
		struct cgroup *tcgrp;

		/* cgroup release path */
		TRACE_CGROUP_PATH(release, cgrp);

		cgroup_rstat_flush(cgrp);

		spin_lock_irq(&css_set_lock);
		for (tcgrp = cgroup_parent(cgrp); tcgrp;
		     tcgrp = cgroup_parent(tcgrp))
			tcgrp->nr_dying_descendants--;
		spin_unlock_irq(&css_set_lock);

		/*
		 * There are two control paths which try to determine
		 * cgroup from dentry without going through kernfs -
		 * cgroupstats_build() and css_tryget_online_from_dir().
		 * Those are supported by RCU protecting clearing of
		 * cgrp->kn->priv backpointer.
		 */
		if (cgrp->kn)
			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
					 NULL);
	}

	mutex_unlock(&cgroup_mutex);

	INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
	queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
}

static void css_release(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	INIT_WORK(&css->destroy_work, css_release_work_fn);
	queue_work(cgroup_destroy_wq, &css->destroy_work);
}

static void init_and_link_css(struct cgroup_subsys_state *css,
			      struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	lockdep_assert_held(&cgroup_mutex);

	cgroup_get_live(cgrp);

	memset(css, 0, sizeof(*css));
	css->cgroup = cgrp;
	css->ss = ss;
	css->id = -1;
	INIT_LIST_HEAD(&css->sibling);
	INIT_LIST_HEAD(&css->children);
	INIT_LIST_HEAD(&css->rstat_css_node);
	css->serial_nr = css_serial_nr_next++;
	atomic_set(&css->online_cnt, 0);

	if (cgroup_parent(cgrp)) {
		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
		css_get(css->parent);
	}

	if (ss->css_rstat_flush)
		list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);

	BUG_ON(cgroup_css(cgrp, ss));
}

/* invoke ->css_online() on a new css and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online)
		ret = ss->css_online(css);
	if (!ret) {
		css->flags |= CSS_ONLINE;
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

		atomic_inc(&css->online_cnt);
		if (css->parent)
			atomic_inc(&css->parent->online_cnt);
	}
	return ret;
}

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;

	lockdep_assert_held(&cgroup_mutex);

	if (!(css->flags & CSS_ONLINE))
		return;

	if (ss->css_offline)
		ss->css_offline(css);

	css->flags &= ~CSS_ONLINE;
	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

	wake_up_all(&css->cgroup->offline_waitq);
}

/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success and ERR_PTR(-errno) on
 * failure.
 */
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
	struct cgroup_subsys_state *css;
	int err;

	lockdep_assert_held(&cgroup_mutex);

	css = ss->css_alloc(parent_css);
	if (!css)
		css = ERR_PTR(-ENOMEM);
	if (IS_ERR(css))
		return css;

	init_and_link_css(css, ss, cgrp);

	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
	if (err)
		goto err_free_css;

	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
	if (err < 0)
		goto err_free_css;
	css->id = err;

	/* @css is ready to be brought online now, make it visible */
	list_add_tail_rcu(&css->sibling, &parent_css->children);
	cgroup_idr_replace(&ss->css_idr, css, css->id);

	err = online_css(css);
	if (err)
		goto err_list_del;

	return css;

err_list_del:
	list_del_rcu(&css->sibling);
err_free_css:
	list_del_rcu(&css->rstat_css_node);
	INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
	queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
	return ERR_PTR(err);
}
5285static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5286 umode_t mode)
5287{
5288 struct cgroup_root *root = parent->root;
5289 struct cgroup *cgrp, *tcgrp;
5290 struct kernfs_node *kn;
5291 int level = parent->level + 1;
5292 int ret;
5293
5294
5295 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5296 GFP_KERNEL);
5297 if (!cgrp)
5298 return ERR_PTR(-ENOMEM);
5299
5300 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5301 if (ret)
5302 goto out_free_cgrp;
5303
5304 ret = cgroup_rstat_init(cgrp);
5305 if (ret)
5306 goto out_cancel_ref;
5307
5308
5309 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5310 if (IS_ERR(kn)) {
5311 ret = PTR_ERR(kn);
5312 goto out_stat_exit;
5313 }
5314 cgrp->kn = kn;
5315
5316 init_cgroup_housekeeping(cgrp);
5317
5318 cgrp->self.parent = &parent->self;
5319 cgrp->root = root;
5320 cgrp->level = level;
5321
5322 ret = psi_cgroup_alloc(cgrp);
5323 if (ret)
5324 goto out_kernfs_remove;
5325
5326 ret = cgroup_bpf_inherit(cgrp);
5327 if (ret)
5328 goto out_psi_free;
5329
5330
5331
5332
5333
5334 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5335 if (cgrp->freezer.e_freeze) {
5336
5337
5338
5339
5340
5341
5342 set_bit(CGRP_FREEZE, &cgrp->flags);
5343 set_bit(CGRP_FROZEN, &cgrp->flags);
5344 }
5345
5346 spin_lock_irq(&css_set_lock);
5347 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5348 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
5349
5350 if (tcgrp != cgrp) {
5351 tcgrp->nr_descendants++;
5352
5353
5354
5355
5356
5357
5358 if (cgrp->freezer.e_freeze)
5359 tcgrp->freezer.nr_frozen_descendants++;
5360 }
5361 }
5362 spin_unlock_irq(&css_set_lock);
5363
5364 if (notify_on_release(parent))
5365 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5366
5367 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5368 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5369
5370 cgrp->self.serial_nr = css_serial_nr_next++;
5371
5372
5373 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5374 atomic_inc(&root->nr_cgrps);
5375 cgroup_get_live(parent);
5376
5377
5378
5379
5380
5381 if (!cgroup_on_dfl(cgrp))
5382 cgrp->subtree_control = cgroup_control(cgrp);
5383
5384 cgroup_propagate_control(cgrp);
5385
5386 return cgrp;
5387
5388out_psi_free:
5389 psi_cgroup_free(cgrp);
5390out_kernfs_remove:
5391 kernfs_remove(cgrp->kn);
5392out_stat_exit:
5393 cgroup_rstat_exit(cgrp);
5394out_cancel_ref:
5395 percpu_ref_exit(&cgrp->self.refcnt);
5396out_free_cgrp:
5397 kfree(cgrp);
5398 return ERR_PTR(ret);
5399}
5400
5401static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5402{
5403 struct cgroup *cgroup;
5404 int ret = false;
5405 int level = 1;
5406
5407 lockdep_assert_held(&cgroup_mutex);
5408
5409 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5410 if (cgroup->nr_descendants >= cgroup->max_descendants)
5411 goto fail;
5412
5413 if (level > cgroup->max_depth)
5414 goto fail;
5415
5416 level++;
5417 }
5418
5419 ret = true;
5420fail:
5421 return ret;
5422}

int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
	struct cgroup *parent, *cgrp;
	int ret;

	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
	if (strchr(name, '\n'))
		return -EINVAL;

	parent = cgroup_kn_lock_live(parent_kn, false);
	if (!parent)
		return -ENODEV;

	if (!cgroup_check_hierarchy_limits(parent)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	cgrp = cgroup_create(parent, name, mode);
	if (IS_ERR(cgrp)) {
		ret = PTR_ERR(cgrp);
		goto out_unlock;
	}

	/*
	 * This extra ref will be put in cgroup_free_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */
	kernfs_get(cgrp->kn);

	ret = cgroup_kn_set_ugid(cgrp->kn);
	if (ret)
		goto out_destroy;

	ret = css_populate_dir(&cgrp->self);
	if (ret)
		goto out_destroy;

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		goto out_destroy;

	TRACE_CGROUP_PATH(mkdir, cgrp);

	/* let's create and online css's */
	kernfs_activate(cgrp->kn);

	ret = 0;
	goto out_unlock;

out_destroy:
	cgroup_destroy_locked(cgrp);
out_unlock:
	cgroup_kn_unlock(parent_kn);
	return ret;
}
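
/*
 * Illustrative only: cgroup_mkdir() is reached through the kernfs syscall
 * ops (see cgroup_kf_syscall_ops below) when userspace creates a directory
 * in a cgroup mount, e.g.
 *
 *	mkdir /sys/fs/cgroup/mygroup
 */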

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 */
static void css_killed_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);

	mutex_lock(&cgroup_mutex);

	do {
		offline_css(css);
		css_put(css);
		/* @css can't go away while we're holding cgroup_mutex */
		css = css->parent;
	} while (css && atomic_dec_and_test(&css->online_cnt));

	mutex_unlock(&cgroup_mutex);
}

/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_destroy_wq, &css->destroy_work);
	}
}

/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */
static void kill_css(struct cgroup_subsys_state *css)
{
	lockdep_assert_held(&cgroup_mutex);

	if (css->flags & CSS_DYING)
		return;

	css->flags |= CSS_DYING;

	/*
	 * This must happen before css is disassociated with its cgroup.
	 * See seq_css() for details.
	 */
	css_clear_dir(css);

	/*
	 * Killing would put the base ref, but we need to keep it alive
	 * until after ->css_offline().
	 */
	css_get(css);

	/*
	 * cgroup core guarantees that, by the time ->css_offline() is
	 * invoked, no new css reference will be given out via
	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
	 * proceed to offlining css's because percpu_ref_kill() doesn't
	 * guarantee that the ref is seen as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
	struct cgroup_subsys_state *css;
	struct cgrp_cset_link *link;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * Only migration can raise populated from zero and we're already
	 * holding cgroup_mutex.
	 */
	if (cgroup_is_populated(cgrp))
		return -EBUSY;

	/*
	 * Make sure there's no live children.  We can't test emptiness of
	 * ->self.children as dead children linger on it while being
	 * drained; otherwise, "rmdir parent/child parent" may fail.
	 */
	if (css_has_online_children(&cgrp->self))
		return -EBUSY;

	/*
	 * Mark @cgrp and the associated csets dead.  The former prevents
	 * further task migration and child creation by disabling
	 * cgroup_lock_live_group().  The latter makes the csets ignored by
	 * the migration path.
	 */
	cgrp->self.flags &= ~CSS_ONLINE;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		link->cset->dead = true;
	spin_unlock_irq(&css_set_lock);

	/* initiate massacre of all css's */
	for_each_css(css, ssid, cgrp)
		kill_css(css);

	/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
	css_clear_dir(&cgrp->self);
	kernfs_remove(cgrp->kn);

	if (parent && cgroup_is_threaded(cgrp))
		parent->nr_threaded_children--;

	spin_lock_irq(&css_set_lock);
	for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
		tcgrp->nr_descendants--;
		tcgrp->nr_dying_descendants++;
		/*
		 * If the dying cgroup is frozen, decrease frozen descendants
		 * counters of ancestor cgroups.
		 */
		if (test_bit(CGRP_FROZEN, &cgrp->flags))
			tcgrp->freezer.nr_frozen_descendants--;
	}
	spin_unlock_irq(&css_set_lock);

	cgroup1_check_for_release(parent);

	cgroup_bpf_offline(cgrp);

	/* put the base reference */
	percpu_ref_kill(&cgrp->self.refcnt);

	return 0;
}

int cgroup_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp;
	int ret = 0;

	cgrp = cgroup_kn_lock_live(kn, false);
	if (!cgrp)
		return 0;

	ret = cgroup_destroy_locked(cgrp);
	if (!ret)
		TRACE_CGROUP_PATH(rmdir, cgrp);

	cgroup_kn_unlock(kn);
	return ret;
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.show_options		= cgroup_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

	/*
	 * Root csses are never destroyed and we can't initialize
	 * percpu_ref during early init.  Disable refcnting.
	 */
	css->flags |= CSS_NO_REF;

	if (early) {
		/* allocation can't be done safely during early init */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	/*
	 * Update init_css_set to contain a subsys pointer to this state.
	 * Since the subsystem is newly registered, all tasks and hence
	 * the init_css_set is in the subsystem's root cgroup.
	 */
	init_css_set.subsys[ss->id] = css;

	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
	have_release_callback |= (bool)ss->release << ss->id;
	have_canfork_callback |= (bool)ss->can_fork << ss->id;

	/*
	 * At system boot, before all subsystems have been registered,
	 * no tasks have been forked, so we don't need to invoke fork
	 * callbacks here.
	 */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	static struct cgroup_fs_context __initdata ctx;
	struct cgroup_subsys *ss;
	int i;

	ctx.root = &cgrp_dfl_root;
	init_cgroup_root(&ctx);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	for_each_subsys(ss, i) {
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];

		if (ss->early_init)
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

	cgroup_rstat_boot();

	/*
	 * The latency of the synchronize_rcu() is too high for cgroups,
	 * avoid it at the cost of forcing all readers into the slow path.
	 */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (!cgroup_ssid_enabled(ssid))
			continue;

		if (cgroup1_ssid_disabled(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		/* implicit controllers must be threaded too */
		WARN_ON(ss->implicit_on_dfl && !ss->threaded);

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->threaded)
			cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);

		mutex_lock(&cgroup_mutex);
		css_populate_dir(init_css_set.subsys[ssid]);
		mutex_unlock(&cgroup_mutex);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
	WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

	return 0;
}

static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 * Use 1 for @max_active.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
	BUG_ON(!cgroup_destroy_wq);
	return 0;
}
core_initcall(cgroup_wq_init);

void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
	struct kernfs_node *kn;

	kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
	if (!kn)
		return;
	kernfs_path(kn, buf, buflen);
	kernfs_put(kn);
}

/*
 * cgroup_get_from_id - get the cgroup associated with a cgroup id
 * @id: the cgroup id
 *
 * On success return the cgroup, on failure return NULL.
 */
struct cgroup *cgroup_get_from_id(u64 id)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp = NULL;

	mutex_lock(&cgroup_mutex);
	kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
	if (!kn)
		goto out_unlock;

	cgrp = kn->priv;
	if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
		cgrp = NULL;
	kernfs_put(kn);
out_unlock:
	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

static struct cgroup *cgroup_get_from_file(struct file *f)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;
	struct file *f;

	if (kargs->flags & CLONE_INTO_CGROUP)
		mutex_lock(&cgroup_mutex);

	cgroup_threadgroup_change_begin(current);

	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
		kargs->cset = cset;
		return 0;
	}

	f = fget_raw(kargs->cgroup);
	if (!f) {
		ret = -EBADF;
		goto err;
	}
	sb = f->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(f);
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Verify that the target cgroup is writable for us.  This is
	 * usually done by the vfs layer but since we're not going through
	 * the vfs layer here we need to do it "by hand".
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD));
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

	put_css_set(cset);
	fput(f);
	kargs->cgrp = dst_cgrp;
	return ret;

err:
	cgroup_threadgroup_change_end(current);
	mutex_unlock(&cgroup_mutex);
	if (f)
		fput(f);
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}
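
/*
 * Illustrative userspace counterpart (hypothetical values): the
 * CLONE_INTO_CGROUP path above is driven by clone3() with the fd of a
 * cgroup2 directory, e.g.
 *
 *	struct clone_args args = {
 *		.flags		= CLONE_INTO_CGROUP,
 *		.exit_signal	= SIGCHLD,
 *		.cgroup		= cgroup_fd,	-- fd of a cgroup2 dir
 *	};
 *	pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
 */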

/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Drop references to the prepared css_set and target cgroup if
 * CLONE_INTO_CGROUP was requested.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	cgroup_threadgroup_change_end(current);

	if (kargs->flags & CLONE_INTO_CGROUP) {
		struct cgroup *cgrp = kargs->cgrp;
		struct css_set *cset = kargs->cset;

		mutex_unlock(&cgroup_mutex);

		if (cset) {
			put_css_set(cset);
			kargs->cset = NULL;
		}

		if (cgrp) {
			cgroup_put(cgrp);
			kargs->cgrp = NULL;
		}
	}
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This prepares a new css_set for the child process which the child will
 * be attached to in cgroup_post_fork().
 * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
 * callback returns an error, the fork aborts with that error code. This
 * allows for a cgroup subsystem to conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	ret = cgroup_css_set_fork(kargs);
	if (ret)
		return ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child, kargs->cset);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child, kargs->cset);
	}

	cgroup_css_set_put_fork(kargs);

	return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child,
			struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child, kargs->cset);

	cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.
 */
void cgroup_post_fork(struct task_struct *child,
		      struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	unsigned long cgrp_flags = 0;
	bool kill = false;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	cset = kargs->cset;
	kargs->cset = NULL;

	spin_lock_irq(&css_set_lock);

	/* init tasks are special, only link regular threads */
	if (likely(child->pid)) {
		if (kargs->cgrp)
			cgrp_flags = kargs->cgrp->flags;
		else
			cgrp_flags = cset->dfl_cgrp->flags;

		WARN_ON_ONCE(!list_empty(&child->cg_list));
		cset->nr_tasks++;
		css_set_move_task(child, NULL, cset, false);
	} else {
		put_css_set(cset);
		cset = NULL;
	}

	if (!(child->flags & PF_KTHREAD)) {
		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
			/*
			 * If the cgroup has to be frozen, the new task has
			 * too.  Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
			 * get the task into the frozen state.
			 */
			spin_lock(&child->sighand->siglock);
			WARN_ON_ONCE(child->frozen);
			child->jobctl |= JOBCTL_TRAP_FREEZE;
			spin_unlock(&child->sighand->siglock);

			/*
			 * Calling cgroup_update_frozen() isn't required here,
			 * because it will be called anyway a bit later from
			 * do_freezer_trap().  So we avoid cgroup's transient
			 * switch from the frozen state and back.
			 */
		}

		/*
		 * If the cgroup is to be killed notice it now and take the
		 * child down right after we finished preparing it for
		 * userspace.
		 */
		kill = test_bit(CGRP_KILL, &cgrp_flags);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();

	/* Make the new cset the root_cset of the new cgroup namespace. */
	if (kargs->flags & CLONE_NEWCGROUP) {
		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

		get_css_set(cset);
		child->nsproxy->cgroup_ns->root_cset = cset;
		put_css_set(rcset);
	}

	/* Cgroup has to be killed so take down child immediately. */
	if (unlikely(kill))
		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

	cgroup_css_set_put_fork(kargs);
}
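
/*
 * Illustrative only: the CGRP_KILL flag tested above is set by writing
 * "1" to a cgroup2 "cgroup.kill" file (where the kernel provides it), e.g.
 *
 *	echo 1 > /sys/fs/cgroup/mygroup/cgroup.kill
 *
 * so that tasks forked concurrently into a cgroup being killed are taken
 * down right after they become visible to userspace.
 */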

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	spin_lock_irq(&css_set_lock);

	WARN_ON_ONCE(list_empty(&tsk->cg_list));
	cset = task_css_set(tsk);
	css_set_move_task(tsk, cset, NULL, false);
	list_add_tail(&tsk->cg_list, &cset->dying_tasks);
	cset->nr_tasks--;

	WARN_ON_ONCE(cgroup_task_frozen(tsk));
	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
		cgroup_update_frozen(task_dfl_cgroup(tsk));

	spin_unlock_irq(&css_set_lock);

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

void cgroup_release(struct task_struct *task)
{
	struct cgroup_subsys *ss;
	int ssid;

	do_each_subsys_mask(ss, ssid, have_release_callback) {
		ss->release(task);
	} while_each_subsys_mask();

	spin_lock_irq(&css_set_lock);
	css_set_skip_task_iters(task_css_set(task), task);
	list_del_init(&task->cg_list);
	spin_unlock_irq(&css_set_lock);
}

void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);

	put_css_set(cset);
}

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			static_branch_disable(cgroup_subsys_enabled_key[i]);
			pr_info("Disabling %s control group subsystem\n",
				ss->name);
		}

		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
			if (strcmp(token, cgroup_opt_feature_names[i]))
				continue;
			cgroup_feature_disable_mask |= 1 << i;
			pr_info("Disabling %s control group feature\n",
				cgroup_opt_feature_names[i]);
			break;
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
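
/*
 * Illustrative only: controllers and optional features are disabled on
 * the kernel command line, e.g.
 *
 *	cgroup_disable=memory,io
 */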

void __init __weak enable_debug_cgroup(void) { }

static int __init enable_cgroup_debug(char *str)
{
	cgroup_debug = true;
	enable_debug_cgroup();
	return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's a valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * refcount and return it.  Returns ERR_PTR(-ENOENT) if @path doesn't
 * exist and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get_live(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
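
/*
 * Minimal usage sketch (hypothetical caller and path):
 *
 *	struct cgroup *cgrp = cgroup_get_from_path("/user.slice");
 *
 *	if (!IS_ERR(cgrp)) {
 *		...
 *		cgroup_put(cgrp);
 *	}
 */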

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * Find the cgroup from a fd which should be obtained by opening a cgroup
 * directory.  Returns a pointer to the cgroup on success.  ERR_PTR is
 * returned if the cgroup cannot be found.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	cgrp = cgroup_get_from_file(f);
	fput(f);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
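
/*
 * Illustrative userspace pairing (hypothetical path): the fd accepted
 * here is typically obtained by opening a cgroup2 directory,
 *
 *	int fd = open("/sys/fs/cgroup/mygroup", O_RDONLY | O_DIRECTORY);
 *
 * and then handed to the kernel through interfaces such as
 * bpf(BPF_PROG_ATTACH, ...) or clone3() with CLONE_INTO_CGROUP.
 */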

static u64 power_of_ten(int power)
{
	u64 v = 1;

	while (power--)
		v *= 10;
	return v;
}

/**
 * cgroup_parse_float - parse a floating number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with decimal point right shifted @dec_shift times.  For example, if
 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12346 (the
 * fractional part is rounded to the closest value).  Returns 0 on success,
 * -errno otherwise.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
	s64 whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
		return -EINVAL;
	if (frac < 0)
		return -EINVAL;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < dec_shift)
		frac *= power_of_ten(dec_shift - flen);
	else
		frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));

	*v = whole * power_of_ten(dec_shift) + frac;
	return 0;
}
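
/*
 * Worked example (illustrative): cgroup_parse_float("12.3456", 2, &v)
 * parses whole=12 and frac=3456 over four fractional digits; since
 * flen (4) exceeds dec_shift (2), frac is divided by 10^2 with rounding,
 * giving 35, so *v becomes 12 * 10^2 + 35 = 1235.  The cpu controller's
 * uclamp interface files, for example, use dec_shift=2 to accept
 * percentages with two decimal places.
 */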

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgroup;

	rcu_read_lock();
	/* Don't associate the sock with unrelated interrupted task's cgroup. */
	if (in_interrupt()) {
		cgroup = &cgrp_dfl_root.cgrp;
		cgroup_get(cgroup);
		goto out;
	}

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			cgroup = cset->dfl_cgrp;
			break;
		}
		cpu_relax();
	}
out:
	skcd->cgroup = cgroup;
	cgroup_bpf_get(cgroup);
	rcu_read_unlock();
}

void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgrp = sock_cgroup_ptr(skcd);

	/*
	 * We might be cloning a socket which is left in an empty
	 * cgroup and the cgroup might have already been rmdir'd.
	 * Don't use cgroup_get_live().
	 */
	cgroup_get(cgrp);
	cgroup_bpf_get(cgrp);
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgrp = sock_cgroup_ptr(skcd);

	cgroup_bpf_put(cgrp);
	cgroup_put(cgrp);
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp,
		      struct bpf_prog *prog, struct bpf_prog *replace_prog,
		      struct bpf_cgroup_link *link,
		      enum bpf_attach_type type,
		      u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		     union bpf_attr __user *uattr)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_query(cgrp, attr, uattr);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
#endif	/* CONFIG_CGROUP_BPF */

#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
				      ssize_t size, const char *prefix)
{
	struct cftype *cft;
	ssize_t ret = 0;

	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

		if (WARN_ON(ret >= size))
			break;
	}

	return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	struct cgroup_subsys *ss;
	int ssid;
	ssize_t ret = 0;

	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
				     NULL);

	for_each_subsys(ss, ssid)
		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
					      PAGE_SIZE - ret,
					      cgroup_subsys_name[ssid]);

	return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	return snprintf(buf, PAGE_SIZE,
			"nsdelegate\n"
			"memory_localevents\n"
			"memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
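
/*
 * Illustrative only: the features listed above are selected as cgroup2
 * mount options, e.g.
 *
 *	mount -t cgroup2 -o nsdelegate,memory_recursiveprot none /sys/fs/cgroup
 */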

static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};

static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};

static int __init cgroup_sysfs_init(void)
{
	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);

#endif	/* CONFIG_SYSFS */