1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include "cgroup-internal.h"
32
33#include <linux/cred.h>
34#include <linux/errno.h>
35#include <linux/init_task.h>
36#include <linux/kernel.h>
37#include <linux/magic.h>
38#include <linux/mutex.h>
39#include <linux/mount.h>
40#include <linux/pagemap.h>
41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
43#include <linux/sched.h>
44#include <linux/sched/task.h>
45#include <linux/slab.h>
46#include <linux/spinlock.h>
47#include <linux/percpu-rwsem.h>
48#include <linux/string.h>
49#include <linux/hashtable.h>
50#include <linux/idr.h>
51#include <linux/kthread.h>
52#include <linux/atomic.h>
53#include <linux/cpuset.h>
54#include <linux/proc_ns.h>
55#include <linux/nsproxy.h>
56#include <linux/file.h>
57#include <linux/fs_parser.h>
58#include <linux/sched/cputime.h>
59#include <linux/psi.h>
60#include <net/sock.h>
61
62#define CREATE_TRACE_POINTS
63#include <trace/events/cgroup.h>
64
/*
 * Size of the buffers that hold a cgroup file name: the two component
 * name limits plus two extra bytes (presumably the '.' separator and the
 * terminating NUL - confirm against cgroup_file_name()).
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/* HZ / 100 jiffies, rounded up; by its name, the minimum file-notify interval. */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
69
70
71
72
73
74
75
76
77
78
79
/*
 * The two global locks that the functions in this file assert with
 * lockdep_assert_held(): cgroup_mutex (sleeping) and css_set_lock
 * (spinlock, taken with spin_lock_irq() here).  Both have external
 * linkage.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

/* The locks are exported to GPL modules only when CONFIG_PROVE_RCU is set. */
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Path buffer for cgroup tracing and, by its name, the lock guarding it.
 * NOTE(review): the users are outside this chunk - confirm.
 */
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
/* Debug switch; NOTE(review): set and read outside this chunk. */
bool cgroup_debug __read_mostly;
91
92
93
94
95
/*
 * Serializes the idr_alloc()/idr_replace()/idr_remove() calls made by the
 * cgroup_idr_*() wrappers; always taken with bottom halves disabled.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Held while cgroup_rm_file() clears cgroup_file->kn.
 * NOTE(review): the readers of ->kn are outside this chunk.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/* Per-cpu rwsem; NOTE(review): its users are outside this chunk. */
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
105
/*
 * Lockdep check: warns unless the caller is inside an RCU read-side
 * section or holds cgroup_mutex.  Note that the expansion already ends
 * with a ';'.
 */
#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");
110
111
112
113
114
115
116
/*
 * Workqueue for cgroup destruction work (by its name).
 * NOTE(review): allocation and queueing happen outside this chunk.
 */
static struct workqueue_struct *cgroup_destroy_wq;
118
119
/*
 * Table of subsystem descriptors indexed by <name>_cgrp_id.  The entries
 * are generated by expanding SUBSYS() once per line of
 * <linux/cgroup_subsys.h>.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* Parallel table holding each subsystem's name as a string literal. */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
132
133
/*
 * For every subsystem, define and export two static keys, both initially
 * true: <name>_cgrp_subsys_enabled_key and <name>_cgrp_subsys_on_dfl_key.
 */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/* Lookup table: subsystem id -> its "enabled" key (see cgroup_ssid_enabled()). */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* Lookup table: subsystem id -> its "on default hierarchy" key (toggled in rebind_subsystems()). */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
153
/* Statically allocated per-cpu rstat storage for the default root's cgroup. */
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/*
 * The default hierarchy root.  cgroup_on_dfl() identifies default-hierarchy
 * cgroups by comparing their ->root against this object.
 */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
159
160
161
162
163
/* NOTE(review): set and read outside this chunk; meaning not visible here. */
static bool cgrp_dfl_visible;

/* Subsystem bits removed from the default root's control mask in cgroup_control(). */
static u16 cgrp_dfl_inhibit_ss_mask;

/*
 * Subsystem bits that are likewise hidden from the default root's control
 * mask but always OR'ed in by cgroup_calc_subtree_ss_mask().
 */
static u16 cgrp_dfl_implicit_ss_mask;

/* Subsystem bits that remain available to threaded cgroups (see cgroup_control()). */
static u16 cgrp_dfl_threaded_ss_mask;

/*
 * List of hierarchy roots and a counter.  The counter is decremented when
 * a root is unlinked in cgroup_destroy_root() and is the number of links
 * find_css_set() allocates for a new css_set.
 */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* Allocator for cgroup_root->hierarchy_id; accessed under cgroup_mutex. */
static DEFINE_IDR(cgroup_hierarchy_idr);
181
182
183
184
185
186
187
188
/* Next css serial number, starting at 1.  NOTE(review): consumed outside this chunk. */
static u64 css_serial_nr_next = 1;

/*
 * One u16 mask per optional subsystem callback (fork, exit, release,
 * can_fork) - presumably one bit per subsystem id; confirm where the
 * masks are populated.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
199
200
/*
 * The initial cgroup namespace: owned by init_user_ns, rooted at
 * init_css_set, with a fixed proc inode number and a reference count
 * that starts at 2.
 */
struct cgroup_namespace init_cgroup_ns = {
	.ns.count	= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};
208
/* Forward declarations of objects and helpers defined later in this file. */
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
224
225
226
227
228
229
230
231
232
233bool cgroup_ssid_enabled(int ssid)
234{
235 if (CGROUP_SUBSYS_COUNT == 0)
236 return false;
237
238 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
239}
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288bool cgroup_on_dfl(const struct cgroup *cgrp)
289{
290 return cgrp->root == &cgrp_dfl_root;
291}
292
293
294static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
295 gfp_t gfp_mask)
296{
297 int ret;
298
299 idr_preload(gfp_mask);
300 spin_lock_bh(&cgroup_idr_lock);
301 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
302 spin_unlock_bh(&cgroup_idr_lock);
303 idr_preload_end();
304 return ret;
305}
306
307static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
308{
309 void *ret;
310
311 spin_lock_bh(&cgroup_idr_lock);
312 ret = idr_replace(idr, ptr, id);
313 spin_unlock_bh(&cgroup_idr_lock);
314 return ret;
315}
316
317static void cgroup_idr_remove(struct idr *idr, int id)
318{
319 spin_lock_bh(&cgroup_idr_lock);
320 idr_remove(idr, id);
321 spin_unlock_bh(&cgroup_idr_lock);
322}
323
324static bool cgroup_has_tasks(struct cgroup *cgrp)
325{
326 return cgrp->nr_populated_csets;
327}
328
329bool cgroup_is_threaded(struct cgroup *cgrp)
330{
331 return cgrp->dom_cgrp != cgrp;
332}
333
334
335static bool cgroup_is_mixable(struct cgroup *cgrp)
336{
337
338
339
340
341
342 return !cgroup_parent(cgrp);
343}
344
345
346static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
347{
348
349 if (cgroup_is_mixable(cgrp))
350 return true;
351
352
353 if (cgroup_is_threaded(cgrp))
354 return false;
355
356
357 if (cgrp->nr_populated_domain_children)
358 return false;
359
360
361 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
362 return false;
363
364 return true;
365}
366
367
368bool cgroup_is_thread_root(struct cgroup *cgrp)
369{
370
371 if (cgroup_is_threaded(cgrp))
372 return false;
373
374
375 if (cgrp->nr_threaded_children)
376 return true;
377
378
379
380
381
382 if (cgroup_has_tasks(cgrp) &&
383 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
384 return true;
385
386 return false;
387}
388
389
390static bool cgroup_is_valid_domain(struct cgroup *cgrp)
391{
392
393 if (cgroup_is_threaded(cgrp))
394 return false;
395
396
397 while ((cgrp = cgroup_parent(cgrp))) {
398 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
399 return false;
400 if (cgroup_is_threaded(cgrp))
401 return false;
402 }
403
404 return true;
405}
406
407
408static u16 cgroup_control(struct cgroup *cgrp)
409{
410 struct cgroup *parent = cgroup_parent(cgrp);
411 u16 root_ss_mask = cgrp->root->subsys_mask;
412
413 if (parent) {
414 u16 ss_mask = parent->subtree_control;
415
416
417 if (cgroup_is_threaded(cgrp))
418 ss_mask &= cgrp_dfl_threaded_ss_mask;
419 return ss_mask;
420 }
421
422 if (cgroup_on_dfl(cgrp))
423 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
424 cgrp_dfl_implicit_ss_mask);
425 return root_ss_mask;
426}
427
428
429static u16 cgroup_ss_mask(struct cgroup *cgrp)
430{
431 struct cgroup *parent = cgroup_parent(cgrp);
432
433 if (parent) {
434 u16 ss_mask = parent->subtree_ss_mask;
435
436
437 if (cgroup_is_threaded(cgrp))
438 ss_mask &= cgrp_dfl_threaded_ss_mask;
439 return ss_mask;
440 }
441
442 return cgrp->root->subsys_mask;
443}
444
445
446
447
448
449
450
451
452
453
454
455
456static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
457 struct cgroup_subsys *ss)
458{
459 if (ss)
460 return rcu_dereference_check(cgrp->subsys[ss->id],
461 lockdep_is_held(&cgroup_mutex));
462 else
463 return &cgrp->self;
464}
465
466
467
468
469
470
471
472
473
474static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
475 struct cgroup_subsys *ss)
476{
477 struct cgroup_subsys_state *css;
478
479 rcu_read_lock();
480 css = cgroup_css(cgrp, ss);
481 if (css && !css_tryget_online(css))
482 css = NULL;
483 rcu_read_unlock();
484
485 return css;
486}
487
488
489
490
491
492
493
494
495
496
497
498static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
499 struct cgroup_subsys *ss)
500{
501 lockdep_assert_held(&cgroup_mutex);
502
503 if (!ss)
504 return &cgrp->self;
505
506
507
508
509
510 while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
511 cgrp = cgroup_parent(cgrp);
512 if (!cgrp)
513 return NULL;
514 }
515
516 return cgroup_css(cgrp, ss);
517}
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
533 struct cgroup_subsys *ss)
534{
535 struct cgroup_subsys_state *css;
536
537 do {
538 css = cgroup_css(cgrp, ss);
539
540 if (css)
541 return css;
542 cgrp = cgroup_parent(cgrp);
543 } while (cgrp);
544
545 return init_css_set.subsys[ss->id];
546}
547
548
549
550
551
552
553
554
555
556
557
558
559struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
560 struct cgroup_subsys *ss)
561{
562 struct cgroup_subsys_state *css;
563
564 rcu_read_lock();
565
566 do {
567 css = cgroup_css(cgrp, ss);
568
569 if (css && css_tryget_online(css))
570 goto out_unlock;
571 cgrp = cgroup_parent(cgrp);
572 } while (cgrp);
573
574 css = init_css_set.subsys[ss->id];
575 css_get(css);
576out_unlock:
577 rcu_read_unlock();
578 return css;
579}
580
581static void cgroup_get_live(struct cgroup *cgrp)
582{
583 WARN_ON_ONCE(cgroup_is_dead(cgrp));
584 css_get(&cgrp->self);
585}
586
587
588
589
590
591
592int __cgroup_task_count(const struct cgroup *cgrp)
593{
594 int count = 0;
595 struct cgrp_cset_link *link;
596
597 lockdep_assert_held(&css_set_lock);
598
599 list_for_each_entry(link, &cgrp->cset_links, cset_link)
600 count += link->cset->nr_tasks;
601
602 return count;
603}
604
605
606
607
608
609int cgroup_task_count(const struct cgroup *cgrp)
610{
611 int count;
612
613 spin_lock_irq(&css_set_lock);
614 count = __cgroup_task_count(cgrp);
615 spin_unlock_irq(&css_set_lock);
616
617 return count;
618}
619
620struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
621{
622 struct cgroup *cgrp = of->kn->parent->priv;
623 struct cftype *cft = of_cft(of);
624
625
626
627
628
629
630
631
632
633 if (cft->ss)
634 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
635 else
636 return &cgrp->self;
637}
638EXPORT_SYMBOL_GPL(of_css);
639
640
641
642
643
644
645
646
647
/**
 * for_each_css - iterate over the non-NULL csses of a cgroup
 * @css: iteration cursor, set to each css in turn
 * @ssid: integer index variable, runs over all subsystem ids
 * @cgrp: cgroup whose ->subsys[] is walked
 *
 * Slots that read as NULL are skipped.  The read uses
 * rcu_dereference_check() and accepts cgroup_mutex being held.  The
 * trailing "else" makes the statement following the macro the loop body.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
654
655
656
657
658
659
660
661
662
/**
 * for_each_e_css - iterate over the effective csses of a cgroup
 * @css: iteration cursor
 * @ssid: integer index variable, runs over all subsystem ids
 * @cgrp: cgroup of interest
 *
 * For each subsystem id the css is obtained with cgroup_e_css_by_mask()
 * (which asserts cgroup_mutex); ids for which it returns NULL are
 * skipped.
 */
#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else
669
670
671
672
673
674
675
676
677
678
/**
 * do_each_subsys_mask - visit every subsystem whose bit is set in a mask
 * @ss: iteration cursor, set to cgroup_subsys[@ssid]
 * @ssid: integer index variable
 * @ss_mask: bitmask of subsystem ids to visit
 *
 * Opens a do { ... } while (false) wrapper that must be closed with
 * while_each_subsys_mask().  When no subsystem is compiled in, @ssid is
 * set to 0 and the body is skipped.  Because of the wrapper, a "break"
 * in the body only leaves the inner for_each_set_bit() loop.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

/* Closes the scopes opened by do_each_subsys_mask(). */
#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
693
694
/*
 * Iterate over the children of @cgrp, skipping those for which
 * cgroup_is_dead() is true.  cgroup_mutex must be held; this is asserted
 * on every iteration.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
701
702
/*
 * Walk the descendants of @cgrp with css_for_each_descendant_pre() on its
 * self css.  @d_css is the css cursor and @dsct is set to the cgroup of
 * each visited css.  Dead cgroups are skipped; cgroup_mutex is asserted
 * on every iteration.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
710
711
/*
 * Same as cgroup_for_each_live_descendant_pre() but built on
 * css_for_each_descendant_post().
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
719
720
721
722
723
724
725
726
/*
 * The statically allocated initial css_set.  It starts with one
 * reference, is its own domain css_set, has all list heads empty and has
 * the default root's cgroup as its dfl_cgrp.  cgroup_e_css() and
 * cgroup_get_e_css() fall back to its ->subsys[] entries.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * Set statically so that the field is valid before any css_set is
	 * linked at run time.  NOTE(review): the motivating early-boot user
	 * is not visible in this chunk.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};
747
/*
 * Number of css_sets; starts at 1 (presumably accounting for
 * init_css_set).  Incremented in find_css_set() and decremented in
 * put_css_set_locked().
 */
static int css_set_count	= 1;
749
750static bool css_set_threaded(struct css_set *cset)
751{
752 return cset->dom_cset != cset;
753}
754
755
756
757
758
759
760
761
762
763
764static bool css_set_populated(struct css_set *cset)
765{
766 lockdep_assert_held(&css_set_lock);
767
768 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
769}
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
/**
 * cgroup_update_populated - propagate a populated-count change upwards
 * @cgrp: cgroup whose css_set population changed
 * @populated: true for a +1 adjustment, false for -1
 *
 * On the first iteration @cgrp's own nr_populated_csets is adjusted.  On
 * each later iteration the ancestor's threaded or domain populated-child
 * counter is adjusted, depending on whether the child just processed is
 * threaded.  The walk stops at the first cgroup whose
 * cgroup_is_populated() state does not flip.  Every cgroup whose state
 * did flip gets the cgroup1 release check, a notify_populated trace event
 * and a notification on its events file.  The caller must hold
 * css_set_lock.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			/* first iteration: the cgroup itself */
			cgrp->nr_populated_csets += adj;
		} else {
			/* ancestor: account the child that just flipped */
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		/* no state change here means none further up either */
		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
				  cgroup_is_populated(cgrp));
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}
819
820
821
822
823
824
825
826
827
828static void css_set_update_populated(struct css_set *cset, bool populated)
829{
830 struct cgrp_cset_link *link;
831
832 lockdep_assert_held(&css_set_lock);
833
834 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
835 cgroup_update_populated(link->cgrp, populated);
836}
837
838
839
840
841
842
843
844static void css_set_skip_task_iters(struct css_set *cset,
845 struct task_struct *task)
846{
847 struct css_task_iter *it, *pos;
848
849 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
850 css_task_iter_skip(it, task);
851}
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: link @task on @to_cset->mg_tasks instead of ->tasks
 *
 * With a NULL @from_cset the task is expected not to be on any cg_list
 * yet.  With a NULL @to_cset the task is only removed.  Populated state
 * of both css_sets is kept up to date.  The caller must hold
 * css_set_lock.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	/* the destination becomes populated before the task is unlinked */
	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		/* let in-flight iterators step past @task before unlinking */
		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/* an exiting task is not expected to be linked to a css_set */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}
901
902
903
904
905
906
/*
 * Hash table of css_sets with 2^CSS_SET_HASH_BITS buckets, keyed by
 * css_set_hash() of the ->subsys[] array.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
909
910static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
911{
912 unsigned long key = 0UL;
913 struct cgroup_subsys *ss;
914 int i;
915
916 for_each_subsys(ss, i)
917 key += (unsigned long)css[i];
918 key = (key >> 16) ^ key;
919
920 return key;
921}
922
/**
 * put_css_set_locked - drop a css_set reference, destroying it on the last put
 * @cset: css_set to release
 *
 * Nothing more happens unless this was the last reference.  On the last
 * put the css_set is unlinked from the per-subsystem e_cset lists and the
 * hash table, its css references and cgroup links are released, and it
 * is freed with kfree_rcu().  A threaded css_set additionally drops its
 * reference on its domain css_set (recursive call).  The caller must
 * hold css_set_lock.
 */
void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	/* no threaded css_set should still point at a dying domain cset */
	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	/* drop the per-subsystem linkage and css references */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		/* mirrors link_css_set(): only non-root cgroups were pinned */
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}
959
960
961
962
963
964
965
966
967
968
969
/**
 * compare_css_sets - check whether an existing css_set fits a new task group
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if @cset matches @old_cset except for the hierarchy that
 * @new_cgrp belongs to, where it must be attached to @new_cgrp, and if
 * its ->subsys[] equals @template.
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/* cheap rejection first: the css pointers must match exactly */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Work out the default-hierarchy cgroup the result would have and
	 * require its domain cgroup to match the candidate's domain cset.
	 */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Walk both cgrp_links lists in lockstep.  The BUG_ON()s below
	 * encode the expectation that both lists have the same length and
	 * list the hierarchies in the same order.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* both lists must end together */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* locate the cgroups associated with these links */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* hierarchies should be linked in the same order */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * In the hierarchy being changed the candidate must sit in
		 * @new_cgrp; in every other hierarchy it must sit in the
		 * same cgroup as @old_cset.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}
1042
1043
1044
1045
1046
1047
1048
1049static struct css_set *find_existing_css_set(struct css_set *old_cset,
1050 struct cgroup *cgrp,
1051 struct cgroup_subsys_state *template[])
1052{
1053 struct cgroup_root *root = cgrp->root;
1054 struct cgroup_subsys *ss;
1055 struct css_set *cset;
1056 unsigned long key;
1057 int i;
1058
1059
1060
1061
1062
1063
1064 for_each_subsys(ss, i) {
1065 if (root->subsys_mask & (1UL << i)) {
1066
1067
1068
1069
1070 template[i] = cgroup_e_css_by_mask(cgrp, ss);
1071 } else {
1072
1073
1074
1075
1076 template[i] = old_cset->subsys[i];
1077 }
1078 }
1079
1080 key = css_set_hash(template);
1081 hash_for_each_possible(css_set_table, cset, hlist, key) {
1082 if (!compare_css_sets(cset, old_cset, cgrp, template))
1083 continue;
1084
1085
1086 return cset;
1087 }
1088
1089
1090 return NULL;
1091}
1092
1093static void free_cgrp_cset_links(struct list_head *links_to_free)
1094{
1095 struct cgrp_cset_link *link, *tmp_link;
1096
1097 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
1098 list_del(&link->cset_link);
1099 kfree(link);
1100 }
1101}
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1112{
1113 struct cgrp_cset_link *link;
1114 int i;
1115
1116 INIT_LIST_HEAD(tmp_links);
1117
1118 for (i = 0; i < count; i++) {
1119 link = kzalloc(sizeof(*link), GFP_KERNEL);
1120 if (!link) {
1121 free_cgrp_cset_links(tmp_links);
1122 return -ENOMEM;
1123 }
1124 list_add(&link->cset_link, tmp_links);
1125 }
1126 return 0;
1127}
1128
1129
1130
1131
1132
1133
1134
1135static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1136 struct cgroup *cgrp)
1137{
1138 struct cgrp_cset_link *link;
1139
1140 BUG_ON(list_empty(tmp_links));
1141
1142 if (cgroup_on_dfl(cgrp))
1143 cset->dfl_cgrp = cgrp;
1144
1145 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1146 link->cset = cset;
1147 link->cgrp = cgrp;
1148
1149
1150
1151
1152
1153 list_move_tail(&link->cset_link, &cgrp->cset_links);
1154 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1155
1156 if (cgroup_parent(cgrp))
1157 cgroup_get_live(cgrp);
1158}
1159
1160
1161
1162
1163
1164
1165
1166
1167
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Returns, with a reference held, a css_set that equals @old_cset except
 * that its membership in @cgrp's hierarchy is @cgrp.  An existing
 * matching css_set is reused when there is one; otherwise a new one is
 * allocated, linked and hashed.  Returns NULL on allocation failure.
 * The caller must hold cgroup_mutex.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* first see if we already have a css_set that matches */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* one link per hierarchy root is needed */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	/* the template was filled in by find_existing_css_set() above */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/* add a link for each cgroup of @old_cset, substituting @cgrp in its hierarchy */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	/* every preallocated link must have been consumed */
	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	/* link on each css's cgroup e_csets list and pin the css */
	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * If @cset lands in a threaded cgroup, find (or create, via this
	 * recursive call) the css_set of its domain cgroup and attach @cset
	 * to it.  On failure the half-built @cset is released.
	 */
	if (cgroup_is_threaded(cset->dfl_cgrp)) {
		struct css_set *dcset;

		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
		if (!dcset) {
			put_css_set(cset);
			return NULL;
		}

		spin_lock_irq(&css_set_lock);
		cset->dom_cset = dcset;
		list_add_tail(&cset->threaded_csets_node,
			      &dcset->threaded_csets);
		spin_unlock_irq(&css_set_lock);
	}

	return cset;
}
1270
1271struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1272{
1273 struct cgroup *root_cgrp = kf_root->kn->priv;
1274
1275 return root_cgrp->root;
1276}
1277
1278static int cgroup_init_root_id(struct cgroup_root *root)
1279{
1280 int id;
1281
1282 lockdep_assert_held(&cgroup_mutex);
1283
1284 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1285 if (id < 0)
1286 return id;
1287
1288 root->hierarchy_id = id;
1289 return 0;
1290}
1291
1292static void cgroup_exit_root_id(struct cgroup_root *root)
1293{
1294 lockdep_assert_held(&cgroup_mutex);
1295
1296 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1297}
1298
/* Free the memory of @root with kfree(); nothing else is torn down here. */
void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}
1303
/*
 * Tear down a hierarchy root.
 *
 * Takes cgroup_mutex via cgroup_lock_and_drain_offline() and releases it
 * before the final rstat, kernfs and memory teardown.  The root must have
 * no cgroups left (enforced with BUG_ON).
 */
static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from css_sets to this hierarchy's root
	 * cgroup.
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	/* a root that never made it onto the roots list is not counted */
	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	cgroup_rstat_exit(cgrp);
	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}
1346
1347
1348
1349
1350
1351static struct cgroup *
1352current_cgns_cgroup_from_root(struct cgroup_root *root)
1353{
1354 struct cgroup *res = NULL;
1355 struct css_set *cset;
1356
1357 lockdep_assert_held(&css_set_lock);
1358
1359 rcu_read_lock();
1360
1361 cset = current->nsproxy->cgroup_ns->root_cset;
1362 if (cset == &init_css_set) {
1363 res = &root->cgrp;
1364 } else if (root == &cgrp_dfl_root) {
1365 res = cset->dfl_cgrp;
1366 } else {
1367 struct cgrp_cset_link *link;
1368
1369 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1370 struct cgroup *c = link->cgrp;
1371
1372 if (c->root == root) {
1373 res = c;
1374 break;
1375 }
1376 }
1377 }
1378 rcu_read_unlock();
1379
1380 BUG_ON(!res);
1381 return res;
1382}
1383
1384
1385static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1386 struct cgroup_root *root)
1387{
1388 struct cgroup *res = NULL;
1389
1390 lockdep_assert_held(&cgroup_mutex);
1391 lockdep_assert_held(&css_set_lock);
1392
1393 if (cset == &init_css_set) {
1394 res = &root->cgrp;
1395 } else if (root == &cgrp_dfl_root) {
1396 res = cset->dfl_cgrp;
1397 } else {
1398 struct cgrp_cset_link *link;
1399
1400 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1401 struct cgroup *c = link->cgrp;
1402
1403 if (c->root == root) {
1404 res = c;
1405 break;
1406 }
1407 }
1408 }
1409
1410 BUG_ON(!res);
1411 return res;
1412}
1413
1414
1415
1416
1417
/*
 * Return the cgroup of @task in hierarchy @root by resolving the task's
 * css_set with cset_cgroup_from_root(), which asserts cgroup_mutex and
 * css_set_lock.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	struct css_set *cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
/* Forward declaration; the kernfs syscall ops table is defined later in this file. */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1455
1456static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1457 char *buf)
1458{
1459 struct cgroup_subsys *ss = cft->ss;
1460
1461 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1462 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1463 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1464
1465 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1466 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1467 cft->name);
1468 } else {
1469 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1470 }
1471 return buf;
1472}
1473
1474
1475
1476
1477
1478
1479
1480static umode_t cgroup_file_mode(const struct cftype *cft)
1481{
1482 umode_t mode = 0;
1483
1484 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1485 mode |= S_IRUGO;
1486
1487 if (cft->write_u64 || cft->write_s64 || cft->write) {
1488 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1489 mode |= S_IWUGO;
1490 else
1491 mode |= S_IWUSR;
1492 }
1493
1494 return mode;
1495}
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1510{
1511 u16 cur_ss_mask = subtree_control;
1512 struct cgroup_subsys *ss;
1513 int ssid;
1514
1515 lockdep_assert_held(&cgroup_mutex);
1516
1517 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1518
1519 while (true) {
1520 u16 new_ss_mask = cur_ss_mask;
1521
1522 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1523 new_ss_mask |= ss->depends_on;
1524 } while_each_subsys_mask();
1525
1526
1527
1528
1529
1530
1531 new_ss_mask &= this_ss_mask;
1532
1533 if (new_ss_mask == cur_ss_mask)
1534 break;
1535 cur_ss_mask = new_ss_mask;
1536 }
1537
1538 return cur_ss_mask;
1539}
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551void cgroup_kn_unlock(struct kernfs_node *kn)
1552{
1553 struct cgroup *cgrp;
1554
1555 if (kernfs_type(kn) == KERNFS_DIR)
1556 cgrp = kn->priv;
1557 else
1558 cgrp = kn->parent->priv;
1559
1560 mutex_unlock(&cgroup_mutex);
1561
1562 kernfs_unbreak_active_protection(kn);
1563 cgroup_put(cgrp);
1564}
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1584{
1585 struct cgroup *cgrp;
1586
1587 if (kernfs_type(kn) == KERNFS_DIR)
1588 cgrp = kn->priv;
1589 else
1590 cgrp = kn->parent->priv;
1591
1592
1593
1594
1595
1596
1597
1598 if (!cgroup_tryget(cgrp))
1599 return NULL;
1600 kernfs_break_active_protection(kn);
1601
1602 if (drain_offline)
1603 cgroup_lock_and_drain_offline(cgrp);
1604 else
1605 mutex_lock(&cgroup_mutex);
1606
1607 if (!cgroup_is_dead(cgrp))
1608 return cgrp;
1609
1610 cgroup_kn_unlock(kn);
1611 return NULL;
1612}
1613
1614static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1615{
1616 char name[CGROUP_FILE_NAME_MAX];
1617
1618 lockdep_assert_held(&cgroup_mutex);
1619
1620 if (cft->file_offset) {
1621 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1622 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1623
1624 spin_lock_irq(&cgroup_file_kn_lock);
1625 cfile->kn = NULL;
1626 spin_unlock_irq(&cgroup_file_kn_lock);
1627
1628 del_timer_sync(&cfile->notify_timer);
1629 }
1630
1631 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1632}
1633
1634
1635
1636
1637
1638static void css_clear_dir(struct cgroup_subsys_state *css)
1639{
1640 struct cgroup *cgrp = css->cgroup;
1641 struct cftype *cfts;
1642
1643 if (!(css->flags & CSS_VISIBLE))
1644 return;
1645
1646 css->flags &= ~CSS_VISIBLE;
1647
1648 if (!css->ss) {
1649 if (cgroup_on_dfl(cgrp))
1650 cfts = cgroup_base_files;
1651 else
1652 cfts = cgroup1_base_files;
1653
1654 cgroup_addrm_files(css, cgrp, cfts, false);
1655 } else {
1656 list_for_each_entry(cfts, &css->ss->cfts, node)
1657 cgroup_addrm_files(css, cgrp, cfts, false);
1658 }
1659}
1660
1661
1662
1663
1664
1665
1666
1667static int css_populate_dir(struct cgroup_subsys_state *css)
1668{
1669 struct cgroup *cgrp = css->cgroup;
1670 struct cftype *cfts, *failed_cfts;
1671 int ret;
1672
1673 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1674 return 0;
1675
1676 if (!css->ss) {
1677 if (cgroup_on_dfl(cgrp))
1678 cfts = cgroup_base_files;
1679 else
1680 cfts = cgroup1_base_files;
1681
1682 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1683 if (ret < 0)
1684 return ret;
1685 } else {
1686 list_for_each_entry(cfts, &css->ss->cfts, node) {
1687 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1688 if (ret < 0) {
1689 failed_cfts = cfts;
1690 goto err;
1691 }
1692 }
1693 }
1694
1695 css->flags |= CSS_VISIBLE;
1696
1697 return 0;
1698err:
1699 list_for_each_entry(cfts, &css->ss->cfts, node) {
1700 if (cfts == failed_cfts)
1701 break;
1702 cgroup_addrm_files(css, cgrp, cfts, false);
1703 }
1704 return ret;
1705}
1706
/**
 * rebind_subsystems - move subsystems to another hierarchy root
 * @dst_root: root the subsystems are moved to
 * @ss_mask: bitmask of the subsystem ids to move
 *
 * A first pass validates every subsystem in @ss_mask and returns -EBUSY
 * without changing anything if one of them cannot be moved.  The second
 * pass moves them one by one; failures there are only warned about.
 * Returns 0 once the second pass has run.  The caller must hold
 * cgroup_mutex.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * A subsystem whose root css has children cannot be moved,
		 * unless it is flagged implicit_on_dfl.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* one end of the move must be the default hierarchy */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* rebind */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		/* every css_set's e_cset node for @ss now hangs off @dcgrp */
		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* subsystems with an rstat flush callback follow to @dcgrp's list */
		if (ss->css_rstat_flush) {
			list_del_rcu(&css->rstat_css_node);
			list_add_rcu(&css->rstat_css_node,
				     &dcgrp->rstat_css_list);
		}

		/*
		 * Enable on the destination.  Only a non-default destination
		 * also sets the bit in its root cgroup's subtree_control.
		 */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}
1782
1783int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1784 struct kernfs_root *kf_root)
1785{
1786 int len = 0;
1787 char *buf = NULL;
1788 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1789 struct cgroup *ns_cgroup;
1790
1791 buf = kmalloc(PATH_MAX, GFP_KERNEL);
1792 if (!buf)
1793 return -ENOMEM;
1794
1795 spin_lock_irq(&css_set_lock);
1796 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1797 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1798 spin_unlock_irq(&css_set_lock);
1799
1800 if (len >= PATH_MAX)
1801 len = -ERANGE;
1802 else if (len > 0) {
1803 seq_escape(sf, buf, " \t\n\\");
1804 len = 0;
1805 }
1806 kfree(buf);
1807 return len;
1808}
1809
/* Identifiers of the mount options understood by the cgroup2 filesystem. */
enum cgroup2_param {
	Opt_nsdelegate = 0,		/* "nsdelegate" */
	Opt_memory_localevents,		/* "memory_localevents" */
	Opt_memory_recursiveprot,	/* "memory_recursiveprot" */
	nr__cgroup2_params		/* number of options above */
};
1816
/*
 * Mount option table for cgroup2.  Every entry is a value-less flag option
 * mapped to its enum cgroup2_param identifier; the empty entry terminates
 * the table.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
	fsparam_flag("nsdelegate",		Opt_nsdelegate),
	fsparam_flag("memory_localevents",	Opt_memory_localevents),
	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
	{}
};
1823
1824static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1825{
1826 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1827 struct fs_parse_result result;
1828 int opt;
1829
1830 opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1831 if (opt < 0)
1832 return opt;
1833
1834 switch (opt) {
1835 case Opt_nsdelegate:
1836 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1837 return 0;
1838 case Opt_memory_localevents:
1839 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1840 return 0;
1841 case Opt_memory_recursiveprot:
1842 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1843 return 0;
1844 }
1845 return -EINVAL;
1846}
1847
1848static void apply_cgroup_root_flags(unsigned int root_flags)
1849{
1850 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1851 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1852 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1853 else
1854 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1855
1856 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1857 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1858 else
1859 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1860
1861 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1862 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1863 else
1864 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1865 }
1866}
1867
1868static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1869{
1870 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1871 seq_puts(seq, ",nsdelegate");
1872 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1873 seq_puts(seq, ",memory_localevents");
1874 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1875 seq_puts(seq, ",memory_recursiveprot");
1876 return 0;
1877}
1878
1879static int cgroup_reconfigure(struct fs_context *fc)
1880{
1881 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1882
1883 apply_cgroup_root_flags(ctx->flags);
1884 return 0;
1885}
1886
1887static void init_cgroup_housekeeping(struct cgroup *cgrp)
1888{
1889 struct cgroup_subsys *ss;
1890 int ssid;
1891
1892 INIT_LIST_HEAD(&cgrp->self.sibling);
1893 INIT_LIST_HEAD(&cgrp->self.children);
1894 INIT_LIST_HEAD(&cgrp->cset_links);
1895 INIT_LIST_HEAD(&cgrp->pidlists);
1896 mutex_init(&cgrp->pidlist_mutex);
1897 cgrp->self.cgroup = cgrp;
1898 cgrp->self.flags |= CSS_ONLINE;
1899 cgrp->dom_cgrp = cgrp;
1900 cgrp->max_descendants = INT_MAX;
1901 cgrp->max_depth = INT_MAX;
1902 INIT_LIST_HEAD(&cgrp->rstat_css_list);
1903 prev_cputime_init(&cgrp->prev_cputime);
1904
1905 for_each_subsys(ss, ssid)
1906 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1907
1908 init_waitqueue_head(&cgrp->offline_waitq);
1909 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1910}
1911
1912void init_cgroup_root(struct cgroup_fs_context *ctx)
1913{
1914 struct cgroup_root *root = ctx->root;
1915 struct cgroup *cgrp = &root->cgrp;
1916
1917 INIT_LIST_HEAD(&root->root_list);
1918 atomic_set(&root->nr_cgrps, 1);
1919 cgrp->root = root;
1920 init_cgroup_housekeeping(cgrp);
1921
1922 root->flags = ctx->flags;
1923 if (ctx->release_agent)
1924 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1925 if (ctx->name)
1926 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1927 if (ctx->cpuset_clone_children)
1928 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1929}
1930
/**
 * cgroup_setup_root - bring a cgroup hierarchy root into service
 * @root: root to set up
 * @ss_mask: subsystems to bind to @root
 *
 * Must be called with cgroup_mutex held.  Initialises the root cgroup's
 * refcount, allocates a hierarchy id and a kernfs root, populates the core
 * files, initialises rstat, binds the requested subsystems and finally
 * links the root cgroup into every existing css_set.
 *
 * Return: 0 on success or a negative errno.  On failure every step already
 * taken is unwound through the labels at the bottom.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      0, GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * Preallocate links for twice the current number of css_sets;
	 * css_set_count is read here without taking css_set_lock.
	 * NOTE(review): presumably the 2x headroom covers css_sets that the
	 * rebinding below may create, and the unlocked read is safe because
	 * cgroup_mutex is held - confirm.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	/* the default root and v1 roots use different kernfs syscall ops */
	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	/* created deactivated; rebind_subsystems() activates it */
	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED |
					   KERNFS_ROOT_SUPPORT_EXPORTOP |
					   KERNFS_ROOT_SUPPORT_USER_XATTR,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;
	WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
	root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = cgroup_rstat_init(root_cgrp);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto exit_stats;

	/* a failure here is only warned about; setup continues */
	ret = cgroup_bpf_inherit(root_cgrp);
	WARN_ON_ONCE(ret);

	trace_cgroup_setup_root(root);

	/*
	 * Nothing below this point can fail: the root is published on
	 * cgroup_roots and the function falls through to the success exit.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup into every existing css_set and mark it
	 * populated if any of those css_sets is populated.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	ret = 0;
	goto out;

exit_stats:
	cgroup_rstat_exit(root_cgrp);
destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	/* unused preallocated links are released on success and failure */
	free_cgrp_cset_links(&tmp_links);
	return ret;
}
2033
/**
 * cgroup_do_get_tree - obtain the root dentry for a cgroup mount
 * @fc: filesystem context; the cgroup context's ->root must already be set
 *
 * Picks the superblock magic from the filesystem type and lets kernfs
 * produce the tree.  When the mounting cgroup namespace is not
 * init_cgroup_ns, @fc->root is replaced by the dentry of the cgroup that
 * the namespace's root css_set maps to on this hierarchy.
 *
 * Return: 0 on success or a negative errno from kernfs_get_tree() or
 * kernfs_node_dentry().
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	ctx->kfc.root = ctx->root->kf_root;
	if (fc->fs_type == &cgroup2_fs_type)
		ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
	else
		ctx->kfc.magic = CGROUP_SUPER_MAGIC;
	ret = kernfs_get_tree(fc);

	/*
	 * In a non-init cgroup namespace, hand out the dentry of the
	 * namespace's root cgroup instead of the hierarchy root's dentry.
	 */
	if (!ret && ctx->ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct super_block *sb = fc->root->d_sb;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, sb);
		/* the dentry obtained from kernfs_get_tree() is dropped */
		dput(fc->root);
		if (IS_ERR(nsdentry)) {
			/* lookup failed: give the superblock back too */
			deactivate_locked_super(sb);
			ret = PTR_ERR(nsdentry);
			nsdentry = NULL;
		}
		fc->root = nsdentry;
	}

	/* drop a root cgroup reference unless a new superblock was created */
	if (!ctx->kfc.new_sb_created)
		cgroup_put(&ctx->root->cgrp);

	return ret;
}
2078
2079
2080
2081
2082static void cgroup_fs_context_free(struct fs_context *fc)
2083{
2084 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2085
2086 kfree(ctx->name);
2087 kfree(ctx->release_agent);
2088 put_cgroup_ns(ctx->ns);
2089 kernfs_free_fs_context(fc);
2090 kfree(ctx);
2091}
2092
2093static int cgroup_get_tree(struct fs_context *fc)
2094{
2095 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2096 int ret;
2097
2098 cgrp_dfl_visible = true;
2099 cgroup_get_live(&cgrp_dfl_root.cgrp);
2100 ctx->root = &cgrp_dfl_root;
2101
2102 ret = cgroup_do_get_tree(fc);
2103 if (!ret)
2104 apply_cgroup_root_flags(ctx->flags);
2105 return ret;
2106}
2107
2108static const struct fs_context_operations cgroup_fs_context_ops = {
2109 .free = cgroup_fs_context_free,
2110 .parse_param = cgroup2_parse_param,
2111 .get_tree = cgroup_get_tree,
2112 .reconfigure = cgroup_reconfigure,
2113};
2114
2115static const struct fs_context_operations cgroup1_fs_context_ops = {
2116 .free = cgroup_fs_context_free,
2117 .parse_param = cgroup1_parse_param,
2118 .get_tree = cgroup1_get_tree,
2119 .reconfigure = cgroup1_reconfigure,
2120};
2121
2122
2123
2124
2125
2126static int cgroup_init_fs_context(struct fs_context *fc)
2127{
2128 struct cgroup_fs_context *ctx;
2129
2130 ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2131 if (!ctx)
2132 return -ENOMEM;
2133
2134 ctx->ns = current->nsproxy->cgroup_ns;
2135 get_cgroup_ns(ctx->ns);
2136 fc->fs_private = &ctx->kfc;
2137 if (fc->fs_type == &cgroup2_fs_type)
2138 fc->ops = &cgroup_fs_context_ops;
2139 else
2140 fc->ops = &cgroup1_fs_context_ops;
2141 put_user_ns(fc->user_ns);
2142 fc->user_ns = get_user_ns(ctx->ns->user_ns);
2143 fc->global = true;
2144 return 0;
2145}
2146
2147static void cgroup_kill_sb(struct super_block *sb)
2148{
2149 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2150 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2151
2152
2153
2154
2155
2156
2157
2158
2159 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2160 !percpu_ref_is_dying(&root->cgrp.self.refcnt))
2161 percpu_ref_kill(&root->cgrp.self.refcnt);
2162 cgroup_put(&root->cgrp);
2163 kernfs_kill_sb(sb);
2164}
2165
2166struct file_system_type cgroup_fs_type = {
2167 .name = "cgroup",
2168 .init_fs_context = cgroup_init_fs_context,
2169 .parameters = cgroup1_fs_parameters,
2170 .kill_sb = cgroup_kill_sb,
2171 .fs_flags = FS_USERNS_MOUNT,
2172};
2173
2174static struct file_system_type cgroup2_fs_type = {
2175 .name = "cgroup2",
2176 .init_fs_context = cgroup_init_fs_context,
2177 .parameters = cgroup2_fs_parameters,
2178 .kill_sb = cgroup_kill_sb,
2179 .fs_flags = FS_USERNS_MOUNT,
2180};
2181
2182#ifdef CONFIG_CPUSETS
2183static const struct fs_context_operations cpuset_fs_context_ops = {
2184 .get_tree = cgroup1_get_tree,
2185 .free = cgroup_fs_context_free,
2186};
2187
2188
2189
2190
2191
2192
2193static int cpuset_init_fs_context(struct fs_context *fc)
2194{
2195 char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2196 struct cgroup_fs_context *ctx;
2197 int err;
2198
2199 err = cgroup_init_fs_context(fc);
2200 if (err) {
2201 kfree(agent);
2202 return err;
2203 }
2204
2205 fc->ops = &cpuset_fs_context_ops;
2206
2207 ctx = cgroup_fc2context(fc);
2208 ctx->subsys_mask = 1 << cpuset_cgrp_id;
2209 ctx->flags |= CGRP_ROOT_NOPREFIX;
2210 ctx->release_agent = agent;
2211
2212 get_filesystem(&cgroup_fs_type);
2213 put_filesystem(fc->fs_type);
2214 fc->fs_type = &cgroup_fs_type;
2215
2216 return 0;
2217}
2218
2219static struct file_system_type cpuset_fs_type = {
2220 .name = "cpuset",
2221 .init_fs_context = cpuset_init_fs_context,
2222 .fs_flags = FS_USERNS_MOUNT,
2223};
2224#endif
2225
2226int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2227 struct cgroup_namespace *ns)
2228{
2229 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2230
2231 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2232}
2233
2234int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2235 struct cgroup_namespace *ns)
2236{
2237 int ret;
2238
2239 mutex_lock(&cgroup_mutex);
2240 spin_lock_irq(&css_set_lock);
2241
2242 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2243
2244 spin_unlock_irq(&css_set_lock);
2245 mutex_unlock(&cgroup_mutex);
2246
2247 return ret;
2248}
2249EXPORT_SYMBOL_GPL(cgroup_path_ns);
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2265{
2266 struct cgroup_root *root;
2267 struct cgroup *cgrp;
2268 int hierarchy_id = 1;
2269 int ret;
2270
2271 mutex_lock(&cgroup_mutex);
2272 spin_lock_irq(&css_set_lock);
2273
2274 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2275
2276 if (root) {
2277 cgrp = task_cgroup_from_root(task, root);
2278 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2279 } else {
2280
2281 ret = strlcpy(buf, "/", buflen);
2282 }
2283
2284 spin_unlock_irq(&css_set_lock);
2285 mutex_unlock(&cgroup_mutex);
2286 return ret;
2287}
2288EXPORT_SYMBOL_GPL(task_cgroup_path);
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300static void cgroup_migrate_add_task(struct task_struct *task,
2301 struct cgroup_mgctx *mgctx)
2302{
2303 struct css_set *cset;
2304
2305 lockdep_assert_held(&css_set_lock);
2306
2307
2308 if (task->flags & PF_EXITING)
2309 return;
2310
2311
2312 WARN_ON_ONCE(list_empty(&task->cg_list));
2313
2314 cset = task_css_set(task);
2315 if (!cset->mg_src_cgrp)
2316 return;
2317
2318 mgctx->tset.nr_tasks++;
2319
2320 list_move_tail(&task->cg_list, &cset->mg_tasks);
2321 if (list_empty(&cset->mg_node))
2322 list_add_tail(&cset->mg_node,
2323 &mgctx->tset.src_csets);
2324 if (list_empty(&cset->mg_dst_cset->mg_node))
2325 list_add_tail(&cset->mg_dst_cset->mg_node,
2326 &mgctx->tset.dst_csets);
2327}
2328
2329
2330
2331
2332
2333
2334
2335
2336struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2337 struct cgroup_subsys_state **dst_cssp)
2338{
2339 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2340 tset->cur_task = NULL;
2341
2342 return cgroup_taskset_next(tset, dst_cssp);
2343}
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2354 struct cgroup_subsys_state **dst_cssp)
2355{
2356 struct css_set *cset = tset->cur_cset;
2357 struct task_struct *task = tset->cur_task;
2358
2359 while (&cset->mg_node != tset->csets) {
2360 if (!task)
2361 task = list_first_entry(&cset->mg_tasks,
2362 struct task_struct, cg_list);
2363 else
2364 task = list_next_entry(task, cg_list);
2365
2366 if (&task->cg_list != &cset->mg_tasks) {
2367 tset->cur_cset = cset;
2368 tset->cur_task = task;
2369
2370
2371
2372
2373
2374
2375
2376 if (cset->mg_dst_cset)
2377 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2378 else
2379 *dst_cssp = cset->subsys[tset->ssid];
2380
2381 return task;
2382 }
2383
2384 cset = list_next_entry(cset, mg_node);
2385 task = NULL;
2386 }
2387
2388 return NULL;
2389}
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
/**
 * cgroup_migrate_execute - migrate the tasks collected in a taskset
 * @mgctx: migration context
 *
 * Asks every affected subsystem for permission through ->can_attach(),
 * moves each queued task to its destination css_set, then notifies the
 * subsystems through ->attach().  If a ->can_attach() fails, the
 * subsystems that already agreed get ->cancel_attach() and no task is
 * moved.  The taskset is reset before returning in either case.
 *
 * Return: 0 on success or the error returned by a ->can_attach() callback.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* ask each affected subsystem whether the attach is acceptable */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
				if (ret) {
					failed_ssid = ssid;
					goto out_cancel_attach;
				}
			}
		} while_each_subsys_mask();
	}

	/*
	 * Commit point: nothing below jumps to the cancel path.  Move
	 * every queued task from its current css_set to the destination.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			from_cset->nr_tasks--;
			/*
			 * Let the freezer react to the change of default
			 * hierarchy cgroup.
			 * NOTE(review): presumably the task's frozen state
			 * may need adjusting - confirm.
			 */
			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
						    to_cset->dfl_cgrp);
			put_css_set_locked(from_cset);

		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * The tasks now sit on the destination css_sets; point the taskset
	 * iterators at dst_csets before calling ->attach().
	 */
	tset->csets = &tset->dst_csets;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->attach) {
				tset->ssid = ssid;
				ss->attach(tset);
			}
		} while_each_subsys_mask();
	}

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	/* undo only the subsystems that precede the one that refused */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ssid == failed_ssid)
				break;
			if (ss->cancel_attach) {
				tset->ssid = ssid;
				ss->cancel_attach(tset);
			}
		} while_each_subsys_mask();
	}
out_release_tset:
	/* return every queued task to its css_set's regular task list */
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Reset the taskset.
	 * NOTE(review): presumably so that @mgctx can be reused for another
	 * add_task/execute round - confirm.
	 */
	tset->nr_tasks = 0;
	tset->csets    = &tset->src_csets;
	return ret;
}
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2508{
2509
2510 if (!cgroup_on_dfl(dst_cgrp))
2511 return 0;
2512
2513
2514 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2515 return -EOPNOTSUPP;
2516
2517
2518 if (cgroup_is_mixable(dst_cgrp))
2519 return 0;
2520
2521
2522
2523
2524
2525 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2526 return 0;
2527
2528
2529 if (dst_cgrp->subtree_control)
2530 return -EBUSY;
2531
2532 return 0;
2533}
2534
2535
2536
2537
2538
2539
2540
2541
2542void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2543{
2544 LIST_HEAD(preloaded);
2545 struct css_set *cset, *tmp_cset;
2546
2547 lockdep_assert_held(&cgroup_mutex);
2548
2549 spin_lock_irq(&css_set_lock);
2550
2551 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2552 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2553
2554 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2555 cset->mg_src_cgrp = NULL;
2556 cset->mg_dst_cgrp = NULL;
2557 cset->mg_dst_cset = NULL;
2558 list_del_init(&cset->mg_preload_node);
2559 put_css_set_locked(cset);
2560 }
2561
2562 spin_unlock_irq(&css_set_lock);
2563}
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581void cgroup_migrate_add_src(struct css_set *src_cset,
2582 struct cgroup *dst_cgrp,
2583 struct cgroup_mgctx *mgctx)
2584{
2585 struct cgroup *src_cgrp;
2586
2587 lockdep_assert_held(&cgroup_mutex);
2588 lockdep_assert_held(&css_set_lock);
2589
2590
2591
2592
2593
2594
2595 if (src_cset->dead)
2596 return;
2597
2598 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2599
2600 if (!list_empty(&src_cset->mg_preload_node))
2601 return;
2602
2603 WARN_ON(src_cset->mg_src_cgrp);
2604 WARN_ON(src_cset->mg_dst_cgrp);
2605 WARN_ON(!list_empty(&src_cset->mg_tasks));
2606 WARN_ON(!list_empty(&src_cset->mg_node));
2607
2608 src_cset->mg_src_cgrp = src_cgrp;
2609 src_cset->mg_dst_cgrp = dst_cgrp;
2610 get_css_set(src_cset);
2611 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2612}
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
/**
 * cgroup_migrate_prepare_dst - look up the destination css_sets
 * @mgctx: migration context
 *
 * Must be called with cgroup_mutex held.  For every css_set on
 * @mgctx->preloaded_src_csets, find_css_set() is called with the source and
 * its ->mg_dst_cgrp; the result is stored in ->mg_dst_cset and queued on
 * @mgctx->preloaded_dst_csets.  Subsystems whose css differs between source
 * and destination are accumulated in @mgctx->ss_mask.
 *
 * Return: 0 on success, -ENOMEM if find_css_set() returns NULL.  Nothing is
 * cleaned up here on failure; the callers in this file invoke
 * cgroup_migrate_finish() afterwards regardless of the result.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/* resolve a destination for every preloaded source */
	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				 mg_preload_node) {
		struct css_set *dst_cset;
		struct cgroup_subsys *ss;
		int ssid;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			return -ENOMEM;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * Source and destination are the same css_set, so there is
		 * nothing to migrate: unqueue the source and drop both the
		 * preload reference and the one just obtained.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		/*
		 * Queue the destination once.  If it is already on a preload
		 * list, the reference just obtained is dropped again.
		 * NOTE(review): presumably the list entry already owns a
		 * reference - confirm.
		 */
		if (list_empty(&dst_cset->mg_preload_node))
			list_add_tail(&dst_cset->mg_preload_node,
				      &mgctx->preloaded_dst_csets);
		else
			put_css_set(dst_cset);

		/* record which subsystems see a different css */
		for_each_subsys(ss, ssid)
			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				mgctx->ss_mask |= 1 << ssid;
	}

	return 0;
}
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2696 struct cgroup_mgctx *mgctx)
2697{
2698 struct task_struct *task;
2699
2700
2701
2702
2703
2704
2705 spin_lock_irq(&css_set_lock);
2706 rcu_read_lock();
2707 task = leader;
2708 do {
2709 cgroup_migrate_add_task(task, mgctx);
2710 if (!threadgroup)
2711 break;
2712 } while_each_thread(leader, task);
2713 rcu_read_unlock();
2714 spin_unlock_irq(&css_set_lock);
2715
2716 return cgroup_migrate_execute(mgctx);
2717}
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2728 bool threadgroup)
2729{
2730 DEFINE_CGROUP_MGCTX(mgctx);
2731 struct task_struct *task;
2732 int ret = 0;
2733
2734
2735 spin_lock_irq(&css_set_lock);
2736 rcu_read_lock();
2737 task = leader;
2738 do {
2739 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2740 if (!threadgroup)
2741 break;
2742 } while_each_thread(leader, task);
2743 rcu_read_unlock();
2744 spin_unlock_irq(&css_set_lock);
2745
2746
2747 ret = cgroup_migrate_prepare_dst(&mgctx);
2748 if (!ret)
2749 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2750
2751 cgroup_migrate_finish(&mgctx);
2752
2753 if (!ret)
2754 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2755
2756 return ret;
2757}
2758
/**
 * cgroup_procs_write_start - resolve the target of a procs/threads write
 * @buf: user-supplied string holding a PID; 0 selects the current task
 * @threadgroup: whether the whole thread group is targeted
 * @locked: output; set to true if cgroup_threadgroup_rwsem was write-locked
 *
 * Must be called with cgroup_mutex held.  On success a reference on the
 * returned task is held and, if *@locked is true, cgroup_threadgroup_rwsem
 * is held for writing; both are released by cgroup_procs_write_finish().
 *
 * Return: the target task, or ERR_PTR(-EINVAL) for a malformed or negative
 * PID or a task that may not be migrated, or ERR_PTR(-ESRCH) if no task has
 * that PID.  On error the rwsem is not held and *@locked is false.
 */
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
					     bool *locked)
	__acquires(&cgroup_threadgroup_rwsem)
{
	struct task_struct *tsk;
	pid_t pid;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return ERR_PTR(-EINVAL);

	/*
	 * The global rwsem is skipped only when a single thread is being
	 * migrated and that thread is current (pid == 0 && !threadgroup).
	 * NOTE(review): presumably safe because current cannot exit or
	 * exec underneath us and other writers are excluded by
	 * cgroup_mutex - confirm.
	 */
	lockdep_assert_held(&cgroup_mutex);
	if (pid || threadgroup) {
		percpu_down_write(&cgroup_threadgroup_rwsem);
		*locked = true;
	} else {
		*locked = false;
	}

	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			tsk = ERR_PTR(-ESRCH);
			goto out_unlock_threadgroup;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * Tasks flagged no_cgroup_migration or PF_NO_SETAFFINITY are
	 * refused.
	 * NOTE(review): presumably these are kthreads that must not be
	 * moved out of the root cgroup - confirm.
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		tsk = ERR_PTR(-EINVAL);
		goto out_unlock_threadgroup;
	}

	get_task_struct(tsk);
	goto out_unlock_rcu;

out_unlock_threadgroup:
	/* error: give the rwsem back if it was taken above */
	if (*locked) {
		percpu_up_write(&cgroup_threadgroup_rwsem);
		*locked = false;
	}
out_unlock_rcu:
	rcu_read_unlock();
	return tsk;
}
2822
2823void cgroup_procs_write_finish(struct task_struct *task, bool locked)
2824 __releases(&cgroup_threadgroup_rwsem)
2825{
2826 struct cgroup_subsys *ss;
2827 int ssid;
2828
2829
2830 put_task_struct(task);
2831
2832 if (locked)
2833 percpu_up_write(&cgroup_threadgroup_rwsem);
2834 for_each_subsys(ss, ssid)
2835 if (ss->post_attach)
2836 ss->post_attach();
2837}
2838
2839static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2840{
2841 struct cgroup_subsys *ss;
2842 bool printed = false;
2843 int ssid;
2844
2845 do_each_subsys_mask(ss, ssid, ss_mask) {
2846 if (printed)
2847 seq_putc(seq, ' ');
2848 seq_puts(seq, ss->name);
2849 printed = true;
2850 } while_each_subsys_mask();
2851 if (printed)
2852 seq_putc(seq, '\n');
2853}
2854
2855
2856static int cgroup_controllers_show(struct seq_file *seq, void *v)
2857{
2858 struct cgroup *cgrp = seq_css(seq)->cgroup;
2859
2860 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2861 return 0;
2862}
2863
2864
2865static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2866{
2867 struct cgroup *cgrp = seq_css(seq)->cgroup;
2868
2869 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2870 return 0;
2871}
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
/**
 * cgroup_update_dfl_csses - migrate the tasks of a subtree onto new css_sets
 * @cgrp: root of the subtree to update
 *
 * Must be called with cgroup_mutex held.  With cgroup_threadgroup_rwsem
 * write-locked, every css_set linked to a live descendant of @cgrp is
 * registered as a migration source with that same descendant as the
 * destination, and all of their tasks are then migrated.
 *
 * Return: 0 on success or the error from cgroup_migrate_prepare_dst() or
 * cgroup_migrate_execute().
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* register every css_set in the subtree as a migration source */
	spin_lock_irq(&css_set_lock);
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &dsct->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	/* resolve the destination css_sets; may fail with -ENOMEM */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_finish;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
		struct task_struct *task, *ntask;

		/* queue every task of the source; adding moves it off ->tasks */
		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
			cgroup_migrate_add_task(task, &mgctx);
	}
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_execute(&mgctx);
out_finish:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}
2925
2926
2927
2928
2929
2930
2931
2932
2933
/**
 * cgroup_lock_and_drain_offline - take cgroup_mutex and wait out dying csses
 * @cgrp: root of the subtree to inspect
 *
 * Takes cgroup_mutex and scans the live descendants of @cgrp.  If any of
 * their csses has a dying refcount, the mutex is dropped, the caller sleeps
 * on that cgroup's offline_waitq and the scan restarts from the top.
 *
 * Returns with cgroup_mutex held once a full scan finds no dying css.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
	__acquires(&cgroup_mutex)
{
	struct cgroup *dsct;
	struct cgroup_subsys_state *d_css;
	struct cgroup_subsys *ss;
	int ssid;

restart:
	mutex_lock(&cgroup_mutex);

	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
		for_each_subsys(ss, ssid) {
			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
			DEFINE_WAIT(wait);

			if (!css || !percpu_ref_is_dying(&css->refcnt))
				continue;

			/* pin @dsct so it survives while the mutex is dropped */
			cgroup_get_live(dsct);
			/* queue on the waitqueue before releasing the mutex */
			prepare_to_wait(&dsct->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&cgroup_mutex);
			schedule();
			finish_wait(&dsct->offline_waitq, &wait);

			cgroup_put(dsct);
			/* the tree may have changed; rescan from scratch */
			goto restart;
		}
	}
}
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975static void cgroup_save_control(struct cgroup *cgrp)
2976{
2977 struct cgroup *dsct;
2978 struct cgroup_subsys_state *d_css;
2979
2980 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2981 dsct->old_subtree_control = dsct->subtree_control;
2982 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
2983 dsct->old_dom_cgrp = dsct->dom_cgrp;
2984 }
2985}
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995static void cgroup_propagate_control(struct cgroup *cgrp)
2996{
2997 struct cgroup *dsct;
2998 struct cgroup_subsys_state *d_css;
2999
3000 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3001 dsct->subtree_control &= cgroup_control(dsct);
3002 dsct->subtree_ss_mask =
3003 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
3004 cgroup_ss_mask(dsct));
3005 }
3006}
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016static void cgroup_restore_control(struct cgroup *cgrp)
3017{
3018 struct cgroup *dsct;
3019 struct cgroup_subsys_state *d_css;
3020
3021 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3022 dsct->subtree_control = dsct->old_subtree_control;
3023 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3024 dsct->dom_cgrp = dsct->old_dom_cgrp;
3025 }
3026}
3027
3028static bool css_visible(struct cgroup_subsys_state *css)
3029{
3030 struct cgroup_subsys *ss = css->ss;
3031 struct cgroup *cgrp = css->cgroup;
3032
3033 if (cgroup_control(cgrp) & (1 << ss->id))
3034 return true;
3035 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3036 return false;
3037 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3038}
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053static int cgroup_apply_control_enable(struct cgroup *cgrp)
3054{
3055 struct cgroup *dsct;
3056 struct cgroup_subsys_state *d_css;
3057 struct cgroup_subsys *ss;
3058 int ssid, ret;
3059
3060 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3061 for_each_subsys(ss, ssid) {
3062 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3063
3064 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3065 continue;
3066
3067 if (!css) {
3068 css = css_create(dsct, ss);
3069 if (IS_ERR(css))
3070 return PTR_ERR(css);
3071 }
3072
3073 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3074
3075 if (css_visible(css)) {
3076 ret = css_populate_dir(css);
3077 if (ret)
3078 return ret;
3079 }
3080 }
3081 }
3082
3083 return 0;
3084}
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099static void cgroup_apply_control_disable(struct cgroup *cgrp)
3100{
3101 struct cgroup *dsct;
3102 struct cgroup_subsys_state *d_css;
3103 struct cgroup_subsys *ss;
3104 int ssid;
3105
3106 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3107 for_each_subsys(ss, ssid) {
3108 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3109
3110 if (!css)
3111 continue;
3112
3113 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3114
3115 if (css->parent &&
3116 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3117 kill_css(css);
3118 } else if (!css_visible(css)) {
3119 css_clear_dir(css);
3120 if (ss->css_reset)
3121 ss->css_reset(css);
3122 }
3123 }
3124 }
3125}
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
/**
 * cgroup_apply_control - apply the enabling half of a control mask change
 * @cgrp: root of the subtree
 *
 * Propagates the control masks through the subtree, creates and shows the
 * csses that are now required, and migrates the tasks in the subtree onto
 * matching css_sets.  Csses that became unnecessary are not removed here;
 * cgroup_finalize_control() does that.
 *
 * Return: 0 on success or a negative errno from either step.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	cgroup_propagate_control(cgrp);

	err = cgroup_apply_control_enable(cgrp);
	if (err)
		return err;

	/* move the tasks in the subtree over to the updated csses */
	return cgroup_update_dfl_csses(cgrp);
}
3165
3166
3167
3168
3169
3170
3171
3172
/*
 * Finish a control-mask update on @cgrp.
 *
 * @ret is the result of the preceding apply step.  When it is non-zero the
 * saved control state is restored and propagated again.  In either case
 * cgroup_apply_control_disable() then runs on the subtree.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret != 0) {
		/* the update failed: roll the masks back first */
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}
3182
3183static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3184{
3185 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3186
3187
3188 if (!enable)
3189 return 0;
3190
3191
3192 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3193 return -EOPNOTSUPP;
3194
3195
3196 if (cgroup_is_mixable(cgrp))
3197 return 0;
3198
3199 if (domain_enable) {
3200
3201 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3202 return -EOPNOTSUPP;
3203 } else {
3204
3205
3206
3207
3208
3209 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3210 return 0;
3211 }
3212
3213
3214
3215
3216
3217 if (cgroup_has_tasks(cgrp))
3218 return -EBUSY;
3219
3220 return 0;
3221}
3222
3223
/*
 * Write handler that changes the set of controllers enabled in a cgroup's
 * subtree_control.
 *
 * @buf holds space-separated tokens of the form "+<name>" or "-<name>".
 *
 * Returns @nbytes on success (including when nothing needs to change), or a
 * negative errno:
 *   -EINVAL  malformed or unknown token
 *   -ENODEV  cgroup_kn_lock_live() failed
 *   -ENOENT  enabling a controller not present in cgroup_control(cgrp)
 *   -EBUSY   disabling a controller still enabled in a live child
 *   or whatever the vetting / apply steps return.
 */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	u16 enable = 0, disable = 0;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
	int ssid, ret;

	/*
	 * Parse the input into @enable / @disable bitmasks.  A later token
	 * for the same controller overrides an earlier one.
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
		/* consecutive spaces yield empty tokens; ignore them */
		if (tok[0] == '\0')
			continue;
		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
			/* tok + 1 skips the leading '+' / '-' sign */
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
			} else if (*tok == '-') {
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
			} else {
				return -EINVAL;
			}
			break;
		} while_each_subsys_mask();
		/*
		 * No subsystem matched this token.  NOTE(review): relies on
		 * the iteration macro leaving ssid at CGROUP_SUBSYS_COUNT when
		 * it runs to completion - confirm against its definition.
		 */
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

	cgrp = cgroup_kn_lock_live(of->kn, true);
	if (!cgrp)
		return -ENODEV;

	/* drop no-op requests and reject ones that can't be satisfied */
	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
			/* already enabled: nothing to do for this one */
			if (cgrp->subtree_control & (1 << ssid)) {
				enable &= ~(1 << ssid);
				continue;
			}

			/* the controller must be available to this cgroup */
			if (!(cgroup_control(cgrp) & (1 << ssid))) {
				ret = -ENOENT;
				goto out_unlock;
			}
		} else if (disable & (1 << ssid)) {
			/* already disabled: nothing to do for this one */
			if (!(cgrp->subtree_control & (1 << ssid))) {
				disable &= ~(1 << ssid);
				continue;
			}

			/* a live child still has it enabled in its own subtree */
			cgroup_for_each_live_child(child, cgrp) {
				if (child->subtree_control & (1 << ssid)) {
					ret = -EBUSY;
					goto out_unlock;
				}
			}
		}
	}

	/* everything requested is already in effect */
	if (!enable && !disable) {
		ret = 0;
		goto out_unlock;
	}

	ret = cgroup_vet_subtree_control_enable(cgrp, enable);
	if (ret)
		goto out_unlock;

	/* snapshot the current masks so a failed apply can be rolled back */
	cgroup_save_control(cgrp);

	cgrp->subtree_control |= enable;
	cgrp->subtree_control &= ~disable;

	/* finalize runs on both success and failure of the apply step */
	ret = cgroup_apply_control(cgrp);
	cgroup_finalize_control(cgrp, ret);
	if (ret)
		goto out_unlock;

	kernfs_activate(cgrp->kn);
out_unlock:
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328static int cgroup_enable_threaded(struct cgroup *cgrp)
3329{
3330 struct cgroup *parent = cgroup_parent(cgrp);
3331 struct cgroup *dom_cgrp = parent->dom_cgrp;
3332 struct cgroup *dsct;
3333 struct cgroup_subsys_state *d_css;
3334 int ret;
3335
3336 lockdep_assert_held(&cgroup_mutex);
3337
3338
3339 if (cgroup_is_threaded(cgrp))
3340 return 0;
3341
3342
3343
3344
3345
3346
3347
3348 if (cgroup_is_populated(cgrp) ||
3349 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3350 return -EOPNOTSUPP;
3351
3352
3353 if (!cgroup_is_valid_domain(dom_cgrp) ||
3354 !cgroup_can_be_thread_root(dom_cgrp))
3355 return -EOPNOTSUPP;
3356
3357
3358
3359
3360
3361 cgroup_save_control(cgrp);
3362
3363 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3364 if (dsct == cgrp || cgroup_is_threaded(dsct))
3365 dsct->dom_cgrp = dom_cgrp;
3366
3367 ret = cgroup_apply_control(cgrp);
3368 if (!ret)
3369 parent->nr_threaded_children++;
3370
3371 cgroup_finalize_control(cgrp, ret);
3372 return ret;
3373}
3374
3375static int cgroup_type_show(struct seq_file *seq, void *v)
3376{
3377 struct cgroup *cgrp = seq_css(seq)->cgroup;
3378
3379 if (cgroup_is_threaded(cgrp))
3380 seq_puts(seq, "threaded\n");
3381 else if (!cgroup_is_valid_domain(cgrp))
3382 seq_puts(seq, "domain invalid\n");
3383 else if (cgroup_is_thread_root(cgrp))
3384 seq_puts(seq, "domain threaded\n");
3385 else
3386 seq_puts(seq, "domain\n");
3387
3388 return 0;
3389}
3390
3391static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3392 size_t nbytes, loff_t off)
3393{
3394 struct cgroup *cgrp;
3395 int ret;
3396
3397
3398 if (strcmp(strstrip(buf), "threaded"))
3399 return -EINVAL;
3400
3401
3402 cgrp = cgroup_kn_lock_live(of->kn, true);
3403 if (!cgrp)
3404 return -ENOENT;
3405
3406
3407 ret = cgroup_enable_threaded(cgrp);
3408
3409 cgroup_kn_unlock(of->kn);
3410 return ret ?: nbytes;
3411}
3412
3413static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3414{
3415 struct cgroup *cgrp = seq_css(seq)->cgroup;
3416 int descendants = READ_ONCE(cgrp->max_descendants);
3417
3418 if (descendants == INT_MAX)
3419 seq_puts(seq, "max\n");
3420 else
3421 seq_printf(seq, "%d\n", descendants);
3422
3423 return 0;
3424}
3425
3426static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3427 char *buf, size_t nbytes, loff_t off)
3428{
3429 struct cgroup *cgrp;
3430 int descendants;
3431 ssize_t ret;
3432
3433 buf = strstrip(buf);
3434 if (!strcmp(buf, "max")) {
3435 descendants = INT_MAX;
3436 } else {
3437 ret = kstrtoint(buf, 0, &descendants);
3438 if (ret)
3439 return ret;
3440 }
3441
3442 if (descendants < 0)
3443 return -ERANGE;
3444
3445 cgrp = cgroup_kn_lock_live(of->kn, false);
3446 if (!cgrp)
3447 return -ENOENT;
3448
3449 cgrp->max_descendants = descendants;
3450
3451 cgroup_kn_unlock(of->kn);
3452
3453 return nbytes;
3454}
3455
3456static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3457{
3458 struct cgroup *cgrp = seq_css(seq)->cgroup;
3459 int depth = READ_ONCE(cgrp->max_depth);
3460
3461 if (depth == INT_MAX)
3462 seq_puts(seq, "max\n");
3463 else
3464 seq_printf(seq, "%d\n", depth);
3465
3466 return 0;
3467}
3468
3469static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3470 char *buf, size_t nbytes, loff_t off)
3471{
3472 struct cgroup *cgrp;
3473 ssize_t ret;
3474 int depth;
3475
3476 buf = strstrip(buf);
3477 if (!strcmp(buf, "max")) {
3478 depth = INT_MAX;
3479 } else {
3480 ret = kstrtoint(buf, 0, &depth);
3481 if (ret)
3482 return ret;
3483 }
3484
3485 if (depth < 0)
3486 return -ERANGE;
3487
3488 cgrp = cgroup_kn_lock_live(of->kn, false);
3489 if (!cgrp)
3490 return -ENOENT;
3491
3492 cgrp->max_depth = depth;
3493
3494 cgroup_kn_unlock(of->kn);
3495
3496 return nbytes;
3497}
3498
3499static int cgroup_events_show(struct seq_file *seq, void *v)
3500{
3501 struct cgroup *cgrp = seq_css(seq)->cgroup;
3502
3503 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3504 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3505
3506 return 0;
3507}
3508
3509static int cgroup_stat_show(struct seq_file *seq, void *v)
3510{
3511 struct cgroup *cgroup = seq_css(seq)->cgroup;
3512
3513 seq_printf(seq, "nr_descendants %d\n",
3514 cgroup->nr_descendants);
3515 seq_printf(seq, "nr_dying_descendants %d\n",
3516 cgroup->nr_dying_descendants);
3517
3518 return 0;
3519}
3520
3521static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3522 struct cgroup *cgrp, int ssid)
3523{
3524 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3525 struct cgroup_subsys_state *css;
3526 int ret;
3527
3528 if (!ss->css_extra_stat_show)
3529 return 0;
3530
3531 css = cgroup_tryget_css(cgrp, ss);
3532 if (!css)
3533 return 0;
3534
3535 ret = ss->css_extra_stat_show(seq, css);
3536 css_put(css);
3537 return ret;
3538}
3539
3540static int cpu_stat_show(struct seq_file *seq, void *v)
3541{
3542 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3543 int ret = 0;
3544
3545 cgroup_base_stat_cputime_show(seq);
3546#ifdef CONFIG_CGROUP_SCHED
3547 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3548#endif
3549 return ret;
3550}
3551
3552#ifdef CONFIG_PSI
3553static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3554{
3555 struct cgroup *cgrp = seq_css(seq)->cgroup;
3556 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3557
3558 return psi_show(seq, psi, PSI_IO);
3559}
3560static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3561{
3562 struct cgroup *cgrp = seq_css(seq)->cgroup;
3563 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3564
3565 return psi_show(seq, psi, PSI_MEM);
3566}
3567static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3568{
3569 struct cgroup *cgrp = seq_css(seq)->cgroup;
3570 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3571
3572 return psi_show(seq, psi, PSI_CPU);
3573}
3574
3575static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3576 size_t nbytes, enum psi_res res)
3577{
3578 struct psi_trigger *new;
3579 struct cgroup *cgrp;
3580 struct psi_group *psi;
3581
3582 cgrp = cgroup_kn_lock_live(of->kn, false);
3583 if (!cgrp)
3584 return -ENODEV;
3585
3586 cgroup_get(cgrp);
3587 cgroup_kn_unlock(of->kn);
3588
3589 psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3590 new = psi_trigger_create(psi, buf, nbytes, res);
3591 if (IS_ERR(new)) {
3592 cgroup_put(cgrp);
3593 return PTR_ERR(new);
3594 }
3595
3596 psi_trigger_replace(&of->priv, new);
3597
3598 cgroup_put(cgrp);
3599
3600 return nbytes;
3601}
3602
3603static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3604 char *buf, size_t nbytes,
3605 loff_t off)
3606{
3607 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3608}
3609
3610static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3611 char *buf, size_t nbytes,
3612 loff_t off)
3613{
3614 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3615}
3616
3617static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3618 char *buf, size_t nbytes,
3619 loff_t off)
3620{
3621 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3622}
3623
3624static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3625 poll_table *pt)
3626{
3627 return psi_trigger_poll(&of->priv, of->file, pt);
3628}
3629
3630static void cgroup_pressure_release(struct kernfs_open_file *of)
3631{
3632 psi_trigger_replace(&of->priv, NULL);
3633}
3634#endif
3635
3636static int cgroup_freeze_show(struct seq_file *seq, void *v)
3637{
3638 struct cgroup *cgrp = seq_css(seq)->cgroup;
3639
3640 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3641
3642 return 0;
3643}
3644
3645static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3646 char *buf, size_t nbytes, loff_t off)
3647{
3648 struct cgroup *cgrp;
3649 ssize_t ret;
3650 int freeze;
3651
3652 ret = kstrtoint(strstrip(buf), 0, &freeze);
3653 if (ret)
3654 return ret;
3655
3656 if (freeze < 0 || freeze > 1)
3657 return -ERANGE;
3658
3659 cgrp = cgroup_kn_lock_live(of->kn, false);
3660 if (!cgrp)
3661 return -ENOENT;
3662
3663 cgroup_freeze(cgrp, freeze);
3664
3665 cgroup_kn_unlock(of->kn);
3666
3667 return nbytes;
3668}
3669
3670static int cgroup_file_open(struct kernfs_open_file *of)
3671{
3672 struct cftype *cft = of_cft(of);
3673
3674 if (cft->open)
3675 return cft->open(of);
3676 return 0;
3677}
3678
3679static void cgroup_file_release(struct kernfs_open_file *of)
3680{
3681 struct cftype *cft = of_cft(of);
3682
3683 if (cft->release)
3684 cft->release(of);
3685}
3686
3687static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3688 size_t nbytes, loff_t off)
3689{
3690 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3691 struct cgroup *cgrp = of->kn->parent->priv;
3692 struct cftype *cft = of_cft(of);
3693 struct cgroup_subsys_state *css;
3694 int ret;
3695
3696 if (!nbytes)
3697 return 0;
3698
3699
3700
3701
3702
3703
3704
3705 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3706 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3707 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3708 return -EPERM;
3709
3710 if (cft->write)
3711 return cft->write(of, buf, nbytes, off);
3712
3713
3714
3715
3716
3717
3718
3719 rcu_read_lock();
3720 css = cgroup_css(cgrp, cft->ss);
3721 rcu_read_unlock();
3722
3723 if (cft->write_u64) {
3724 unsigned long long v;
3725 ret = kstrtoull(buf, 0, &v);
3726 if (!ret)
3727 ret = cft->write_u64(css, cft, v);
3728 } else if (cft->write_s64) {
3729 long long v;
3730 ret = kstrtoll(buf, 0, &v);
3731 if (!ret)
3732 ret = cft->write_s64(css, cft, v);
3733 } else {
3734 ret = -EINVAL;
3735 }
3736
3737 return ret ?: nbytes;
3738}
3739
3740static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3741{
3742 struct cftype *cft = of_cft(of);
3743
3744 if (cft->poll)
3745 return cft->poll(of, pt);
3746
3747 return kernfs_generic_poll(of, pt);
3748}
3749
3750static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3751{
3752 return seq_cft(seq)->seq_start(seq, ppos);
3753}
3754
3755static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3756{
3757 return seq_cft(seq)->seq_next(seq, v, ppos);
3758}
3759
3760static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3761{
3762 if (seq_cft(seq)->seq_stop)
3763 seq_cft(seq)->seq_stop(seq, v);
3764}
3765
3766static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3767{
3768 struct cftype *cft = seq_cft(m);
3769 struct cgroup_subsys_state *css = seq_css(m);
3770
3771 if (cft->seq_show)
3772 return cft->seq_show(m, arg);
3773
3774 if (cft->read_u64)
3775 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3776 else if (cft->read_s64)
3777 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3778 else
3779 return -EINVAL;
3780 return 0;
3781}
3782
3783static struct kernfs_ops cgroup_kf_single_ops = {
3784 .atomic_write_len = PAGE_SIZE,
3785 .open = cgroup_file_open,
3786 .release = cgroup_file_release,
3787 .write = cgroup_file_write,
3788 .poll = cgroup_file_poll,
3789 .seq_show = cgroup_seqfile_show,
3790};
3791
3792static struct kernfs_ops cgroup_kf_ops = {
3793 .atomic_write_len = PAGE_SIZE,
3794 .open = cgroup_file_open,
3795 .release = cgroup_file_release,
3796 .write = cgroup_file_write,
3797 .poll = cgroup_file_poll,
3798 .seq_start = cgroup_seqfile_start,
3799 .seq_next = cgroup_seqfile_next,
3800 .seq_stop = cgroup_seqfile_stop,
3801 .seq_show = cgroup_seqfile_show,
3802};
3803
3804
3805static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3806{
3807 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3808 .ia_uid = current_fsuid(),
3809 .ia_gid = current_fsgid(), };
3810
3811 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3812 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3813 return 0;
3814
3815 return kernfs_setattr(kn, &iattr);
3816}
3817
3818static void cgroup_file_notify_timer(struct timer_list *timer)
3819{
3820 cgroup_file_notify(container_of(timer, struct cgroup_file,
3821 notify_timer));
3822}
3823
3824static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3825 struct cftype *cft)
3826{
3827 char name[CGROUP_FILE_NAME_MAX];
3828 struct kernfs_node *kn;
3829 struct lock_class_key *key = NULL;
3830 int ret;
3831
3832#ifdef CONFIG_DEBUG_LOCK_ALLOC
3833 key = &cft->lockdep_key;
3834#endif
3835 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3836 cgroup_file_mode(cft),
3837 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
3838 0, cft->kf_ops, cft,
3839 NULL, key);
3840 if (IS_ERR(kn))
3841 return PTR_ERR(kn);
3842
3843 ret = cgroup_kn_set_ugid(kn);
3844 if (ret) {
3845 kernfs_remove(kn);
3846 return ret;
3847 }
3848
3849 if (cft->file_offset) {
3850 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3851
3852 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3853
3854 spin_lock_irq(&cgroup_file_kn_lock);
3855 cfile->kn = kn;
3856 spin_unlock_irq(&cgroup_file_kn_lock);
3857 }
3858
3859 return 0;
3860}
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
/*
 * Add or remove the files described by @cfts in @cgrp's directory.
 *
 * @css:    css the files belong to, passed through to cgroup_add_file()
 * @cgrp:   cgroup whose directory is modified
 * @cfts:   array of cftypes, terminated by an entry with an empty name
 * @is_add: true to create the files, false to remove them
 *
 * Entries whose flags don't match @cgrp (default vs. legacy hierarchy,
 * root vs. non-root, debug files without cgroup_debug) are skipped.
 *
 * If adding a file fails, the function switches to removal and walks the
 * array again up to (not including) the failed entry, then returns the
 * original error.  Must be called with cgroup_mutex held.
 *
 * Returns 0 on success or the -errno from cgroup_add_file().
 */
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
{
	/* cft_end stays NULL unless an add fails; it then bounds the undo pass */
	struct cftype *cft, *cft_end = NULL;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

restart:
	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
			continue;
		if (is_add) {
			ret = cgroup_add_file(css, cgrp, cft);
			if (ret) {
				pr_warn("%s: failed to add %s, err=%d\n",
					__func__, cft->name, ret);
				/* undo: remove everything added before @cft */
				cft_end = cft;
				is_add = false;
				goto restart;
			}
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return ret;
}
3909
3910static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3911{
3912 struct cgroup_subsys *ss = cfts[0].ss;
3913 struct cgroup *root = &ss->root->cgrp;
3914 struct cgroup_subsys_state *css;
3915 int ret = 0;
3916
3917 lockdep_assert_held(&cgroup_mutex);
3918
3919
3920 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3921 struct cgroup *cgrp = css->cgroup;
3922
3923 if (!(css->flags & CSS_VISIBLE))
3924 continue;
3925
3926 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3927 if (ret)
3928 break;
3929 }
3930
3931 if (is_add && !ret)
3932 kernfs_activate(root->kn);
3933 return ret;
3934}
3935
3936static void cgroup_exit_cftypes(struct cftype *cfts)
3937{
3938 struct cftype *cft;
3939
3940 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3941
3942 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3943 kfree(cft->kf_ops);
3944 cft->kf_ops = NULL;
3945 cft->ss = NULL;
3946
3947
3948 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3949 }
3950}
3951
3952static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3953{
3954 struct cftype *cft;
3955
3956 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3957 struct kernfs_ops *kf_ops;
3958
3959 WARN_ON(cft->ss || cft->kf_ops);
3960
3961 if (cft->seq_start)
3962 kf_ops = &cgroup_kf_ops;
3963 else
3964 kf_ops = &cgroup_kf_single_ops;
3965
3966
3967
3968
3969
3970 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3971 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3972 if (!kf_ops) {
3973 cgroup_exit_cftypes(cfts);
3974 return -ENOMEM;
3975 }
3976 kf_ops->atomic_write_len = cft->max_write_len;
3977 }
3978
3979 cft->kf_ops = kf_ops;
3980 cft->ss = ss;
3981 }
3982
3983 return 0;
3984}
3985
3986static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3987{
3988 lockdep_assert_held(&cgroup_mutex);
3989
3990 if (!cfts || !cfts[0].ss)
3991 return -ENOENT;
3992
3993 list_del(&cfts->node);
3994 cgroup_apply_cftypes(cfts, false);
3995 cgroup_exit_cftypes(cfts);
3996 return 0;
3997}
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010int cgroup_rm_cftypes(struct cftype *cfts)
4011{
4012 int ret;
4013
4014 mutex_lock(&cgroup_mutex);
4015 ret = cgroup_rm_cftypes_locked(cfts);
4016 mutex_unlock(&cgroup_mutex);
4017 return ret;
4018}
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4035{
4036 int ret;
4037
4038 if (!cgroup_ssid_enabled(ss->id))
4039 return 0;
4040
4041 if (!cfts || cfts[0].name[0] == '\0')
4042 return 0;
4043
4044 ret = cgroup_init_cftypes(ss, cfts);
4045 if (ret)
4046 return ret;
4047
4048 mutex_lock(&cgroup_mutex);
4049
4050 list_add_tail(&cfts->node, &ss->cfts);
4051 ret = cgroup_apply_cftypes(cfts, true);
4052 if (ret)
4053 cgroup_rm_cftypes_locked(cfts);
4054
4055 mutex_unlock(&cgroup_mutex);
4056 return ret;
4057}
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4068{
4069 struct cftype *cft;
4070
4071 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4072 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4073 return cgroup_add_cftypes(ss, cfts);
4074}
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4085{
4086 struct cftype *cft;
4087
4088 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4089 cft->flags |= __CFTYPE_NOT_ON_DFL;
4090 return cgroup_add_cftypes(ss, cfts);
4091}
4092
4093
4094
4095
4096
4097
4098
4099void cgroup_file_notify(struct cgroup_file *cfile)
4100{
4101 unsigned long flags;
4102
4103 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4104 if (cfile->kn) {
4105 unsigned long last = cfile->notified_at;
4106 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4107
4108 if (time_in_range(jiffies, last, next)) {
4109 timer_reduce(&cfile->notify_timer, next);
4110 } else {
4111 kernfs_notify(cfile->kn);
4112 cfile->notified_at = jiffies;
4113 }
4114 }
4115 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4116}
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
/*
 * Find the child of @parent that follows @pos.
 *
 * @pos:    current position, or NULL to start from the first child
 * @parent: css whose children list is walked
 *
 * Returns the next child css, or NULL once the end of @parent's children
 * list has been reached.  The caller must hold cgroup_mutex or be inside
 * an RCU read-side critical section.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
					   struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *next;

	cgroup_assert_mutex_or_rcu_locked();

	/*
	 * Three cases:
	 *  - no position yet: take the head of the children list;
	 *  - @pos not marked CSS_RELEASED: simply follow its sibling link;
	 *  - @pos marked CSS_RELEASED: its sibling link is not used; instead
	 *    the list is rescanned for the first child whose serial_nr is
	 *    greater than @pos's.  NOTE(review): this presumes children are
	 *    kept in ascending serial_nr order - confirm where they are
	 *    linked.
	 */
	if (!pos) {
		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
	} else if (likely(!(pos->flags & CSS_RELEASED))) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
	} else {
		list_for_each_entry_rcu(next, &parent->children, sibling,
					lockdep_is_held(&cgroup_mutex))
			if (next->serial_nr > pos->serial_nr)
				break;
	}

	/*
	 * @next may be the list head disguised as an entry; in that case
	 * the walk is over.
	 */
	if (&next->sibling != &parent->children)
		return next;
	return NULL;
}
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203struct cgroup_subsys_state *
4204css_next_descendant_pre(struct cgroup_subsys_state *pos,
4205 struct cgroup_subsys_state *root)
4206{
4207 struct cgroup_subsys_state *next;
4208
4209 cgroup_assert_mutex_or_rcu_locked();
4210
4211
4212 if (!pos)
4213 return root;
4214
4215
4216 next = css_next_child(NULL, pos);
4217 if (next)
4218 return next;
4219
4220
4221 while (pos != root) {
4222 next = css_next_child(pos, pos->parent);
4223 if (next)
4224 return next;
4225 pos = pos->parent;
4226 }
4227
4228 return NULL;
4229}
4230EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245struct cgroup_subsys_state *
4246css_rightmost_descendant(struct cgroup_subsys_state *pos)
4247{
4248 struct cgroup_subsys_state *last, *tmp;
4249
4250 cgroup_assert_mutex_or_rcu_locked();
4251
4252 do {
4253 last = pos;
4254
4255 pos = NULL;
4256 css_for_each_child(tmp, last)
4257 pos = tmp;
4258 } while (pos);
4259
4260 return last;
4261}
4262
4263static struct cgroup_subsys_state *
4264css_leftmost_descendant(struct cgroup_subsys_state *pos)
4265{
4266 struct cgroup_subsys_state *last;
4267
4268 do {
4269 last = pos;
4270 pos = css_next_child(NULL, pos);
4271 } while (pos);
4272
4273 return last;
4274}
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298struct cgroup_subsys_state *
4299css_next_descendant_post(struct cgroup_subsys_state *pos,
4300 struct cgroup_subsys_state *root)
4301{
4302 struct cgroup_subsys_state *next;
4303
4304 cgroup_assert_mutex_or_rcu_locked();
4305
4306
4307 if (!pos)
4308 return css_leftmost_descendant(root);
4309
4310
4311 if (pos == root)
4312 return NULL;
4313
4314
4315 next = css_next_child(pos, pos->parent);
4316 if (next)
4317 return css_leftmost_descendant(next);
4318
4319
4320 return pos->parent;
4321}
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331bool css_has_online_children(struct cgroup_subsys_state *css)
4332{
4333 struct cgroup_subsys_state *child;
4334 bool ret = false;
4335
4336 rcu_read_lock();
4337 css_for_each_child(child, css) {
4338 if (child->flags & CSS_ONLINE) {
4339 ret = true;
4340 break;
4341 }
4342 }
4343 rcu_read_unlock();
4344 return ret;
4345}
4346
/*
 * Step task iterator @it to the next css_set and return it, or NULL when
 * every css_set has been visited.  Must be called with css_set_lock held.
 *
 * Two nested walks are in play.  The outer one follows it->cset_pos
 * through the list headed by it->cset_head.  With CSS_TASK_ITER_THREADED,
 * each css_set reached by the outer walk also starts an inner walk over
 * its threaded_csets list, tracked by it->tcset_pos / it->tcset_head; the
 * inner walk is drained before the outer one moves on.
 */
static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
{
	struct list_head *l;
	struct cgrp_cset_link *link;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* an inner (threaded csets) walk is in progress: continue it first */
	if (it->tcset_pos) {
		l = it->tcset_pos->next;

		if (l != it->tcset_head) {
			it->tcset_pos = l;
			return container_of(l, struct css_set,
					    threaded_csets_node);
		}

		/* inner walk exhausted */
		it->tcset_pos = NULL;
	}

	/* advance the outer walk */
	l = it->cset_pos;
	l = l->next;
	if (l == it->cset_head) {
		it->cset_pos = NULL;
		return NULL;
	}

	/*
	 * With a subsystem the list links css_sets directly through
	 * e_cset_node[]; without one it is a list of cgrp_cset_link entries
	 * that each point at a css_set.
	 */
	if (it->ss) {
		cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
	} else {
		link = list_entry(l, struct cgrp_cset_link, cset_link);
		cset = link->cset;
	}

	it->cset_pos = l;

	/* set up the inner walk over this css_set's threaded csets */
	if (it->flags & CSS_TASK_ITER_THREADED) {
		/* swap the reference held on the current domain css_set */
		if (it->cur_dcset)
			put_css_set_locked(it->cur_dcset);
		it->cur_dcset = cset;
		get_css_set(cset);

		it->tcset_head = &cset->threaded_csets;
		it->tcset_pos = &cset->threaded_csets;
	}

	return cset;
}
4398
4399
4400
4401
4402
4403
4404
/*
 * Move @it to the first task of the next css_set that has any task.
 * Must be called with css_set_lock held.
 *
 * On return it->task_pos is NULL if no such css_set remains.  Otherwise
 * it points at the first entry of the chosen task list, it->cur_cset
 * holds a reference on the css_set, and @it is linked on that css_set's
 * task_iters list.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/*
	 * Skip css_sets whose three task lists are all empty; for the first
	 * one that isn't, start at its first non-empty list, checked in the
	 * order tasks, mg_tasks, dying_tasks.
	 */
	while ((cset = css_task_iter_next_css_set(it))) {
		if (!list_empty(&cset->tasks)) {
			it->cur_tasks_head = &cset->tasks;
			break;
		} else if (!list_empty(&cset->mg_tasks)) {
			it->cur_tasks_head = &cset->mg_tasks;
			break;
		} else if (!list_empty(&cset->dying_tasks)) {
			it->cur_tasks_head = &cset->dying_tasks;
			break;
		}
	}
	/* ran out of css_sets: the iteration is finished */
	if (!cset) {
		it->task_pos = NULL;
		return;
	}
	it->task_pos = it->cur_tasks_head->next;

	/*
	 * Re-home the iterator: unlink it from the previous css_set's
	 * task_iters list and drop that reference, then pin the new css_set
	 * and link onto its task_iters list.  NOTE(review): the pin and list
	 * membership presumably let task removal find and fix up active
	 * iterators via css_task_iter_skip() - confirm at the call sites.
	 */
	if (it->cur_cset) {
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
	}
	get_css_set(cset);
	it->cur_cset = cset;
	list_add(&it->iters_node, &cset->task_iters);
}
4453
4454static void css_task_iter_skip(struct css_task_iter *it,
4455 struct task_struct *task)
4456{
4457 lockdep_assert_held(&css_set_lock);
4458
4459 if (it->task_pos == &task->cg_list) {
4460 it->task_pos = it->task_pos->next;
4461 it->flags |= CSS_TASK_ITER_SKIPPED;
4462 }
4463}
4464
/*
 * Advance @it to the next task that should be reported, or leave
 * it->task_pos NULL when the iteration is complete.  Must be called with
 * css_set_lock held.
 */
static void css_task_iter_advance(struct css_task_iter *it)
{
	struct task_struct *task;

	lockdep_assert_held(&css_set_lock);
repeat:
	if (it->task_pos) {
		/*
		 * If css_task_iter_skip() already moved task_pos forward,
		 * consume the flag instead of stepping a second time.
		 */
		if (it->flags & CSS_TASK_ITER_SKIPPED)
			it->flags &= ~CSS_TASK_ITER_SKIPPED;
		else
			it->task_pos = it->task_pos->next;

		/*
		 * Reaching a list head means that list is exhausted; chain
		 * tasks -> mg_tasks -> dying_tasks and finally move on to
		 * the next css_set.  The checks are deliberately sequential
		 * so empty lists fall straight through.
		 */
		if (it->task_pos == &it->cur_cset->tasks) {
			it->cur_tasks_head = &it->cur_cset->mg_tasks;
			it->task_pos = it->cur_tasks_head->next;
		}
		if (it->task_pos == &it->cur_cset->mg_tasks) {
			it->cur_tasks_head = &it->cur_cset->dying_tasks;
			it->task_pos = it->cur_tasks_head->next;
		}
		if (it->task_pos == &it->cur_cset->dying_tasks)
			css_task_iter_advance_css_set(it);
	} else {
		/* called the first time, find first css_set with tasks */
		css_task_iter_advance_css_set(it);
	}

	if (!it->task_pos)
		return;

	task = list_entry(it->task_pos, struct task_struct, cg_list);

	if (it->flags & CSS_TASK_ITER_PROCS) {
		/* process mode: only thread group leaders are reported */
		if (!thread_group_leader(task))
			goto repeat;

		/* a leader on the dying list counts only while signal->live != 0 */
		if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
		    !atomic_read(&task->signal->live))
			goto repeat;
	} else {
		/* thread mode: nothing on the dying list is reported */
		if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
			goto repeat;
	}
}
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4530 struct css_task_iter *it)
4531{
4532 memset(it, 0, sizeof(*it));
4533
4534 spin_lock_irq(&css_set_lock);
4535
4536 it->ss = css->ss;
4537 it->flags = flags;
4538
4539 if (it->ss)
4540 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4541 else
4542 it->cset_pos = &css->cgroup->cset_links;
4543
4544 it->cset_head = it->cset_pos;
4545
4546 css_task_iter_advance(it);
4547
4548 spin_unlock_irq(&css_set_lock);
4549}
4550
4551
4552
4553
4554
4555
4556
4557
4558
/*
 * Return the next task of iterator @it, or NULL when there are no more.
 *
 * The returned task carries a reference owned by the iterator; it is
 * dropped by the next call to this function or by css_task_iter_end(), so
 * the caller must not put it.
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
	/* release the task handed out by the previous call */
	if (it->cur_task) {
		put_task_struct(it->cur_task);
		it->cur_task = NULL;
	}

	spin_lock_irq(&css_set_lock);

	/*
	 * task_pos was moved by css_task_iter_skip() since the last call;
	 * run an advance so the flag is consumed and the new position is
	 * vetted before a task is taken from it.
	 */
	if (it->flags & CSS_TASK_ITER_SKIPPED)
		css_task_iter_advance(it);

	if (it->task_pos) {
		/* pin the current task, then pre-position for the next call */
		it->cur_task = list_entry(it->task_pos, struct task_struct,
					  cg_list);
		get_task_struct(it->cur_task);
		css_task_iter_advance(it);
	}

	spin_unlock_irq(&css_set_lock);

	return it->cur_task;
}
4583
4584
4585
4586
4587
4588
4589
/*
 * Finish task iteration with @it and release everything it still holds:
 * its link on the current css_set's task_iters list together with the
 * css_set reference, the reference on the current domain css_set, and the
 * reference on the last task handed out.
 */
void css_task_iter_end(struct css_task_iter *it)
{
	if (it->cur_cset) {
		/* unlinking and the _locked put both need css_set_lock */
		spin_lock_irq(&css_set_lock);
		list_del(&it->iters_node);
		put_css_set_locked(it->cur_cset);
		spin_unlock_irq(&css_set_lock);
	}

	if (it->cur_dcset)
		put_css_set(it->cur_dcset);

	if (it->cur_task)
		put_task_struct(it->cur_task);
}
4605
4606static void cgroup_procs_release(struct kernfs_open_file *of)
4607{
4608 if (of->priv) {
4609 css_task_iter_end(of->priv);
4610 kfree(of->priv);
4611 }
4612}
4613
4614static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4615{
4616 struct kernfs_open_file *of = s->private;
4617 struct css_task_iter *it = of->priv;
4618
4619 if (pos)
4620 (*pos)++;
4621
4622 return css_task_iter_next(it);
4623}
4624
4625static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4626 unsigned int iter_flags)
4627{
4628 struct kernfs_open_file *of = s->private;
4629 struct cgroup *cgrp = seq_css(s)->cgroup;
4630 struct css_task_iter *it = of->priv;
4631
4632
4633
4634
4635
4636 if (!it) {
4637 if (WARN_ON_ONCE((*pos)))
4638 return ERR_PTR(-EINVAL);
4639
4640 it = kzalloc(sizeof(*it), GFP_KERNEL);
4641 if (!it)
4642 return ERR_PTR(-ENOMEM);
4643 of->priv = it;
4644 css_task_iter_start(&cgrp->self, iter_flags, it);
4645 } else if (!(*pos)) {
4646 css_task_iter_end(it);
4647 css_task_iter_start(&cgrp->self, iter_flags, it);
4648 } else
4649 return it->cur_task;
4650
4651 return cgroup_procs_next(s, NULL, NULL);
4652}
4653
4654static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4655{
4656 struct cgroup *cgrp = seq_css(s)->cgroup;
4657
4658
4659
4660
4661
4662
4663
4664 if (cgroup_is_threaded(cgrp))
4665 return ERR_PTR(-EOPNOTSUPP);
4666
4667 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4668 CSS_TASK_ITER_THREADED);
4669}
4670
4671static int cgroup_procs_show(struct seq_file *s, void *v)
4672{
4673 seq_printf(s, "%d\n", task_pid_vnr(v));
4674 return 0;
4675}
4676
4677static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4678{
4679 int ret;
4680 struct inode *inode;
4681
4682 lockdep_assert_held(&cgroup_mutex);
4683
4684 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4685 if (!inode)
4686 return -ENOMEM;
4687
4688 ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
4689 iput(inode);
4690 return ret;
4691}
4692
4693static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4694 struct cgroup *dst_cgrp,
4695 struct super_block *sb)
4696{
4697 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4698 struct cgroup *com_cgrp = src_cgrp;
4699 int ret;
4700
4701 lockdep_assert_held(&cgroup_mutex);
4702
4703
4704 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4705 com_cgrp = cgroup_parent(com_cgrp);
4706
4707
4708 ret = cgroup_may_write(com_cgrp, sb);
4709 if (ret)
4710 return ret;
4711
4712
4713
4714
4715
4716 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4717 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4718 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4719 return -ENOENT;
4720
4721 return 0;
4722}
4723
4724static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4725 struct cgroup *dst_cgrp,
4726 struct super_block *sb, bool threadgroup)
4727{
4728 int ret = 0;
4729
4730 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
4731 if (ret)
4732 return ret;
4733
4734 ret = cgroup_migrate_vet_dst(dst_cgrp);
4735 if (ret)
4736 return ret;
4737
4738 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4739 ret = -EOPNOTSUPP;
4740
4741 return ret;
4742}
4743
4744static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
4745 bool threadgroup)
4746{
4747 struct cgroup *src_cgrp, *dst_cgrp;
4748 struct task_struct *task;
4749 ssize_t ret;
4750 bool locked;
4751
4752 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4753 if (!dst_cgrp)
4754 return -ENODEV;
4755
4756 task = cgroup_procs_write_start(buf, threadgroup, &locked);
4757 ret = PTR_ERR_OR_ZERO(task);
4758 if (ret)
4759 goto out_unlock;
4760
4761
4762 spin_lock_irq(&css_set_lock);
4763 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4764 spin_unlock_irq(&css_set_lock);
4765
4766
4767 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4768 of->file->f_path.dentry->d_sb, threadgroup);
4769 if (ret)
4770 goto out_finish;
4771
4772 ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
4773
4774out_finish:
4775 cgroup_procs_write_finish(task, locked);
4776out_unlock:
4777 cgroup_kn_unlock(of->kn);
4778
4779 return ret;
4780}
4781
4782static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4783 char *buf, size_t nbytes, loff_t off)
4784{
4785 return __cgroup_procs_write(of, buf, true) ?: nbytes;
4786}
4787
4788static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4789{
4790 return __cgroup_procs_start(s, pos, 0);
4791}
4792
4793static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4794 char *buf, size_t nbytes, loff_t off)
4795{
4796 return __cgroup_procs_write(of, buf, false) ?: nbytes;
4797}
4798
4799
4800static struct cftype cgroup_base_files[] = {
4801 {
4802 .name = "cgroup.type",
4803 .flags = CFTYPE_NOT_ON_ROOT,
4804 .seq_show = cgroup_type_show,
4805 .write = cgroup_type_write,
4806 },
4807 {
4808 .name = "cgroup.procs",
4809 .flags = CFTYPE_NS_DELEGATABLE,
4810 .file_offset = offsetof(struct cgroup, procs_file),
4811 .release = cgroup_procs_release,
4812 .seq_start = cgroup_procs_start,
4813 .seq_next = cgroup_procs_next,
4814 .seq_show = cgroup_procs_show,
4815 .write = cgroup_procs_write,
4816 },
4817 {
4818 .name = "cgroup.threads",
4819 .flags = CFTYPE_NS_DELEGATABLE,
4820 .release = cgroup_procs_release,
4821 .seq_start = cgroup_threads_start,
4822 .seq_next = cgroup_procs_next,
4823 .seq_show = cgroup_procs_show,
4824 .write = cgroup_threads_write,
4825 },
4826 {
4827 .name = "cgroup.controllers",
4828 .seq_show = cgroup_controllers_show,
4829 },
4830 {
4831 .name = "cgroup.subtree_control",
4832 .flags = CFTYPE_NS_DELEGATABLE,
4833 .seq_show = cgroup_subtree_control_show,
4834 .write = cgroup_subtree_control_write,
4835 },
4836 {
4837 .name = "cgroup.events",
4838 .flags = CFTYPE_NOT_ON_ROOT,
4839 .file_offset = offsetof(struct cgroup, events_file),
4840 .seq_show = cgroup_events_show,
4841 },
4842 {
4843 .name = "cgroup.max.descendants",
4844 .seq_show = cgroup_max_descendants_show,
4845 .write = cgroup_max_descendants_write,
4846 },
4847 {
4848 .name = "cgroup.max.depth",
4849 .seq_show = cgroup_max_depth_show,
4850 .write = cgroup_max_depth_write,
4851 },
4852 {
4853 .name = "cgroup.stat",
4854 .seq_show = cgroup_stat_show,
4855 },
4856 {
4857 .name = "cgroup.freeze",
4858 .flags = CFTYPE_NOT_ON_ROOT,
4859 .seq_show = cgroup_freeze_show,
4860 .write = cgroup_freeze_write,
4861 },
4862 {
4863 .name = "cpu.stat",
4864 .seq_show = cpu_stat_show,
4865 },
4866#ifdef CONFIG_PSI
4867 {
4868 .name = "io.pressure",
4869 .seq_show = cgroup_io_pressure_show,
4870 .write = cgroup_io_pressure_write,
4871 .poll = cgroup_pressure_poll,
4872 .release = cgroup_pressure_release,
4873 },
4874 {
4875 .name = "memory.pressure",
4876 .seq_show = cgroup_memory_pressure_show,
4877 .write = cgroup_memory_pressure_write,
4878 .poll = cgroup_pressure_poll,
4879 .release = cgroup_pressure_release,
4880 },
4881 {
4882 .name = "cpu.pressure",
4883 .seq_show = cgroup_cpu_pressure_show,
4884 .write = cgroup_cpu_pressure_write,
4885 .poll = cgroup_pressure_poll,
4886 .release = cgroup_pressure_release,
4887 },
4888#endif
4889 { }
4890};
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914static void css_free_rwork_fn(struct work_struct *work)
4915{
4916 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
4917 struct cgroup_subsys_state, destroy_rwork);
4918 struct cgroup_subsys *ss = css->ss;
4919 struct cgroup *cgrp = css->cgroup;
4920
4921 percpu_ref_exit(&css->refcnt);
4922
4923 if (ss) {
4924
4925 struct cgroup_subsys_state *parent = css->parent;
4926 int id = css->id;
4927
4928 ss->css_free(css);
4929 cgroup_idr_remove(&ss->css_idr, id);
4930 cgroup_put(cgrp);
4931
4932 if (parent)
4933 css_put(parent);
4934 } else {
4935
4936 atomic_dec(&cgrp->root->nr_cgrps);
4937 cgroup1_pidlist_destroy_all(cgrp);
4938 cancel_work_sync(&cgrp->release_agent_work);
4939
4940 if (cgroup_parent(cgrp)) {
4941
4942
4943
4944
4945
4946
4947 cgroup_put(cgroup_parent(cgrp));
4948 kernfs_put(cgrp->kn);
4949 psi_cgroup_free(cgrp);
4950 cgroup_rstat_exit(cgrp);
4951 kfree(cgrp);
4952 } else {
4953
4954
4955
4956
4957
4958 cgroup_destroy_root(cgrp->root);
4959 }
4960 }
4961}
4962
4963static void css_release_work_fn(struct work_struct *work)
4964{
4965 struct cgroup_subsys_state *css =
4966 container_of(work, struct cgroup_subsys_state, destroy_work);
4967 struct cgroup_subsys *ss = css->ss;
4968 struct cgroup *cgrp = css->cgroup;
4969
4970 mutex_lock(&cgroup_mutex);
4971
4972 css->flags |= CSS_RELEASED;
4973 list_del_rcu(&css->sibling);
4974
4975 if (ss) {
4976
4977 if (!list_empty(&css->rstat_css_node)) {
4978 cgroup_rstat_flush(cgrp);
4979 list_del_rcu(&css->rstat_css_node);
4980 }
4981
4982 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4983 if (ss->css_released)
4984 ss->css_released(css);
4985 } else {
4986 struct cgroup *tcgrp;
4987
4988
4989 TRACE_CGROUP_PATH(release, cgrp);
4990
4991 cgroup_rstat_flush(cgrp);
4992
4993 spin_lock_irq(&css_set_lock);
4994 for (tcgrp = cgroup_parent(cgrp); tcgrp;
4995 tcgrp = cgroup_parent(tcgrp))
4996 tcgrp->nr_dying_descendants--;
4997 spin_unlock_irq(&css_set_lock);
4998
4999
5000
5001
5002
5003
5004
5005
5006 if (cgrp->kn)
5007 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5008 NULL);
5009 }
5010
5011 mutex_unlock(&cgroup_mutex);
5012
5013 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5014 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5015}
5016
5017static void css_release(struct percpu_ref *ref)
5018{
5019 struct cgroup_subsys_state *css =
5020 container_of(ref, struct cgroup_subsys_state, refcnt);
5021
5022 INIT_WORK(&css->destroy_work, css_release_work_fn);
5023 queue_work(cgroup_destroy_wq, &css->destroy_work);
5024}
5025
5026static void init_and_link_css(struct cgroup_subsys_state *css,
5027 struct cgroup_subsys *ss, struct cgroup *cgrp)
5028{
5029 lockdep_assert_held(&cgroup_mutex);
5030
5031 cgroup_get_live(cgrp);
5032
5033 memset(css, 0, sizeof(*css));
5034 css->cgroup = cgrp;
5035 css->ss = ss;
5036 css->id = -1;
5037 INIT_LIST_HEAD(&css->sibling);
5038 INIT_LIST_HEAD(&css->children);
5039 INIT_LIST_HEAD(&css->rstat_css_node);
5040 css->serial_nr = css_serial_nr_next++;
5041 atomic_set(&css->online_cnt, 0);
5042
5043 if (cgroup_parent(cgrp)) {
5044 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5045 css_get(css->parent);
5046 }
5047
5048 if (ss->css_rstat_flush)
5049 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5050
5051 BUG_ON(cgroup_css(cgrp, ss));
5052}
5053
5054
5055static int online_css(struct cgroup_subsys_state *css)
5056{
5057 struct cgroup_subsys *ss = css->ss;
5058 int ret = 0;
5059
5060 lockdep_assert_held(&cgroup_mutex);
5061
5062 if (ss->css_online)
5063 ret = ss->css_online(css);
5064 if (!ret) {
5065 css->flags |= CSS_ONLINE;
5066 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5067
5068 atomic_inc(&css->online_cnt);
5069 if (css->parent)
5070 atomic_inc(&css->parent->online_cnt);
5071 }
5072 return ret;
5073}
5074
5075
5076static void offline_css(struct cgroup_subsys_state *css)
5077{
5078 struct cgroup_subsys *ss = css->ss;
5079
5080 lockdep_assert_held(&cgroup_mutex);
5081
5082 if (!(css->flags & CSS_ONLINE))
5083 return;
5084
5085 if (ss->css_offline)
5086 ss->css_offline(css);
5087
5088 css->flags &= ~CSS_ONLINE;
5089 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5090
5091 wake_up_all(&css->cgroup->offline_waitq);
5092}
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5104 struct cgroup_subsys *ss)
5105{
5106 struct cgroup *parent = cgroup_parent(cgrp);
5107 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5108 struct cgroup_subsys_state *css;
5109 int err;
5110
5111 lockdep_assert_held(&cgroup_mutex);
5112
5113 css = ss->css_alloc(parent_css);
5114 if (!css)
5115 css = ERR_PTR(-ENOMEM);
5116 if (IS_ERR(css))
5117 return css;
5118
5119 init_and_link_css(css, ss, cgrp);
5120
5121 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5122 if (err)
5123 goto err_free_css;
5124
5125 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5126 if (err < 0)
5127 goto err_free_css;
5128 css->id = err;
5129
5130
5131 list_add_tail_rcu(&css->sibling, &parent_css->children);
5132 cgroup_idr_replace(&ss->css_idr, css, css->id);
5133
5134 err = online_css(css);
5135 if (err)
5136 goto err_list_del;
5137
5138 return css;
5139
5140err_list_del:
5141 list_del_rcu(&css->sibling);
5142err_free_css:
5143 list_del_rcu(&css->rstat_css_node);
5144 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5145 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5146 return ERR_PTR(err);
5147}
5148
5149
5150
5151
5152
5153
5154static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5155 umode_t mode)
5156{
5157 struct cgroup_root *root = parent->root;
5158 struct cgroup *cgrp, *tcgrp;
5159 struct kernfs_node *kn;
5160 int level = parent->level + 1;
5161 int ret;
5162
5163
5164 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5165 GFP_KERNEL);
5166 if (!cgrp)
5167 return ERR_PTR(-ENOMEM);
5168
5169 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5170 if (ret)
5171 goto out_free_cgrp;
5172
5173 ret = cgroup_rstat_init(cgrp);
5174 if (ret)
5175 goto out_cancel_ref;
5176
5177
5178 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5179 if (IS_ERR(kn)) {
5180 ret = PTR_ERR(kn);
5181 goto out_stat_exit;
5182 }
5183 cgrp->kn = kn;
5184
5185 init_cgroup_housekeeping(cgrp);
5186
5187 cgrp->self.parent = &parent->self;
5188 cgrp->root = root;
5189 cgrp->level = level;
5190
5191 ret = psi_cgroup_alloc(cgrp);
5192 if (ret)
5193 goto out_kernfs_remove;
5194
5195 ret = cgroup_bpf_inherit(cgrp);
5196 if (ret)
5197 goto out_psi_free;
5198
5199
5200
5201
5202
5203 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5204 if (cgrp->freezer.e_freeze) {
5205
5206
5207
5208
5209
5210
5211 set_bit(CGRP_FREEZE, &cgrp->flags);
5212 set_bit(CGRP_FROZEN, &cgrp->flags);
5213 }
5214
5215 spin_lock_irq(&css_set_lock);
5216 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5217 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
5218
5219 if (tcgrp != cgrp) {
5220 tcgrp->nr_descendants++;
5221
5222
5223
5224
5225
5226
5227 if (cgrp->freezer.e_freeze)
5228 tcgrp->freezer.nr_frozen_descendants++;
5229 }
5230 }
5231 spin_unlock_irq(&css_set_lock);
5232
5233 if (notify_on_release(parent))
5234 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5235
5236 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5237 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5238
5239 cgrp->self.serial_nr = css_serial_nr_next++;
5240
5241
5242 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5243 atomic_inc(&root->nr_cgrps);
5244 cgroup_get_live(parent);
5245
5246
5247
5248
5249
5250 if (!cgroup_on_dfl(cgrp))
5251 cgrp->subtree_control = cgroup_control(cgrp);
5252
5253 cgroup_propagate_control(cgrp);
5254
5255 return cgrp;
5256
5257out_psi_free:
5258 psi_cgroup_free(cgrp);
5259out_kernfs_remove:
5260 kernfs_remove(cgrp->kn);
5261out_stat_exit:
5262 cgroup_rstat_exit(cgrp);
5263out_cancel_ref:
5264 percpu_ref_exit(&cgrp->self.refcnt);
5265out_free_cgrp:
5266 kfree(cgrp);
5267 return ERR_PTR(ret);
5268}
5269
5270static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5271{
5272 struct cgroup *cgroup;
5273 int ret = false;
5274 int level = 1;
5275
5276 lockdep_assert_held(&cgroup_mutex);
5277
5278 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5279 if (cgroup->nr_descendants >= cgroup->max_descendants)
5280 goto fail;
5281
5282 if (level > cgroup->max_depth)
5283 goto fail;
5284
5285 level++;
5286 }
5287
5288 ret = true;
5289fail:
5290 return ret;
5291}
5292
5293int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5294{
5295 struct cgroup *parent, *cgrp;
5296 int ret;
5297
5298
5299 if (strchr(name, '\n'))
5300 return -EINVAL;
5301
5302 parent = cgroup_kn_lock_live(parent_kn, false);
5303 if (!parent)
5304 return -ENODEV;
5305
5306 if (!cgroup_check_hierarchy_limits(parent)) {
5307 ret = -EAGAIN;
5308 goto out_unlock;
5309 }
5310
5311 cgrp = cgroup_create(parent, name, mode);
5312 if (IS_ERR(cgrp)) {
5313 ret = PTR_ERR(cgrp);
5314 goto out_unlock;
5315 }
5316
5317
5318
5319
5320
5321 kernfs_get(cgrp->kn);
5322
5323 ret = cgroup_kn_set_ugid(cgrp->kn);
5324 if (ret)
5325 goto out_destroy;
5326
5327 ret = css_populate_dir(&cgrp->self);
5328 if (ret)
5329 goto out_destroy;
5330
5331 ret = cgroup_apply_control_enable(cgrp);
5332 if (ret)
5333 goto out_destroy;
5334
5335 TRACE_CGROUP_PATH(mkdir, cgrp);
5336
5337
5338 kernfs_activate(cgrp->kn);
5339
5340 ret = 0;
5341 goto out_unlock;
5342
5343out_destroy:
5344 cgroup_destroy_locked(cgrp);
5345out_unlock:
5346 cgroup_kn_unlock(parent_kn);
5347 return ret;
5348}
5349
5350
5351
5352
5353
5354
5355static void css_killed_work_fn(struct work_struct *work)
5356{
5357 struct cgroup_subsys_state *css =
5358 container_of(work, struct cgroup_subsys_state, destroy_work);
5359
5360 mutex_lock(&cgroup_mutex);
5361
5362 do {
5363 offline_css(css);
5364 css_put(css);
5365
5366 css = css->parent;
5367 } while (css && atomic_dec_and_test(&css->online_cnt));
5368
5369 mutex_unlock(&cgroup_mutex);
5370}
5371
5372
5373static void css_killed_ref_fn(struct percpu_ref *ref)
5374{
5375 struct cgroup_subsys_state *css =
5376 container_of(ref, struct cgroup_subsys_state, refcnt);
5377
5378 if (atomic_dec_and_test(&css->online_cnt)) {
5379 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5380 queue_work(cgroup_destroy_wq, &css->destroy_work);
5381 }
5382}
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393static void kill_css(struct cgroup_subsys_state *css)
5394{
5395 lockdep_assert_held(&cgroup_mutex);
5396
5397 if (css->flags & CSS_DYING)
5398 return;
5399
5400 css->flags |= CSS_DYING;
5401
5402
5403
5404
5405
5406 css_clear_dir(css);
5407
5408
5409
5410
5411
5412 css_get(css);
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5425}
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451static int cgroup_destroy_locked(struct cgroup *cgrp)
5452 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5453{
5454 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5455 struct cgroup_subsys_state *css;
5456 struct cgrp_cset_link *link;
5457 int ssid;
5458
5459 lockdep_assert_held(&cgroup_mutex);
5460
5461
5462
5463
5464
5465 if (cgroup_is_populated(cgrp))
5466 return -EBUSY;
5467
5468
5469
5470
5471
5472
5473 if (css_has_online_children(&cgrp->self))
5474 return -EBUSY;
5475
5476
5477
5478
5479
5480
5481
5482 cgrp->self.flags &= ~CSS_ONLINE;
5483
5484 spin_lock_irq(&css_set_lock);
5485 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5486 link->cset->dead = true;
5487 spin_unlock_irq(&css_set_lock);
5488
5489
5490 for_each_css(css, ssid, cgrp)
5491 kill_css(css);
5492
5493
5494 css_clear_dir(&cgrp->self);
5495 kernfs_remove(cgrp->kn);
5496
5497 if (parent && cgroup_is_threaded(cgrp))
5498 parent->nr_threaded_children--;
5499
5500 spin_lock_irq(&css_set_lock);
5501 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5502 tcgrp->nr_descendants--;
5503 tcgrp->nr_dying_descendants++;
5504
5505
5506
5507
5508 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5509 tcgrp->freezer.nr_frozen_descendants--;
5510 }
5511 spin_unlock_irq(&css_set_lock);
5512
5513 cgroup1_check_for_release(parent);
5514
5515 cgroup_bpf_offline(cgrp);
5516
5517
5518 percpu_ref_kill(&cgrp->self.refcnt);
5519
5520 return 0;
5521};
5522
5523int cgroup_rmdir(struct kernfs_node *kn)
5524{
5525 struct cgroup *cgrp;
5526 int ret = 0;
5527
5528 cgrp = cgroup_kn_lock_live(kn, false);
5529 if (!cgrp)
5530 return 0;
5531
5532 ret = cgroup_destroy_locked(cgrp);
5533 if (!ret)
5534 TRACE_CGROUP_PATH(rmdir, cgrp);
5535
5536 cgroup_kn_unlock(kn);
5537 return ret;
5538}
5539
5540static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5541 .show_options = cgroup_show_options,
5542 .mkdir = cgroup_mkdir,
5543 .rmdir = cgroup_rmdir,
5544 .show_path = cgroup_show_path,
5545};
5546
5547static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5548{
5549 struct cgroup_subsys_state *css;
5550
5551 pr_debug("Initializing cgroup subsys %s\n", ss->name);
5552
5553 mutex_lock(&cgroup_mutex);
5554
5555 idr_init(&ss->css_idr);
5556 INIT_LIST_HEAD(&ss->cfts);
5557
5558
5559 ss->root = &cgrp_dfl_root;
5560 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5561
5562 BUG_ON(IS_ERR(css));
5563 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5564
5565
5566
5567
5568
5569 css->flags |= CSS_NO_REF;
5570
5571 if (early) {
5572
5573 css->id = 1;
5574 } else {
5575 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5576 BUG_ON(css->id < 0);
5577 }
5578
5579
5580
5581
5582
5583 init_css_set.subsys[ss->id] = css;
5584
5585 have_fork_callback |= (bool)ss->fork << ss->id;
5586 have_exit_callback |= (bool)ss->exit << ss->id;
5587 have_release_callback |= (bool)ss->release << ss->id;
5588 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5589
5590
5591
5592
5593 BUG_ON(!list_empty(&init_task.tasks));
5594
5595 BUG_ON(online_css(css));
5596
5597 mutex_unlock(&cgroup_mutex);
5598}
5599
5600
5601
5602
5603
5604
5605
5606int __init cgroup_init_early(void)
5607{
5608 static struct cgroup_fs_context __initdata ctx;
5609 struct cgroup_subsys *ss;
5610 int i;
5611
5612 ctx.root = &cgrp_dfl_root;
5613 init_cgroup_root(&ctx);
5614 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5615
5616 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5617
5618 for_each_subsys(ss, i) {
5619 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5620 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5621 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5622 ss->id, ss->name);
5623 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5624 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5625
5626 ss->id = i;
5627 ss->name = cgroup_subsys_name[i];
5628 if (!ss->legacy_name)
5629 ss->legacy_name = cgroup_subsys_name[i];
5630
5631 if (ss->early_init)
5632 cgroup_init_subsys(ss, true);
5633 }
5634 return 0;
5635}
5636
5637
5638
5639
5640
5641
5642
5643int __init cgroup_init(void)
5644{
5645 struct cgroup_subsys *ss;
5646 int ssid;
5647
5648 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5649 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5650 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5651
5652 cgroup_rstat_boot();
5653
5654
5655
5656
5657
5658 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
5659
5660 get_user_ns(init_cgroup_ns.user_ns);
5661
5662 mutex_lock(&cgroup_mutex);
5663
5664
5665
5666
5667
5668 hash_add(css_set_table, &init_css_set.hlist,
5669 css_set_hash(init_css_set.subsys));
5670
5671 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5672
5673 mutex_unlock(&cgroup_mutex);
5674
5675 for_each_subsys(ss, ssid) {
5676 if (ss->early_init) {
5677 struct cgroup_subsys_state *css =
5678 init_css_set.subsys[ss->id];
5679
5680 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5681 GFP_KERNEL);
5682 BUG_ON(css->id < 0);
5683 } else {
5684 cgroup_init_subsys(ss, false);
5685 }
5686
5687 list_add_tail(&init_css_set.e_cset_node[ssid],
5688 &cgrp_dfl_root.cgrp.e_csets[ssid]);
5689
5690
5691
5692
5693
5694
5695 if (!cgroup_ssid_enabled(ssid))
5696 continue;
5697
5698 if (cgroup1_ssid_disabled(ssid))
5699 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5700 ss->name);
5701
5702 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5703
5704
5705 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5706
5707 if (ss->implicit_on_dfl)
5708 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
5709 else if (!ss->dfl_cftypes)
5710 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5711
5712 if (ss->threaded)
5713 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5714
5715 if (ss->dfl_cftypes == ss->legacy_cftypes) {
5716 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5717 } else {
5718 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5719 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5720 }
5721
5722 if (ss->bind)
5723 ss->bind(init_css_set.subsys[ssid]);
5724
5725 mutex_lock(&cgroup_mutex);
5726 css_populate_dir(init_css_set.subsys[ssid]);
5727 mutex_unlock(&cgroup_mutex);
5728 }
5729
5730
5731 hash_del(&init_css_set.hlist);
5732 hash_add(css_set_table, &init_css_set.hlist,
5733 css_set_hash(init_css_set.subsys));
5734
5735 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5736 WARN_ON(register_filesystem(&cgroup_fs_type));
5737 WARN_ON(register_filesystem(&cgroup2_fs_type));
5738 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
5739#ifdef CONFIG_CPUSETS
5740 WARN_ON(register_filesystem(&cpuset_fs_type));
5741#endif
5742
5743 return 0;
5744}
5745
5746static int __init cgroup_wq_init(void)
5747{
5748
5749
5750
5751
5752
5753
5754
5755
5756 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5757 BUG_ON(!cgroup_destroy_wq);
5758 return 0;
5759}
5760core_initcall(cgroup_wq_init);
5761
5762void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
5763{
5764 struct kernfs_node *kn;
5765
5766 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
5767 if (!kn)
5768 return;
5769 kernfs_path(kn, buf, buflen);
5770 kernfs_put(kn);
5771}
5772
5773
5774
5775
5776
5777
5778int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5779 struct pid *pid, struct task_struct *tsk)
5780{
5781 char *buf;
5782 int retval;
5783 struct cgroup_root *root;
5784
5785 retval = -ENOMEM;
5786 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5787 if (!buf)
5788 goto out;
5789
5790 mutex_lock(&cgroup_mutex);
5791 spin_lock_irq(&css_set_lock);
5792
5793 for_each_root(root) {
5794 struct cgroup_subsys *ss;
5795 struct cgroup *cgrp;
5796 int ssid, count = 0;
5797
5798 if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
5799 continue;
5800
5801 seq_printf(m, "%d:", root->hierarchy_id);
5802 if (root != &cgrp_dfl_root)
5803 for_each_subsys(ss, ssid)
5804 if (root->subsys_mask & (1 << ssid))
5805 seq_printf(m, "%s%s", count++ ? "," : "",
5806 ss->legacy_name);
5807 if (strlen(root->name))
5808 seq_printf(m, "%sname=%s", count ? "," : "",
5809 root->name);
5810 seq_putc(m, ':');
5811
5812 cgrp = task_cgroup_from_root(tsk, root);
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5824 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
5825 current->nsproxy->cgroup_ns);
5826 if (retval >= PATH_MAX)
5827 retval = -ENAMETOOLONG;
5828 if (retval < 0)
5829 goto out_unlock;
5830
5831 seq_puts(m, buf);
5832 } else {
5833 seq_puts(m, "/");
5834 }
5835
5836 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5837 seq_puts(m, " (deleted)\n");
5838 else
5839 seq_putc(m, '\n');
5840 }
5841
5842 retval = 0;
5843out_unlock:
5844 spin_unlock_irq(&css_set_lock);
5845 mutex_unlock(&cgroup_mutex);
5846 kfree(buf);
5847out:
5848 return retval;
5849}
5850
5851
5852
5853
5854
5855
5856
5857
5858void cgroup_fork(struct task_struct *child)
5859{
5860 RCU_INIT_POINTER(child->cgroups, &init_css_set);
5861 INIT_LIST_HEAD(&child->cg_list);
5862}
5863
5864static struct cgroup *cgroup_get_from_file(struct file *f)
5865{
5866 struct cgroup_subsys_state *css;
5867 struct cgroup *cgrp;
5868
5869 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
5870 if (IS_ERR(css))
5871 return ERR_CAST(css);
5872
5873 cgrp = css->cgroup;
5874 if (!cgroup_on_dfl(cgrp)) {
5875 cgroup_put(cgrp);
5876 return ERR_PTR(-EBADF);
5877 }
5878
5879 return cgrp;
5880}
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
5899 __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
5900{
5901 int ret;
5902 struct cgroup *dst_cgrp = NULL;
5903 struct css_set *cset;
5904 struct super_block *sb;
5905 struct file *f;
5906
5907 if (kargs->flags & CLONE_INTO_CGROUP)
5908 mutex_lock(&cgroup_mutex);
5909
5910 cgroup_threadgroup_change_begin(current);
5911
5912 spin_lock_irq(&css_set_lock);
5913 cset = task_css_set(current);
5914 get_css_set(cset);
5915 spin_unlock_irq(&css_set_lock);
5916
5917 if (!(kargs->flags & CLONE_INTO_CGROUP)) {
5918 kargs->cset = cset;
5919 return 0;
5920 }
5921
5922 f = fget_raw(kargs->cgroup);
5923 if (!f) {
5924 ret = -EBADF;
5925 goto err;
5926 }
5927 sb = f->f_path.dentry->d_sb;
5928
5929 dst_cgrp = cgroup_get_from_file(f);
5930 if (IS_ERR(dst_cgrp)) {
5931 ret = PTR_ERR(dst_cgrp);
5932 dst_cgrp = NULL;
5933 goto err;
5934 }
5935
5936 if (cgroup_is_dead(dst_cgrp)) {
5937 ret = -ENODEV;
5938 goto err;
5939 }
5940
5941
5942
5943
5944
5945
5946 ret = cgroup_may_write(dst_cgrp, sb);
5947 if (ret)
5948 goto err;
5949
5950 ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
5951 !(kargs->flags & CLONE_THREAD));
5952 if (ret)
5953 goto err;
5954
5955 kargs->cset = find_css_set(cset, dst_cgrp);
5956 if (!kargs->cset) {
5957 ret = -ENOMEM;
5958 goto err;
5959 }
5960
5961 put_css_set(cset);
5962 fput(f);
5963 kargs->cgrp = dst_cgrp;
5964 return ret;
5965
5966err:
5967 cgroup_threadgroup_change_end(current);
5968 mutex_unlock(&cgroup_mutex);
5969 if (f)
5970 fput(f);
5971 if (dst_cgrp)
5972 cgroup_put(dst_cgrp);
5973 put_css_set(cset);
5974 if (kargs->cset)
5975 put_css_set(kargs->cset);
5976 return ret;
5977}
5978
5979
5980
5981
5982
5983
5984
5985
5986static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
5987 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
5988{
5989 cgroup_threadgroup_change_end(current);
5990
5991 if (kargs->flags & CLONE_INTO_CGROUP) {
5992 struct cgroup *cgrp = kargs->cgrp;
5993 struct css_set *cset = kargs->cset;
5994
5995 mutex_unlock(&cgroup_mutex);
5996
5997 if (cset) {
5998 put_css_set(cset);
5999 kargs->cset = NULL;
6000 }
6001
6002 if (cgrp) {
6003 cgroup_put(cgrp);
6004 kargs->cgrp = NULL;
6005 }
6006 }
6007}
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
6020{
6021 struct cgroup_subsys *ss;
6022 int i, j, ret;
6023
6024 ret = cgroup_css_set_fork(kargs);
6025 if (ret)
6026 return ret;
6027
6028 do_each_subsys_mask(ss, i, have_canfork_callback) {
6029 ret = ss->can_fork(child, kargs->cset);
6030 if (ret)
6031 goto out_revert;
6032 } while_each_subsys_mask();
6033
6034 return 0;
6035
6036out_revert:
6037 for_each_subsys(ss, j) {
6038 if (j >= i)
6039 break;
6040 if (ss->cancel_fork)
6041 ss->cancel_fork(child, kargs->cset);
6042 }
6043
6044 cgroup_css_set_put_fork(kargs);
6045
6046 return ret;
6047}
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058void cgroup_cancel_fork(struct task_struct *child,
6059 struct kernel_clone_args *kargs)
6060{
6061 struct cgroup_subsys *ss;
6062 int i;
6063
6064 for_each_subsys(ss, i)
6065 if (ss->cancel_fork)
6066 ss->cancel_fork(child, kargs->cset);
6067
6068 cgroup_css_set_put_fork(kargs);
6069}
6070
6071
6072
6073
6074
6075
6076
6077
6078void cgroup_post_fork(struct task_struct *child,
6079 struct kernel_clone_args *kargs)
6080 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6081{
6082 struct cgroup_subsys *ss;
6083 struct css_set *cset;
6084 int i;
6085
6086 cset = kargs->cset;
6087 kargs->cset = NULL;
6088
6089 spin_lock_irq(&css_set_lock);
6090
6091
6092 if (likely(child->pid)) {
6093 WARN_ON_ONCE(!list_empty(&child->cg_list));
6094 cset->nr_tasks++;
6095 css_set_move_task(child, NULL, cset, false);
6096 } else {
6097 put_css_set(cset);
6098 cset = NULL;
6099 }
6100
6101
6102
6103
6104
6105
6106 if (unlikely(cgroup_task_freeze(child))) {
6107 spin_lock(&child->sighand->siglock);
6108 WARN_ON_ONCE(child->frozen);
6109 child->jobctl |= JOBCTL_TRAP_FREEZE;
6110 spin_unlock(&child->sighand->siglock);
6111
6112
6113
6114
6115
6116
6117
6118 }
6119
6120 spin_unlock_irq(&css_set_lock);
6121
6122
6123
6124
6125
6126
6127 do_each_subsys_mask(ss, i, have_fork_callback) {
6128 ss->fork(child);
6129 } while_each_subsys_mask();
6130
6131
6132 if (kargs->flags & CLONE_NEWCGROUP) {
6133 struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6134
6135 get_css_set(cset);
6136 child->nsproxy->cgroup_ns->root_cset = cset;
6137 put_css_set(rcset);
6138 }
6139
6140 cgroup_css_set_put_fork(kargs);
6141}
6142
6143
6144
6145
6146
6147
6148
6149
6150void cgroup_exit(struct task_struct *tsk)
6151{
6152 struct cgroup_subsys *ss;
6153 struct css_set *cset;
6154 int i;
6155
6156 spin_lock_irq(&css_set_lock);
6157
6158 WARN_ON_ONCE(list_empty(&tsk->cg_list));
6159 cset = task_css_set(tsk);
6160 css_set_move_task(tsk, cset, NULL, false);
6161 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6162 cset->nr_tasks--;
6163
6164 WARN_ON_ONCE(cgroup_task_frozen(tsk));
6165 if (unlikely(cgroup_task_freeze(tsk)))
6166 cgroup_update_frozen(task_dfl_cgroup(tsk));
6167
6168 spin_unlock_irq(&css_set_lock);
6169
6170
6171 do_each_subsys_mask(ss, i, have_exit_callback) {
6172 ss->exit(tsk);
6173 } while_each_subsys_mask();
6174}
6175
6176void cgroup_release(struct task_struct *task)
6177{
6178 struct cgroup_subsys *ss;
6179 int ssid;
6180
6181 do_each_subsys_mask(ss, ssid, have_release_callback) {
6182 ss->release(task);
6183 } while_each_subsys_mask();
6184
6185 spin_lock_irq(&css_set_lock);
6186 css_set_skip_task_iters(task_css_set(task), task);
6187 list_del_init(&task->cg_list);
6188 spin_unlock_irq(&css_set_lock);
6189}
6190
/* Drop @task's reference on its css_set. */
void cgroup_free(struct task_struct *task)
{
	put_css_set(task_css_set(task));
}
6196
6197static int __init cgroup_disable(char *str)
6198{
6199 struct cgroup_subsys *ss;
6200 char *token;
6201 int i;
6202
6203 while ((token = strsep(&str, ",")) != NULL) {
6204 if (!*token)
6205 continue;
6206
6207 for_each_subsys(ss, i) {
6208 if (strcmp(token, ss->name) &&
6209 strcmp(token, ss->legacy_name))
6210 continue;
6211
6212 static_branch_disable(cgroup_subsys_enabled_key[i]);
6213 pr_info("Disabling %s control group subsystem\n",
6214 ss->name);
6215 }
6216 }
6217 return 1;
6218}
6219__setup("cgroup_disable=", cgroup_disable);
6220
/*
 * Weak default that does nothing; a non-weak definition elsewhere, if
 * linked in, replaces it.
 */
void __init __weak enable_debug_cgroup(void) { }

/*
 * enable_cgroup_debug - handler for the "cgroup_debug" boot parameter
 * @str: parameter value (unused)
 *
 * Sets the global cgroup_debug flag and calls enable_debug_cgroup().
 * Always returns 1.
 */
static int __init enable_cgroup_debug(char *str)
{
	cgroup_debug = true;
	enable_debug_cgroup();
	return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6241 struct cgroup_subsys *ss)
6242{
6243 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6244 struct file_system_type *s_type = dentry->d_sb->s_type;
6245 struct cgroup_subsys_state *css = NULL;
6246 struct cgroup *cgrp;
6247
6248
6249 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6250 !kn || kernfs_type(kn) != KERNFS_DIR)
6251 return ERR_PTR(-EBADF);
6252
6253 rcu_read_lock();
6254
6255
6256
6257
6258
6259
6260 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6261 if (cgrp)
6262 css = cgroup_css(cgrp, ss);
6263
6264 if (!css || !css_tryget_online(css))
6265 css = ERR_PTR(-ENOENT);
6266
6267 rcu_read_unlock();
6268 return css;
6269}
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6280{
6281 WARN_ON_ONCE(!rcu_read_lock_held());
6282 return idr_find(&ss->css_idr, id);
6283}
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294struct cgroup *cgroup_get_from_path(const char *path)
6295{
6296 struct kernfs_node *kn;
6297 struct cgroup *cgrp;
6298
6299 mutex_lock(&cgroup_mutex);
6300
6301 kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
6302 if (kn) {
6303 if (kernfs_type(kn) == KERNFS_DIR) {
6304 cgrp = kn->priv;
6305 cgroup_get_live(cgrp);
6306 } else {
6307 cgrp = ERR_PTR(-ENOTDIR);
6308 }
6309 kernfs_put(kn);
6310 } else {
6311 cgrp = ERR_PTR(-ENOENT);
6312 }
6313
6314 mutex_unlock(&cgroup_mutex);
6315 return cgrp;
6316}
6317EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328struct cgroup *cgroup_get_from_fd(int fd)
6329{
6330 struct cgroup *cgrp;
6331 struct file *f;
6332
6333 f = fget_raw(fd);
6334 if (!f)
6335 return ERR_PTR(-EBADF);
6336
6337 cgrp = cgroup_get_from_file(f);
6338 fput(f);
6339 return cgrp;
6340}
6341EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6342
6343static u64 power_of_ten(int power)
6344{
6345 u64 v = 1;
6346 while (power--)
6347 v *= 10;
6348 return v;
6349}
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6366{
6367 s64 whole, frac = 0;
6368 int fstart = 0, fend = 0, flen;
6369
6370 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6371 return -EINVAL;
6372 if (frac < 0)
6373 return -EINVAL;
6374
6375 flen = fend > fstart ? fend - fstart : 0;
6376 if (flen < dec_shift)
6377 frac *= power_of_ten(dec_shift - flen);
6378 else
6379 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6380
6381 *v = whole * power_of_ten(dec_shift) + frac;
6382 return 0;
6383}
6384
6385
6386
6387
6388
6389#ifdef CONFIG_SOCK_CGROUP_DATA
6390
6391#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
6392
6393DEFINE_SPINLOCK(cgroup_sk_update_lock);
6394static bool cgroup_sk_alloc_disabled __read_mostly;
6395
6396void cgroup_sk_alloc_disable(void)
6397{
6398 if (cgroup_sk_alloc_disabled)
6399 return;
6400 pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
6401 cgroup_sk_alloc_disabled = true;
6402}
6403
6404#else
6405
6406#define cgroup_sk_alloc_disabled false
6407
6408#endif
6409
6410void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6411{
6412 if (cgroup_sk_alloc_disabled) {
6413 skcd->no_refcnt = 1;
6414 return;
6415 }
6416
6417
6418 if (in_interrupt())
6419 return;
6420
6421 rcu_read_lock();
6422
6423 while (true) {
6424 struct css_set *cset;
6425
6426 cset = task_css_set(current);
6427 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6428 skcd->val = (unsigned long)cset->dfl_cgrp;
6429 cgroup_bpf_get(cset->dfl_cgrp);
6430 break;
6431 }
6432 cpu_relax();
6433 }
6434
6435 rcu_read_unlock();
6436}
6437
6438void cgroup_sk_clone(struct sock_cgroup_data *skcd)
6439{
6440 if (skcd->val) {
6441 if (skcd->no_refcnt)
6442 return;
6443
6444
6445
6446
6447
6448 cgroup_get(sock_cgroup_ptr(skcd));
6449 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6450 }
6451}
6452
6453void cgroup_sk_free(struct sock_cgroup_data *skcd)
6454{
6455 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6456
6457 if (skcd->no_refcnt)
6458 return;
6459 cgroup_bpf_put(cgrp);
6460 cgroup_put(cgrp);
6461}
6462
6463#endif
6464
6465#ifdef CONFIG_CGROUP_BPF
6466int cgroup_bpf_attach(struct cgroup *cgrp,
6467 struct bpf_prog *prog, struct bpf_prog *replace_prog,
6468 struct bpf_cgroup_link *link,
6469 enum bpf_attach_type type,
6470 u32 flags)
6471{
6472 int ret;
6473
6474 mutex_lock(&cgroup_mutex);
6475 ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
6476 mutex_unlock(&cgroup_mutex);
6477 return ret;
6478}
6479
6480int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
6481 enum bpf_attach_type type)
6482{
6483 int ret;
6484
6485 mutex_lock(&cgroup_mutex);
6486 ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
6487 mutex_unlock(&cgroup_mutex);
6488 return ret;
6489}
6490
6491int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
6492 union bpf_attr __user *uattr)
6493{
6494 int ret;
6495
6496 mutex_lock(&cgroup_mutex);
6497 ret = __cgroup_bpf_query(cgrp, attr, uattr);
6498 mutex_unlock(&cgroup_mutex);
6499 return ret;
6500}
6501#endif
6502
6503#ifdef CONFIG_SYSFS
6504static ssize_t show_delegatable_files(struct cftype *files, char *buf,
6505 ssize_t size, const char *prefix)
6506{
6507 struct cftype *cft;
6508 ssize_t ret = 0;
6509
6510 for (cft = files; cft && cft->name[0] != '\0'; cft++) {
6511 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
6512 continue;
6513
6514 if (prefix)
6515 ret += snprintf(buf + ret, size - ret, "%s.", prefix);
6516
6517 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
6518
6519 if (WARN_ON(ret >= size))
6520 break;
6521 }
6522
6523 return ret;
6524}
6525
6526static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
6527 char *buf)
6528{
6529 struct cgroup_subsys *ss;
6530 int ssid;
6531 ssize_t ret = 0;
6532
6533 ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
6534 NULL);
6535
6536 for_each_subsys(ss, ssid)
6537 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
6538 PAGE_SIZE - ret,
6539 cgroup_subsys_name[ssid]);
6540
6541 return ret;
6542}
6543static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6544
6545static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6546 char *buf)
6547{
6548 return snprintf(buf, PAGE_SIZE,
6549 "nsdelegate\n"
6550 "memory_localevents\n"
6551 "memory_recursiveprot\n");
6552}
6553static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6554
/* NULL-terminated list of the attributes placed in cgroup_sysfs_attr_group */
static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};
6560
/* attribute group named "cgroup", registered by cgroup_sysfs_init() */
static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};
6565
6566static int __init cgroup_sysfs_init(void)
6567{
6568 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6569}
6570subsys_initcall(cgroup_sysfs_init);
6571
6572#endif
6573