#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX	(MAX_CGROUP_TYPE_NAMELEN + \
				 MAX_CFTYPE_NAME + 2)

#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
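
/*
 * Locking overview, inferred from the lockdep assertions used throughout
 * this file: cgroup_mutex is the master lock serializing hierarchy
 * modifications, and css_set_lock is a spinlock guarding css_set
 * membership and the task->cg_list linkage.  Read-side code may rely on
 * RCU instead of cgroup_mutex, as cgroup_assert_mutex_or_rcu_locked()
 * below documents.
 */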
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;

static DEFINE_SPINLOCK(cgroup_idr_lock);

static DEFINE_SPINLOCK(cgroup_file_kn_lock);

DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

#define cgroup_assert_mutex_or_rcu_locked()			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&		\
			 !lockdep_is_held(&cgroup_mutex),	\
			 "cgroup_mutex or RCU read lock required");

static struct workqueue_struct *cgroup_destroy_wq;
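
/*
 * The SUBSYS() x-macro below is expanded repeatedly against
 * <linux/cgroup_subsys.h> to generate, for every compiled-in subsystem,
 * the subsystem pointer array, the name array, and the static keys that
 * gate the "enabled" and "on default hierarchy" checks.
 */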
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x)						\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);	\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);	\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);	\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

static bool cgrp_dfl_visible;

static u16 cgrp_dfl_inhibit_ss_mask;

static u16 cgrp_dfl_implicit_ss_mask;

static u16 cgrp_dfl_threaded_ss_mask;

LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

static DEFINE_IDR(cgroup_hierarchy_idr);

static u64 css_serial_nr_next = 1;
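
/*
 * These bitmasks record, per subsystem id, which subsystems implement the
 * corresponding task-lifetime callbacks, so the fork/exit paths can skip
 * subsystems that don't need them.
 */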
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

struct cgroup_namespace init_cgroup_ns = {
	.count		= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static bool cgroup_has_tasks(struct cgroup *cgrp)
{
	return cgrp->nr_populated_csets;
}

bool cgroup_is_threaded(struct cgroup *cgrp)
{
	return cgrp->dom_cgrp != cgrp;
}

static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	return !cgroup_parent(cgrp);
}

static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	if (cgroup_is_mixable(cgrp))
		return true;

	if (cgroup_is_threaded(cgrp))
		return false;

	if (cgrp->nr_populated_domain_children)
		return false;

	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}

bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	if (cgroup_is_threaded(cgrp))
		return false;

	if (cgrp->nr_threaded_children)
		return true;

	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}

static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
	if (cgroup_is_threaded(cgrp))
		return false;

	while ((cgrp = cgroup_parent(cgrp))) {
		if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
			return false;
		if (cgroup_is_threaded(cgrp))
			return false;
	}

	return true;
}

static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent) {
		u16 ss_mask = parent->subtree_control;

		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent) {
		u16 ss_mask = parent->subtree_ss_mask;

		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	return cgrp->root->subsys_mask;
}
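
/*
 * cgroup_css - obtain @cgrp's css for the specified subsystem.
 * Returns @cgrp's css if @ss is set, or @cgrp->self otherwise.  The caller
 * must hold cgroup_mutex or be in an RCU read-side section, matching the
 * rcu_dereference_check() below.
 */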
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					     lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}
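
/*
 * The "effective" css of @cgrp for @ss is the css of the nearest ancestor
 * (including @cgrp itself) on which @ss is enabled.  The lookup helpers
 * below differ in locking requirements and in whether a reference is taken
 * on the returned css.
 */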
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
							struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}

struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
					 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	do {
		css = cgroup_css(cgrp, ss);

		if (css)
			return css;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	return init_css_set.subsys[ss->id];
}

struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

int __cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += link->cset->nr_tasks;

	return count;
}

int cgroup_task_count(const struct cgroup *cgrp)
{
	int count;

	spin_lock_irq(&css_set_lock);
	count = __cgroup_task_count(cgrp);
	spin_unlock_irq(&css_set_lock);

	return count;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else
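
/*
 * do_each_subsys_mask()/while_each_subsys_mask() iterate only over the
 * subsystems whose bit is set in @ss_mask; the dummy block structure lets
 * the pair be used like an ordinary loop construct.
 */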
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) {					\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)

#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};

static int css_set_count	= 1;

static bool css_set_threaded(struct css_set *cset)
{
	return cset->dom_cset != cset;
}

static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}
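
/*
 * cgroup_update_populated - update the populated counters of @cgrp and its
 * ancestors after a css_set pointing at @cgrp gains or loses tasks.
 * Propagation stops at the first ancestor whose populated state doesn't
 * change; state changes are announced through the "cgroup.events" file.
 */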
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			cgrp->nr_populated_csets += adj;
		} else {
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
				  cgroup_is_populated(cgrp));
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

static void css_set_skip_task_iters(struct css_set *cset,
				    struct task_struct *task)
{
	struct css_task_iter *it, *pos;

	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
		css_task_iter_skip(it, task);
}
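
/*
 * css_set_move_task - move @task between css_sets, updating the populated
 * state of every affected cgroup and notifying any active task iterators.
 * Either @from_cset or @to_cset may be NULL for attach/detach-only moves;
 * @use_mg_tasks queues the task on the migration list instead of ->tasks.
 */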
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}
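
/*
 * css_sets are hashed by the set of css pointers they contain so that an
 * existing compatible css_set can be found quickly; css_set_hash() simply
 * sums the pointers and folds the high bits back in.
 */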
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}
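
/*
 * compare_css_sets - helper for find_existing_css_set().  Returns true if
 * @cset matches what we'd get if @old_cset's task migrated to @new_cgrp:
 * the subsystem css pointers must equal @template, the domain cgroup must
 * agree, and, hierarchy by hierarchy, the linked cgroups must match
 * (substituting @new_cgrp on its own hierarchy).
 */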
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;

		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}

		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;

		BUG_ON(cgrp1->root != cgrp2->root);

		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			template[i] = cgroup_e_css_by_mask(cgrp, ss);
		} else {
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		return cset;
	}

	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get_live(cgrp);
}
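
/*
 * find_css_set - return the css_set @old_cset's task would belong to after
 * migrating to @cgrp, reusing an existing set when possible and otherwise
 * allocating and hashing a new one.  For a threaded cgroup the matching
 * domain css_set is looked up as well and linked as ->dom_cset.
 */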
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);

	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	if (cgroup_is_threaded(cset->dfl_cgrp)) {
		struct css_set *dcset;

		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
		if (!dcset) {
			put_css_set(cset);
			return NULL;
		}

		spin_lock_irq(&css_set_lock);
		cset->dom_cset = dcset;
		list_add_tail(&cset->threaded_csets_node,
			      &dcset->threaded_csets);
		spin_unlock_irq(&css_set_lock);
	}

	return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}

static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else if (root == &cgrp_dfl_root) {
		res = cset->dfl_cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else if (root == &cgrp_dfl_root) {
		res = cset->dfl_cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	return cset_cgroup_from_root(task_css_set(task), root);
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
		const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";

		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
			 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	} else {
		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	}
	return buf;
}

static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}
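
/*
 * cgroup_calc_subtree_ss_mask - given the controllers explicitly enabled in
 * @subtree_control, compute the full set that must be instantiated by
 * iterating ss->depends_on to a fixed point, clamped to @this_ss_mask (the
 * set available on this cgroup).
 */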
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}

void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}
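
/*
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods.  Pins the
 * cgroup, breaks kernfs active protection (so the cgroup can be removed
 * while the method is running) and grabs cgroup_mutex, optionally draining
 * offlining csses first.  Returns NULL if the cgroup is already dead; pair
 * with cgroup_kn_unlock() on success.
 */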
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);

		del_timer_sync(&cfile->notify_timer);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		cgroup_addrm_files(css, cgrp, cfts, false);
	} else {
		list_for_each_entry(cfts, &css->ss->cfts, node)
			cgroup_addrm_files(css, cgrp, cfts, false);
	}
}

static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
		if (ret < 0)
			return ret;
	} else {
		list_for_each_entry(cfts, &css->ss->cfts, node) {
			ret = cgroup_addrm_files(css, cgrp, cfts, true);
			if (ret < 0) {
				failed_cfts = cfts;
				goto err;
			}
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}
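
/*
 * rebind_subsystems - move every controller in @ss_mask from its current
 * hierarchy to @dst_root.  A controller can only be moved while its current
 * root cgroup has no children (unless implicitly enabled on the default
 * hierarchy), and each css_set is relinked onto the destination cgroup's
 * e_csets list.
 */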
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

enum cgroup2_param {
	Opt_nsdelegate,
	Opt_memory_localevents,
	Opt_memory_recursiveprot,
	nr__cgroup2_params
};

static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
	fsparam_flag("nsdelegate",		Opt_nsdelegate),
	fsparam_flag("memory_localevents",	Opt_memory_localevents),
	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
	{}
};

static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_nsdelegate:
		ctx->flags |= CGRP_ROOT_NS_DELEGATE;
		return 0;
	case Opt_memory_localevents:
		ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
		return 0;
	case Opt_memory_recursiveprot:
		ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
		return 0;
	}
	return -EINVAL;
}

static void apply_cgroup_root_flags(unsigned int root_flags)
{
	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
		if (root_flags & CGRP_ROOT_NS_DELEGATE)
			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

		if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;

		if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
	}
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
		seq_puts(seq, ",nsdelegate");
	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
		seq_puts(seq, ",memory_localevents");
	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
		seq_puts(seq, ",memory_recursiveprot");
	return 0;
}

static int cgroup_reconfigure(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

	apply_cgroup_root_flags(ctx->flags);
	return 0;
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ssid;

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	cgrp->self.cgroup = cgrp;
	cgrp->self.flags |= CSS_ONLINE;
	cgrp->dom_cgrp = cgrp;
	cgrp->max_descendants = INT_MAX;
	cgrp->max_depth = INT_MAX;
	INIT_LIST_HEAD(&cgrp->rstat_css_list);
	prev_cputime_init(&cgrp->prev_cputime);

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

void init_cgroup_root(struct cgroup_fs_context *ctx)
{
	struct cgroup_root *root = ctx->root;
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);

	root->flags = ctx->flags;
	if (ctx->release_agent)
		strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
	if (ctx->name)
		strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
	if (ctx->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      0, GFP_KERNEL);
	if (ret)
		goto out;

	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED |
					   KERNFS_ROOT_SUPPORT_EXPORTOP |
					   KERNFS_ROOT_SUPPORT_USER_XATTR,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;
	WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
	root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	ret = cgroup_bpf_inherit(root_cgrp);
	WARN_ON_ONCE(ret);

	trace_cgroup_setup_root(root);

	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

int cgroup_do_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	ctx->kfc.root = ctx->root->kf_root;
	if (fc->fs_type == &cgroup2_fs_type)
		ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
	else
		ctx->kfc.magic = CGROUP_SUPER_MAGIC;
	ret = kernfs_get_tree(fc);

	if (!ret && ctx->ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct super_block *sb = fc->root->d_sb;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, sb);
		dput(fc->root);
		if (IS_ERR(nsdentry)) {
			deactivate_locked_super(sb);
			ret = PTR_ERR(nsdentry);
			nsdentry = NULL;
		}
		fc->root = nsdentry;
	}

	if (!ctx->kfc.new_sb_created)
		cgroup_put(&ctx->root->cgrp);

	return ret;
}

static void cgroup_fs_context_free(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

	kfree(ctx->name);
	kfree(ctx->release_agent);
	put_cgroup_ns(ctx->ns);
	kernfs_free_fs_context(fc);
	kfree(ctx);
}

static int cgroup_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	cgrp_dfl_visible = true;
	cgroup_get_live(&cgrp_dfl_root.cgrp);
	ctx->root = &cgrp_dfl_root;

	ret = cgroup_do_get_tree(fc);
	if (!ret)
		apply_cgroup_root_flags(ctx->flags);
	return ret;
}

static const struct fs_context_operations cgroup_fs_context_ops = {
	.free		= cgroup_fs_context_free,
	.parse_param	= cgroup2_parse_param,
	.get_tree	= cgroup_get_tree,
	.reconfigure	= cgroup_reconfigure,
};

static const struct fs_context_operations cgroup1_fs_context_ops = {
	.free		= cgroup_fs_context_free,
	.parse_param	= cgroup1_parse_param,
	.get_tree	= cgroup1_get_tree,
	.reconfigure	= cgroup1_reconfigure,
};
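
/*
 * Shared fs_context initialisation for both "cgroup" and "cgroup2" mounts:
 * the caller's cgroup namespace is pinned and the v1 or v2 operations are
 * selected based on the filesystem type.
 */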
static int cgroup_init_fs_context(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx;

	ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->ns = current->nsproxy->cgroup_ns;
	get_cgroup_ns(ctx->ns);
	fc->fs_private = &ctx->kfc;
	if (fc->fs_type == &cgroup2_fs_type)
		fc->ops = &cgroup_fs_context_ops;
	else
		fc->ops = &cgroup1_fs_context_ops;
	put_user_ns(fc->user_ns);
	fc->user_ns = get_user_ns(ctx->ns->user_ns);
	fc->global = true;
	return 0;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
	    !percpu_ref_is_dying(&root->cgrp.self.refcnt))
		percpu_ref_kill(&root->cgrp.self.refcnt);
	cgroup_put(&root->cgrp);
	kernfs_kill_sb(sb);
}

struct file_system_type cgroup_fs_type = {
	.name			= "cgroup",
	.init_fs_context	= cgroup_init_fs_context,
	.parameters		= cgroup1_fs_parameters,
	.kill_sb		= cgroup_kill_sb,
	.fs_flags		= FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name			= "cgroup2",
	.init_fs_context	= cgroup_init_fs_context,
	.parameters		= cgroup2_fs_parameters,
	.kill_sb		= cgroup_kill_sb,
	.fs_flags		= FS_USERNS_MOUNT,
};

#ifdef CONFIG_CPUSETS
static const struct fs_context_operations cpuset_fs_context_ops = {
	.get_tree	= cgroup1_get_tree,
	.free		= cgroup_fs_context_free,
};

static int cpuset_init_fs_context(struct fs_context *fc)
{
	char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
	struct cgroup_fs_context *ctx;
	int err;

	err = cgroup_init_fs_context(fc);
	if (err) {
		kfree(agent);
		return err;
	}

	fc->ops = &cpuset_fs_context_ops;

	ctx = cgroup_fc2context(fc);
	ctx->subsys_mask = 1 << cpuset_cgrp_id;
	ctx->flags |= CGRP_ROOT_NOPREFIX;
	ctx->release_agent = agent;

	get_filesystem(&cgroup_fs_type);
	put_filesystem(fc->fs_type);
	fc->fs_type = &cgroup_fs_type;

	return 0;
}

static struct file_system_type cpuset_fs_type = {
	.name			= "cpuset",
	.init_fs_context	= cpuset_init_fs_context,
	.fs_flags		= FS_USERNS_MOUNT,
};
#endif

int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		ret = strlcpy(buf, "/", buflen);
	}

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
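
/*
 * cgroup_migrate_add_task - queue @task for migration.  Exiting tasks and
 * css_sets that weren't preloaded as migration sources are skipped; source
 * and destination css_sets are chained onto the taskset in @mgctx.
 */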
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	if (task->flags & PF_EXITING)
		return;

	WARN_ON_ONCE(list_empty(&task->cg_list));

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	mgctx->tset.nr_tasks++;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}

struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}
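
/*
 * cgroup_migrate_execute - the core migration state machine: run every
 * affected subsystem's ->can_attach(), move the tasks to their destination
 * css_sets under css_set_lock, then run ->attach().  On a ->can_attach()
 * failure, ->cancel_attach() is invoked for the subsystems that had already
 * agreed, before the taskset is unwound.
 */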
2390static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2391{
2392 struct cgroup_taskset *tset = &mgctx->tset;
2393 struct cgroup_subsys *ss;
2394 struct task_struct *task, *tmp_task;
2395 struct css_set *cset, *tmp_cset;
2396 int ssid, failed_ssid, ret;
2397
2398
2399 if (tset->nr_tasks) {
2400 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2401 if (ss->can_attach) {
2402 tset->ssid = ssid;
2403 ret = ss->can_attach(tset);
2404 if (ret) {
2405 failed_ssid = ssid;
2406 goto out_cancel_attach;
2407 }
2408 }
2409 } while_each_subsys_mask();
2410 }
2411
2412
2413
2414
2415
2416
2417 spin_lock_irq(&css_set_lock);
2418 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2419 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2420 struct css_set *from_cset = task_css_set(task);
2421 struct css_set *to_cset = cset->mg_dst_cset;
2422
2423 get_css_set(to_cset);
2424 to_cset->nr_tasks++;
2425 css_set_move_task(task, from_cset, to_cset, true);
2426 from_cset->nr_tasks--;
2427
2428
2429
2430
2431 cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
2432 to_cset->dfl_cgrp);
2433 put_css_set_locked(from_cset);
2434
2435 }
2436 }
2437 spin_unlock_irq(&css_set_lock);
2438
2439
2440
2441
2442
2443
2444 tset->csets = &tset->dst_csets;
2445
2446 if (tset->nr_tasks) {
2447 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2448 if (ss->attach) {
2449 tset->ssid = ssid;
2450 ss->attach(tset);
2451 }
2452 } while_each_subsys_mask();
2453 }
2454
2455 ret = 0;
2456 goto out_release_tset;
2457
2458out_cancel_attach:
2459 if (tset->nr_tasks) {
2460 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2461 if (ssid == failed_ssid)
2462 break;
2463 if (ss->cancel_attach) {
2464 tset->ssid = ssid;
2465 ss->cancel_attach(tset);
2466 }
2467 } while_each_subsys_mask();
2468 }
2469out_release_tset:
2470 spin_lock_irq(&css_set_lock);
2471 list_splice_init(&tset->dst_csets, &tset->src_csets);
2472 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2473 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2474 list_del_init(&cset->mg_node);
2475 }
2476 spin_unlock_irq(&css_set_lock);
2477
2478
2479
2480
2481
2482
2483 tset->nr_tasks = 0;
2484 tset->csets = &tset->src_csets;
2485 return ret;
2486}
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2498{
2499
2500 if (!cgroup_on_dfl(dst_cgrp))
2501 return 0;
2502
2503
2504 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2505 return -EOPNOTSUPP;
2506
2507
2508 if (cgroup_is_mixable(dst_cgrp))
2509 return 0;
2510
2511
2512
2513
2514
2515 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2516 return 0;
2517
2518
2519 if (dst_cgrp->subtree_control)
2520 return -EBUSY;
2521
2522 return 0;
2523}
2524
2525
2526
2527
2528
2529
2530
2531
2532void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2533{
2534 LIST_HEAD(preloaded);
2535 struct css_set *cset, *tmp_cset;
2536
2537 lockdep_assert_held(&cgroup_mutex);
2538
2539 spin_lock_irq(&css_set_lock);
2540
2541 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2542 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2543
2544 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2545 cset->mg_src_cgrp = NULL;
2546 cset->mg_dst_cgrp = NULL;
2547 cset->mg_dst_cset = NULL;
2548 list_del_init(&cset->mg_preload_node);
2549 put_css_set_locked(cset);
2550 }
2551
2552 spin_unlock_irq(&css_set_lock);
2553}
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571void cgroup_migrate_add_src(struct css_set *src_cset,
2572 struct cgroup *dst_cgrp,
2573 struct cgroup_mgctx *mgctx)
2574{
2575 struct cgroup *src_cgrp;
2576
2577 lockdep_assert_held(&cgroup_mutex);
2578 lockdep_assert_held(&css_set_lock);
2579
2580
2581
2582
2583
2584
2585 if (src_cset->dead)
2586 return;
2587
2588 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2589
2590 if (!list_empty(&src_cset->mg_preload_node))
2591 return;
2592
2593 WARN_ON(src_cset->mg_src_cgrp);
2594 WARN_ON(src_cset->mg_dst_cgrp);
2595 WARN_ON(!list_empty(&src_cset->mg_tasks));
2596 WARN_ON(!list_empty(&src_cset->mg_node));
2597
2598 src_cset->mg_src_cgrp = src_cgrp;
2599 src_cset->mg_dst_cgrp = dst_cgrp;
2600 get_css_set(src_cset);
2601 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2602}
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2619{
2620 struct css_set *src_cset, *tmp_cset;
2621
2622 lockdep_assert_held(&cgroup_mutex);
2623
2624
2625 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2626 mg_preload_node) {
2627 struct css_set *dst_cset;
2628 struct cgroup_subsys *ss;
2629 int ssid;
2630
2631 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2632 if (!dst_cset)
2633 return -ENOMEM;
2634
2635 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2636
2637
2638
2639
2640
2641
2642 if (src_cset == dst_cset) {
2643 src_cset->mg_src_cgrp = NULL;
2644 src_cset->mg_dst_cgrp = NULL;
2645 list_del_init(&src_cset->mg_preload_node);
2646 put_css_set(src_cset);
2647 put_css_set(dst_cset);
2648 continue;
2649 }
2650
2651 src_cset->mg_dst_cset = dst_cset;
2652
2653 if (list_empty(&dst_cset->mg_preload_node))
2654 list_add_tail(&dst_cset->mg_preload_node,
2655 &mgctx->preloaded_dst_csets);
2656 else
2657 put_css_set(dst_cset);
2658
2659 for_each_subsys(ss, ssid)
2660 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2661 mgctx->ss_mask |= 1 << ssid;
2662 }
2663
2664 return 0;
2665}
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2686 struct cgroup_mgctx *mgctx)
2687{
2688 struct task_struct *task;
2689
2690
2691
2692
2693
2694
2695 spin_lock_irq(&css_set_lock);
2696 rcu_read_lock();
2697 task = leader;
2698 do {
2699 cgroup_migrate_add_task(task, mgctx);
2700 if (!threadgroup)
2701 break;
2702 } while_each_thread(leader, task);
2703 rcu_read_unlock();
2704 spin_unlock_irq(&css_set_lock);
2705
2706 return cgroup_migrate_execute(mgctx);
2707}
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2718 bool threadgroup)
2719{
2720 DEFINE_CGROUP_MGCTX(mgctx);
2721 struct task_struct *task;
2722 int ret = 0;
2723
2724
2725 spin_lock_irq(&css_set_lock);
2726 rcu_read_lock();
2727 task = leader;
2728 do {
2729 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2730 if (!threadgroup)
2731 break;
2732 } while_each_thread(leader, task);
2733 rcu_read_unlock();
2734 spin_unlock_irq(&css_set_lock);
2735
2736
2737 ret = cgroup_migrate_prepare_dst(&mgctx);
2738 if (!ret)
2739 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2740
2741 cgroup_migrate_finish(&mgctx);
2742
2743 if (!ret)
2744 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2745
2746 return ret;
2747}
2748
2749struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2750 bool *locked)
2751 __acquires(&cgroup_threadgroup_rwsem)
2752{
2753 struct task_struct *tsk;
2754 pid_t pid;
2755
2756 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2757 return ERR_PTR(-EINVAL);
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767 lockdep_assert_held(&cgroup_mutex);
2768 if (pid || threadgroup) {
2769 percpu_down_write(&cgroup_threadgroup_rwsem);
2770 *locked = true;
2771 } else {
2772 *locked = false;
2773 }
2774
2775 rcu_read_lock();
2776 if (pid) {
2777 tsk = find_task_by_vpid(pid);
2778 if (!tsk) {
2779 tsk = ERR_PTR(-ESRCH);
2780 goto out_unlock_threadgroup;
2781 }
2782 } else {
2783 tsk = current;
2784 }
2785
2786 if (threadgroup)
2787 tsk = tsk->group_leader;
2788
2789
2790
2791
2792
2793
2794
2795 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2796 tsk = ERR_PTR(-EINVAL);
2797 goto out_unlock_threadgroup;
2798 }
2799
2800 get_task_struct(tsk);
2801 goto out_unlock_rcu;
2802
2803out_unlock_threadgroup:
2804 if (*locked) {
2805 percpu_up_write(&cgroup_threadgroup_rwsem);
2806 *locked = false;
2807 }
2808out_unlock_rcu:
2809 rcu_read_unlock();
2810 return tsk;
2811}
2812
2813void cgroup_procs_write_finish(struct task_struct *task, bool locked)
2814 __releases(&cgroup_threadgroup_rwsem)
2815{
2816 struct cgroup_subsys *ss;
2817 int ssid;
2818
2819
2820 put_task_struct(task);
2821
2822 if (locked)
2823 percpu_up_write(&cgroup_threadgroup_rwsem);
2824 for_each_subsys(ss, ssid)
2825 if (ss->post_attach)
2826 ss->post_attach();
2827}
2828
2829static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2830{
2831 struct cgroup_subsys *ss;
2832 bool printed = false;
2833 int ssid;
2834
2835 do_each_subsys_mask(ss, ssid, ss_mask) {
2836 if (printed)
2837 seq_putc(seq, ' ');
2838 seq_puts(seq, ss->name);
2839 printed = true;
2840 } while_each_subsys_mask();
2841 if (printed)
2842 seq_putc(seq, '\n');
2843}
2844
2845
2846static int cgroup_controllers_show(struct seq_file *seq, void *v)
2847{
2848 struct cgroup *cgrp = seq_css(seq)->cgroup;
2849
2850 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2851 return 0;
2852}
2853
2854
2855static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2856{
2857 struct cgroup *cgrp = seq_css(seq)->cgroup;
2858
2859 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2860 return 0;
2861}
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
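/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */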
2872static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2873{
2874 DEFINE_CGROUP_MGCTX(mgctx);
2875 struct cgroup_subsys_state *d_css;
2876 struct cgroup *dsct;
2877 struct css_set *src_cset;
2878 int ret;
2879
2880 lockdep_assert_held(&cgroup_mutex);
2881
2882 percpu_down_write(&cgroup_threadgroup_rwsem);
2883
2884
2885 spin_lock_irq(&css_set_lock);
2886 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2887 struct cgrp_cset_link *link;
2888
2889 list_for_each_entry(link, &dsct->cset_links, cset_link)
2890 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
2891 }
2892 spin_unlock_irq(&css_set_lock);
2893
2894
2895 ret = cgroup_migrate_prepare_dst(&mgctx);
2896 if (ret)
2897 goto out_finish;
2898
2899 spin_lock_irq(&css_set_lock);
2900 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2901 struct task_struct *task, *ntask;
2902
2903
2904 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2905 cgroup_migrate_add_task(task, &mgctx);
2906 }
2907 spin_unlock_irq(&css_set_lock);
2908
2909 ret = cgroup_migrate_execute(&mgctx);
2910out_finish:
2911 cgroup_migrate_finish(&mgctx);
2912 percpu_up_write(&cgroup_threadgroup_rwsem);
2913 return ret;
2914}
2915
2916
2917
2918
2919
2920
2921
2922
2923
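/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */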
2924void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
2925 __acquires(&cgroup_mutex)
2926{
2927 struct cgroup *dsct;
2928 struct cgroup_subsys_state *d_css;
2929 struct cgroup_subsys *ss;
2930 int ssid;
2931
2932restart:
2933 mutex_lock(&cgroup_mutex);
2934
2935 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2936 for_each_subsys(ss, ssid) {
2937 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2938 DEFINE_WAIT(wait);
2939
2940 if (!css || !percpu_ref_is_dying(&css->refcnt))
2941 continue;
2942
2943 cgroup_get_live(dsct);
2944 prepare_to_wait(&dsct->offline_waitq, &wait,
2945 TASK_UNINTERRUPTIBLE);
2946
2947 mutex_unlock(&cgroup_mutex);
2948 schedule();
2949 finish_wait(&dsct->offline_waitq, &wait);
2950
2951 cgroup_put(dsct);
2952 goto restart;
2953 }
2954 }
2955}
2956
2957
2958
2959
2960
2961
2962
2963
2964
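/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */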
2965static void cgroup_save_control(struct cgroup *cgrp)
2966{
2967 struct cgroup *dsct;
2968 struct cgroup_subsys_state *d_css;
2969
2970 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2971 dsct->old_subtree_control = dsct->subtree_control;
2972 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
2973 dsct->old_dom_cgrp = dsct->dom_cgrp;
2974 }
2975}
2976
2977
2978
2979
2980
2981
2982
2983
2984
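/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */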
2985static void cgroup_propagate_control(struct cgroup *cgrp)
2986{
2987 struct cgroup *dsct;
2988 struct cgroup_subsys_state *d_css;
2989
2990 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2991 dsct->subtree_control &= cgroup_control(dsct);
2992 dsct->subtree_ss_mask =
2993 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
2994 cgroup_ss_mask(dsct));
2995 }
2996}
2997
2998
2999
3000
3001
3002
3003
3004
3005
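/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */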
3006static void cgroup_restore_control(struct cgroup *cgrp)
3007{
3008 struct cgroup *dsct;
3009 struct cgroup_subsys_state *d_css;
3010
3011 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3012 dsct->subtree_control = dsct->old_subtree_control;
3013 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3014 dsct->dom_cgrp = dsct->old_dom_cgrp;
3015 }
3016}
3017
3018static bool css_visible(struct cgroup_subsys_state *css)
3019{
3020 struct cgroup_subsys *ss = css->ss;
3021 struct cgroup *cgrp = css->cgroup;
3022
3023 if (cgroup_control(cgrp) & (1 << ss->id))
3024 return true;
3025 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3026 return false;
3027 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3028}
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
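/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */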
3043static int cgroup_apply_control_enable(struct cgroup *cgrp)
3044{
3045 struct cgroup *dsct;
3046 struct cgroup_subsys_state *d_css;
3047 struct cgroup_subsys *ss;
3048 int ssid, ret;
3049
3050 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3051 for_each_subsys(ss, ssid) {
3052 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3053
3054 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3055 continue;
3056
3057 if (!css) {
3058 css = css_create(dsct, ss);
3059 if (IS_ERR(css))
3060 return PTR_ERR(css);
3061 }
3062
3063 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3064
3065 if (css_visible(css)) {
3066 ret = css_populate_dir(css);
3067 if (ret)
3068 return ret;
3069 }
3070 }
3071 }
3072
3073 return 0;
3074}
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
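/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and css_visible().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not be destroyed as
 * other subsystems may still be using it.
 */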
3089static void cgroup_apply_control_disable(struct cgroup *cgrp)
3090{
3091 struct cgroup *dsct;
3092 struct cgroup_subsys_state *d_css;
3093 struct cgroup_subsys *ss;
3094 int ssid;
3095
3096 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3097 for_each_subsys(ss, ssid) {
3098 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3099
3100 if (!css)
3101 continue;
3102
3103 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3104
3105 if (css->parent &&
3106 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3107 kill_css(css);
3108 } else if (!css_visible(css)) {
3109 css_clear_dir(css);
3110 if (ss->css_reset)
3111 ss->css_reset(css);
3112 }
3113 }
3114 }
3115}
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
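/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 */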
3134static int cgroup_apply_control(struct cgroup *cgrp)
3135{
3136 int ret;
3137
3138 cgroup_propagate_control(cgrp);
3139
3140 ret = cgroup_apply_control_enable(cgrp);
3141 if (ret)
3142 return ret;
3143
3144
3145
3146
3147
3148
3149 ret = cgroup_update_dfl_csses(cgrp);
3150 if (ret)
3151 return ret;
3152
3153 return 0;
3154}
3155
3156
3157
3158
3159
3160
3161
3162
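/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */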
3163static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
3164{
3165 if (ret) {
3166 cgroup_restore_control(cgrp);
3167 cgroup_propagate_control(cgrp);
3168 }
3169
3170 cgroup_apply_control_disable(cgrp);
3171}
3172
3173static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3174{
3175 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3176
3177
3178 if (!enable)
3179 return 0;
3180
3181
3182 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3183 return -EOPNOTSUPP;
3184
3185
3186 if (cgroup_is_mixable(cgrp))
3187 return 0;
3188
3189 if (domain_enable) {
3190
3191 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3192 return -EOPNOTSUPP;
3193 } else {
3194
3195
3196
3197
3198
3199 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3200 return 0;
3201 }
3202
3203
3204
3205
3206
3207 if (cgroup_has_tasks(cgrp))
3208 return -EBUSY;
3209
3210 return 0;
3211}
3212
3213
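/* change the enabled child controllers for a cgroup in the default hierarchy */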
3214static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3215 char *buf, size_t nbytes,
3216 loff_t off)
3217{
3218 u16 enable = 0, disable = 0;
3219 struct cgroup *cgrp, *child;
3220 struct cgroup_subsys *ss;
3221 char *tok;
3222 int ssid, ret;
3223
3224
3225
3226
3227
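	/* parse what's requested */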
3228 buf = strstrip(buf);
3229 while ((tok = strsep(&buf, " "))) {
3230 if (tok[0] == '\0')
3231 continue;
3232 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3233 if (!cgroup_ssid_enabled(ssid) ||
3234 strcmp(tok + 1, ss->name))
3235 continue;
3236
3237 if (*tok == '+') {
3238 enable |= 1 << ssid;
3239 disable &= ~(1 << ssid);
3240 } else if (*tok == '-') {
3241 disable |= 1 << ssid;
3242 enable &= ~(1 << ssid);
3243 } else {
3244 return -EINVAL;
3245 }
3246 break;
3247 } while_each_subsys_mask();
3248 if (ssid == CGROUP_SUBSYS_COUNT)
3249 return -EINVAL;
3250 }
3251
3252 cgrp = cgroup_kn_lock_live(of->kn, true);
3253 if (!cgrp)
3254 return -ENODEV;
3255
3256 for_each_subsys(ss, ssid) {
3257 if (enable & (1 << ssid)) {
3258 if (cgrp->subtree_control & (1 << ssid)) {
3259 enable &= ~(1 << ssid);
3260 continue;
3261 }
3262
3263 if (!(cgroup_control(cgrp) & (1 << ssid))) {
3264 ret = -ENOENT;
3265 goto out_unlock;
3266 }
3267 } else if (disable & (1 << ssid)) {
3268 if (!(cgrp->subtree_control & (1 << ssid))) {
3269 disable &= ~(1 << ssid);
3270 continue;
3271 }
3272
3273
3274 cgroup_for_each_live_child(child, cgrp) {
3275 if (child->subtree_control & (1 << ssid)) {
3276 ret = -EBUSY;
3277 goto out_unlock;
3278 }
3279 }
3280 }
3281 }
3282
3283 if (!enable && !disable) {
3284 ret = 0;
3285 goto out_unlock;
3286 }
3287
3288 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3289 if (ret)
3290 goto out_unlock;
3291
3292
3293 cgroup_save_control(cgrp);
3294
3295 cgrp->subtree_control |= enable;
3296 cgrp->subtree_control &= ~disable;
3297
3298 ret = cgroup_apply_control(cgrp);
3299 cgroup_finalize_control(cgrp, ret);
3300 if (ret)
3301 goto out_unlock;
3302
3303 kernfs_activate(cgrp->kn);
3304out_unlock:
3305 cgroup_kn_unlock(of->kn);
3306 return ret ?: nbytes;
3307}
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
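/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to the cgroup.type interface file and
 * tries to make @cgrp threaded and join the parent's resource domain.
 * This function is never called on the root cgroup as cgroup.type doesn't
 * exist on it.
 */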
3318static int cgroup_enable_threaded(struct cgroup *cgrp)
3319{
3320 struct cgroup *parent = cgroup_parent(cgrp);
3321 struct cgroup *dom_cgrp = parent->dom_cgrp;
3322 struct cgroup *dsct;
3323 struct cgroup_subsys_state *d_css;
3324 int ret;
3325
3326 lockdep_assert_held(&cgroup_mutex);
3327
3328
3329 if (cgroup_is_threaded(cgrp))
3330 return 0;
3331
3332
3333
3334
3335
3336
3337
3338 if (cgroup_is_populated(cgrp) ||
3339 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3340 return -EOPNOTSUPP;
3341
3342
3343 if (!cgroup_is_valid_domain(dom_cgrp) ||
3344 !cgroup_can_be_thread_root(dom_cgrp))
3345 return -EOPNOTSUPP;
3346
3347
3348
3349
3350
3351 cgroup_save_control(cgrp);
3352
3353 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3354 if (dsct == cgrp || cgroup_is_threaded(dsct))
3355 dsct->dom_cgrp = dom_cgrp;
3356
3357 ret = cgroup_apply_control(cgrp);
3358 if (!ret)
3359 parent->nr_threaded_children++;
3360
3361 cgroup_finalize_control(cgrp, ret);
3362 return ret;
3363}
3364
3365static int cgroup_type_show(struct seq_file *seq, void *v)
3366{
3367 struct cgroup *cgrp = seq_css(seq)->cgroup;
3368
3369 if (cgroup_is_threaded(cgrp))
3370 seq_puts(seq, "threaded\n");
3371 else if (!cgroup_is_valid_domain(cgrp))
3372 seq_puts(seq, "domain invalid\n");
3373 else if (cgroup_is_thread_root(cgrp))
3374 seq_puts(seq, "domain threaded\n");
3375 else
3376 seq_puts(seq, "domain\n");
3377
3378 return 0;
3379}
3380
3381static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3382 size_t nbytes, loff_t off)
3383{
3384 struct cgroup *cgrp;
3385 int ret;
3386
3387
3388 if (strcmp(strstrip(buf), "threaded"))
3389 return -EINVAL;
3390
3391
3392 cgrp = cgroup_kn_lock_live(of->kn, true);
3393 if (!cgrp)
3394 return -ENOENT;
3395
3396
3397 ret = cgroup_enable_threaded(cgrp);
3398
3399 cgroup_kn_unlock(of->kn);
3400 return ret ?: nbytes;
3401}
3402
3403static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3404{
3405 struct cgroup *cgrp = seq_css(seq)->cgroup;
3406 int descendants = READ_ONCE(cgrp->max_descendants);
3407
3408 if (descendants == INT_MAX)
3409 seq_puts(seq, "max\n");
3410 else
3411 seq_printf(seq, "%d\n", descendants);
3412
3413 return 0;
3414}
3415
3416static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3417 char *buf, size_t nbytes, loff_t off)
3418{
3419 struct cgroup *cgrp;
3420 int descendants;
3421 ssize_t ret;
3422
3423 buf = strstrip(buf);
3424 if (!strcmp(buf, "max")) {
3425 descendants = INT_MAX;
3426 } else {
3427 ret = kstrtoint(buf, 0, &descendants);
3428 if (ret)
3429 return ret;
3430 }
3431
3432 if (descendants < 0)
3433 return -ERANGE;
3434
3435 cgrp = cgroup_kn_lock_live(of->kn, false);
3436 if (!cgrp)
3437 return -ENOENT;
3438
3439 cgrp->max_descendants = descendants;
3440
3441 cgroup_kn_unlock(of->kn);
3442
3443 return nbytes;
3444}
3445
3446static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3447{
3448 struct cgroup *cgrp = seq_css(seq)->cgroup;
3449 int depth = READ_ONCE(cgrp->max_depth);
3450
3451 if (depth == INT_MAX)
3452 seq_puts(seq, "max\n");
3453 else
3454 seq_printf(seq, "%d\n", depth);
3455
3456 return 0;
3457}
3458
3459static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3460 char *buf, size_t nbytes, loff_t off)
3461{
3462 struct cgroup *cgrp;
3463 ssize_t ret;
3464 int depth;
3465
3466 buf = strstrip(buf);
3467 if (!strcmp(buf, "max")) {
3468 depth = INT_MAX;
3469 } else {
3470 ret = kstrtoint(buf, 0, &depth);
3471 if (ret)
3472 return ret;
3473 }
3474
3475 if (depth < 0)
3476 return -ERANGE;
3477
3478 cgrp = cgroup_kn_lock_live(of->kn, false);
3479 if (!cgrp)
3480 return -ENOENT;
3481
3482 cgrp->max_depth = depth;
3483
3484 cgroup_kn_unlock(of->kn);
3485
3486 return nbytes;
3487}
3488
3489static int cgroup_events_show(struct seq_file *seq, void *v)
3490{
3491 struct cgroup *cgrp = seq_css(seq)->cgroup;
3492
3493 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3494 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3495
3496 return 0;
3497}
3498
3499static int cgroup_stat_show(struct seq_file *seq, void *v)
3500{
3501 struct cgroup *cgroup = seq_css(seq)->cgroup;
3502
3503 seq_printf(seq, "nr_descendants %d\n",
3504 cgroup->nr_descendants);
3505 seq_printf(seq, "nr_dying_descendants %d\n",
3506 cgroup->nr_dying_descendants);
3507
3508 return 0;
3509}
3510
3511static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3512 struct cgroup *cgrp, int ssid)
3513{
3514 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3515 struct cgroup_subsys_state *css;
3516 int ret;
3517
3518 if (!ss->css_extra_stat_show)
3519 return 0;
3520
3521 css = cgroup_tryget_css(cgrp, ss);
3522 if (!css)
3523 return 0;
3524
3525 ret = ss->css_extra_stat_show(seq, css);
3526 css_put(css);
3527 return ret;
3528}
3529
3530static int cpu_stat_show(struct seq_file *seq, void *v)
3531{
3532 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3533 int ret = 0;
3534
3535 cgroup_base_stat_cputime_show(seq);
3536#ifdef CONFIG_CGROUP_SCHED
3537 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3538#endif
3539 return ret;
3540}
3541
3542#ifdef CONFIG_PSI
3543static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3544{
3545 struct cgroup *cgrp = seq_css(seq)->cgroup;
3546 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3547
3548 return psi_show(seq, psi, PSI_IO);
3549}
3550static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3551{
3552 struct cgroup *cgrp = seq_css(seq)->cgroup;
3553 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3554
3555 return psi_show(seq, psi, PSI_MEM);
3556}
3557static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3558{
3559 struct cgroup *cgrp = seq_css(seq)->cgroup;
3560 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3561
3562 return psi_show(seq, psi, PSI_CPU);
3563}
3564
3565static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3566 size_t nbytes, enum psi_res res)
3567{
3568 struct psi_trigger *new;
3569 struct cgroup *cgrp;
3570
3571 cgrp = cgroup_kn_lock_live(of->kn, false);
3572 if (!cgrp)
3573 return -ENODEV;
3574
3575 cgroup_get(cgrp);
3576 cgroup_kn_unlock(of->kn);
3577
3578 new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3579 if (IS_ERR(new)) {
3580 cgroup_put(cgrp);
3581 return PTR_ERR(new);
3582 }
3583
3584 psi_trigger_replace(&of->priv, new);
3585
3586 cgroup_put(cgrp);
3587
3588 return nbytes;
3589}
3590
3591static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3592 char *buf, size_t nbytes,
3593 loff_t off)
3594{
3595 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3596}
3597
3598static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3599 char *buf, size_t nbytes,
3600 loff_t off)
3601{
3602 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3603}
3604
3605static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3606 char *buf, size_t nbytes,
3607 loff_t off)
3608{
3609 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3610}
3611
3612static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3613 poll_table *pt)
3614{
3615 return psi_trigger_poll(&of->priv, of->file, pt);
3616}
3617
3618static void cgroup_pressure_release(struct kernfs_open_file *of)
3619{
3620 psi_trigger_replace(&of->priv, NULL);
3621}
3622#endif
3623
3624static int cgroup_freeze_show(struct seq_file *seq, void *v)
3625{
3626 struct cgroup *cgrp = seq_css(seq)->cgroup;
3627
3628 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3629
3630 return 0;
3631}
3632
3633static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3634 char *buf, size_t nbytes, loff_t off)
3635{
3636 struct cgroup *cgrp;
3637 ssize_t ret;
3638 int freeze;
3639
3640 ret = kstrtoint(strstrip(buf), 0, &freeze);
3641 if (ret)
3642 return ret;
3643
3644 if (freeze < 0 || freeze > 1)
3645 return -ERANGE;
3646
3647 cgrp = cgroup_kn_lock_live(of->kn, false);
3648 if (!cgrp)
3649 return -ENOENT;
3650
3651 cgroup_freeze(cgrp, freeze);
3652
3653 cgroup_kn_unlock(of->kn);
3654
3655 return nbytes;
3656}
3657
3658static int cgroup_file_open(struct kernfs_open_file *of)
3659{
3660 struct cftype *cft = of->kn->priv;
3661
3662 if (cft->open)
3663 return cft->open(of);
3664 return 0;
3665}
3666
3667static void cgroup_file_release(struct kernfs_open_file *of)
3668{
3669 struct cftype *cft = of->kn->priv;
3670
3671 if (cft->release)
3672 cft->release(of);
3673}
3674
3675static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3676 size_t nbytes, loff_t off)
3677{
3678 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3679 struct cgroup *cgrp = of->kn->parent->priv;
3680 struct cftype *cft = of->kn->priv;
3681 struct cgroup_subsys_state *css;
3682 int ret;
3683
3684 if (!nbytes)
3685 return 0;
3686
3687
3688
3689
3690
3691
3692
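	/*
	 * If namespaces are delegation boundaries, disallow writes to
	 * files in a non-init namespace root from inside the namespace
	 * except for the files explicitly marked delegatable -
	 * "cgroup.procs" and "cgroup.subtree_control".
	 */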
3693 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3694 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3695 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3696 return -EPERM;
3697
3698 if (cft->write)
3699 return cft->write(of, buf, nbytes, off);
3700
3701
3702
3703
3704
3705
3706
3707 rcu_read_lock();
3708 css = cgroup_css(cgrp, cft->ss);
3709 rcu_read_unlock();
3710
3711 if (cft->write_u64) {
3712 unsigned long long v;
3713 ret = kstrtoull(buf, 0, &v);
3714 if (!ret)
3715 ret = cft->write_u64(css, cft, v);
3716 } else if (cft->write_s64) {
3717 long long v;
3718 ret = kstrtoll(buf, 0, &v);
3719 if (!ret)
3720 ret = cft->write_s64(css, cft, v);
3721 } else {
3722 ret = -EINVAL;
3723 }
3724
3725 return ret ?: nbytes;
3726}
3727
3728static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3729{
3730 struct cftype *cft = of->kn->priv;
3731
3732 if (cft->poll)
3733 return cft->poll(of, pt);
3734
3735 return kernfs_generic_poll(of, pt);
3736}
3737
3738static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3739{
3740 return seq_cft(seq)->seq_start(seq, ppos);
3741}
3742
3743static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3744{
3745 return seq_cft(seq)->seq_next(seq, v, ppos);
3746}
3747
3748static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3749{
3750 if (seq_cft(seq)->seq_stop)
3751 seq_cft(seq)->seq_stop(seq, v);
3752}
3753
3754static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3755{
3756 struct cftype *cft = seq_cft(m);
3757 struct cgroup_subsys_state *css = seq_css(m);
3758
3759 if (cft->seq_show)
3760 return cft->seq_show(m, arg);
3761
3762 if (cft->read_u64)
3763 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3764 else if (cft->read_s64)
3765 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3766 else
3767 return -EINVAL;
3768 return 0;
3769}
3770
3771static struct kernfs_ops cgroup_kf_single_ops = {
3772 .atomic_write_len = PAGE_SIZE,
3773 .open = cgroup_file_open,
3774 .release = cgroup_file_release,
3775 .write = cgroup_file_write,
3776 .poll = cgroup_file_poll,
3777 .seq_show = cgroup_seqfile_show,
3778};
3779
3780static struct kernfs_ops cgroup_kf_ops = {
3781 .atomic_write_len = PAGE_SIZE,
3782 .open = cgroup_file_open,
3783 .release = cgroup_file_release,
3784 .write = cgroup_file_write,
3785 .poll = cgroup_file_poll,
3786 .seq_start = cgroup_seqfile_start,
3787 .seq_next = cgroup_seqfile_next,
3788 .seq_stop = cgroup_seqfile_stop,
3789 .seq_show = cgroup_seqfile_show,
3790};
3791
3792
3793static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3794{
3795 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3796 .ia_uid = current_fsuid(),
3797 .ia_gid = current_fsgid(), };
3798
3799 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3800 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3801 return 0;
3802
3803 return kernfs_setattr(kn, &iattr);
3804}
3805
3806static void cgroup_file_notify_timer(struct timer_list *timer)
3807{
3808 cgroup_file_notify(container_of(timer, struct cgroup_file,
3809 notify_timer));
3810}
3811
3812static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3813 struct cftype *cft)
3814{
3815 char name[CGROUP_FILE_NAME_MAX];
3816 struct kernfs_node *kn;
3817 struct lock_class_key *key = NULL;
3818 int ret;
3819
3820#ifdef CONFIG_DEBUG_LOCK_ALLOC
3821 key = &cft->lockdep_key;
3822#endif
3823 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3824 cgroup_file_mode(cft),
3825 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
3826 0, cft->kf_ops, cft,
3827 NULL, key);
3828 if (IS_ERR(kn))
3829 return PTR_ERR(kn);
3830
3831 ret = cgroup_kn_set_ugid(kn);
3832 if (ret) {
3833 kernfs_remove(kn);
3834 return ret;
3835 }
3836
3837 if (cft->file_offset) {
3838 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3839
3840 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3841
3842 spin_lock_irq(&cgroup_file_kn_lock);
3843 cfile->kn = kn;
3844 spin_unlock_irq(&cgroup_file_kn_lock);
3845 }
3846
3847 return 0;
3848}
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
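/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */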
3860static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3861 struct cgroup *cgrp, struct cftype cfts[],
3862 bool is_add)
3863{
3864 struct cftype *cft, *cft_end = NULL;
3865 int ret = 0;
3866
3867 lockdep_assert_held(&cgroup_mutex);
3868
3869restart:
3870 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3871
3872 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3873 continue;
3874 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3875 continue;
3876 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3877 continue;
3878 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3879 continue;
3880 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
3881 continue;
3882 if (is_add) {
3883 ret = cgroup_add_file(css, cgrp, cft);
3884 if (ret) {
3885 pr_warn("%s: failed to add %s, err=%d\n",
3886 __func__, cft->name, ret);
3887 cft_end = cft;
3888 is_add = false;
3889 goto restart;
3890 }
3891 } else {
3892 cgroup_rm_file(cgrp, cft);
3893 }
3894 }
3895 return ret;
3896}
3897
3898static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3899{
3900 struct cgroup_subsys *ss = cfts[0].ss;
3901 struct cgroup *root = &ss->root->cgrp;
3902 struct cgroup_subsys_state *css;
3903 int ret = 0;
3904
3905 lockdep_assert_held(&cgroup_mutex);
3906
3907
3908 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3909 struct cgroup *cgrp = css->cgroup;
3910
3911 if (!(css->flags & CSS_VISIBLE))
3912 continue;
3913
3914 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3915 if (ret)
3916 break;
3917 }
3918
3919 if (is_add && !ret)
3920 kernfs_activate(root->kn);
3921 return ret;
3922}
3923
3924static void cgroup_exit_cftypes(struct cftype *cfts)
3925{
3926 struct cftype *cft;
3927
3928 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3929
3930 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3931 kfree(cft->kf_ops);
3932 cft->kf_ops = NULL;
3933 cft->ss = NULL;
3934
3935
3936 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3937 }
3938}
3939
3940static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3941{
3942 struct cftype *cft;
3943
3944 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3945 struct kernfs_ops *kf_ops;
3946
3947 WARN_ON(cft->ss || cft->kf_ops);
3948
3949 if (cft->seq_start)
3950 kf_ops = &cgroup_kf_ops;
3951 else
3952 kf_ops = &cgroup_kf_single_ops;
3953
3954
3955
3956
3957
3958 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3959 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3960 if (!kf_ops) {
3961 cgroup_exit_cftypes(cfts);
3962 return -ENOMEM;
3963 }
3964 kf_ops->atomic_write_len = cft->max_write_len;
3965 }
3966
3967 cft->kf_ops = kf_ops;
3968 cft->ss = ss;
3969 }
3970
3971 return 0;
3972}
3973
3974static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3975{
3976 lockdep_assert_held(&cgroup_mutex);
3977
3978 if (!cfts || !cfts[0].ss)
3979 return -ENOENT;
3980
3981 list_del(&cfts->node);
3982 cgroup_apply_cftypes(cfts, false);
3983 cgroup_exit_cftypes(cfts);
3984 return 0;
3985}
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
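/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */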
3998int cgroup_rm_cftypes(struct cftype *cfts)
3999{
4000 int ret;
4001
4002 mutex_lock(&cgroup_mutex);
4003 ret = cgroup_rm_cftypes_locked(cfts);
4004 mutex_unlock(&cgroup_mutex);
4005 return ret;
4006}
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
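/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */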
4022static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4023{
4024 int ret;
4025
4026 if (!cgroup_ssid_enabled(ss->id))
4027 return 0;
4028
4029 if (!cfts || cfts[0].name[0] == '\0')
4030 return 0;
4031
4032 ret = cgroup_init_cftypes(ss, cfts);
4033 if (ret)
4034 return ret;
4035
4036 mutex_lock(&cgroup_mutex);
4037
4038 list_add_tail(&cfts->node, &ss->cfts);
4039 ret = cgroup_apply_cftypes(cfts, true);
4040 if (ret)
4041 cgroup_rm_cftypes_locked(cfts);
4042
4043 mutex_unlock(&cgroup_mutex);
4044 return ret;
4045}
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4056{
4057 struct cftype *cft;
4058
4059 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4060 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4061 return cgroup_add_cftypes(ss, cfts);
4062}
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4073{
4074 struct cftype *cft;
4075
4076 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4077 cft->flags |= __CFTYPE_NOT_ON_DFL;
4078 return cgroup_add_cftypes(ss, cfts);
4079}
4080
4081
4082
4083
4084
4085
4086
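/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.  Events
 * arriving within CGROUP_FILE_NOTIFY_MIN_INTV of the previous one are
 * deferred through @cfile's notify_timer to rate-limit kernfs
 * notifications.
 */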
4087void cgroup_file_notify(struct cgroup_file *cfile)
4088{
4089 unsigned long flags;
4090
4091 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4092 if (cfile->kn) {
4093 unsigned long last = cfile->notified_at;
4094 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4095
4096 if (time_in_range(jiffies, last, next)) {
4097 timer_reduce(&cfile->notify_timer, next);
4098 } else {
4099 kernfs_notify(cfile->kn);
4100 cfile->notified_at = jiffies;
4101 }
4102 }
4103 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4104}
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
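/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 */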
4123struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4124 struct cgroup_subsys_state *parent)
4125{
4126 struct cgroup_subsys_state *next;
4127
4128 cgroup_assert_mutex_or_rcu_locked();
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
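	/*
	 * @pos could already have been unlinked from the sibling list.
	 * Once a cgroup is removed, its ->sibling.next is no longer
	 * updated when its next sibling changes.  CSS_RELEASED is set when
	 * @pos is taken off list, at which time its next pointer is valid,
	 * and, as releases are serialized, the one pointed to by the next
	 * pointer is guaranteed to not have started release yet.  This
	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
	 * critical section, the one pointed to by its next pointer is
	 * guaranteed to not have finished its RCU grace period even if we
	 * have dropped rcu_read_lock() in-between iterations.
	 *
	 * If @pos has CSS_RELEASED set, its next pointer can't be
	 * dereferenced; however, as each css is given a monotonically
	 * increasing unique serial number and always appended to the
	 * sibling list, the next one can be found by walking the parent's
	 * children until the first css with higher serial number than
	 * @pos's.
	 */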
4150 if (!pos) {
4151 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4152 } else if (likely(!(pos->flags & CSS_RELEASED))) {
4153 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
4154 } else {
4155 list_for_each_entry_rcu(next, &parent->children, sibling,
4156 lockdep_is_held(&cgroup_mutex))
4157 if (next->serial_nr > pos->serial_nr)
4158 break;
4159 }
4160
4161
4162
4163
4164
4165 if (&next->sibling != &parent->children)
4166 return next;
4167 return NULL;
4168}
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
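/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 */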
4191struct cgroup_subsys_state *
4192css_next_descendant_pre(struct cgroup_subsys_state *pos,
4193 struct cgroup_subsys_state *root)
4194{
4195 struct cgroup_subsys_state *next;
4196
4197 cgroup_assert_mutex_or_rcu_locked();
4198
4199
4200 if (!pos)
4201 return root;
4202
4203
4204 next = css_next_child(NULL, pos);
4205 if (next)
4206 return next;
4207
4208
4209 while (pos != root) {
4210 next = css_next_child(pos, pos->parent);
4211 if (next)
4212 return next;
4213 pos = pos->parent;
4214 }
4215
4216 return NULL;
4217}
4218EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233struct cgroup_subsys_state *
4234css_rightmost_descendant(struct cgroup_subsys_state *pos)
4235{
4236 struct cgroup_subsys_state *last, *tmp;
4237
4238 cgroup_assert_mutex_or_rcu_locked();
4239
4240 do {
4241 last = pos;
4242
4243 pos = NULL;
4244 css_for_each_child(tmp, last)
4245 pos = tmp;
4246 } while (pos);
4247
4248 return last;
4249}
4250
4251static struct cgroup_subsys_state *
4252css_leftmost_descendant(struct cgroup_subsys_state *pos)
4253{
4254 struct cgroup_subsys_state *last;
4255
4256 do {
4257 last = pos;
4258 pos = css_next_child(NULL, pos);
4259 } while (pos);
4260
4261 return last;
4262}
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
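/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 */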
4286struct cgroup_subsys_state *
4287css_next_descendant_post(struct cgroup_subsys_state *pos,
4288 struct cgroup_subsys_state *root)
4289{
4290 struct cgroup_subsys_state *next;
4291
4292 cgroup_assert_mutex_or_rcu_locked();
4293
4294
4295 if (!pos)
4296 return css_leftmost_descendant(root);
4297
4298
4299 if (pos == root)
4300 return NULL;
4301
4302
4303 next = css_next_child(pos, pos->parent);
4304 if (next)
4305 return css_leftmost_descendant(next);
4306
4307
4308 return pos->parent;
4309}
4310
4311
4312
4313
4314
4315
4316
4317
4318
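/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */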
4319bool css_has_online_children(struct cgroup_subsys_state *css)
4320{
4321 struct cgroup_subsys_state *child;
4322 bool ret = false;
4323
4324 rcu_read_lock();
4325 css_for_each_child(child, css) {
4326 if (child->flags & CSS_ONLINE) {
4327 ret = true;
4328 break;
4329 }
4330 }
4331 rcu_read_unlock();
4332 return ret;
4333}
4334
4335static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4336{
4337 struct list_head *l;
4338 struct cgrp_cset_link *link;
4339 struct css_set *cset;
4340
4341 lockdep_assert_held(&css_set_lock);
4342
4343
4344 if (it->tcset_pos) {
4345 l = it->tcset_pos->next;
4346
4347 if (l != it->tcset_head) {
4348 it->tcset_pos = l;
4349 return container_of(l, struct css_set,
4350 threaded_csets_node);
4351 }
4352
4353 it->tcset_pos = NULL;
4354 }
4355
4356
4357 l = it->cset_pos;
4358 l = l->next;
4359 if (l == it->cset_head) {
4360 it->cset_pos = NULL;
4361 return NULL;
4362 }
4363
4364 if (it->ss) {
4365 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4366 } else {
4367 link = list_entry(l, struct cgrp_cset_link, cset_link);
4368 cset = link->cset;
4369 }
4370
4371 it->cset_pos = l;
4372
4373
4374 if (it->flags & CSS_TASK_ITER_THREADED) {
4375 if (it->cur_dcset)
4376 put_css_set_locked(it->cur_dcset);
4377 it->cur_dcset = cset;
4378 get_css_set(cset);
4379
4380 it->tcset_head = &cset->threaded_csets;
4381 it->tcset_pos = &cset->threaded_csets;
4382 }
4383
4384 return cset;
4385}
4386
4387
4388
4389
4390
4391
4392
4393static void css_task_iter_advance_css_set(struct css_task_iter *it)
4394{
4395 struct css_set *cset;
4396
4397 lockdep_assert_held(&css_set_lock);
4398
4399
4400 while ((cset = css_task_iter_next_css_set(it))) {
4401 if (!list_empty(&cset->tasks)) {
4402 it->cur_tasks_head = &cset->tasks;
4403 break;
4404 } else if (!list_empty(&cset->mg_tasks)) {
4405 it->cur_tasks_head = &cset->mg_tasks;
4406 break;
4407 } else if (!list_empty(&cset->dying_tasks)) {
4408 it->cur_tasks_head = &cset->dying_tasks;
4409 break;
4410 }
4411 }
4412 if (!cset) {
4413 it->task_pos = NULL;
4414 return;
4415 }
4416 it->task_pos = it->cur_tasks_head->next;
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433 if (it->cur_cset) {
4434 list_del(&it->iters_node);
4435 put_css_set_locked(it->cur_cset);
4436 }
4437 get_css_set(cset);
4438 it->cur_cset = cset;
4439 list_add(&it->iters_node, &cset->task_iters);
4440}
4441
4442static void css_task_iter_skip(struct css_task_iter *it,
4443 struct task_struct *task)
4444{
4445 lockdep_assert_held(&css_set_lock);
4446
4447 if (it->task_pos == &task->cg_list) {
4448 it->task_pos = it->task_pos->next;
4449 it->flags |= CSS_TASK_ITER_SKIPPED;
4450 }
4451}
4452
4453static void css_task_iter_advance(struct css_task_iter *it)
4454{
4455 struct task_struct *task;
4456
4457 lockdep_assert_held(&css_set_lock);
4458repeat:
4459 if (it->task_pos) {
4460
4461
4462
4463
4464
4465 if (it->flags & CSS_TASK_ITER_SKIPPED)
4466 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4467 else
4468 it->task_pos = it->task_pos->next;
4469
4470 if (it->task_pos == &it->cur_cset->tasks) {
4471 it->cur_tasks_head = &it->cur_cset->mg_tasks;
4472 it->task_pos = it->cur_tasks_head->next;
4473 }
4474 if (it->task_pos == &it->cur_cset->mg_tasks) {
4475 it->cur_tasks_head = &it->cur_cset->dying_tasks;
4476 it->task_pos = it->cur_tasks_head->next;
4477 }
4478 if (it->task_pos == &it->cur_cset->dying_tasks)
4479 css_task_iter_advance_css_set(it);
4480 } else {
4481
4482 css_task_iter_advance_css_set(it);
4483 }
4484
4485 if (!it->task_pos)
4486 return;
4487
4488 task = list_entry(it->task_pos, struct task_struct, cg_list);
4489
4490 if (it->flags & CSS_TASK_ITER_PROCS) {
4491
4492 if (!thread_group_leader(task))
4493 goto repeat;
4494
4495
4496 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
4497 !atomic_read(&task->signal->live))
4498 goto repeat;
4499 } else {
4500
4501 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
4502 goto repeat;
4503 }
4504}
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
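/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 */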
4517void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4518 struct css_task_iter *it)
4519{
4520 memset(it, 0, sizeof(*it));
4521
4522 spin_lock_irq(&css_set_lock);
4523
4524 it->ss = css->ss;
4525 it->flags = flags;
4526
4527 if (it->ss)
4528 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4529 else
4530 it->cset_pos = &css->cgroup->cset_links;
4531
4532 it->cset_head = it->cset_pos;
4533
4534 css_task_iter_advance(it);
4535
4536 spin_unlock_irq(&css_set_lock);
4537}
4538
4539
4540
4541
4542
4543
4544
4545
4546
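/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */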
4547struct task_struct *css_task_iter_next(struct css_task_iter *it)
4548{
4549 if (it->cur_task) {
4550 put_task_struct(it->cur_task);
4551 it->cur_task = NULL;
4552 }
4553
4554 spin_lock_irq(&css_set_lock);
4555
4556
4557 if (it->flags & CSS_TASK_ITER_SKIPPED)
4558 css_task_iter_advance(it);
4559
4560 if (it->task_pos) {
4561 it->cur_task = list_entry(it->task_pos, struct task_struct,
4562 cg_list);
4563 get_task_struct(it->cur_task);
4564 css_task_iter_advance(it);
4565 }
4566
4567 spin_unlock_irq(&css_set_lock);
4568
4569 return it->cur_task;
4570}
4571
4572
4573
4574
4575
4576
4577
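/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */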
4578void css_task_iter_end(struct css_task_iter *it)
4579{
4580 if (it->cur_cset) {
4581 spin_lock_irq(&css_set_lock);
4582 list_del(&it->iters_node);
4583 put_css_set_locked(it->cur_cset);
4584 spin_unlock_irq(&css_set_lock);
4585 }
4586
4587 if (it->cur_dcset)
4588 put_css_set(it->cur_dcset);
4589
4590 if (it->cur_task)
4591 put_task_struct(it->cur_task);
4592}
4593
4594static void cgroup_procs_release(struct kernfs_open_file *of)
4595{
4596 if (of->priv) {
4597 css_task_iter_end(of->priv);
4598 kfree(of->priv);
4599 }
4600}
4601
4602static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4603{
4604 struct kernfs_open_file *of = s->private;
4605 struct css_task_iter *it = of->priv;
4606
4607 if (pos)
4608 (*pos)++;
4609
4610 return css_task_iter_next(it);
4611}
4612
4613static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4614 unsigned int iter_flags)
4615{
4616 struct kernfs_open_file *of = s->private;
4617 struct cgroup *cgrp = seq_css(s)->cgroup;
4618 struct css_task_iter *it = of->priv;
4619
4620
4621
4622
4623
4624 if (!it) {
4625 if (WARN_ON_ONCE((*pos)))
4626 return ERR_PTR(-EINVAL);
4627
4628 it = kzalloc(sizeof(*it), GFP_KERNEL);
4629 if (!it)
4630 return ERR_PTR(-ENOMEM);
4631 of->priv = it;
4632 css_task_iter_start(&cgrp->self, iter_flags, it);
4633 } else if (!(*pos)) {
4634 css_task_iter_end(it);
4635 css_task_iter_start(&cgrp->self, iter_flags, it);
4636 } else
4637 return it->cur_task;
4638
4639 return cgroup_procs_next(s, NULL, NULL);
4640}
4641
4642static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4643{
4644 struct cgroup *cgrp = seq_css(s)->cgroup;
4645
4646
4647
4648
4649
4650
4651
4652 if (cgroup_is_threaded(cgrp))
4653 return ERR_PTR(-EOPNOTSUPP);
4654
4655 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4656 CSS_TASK_ITER_THREADED);
4657}
4658
4659static int cgroup_procs_show(struct seq_file *s, void *v)
4660{
4661 seq_printf(s, "%d\n", task_pid_vnr(v));
4662 return 0;
4663}
4664
4665static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4666{
4667 int ret;
4668 struct inode *inode;
4669
4670 lockdep_assert_held(&cgroup_mutex);
4671
4672 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4673 if (!inode)
4674 return -ENOMEM;
4675
4676 ret = inode_permission(inode, MAY_WRITE);
4677 iput(inode);
4678 return ret;
4679}
4680
4681static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4682 struct cgroup *dst_cgrp,
4683 struct super_block *sb)
4684{
4685 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4686 struct cgroup *com_cgrp = src_cgrp;
4687 int ret;
4688
4689 lockdep_assert_held(&cgroup_mutex);
4690
4691
4692 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4693 com_cgrp = cgroup_parent(com_cgrp);
4694
4695
4696 ret = cgroup_may_write(com_cgrp, sb);
4697 if (ret)
4698 return ret;
4699
4700
4701
4702
4703
4704 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4705 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4706 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4707 return -ENOENT;
4708
4709 return 0;
4710}
4711
4712static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4713 struct cgroup *dst_cgrp,
4714 struct super_block *sb, bool threadgroup)
4715{
4716 int ret = 0;
4717
4718 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
4719 if (ret)
4720 return ret;
4721
4722 ret = cgroup_migrate_vet_dst(dst_cgrp);
4723 if (ret)
4724 return ret;
4725
4726 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4727 ret = -EOPNOTSUPP;
4728
4729 return ret;
4730}
4731
4732static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4733 char *buf, size_t nbytes, loff_t off)
4734{
4735 struct cgroup *src_cgrp, *dst_cgrp;
4736 struct task_struct *task;
4737 ssize_t ret;
4738 bool locked;
4739
4740 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4741 if (!dst_cgrp)
4742 return -ENODEV;
4743
4744 task = cgroup_procs_write_start(buf, true, &locked);
4745 ret = PTR_ERR_OR_ZERO(task);
4746 if (ret)
4747 goto out_unlock;
4748
4749
4750 spin_lock_irq(&css_set_lock);
4751 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4752 spin_unlock_irq(&css_set_lock);
4753
4754 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4755 of->file->f_path.dentry->d_sb, true);
4756 if (ret)
4757 goto out_finish;
4758
4759 ret = cgroup_attach_task(dst_cgrp, task, true);
4760
4761out_finish:
4762 cgroup_procs_write_finish(task, locked);
4763out_unlock:
4764 cgroup_kn_unlock(of->kn);
4765
4766 return ret ?: nbytes;
4767}
4768
4769static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4770{
4771 return __cgroup_procs_start(s, pos, 0);
4772}
4773
4774static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4775 char *buf, size_t nbytes, loff_t off)
4776{
4777 struct cgroup *src_cgrp, *dst_cgrp;
4778 struct task_struct *task;
4779 ssize_t ret;
4780 bool locked;
4781
4782 buf = strstrip(buf);
4783
4784 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4785 if (!dst_cgrp)
4786 return -ENODEV;
4787
4788 task = cgroup_procs_write_start(buf, false, &locked);
4789 ret = PTR_ERR_OR_ZERO(task);
4790 if (ret)
4791 goto out_unlock;
4792
4793
4794 spin_lock_irq(&css_set_lock);
4795 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4796 spin_unlock_irq(&css_set_lock);
4797
4798
4799 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4800 of->file->f_path.dentry->d_sb, false);
4801 if (ret)
4802 goto out_finish;
4803
4804 ret = cgroup_attach_task(dst_cgrp, task, false);
4805
4806out_finish:
4807 cgroup_procs_write_finish(task, locked);
4808out_unlock:
4809 cgroup_kn_unlock(of->kn);
4810
4811 return ret ?: nbytes;
4812}
4813
4814
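/* cgroup core interface files for the default hierarchy */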
4815static struct cftype cgroup_base_files[] = {
4816 {
4817 .name = "cgroup.type",
4818 .flags = CFTYPE_NOT_ON_ROOT,
4819 .seq_show = cgroup_type_show,
4820 .write = cgroup_type_write,
4821 },
4822 {
4823 .name = "cgroup.procs",
4824 .flags = CFTYPE_NS_DELEGATABLE,
4825 .file_offset = offsetof(struct cgroup, procs_file),
4826 .release = cgroup_procs_release,
4827 .seq_start = cgroup_procs_start,
4828 .seq_next = cgroup_procs_next,
4829 .seq_show = cgroup_procs_show,
4830 .write = cgroup_procs_write,
4831 },
4832 {
4833 .name = "cgroup.threads",
4834 .flags = CFTYPE_NS_DELEGATABLE,
4835 .release = cgroup_procs_release,
4836 .seq_start = cgroup_threads_start,
4837 .seq_next = cgroup_procs_next,
4838 .seq_show = cgroup_procs_show,
4839 .write = cgroup_threads_write,
4840 },
4841 {
4842 .name = "cgroup.controllers",
4843 .seq_show = cgroup_controllers_show,
4844 },
4845 {
4846 .name = "cgroup.subtree_control",
4847 .flags = CFTYPE_NS_DELEGATABLE,
4848 .seq_show = cgroup_subtree_control_show,
4849 .write = cgroup_subtree_control_write,
4850 },
4851 {
4852 .name = "cgroup.events",
4853 .flags = CFTYPE_NOT_ON_ROOT,
4854 .file_offset = offsetof(struct cgroup, events_file),
4855 .seq_show = cgroup_events_show,
4856 },
4857 {
4858 .name = "cgroup.max.descendants",
4859 .seq_show = cgroup_max_descendants_show,
4860 .write = cgroup_max_descendants_write,
4861 },
4862 {
4863 .name = "cgroup.max.depth",
4864 .seq_show = cgroup_max_depth_show,
4865 .write = cgroup_max_depth_write,
4866 },
4867 {
4868 .name = "cgroup.stat",
4869 .seq_show = cgroup_stat_show,
4870 },
4871 {
4872 .name = "cgroup.freeze",
4873 .flags = CFTYPE_NOT_ON_ROOT,
4874 .seq_show = cgroup_freeze_show,
4875 .write = cgroup_freeze_write,
4876 },
4877 {
4878 .name = "cpu.stat",
4879 .seq_show = cpu_stat_show,
4880 },
4881#ifdef CONFIG_PSI
4882 {
4883 .name = "io.pressure",
4884 .seq_show = cgroup_io_pressure_show,
4885 .write = cgroup_io_pressure_write,
4886 .poll = cgroup_pressure_poll,
4887 .release = cgroup_pressure_release,
4888 },
4889 {
4890 .name = "memory.pressure",
4891 .seq_show = cgroup_memory_pressure_show,
4892 .write = cgroup_memory_pressure_write,
4893 .poll = cgroup_pressure_poll,
4894 .release = cgroup_pressure_release,
4895 },
4896 {
4897 .name = "cpu.pressure",
4898 .seq_show = cgroup_cpu_pressure_show,
4899 .write = cgroup_cpu_pressure_write,
4900 .poll = cgroup_pressure_poll,
4901 .release = cgroup_pressure_release,
4902 },
4903#endif
4904	{ }	/* terminate */
4905};
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
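/*
 * css destruction is a four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_rwork_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex sequence.
 */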
4929static void css_free_rwork_fn(struct work_struct *work)
4930{
4931 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
4932 struct cgroup_subsys_state, destroy_rwork);
4933 struct cgroup_subsys *ss = css->ss;
4934 struct cgroup *cgrp = css->cgroup;
4935
4936 percpu_ref_exit(&css->refcnt);
4937
4938 if (ss) {
4939
4940 struct cgroup_subsys_state *parent = css->parent;
4941 int id = css->id;
4942
4943 ss->css_free(css);
4944 cgroup_idr_remove(&ss->css_idr, id);
4945 cgroup_put(cgrp);
4946
4947 if (parent)
4948 css_put(parent);
4949 } else {
4950
4951 atomic_dec(&cgrp->root->nr_cgrps);
4952 cgroup1_pidlist_destroy_all(cgrp);
4953 cancel_work_sync(&cgrp->release_agent_work);
4954
4955 if (cgroup_parent(cgrp)) {
4956
4957
4958
4959
4960
4961
4962 cgroup_put(cgroup_parent(cgrp));
4963 kernfs_put(cgrp->kn);
4964 psi_cgroup_free(cgrp);
4965 if (cgroup_on_dfl(cgrp))
4966 cgroup_rstat_exit(cgrp);
4967 kfree(cgrp);
4968 } else {
4969
4970
4971
4972
4973
4974 cgroup_destroy_root(cgrp->root);
4975 }
4976 }
4977}
4978
4979static void css_release_work_fn(struct work_struct *work)
4980{
4981 struct cgroup_subsys_state *css =
4982 container_of(work, struct cgroup_subsys_state, destroy_work);
4983 struct cgroup_subsys *ss = css->ss;
4984 struct cgroup *cgrp = css->cgroup;
4985
4986 mutex_lock(&cgroup_mutex);
4987
4988 css->flags |= CSS_RELEASED;
4989 list_del_rcu(&css->sibling);
4990
4991 if (ss) {
4992
4993 if (!list_empty(&css->rstat_css_node)) {
4994 cgroup_rstat_flush(cgrp);
4995 list_del_rcu(&css->rstat_css_node);
4996 }
4997
4998 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4999 if (ss->css_released)
5000 ss->css_released(css);
5001 } else {
5002 struct cgroup *tcgrp;
5003
5004
5005 TRACE_CGROUP_PATH(release, cgrp);
5006
5007 if (cgroup_on_dfl(cgrp))
5008 cgroup_rstat_flush(cgrp);
5009
5010 spin_lock_irq(&css_set_lock);
5011 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5012 tcgrp = cgroup_parent(tcgrp))
5013 tcgrp->nr_dying_descendants--;
5014 spin_unlock_irq(&css_set_lock);
5015
5016
5017
5018
5019
5020
5021
5022
5023 if (cgrp->kn)
5024 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5025 NULL);
5026 }
5027
5028 mutex_unlock(&cgroup_mutex);
5029
5030 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5031 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5032}
5033
5034static void css_release(struct percpu_ref *ref)
5035{
5036 struct cgroup_subsys_state *css =
5037 container_of(ref, struct cgroup_subsys_state, refcnt);
5038
5039 INIT_WORK(&css->destroy_work, css_release_work_fn);
5040 queue_work(cgroup_destroy_wq, &css->destroy_work);
5041}
5042
5043static void init_and_link_css(struct cgroup_subsys_state *css,
5044 struct cgroup_subsys *ss, struct cgroup *cgrp)
5045{
5046 lockdep_assert_held(&cgroup_mutex);
5047
5048 cgroup_get_live(cgrp);
5049
5050 memset(css, 0, sizeof(*css));
5051 css->cgroup = cgrp;
5052 css->ss = ss;
5053 css->id = -1;
5054 INIT_LIST_HEAD(&css->sibling);
5055 INIT_LIST_HEAD(&css->children);
5056 INIT_LIST_HEAD(&css->rstat_css_node);
5057 css->serial_nr = css_serial_nr_next++;
5058 atomic_set(&css->online_cnt, 0);
5059
5060 if (cgroup_parent(cgrp)) {
5061 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5062 css_get(css->parent);
5063 }
5064
5065 if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
5066 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5067
5068 BUG_ON(cgroup_css(cgrp, ss));
5069}
5070
5071
5072static int online_css(struct cgroup_subsys_state *css)
5073{
5074 struct cgroup_subsys *ss = css->ss;
5075 int ret = 0;
5076
5077 lockdep_assert_held(&cgroup_mutex);
5078
5079 if (ss->css_online)
5080 ret = ss->css_online(css);
5081 if (!ret) {
5082 css->flags |= CSS_ONLINE;
5083 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5084
5085 atomic_inc(&css->online_cnt);
5086 if (css->parent)
5087 atomic_inc(&css->parent->online_cnt);
5088 }
5089 return ret;
5090}
5091
5092
5093static void offline_css(struct cgroup_subsys_state *css)
5094{
5095 struct cgroup_subsys *ss = css->ss;
5096
5097 lockdep_assert_held(&cgroup_mutex);
5098
5099 if (!(css->flags & CSS_ONLINE))
5100 return;
5101
5102 if (ss->css_offline)
5103 ss->css_offline(css);
5104
5105 css->flags &= ~CSS_ONLINE;
5106 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5107
5108 wake_up_all(&css->cgroup->offline_waitq);
5109}
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
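/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success, ERR_PTR(-errno) on
 * failure.
 */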
5120static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5121 struct cgroup_subsys *ss)
5122{
5123 struct cgroup *parent = cgroup_parent(cgrp);
5124 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5125 struct cgroup_subsys_state *css;
5126 int err;
5127
5128 lockdep_assert_held(&cgroup_mutex);
5129
5130 css = ss->css_alloc(parent_css);
5131 if (!css)
5132 css = ERR_PTR(-ENOMEM);
5133 if (IS_ERR(css))
5134 return css;
5135
5136 init_and_link_css(css, ss, cgrp);
5137
5138 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5139 if (err)
5140 goto err_free_css;
5141
5142 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5143 if (err < 0)
5144 goto err_free_css;
5145 css->id = err;
5146
5147
5148 list_add_tail_rcu(&css->sibling, &parent_css->children);
5149 cgroup_idr_replace(&ss->css_idr, css, css->id);
5150
5151 err = online_css(css);
5152 if (err)
5153 goto err_list_del;
5154
5155 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
5156 cgroup_parent(parent)) {
5157 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
5158 current->comm, current->pid, ss->name);
5159 if (!strcmp(ss->name, "memory"))
5160 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
5161 ss->warned_broken_hierarchy = true;
5162 }
5163
5164 return css;
5165
5166err_list_del:
5167 list_del_rcu(&css->sibling);
5168err_free_css:
5169 list_del_rcu(&css->rstat_css_node);
5170 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5171 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5172 return ERR_PTR(err);
5173}
5174
5175
5176
5177
5178
5179
5180static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5181 umode_t mode)
5182{
5183 struct cgroup_root *root = parent->root;
5184 struct cgroup *cgrp, *tcgrp;
5185 struct kernfs_node *kn;
5186 int level = parent->level + 1;
5187 int ret;
5188
5189
5190 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5191 GFP_KERNEL);
5192 if (!cgrp)
5193 return ERR_PTR(-ENOMEM);
5194
5195 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5196 if (ret)
5197 goto out_free_cgrp;
5198
5199 if (cgroup_on_dfl(parent)) {
5200 ret = cgroup_rstat_init(cgrp);
5201 if (ret)
5202 goto out_cancel_ref;
5203 }
5204
5205
5206 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5207 if (IS_ERR(kn)) {
5208 ret = PTR_ERR(kn);
5209 goto out_stat_exit;
5210 }
5211 cgrp->kn = kn;
5212
5213 init_cgroup_housekeeping(cgrp);
5214
5215 cgrp->self.parent = &parent->self;
5216 cgrp->root = root;
5217 cgrp->level = level;
5218
5219 ret = psi_cgroup_alloc(cgrp);
5220 if (ret)
5221 goto out_kernfs_remove;
5222
5223 ret = cgroup_bpf_inherit(cgrp);
5224 if (ret)
5225 goto out_psi_free;
5226
5227
5228
5229
5230
5231 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5232 if (cgrp->freezer.e_freeze) {
5233
5234
5235
5236
5237
5238
5239 set_bit(CGRP_FREEZE, &cgrp->flags);
5240 set_bit(CGRP_FROZEN, &cgrp->flags);
5241 }
5242
5243 spin_lock_irq(&css_set_lock);
5244 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5245 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
5246
5247 if (tcgrp != cgrp) {
5248 tcgrp->nr_descendants++;
5249
5250
5251
5252
5253
5254
5255 if (cgrp->freezer.e_freeze)
5256 tcgrp->freezer.nr_frozen_descendants++;
5257 }
5258 }
5259 spin_unlock_irq(&css_set_lock);
5260
5261 if (notify_on_release(parent))
5262 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5263
5264 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5265 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5266
5267 cgrp->self.serial_nr = css_serial_nr_next++;
5268
5269
5270 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5271 atomic_inc(&root->nr_cgrps);
5272 cgroup_get_live(parent);
5273
5274
5275
5276
5277
5278 if (!cgroup_on_dfl(cgrp))
5279 cgrp->subtree_control = cgroup_control(cgrp);
5280
5281 cgroup_propagate_control(cgrp);
5282
5283 return cgrp;
5284
5285out_psi_free:
5286 psi_cgroup_free(cgrp);
5287out_kernfs_remove:
5288 kernfs_remove(cgrp->kn);
5289out_stat_exit:
5290 if (cgroup_on_dfl(parent))
5291 cgroup_rstat_exit(cgrp);
5292out_cancel_ref:
5293 percpu_ref_exit(&cgrp->self.refcnt);
5294out_free_cgrp:
5295 kfree(cgrp);
5296 return ERR_PTR(ret);
5297}
5298
5299static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5300{
5301 struct cgroup *cgroup;
5302 int ret = false;
5303 int level = 1;
5304
5305 lockdep_assert_held(&cgroup_mutex);
5306
5307 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5308 if (cgroup->nr_descendants >= cgroup->max_descendants)
5309 goto fail;
5310
5311 if (level > cgroup->max_depth)
5312 goto fail;
5313
5314 level++;
5315 }
5316
5317 ret = true;
5318fail:
5319 return ret;
5320}
5321
5322int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5323{
5324 struct cgroup *parent, *cgrp;
5325 int ret;
5326
5327
5328 if (strchr(name, '\n'))
5329 return -EINVAL;
5330
5331 parent = cgroup_kn_lock_live(parent_kn, false);
5332 if (!parent)
5333 return -ENODEV;
5334
5335 if (!cgroup_check_hierarchy_limits(parent)) {
5336 ret = -EAGAIN;
5337 goto out_unlock;
5338 }
5339
5340 cgrp = cgroup_create(parent, name, mode);
5341 if (IS_ERR(cgrp)) {
5342 ret = PTR_ERR(cgrp);
5343 goto out_unlock;
5344 }
5345
5346
5347
5348
5349
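	/*
	 * This extra ref is put when the cgroup is freed in
	 * css_free_rwork_fn() and guarantees that @cgrp->kn is always
	 * accessible.
	 */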
5350 kernfs_get(cgrp->kn);
5351
5352 ret = cgroup_kn_set_ugid(cgrp->kn);
5353 if (ret)
5354 goto out_destroy;
5355
5356 ret = css_populate_dir(&cgrp->self);
5357 if (ret)
5358 goto out_destroy;
5359
5360 ret = cgroup_apply_control_enable(cgrp);
5361 if (ret)
5362 goto out_destroy;
5363
5364 TRACE_CGROUP_PATH(mkdir, cgrp);
5365
5366
5367 kernfs_activate(cgrp->kn);
5368
5369 ret = 0;
5370 goto out_unlock;
5371
5372out_destroy:
5373 cgroup_destroy_locked(cgrp);
5374out_unlock:
5375 cgroup_kn_unlock(parent_kn);
5376 return ret;
5377}
5378
5379
5380
5381
5382
5383
5384static void css_killed_work_fn(struct work_struct *work)
5385{
5386 struct cgroup_subsys_state *css =
5387 container_of(work, struct cgroup_subsys_state, destroy_work);
5388
5389 mutex_lock(&cgroup_mutex);
5390
5391 do {
5392 offline_css(css);
5393 css_put(css);
5394
5395 css = css->parent;
5396 } while (css && atomic_dec_and_test(&css->online_cnt));
5397
5398 mutex_unlock(&cgroup_mutex);
5399}
5400
5401
5402static void css_killed_ref_fn(struct percpu_ref *ref)
5403{
5404 struct cgroup_subsys_state *css =
5405 container_of(ref, struct cgroup_subsys_state, refcnt);
5406
5407 if (atomic_dec_and_test(&css->online_cnt)) {
5408 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5409 queue_work(cgroup_destroy_wq, &css->destroy_work);
5410 }
5411}
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
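/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */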
5422static void kill_css(struct cgroup_subsys_state *css)
5423{
5424 lockdep_assert_held(&cgroup_mutex);
5425
5426 if (css->flags & CSS_DYING)
5427 return;
5428
5429 css->flags |= CSS_DYING;
5430
5431
5432
5433
5434
5435 css_clear_dir(css);
5436
5437
5438
5439
5440
5441 css_get(css);
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5454}
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
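/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     csses.  Use percpu_ref_kill*() directly so that there's no window
 *     where ->css_offline() is invoked while css_tryget_online() remains
 *     successful.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction once the percpu refs of all csses are confirmed
 *     to be killed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.
 */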
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
	struct cgroup_subsys_state *css;
	struct cgrp_cset_link *link;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * Only migration can raise populated from zero and we're already
	 * holding cgroup_mutex.
	 */
	if (cgroup_is_populated(cgrp))
		return -EBUSY;

	/*
	 * Make sure there are no live children.  We can't test emptiness of
	 * ->self.children as dead children linger on it while being
	 * drained; otherwise, "rmdir parent/child parent" may fail.
	 */
	if (css_has_online_children(&cgrp->self))
		return -EBUSY;

	/*
	 * Mark @cgrp and the associated csets dead.  The former prevents
	 * further task migration and child creation by disabling
	 * cgroup_lock_live_group().  The latter makes the csets ignored by
	 * the migration path.
	 */
	cgrp->self.flags &= ~CSS_ONLINE;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		link->cset->dead = true;
	spin_unlock_irq(&css_set_lock);

	/* initiate massacre of all css's */
	for_each_css(css, ssid, cgrp)
		kill_css(css);

	/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
	css_clear_dir(&cgrp->self);
	kernfs_remove(cgrp->kn);

	if (parent && cgroup_is_threaded(cgrp))
		parent->nr_threaded_children--;

	spin_lock_irq(&css_set_lock);
	for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
		tcgrp->nr_descendants--;
		tcgrp->nr_dying_descendants++;
		/*
		 * If the cgroup is frozen, decrease the number of frozen
		 * descendants contained in its parent cgroups.
		 */
		if (test_bit(CGRP_FROZEN, &cgrp->flags))
			tcgrp->freezer.nr_frozen_descendants--;
	}
	spin_unlock_irq(&css_set_lock);

	cgroup1_check_for_release(parent);

	cgroup_bpf_offline(cgrp);

	/* put the base reference */
	percpu_ref_kill(&cgrp->self.refcnt);

	return 0;
}

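/* kernfs ->rmdir() callback; tears the cgroup down if it is still alive */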
int cgroup_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp;
	int ret = 0;

	cgrp = cgroup_kn_lock_live(kn, false);
	if (!cgrp)
		return 0;

	ret = cgroup_destroy_locked(cgrp);
	if (!ret)
		TRACE_CGROUP_PATH(rmdir, cgrp);

	cgroup_kn_unlock(kn);
	return ret;
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.show_options		= cgroup_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

	/*
	 * Root csses are never destroyed and we can't initialize
	 * percpu_ref during early init.  Disable refcnting.
	 */
	css->flags |= CSS_NO_REF;

	if (early) {
		/* allocation can't be done safely during early init */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	/*
	 * Update init_css_set to contain a subsys pointer to this state.
	 * Since the subsystem is newly registered, all tasks and hence
	 * init_css_set are in the subsystem's root cgroup.
	 */
	init_css_set.subsys[ss->id] = css;

	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
	have_release_callback |= (bool)ss->release << ss->id;
	have_canfork_callback |= (bool)ss->can_fork << ss->id;

	/*
	 * At system boot, before all subsystems have been registered, no
	 * tasks have been forked, so we don't need to invoke fork
	 * callbacks here.
	 */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	static struct cgroup_fs_context __initdata ctx;
	struct cgroup_subsys *ss;
	int i;

	ctx.root = &cgrp_dfl_root;
	init_cgroup_root(&ctx);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	for_each_subsys(ss, i) {
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];

		if (ss->early_init)
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

static u16 cgroup_disable_mask __initdata;

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

	cgroup_rstat_boot();

	/*
	 * The latency of the synchronize_rcu() is too high for cgroups,
	 * avoid it at the cost of forcing all readers into the slow path.
	 */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (cgroup_disable_mask & (1 << ssid)) {
			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
			printk(KERN_INFO "Disabling %s control group subsystem\n",
			       ss->name);
			continue;
		}

		if (cgroup1_ssid_disabled(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		/* implicit controllers must be threaded too */
		WARN_ON(ss->implicit_on_dfl && !ss->threaded);

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->threaded)
			cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);

		mutex_lock(&cgroup_mutex);
		css_populate_dir(init_css_set.subsys[ssid]);
		mutex_unlock(&cgroup_mutex);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
	WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

	return 0;
}

static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 * Use 1 for @max_active.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
	BUG_ON(!cgroup_destroy_wq);
	return 0;
}
core_initcall(cgroup_wq_init);

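/*
 * Look up the kernfs node for @id on the default hierarchy and, if found,
 * copy its path into @buf.  @buf is left untouched if the id is no longer
 * valid.
 */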
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
	struct kernfs_node *kn;

	kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
	if (!kn)
		return;
	kernfs_path(kn, buf, buflen);
	kernfs_put(kn);
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

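/**
 * cgroup_get_from_file - get a cgroup pointer from a file pointer
 * @f: file corresponding to cgroup_dir
 *
 * Find the cgroup from a file pointer associated with a cgroup directory.
 * Returns a pointer to the cgroup on success.  ERR_PTR is returned if the
 * cgroup cannot be found or isn't on the default hierarchy.
 */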
static struct cgroup *cgroup_get_from_file(struct file *f)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork().  By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later.  If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return.  If
 * CLONE_INTO_CGROUP is requested this function will additionally grab
 * cgroup_mutex and will hold both locks until cgroup_post_fork() finishes,
 * after the css_set of the child process has been refcounted.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;
	struct file *f;

	if (kargs->flags & CLONE_INTO_CGROUP)
		mutex_lock(&cgroup_mutex);

	cgroup_threadgroup_change_begin(current);

	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
		kargs->cset = cset;
		return 0;
	}

	f = fget_raw(kargs->cgroup);
	if (!f) {
		ret = -EBADF;
		goto err;
	}
	sb = f->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(f);
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Verify that the target cgroup is writable for us.  This is
	 * usually done by the vfs layer but since we're not going through
	 * the vfs layer here we need to do it "by hand".
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD));
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

	put_css_set(cset);
	fput(f);
	kargs->cgrp = dst_cgrp;
	return ret;

err:
	cgroup_threadgroup_change_end(current);
	mutex_unlock(&cgroup_mutex);
	if (f)
		fput(f);
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}

/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Drop references to the prepared css_set and target cgroup if
 * CLONE_INTO_CGROUP was requested.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	cgroup_threadgroup_change_end(current);

	if (kargs->flags & CLONE_INTO_CGROUP) {
		struct cgroup *cgrp = kargs->cgrp;
		struct css_set *cset = kargs->cset;

		mutex_unlock(&cgroup_mutex);

		if (cset) {
			put_css_set(cset);
			kargs->cset = NULL;
		}

		if (cgrp) {
			cgroup_put(cgrp);
			kargs->cgrp = NULL;
		}
	}
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This prepares a new css_set for the child process which the child will
 * be attached to in cgroup_post_fork().
 * This calls the subsystem can_fork() callbacks.  If the cgroup_can_fork()
 * callback returns an error, the fork aborts with that error code.  This
 * allows for a cgroup subsystem to conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	ret = cgroup_css_set_fork(kargs);
	if (ret)
		return ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child, kargs->cset);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child, kargs->cset);
	}

	cgroup_css_set_put_fork(kargs);

	return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child,
			struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child, kargs->cset);

	cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.
 */
void cgroup_post_fork(struct task_struct *child,
		      struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	cset = kargs->cset;
	kargs->cset = NULL;

	spin_lock_irq(&css_set_lock);

	/* init tasks are special, only link regular threads */
	if (likely(child->pid)) {
		WARN_ON_ONCE(!list_empty(&child->cg_list));
		cset->nr_tasks++;
		css_set_move_task(child, NULL, cset, false);
	} else {
		put_css_set(cset);
		cset = NULL;
	}

	/*
	 * If the cgroup has to be frozen, the new task has too.  Let's set
	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
	 * frozen state.
	 */
	if (unlikely(cgroup_task_freeze(child))) {
		spin_lock(&child->sighand->siglock);
		WARN_ON_ONCE(child->frozen);
		child->jobctl |= JOBCTL_TRAP_FREEZE;
		spin_unlock(&child->sighand->siglock);

		/*
		 * Calling cgroup_update_frozen() isn't required here,
		 * because it will be called anyway a bit later from
		 * do_freezer_trap().  So we avoid cgroup's transient switch
		 * from the frozen state and back.
		 */
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();

	/* Make the new cset the root_cset of the new cgroup namespace. */
	if (kargs->flags & CLONE_NEWCGROUP) {
		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

		get_css_set(cset);
		child->nsproxy->cgroup_ns->root_cset = cset;
		put_css_set(rcset);
	}

	cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	spin_lock_irq(&css_set_lock);

	WARN_ON_ONCE(list_empty(&tsk->cg_list));
	cset = task_css_set(tsk);
	css_set_move_task(tsk, cset, NULL, false);
	list_add_tail(&tsk->cg_list, &cset->dying_tasks);
	cset->nr_tasks--;

	WARN_ON_ONCE(cgroup_task_frozen(tsk));
	if (unlikely(cgroup_task_freeze(tsk)))
		cgroup_update_frozen(task_dfl_cgroup(tsk));

	spin_unlock_irq(&css_set_lock);

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

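/*
 * Called on task release; runs the subsystem ->release() callbacks and
 * unlinks the task from its dying css_set's iteration lists.
 */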
void cgroup_release(struct task_struct *task)
{
	struct cgroup_subsys *ss;
	int ssid;

	do_each_subsys_mask(ss, ssid, have_release_callback) {
		ss->release(task);
	} while_each_subsys_mask();

	spin_lock_irq(&css_set_lock);
	css_set_skip_task_iters(task_css_set(task), task);
	list_del_init(&task->cg_list);
	spin_unlock_irq(&css_set_lock);
}

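/* Drop the css_set reference held through the task's ->cgroups pointer. */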
void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);

	put_css_set(cset);
}

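/* Parse the "cgroup_disable=" boot parameter into cgroup_disable_mask. */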
static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;
			cgroup_disable_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);

void __init __weak enable_debug_cgroup(void) { }

static int __init enable_cgroup_debug(char *str)
{
	cgroup_debug = true;
	enable_debug_cgroup();
	return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's a valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it.  Returns ERR_PTR(-ENOENT) if @path
 * doesn't exist and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get_live(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2 dir)
 *
 * Find the cgroup from a fd which should be obtained
 * by opening a cgroup directory.  Returns a pointer to the
 * cgroup on success.  ERR_PTR is returned if the cgroup
 * cannot be found.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	cgrp = cgroup_get_from_file(f);
	fput(f);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

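/* Integer power of ten, used by cgroup_parse_float() below. */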
static u64 power_of_ten(int power)
{
	u64 v = 1;

	while (power--)
		v *= 10;
	return v;
}

/**
 * cgroup_parse_float - parse a floating number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with decimal point right shifted @dec_shift times.  For example, if
 * @dec_shift is 2, it parses "12.3456" and stores 1234.  Returns 0 on
 * success, -errno otherwise.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
	s64 whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
		return -EINVAL;
	if (frac < 0)
		return -EINVAL;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < dec_shift)
		frac *= power_of_ten(dec_shift - flen);
	else
		frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));

	*v = whole * power_of_ten(dec_shift) + frac;
	return 0;
}

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled	false

#endif	/* CONFIG_CGROUP_NET_PRIO || CONFIG_CGROUP_NET_CLASSID */

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled) {
		skcd->no_refcnt = 1;
		return;
	}

	/* Don't associate the sock with unrelated interrupted task's cgroup. */
	if (in_interrupt())
		return;

	rcu_read_lock();

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			cgroup_bpf_get(cset->dfl_cgrp);
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
	if (skcd->val) {
		if (skcd->no_refcnt)
			return;
		/*
		 * We might be cloning a socket which is left in an empty
		 * cgroup and the cgroup might have already been rmdir'd.
		 * Don't use cgroup_get_live().
		 */
		cgroup_get(sock_cgroup_ptr(skcd));
		cgroup_bpf_get(sock_cgroup_ptr(skcd));
	}
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgrp = sock_cgroup_ptr(skcd);

	if (skcd->no_refcnt)
		return;
	cgroup_bpf_put(cgrp);
	cgroup_put(cgrp);
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

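/*
 * Wrappers for the bpf attach/detach/query operations which grab
 * cgroup_mutex around the underlying __cgroup_bpf_*() calls.
 */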
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp,
		      struct bpf_prog *prog, struct bpf_prog *replace_prog,
		      struct bpf_cgroup_link *link,
		      enum bpf_attach_type type,
		      u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		     union bpf_attr __user *uattr)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_query(cgrp, attr, uattr);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
#endif	/* CONFIG_CGROUP_BPF */

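/* Entries under /sys/kernel/cgroup/ describing cgroup2 delegation and features. */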
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
				      ssize_t size, const char *prefix)
{
	struct cftype *cft;
	ssize_t ret = 0;

	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

		if (WARN_ON(ret >= size))
			break;
	}

	return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	struct cgroup_subsys *ss;
	int ssid;
	ssize_t ret = 0;

	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
				     NULL);

	for_each_subsys(ss, ssid)
		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
					      PAGE_SIZE - ret,
					      cgroup_subsys_name[ssid]);

	return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	return snprintf(buf, PAGE_SIZE,
			"nsdelegate\n"
			"memory_localevents\n"
			"memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};

static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};

static int __init cgroup_sysfs_init(void)
{
	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);

#endif	/* CONFIG_SYSFS */
