1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* Prefix every message printed through pr_fmt() with the module name. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include "cgroup-internal.h"
32
33#include <linux/cred.h>
34#include <linux/errno.h>
35#include <linux/init_task.h>
36#include <linux/kernel.h>
37#include <linux/magic.h>
38#include <linux/mutex.h>
39#include <linux/mount.h>
40#include <linux/pagemap.h>
41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
43#include <linux/sched.h>
44#include <linux/sched/task.h>
45#include <linux/slab.h>
46#include <linux/spinlock.h>
47#include <linux/percpu-rwsem.h>
48#include <linux/string.h>
49#include <linux/hashtable.h>
50#include <linux/idr.h>
51#include <linux/kthread.h>
52#include <linux/atomic.h>
53#include <linux/cpuset.h>
54#include <linux/proc_ns.h>
55#include <linux/nsproxy.h>
56#include <linux/file.h>
57#include <linux/fs_parser.h>
58#include <linux/sched/cputime.h>
59#include <linux/psi.h>
60#include <net/sock.h>
61
62#define CREATE_TRACE_POINTS
63#include <trace/events/cgroup.h>
64
/*
 * Size of the buffers used when building cgroup file names: a type
 * (subsystem) name, a cftype name and two extra bytes.
 * NOTE(review): the two extra bytes are presumably the '.' separator and
 * the terminating NUL - confirm.
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/* One hundredth of a second, rounded up to a whole number of jiffies. */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
69
70
71
72
73
74
75
76
77
78
79
/*
 * The two global locks of this file.  Many functions below assert through
 * lockdep that cgroup_mutex and/or css_set_lock is held on entry.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

/* The locks are exported only when CONFIG_PROVE_RCU is enabled. */
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Shared path buffer used by the tracing helpers, and the spinlock declared
 * with it.  NOTE(review): presumably the lock serializes users of the
 * buffer - confirm against TRACE_CGROUP_PATH().
 */
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
/* NOTE(review): presumably switches on the cgroup debug facilities - confirm. */
bool cgroup_debug __read_mostly;
91
92
93
94
95
/*
 * Taken (with bottom halves disabled) around the idr operations in
 * cgroup_idr_alloc(), cgroup_idr_replace() and cgroup_idr_remove().
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/* Held while cgroup_rm_file() clears a cgroup_file's ->kn pointer. */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/*
 * NOTE(review): not used within this part of the file; presumably
 * stabilizes thread-group membership during migration - confirm.
 */
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
105
/*
 * Lockdep assertion: warn unless the caller holds either the RCU read lock
 * or cgroup_mutex.
 *
 * The expansion deliberately carries no trailing semicolon so that
 * "cgroup_assert_mutex_or_rcu_locked();" expands to exactly one statement
 * and stays safe inside an unbraced if/else.
 */
#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required")
110
111
112
113
114
115
116
/*
 * Workqueue dedicated to cgroup destruction work.
 * NOTE(review): allocated and used outside this part of the file.
 */
static struct workqueue_struct *cgroup_destroy_wq;
118
119
/*
 * The tables below are generated by redefining SUBSYS() and re-including
 * <linux/cgroup_subsys.h>, which is expected to invoke SUBSYS() once per
 * subsystem.  Entries are indexed by the subsystem's <name>_cgrp_id.
 */

/* subsystem ID -> address of the subsystem's <name>_cgrp_subsys */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* subsystem ID -> subsystem name, produced by stringifying the identifier */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Define and export two static keys per subsystem, both initially true:
 * <name>_cgrp_subsys_enabled_key and <name>_cgrp_subsys_on_dfl_key.
 */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/* subsystem ID -> "enabled" static key, read by cgroup_ssid_enabled() */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* subsystem ID -> "on default hierarchy" static key */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
153
/* Per-cpu rstat storage that the default root's cgroup points at below. */
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/*
 * The default hierarchy root.  cgroup_on_dfl() identifies the default
 * hierarchy by comparing a cgroup's ->root against this object's address.
 */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
159
160
161
162
163
/*
 * NOTE(review): not used within this part of the file; presumably records
 * whether the default hierarchy has been exposed to userspace - confirm.
 */
static bool cgrp_dfl_visible;

/* Subsystems that cgroup_control() masks out of the default root's set. */
static u16 cgrp_dfl_inhibit_ss_mask;

/*
 * Subsystems that cgroup_calc_subtree_ss_mask() always ORs in and that
 * cgroup_control() masks out of the default root's set.
 */
static u16 cgrp_dfl_implicit_ss_mask;

/*
 * The only subsystems kept for threaded cgroups by cgroup_control() and
 * cgroup_ss_mask().
 */
static u16 cgrp_dfl_threaded_ss_mask;

/*
 * List head for the hierarchy roots and their count.  The count sizes the
 * link allocation in find_css_set() and is decremented in
 * cgroup_destroy_root().
 */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* Hierarchy IDs; see cgroup_init_root_id() and cgroup_exit_root_id(). */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Next css serial number, starting at 1.
 * NOTE(review): the code consuming it is outside this part of the file.
 */
static u64 css_serial_nr_next = 1;

/*
 * One bitmask per callback type.
 * NOTE(review): presumably bit N is set when subsystem N implements the
 * corresponding callback - confirm where these are populated.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
199
200
/*
 * The initial cgroup namespace: owned by init_user_ns, rooted at
 * init_css_set, and created with a reference count of 2.
 */
struct cgroup_namespace init_cgroup_ns = {
	.ns.count	= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};
208
/* Forward declarations; both objects are defined later in the file. */
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

/*
 * Optional features.  Entries exist only when their config option is
 * enabled; OPT_FEATURE_COUNT is the number of entries compiled in.
 */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
	OPT_FEATURE_PRESSURE,
#endif
	OPT_FEATURE_COUNT
};

/* Feature names, listed in the same order as enum cgroup_opt_features. */
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
	"pressure",
#endif
};

/*
 * NOTE(review): presumably one bit per cgroup_opt_features entry that has
 * been disabled - confirm where it is set.
 */
static u16 cgroup_feature_disable_mask __read_mostly;
227
/*
 * Prototypes for static functions that are referenced before their
 * definitions appear in the file.
 */
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
240
241
242
243
244
245
246
247
248
249bool cgroup_ssid_enabled(int ssid)
250{
251 if (CGROUP_SUBSYS_COUNT == 0)
252 return false;
253
254 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
255}
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304bool cgroup_on_dfl(const struct cgroup *cgrp)
305{
306 return cgrp->root == &cgrp_dfl_root;
307}
308
309
310static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
311 gfp_t gfp_mask)
312{
313 int ret;
314
315 idr_preload(gfp_mask);
316 spin_lock_bh(&cgroup_idr_lock);
317 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
318 spin_unlock_bh(&cgroup_idr_lock);
319 idr_preload_end();
320 return ret;
321}
322
323static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
324{
325 void *ret;
326
327 spin_lock_bh(&cgroup_idr_lock);
328 ret = idr_replace(idr, ptr, id);
329 spin_unlock_bh(&cgroup_idr_lock);
330 return ret;
331}
332
333static void cgroup_idr_remove(struct idr *idr, int id)
334{
335 spin_lock_bh(&cgroup_idr_lock);
336 idr_remove(idr, id);
337 spin_unlock_bh(&cgroup_idr_lock);
338}
339
340static bool cgroup_has_tasks(struct cgroup *cgrp)
341{
342 return cgrp->nr_populated_csets;
343}
344
345bool cgroup_is_threaded(struct cgroup *cgrp)
346{
347 return cgrp->dom_cgrp != cgrp;
348}
349
350
351static bool cgroup_is_mixable(struct cgroup *cgrp)
352{
353
354
355
356
357
358 return !cgroup_parent(cgrp);
359}
360
361
362static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
363{
364
365 if (cgroup_is_mixable(cgrp))
366 return true;
367
368
369 if (cgroup_is_threaded(cgrp))
370 return false;
371
372
373 if (cgrp->nr_populated_domain_children)
374 return false;
375
376
377 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
378 return false;
379
380 return true;
381}
382
383
384bool cgroup_is_thread_root(struct cgroup *cgrp)
385{
386
387 if (cgroup_is_threaded(cgrp))
388 return false;
389
390
391 if (cgrp->nr_threaded_children)
392 return true;
393
394
395
396
397
398 if (cgroup_has_tasks(cgrp) &&
399 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
400 return true;
401
402 return false;
403}
404
405
406static bool cgroup_is_valid_domain(struct cgroup *cgrp)
407{
408
409 if (cgroup_is_threaded(cgrp))
410 return false;
411
412
413 while ((cgrp = cgroup_parent(cgrp))) {
414 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
415 return false;
416 if (cgroup_is_threaded(cgrp))
417 return false;
418 }
419
420 return true;
421}
422
423
424static u16 cgroup_control(struct cgroup *cgrp)
425{
426 struct cgroup *parent = cgroup_parent(cgrp);
427 u16 root_ss_mask = cgrp->root->subsys_mask;
428
429 if (parent) {
430 u16 ss_mask = parent->subtree_control;
431
432
433 if (cgroup_is_threaded(cgrp))
434 ss_mask &= cgrp_dfl_threaded_ss_mask;
435 return ss_mask;
436 }
437
438 if (cgroup_on_dfl(cgrp))
439 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
440 cgrp_dfl_implicit_ss_mask);
441 return root_ss_mask;
442}
443
444
445static u16 cgroup_ss_mask(struct cgroup *cgrp)
446{
447 struct cgroup *parent = cgroup_parent(cgrp);
448
449 if (parent) {
450 u16 ss_mask = parent->subtree_ss_mask;
451
452
453 if (cgroup_is_threaded(cgrp))
454 ss_mask &= cgrp_dfl_threaded_ss_mask;
455 return ss_mask;
456 }
457
458 return cgrp->root->subsys_mask;
459}
460
461
462
463
464
465
466
467
468
469
470
471
472static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
473 struct cgroup_subsys *ss)
474{
475 if (ss)
476 return rcu_dereference_check(cgrp->subsys[ss->id],
477 lockdep_is_held(&cgroup_mutex));
478 else
479 return &cgrp->self;
480}
481
482
483
484
485
486
487
488
489
490static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
491 struct cgroup_subsys *ss)
492{
493 struct cgroup_subsys_state *css;
494
495 rcu_read_lock();
496 css = cgroup_css(cgrp, ss);
497 if (css && !css_tryget_online(css))
498 css = NULL;
499 rcu_read_unlock();
500
501 return css;
502}
503
504
505
506
507
508
509
510
511
512
513
514static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
515 struct cgroup_subsys *ss)
516{
517 lockdep_assert_held(&cgroup_mutex);
518
519 if (!ss)
520 return &cgrp->self;
521
522
523
524
525
526 while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
527 cgrp = cgroup_parent(cgrp);
528 if (!cgrp)
529 return NULL;
530 }
531
532 return cgroup_css(cgrp, ss);
533}
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
549 struct cgroup_subsys *ss)
550{
551 struct cgroup_subsys_state *css;
552
553 do {
554 css = cgroup_css(cgrp, ss);
555
556 if (css)
557 return css;
558 cgrp = cgroup_parent(cgrp);
559 } while (cgrp);
560
561 return init_css_set.subsys[ss->id];
562}
563
564
565
566
567
568
569
570
571
572
573
574
575struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
576 struct cgroup_subsys *ss)
577{
578 struct cgroup_subsys_state *css;
579
580 rcu_read_lock();
581
582 do {
583 css = cgroup_css(cgrp, ss);
584
585 if (css && css_tryget_online(css))
586 goto out_unlock;
587 cgrp = cgroup_parent(cgrp);
588 } while (cgrp);
589
590 css = init_css_set.subsys[ss->id];
591 css_get(css);
592out_unlock:
593 rcu_read_unlock();
594 return css;
595}
596EXPORT_SYMBOL_GPL(cgroup_get_e_css);
597
598static void cgroup_get_live(struct cgroup *cgrp)
599{
600 WARN_ON_ONCE(cgroup_is_dead(cgrp));
601 css_get(&cgrp->self);
602}
603
604
605
606
607
608
609int __cgroup_task_count(const struct cgroup *cgrp)
610{
611 int count = 0;
612 struct cgrp_cset_link *link;
613
614 lockdep_assert_held(&css_set_lock);
615
616 list_for_each_entry(link, &cgrp->cset_links, cset_link)
617 count += link->cset->nr_tasks;
618
619 return count;
620}
621
622
623
624
625
626int cgroup_task_count(const struct cgroup *cgrp)
627{
628 int count;
629
630 spin_lock_irq(&css_set_lock);
631 count = __cgroup_task_count(cgrp);
632 spin_unlock_irq(&css_set_lock);
633
634 return count;
635}
636
637struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
638{
639 struct cgroup *cgrp = of->kn->parent->priv;
640 struct cftype *cft = of_cft(of);
641
642
643
644
645
646
647
648
649
650 if (cft->ss)
651 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
652 else
653 return &cgrp->self;
654}
655EXPORT_SYMBOL_GPL(of_css);
656
657
658
659
660
661
662
663
664
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Subsystem slots of @cgrp that hold no css are skipped.  The dereference
 * is annotated as legal under RCU or with cgroup_mutex held.  The trailing
 * "else" makes the statement following the macro the loop body.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
671
672
673
674
675
676
677
678
679
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Uses cgroup_e_css_by_mask() for the lookup, so cgroup_mutex must be held.
 * Subsystems for which no effective css exists are skipped.
 */
#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else
686
687
688
689
690
691
692
693
694
695
/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block is executed only for subsystems whose bit is set in @ss_mask.
 * When there are no subsystems at all, @ssid is set to 0 and the body is
 * skipped.  Every use must be closed with while_each_subsys_mask(), which
 * supplies the braces and the "while (false)" left open here.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

/* Closes the scopes opened by do_each_subsys_mask(). */
#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
710
711
/*
 * Iterate over the child cgroups of @cgrp, skipping dead ones.  Each
 * iteration asserts that cgroup_mutex is held.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/*
 * Walk the descendants of @cgrp with css_for_each_descendant_pre(),
 * skipping dead cgroups.  @d_css is the css cursor and @dsct is set to its
 * cgroup.  Each iteration asserts that cgroup_mutex is held.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/*
 * Same as cgroup_for_each_live_descendant_pre() but walks with
 * css_for_each_descendant_post().
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
736
737
738
739
740
741
742
743
/*
 * The statically allocated initial css_set.  It starts with one reference,
 * is its own domain css_set, has all of its lists empty and is associated
 * with the default root's cgroup.  Several lookups in this file
 * special-case it by address (e.g. cset_cgroup_from_root()).
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * Set statically rather than through link_css_set().
	 * NOTE(review): presumably because the default root's cgroup may be
	 * consulted before links are established during boot - confirm.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};

/* Number of css_sets in existence; starts at 1 for init_css_set. */
static int css_set_count	= 1;
766
767static bool css_set_threaded(struct css_set *cset)
768{
769 return cset->dom_cset != cset;
770}
771
772
773
774
775
776
777
778
779
780
781static bool css_set_populated(struct css_set *cset)
782{
783 lockdep_assert_held(&css_set_lock);
784
785 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
786}
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
806{
807 struct cgroup *child = NULL;
808 int adj = populated ? 1 : -1;
809
810 lockdep_assert_held(&css_set_lock);
811
812 do {
813 bool was_populated = cgroup_is_populated(cgrp);
814
815 if (!child) {
816 cgrp->nr_populated_csets += adj;
817 } else {
818 if (cgroup_is_threaded(child))
819 cgrp->nr_populated_threaded_children += adj;
820 else
821 cgrp->nr_populated_domain_children += adj;
822 }
823
824 if (was_populated == cgroup_is_populated(cgrp))
825 break;
826
827 cgroup1_check_for_release(cgrp);
828 TRACE_CGROUP_PATH(notify_populated, cgrp,
829 cgroup_is_populated(cgrp));
830 cgroup_file_notify(&cgrp->events_file);
831
832 child = cgrp;
833 cgrp = cgroup_parent(cgrp);
834 } while (cgrp);
835}
836
837
838
839
840
841
842
843
844
845static void css_set_update_populated(struct css_set *cset, bool populated)
846{
847 struct cgrp_cset_link *link;
848
849 lockdep_assert_held(&css_set_lock);
850
851 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
852 cgroup_update_populated(link->cgrp, populated);
853}
854
855
856
857
858
859
860
861static void css_set_skip_task_iters(struct css_set *cset,
862 struct task_struct *task)
863{
864 struct css_task_iter *it, *pos;
865
866 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
867 css_task_iter_skip(it, task);
868}
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Moves @task between the task lists of the two css_sets and keeps their
 * populated state up to date.  With a NULL @from_cset, @task is expected
 * not to be on any list yet; with a NULL @to_cset, @task is only removed.
 * The caller must hold css_set_lock.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	/* the destination becomes populated before the task is unlinked */
	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		/* make active task iterators step over @task before unlinking */
		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * An exiting task is not expected to be moved into a css_set
		 * here; warn once if that happens.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}
918
919
920
921
922
923
/*
 * Hash table of css_sets, keyed by css_set_hash() of the subsystem state
 * array; used by find_existing_css_set() to look up an existing css_set.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
926
927static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
928{
929 unsigned long key = 0UL;
930 struct cgroup_subsys *ss;
931 int i;
932
933 for_each_subsys(ss, i)
934 key += (unsigned long)css[i];
935 key = (key >> 16) ^ key;
936
937 return key;
938}
939
/**
 * put_css_set_locked - drop a reference on a css_set
 * @cset: the css_set to put
 *
 * Drops one reference.  When it was the last one, @cset is unlinked from
 * the per-subsystem e_cset lists, the hash table and all of its cgroup
 * links, the references it held are released, and the structure is freed
 * after an RCU grace period.  The caller must hold css_set_lock.
 */
void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	/* a dying css_set is not expected to have threaded csets attached */
	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	/* This css_set is dead.  Unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		/* mirrors link_css_set(): only non-root cgroups were pinned */
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	/* a threaded css_set also holds a reference on its domain css_set */
	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}
976
977
978
979
980
981
982
983
984
985
986
/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/*
	 * Cheap rejection first: the candidate's css pointers must be
	 * exactly the template.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Work out which default-hierarchy cgroup the result would be
	 * associated with; its domain cgroup has to be the dfl_cgrp of the
	 * candidate's domain css_set.
	 */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Compare the cgroup links of the two css_sets pairwise.  The walk
	 * relies on both lists having the same length and on corresponding
	 * entries belonging to the same hierarchy; the BUG_ON()s below
	 * enforce this.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * In the hierarchy being changed the candidate must point at
		 * the new cgroup; in every other hierarchy it must point at
		 * the same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}
1059
1060
1061
1062
1063
1064
1065
1066static struct css_set *find_existing_css_set(struct css_set *old_cset,
1067 struct cgroup *cgrp,
1068 struct cgroup_subsys_state *template[])
1069{
1070 struct cgroup_root *root = cgrp->root;
1071 struct cgroup_subsys *ss;
1072 struct css_set *cset;
1073 unsigned long key;
1074 int i;
1075
1076
1077
1078
1079
1080
1081 for_each_subsys(ss, i) {
1082 if (root->subsys_mask & (1UL << i)) {
1083
1084
1085
1086
1087 template[i] = cgroup_e_css_by_mask(cgrp, ss);
1088 } else {
1089
1090
1091
1092
1093 template[i] = old_cset->subsys[i];
1094 }
1095 }
1096
1097 key = css_set_hash(template);
1098 hash_for_each_possible(css_set_table, cset, hlist, key) {
1099 if (!compare_css_sets(cset, old_cset, cgrp, template))
1100 continue;
1101
1102
1103 return cset;
1104 }
1105
1106
1107 return NULL;
1108}
1109
1110static void free_cgrp_cset_links(struct list_head *links_to_free)
1111{
1112 struct cgrp_cset_link *link, *tmp_link;
1113
1114 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
1115 list_del(&link->cset_link);
1116 kfree(link);
1117 }
1118}
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1129{
1130 struct cgrp_cset_link *link;
1131 int i;
1132
1133 INIT_LIST_HEAD(tmp_links);
1134
1135 for (i = 0; i < count; i++) {
1136 link = kzalloc(sizeof(*link), GFP_KERNEL);
1137 if (!link) {
1138 free_cgrp_cset_links(tmp_links);
1139 return -ENOMEM;
1140 }
1141 list_add(&link->cset_link, tmp_links);
1142 }
1143 return 0;
1144}
1145
1146
1147
1148
1149
1150
1151
1152static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1153 struct cgroup *cgrp)
1154{
1155 struct cgrp_cset_link *link;
1156
1157 BUG_ON(list_empty(tmp_links));
1158
1159 if (cgroup_on_dfl(cgrp))
1160 cset->dfl_cgrp = cgrp;
1161
1162 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1163 link->cset = cset;
1164 link->cgrp = cgrp;
1165
1166
1167
1168
1169
1170 list_move_tail(&link->cset_link, &cgrp->cset_links);
1171 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1172
1173 if (cgroup_parent(cgrp))
1174 cgroup_get_live(cgrp);
1175}
1176
1177
1178
1179
1180
1181
1182
1183
1184
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.  An existing css_set is
 * reused (with an extra reference) when one matches; otherwise a new one is
 * allocated and linked.  Returns %NULL on allocation failure.  Must be
 * called with cgroup_mutex held.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * First see if we already have a css_set that matches the desired
	 * set of css pointers; template[] is filled in as a side effect.
	 */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	/*
	 * Copy the set of subsystem state objects generated in
	 * find_existing_css_set()
	 */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		/* substitute @cgrp in the hierarchy that is being changed */
		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	/* every preallocated link is expected to have been consumed */
	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	/* hook @cset into each css's cgroup and pin the css */
	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * If @cset's default-hierarchy cgroup is threaded, find (or create)
	 * the matching css_set for its domain cgroup and attach @cset to it
	 * as a threaded css_set.  The reference returned by the recursive
	 * call is kept in ->dom_cset.
	 */
	if (cgroup_is_threaded(cset->dfl_cgrp)) {
		struct css_set *dcset;

		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
		if (!dcset) {
			put_css_set(cset);
			return NULL;
		}

		spin_lock_irq(&css_set_lock);
		cset->dom_cset = dcset;
		list_add_tail(&cset->threaded_csets_node,
			      &dcset->threaded_csets);
		spin_unlock_irq(&css_set_lock);
	}

	return cset;
}
1287
1288struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1289{
1290 struct cgroup *root_cgrp = kf_root->kn->priv;
1291
1292 return root_cgrp->root;
1293}
1294
1295static int cgroup_init_root_id(struct cgroup_root *root)
1296{
1297 int id;
1298
1299 lockdep_assert_held(&cgroup_mutex);
1300
1301 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1302 if (id < 0)
1303 return id;
1304
1305 root->hierarchy_id = id;
1306 return 0;
1307}
1308
1309static void cgroup_exit_root_id(struct cgroup_root *root)
1310{
1311 lockdep_assert_held(&cgroup_mutex);
1312
1313 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1314}
1315
/* Free the memory of @root; the argument is passed straight to kfree(). */
void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}
1320
/*
 * Tear down an empty hierarchy: rebind its subsystems to the default
 * hierarchy, drop all css_set links to its root cgroup, unregister it from
 * the root list and the hierarchy IDR, and free it together with its
 * kernfs root.
 */
static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	/*
	 * NOTE(review): presumably returns with cgroup_mutex held - it is
	 * paired with the mutex_unlock() further down; confirm.
	 */
	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* the hierarchy must not contain any cgroups at this point */
	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	/* only a root that made it onto the root list is counted */
	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	/* the remaining teardown runs without cgroup_mutex */
	cgroup_rstat_exit(cgrp);
	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}
1363
1364
1365
1366
1367
1368static struct cgroup *
1369current_cgns_cgroup_from_root(struct cgroup_root *root)
1370{
1371 struct cgroup *res = NULL;
1372 struct css_set *cset;
1373
1374 lockdep_assert_held(&css_set_lock);
1375
1376 rcu_read_lock();
1377
1378 cset = current->nsproxy->cgroup_ns->root_cset;
1379 if (cset == &init_css_set) {
1380 res = &root->cgrp;
1381 } else if (root == &cgrp_dfl_root) {
1382 res = cset->dfl_cgrp;
1383 } else {
1384 struct cgrp_cset_link *link;
1385
1386 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1387 struct cgroup *c = link->cgrp;
1388
1389 if (c->root == root) {
1390 res = c;
1391 break;
1392 }
1393 }
1394 }
1395 rcu_read_unlock();
1396
1397 BUG_ON(!res);
1398 return res;
1399}
1400
1401
1402static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1403 struct cgroup_root *root)
1404{
1405 struct cgroup *res = NULL;
1406
1407 lockdep_assert_held(&cgroup_mutex);
1408 lockdep_assert_held(&css_set_lock);
1409
1410 if (cset == &init_css_set) {
1411 res = &root->cgrp;
1412 } else if (root == &cgrp_dfl_root) {
1413 res = cset->dfl_cgrp;
1414 } else {
1415 struct cgrp_cset_link *link;
1416
1417 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1418 struct cgroup *c = link->cgrp;
1419
1420 if (c->root == root) {
1421 res = c;
1422 break;
1423 }
1424 }
1425 }
1426
1427 BUG_ON(!res);
1428 return res;
1429}
1430
1431
1432
1433
1434
/*
 * Return the cgroup for @task in hierarchy @root.  This is a thin wrapper
 * around cset_cgroup_from_root(), so the same locking rules apply: both
 * cgroup_mutex and css_set_lock must be held.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	struct css_set *cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
/* Forward declaration; the object is defined later in the file. */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1472
1473static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1474 char *buf)
1475{
1476 struct cgroup_subsys *ss = cft->ss;
1477
1478 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1479 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1480 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1481
1482 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1483 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1484 cft->name);
1485 } else {
1486 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1487 }
1488 return buf;
1489}
1490
1491
1492
1493
1494
1495
1496
1497static umode_t cgroup_file_mode(const struct cftype *cft)
1498{
1499 umode_t mode = 0;
1500
1501 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1502 mode |= S_IRUGO;
1503
1504 if (cft->write_u64 || cft->write_s64 || cft->write) {
1505 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1506 mode |= S_IWUGO;
1507 else
1508 mode |= S_IWUSR;
1509 }
1510
1511 return mode;
1512}
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1527{
1528 u16 cur_ss_mask = subtree_control;
1529 struct cgroup_subsys *ss;
1530 int ssid;
1531
1532 lockdep_assert_held(&cgroup_mutex);
1533
1534 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1535
1536 while (true) {
1537 u16 new_ss_mask = cur_ss_mask;
1538
1539 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1540 new_ss_mask |= ss->depends_on;
1541 } while_each_subsys_mask();
1542
1543
1544
1545
1546
1547
1548 new_ss_mask &= this_ss_mask;
1549
1550 if (new_ss_mask == cur_ss_mask)
1551 break;
1552 cur_ss_mask = new_ss_mask;
1553 }
1554
1555 return cur_ss_mask;
1556}
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568void cgroup_kn_unlock(struct kernfs_node *kn)
1569{
1570 struct cgroup *cgrp;
1571
1572 if (kernfs_type(kn) == KERNFS_DIR)
1573 cgrp = kn->priv;
1574 else
1575 cgrp = kn->parent->priv;
1576
1577 mutex_unlock(&cgroup_mutex);
1578
1579 kernfs_unbreak_active_protection(kn);
1580 cgroup_put(cgrp);
1581}
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1601{
1602 struct cgroup *cgrp;
1603
1604 if (kernfs_type(kn) == KERNFS_DIR)
1605 cgrp = kn->priv;
1606 else
1607 cgrp = kn->parent->priv;
1608
1609
1610
1611
1612
1613
1614
1615 if (!cgroup_tryget(cgrp))
1616 return NULL;
1617 kernfs_break_active_protection(kn);
1618
1619 if (drain_offline)
1620 cgroup_lock_and_drain_offline(cgrp);
1621 else
1622 mutex_lock(&cgroup_mutex);
1623
1624 if (!cgroup_is_dead(cgrp))
1625 return cgrp;
1626
1627 cgroup_kn_unlock(kn);
1628 return NULL;
1629}
1630
1631static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1632{
1633 char name[CGROUP_FILE_NAME_MAX];
1634
1635 lockdep_assert_held(&cgroup_mutex);
1636
1637 if (cft->file_offset) {
1638 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1639 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1640
1641 spin_lock_irq(&cgroup_file_kn_lock);
1642 cfile->kn = NULL;
1643 spin_unlock_irq(&cgroup_file_kn_lock);
1644
1645 del_timer_sync(&cfile->notify_timer);
1646 }
1647
1648 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1649}
1650
1651
1652
1653
1654
1655static void css_clear_dir(struct cgroup_subsys_state *css)
1656{
1657 struct cgroup *cgrp = css->cgroup;
1658 struct cftype *cfts;
1659
1660 if (!(css->flags & CSS_VISIBLE))
1661 return;
1662
1663 css->flags &= ~CSS_VISIBLE;
1664
1665 if (!css->ss) {
1666 if (cgroup_on_dfl(cgrp))
1667 cfts = cgroup_base_files;
1668 else
1669 cfts = cgroup1_base_files;
1670
1671 cgroup_addrm_files(css, cgrp, cfts, false);
1672 } else {
1673 list_for_each_entry(cfts, &css->ss->cfts, node)
1674 cgroup_addrm_files(css, cgrp, cfts, false);
1675 }
1676}
1677
1678
1679
1680
1681
1682
1683
1684static int css_populate_dir(struct cgroup_subsys_state *css)
1685{
1686 struct cgroup *cgrp = css->cgroup;
1687 struct cftype *cfts, *failed_cfts;
1688 int ret;
1689
1690 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1691 return 0;
1692
1693 if (!css->ss) {
1694 if (cgroup_on_dfl(cgrp))
1695 cfts = cgroup_base_files;
1696 else
1697 cfts = cgroup1_base_files;
1698
1699 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1700 if (ret < 0)
1701 return ret;
1702 } else {
1703 list_for_each_entry(cfts, &css->ss->cfts, node) {
1704 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1705 if (ret < 0) {
1706 failed_cfts = cfts;
1707 goto err;
1708 }
1709 }
1710 }
1711
1712 css->flags |= CSS_VISIBLE;
1713
1714 return 0;
1715err:
1716 list_for_each_entry(cfts, &css->ss->cfts, node) {
1717 if (cfts == failed_cfts)
1718 break;
1719 cgroup_addrm_files(css, cgrp, cfts, false);
1720 }
1721 return ret;
1722}
1723
1724int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1725{
1726 struct cgroup *dcgrp = &dst_root->cgrp;
1727 struct cgroup_subsys *ss;
1728 int ssid, i, ret;
1729
1730 lockdep_assert_held(&cgroup_mutex);
1731
1732 do_each_subsys_mask(ss, ssid, ss_mask) {
1733
1734
1735
1736
1737
1738 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
1739 !ss->implicit_on_dfl)
1740 return -EBUSY;
1741
1742
1743 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1744 return -EBUSY;
1745 } while_each_subsys_mask();
1746
1747 do_each_subsys_mask(ss, ssid, ss_mask) {
1748 struct cgroup_root *src_root = ss->root;
1749 struct cgroup *scgrp = &src_root->cgrp;
1750 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1751 struct css_set *cset;
1752
1753 WARN_ON(!css || cgroup_css(dcgrp, ss));
1754
1755
1756 src_root->subsys_mask &= ~(1 << ssid);
1757 WARN_ON(cgroup_apply_control(scgrp));
1758 cgroup_finalize_control(scgrp, 0);
1759
1760
1761 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1762 rcu_assign_pointer(dcgrp->subsys[ssid], css);
1763 ss->root = dst_root;
1764 css->cgroup = dcgrp;
1765
1766 spin_lock_irq(&css_set_lock);
1767 hash_for_each(css_set_table, i, cset, hlist)
1768 list_move_tail(&cset->e_cset_node[ss->id],
1769 &dcgrp->e_csets[ss->id]);
1770 spin_unlock_irq(&css_set_lock);
1771
1772 if (ss->css_rstat_flush) {
1773 list_del_rcu(&css->rstat_css_node);
1774 list_add_rcu(&css->rstat_css_node,
1775 &dcgrp->rstat_css_list);
1776 }
1777
1778
1779 dst_root->subsys_mask |= 1 << ssid;
1780 if (dst_root == &cgrp_dfl_root) {
1781 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1782 } else {
1783 dcgrp->subtree_control |= 1 << ssid;
1784 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1785 }
1786
1787 ret = cgroup_apply_control(dcgrp);
1788 if (ret)
1789 pr_warn("partial failure to rebind %s controller (err=%d)\n",
1790 ss->name, ret);
1791
1792 if (ss->bind)
1793 ss->bind(css);
1794 } while_each_subsys_mask();
1795
1796 kernfs_activate(dcgrp->kn);
1797 return 0;
1798}
1799
1800int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1801 struct kernfs_root *kf_root)
1802{
1803 int len = 0;
1804 char *buf = NULL;
1805 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1806 struct cgroup *ns_cgroup;
1807
1808 buf = kmalloc(PATH_MAX, GFP_KERNEL);
1809 if (!buf)
1810 return -ENOMEM;
1811
1812 spin_lock_irq(&css_set_lock);
1813 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1814 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1815 spin_unlock_irq(&css_set_lock);
1816
1817 if (len >= PATH_MAX)
1818 len = -ERANGE;
1819 else if (len > 0) {
1820 seq_escape(sf, buf, " \t\n\\");
1821 len = 0;
1822 }
1823 kfree(buf);
1824 return len;
1825}
1826
/* Identifiers of the mount parameters listed in cgroup2_fs_parameters[]. */
enum cgroup2_param {
	Opt_nsdelegate,			/* "nsdelegate" */
	Opt_memory_localevents,		/* "memory_localevents" */
	Opt_memory_recursiveprot,	/* "memory_recursiveprot" */
	nr__cgroup2_params		/* number of parameters above */
};
1833
1834static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
1835 fsparam_flag("nsdelegate", Opt_nsdelegate),
1836 fsparam_flag("memory_localevents", Opt_memory_localevents),
1837 fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
1838 {}
1839};
1840
1841static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1842{
1843 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1844 struct fs_parse_result result;
1845 int opt;
1846
1847 opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1848 if (opt < 0)
1849 return opt;
1850
1851 switch (opt) {
1852 case Opt_nsdelegate:
1853 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1854 return 0;
1855 case Opt_memory_localevents:
1856 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1857 return 0;
1858 case Opt_memory_recursiveprot:
1859 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1860 return 0;
1861 }
1862 return -EINVAL;
1863}
1864
1865static void apply_cgroup_root_flags(unsigned int root_flags)
1866{
1867 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1868 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1869 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1870 else
1871 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1872
1873 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1874 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1875 else
1876 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1877
1878 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1879 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1880 else
1881 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1882 }
1883}
1884
1885static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1886{
1887 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1888 seq_puts(seq, ",nsdelegate");
1889 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1890 seq_puts(seq, ",memory_localevents");
1891 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1892 seq_puts(seq, ",memory_recursiveprot");
1893 return 0;
1894}
1895
1896static int cgroup_reconfigure(struct fs_context *fc)
1897{
1898 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1899
1900 apply_cgroup_root_flags(ctx->flags);
1901 return 0;
1902}
1903
1904static void init_cgroup_housekeeping(struct cgroup *cgrp)
1905{
1906 struct cgroup_subsys *ss;
1907 int ssid;
1908
1909 INIT_LIST_HEAD(&cgrp->self.sibling);
1910 INIT_LIST_HEAD(&cgrp->self.children);
1911 INIT_LIST_HEAD(&cgrp->cset_links);
1912 INIT_LIST_HEAD(&cgrp->pidlists);
1913 mutex_init(&cgrp->pidlist_mutex);
1914 cgrp->self.cgroup = cgrp;
1915 cgrp->self.flags |= CSS_ONLINE;
1916 cgrp->dom_cgrp = cgrp;
1917 cgrp->max_descendants = INT_MAX;
1918 cgrp->max_depth = INT_MAX;
1919 INIT_LIST_HEAD(&cgrp->rstat_css_list);
1920 prev_cputime_init(&cgrp->prev_cputime);
1921
1922 for_each_subsys(ss, ssid)
1923 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1924
1925 init_waitqueue_head(&cgrp->offline_waitq);
1926 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1927}
1928
1929void init_cgroup_root(struct cgroup_fs_context *ctx)
1930{
1931 struct cgroup_root *root = ctx->root;
1932 struct cgroup *cgrp = &root->cgrp;
1933
1934 INIT_LIST_HEAD(&root->root_list);
1935 atomic_set(&root->nr_cgrps, 1);
1936 cgrp->root = root;
1937 init_cgroup_housekeeping(cgrp);
1938
1939 root->flags = ctx->flags;
1940 if (ctx->release_agent)
1941 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1942 if (ctx->name)
1943 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1944 if (ctx->cpuset_clone_children)
1945 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1946}
1947
1948int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1949{
1950 LIST_HEAD(tmp_links);
1951 struct cgroup *root_cgrp = &root->cgrp;
1952 struct kernfs_syscall_ops *kf_sops;
1953 struct css_set *cset;
1954 int i, ret;
1955
1956 lockdep_assert_held(&cgroup_mutex);
1957
1958 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1959 0, GFP_KERNEL);
1960 if (ret)
1961 goto out;
1962
1963
1964
1965
1966
1967
1968
1969
1970 ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
1971 if (ret)
1972 goto cancel_ref;
1973
1974 ret = cgroup_init_root_id(root);
1975 if (ret)
1976 goto cancel_ref;
1977
1978 kf_sops = root == &cgrp_dfl_root ?
1979 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
1980
1981 root->kf_root = kernfs_create_root(kf_sops,
1982 KERNFS_ROOT_CREATE_DEACTIVATED |
1983 KERNFS_ROOT_SUPPORT_EXPORTOP |
1984 KERNFS_ROOT_SUPPORT_USER_XATTR,
1985 root_cgrp);
1986 if (IS_ERR(root->kf_root)) {
1987 ret = PTR_ERR(root->kf_root);
1988 goto exit_root_id;
1989 }
1990 root_cgrp->kn = root->kf_root->kn;
1991 WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
1992 root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
1993
1994 ret = css_populate_dir(&root_cgrp->self);
1995 if (ret)
1996 goto destroy_root;
1997
1998 ret = cgroup_rstat_init(root_cgrp);
1999 if (ret)
2000 goto destroy_root;
2001
2002 ret = rebind_subsystems(root, ss_mask);
2003 if (ret)
2004 goto exit_stats;
2005
2006 ret = cgroup_bpf_inherit(root_cgrp);
2007 WARN_ON_ONCE(ret);
2008
2009 trace_cgroup_setup_root(root);
2010
2011
2012
2013
2014
2015
2016 list_add(&root->root_list, &cgroup_roots);
2017 cgroup_root_count++;
2018
2019
2020
2021
2022
2023 spin_lock_irq(&css_set_lock);
2024 hash_for_each(css_set_table, i, cset, hlist) {
2025 link_css_set(&tmp_links, cset, root_cgrp);
2026 if (css_set_populated(cset))
2027 cgroup_update_populated(root_cgrp, true);
2028 }
2029 spin_unlock_irq(&css_set_lock);
2030
2031 BUG_ON(!list_empty(&root_cgrp->self.children));
2032 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
2033
2034 ret = 0;
2035 goto out;
2036
2037exit_stats:
2038 cgroup_rstat_exit(root_cgrp);
2039destroy_root:
2040 kernfs_destroy_root(root->kf_root);
2041 root->kf_root = NULL;
2042exit_root_id:
2043 cgroup_exit_root_id(root);
2044cancel_ref:
2045 percpu_ref_exit(&root_cgrp->self.refcnt);
2046out:
2047 free_cgrp_cset_links(&tmp_links);
2048 return ret;
2049}
2050
2051int cgroup_do_get_tree(struct fs_context *fc)
2052{
2053 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2054 int ret;
2055
2056 ctx->kfc.root = ctx->root->kf_root;
2057 if (fc->fs_type == &cgroup2_fs_type)
2058 ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
2059 else
2060 ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2061 ret = kernfs_get_tree(fc);
2062
2063
2064
2065
2066
2067 if (!ret && ctx->ns != &init_cgroup_ns) {
2068 struct dentry *nsdentry;
2069 struct super_block *sb = fc->root->d_sb;
2070 struct cgroup *cgrp;
2071
2072 mutex_lock(&cgroup_mutex);
2073 spin_lock_irq(&css_set_lock);
2074
2075 cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
2076
2077 spin_unlock_irq(&css_set_lock);
2078 mutex_unlock(&cgroup_mutex);
2079
2080 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2081 dput(fc->root);
2082 if (IS_ERR(nsdentry)) {
2083 deactivate_locked_super(sb);
2084 ret = PTR_ERR(nsdentry);
2085 nsdentry = NULL;
2086 }
2087 fc->root = nsdentry;
2088 }
2089
2090 if (!ctx->kfc.new_sb_created)
2091 cgroup_put(&ctx->root->cgrp);
2092
2093 return ret;
2094}
2095
2096
2097
2098
2099static void cgroup_fs_context_free(struct fs_context *fc)
2100{
2101 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2102
2103 kfree(ctx->name);
2104 kfree(ctx->release_agent);
2105 put_cgroup_ns(ctx->ns);
2106 kernfs_free_fs_context(fc);
2107 kfree(ctx);
2108}
2109
2110static int cgroup_get_tree(struct fs_context *fc)
2111{
2112 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2113 int ret;
2114
2115 cgrp_dfl_visible = true;
2116 cgroup_get_live(&cgrp_dfl_root.cgrp);
2117 ctx->root = &cgrp_dfl_root;
2118
2119 ret = cgroup_do_get_tree(fc);
2120 if (!ret)
2121 apply_cgroup_root_flags(ctx->flags);
2122 return ret;
2123}
2124
2125static const struct fs_context_operations cgroup_fs_context_ops = {
2126 .free = cgroup_fs_context_free,
2127 .parse_param = cgroup2_parse_param,
2128 .get_tree = cgroup_get_tree,
2129 .reconfigure = cgroup_reconfigure,
2130};
2131
2132static const struct fs_context_operations cgroup1_fs_context_ops = {
2133 .free = cgroup_fs_context_free,
2134 .parse_param = cgroup1_parse_param,
2135 .get_tree = cgroup1_get_tree,
2136 .reconfigure = cgroup1_reconfigure,
2137};
2138
2139
2140
2141
2142
2143static int cgroup_init_fs_context(struct fs_context *fc)
2144{
2145 struct cgroup_fs_context *ctx;
2146
2147 ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2148 if (!ctx)
2149 return -ENOMEM;
2150
2151 ctx->ns = current->nsproxy->cgroup_ns;
2152 get_cgroup_ns(ctx->ns);
2153 fc->fs_private = &ctx->kfc;
2154 if (fc->fs_type == &cgroup2_fs_type)
2155 fc->ops = &cgroup_fs_context_ops;
2156 else
2157 fc->ops = &cgroup1_fs_context_ops;
2158 put_user_ns(fc->user_ns);
2159 fc->user_ns = get_user_ns(ctx->ns->user_ns);
2160 fc->global = true;
2161 return 0;
2162}
2163
2164static void cgroup_kill_sb(struct super_block *sb)
2165{
2166 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2167 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2168
2169
2170
2171
2172
2173
2174
2175
2176 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2177 !percpu_ref_is_dying(&root->cgrp.self.refcnt))
2178 percpu_ref_kill(&root->cgrp.self.refcnt);
2179 cgroup_put(&root->cgrp);
2180 kernfs_kill_sb(sb);
2181}
2182
2183struct file_system_type cgroup_fs_type = {
2184 .name = "cgroup",
2185 .init_fs_context = cgroup_init_fs_context,
2186 .parameters = cgroup1_fs_parameters,
2187 .kill_sb = cgroup_kill_sb,
2188 .fs_flags = FS_USERNS_MOUNT,
2189};
2190
2191static struct file_system_type cgroup2_fs_type = {
2192 .name = "cgroup2",
2193 .init_fs_context = cgroup_init_fs_context,
2194 .parameters = cgroup2_fs_parameters,
2195 .kill_sb = cgroup_kill_sb,
2196 .fs_flags = FS_USERNS_MOUNT,
2197};
2198
2199#ifdef CONFIG_CPUSETS
2200static const struct fs_context_operations cpuset_fs_context_ops = {
2201 .get_tree = cgroup1_get_tree,
2202 .free = cgroup_fs_context_free,
2203};
2204
2205
2206
2207
2208
2209
2210static int cpuset_init_fs_context(struct fs_context *fc)
2211{
2212 char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2213 struct cgroup_fs_context *ctx;
2214 int err;
2215
2216 err = cgroup_init_fs_context(fc);
2217 if (err) {
2218 kfree(agent);
2219 return err;
2220 }
2221
2222 fc->ops = &cpuset_fs_context_ops;
2223
2224 ctx = cgroup_fc2context(fc);
2225 ctx->subsys_mask = 1 << cpuset_cgrp_id;
2226 ctx->flags |= CGRP_ROOT_NOPREFIX;
2227 ctx->release_agent = agent;
2228
2229 get_filesystem(&cgroup_fs_type);
2230 put_filesystem(fc->fs_type);
2231 fc->fs_type = &cgroup_fs_type;
2232
2233 return 0;
2234}
2235
2236static struct file_system_type cpuset_fs_type = {
2237 .name = "cpuset",
2238 .init_fs_context = cpuset_init_fs_context,
2239 .fs_flags = FS_USERNS_MOUNT,
2240};
2241#endif
2242
2243int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2244 struct cgroup_namespace *ns)
2245{
2246 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2247
2248 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2249}
2250
2251int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2252 struct cgroup_namespace *ns)
2253{
2254 int ret;
2255
2256 mutex_lock(&cgroup_mutex);
2257 spin_lock_irq(&css_set_lock);
2258
2259 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2260
2261 spin_unlock_irq(&css_set_lock);
2262 mutex_unlock(&cgroup_mutex);
2263
2264 return ret;
2265}
2266EXPORT_SYMBOL_GPL(cgroup_path_ns);
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2282{
2283 struct cgroup_root *root;
2284 struct cgroup *cgrp;
2285 int hierarchy_id = 1;
2286 int ret;
2287
2288 mutex_lock(&cgroup_mutex);
2289 spin_lock_irq(&css_set_lock);
2290
2291 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2292
2293 if (root) {
2294 cgrp = task_cgroup_from_root(task, root);
2295 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2296 } else {
2297
2298 ret = strlcpy(buf, "/", buflen);
2299 }
2300
2301 spin_unlock_irq(&css_set_lock);
2302 mutex_unlock(&cgroup_mutex);
2303 return ret;
2304}
2305EXPORT_SYMBOL_GPL(task_cgroup_path);
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317static void cgroup_migrate_add_task(struct task_struct *task,
2318 struct cgroup_mgctx *mgctx)
2319{
2320 struct css_set *cset;
2321
2322 lockdep_assert_held(&css_set_lock);
2323
2324
2325 if (task->flags & PF_EXITING)
2326 return;
2327
2328
2329 WARN_ON_ONCE(list_empty(&task->cg_list));
2330
2331 cset = task_css_set(task);
2332 if (!cset->mg_src_cgrp)
2333 return;
2334
2335 mgctx->tset.nr_tasks++;
2336
2337 list_move_tail(&task->cg_list, &cset->mg_tasks);
2338 if (list_empty(&cset->mg_node))
2339 list_add_tail(&cset->mg_node,
2340 &mgctx->tset.src_csets);
2341 if (list_empty(&cset->mg_dst_cset->mg_node))
2342 list_add_tail(&cset->mg_dst_cset->mg_node,
2343 &mgctx->tset.dst_csets);
2344}
2345
2346
2347
2348
2349
2350
2351
2352
2353struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2354 struct cgroup_subsys_state **dst_cssp)
2355{
2356 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2357 tset->cur_task = NULL;
2358
2359 return cgroup_taskset_next(tset, dst_cssp);
2360}
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2371 struct cgroup_subsys_state **dst_cssp)
2372{
2373 struct css_set *cset = tset->cur_cset;
2374 struct task_struct *task = tset->cur_task;
2375
2376 while (&cset->mg_node != tset->csets) {
2377 if (!task)
2378 task = list_first_entry(&cset->mg_tasks,
2379 struct task_struct, cg_list);
2380 else
2381 task = list_next_entry(task, cg_list);
2382
2383 if (&task->cg_list != &cset->mg_tasks) {
2384 tset->cur_cset = cset;
2385 tset->cur_task = task;
2386
2387
2388
2389
2390
2391
2392
2393 if (cset->mg_dst_cset)
2394 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2395 else
2396 *dst_cssp = cset->subsys[tset->ssid];
2397
2398 return task;
2399 }
2400
2401 cset = list_next_entry(cset, mg_node);
2402 task = NULL;
2403 }
2404
2405 return NULL;
2406}
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2418{
2419 struct cgroup_taskset *tset = &mgctx->tset;
2420 struct cgroup_subsys *ss;
2421 struct task_struct *task, *tmp_task;
2422 struct css_set *cset, *tmp_cset;
2423 int ssid, failed_ssid, ret;
2424
2425
2426 if (tset->nr_tasks) {
2427 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2428 if (ss->can_attach) {
2429 tset->ssid = ssid;
2430 ret = ss->can_attach(tset);
2431 if (ret) {
2432 failed_ssid = ssid;
2433 goto out_cancel_attach;
2434 }
2435 }
2436 } while_each_subsys_mask();
2437 }
2438
2439
2440
2441
2442
2443
2444 spin_lock_irq(&css_set_lock);
2445 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2446 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2447 struct css_set *from_cset = task_css_set(task);
2448 struct css_set *to_cset = cset->mg_dst_cset;
2449
2450 get_css_set(to_cset);
2451 to_cset->nr_tasks++;
2452 css_set_move_task(task, from_cset, to_cset, true);
2453 from_cset->nr_tasks--;
2454
2455
2456
2457
2458 cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
2459 to_cset->dfl_cgrp);
2460 put_css_set_locked(from_cset);
2461
2462 }
2463 }
2464 spin_unlock_irq(&css_set_lock);
2465
2466
2467
2468
2469
2470
2471 tset->csets = &tset->dst_csets;
2472
2473 if (tset->nr_tasks) {
2474 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2475 if (ss->attach) {
2476 tset->ssid = ssid;
2477 ss->attach(tset);
2478 }
2479 } while_each_subsys_mask();
2480 }
2481
2482 ret = 0;
2483 goto out_release_tset;
2484
2485out_cancel_attach:
2486 if (tset->nr_tasks) {
2487 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2488 if (ssid == failed_ssid)
2489 break;
2490 if (ss->cancel_attach) {
2491 tset->ssid = ssid;
2492 ss->cancel_attach(tset);
2493 }
2494 } while_each_subsys_mask();
2495 }
2496out_release_tset:
2497 spin_lock_irq(&css_set_lock);
2498 list_splice_init(&tset->dst_csets, &tset->src_csets);
2499 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2500 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2501 list_del_init(&cset->mg_node);
2502 }
2503 spin_unlock_irq(&css_set_lock);
2504
2505
2506
2507
2508
2509
2510 tset->nr_tasks = 0;
2511 tset->csets = &tset->src_csets;
2512 return ret;
2513}
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2525{
2526
2527 if (!cgroup_on_dfl(dst_cgrp))
2528 return 0;
2529
2530
2531 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2532 return -EOPNOTSUPP;
2533
2534
2535 if (cgroup_is_mixable(dst_cgrp))
2536 return 0;
2537
2538
2539
2540
2541
2542 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2543 return 0;
2544
2545
2546 if (dst_cgrp->subtree_control)
2547 return -EBUSY;
2548
2549 return 0;
2550}
2551
2552
2553
2554
2555
2556
2557
2558
2559void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2560{
2561 LIST_HEAD(preloaded);
2562 struct css_set *cset, *tmp_cset;
2563
2564 lockdep_assert_held(&cgroup_mutex);
2565
2566 spin_lock_irq(&css_set_lock);
2567
2568 list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2569 list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2570
2571 list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2572 cset->mg_src_cgrp = NULL;
2573 cset->mg_dst_cgrp = NULL;
2574 cset->mg_dst_cset = NULL;
2575 list_del_init(&cset->mg_preload_node);
2576 put_css_set_locked(cset);
2577 }
2578
2579 spin_unlock_irq(&css_set_lock);
2580}
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598void cgroup_migrate_add_src(struct css_set *src_cset,
2599 struct cgroup *dst_cgrp,
2600 struct cgroup_mgctx *mgctx)
2601{
2602 struct cgroup *src_cgrp;
2603
2604 lockdep_assert_held(&cgroup_mutex);
2605 lockdep_assert_held(&css_set_lock);
2606
2607
2608
2609
2610
2611
2612 if (src_cset->dead)
2613 return;
2614
2615 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2616
2617 if (!list_empty(&src_cset->mg_preload_node))
2618 return;
2619
2620 WARN_ON(src_cset->mg_src_cgrp);
2621 WARN_ON(src_cset->mg_dst_cgrp);
2622 WARN_ON(!list_empty(&src_cset->mg_tasks));
2623 WARN_ON(!list_empty(&src_cset->mg_node));
2624
2625 src_cset->mg_src_cgrp = src_cgrp;
2626 src_cset->mg_dst_cgrp = dst_cgrp;
2627 get_css_set(src_cset);
2628 list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2629}
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2646{
2647 struct css_set *src_cset, *tmp_cset;
2648
2649 lockdep_assert_held(&cgroup_mutex);
2650
2651
2652 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2653 mg_preload_node) {
2654 struct css_set *dst_cset;
2655 struct cgroup_subsys *ss;
2656 int ssid;
2657
2658 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2659 if (!dst_cset)
2660 return -ENOMEM;
2661
2662 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2663
2664
2665
2666
2667
2668
2669 if (src_cset == dst_cset) {
2670 src_cset->mg_src_cgrp = NULL;
2671 src_cset->mg_dst_cgrp = NULL;
2672 list_del_init(&src_cset->mg_preload_node);
2673 put_css_set(src_cset);
2674 put_css_set(dst_cset);
2675 continue;
2676 }
2677
2678 src_cset->mg_dst_cset = dst_cset;
2679
2680 if (list_empty(&dst_cset->mg_preload_node))
2681 list_add_tail(&dst_cset->mg_preload_node,
2682 &mgctx->preloaded_dst_csets);
2683 else
2684 put_css_set(dst_cset);
2685
2686 for_each_subsys(ss, ssid)
2687 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2688 mgctx->ss_mask |= 1 << ssid;
2689 }
2690
2691 return 0;
2692}
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2713 struct cgroup_mgctx *mgctx)
2714{
2715 struct task_struct *task;
2716
2717
2718
2719
2720
2721
2722 spin_lock_irq(&css_set_lock);
2723 rcu_read_lock();
2724 task = leader;
2725 do {
2726 cgroup_migrate_add_task(task, mgctx);
2727 if (!threadgroup)
2728 break;
2729 } while_each_thread(leader, task);
2730 rcu_read_unlock();
2731 spin_unlock_irq(&css_set_lock);
2732
2733 return cgroup_migrate_execute(mgctx);
2734}
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2745 bool threadgroup)
2746{
2747 DEFINE_CGROUP_MGCTX(mgctx);
2748 struct task_struct *task;
2749 int ret = 0;
2750
2751
2752 spin_lock_irq(&css_set_lock);
2753 rcu_read_lock();
2754 task = leader;
2755 do {
2756 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2757 if (!threadgroup)
2758 break;
2759 } while_each_thread(leader, task);
2760 rcu_read_unlock();
2761 spin_unlock_irq(&css_set_lock);
2762
2763
2764 ret = cgroup_migrate_prepare_dst(&mgctx);
2765 if (!ret)
2766 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2767
2768 cgroup_migrate_finish(&mgctx);
2769
2770 if (!ret)
2771 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2772
2773 return ret;
2774}
2775
2776struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2777 bool *locked)
2778 __acquires(&cgroup_threadgroup_rwsem)
2779{
2780 struct task_struct *tsk;
2781 pid_t pid;
2782
2783 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2784 return ERR_PTR(-EINVAL);
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794 lockdep_assert_held(&cgroup_mutex);
2795 if (pid || threadgroup) {
2796 percpu_down_write(&cgroup_threadgroup_rwsem);
2797 *locked = true;
2798 } else {
2799 *locked = false;
2800 }
2801
2802 rcu_read_lock();
2803 if (pid) {
2804 tsk = find_task_by_vpid(pid);
2805 if (!tsk) {
2806 tsk = ERR_PTR(-ESRCH);
2807 goto out_unlock_threadgroup;
2808 }
2809 } else {
2810 tsk = current;
2811 }
2812
2813 if (threadgroup)
2814 tsk = tsk->group_leader;
2815
2816
2817
2818
2819
2820
2821
2822 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2823 tsk = ERR_PTR(-EINVAL);
2824 goto out_unlock_threadgroup;
2825 }
2826
2827 get_task_struct(tsk);
2828 goto out_unlock_rcu;
2829
2830out_unlock_threadgroup:
2831 if (*locked) {
2832 percpu_up_write(&cgroup_threadgroup_rwsem);
2833 *locked = false;
2834 }
2835out_unlock_rcu:
2836 rcu_read_unlock();
2837 return tsk;
2838}
2839
2840void cgroup_procs_write_finish(struct task_struct *task, bool locked)
2841 __releases(&cgroup_threadgroup_rwsem)
2842{
2843 struct cgroup_subsys *ss;
2844 int ssid;
2845
2846
2847 put_task_struct(task);
2848
2849 if (locked)
2850 percpu_up_write(&cgroup_threadgroup_rwsem);
2851 for_each_subsys(ss, ssid)
2852 if (ss->post_attach)
2853 ss->post_attach();
2854}
2855
2856static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2857{
2858 struct cgroup_subsys *ss;
2859 bool printed = false;
2860 int ssid;
2861
2862 do_each_subsys_mask(ss, ssid, ss_mask) {
2863 if (printed)
2864 seq_putc(seq, ' ');
2865 seq_puts(seq, ss->name);
2866 printed = true;
2867 } while_each_subsys_mask();
2868 if (printed)
2869 seq_putc(seq, '\n');
2870}
2871
2872
2873static int cgroup_controllers_show(struct seq_file *seq, void *v)
2874{
2875 struct cgroup *cgrp = seq_css(seq)->cgroup;
2876
2877 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2878 return 0;
2879}
2880
2881
2882static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2883{
2884 struct cgroup *cgrp = seq_css(seq)->cgroup;
2885
2886 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2887 return 0;
2888}
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2900{
2901 DEFINE_CGROUP_MGCTX(mgctx);
2902 struct cgroup_subsys_state *d_css;
2903 struct cgroup *dsct;
2904 struct css_set *src_cset;
2905 int ret;
2906
2907 lockdep_assert_held(&cgroup_mutex);
2908
2909 percpu_down_write(&cgroup_threadgroup_rwsem);
2910
2911
2912 spin_lock_irq(&css_set_lock);
2913 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2914 struct cgrp_cset_link *link;
2915
2916 list_for_each_entry(link, &dsct->cset_links, cset_link)
2917 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
2918 }
2919 spin_unlock_irq(&css_set_lock);
2920
2921
2922 ret = cgroup_migrate_prepare_dst(&mgctx);
2923 if (ret)
2924 goto out_finish;
2925
2926 spin_lock_irq(&css_set_lock);
2927 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2928 struct task_struct *task, *ntask;
2929
2930
2931 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2932 cgroup_migrate_add_task(task, &mgctx);
2933 }
2934 spin_unlock_irq(&css_set_lock);
2935
2936 ret = cgroup_migrate_execute(&mgctx);
2937out_finish:
2938 cgroup_migrate_finish(&mgctx);
2939 percpu_up_write(&cgroup_threadgroup_rwsem);
2940 return ret;
2941}
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
2952 __acquires(&cgroup_mutex)
2953{
2954 struct cgroup *dsct;
2955 struct cgroup_subsys_state *d_css;
2956 struct cgroup_subsys *ss;
2957 int ssid;
2958
2959restart:
2960 mutex_lock(&cgroup_mutex);
2961
2962 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2963 for_each_subsys(ss, ssid) {
2964 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2965 DEFINE_WAIT(wait);
2966
2967 if (!css || !percpu_ref_is_dying(&css->refcnt))
2968 continue;
2969
2970 cgroup_get_live(dsct);
2971 prepare_to_wait(&dsct->offline_waitq, &wait,
2972 TASK_UNINTERRUPTIBLE);
2973
2974 mutex_unlock(&cgroup_mutex);
2975 schedule();
2976 finish_wait(&dsct->offline_waitq, &wait);
2977
2978 cgroup_put(dsct);
2979 goto restart;
2980 }
2981 }
2982}
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992static void cgroup_save_control(struct cgroup *cgrp)
2993{
2994 struct cgroup *dsct;
2995 struct cgroup_subsys_state *d_css;
2996
2997 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2998 dsct->old_subtree_control = dsct->subtree_control;
2999 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
3000 dsct->old_dom_cgrp = dsct->dom_cgrp;
3001 }
3002}
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012static void cgroup_propagate_control(struct cgroup *cgrp)
3013{
3014 struct cgroup *dsct;
3015 struct cgroup_subsys_state *d_css;
3016
3017 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3018 dsct->subtree_control &= cgroup_control(dsct);
3019 dsct->subtree_ss_mask =
3020 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
3021 cgroup_ss_mask(dsct));
3022 }
3023}
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033static void cgroup_restore_control(struct cgroup *cgrp)
3034{
3035 struct cgroup *dsct;
3036 struct cgroup_subsys_state *d_css;
3037
3038 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3039 dsct->subtree_control = dsct->old_subtree_control;
3040 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3041 dsct->dom_cgrp = dsct->old_dom_cgrp;
3042 }
3043}
3044
3045static bool css_visible(struct cgroup_subsys_state *css)
3046{
3047 struct cgroup_subsys *ss = css->ss;
3048 struct cgroup *cgrp = css->cgroup;
3049
3050 if (cgroup_control(cgrp) & (1 << ss->id))
3051 return true;
3052 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3053 return false;
3054 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3055}
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070static int cgroup_apply_control_enable(struct cgroup *cgrp)
3071{
3072 struct cgroup *dsct;
3073 struct cgroup_subsys_state *d_css;
3074 struct cgroup_subsys *ss;
3075 int ssid, ret;
3076
3077 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3078 for_each_subsys(ss, ssid) {
3079 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3080
3081 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3082 continue;
3083
3084 if (!css) {
3085 css = css_create(dsct, ss);
3086 if (IS_ERR(css))
3087 return PTR_ERR(css);
3088 }
3089
3090 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3091
3092 if (css_visible(css)) {
3093 ret = css_populate_dir(css);
3094 if (ret)
3095 return ret;
3096 }
3097 }
3098 }
3099
3100 return 0;
3101}
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116static void cgroup_apply_control_disable(struct cgroup *cgrp)
3117{
3118 struct cgroup *dsct;
3119 struct cgroup_subsys_state *d_css;
3120 struct cgroup_subsys *ss;
3121 int ssid;
3122
3123 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3124 for_each_subsys(ss, ssid) {
3125 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3126
3127 if (!css)
3128 continue;
3129
3130 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3131
3132 if (css->parent &&
3133 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3134 kill_css(css);
3135 } else if (!css_visible(css)) {
3136 css_clear_dir(css);
3137 if (ss->css_reset)
3138 ss->css_reset(css);
3139 }
3140 }
3141 }
3142}
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
/*
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Runs, in order: cgroup_propagate_control(), cgroup_apply_control_enable()
 * and cgroup_update_dfl_csses().  The last step is only attempted when the
 * enable step succeeded.
 *
 * Returns 0 on success or the first non-zero error encountered.
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	cgroup_propagate_control(cgrp);

	err = cgroup_apply_control_enable(cgrp);
	if (!err)
		err = cgroup_update_dfl_csses(cgrp);

	return err;
}
3182
3183
3184
3185
3186
3187
3188
3189
/*
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * When @ret is non-zero the saved control state is restored and
 * re-propagated first.  In either case cgroup_apply_control_disable() is
 * then run on the subtree.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	if (ret != 0) {
		/* the update failed: roll the masks back before cleanup */
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}
3199
3200static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3201{
3202 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3203
3204
3205 if (!enable)
3206 return 0;
3207
3208
3209 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3210 return -EOPNOTSUPP;
3211
3212
3213 if (cgroup_is_mixable(cgrp))
3214 return 0;
3215
3216 if (domain_enable) {
3217
3218 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3219 return -EOPNOTSUPP;
3220 } else {
3221
3222
3223
3224
3225
3226 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3227 return 0;
3228 }
3229
3230
3231
3232
3233
3234 if (cgroup_has_tasks(cgrp))
3235 return -EBUSY;
3236
3237 return 0;
3238}
3239
3240
3241static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3242 char *buf, size_t nbytes,
3243 loff_t off)
3244{
3245 u16 enable = 0, disable = 0;
3246 struct cgroup *cgrp, *child;
3247 struct cgroup_subsys *ss;
3248 char *tok;
3249 int ssid, ret;
3250
3251
3252
3253
3254
3255 buf = strstrip(buf);
3256 while ((tok = strsep(&buf, " "))) {
3257 if (tok[0] == '\0')
3258 continue;
3259 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3260 if (!cgroup_ssid_enabled(ssid) ||
3261 strcmp(tok + 1, ss->name))
3262 continue;
3263
3264 if (*tok == '+') {
3265 enable |= 1 << ssid;
3266 disable &= ~(1 << ssid);
3267 } else if (*tok == '-') {
3268 disable |= 1 << ssid;
3269 enable &= ~(1 << ssid);
3270 } else {
3271 return -EINVAL;
3272 }
3273 break;
3274 } while_each_subsys_mask();
3275 if (ssid == CGROUP_SUBSYS_COUNT)
3276 return -EINVAL;
3277 }
3278
3279 cgrp = cgroup_kn_lock_live(of->kn, true);
3280 if (!cgrp)
3281 return -ENODEV;
3282
3283 for_each_subsys(ss, ssid) {
3284 if (enable & (1 << ssid)) {
3285 if (cgrp->subtree_control & (1 << ssid)) {
3286 enable &= ~(1 << ssid);
3287 continue;
3288 }
3289
3290 if (!(cgroup_control(cgrp) & (1 << ssid))) {
3291 ret = -ENOENT;
3292 goto out_unlock;
3293 }
3294 } else if (disable & (1 << ssid)) {
3295 if (!(cgrp->subtree_control & (1 << ssid))) {
3296 disable &= ~(1 << ssid);
3297 continue;
3298 }
3299
3300
3301 cgroup_for_each_live_child(child, cgrp) {
3302 if (child->subtree_control & (1 << ssid)) {
3303 ret = -EBUSY;
3304 goto out_unlock;
3305 }
3306 }
3307 }
3308 }
3309
3310 if (!enable && !disable) {
3311 ret = 0;
3312 goto out_unlock;
3313 }
3314
3315 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3316 if (ret)
3317 goto out_unlock;
3318
3319
3320 cgroup_save_control(cgrp);
3321
3322 cgrp->subtree_control |= enable;
3323 cgrp->subtree_control &= ~disable;
3324
3325 ret = cgroup_apply_control(cgrp);
3326 cgroup_finalize_control(cgrp, ret);
3327 if (ret)
3328 goto out_unlock;
3329
3330 kernfs_activate(cgrp->kn);
3331out_unlock:
3332 cgroup_kn_unlock(of->kn);
3333 return ret ?: nbytes;
3334}
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345static int cgroup_enable_threaded(struct cgroup *cgrp)
3346{
3347 struct cgroup *parent = cgroup_parent(cgrp);
3348 struct cgroup *dom_cgrp = parent->dom_cgrp;
3349 struct cgroup *dsct;
3350 struct cgroup_subsys_state *d_css;
3351 int ret;
3352
3353 lockdep_assert_held(&cgroup_mutex);
3354
3355
3356 if (cgroup_is_threaded(cgrp))
3357 return 0;
3358
3359
3360
3361
3362
3363
3364
3365 if (cgroup_is_populated(cgrp) ||
3366 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3367 return -EOPNOTSUPP;
3368
3369
3370 if (!cgroup_is_valid_domain(dom_cgrp) ||
3371 !cgroup_can_be_thread_root(dom_cgrp))
3372 return -EOPNOTSUPP;
3373
3374
3375
3376
3377
3378 cgroup_save_control(cgrp);
3379
3380 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3381 if (dsct == cgrp || cgroup_is_threaded(dsct))
3382 dsct->dom_cgrp = dom_cgrp;
3383
3384 ret = cgroup_apply_control(cgrp);
3385 if (!ret)
3386 parent->nr_threaded_children++;
3387
3388 cgroup_finalize_control(cgrp, ret);
3389 return ret;
3390}
3391
3392static int cgroup_type_show(struct seq_file *seq, void *v)
3393{
3394 struct cgroup *cgrp = seq_css(seq)->cgroup;
3395
3396 if (cgroup_is_threaded(cgrp))
3397 seq_puts(seq, "threaded\n");
3398 else if (!cgroup_is_valid_domain(cgrp))
3399 seq_puts(seq, "domain invalid\n");
3400 else if (cgroup_is_thread_root(cgrp))
3401 seq_puts(seq, "domain threaded\n");
3402 else
3403 seq_puts(seq, "domain\n");
3404
3405 return 0;
3406}
3407
3408static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3409 size_t nbytes, loff_t off)
3410{
3411 struct cgroup *cgrp;
3412 int ret;
3413
3414
3415 if (strcmp(strstrip(buf), "threaded"))
3416 return -EINVAL;
3417
3418
3419 cgrp = cgroup_kn_lock_live(of->kn, true);
3420 if (!cgrp)
3421 return -ENOENT;
3422
3423
3424 ret = cgroup_enable_threaded(cgrp);
3425
3426 cgroup_kn_unlock(of->kn);
3427 return ret ?: nbytes;
3428}
3429
3430static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3431{
3432 struct cgroup *cgrp = seq_css(seq)->cgroup;
3433 int descendants = READ_ONCE(cgrp->max_descendants);
3434
3435 if (descendants == INT_MAX)
3436 seq_puts(seq, "max\n");
3437 else
3438 seq_printf(seq, "%d\n", descendants);
3439
3440 return 0;
3441}
3442
3443static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3444 char *buf, size_t nbytes, loff_t off)
3445{
3446 struct cgroup *cgrp;
3447 int descendants;
3448 ssize_t ret;
3449
3450 buf = strstrip(buf);
3451 if (!strcmp(buf, "max")) {
3452 descendants = INT_MAX;
3453 } else {
3454 ret = kstrtoint(buf, 0, &descendants);
3455 if (ret)
3456 return ret;
3457 }
3458
3459 if (descendants < 0)
3460 return -ERANGE;
3461
3462 cgrp = cgroup_kn_lock_live(of->kn, false);
3463 if (!cgrp)
3464 return -ENOENT;
3465
3466 cgrp->max_descendants = descendants;
3467
3468 cgroup_kn_unlock(of->kn);
3469
3470 return nbytes;
3471}
3472
3473static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3474{
3475 struct cgroup *cgrp = seq_css(seq)->cgroup;
3476 int depth = READ_ONCE(cgrp->max_depth);
3477
3478 if (depth == INT_MAX)
3479 seq_puts(seq, "max\n");
3480 else
3481 seq_printf(seq, "%d\n", depth);
3482
3483 return 0;
3484}
3485
3486static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3487 char *buf, size_t nbytes, loff_t off)
3488{
3489 struct cgroup *cgrp;
3490 ssize_t ret;
3491 int depth;
3492
3493 buf = strstrip(buf);
3494 if (!strcmp(buf, "max")) {
3495 depth = INT_MAX;
3496 } else {
3497 ret = kstrtoint(buf, 0, &depth);
3498 if (ret)
3499 return ret;
3500 }
3501
3502 if (depth < 0)
3503 return -ERANGE;
3504
3505 cgrp = cgroup_kn_lock_live(of->kn, false);
3506 if (!cgrp)
3507 return -ENOENT;
3508
3509 cgrp->max_depth = depth;
3510
3511 cgroup_kn_unlock(of->kn);
3512
3513 return nbytes;
3514}
3515
3516static int cgroup_events_show(struct seq_file *seq, void *v)
3517{
3518 struct cgroup *cgrp = seq_css(seq)->cgroup;
3519
3520 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3521 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3522
3523 return 0;
3524}
3525
3526static int cgroup_stat_show(struct seq_file *seq, void *v)
3527{
3528 struct cgroup *cgroup = seq_css(seq)->cgroup;
3529
3530 seq_printf(seq, "nr_descendants %d\n",
3531 cgroup->nr_descendants);
3532 seq_printf(seq, "nr_dying_descendants %d\n",
3533 cgroup->nr_dying_descendants);
3534
3535 return 0;
3536}
3537
3538static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3539 struct cgroup *cgrp, int ssid)
3540{
3541 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3542 struct cgroup_subsys_state *css;
3543 int ret;
3544
3545 if (!ss->css_extra_stat_show)
3546 return 0;
3547
3548 css = cgroup_tryget_css(cgrp, ss);
3549 if (!css)
3550 return 0;
3551
3552 ret = ss->css_extra_stat_show(seq, css);
3553 css_put(css);
3554 return ret;
3555}
3556
3557static int cpu_stat_show(struct seq_file *seq, void *v)
3558{
3559 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3560 int ret = 0;
3561
3562 cgroup_base_stat_cputime_show(seq);
3563#ifdef CONFIG_CGROUP_SCHED
3564 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3565#endif
3566 return ret;
3567}
3568
3569#ifdef CONFIG_PSI
3570static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3571{
3572 struct cgroup *cgrp = seq_css(seq)->cgroup;
3573 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3574
3575 return psi_show(seq, psi, PSI_IO);
3576}
3577static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3578{
3579 struct cgroup *cgrp = seq_css(seq)->cgroup;
3580 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3581
3582 return psi_show(seq, psi, PSI_MEM);
3583}
3584static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3585{
3586 struct cgroup *cgrp = seq_css(seq)->cgroup;
3587 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3588
3589 return psi_show(seq, psi, PSI_CPU);
3590}
3591
3592static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3593 size_t nbytes, enum psi_res res)
3594{
3595 struct psi_trigger *new;
3596 struct cgroup *cgrp;
3597 struct psi_group *psi;
3598
3599 cgrp = cgroup_kn_lock_live(of->kn, false);
3600 if (!cgrp)
3601 return -ENODEV;
3602
3603 cgroup_get(cgrp);
3604 cgroup_kn_unlock(of->kn);
3605
3606 psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3607 new = psi_trigger_create(psi, buf, nbytes, res);
3608 if (IS_ERR(new)) {
3609 cgroup_put(cgrp);
3610 return PTR_ERR(new);
3611 }
3612
3613 psi_trigger_replace(&of->priv, new);
3614
3615 cgroup_put(cgrp);
3616
3617 return nbytes;
3618}
3619
3620static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3621 char *buf, size_t nbytes,
3622 loff_t off)
3623{
3624 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3625}
3626
3627static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3628 char *buf, size_t nbytes,
3629 loff_t off)
3630{
3631 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3632}
3633
3634static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3635 char *buf, size_t nbytes,
3636 loff_t off)
3637{
3638 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3639}
3640
3641static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3642 poll_table *pt)
3643{
3644 return psi_trigger_poll(&of->priv, of->file, pt);
3645}
3646
3647static void cgroup_pressure_release(struct kernfs_open_file *of)
3648{
3649 psi_trigger_replace(&of->priv, NULL);
3650}
3651
3652bool cgroup_psi_enabled(void)
3653{
3654 return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
3655}
3656
3657#else
3658bool cgroup_psi_enabled(void)
3659{
3660 return false;
3661}
3662
3663#endif
3664
3665static int cgroup_freeze_show(struct seq_file *seq, void *v)
3666{
3667 struct cgroup *cgrp = seq_css(seq)->cgroup;
3668
3669 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3670
3671 return 0;
3672}
3673
3674static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3675 char *buf, size_t nbytes, loff_t off)
3676{
3677 struct cgroup *cgrp;
3678 ssize_t ret;
3679 int freeze;
3680
3681 ret = kstrtoint(strstrip(buf), 0, &freeze);
3682 if (ret)
3683 return ret;
3684
3685 if (freeze < 0 || freeze > 1)
3686 return -ERANGE;
3687
3688 cgrp = cgroup_kn_lock_live(of->kn, false);
3689 if (!cgrp)
3690 return -ENOENT;
3691
3692 cgroup_freeze(cgrp, freeze);
3693
3694 cgroup_kn_unlock(of->kn);
3695
3696 return nbytes;
3697}
3698
3699static void __cgroup_kill(struct cgroup *cgrp)
3700{
3701 struct css_task_iter it;
3702 struct task_struct *task;
3703
3704 lockdep_assert_held(&cgroup_mutex);
3705
3706 spin_lock_irq(&css_set_lock);
3707 set_bit(CGRP_KILL, &cgrp->flags);
3708 spin_unlock_irq(&css_set_lock);
3709
3710 css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
3711 while ((task = css_task_iter_next(&it))) {
3712
3713 if (task->flags & PF_KTHREAD)
3714 continue;
3715
3716
3717 if (__fatal_signal_pending(task))
3718 continue;
3719
3720 send_sig(SIGKILL, task, 0);
3721 }
3722 css_task_iter_end(&it);
3723
3724 spin_lock_irq(&css_set_lock);
3725 clear_bit(CGRP_KILL, &cgrp->flags);
3726 spin_unlock_irq(&css_set_lock);
3727}
3728
3729static void cgroup_kill(struct cgroup *cgrp)
3730{
3731 struct cgroup_subsys_state *css;
3732 struct cgroup *dsct;
3733
3734 lockdep_assert_held(&cgroup_mutex);
3735
3736 cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
3737 __cgroup_kill(dsct);
3738}
3739
3740static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
3741 size_t nbytes, loff_t off)
3742{
3743 ssize_t ret = 0;
3744 int kill;
3745 struct cgroup *cgrp;
3746
3747 ret = kstrtoint(strstrip(buf), 0, &kill);
3748 if (ret)
3749 return ret;
3750
3751 if (kill != 1)
3752 return -ERANGE;
3753
3754 cgrp = cgroup_kn_lock_live(of->kn, false);
3755 if (!cgrp)
3756 return -ENOENT;
3757
3758
3759
3760
3761
3762
3763 if (cgroup_is_threaded(cgrp))
3764 ret = -EOPNOTSUPP;
3765 else
3766 cgroup_kill(cgrp);
3767
3768 cgroup_kn_unlock(of->kn);
3769
3770 return ret ?: nbytes;
3771}
3772
3773static int cgroup_file_open(struct kernfs_open_file *of)
3774{
3775 struct cftype *cft = of_cft(of);
3776
3777 if (cft->open)
3778 return cft->open(of);
3779 return 0;
3780}
3781
3782static void cgroup_file_release(struct kernfs_open_file *of)
3783{
3784 struct cftype *cft = of_cft(of);
3785
3786 if (cft->release)
3787 cft->release(of);
3788}
3789
3790static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3791 size_t nbytes, loff_t off)
3792{
3793 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3794 struct cgroup *cgrp = of->kn->parent->priv;
3795 struct cftype *cft = of_cft(of);
3796 struct cgroup_subsys_state *css;
3797 int ret;
3798
3799 if (!nbytes)
3800 return 0;
3801
3802
3803
3804
3805
3806
3807
3808 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3809 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3810 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3811 return -EPERM;
3812
3813 if (cft->write)
3814 return cft->write(of, buf, nbytes, off);
3815
3816
3817
3818
3819
3820
3821
3822 rcu_read_lock();
3823 css = cgroup_css(cgrp, cft->ss);
3824 rcu_read_unlock();
3825
3826 if (cft->write_u64) {
3827 unsigned long long v;
3828 ret = kstrtoull(buf, 0, &v);
3829 if (!ret)
3830 ret = cft->write_u64(css, cft, v);
3831 } else if (cft->write_s64) {
3832 long long v;
3833 ret = kstrtoll(buf, 0, &v);
3834 if (!ret)
3835 ret = cft->write_s64(css, cft, v);
3836 } else {
3837 ret = -EINVAL;
3838 }
3839
3840 return ret ?: nbytes;
3841}
3842
3843static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3844{
3845 struct cftype *cft = of_cft(of);
3846
3847 if (cft->poll)
3848 return cft->poll(of, pt);
3849
3850 return kernfs_generic_poll(of, pt);
3851}
3852
3853static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3854{
3855 return seq_cft(seq)->seq_start(seq, ppos);
3856}
3857
3858static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3859{
3860 return seq_cft(seq)->seq_next(seq, v, ppos);
3861}
3862
3863static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3864{
3865 if (seq_cft(seq)->seq_stop)
3866 seq_cft(seq)->seq_stop(seq, v);
3867}
3868
3869static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3870{
3871 struct cftype *cft = seq_cft(m);
3872 struct cgroup_subsys_state *css = seq_css(m);
3873
3874 if (cft->seq_show)
3875 return cft->seq_show(m, arg);
3876
3877 if (cft->read_u64)
3878 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3879 else if (cft->read_s64)
3880 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3881 else
3882 return -EINVAL;
3883 return 0;
3884}
3885
3886static struct kernfs_ops cgroup_kf_single_ops = {
3887 .atomic_write_len = PAGE_SIZE,
3888 .open = cgroup_file_open,
3889 .release = cgroup_file_release,
3890 .write = cgroup_file_write,
3891 .poll = cgroup_file_poll,
3892 .seq_show = cgroup_seqfile_show,
3893};
3894
3895static struct kernfs_ops cgroup_kf_ops = {
3896 .atomic_write_len = PAGE_SIZE,
3897 .open = cgroup_file_open,
3898 .release = cgroup_file_release,
3899 .write = cgroup_file_write,
3900 .poll = cgroup_file_poll,
3901 .seq_start = cgroup_seqfile_start,
3902 .seq_next = cgroup_seqfile_next,
3903 .seq_stop = cgroup_seqfile_stop,
3904 .seq_show = cgroup_seqfile_show,
3905};
3906
3907
3908static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3909{
3910 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3911 .ia_uid = current_fsuid(),
3912 .ia_gid = current_fsgid(), };
3913
3914 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3915 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3916 return 0;
3917
3918 return kernfs_setattr(kn, &iattr);
3919}
3920
3921static void cgroup_file_notify_timer(struct timer_list *timer)
3922{
3923 cgroup_file_notify(container_of(timer, struct cgroup_file,
3924 notify_timer));
3925}
3926
3927static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3928 struct cftype *cft)
3929{
3930 char name[CGROUP_FILE_NAME_MAX];
3931 struct kernfs_node *kn;
3932 struct lock_class_key *key = NULL;
3933 int ret;
3934
3935#ifdef CONFIG_DEBUG_LOCK_ALLOC
3936 key = &cft->lockdep_key;
3937#endif
3938 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3939 cgroup_file_mode(cft),
3940 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
3941 0, cft->kf_ops, cft,
3942 NULL, key);
3943 if (IS_ERR(kn))
3944 return PTR_ERR(kn);
3945
3946 ret = cgroup_kn_set_ugid(kn);
3947 if (ret) {
3948 kernfs_remove(kn);
3949 return ret;
3950 }
3951
3952 if (cft->file_offset) {
3953 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3954
3955 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3956
3957 spin_lock_irq(&cgroup_file_kn_lock);
3958 cfile->kn = kn;
3959 spin_unlock_irq(&cgroup_file_kn_lock);
3960 }
3961
3962 return 0;
3963}
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3976 struct cgroup *cgrp, struct cftype cfts[],
3977 bool is_add)
3978{
3979 struct cftype *cft, *cft_end = NULL;
3980 int ret = 0;
3981
3982 lockdep_assert_held(&cgroup_mutex);
3983
3984restart:
3985 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3986
3987 if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
3988 continue;
3989 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3990 continue;
3991 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3992 continue;
3993 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3994 continue;
3995 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3996 continue;
3997 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
3998 continue;
3999 if (is_add) {
4000 ret = cgroup_add_file(css, cgrp, cft);
4001 if (ret) {
4002 pr_warn("%s: failed to add %s, err=%d\n",
4003 __func__, cft->name, ret);
4004 cft_end = cft;
4005 is_add = false;
4006 goto restart;
4007 }
4008 } else {
4009 cgroup_rm_file(cgrp, cft);
4010 }
4011 }
4012 return ret;
4013}
4014
4015static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
4016{
4017 struct cgroup_subsys *ss = cfts[0].ss;
4018 struct cgroup *root = &ss->root->cgrp;
4019 struct cgroup_subsys_state *css;
4020 int ret = 0;
4021
4022 lockdep_assert_held(&cgroup_mutex);
4023
4024
4025 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
4026 struct cgroup *cgrp = css->cgroup;
4027
4028 if (!(css->flags & CSS_VISIBLE))
4029 continue;
4030
4031 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
4032 if (ret)
4033 break;
4034 }
4035
4036 if (is_add && !ret)
4037 kernfs_activate(root->kn);
4038 return ret;
4039}
4040
4041static void cgroup_exit_cftypes(struct cftype *cfts)
4042{
4043 struct cftype *cft;
4044
4045 for (cft = cfts; cft->name[0] != '\0'; cft++) {
4046
4047 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
4048 kfree(cft->kf_ops);
4049 cft->kf_ops = NULL;
4050 cft->ss = NULL;
4051
4052
4053 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
4054 }
4055}
4056
4057static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4058{
4059 struct cftype *cft;
4060
4061 for (cft = cfts; cft->name[0] != '\0'; cft++) {
4062 struct kernfs_ops *kf_ops;
4063
4064 WARN_ON(cft->ss || cft->kf_ops);
4065
4066 if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
4067 continue;
4068
4069 if (cft->seq_start)
4070 kf_ops = &cgroup_kf_ops;
4071 else
4072 kf_ops = &cgroup_kf_single_ops;
4073
4074
4075
4076
4077
4078 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
4079 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
4080 if (!kf_ops) {
4081 cgroup_exit_cftypes(cfts);
4082 return -ENOMEM;
4083 }
4084 kf_ops->atomic_write_len = cft->max_write_len;
4085 }
4086
4087 cft->kf_ops = kf_ops;
4088 cft->ss = ss;
4089 }
4090
4091 return 0;
4092}
4093
4094static int cgroup_rm_cftypes_locked(struct cftype *cfts)
4095{
4096 lockdep_assert_held(&cgroup_mutex);
4097
4098 if (!cfts || !cfts[0].ss)
4099 return -ENOENT;
4100
4101 list_del(&cfts->node);
4102 cgroup_apply_cftypes(cfts, false);
4103 cgroup_exit_cftypes(cfts);
4104 return 0;
4105}
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118int cgroup_rm_cftypes(struct cftype *cfts)
4119{
4120 int ret;
4121
4122 mutex_lock(&cgroup_mutex);
4123 ret = cgroup_rm_cftypes_locked(cfts);
4124 mutex_unlock(&cgroup_mutex);
4125 return ret;
4126}
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4143{
4144 int ret;
4145
4146 if (!cgroup_ssid_enabled(ss->id))
4147 return 0;
4148
4149 if (!cfts || cfts[0].name[0] == '\0')
4150 return 0;
4151
4152 ret = cgroup_init_cftypes(ss, cfts);
4153 if (ret)
4154 return ret;
4155
4156 mutex_lock(&cgroup_mutex);
4157
4158 list_add_tail(&cfts->node, &ss->cfts);
4159 ret = cgroup_apply_cftypes(cfts, true);
4160 if (ret)
4161 cgroup_rm_cftypes_locked(cfts);
4162
4163 mutex_unlock(&cgroup_mutex);
4164 return ret;
4165}
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4176{
4177 struct cftype *cft;
4178
4179 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4180 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4181 return cgroup_add_cftypes(ss, cfts);
4182}
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4193{
4194 struct cftype *cft;
4195
4196 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4197 cft->flags |= __CFTYPE_NOT_ON_DFL;
4198 return cgroup_add_cftypes(ss, cfts);
4199}
4200
4201
4202
4203
4204
4205
4206
4207void cgroup_file_notify(struct cgroup_file *cfile)
4208{
4209 unsigned long flags;
4210
4211 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4212 if (cfile->kn) {
4213 unsigned long last = cfile->notified_at;
4214 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4215
4216 if (time_in_range(jiffies, last, next)) {
4217 timer_reduce(&cfile->notify_timer, next);
4218 } else {
4219 kernfs_notify(cfile->kn);
4220 cfile->notified_at = jiffies;
4221 }
4222 }
4223 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4224}
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4244 struct cgroup_subsys_state *parent)
4245{
4246 struct cgroup_subsys_state *next;
4247
4248 cgroup_assert_mutex_or_rcu_locked();
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270 if (!pos) {
4271 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4272 } else if (likely(!(pos->flags & CSS_RELEASED))) {
4273 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
4274 } else {
4275 list_for_each_entry_rcu(next, &parent->children, sibling,
4276 lockdep_is_held(&cgroup_mutex))
4277 if (next->serial_nr > pos->serial_nr)
4278 break;
4279 }
4280
4281
4282
4283
4284
4285 if (&next->sibling != &parent->children)
4286 return next;
4287 return NULL;
4288}
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311struct cgroup_subsys_state *
4312css_next_descendant_pre(struct cgroup_subsys_state *pos,
4313 struct cgroup_subsys_state *root)
4314{
4315 struct cgroup_subsys_state *next;
4316
4317 cgroup_assert_mutex_or_rcu_locked();
4318
4319
4320 if (!pos)
4321 return root;
4322
4323
4324 next = css_next_child(NULL, pos);
4325 if (next)
4326 return next;
4327
4328
4329 while (pos != root) {
4330 next = css_next_child(pos, pos->parent);
4331 if (next)
4332 return next;
4333 pos = pos->parent;
4334 }
4335
4336 return NULL;
4337}
4338EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353struct cgroup_subsys_state *
4354css_rightmost_descendant(struct cgroup_subsys_state *pos)
4355{
4356 struct cgroup_subsys_state *last, *tmp;
4357
4358 cgroup_assert_mutex_or_rcu_locked();
4359
4360 do {
4361 last = pos;
4362
4363 pos = NULL;
4364 css_for_each_child(tmp, last)
4365 pos = tmp;
4366 } while (pos);
4367
4368 return last;
4369}
4370
4371static struct cgroup_subsys_state *
4372css_leftmost_descendant(struct cgroup_subsys_state *pos)
4373{
4374 struct cgroup_subsys_state *last;
4375
4376 do {
4377 last = pos;
4378 pos = css_next_child(NULL, pos);
4379 } while (pos);
4380
4381 return last;
4382}
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406struct cgroup_subsys_state *
4407css_next_descendant_post(struct cgroup_subsys_state *pos,
4408 struct cgroup_subsys_state *root)
4409{
4410 struct cgroup_subsys_state *next;
4411
4412 cgroup_assert_mutex_or_rcu_locked();
4413
4414
4415 if (!pos)
4416 return css_leftmost_descendant(root);
4417
4418
4419 if (pos == root)
4420 return NULL;
4421
4422
4423 next = css_next_child(pos, pos->parent);
4424 if (next)
4425 return css_leftmost_descendant(next);
4426
4427
4428 return pos->parent;
4429}
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439bool css_has_online_children(struct cgroup_subsys_state *css)
4440{
4441 struct cgroup_subsys_state *child;
4442 bool ret = false;
4443
4444 rcu_read_lock();
4445 css_for_each_child(child, css) {
4446 if (child->flags & CSS_ONLINE) {
4447 ret = true;
4448 break;
4449 }
4450 }
4451 rcu_read_unlock();
4452 return ret;
4453}
4454
4455static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4456{
4457 struct list_head *l;
4458 struct cgrp_cset_link *link;
4459 struct css_set *cset;
4460
4461 lockdep_assert_held(&css_set_lock);
4462
4463
4464 if (it->tcset_pos) {
4465 l = it->tcset_pos->next;
4466
4467 if (l != it->tcset_head) {
4468 it->tcset_pos = l;
4469 return container_of(l, struct css_set,
4470 threaded_csets_node);
4471 }
4472
4473 it->tcset_pos = NULL;
4474 }
4475
4476
4477 l = it->cset_pos;
4478 l = l->next;
4479 if (l == it->cset_head) {
4480 it->cset_pos = NULL;
4481 return NULL;
4482 }
4483
4484 if (it->ss) {
4485 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4486 } else {
4487 link = list_entry(l, struct cgrp_cset_link, cset_link);
4488 cset = link->cset;
4489 }
4490
4491 it->cset_pos = l;
4492
4493
4494 if (it->flags & CSS_TASK_ITER_THREADED) {
4495 if (it->cur_dcset)
4496 put_css_set_locked(it->cur_dcset);
4497 it->cur_dcset = cset;
4498 get_css_set(cset);
4499
4500 it->tcset_head = &cset->threaded_csets;
4501 it->tcset_pos = &cset->threaded_csets;
4502 }
4503
4504 return cset;
4505}
4506
4507
4508
4509
4510
4511
4512
4513static void css_task_iter_advance_css_set(struct css_task_iter *it)
4514{
4515 struct css_set *cset;
4516
4517 lockdep_assert_held(&css_set_lock);
4518
4519
4520 while ((cset = css_task_iter_next_css_set(it))) {
4521 if (!list_empty(&cset->tasks)) {
4522 it->cur_tasks_head = &cset->tasks;
4523 break;
4524 } else if (!list_empty(&cset->mg_tasks)) {
4525 it->cur_tasks_head = &cset->mg_tasks;
4526 break;
4527 } else if (!list_empty(&cset->dying_tasks)) {
4528 it->cur_tasks_head = &cset->dying_tasks;
4529 break;
4530 }
4531 }
4532 if (!cset) {
4533 it->task_pos = NULL;
4534 return;
4535 }
4536 it->task_pos = it->cur_tasks_head->next;
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553 if (it->cur_cset) {
4554 list_del(&it->iters_node);
4555 put_css_set_locked(it->cur_cset);
4556 }
4557 get_css_set(cset);
4558 it->cur_cset = cset;
4559 list_add(&it->iters_node, &cset->task_iters);
4560}
4561
4562static void css_task_iter_skip(struct css_task_iter *it,
4563 struct task_struct *task)
4564{
4565 lockdep_assert_held(&css_set_lock);
4566
4567 if (it->task_pos == &task->cg_list) {
4568 it->task_pos = it->task_pos->next;
4569 it->flags |= CSS_TASK_ITER_SKIPPED;
4570 }
4571}
4572
4573static void css_task_iter_advance(struct css_task_iter *it)
4574{
4575 struct task_struct *task;
4576
4577 lockdep_assert_held(&css_set_lock);
4578repeat:
4579 if (it->task_pos) {
4580
4581
4582
4583
4584
4585 if (it->flags & CSS_TASK_ITER_SKIPPED)
4586 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4587 else
4588 it->task_pos = it->task_pos->next;
4589
4590 if (it->task_pos == &it->cur_cset->tasks) {
4591 it->cur_tasks_head = &it->cur_cset->mg_tasks;
4592 it->task_pos = it->cur_tasks_head->next;
4593 }
4594 if (it->task_pos == &it->cur_cset->mg_tasks) {
4595 it->cur_tasks_head = &it->cur_cset->dying_tasks;
4596 it->task_pos = it->cur_tasks_head->next;
4597 }
4598 if (it->task_pos == &it->cur_cset->dying_tasks)
4599 css_task_iter_advance_css_set(it);
4600 } else {
4601
4602 css_task_iter_advance_css_set(it);
4603 }
4604
4605 if (!it->task_pos)
4606 return;
4607
4608 task = list_entry(it->task_pos, struct task_struct, cg_list);
4609
4610 if (it->flags & CSS_TASK_ITER_PROCS) {
4611
4612 if (!thread_group_leader(task))
4613 goto repeat;
4614
4615
4616 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
4617 !atomic_read(&task->signal->live))
4618 goto repeat;
4619 } else {
4620
4621 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
4622 goto repeat;
4623 }
4624}
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4638 struct css_task_iter *it)
4639{
4640 memset(it, 0, sizeof(*it));
4641
4642 spin_lock_irq(&css_set_lock);
4643
4644 it->ss = css->ss;
4645 it->flags = flags;
4646
4647 if (it->ss)
4648 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4649 else
4650 it->cset_pos = &css->cgroup->cset_links;
4651
4652 it->cset_head = it->cset_pos;
4653
4654 css_task_iter_advance(it);
4655
4656 spin_unlock_irq(&css_set_lock);
4657}
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667struct task_struct *css_task_iter_next(struct css_task_iter *it)
4668{
4669 if (it->cur_task) {
4670 put_task_struct(it->cur_task);
4671 it->cur_task = NULL;
4672 }
4673
4674 spin_lock_irq(&css_set_lock);
4675
4676
4677 if (it->flags & CSS_TASK_ITER_SKIPPED)
4678 css_task_iter_advance(it);
4679
4680 if (it->task_pos) {
4681 it->cur_task = list_entry(it->task_pos, struct task_struct,
4682 cg_list);
4683 get_task_struct(it->cur_task);
4684 css_task_iter_advance(it);
4685 }
4686
4687 spin_unlock_irq(&css_set_lock);
4688
4689 return it->cur_task;
4690}
4691
4692
4693
4694
4695
4696
4697
4698void css_task_iter_end(struct css_task_iter *it)
4699{
4700 if (it->cur_cset) {
4701 spin_lock_irq(&css_set_lock);
4702 list_del(&it->iters_node);
4703 put_css_set_locked(it->cur_cset);
4704 spin_unlock_irq(&css_set_lock);
4705 }
4706
4707 if (it->cur_dcset)
4708 put_css_set(it->cur_dcset);
4709
4710 if (it->cur_task)
4711 put_task_struct(it->cur_task);
4712}
4713
4714static void cgroup_procs_release(struct kernfs_open_file *of)
4715{
4716 if (of->priv) {
4717 css_task_iter_end(of->priv);
4718 kfree(of->priv);
4719 }
4720}
4721
4722static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4723{
4724 struct kernfs_open_file *of = s->private;
4725 struct css_task_iter *it = of->priv;
4726
4727 if (pos)
4728 (*pos)++;
4729
4730 return css_task_iter_next(it);
4731}
4732
4733static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4734 unsigned int iter_flags)
4735{
4736 struct kernfs_open_file *of = s->private;
4737 struct cgroup *cgrp = seq_css(s)->cgroup;
4738 struct css_task_iter *it = of->priv;
4739
4740
4741
4742
4743
4744 if (!it) {
4745 if (WARN_ON_ONCE((*pos)))
4746 return ERR_PTR(-EINVAL);
4747
4748 it = kzalloc(sizeof(*it), GFP_KERNEL);
4749 if (!it)
4750 return ERR_PTR(-ENOMEM);
4751 of->priv = it;
4752 css_task_iter_start(&cgrp->self, iter_flags, it);
4753 } else if (!(*pos)) {
4754 css_task_iter_end(it);
4755 css_task_iter_start(&cgrp->self, iter_flags, it);
4756 } else
4757 return it->cur_task;
4758
4759 return cgroup_procs_next(s, NULL, NULL);
4760}
4761
4762static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4763{
4764 struct cgroup *cgrp = seq_css(s)->cgroup;
4765
4766
4767
4768
4769
4770
4771
4772 if (cgroup_is_threaded(cgrp))
4773 return ERR_PTR(-EOPNOTSUPP);
4774
4775 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4776 CSS_TASK_ITER_THREADED);
4777}
4778
/*
 * seq_file ->show(): @v is the task returned by ->start()/->next(); print
 * its pid as reported by task_pid_vnr(), one per line.
 */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
	struct task_struct *task = v;

	seq_printf(s, "%d\n", task_pid_vnr(task));
	return 0;
}
4784
4785static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4786{
4787 int ret;
4788 struct inode *inode;
4789
4790 lockdep_assert_held(&cgroup_mutex);
4791
4792 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4793 if (!inode)
4794 return -ENOMEM;
4795
4796 ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
4797 iput(inode);
4798 return ret;
4799}
4800
4801static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4802 struct cgroup *dst_cgrp,
4803 struct super_block *sb)
4804{
4805 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4806 struct cgroup *com_cgrp = src_cgrp;
4807 int ret;
4808
4809 lockdep_assert_held(&cgroup_mutex);
4810
4811
4812 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4813 com_cgrp = cgroup_parent(com_cgrp);
4814
4815
4816 ret = cgroup_may_write(com_cgrp, sb);
4817 if (ret)
4818 return ret;
4819
4820
4821
4822
4823
4824 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4825 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4826 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4827 return -ENOENT;
4828
4829 return 0;
4830}
4831
4832static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4833 struct cgroup *dst_cgrp,
4834 struct super_block *sb, bool threadgroup)
4835{
4836 int ret = 0;
4837
4838 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
4839 if (ret)
4840 return ret;
4841
4842 ret = cgroup_migrate_vet_dst(dst_cgrp);
4843 if (ret)
4844 return ret;
4845
4846 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4847 ret = -EOPNOTSUPP;
4848
4849 return ret;
4850}
4851
4852static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
4853 bool threadgroup)
4854{
4855 struct cgroup *src_cgrp, *dst_cgrp;
4856 struct task_struct *task;
4857 ssize_t ret;
4858 bool locked;
4859
4860 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4861 if (!dst_cgrp)
4862 return -ENODEV;
4863
4864 task = cgroup_procs_write_start(buf, threadgroup, &locked);
4865 ret = PTR_ERR_OR_ZERO(task);
4866 if (ret)
4867 goto out_unlock;
4868
4869
4870 spin_lock_irq(&css_set_lock);
4871 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4872 spin_unlock_irq(&css_set_lock);
4873
4874
4875 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4876 of->file->f_path.dentry->d_sb, threadgroup);
4877 if (ret)
4878 goto out_finish;
4879
4880 ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
4881
4882out_finish:
4883 cgroup_procs_write_finish(task, locked);
4884out_unlock:
4885 cgroup_kn_unlock(of->kn);
4886
4887 return ret;
4888}
4889
4890static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4891 char *buf, size_t nbytes, loff_t off)
4892{
4893 return __cgroup_procs_write(of, buf, true) ?: nbytes;
4894}
4895
4896static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4897{
4898 return __cgroup_procs_start(s, pos, 0);
4899}
4900
4901static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4902 char *buf, size_t nbytes, loff_t off)
4903{
4904 return __cgroup_procs_write(of, buf, false) ?: nbytes;
4905}
4906
4907
/*
 * Core cgroup interface files.  The table is terminated by an empty entry
 * and is initialized from cgroup_init() via cgroup_init_cftypes().
 * NOTE(review): presumably the default-hierarchy (v2) file set, given that
 * cgroup1_base_files is initialized separately - confirm.
 */
static struct cftype cgroup_base_files[] = {
	{
		/* not created on the root cgroup */
		.name = "cgroup.type",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cgroup_type_show,
		.write = cgroup_type_write,
	},
	{
		/* process list; the file handle is kept in cgroup->procs_file */
		.name = "cgroup.procs",
		.flags = CFTYPE_NS_DELEGATABLE,
		.file_offset = offsetof(struct cgroup, procs_file),
		.release = cgroup_procs_release,
		.seq_start = cgroup_procs_start,
		.seq_next = cgroup_procs_next,
		.seq_show = cgroup_procs_show,
		.write = cgroup_procs_write,
	},
	{
		/* thread list; shares release/next/show with cgroup.procs */
		.name = "cgroup.threads",
		.flags = CFTYPE_NS_DELEGATABLE,
		.release = cgroup_procs_release,
		.seq_start = cgroup_threads_start,
		.seq_next = cgroup_procs_next,
		.seq_show = cgroup_procs_show,
		.write = cgroup_threads_write,
	},
	{
		/* read-only */
		.name = "cgroup.controllers",
		.seq_show = cgroup_controllers_show,
	},
	{
		.name = "cgroup.subtree_control",
		.flags = CFTYPE_NS_DELEGATABLE,
		.seq_show = cgroup_subtree_control_show,
		.write = cgroup_subtree_control_write,
	},
	{
		/* read-only; the file handle is kept in cgroup->events_file */
		.name = "cgroup.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cgroup, events_file),
		.seq_show = cgroup_events_show,
	},
	{
		.name = "cgroup.max.descendants",
		.seq_show = cgroup_max_descendants_show,
		.write = cgroup_max_descendants_write,
	},
	{
		.name = "cgroup.max.depth",
		.seq_show = cgroup_max_depth_show,
		.write = cgroup_max_depth_write,
	},
	{
		/* read-only */
		.name = "cgroup.stat",
		.seq_show = cgroup_stat_show,
	},
	{
		.name = "cgroup.freeze",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cgroup_freeze_show,
		.write = cgroup_freeze_write,
	},
	{
		/* write-only: no show handler */
		.name = "cgroup.kill",
		.flags = CFTYPE_NOT_ON_ROOT,
		.write = cgroup_kill_write,
	},
	{
		/* read-only */
		.name = "cpu.stat",
		.seq_show = cpu_stat_show,
	},
#ifdef CONFIG_PSI
	/* pressure files, only built with CONFIG_PSI; share poll/release */
	{
		.name = "io.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_io_pressure_show,
		.write = cgroup_io_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
	{
		.name = "memory.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_memory_pressure_show,
		.write = cgroup_memory_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
	{
		.name = "cpu.pressure",
		.flags = CFTYPE_PRESSURE,
		.seq_show = cgroup_cpu_pressure_show,
		.write = cgroup_cpu_pressure_write,
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
#endif
	{ }	/* terminate */
};
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030static void css_free_rwork_fn(struct work_struct *work)
5031{
5032 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
5033 struct cgroup_subsys_state, destroy_rwork);
5034 struct cgroup_subsys *ss = css->ss;
5035 struct cgroup *cgrp = css->cgroup;
5036
5037 percpu_ref_exit(&css->refcnt);
5038
5039 if (ss) {
5040
5041 struct cgroup_subsys_state *parent = css->parent;
5042 int id = css->id;
5043
5044 ss->css_free(css);
5045 cgroup_idr_remove(&ss->css_idr, id);
5046 cgroup_put(cgrp);
5047
5048 if (parent)
5049 css_put(parent);
5050 } else {
5051
5052 atomic_dec(&cgrp->root->nr_cgrps);
5053 cgroup1_pidlist_destroy_all(cgrp);
5054 cancel_work_sync(&cgrp->release_agent_work);
5055
5056 if (cgroup_parent(cgrp)) {
5057
5058
5059
5060
5061
5062
5063 cgroup_put(cgroup_parent(cgrp));
5064 kernfs_put(cgrp->kn);
5065 psi_cgroup_free(cgrp);
5066 cgroup_rstat_exit(cgrp);
5067 kfree(cgrp);
5068 } else {
5069
5070
5071
5072
5073
5074 cgroup_destroy_root(cgrp->root);
5075 }
5076 }
5077}
5078
5079static void css_release_work_fn(struct work_struct *work)
5080{
5081 struct cgroup_subsys_state *css =
5082 container_of(work, struct cgroup_subsys_state, destroy_work);
5083 struct cgroup_subsys *ss = css->ss;
5084 struct cgroup *cgrp = css->cgroup;
5085
5086 mutex_lock(&cgroup_mutex);
5087
5088 css->flags |= CSS_RELEASED;
5089 list_del_rcu(&css->sibling);
5090
5091 if (ss) {
5092
5093 if (!list_empty(&css->rstat_css_node)) {
5094 cgroup_rstat_flush(cgrp);
5095 list_del_rcu(&css->rstat_css_node);
5096 }
5097
5098 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
5099 if (ss->css_released)
5100 ss->css_released(css);
5101 } else {
5102 struct cgroup *tcgrp;
5103
5104
5105 TRACE_CGROUP_PATH(release, cgrp);
5106
5107 cgroup_rstat_flush(cgrp);
5108
5109 spin_lock_irq(&css_set_lock);
5110 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5111 tcgrp = cgroup_parent(tcgrp))
5112 tcgrp->nr_dying_descendants--;
5113 spin_unlock_irq(&css_set_lock);
5114
5115
5116
5117
5118
5119
5120
5121
5122 if (cgrp->kn)
5123 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5124 NULL);
5125 }
5126
5127 mutex_unlock(&cgroup_mutex);
5128
5129 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5130 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5131}
5132
5133static void css_release(struct percpu_ref *ref)
5134{
5135 struct cgroup_subsys_state *css =
5136 container_of(ref, struct cgroup_subsys_state, refcnt);
5137
5138 INIT_WORK(&css->destroy_work, css_release_work_fn);
5139 queue_work(cgroup_destroy_wq, &css->destroy_work);
5140}
5141
5142static void init_and_link_css(struct cgroup_subsys_state *css,
5143 struct cgroup_subsys *ss, struct cgroup *cgrp)
5144{
5145 lockdep_assert_held(&cgroup_mutex);
5146
5147 cgroup_get_live(cgrp);
5148
5149 memset(css, 0, sizeof(*css));
5150 css->cgroup = cgrp;
5151 css->ss = ss;
5152 css->id = -1;
5153 INIT_LIST_HEAD(&css->sibling);
5154 INIT_LIST_HEAD(&css->children);
5155 INIT_LIST_HEAD(&css->rstat_css_node);
5156 css->serial_nr = css_serial_nr_next++;
5157 atomic_set(&css->online_cnt, 0);
5158
5159 if (cgroup_parent(cgrp)) {
5160 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5161 css_get(css->parent);
5162 }
5163
5164 if (ss->css_rstat_flush)
5165 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5166
5167 BUG_ON(cgroup_css(cgrp, ss));
5168}
5169
5170
5171static int online_css(struct cgroup_subsys_state *css)
5172{
5173 struct cgroup_subsys *ss = css->ss;
5174 int ret = 0;
5175
5176 lockdep_assert_held(&cgroup_mutex);
5177
5178 if (ss->css_online)
5179 ret = ss->css_online(css);
5180 if (!ret) {
5181 css->flags |= CSS_ONLINE;
5182 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5183
5184 atomic_inc(&css->online_cnt);
5185 if (css->parent)
5186 atomic_inc(&css->parent->online_cnt);
5187 }
5188 return ret;
5189}
5190
5191
5192static void offline_css(struct cgroup_subsys_state *css)
5193{
5194 struct cgroup_subsys *ss = css->ss;
5195
5196 lockdep_assert_held(&cgroup_mutex);
5197
5198 if (!(css->flags & CSS_ONLINE))
5199 return;
5200
5201 if (ss->css_offline)
5202 ss->css_offline(css);
5203
5204 css->flags &= ~CSS_ONLINE;
5205 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5206
5207 wake_up_all(&css->cgroup->offline_waitq);
5208}
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5220 struct cgroup_subsys *ss)
5221{
5222 struct cgroup *parent = cgroup_parent(cgrp);
5223 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5224 struct cgroup_subsys_state *css;
5225 int err;
5226
5227 lockdep_assert_held(&cgroup_mutex);
5228
5229 css = ss->css_alloc(parent_css);
5230 if (!css)
5231 css = ERR_PTR(-ENOMEM);
5232 if (IS_ERR(css))
5233 return css;
5234
5235 init_and_link_css(css, ss, cgrp);
5236
5237 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5238 if (err)
5239 goto err_free_css;
5240
5241 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5242 if (err < 0)
5243 goto err_free_css;
5244 css->id = err;
5245
5246
5247 list_add_tail_rcu(&css->sibling, &parent_css->children);
5248 cgroup_idr_replace(&ss->css_idr, css, css->id);
5249
5250 err = online_css(css);
5251 if (err)
5252 goto err_list_del;
5253
5254 return css;
5255
5256err_list_del:
5257 list_del_rcu(&css->sibling);
5258err_free_css:
5259 list_del_rcu(&css->rstat_css_node);
5260 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5261 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5262 return ERR_PTR(err);
5263}
5264
5265
5266
5267
5268
5269
/*
 * Allocate and set up a new cgroup named @name under @parent, including
 * its kernfs directory (created with @mode).
 *
 * Returns the new cgroup or an ERR_PTR().  On failure every step that has
 * already succeeded is undone in reverse order through the out_* labels.
 * On success the cgroup is linked into @parent's children list and a
 * reference on @parent is held.
 */
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
				    umode_t mode)
{
	struct cgroup_root *root = parent->root;
	struct cgroup *cgrp, *tcgrp;
	struct kernfs_node *kn;
	int level = parent->level + 1;
	int ret;

	/* ancestor_ids[] needs one slot per level, 0..level inclusive */
	cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
		       GFP_KERNEL);
	if (!cgrp)
		return ERR_PTR(-ENOMEM);

	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
	if (ret)
		goto out_free_cgrp;

	ret = cgroup_rstat_init(cgrp);
	if (ret)
		goto out_cancel_ref;

	/* create the directory; @cgrp becomes the kernfs node's ->priv */
	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
	if (IS_ERR(kn)) {
		ret = PTR_ERR(kn);
		goto out_stat_exit;
	}
	cgrp->kn = kn;

	init_cgroup_housekeeping(cgrp);

	cgrp->self.parent = &parent->self;
	cgrp->root = root;
	cgrp->level = level;

	ret = psi_cgroup_alloc(cgrp);
	if (ret)
		goto out_kernfs_remove;

	ret = cgroup_bpf_inherit(cgrp);
	if (ret)
		goto out_psi_free;

	/*
	 * The new cgroup inherits the parent's effective freeze count.  If
	 * that is non-zero the child starts out with both CGRP_FREEZE and
	 * CGRP_FROZEN set.
	 */
	cgrp->freezer.e_freeze = parent->freezer.e_freeze;
	if (cgrp->freezer.e_freeze) {
		/*
		 * NOTE(review): marking the child FROZEN right away presumably
		 * relies on it having no tasks yet - confirm.
		 */
		set_bit(CGRP_FREEZE, &cgrp->flags);
		set_bit(CGRP_FROZEN, &cgrp->flags);
	}

	/*
	 * Record the id of every cgroup from @cgrp up to the root, indexed by
	 * level, and account the new descendant in each ancestor.
	 */
	spin_lock_irq(&css_set_lock);
	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
		cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);

		if (tcgrp != cgrp) {
			tcgrp->nr_descendants++;

			/*
			 * A cgroup that is born frozen also counts as a frozen
			 * descendant of each ancestor.
			 */
			if (cgrp->freezer.e_freeze)
				tcgrp->freezer.nr_frozen_descendants++;
		}
	}
	spin_unlock_irq(&css_set_lock);

	/* these two flags are inherited from the parent */
	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

	cgrp->self.serial_nr = css_serial_nr_next++;

	/* nothing below can fail: commit to creation */
	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
	atomic_inc(&root->nr_cgrps);
	cgroup_get_live(parent);

	/*
	 * Outside the default hierarchy subtree_control is seeded from
	 * cgroup_control(); on the default hierarchy it is left as allocated
	 * (zero).
	 */
	if (!cgroup_on_dfl(cgrp))
		cgrp->subtree_control = cgroup_control(cgrp);

	cgroup_propagate_control(cgrp);

	return cgrp;

out_psi_free:
	psi_cgroup_free(cgrp);
out_kernfs_remove:
	kernfs_remove(cgrp->kn);
out_stat_exit:
	cgroup_rstat_exit(cgrp);
out_cancel_ref:
	percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
	kfree(cgrp);
	return ERR_PTR(ret);
}
5385
5386static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5387{
5388 struct cgroup *cgroup;
5389 int ret = false;
5390 int level = 1;
5391
5392 lockdep_assert_held(&cgroup_mutex);
5393
5394 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5395 if (cgroup->nr_descendants >= cgroup->max_descendants)
5396 goto fail;
5397
5398 if (level > cgroup->max_depth)
5399 goto fail;
5400
5401 level++;
5402 }
5403
5404 ret = true;
5405fail:
5406 return ret;
5407}
5408
5409int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5410{
5411 struct cgroup *parent, *cgrp;
5412 int ret;
5413
5414
5415 if (strchr(name, '\n'))
5416 return -EINVAL;
5417
5418 parent = cgroup_kn_lock_live(parent_kn, false);
5419 if (!parent)
5420 return -ENODEV;
5421
5422 if (!cgroup_check_hierarchy_limits(parent)) {
5423 ret = -EAGAIN;
5424 goto out_unlock;
5425 }
5426
5427 cgrp = cgroup_create(parent, name, mode);
5428 if (IS_ERR(cgrp)) {
5429 ret = PTR_ERR(cgrp);
5430 goto out_unlock;
5431 }
5432
5433
5434
5435
5436
5437 kernfs_get(cgrp->kn);
5438
5439 ret = cgroup_kn_set_ugid(cgrp->kn);
5440 if (ret)
5441 goto out_destroy;
5442
5443 ret = css_populate_dir(&cgrp->self);
5444 if (ret)
5445 goto out_destroy;
5446
5447 ret = cgroup_apply_control_enable(cgrp);
5448 if (ret)
5449 goto out_destroy;
5450
5451 TRACE_CGROUP_PATH(mkdir, cgrp);
5452
5453
5454 kernfs_activate(cgrp->kn);
5455
5456 ret = 0;
5457 goto out_unlock;
5458
5459out_destroy:
5460 cgroup_destroy_locked(cgrp);
5461out_unlock:
5462 cgroup_kn_unlock(parent_kn);
5463 return ret;
5464}
5465
5466
5467
5468
5469
5470
5471static void css_killed_work_fn(struct work_struct *work)
5472{
5473 struct cgroup_subsys_state *css =
5474 container_of(work, struct cgroup_subsys_state, destroy_work);
5475
5476 mutex_lock(&cgroup_mutex);
5477
5478 do {
5479 offline_css(css);
5480 css_put(css);
5481
5482 css = css->parent;
5483 } while (css && atomic_dec_and_test(&css->online_cnt));
5484
5485 mutex_unlock(&cgroup_mutex);
5486}
5487
5488
5489static void css_killed_ref_fn(struct percpu_ref *ref)
5490{
5491 struct cgroup_subsys_state *css =
5492 container_of(ref, struct cgroup_subsys_state, refcnt);
5493
5494 if (atomic_dec_and_test(&css->online_cnt)) {
5495 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5496 queue_work(cgroup_destroy_wq, &css->destroy_work);
5497 }
5498}
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509static void kill_css(struct cgroup_subsys_state *css)
5510{
5511 lockdep_assert_held(&cgroup_mutex);
5512
5513 if (css->flags & CSS_DYING)
5514 return;
5515
5516 css->flags |= CSS_DYING;
5517
5518
5519
5520
5521
5522 css_clear_dir(css);
5523
5524
5525
5526
5527
5528 css_get(css);
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5541}
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567static int cgroup_destroy_locked(struct cgroup *cgrp)
5568 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5569{
5570 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5571 struct cgroup_subsys_state *css;
5572 struct cgrp_cset_link *link;
5573 int ssid;
5574
5575 lockdep_assert_held(&cgroup_mutex);
5576
5577
5578
5579
5580
5581 if (cgroup_is_populated(cgrp))
5582 return -EBUSY;
5583
5584
5585
5586
5587
5588
5589 if (css_has_online_children(&cgrp->self))
5590 return -EBUSY;
5591
5592
5593
5594
5595
5596
5597
5598 cgrp->self.flags &= ~CSS_ONLINE;
5599
5600 spin_lock_irq(&css_set_lock);
5601 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5602 link->cset->dead = true;
5603 spin_unlock_irq(&css_set_lock);
5604
5605
5606 for_each_css(css, ssid, cgrp)
5607 kill_css(css);
5608
5609
5610 css_clear_dir(&cgrp->self);
5611 kernfs_remove(cgrp->kn);
5612
5613 if (parent && cgroup_is_threaded(cgrp))
5614 parent->nr_threaded_children--;
5615
5616 spin_lock_irq(&css_set_lock);
5617 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5618 tcgrp->nr_descendants--;
5619 tcgrp->nr_dying_descendants++;
5620
5621
5622
5623
5624 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5625 tcgrp->freezer.nr_frozen_descendants--;
5626 }
5627 spin_unlock_irq(&css_set_lock);
5628
5629 cgroup1_check_for_release(parent);
5630
5631 cgroup_bpf_offline(cgrp);
5632
5633
5634 percpu_ref_kill(&cgrp->self.refcnt);
5635
5636 return 0;
5637};
5638
5639int cgroup_rmdir(struct kernfs_node *kn)
5640{
5641 struct cgroup *cgrp;
5642 int ret = 0;
5643
5644 cgrp = cgroup_kn_lock_live(kn, false);
5645 if (!cgrp)
5646 return 0;
5647
5648 ret = cgroup_destroy_locked(cgrp);
5649 if (!ret)
5650 TRACE_CGROUP_PATH(rmdir, cgrp);
5651
5652 cgroup_kn_unlock(kn);
5653 return ret;
5654}
5655
/*
 * kernfs syscall operations table: directory creation and removal are
 * handled by cgroup_mkdir() and cgroup_rmdir(), mount option and path
 * display by cgroup_show_options() and cgroup_show_path().
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.show_options		= cgroup_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};
5662
/*
 * Boot-time initialization of subsystem @ss: create its root css on the
 * default root cgroup, install it in init_css_set and bring it online.
 * @early is true when called from cgroup_init_early().  Any failure here
 * is fatal (BUG_ON).
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* create the root css of this subsystem on the default root */
	ss->root = &cgrp_dfl_root;
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	/* failures this early cannot be handled gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

	/*
	 * The root css is flagged CSS_NO_REF.
	 * NOTE(review): presumably exempts it from reference counting since
	 * it is never destroyed - confirm.
	 */
	css->flags |= CSS_NO_REF;

	if (early) {
		/* no idr allocation here; cgroup_init() allocates the id later */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	/*
	 * Install the root css in init_css_set.  cgroup_init() re-hashes
	 * init_css_set once all subsystems have done this.
	 */
	init_css_set.subsys[ss->id] = css;

	/* record which optional callbacks this subsystem implements */
	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
	have_release_callback |= (bool)ss->release << ss->id;
	have_canfork_callback |= (bool)ss->can_fork << ss->id;

	/* this must run before any task other than init_task exists */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}
5715
5716
5717
5718
5719
5720
5721
5722int __init cgroup_init_early(void)
5723{
5724 static struct cgroup_fs_context __initdata ctx;
5725 struct cgroup_subsys *ss;
5726 int i;
5727
5728 ctx.root = &cgrp_dfl_root;
5729 init_cgroup_root(&ctx);
5730 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5731
5732 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5733
5734 for_each_subsys(ss, i) {
5735 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5736 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5737 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5738 ss->id, ss->name);
5739 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5740 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5741
5742 ss->id = i;
5743 ss->name = cgroup_subsys_name[i];
5744 if (!ss->legacy_name)
5745 ss->legacy_name = cgroup_subsys_name[i];
5746
5747 if (ss->early_init)
5748 cgroup_init_subsys(ss, true);
5749 }
5750 return 0;
5751}
5752
5753
5754
5755
5756
5757
5758
/**
 * cgroup_init - cgroup initialization
 *
 * Sets up the default root, finishes initialization of every subsystem,
 * registers their interface files and registers the cgroup filesystems
 * and /proc/cgroups.  Always returns 0; unrecoverable failures BUG.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	/* compile-time cap: at most 16 subsystems */
	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

	cgroup_rstat_boot();

	/*
	 * NOTE(review): presumably keeps cgroup_threadgroup_rwsem's rcu_sync
	 * permanently entered so writers do not wait for an RCU grace
	 * period - confirm against the rcu_sync documentation.
	 */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Hash init_css_set with its current subsys[] contents; it is
	 * re-hashed at the end once all subsystems are installed.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			/*
			 * Early-initialized subsystems got css->id = 1 without
			 * an idr allocation; do the allocation now.
			 */
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * The steps above run for every subsystem; everything below
		 * is skipped for subsystems that are not enabled.
		 */
		if (!cgroup_ssid_enabled(ssid))
			continue;

		if (cgroup1_ssid_disabled(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		/* an implicit controller is expected to be threaded as well */
		WARN_ON(ss->implicit_on_dfl && !ss->threaded);

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->threaded)
			cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

		/* one shared cftype set is registered once, otherwise both */
		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);

		mutex_lock(&cgroup_mutex);
		css_populate_dir(init_css_set.subsys[ssid]);
		mutex_unlock(&cgroup_mutex);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
	WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

	return 0;
}
5861
5862static int __init cgroup_wq_init(void)
5863{
5864
5865
5866
5867
5868
5869
5870
5871
5872 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5873 BUG_ON(!cgroup_destroy_wq);
5874 return 0;
5875}
5876core_initcall(cgroup_wq_init);
5877
5878void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
5879{
5880 struct kernfs_node *kn;
5881
5882 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
5883 if (!kn)
5884 return;
5885 kernfs_path(kn, buf, buflen);
5886 kernfs_put(kn);
5887}
5888
5889
5890
5891
5892
5893
5894struct cgroup *cgroup_get_from_id(u64 id)
5895{
5896 struct kernfs_node *kn;
5897 struct cgroup *cgrp = NULL;
5898
5899 mutex_lock(&cgroup_mutex);
5900 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
5901 if (!kn)
5902 goto out_unlock;
5903
5904 cgrp = kn->priv;
5905 if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
5906 cgrp = NULL;
5907 kernfs_put(kn);
5908out_unlock:
5909 mutex_unlock(&cgroup_mutex);
5910 return cgrp;
5911}
5912EXPORT_SYMBOL_GPL(cgroup_get_from_id);
5913
5914
5915
5916
5917
5918
/*
 * Print the cgroup membership of @tsk to @m, one line per hierarchy:
 *
 *   <hierarchy_id>:<controller list>[,name=<root name>]:<path>
 *
 * The controller list is empty for the default root.  Returns 0 on
 * success, -ENOMEM if the path buffer cannot be allocated, or the error
 * from path generation (-ENAMETOOLONG if the path does not fit).
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		/* the default root is listed only once it is visible */
		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		/* comma-separated legacy names of the root's controllers */
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * Outside the default hierarchy an exiting task is reported
		 * as "/" instead of its cgroup path; on the default hierarchy
		 * the real path is always printed.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		/* a dead cgroup on the default hierarchy is marked as deleted */
		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}
5991
5992
5993
5994
5995
5996
5997
5998
5999void cgroup_fork(struct task_struct *child)
6000{
6001 RCU_INIT_POINTER(child->cgroups, &init_css_set);
6002 INIT_LIST_HEAD(&child->cg_list);
6003}
6004
6005static struct cgroup *cgroup_get_from_file(struct file *f)
6006{
6007 struct cgroup_subsys_state *css;
6008 struct cgroup *cgrp;
6009
6010 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6011 if (IS_ERR(css))
6012 return ERR_CAST(css);
6013
6014 cgrp = css->cgroup;
6015 if (!cgroup_on_dfl(cgrp)) {
6016 cgroup_put(cgrp);
6017 return ERR_PTR(-EBADF);
6018 }
6019
6020 return cgrp;
6021}
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
/*
 * Find the css_set the new child will use and store it, with a reference
 * held, in @kargs->cset.
 *
 * Without CLONE_INTO_CGROUP that is the parent's current css_set.  With
 * CLONE_INTO_CGROUP the cgroup named by the @kargs->cgroup file descriptor
 * is resolved, the caller's permission to attach to it is checked, and the
 * matching css_set is looked up via find_css_set(); the target cgroup is
 * stored in @kargs->cgrp.
 *
 * On success, returns 0 with cgroup_threadgroup_rwsem held and, for
 * CLONE_INTO_CGROUP only, cgroup_mutex held as well.  On failure both are
 * released and a negative errno is returned.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;
	struct file *f;

	/* cgroup_mutex is only needed when a target cgroup was requested */
	if (kargs->flags & CLONE_INTO_CGROUP)
		mutex_lock(&cgroup_mutex);

	cgroup_threadgroup_change_begin(current);

	/* pin the parent's current css_set */
	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
	get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	/* plain fork: the child simply uses the parent's css_set */
	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
		kargs->cset = cset;
		return 0;
	}

	f = fget_raw(kargs->cgroup);
	if (!f) {
		ret = -EBADF;
		goto err;
	}
	sb = f->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(f);
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		/* the error path must not cgroup_put() an ERR_PTR */
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Check write permission on the target cgroup explicitly.
	 * NOTE(review): presumably needed because this path does not go
	 * through a VFS write to cgroup.procs - confirm.
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	/* a CLONE_THREAD child is attached as a thread, not a thread group */
	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD));
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

	/* the parent's css_set was only needed for the lookup */
	put_css_set(cset);
	fput(f);
	kargs->cgrp = dst_cgrp;
	return ret;

err:
	/* only reached on the CLONE_INTO_CGROUP path, so the mutex is held */
	cgroup_threadgroup_change_end(current);
	mutex_unlock(&cgroup_mutex);
	if (f)
		fput(f);
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}
6119
6120
6121
6122
6123
6124
6125
6126
6127static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6128 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6129{
6130 cgroup_threadgroup_change_end(current);
6131
6132 if (kargs->flags & CLONE_INTO_CGROUP) {
6133 struct cgroup *cgrp = kargs->cgrp;
6134 struct css_set *cset = kargs->cset;
6135
6136 mutex_unlock(&cgroup_mutex);
6137
6138 if (cset) {
6139 put_css_set(cset);
6140 kargs->cset = NULL;
6141 }
6142
6143 if (cgrp) {
6144 cgroup_put(cgrp);
6145 kargs->cgrp = NULL;
6146 }
6147 }
6148}
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
6161{
6162 struct cgroup_subsys *ss;
6163 int i, j, ret;
6164
6165 ret = cgroup_css_set_fork(kargs);
6166 if (ret)
6167 return ret;
6168
6169 do_each_subsys_mask(ss, i, have_canfork_callback) {
6170 ret = ss->can_fork(child, kargs->cset);
6171 if (ret)
6172 goto out_revert;
6173 } while_each_subsys_mask();
6174
6175 return 0;
6176
6177out_revert:
6178 for_each_subsys(ss, j) {
6179 if (j >= i)
6180 break;
6181 if (ss->cancel_fork)
6182 ss->cancel_fork(child, kargs->cset);
6183 }
6184
6185 cgroup_css_set_put_fork(kargs);
6186
6187 return ret;
6188}
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199void cgroup_cancel_fork(struct task_struct *child,
6200 struct kernel_clone_args *kargs)
6201{
6202 struct cgroup_subsys *ss;
6203 int i;
6204
6205 for_each_subsys(ss, i)
6206 if (ss->cancel_fork)
6207 ss->cancel_fork(child, kargs->cset);
6208
6209 cgroup_css_set_put_fork(kargs);
6210}
6211
6212
6213
6214
6215
6216
6217
6218
6219void cgroup_post_fork(struct task_struct *child,
6220 struct kernel_clone_args *kargs)
6221 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6222{
6223 unsigned long cgrp_flags = 0;
6224 bool kill = false;
6225 struct cgroup_subsys *ss;
6226 struct css_set *cset;
6227 int i;
6228
6229 cset = kargs->cset;
6230 kargs->cset = NULL;
6231
6232 spin_lock_irq(&css_set_lock);
6233
6234
6235 if (likely(child->pid)) {
6236 if (kargs->cgrp)
6237 cgrp_flags = kargs->cgrp->flags;
6238 else
6239 cgrp_flags = cset->dfl_cgrp->flags;
6240
6241 WARN_ON_ONCE(!list_empty(&child->cg_list));
6242 cset->nr_tasks++;
6243 css_set_move_task(child, NULL, cset, false);
6244 } else {
6245 put_css_set(cset);
6246 cset = NULL;
6247 }
6248
6249 if (!(child->flags & PF_KTHREAD)) {
6250 if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
6251
6252
6253
6254
6255
6256 spin_lock(&child->sighand->siglock);
6257 WARN_ON_ONCE(child->frozen);
6258 child->jobctl |= JOBCTL_TRAP_FREEZE;
6259 spin_unlock(&child->sighand->siglock);
6260
6261
6262
6263
6264
6265
6266
6267 }
6268
6269
6270
6271
6272
6273
6274 kill = test_bit(CGRP_KILL, &cgrp_flags);
6275 }
6276
6277 spin_unlock_irq(&css_set_lock);
6278
6279
6280
6281
6282
6283
6284 do_each_subsys_mask(ss, i, have_fork_callback) {
6285 ss->fork(child);
6286 } while_each_subsys_mask();
6287
6288
6289 if (kargs->flags & CLONE_NEWCGROUP) {
6290 struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6291
6292 get_css_set(cset);
6293 child->nsproxy->cgroup_ns->root_cset = cset;
6294 put_css_set(rcset);
6295 }
6296
6297
6298 if (unlikely(kill))
6299 do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
6300
6301 cgroup_css_set_put_fork(kargs);
6302}
6303
6304
6305
6306
6307
6308
6309
6310
6311void cgroup_exit(struct task_struct *tsk)
6312{
6313 struct cgroup_subsys *ss;
6314 struct css_set *cset;
6315 int i;
6316
6317 spin_lock_irq(&css_set_lock);
6318
6319 WARN_ON_ONCE(list_empty(&tsk->cg_list));
6320 cset = task_css_set(tsk);
6321 css_set_move_task(tsk, cset, NULL, false);
6322 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6323 cset->nr_tasks--;
6324
6325 WARN_ON_ONCE(cgroup_task_frozen(tsk));
6326 if (unlikely(!(tsk->flags & PF_KTHREAD) &&
6327 test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
6328 cgroup_update_frozen(task_dfl_cgroup(tsk));
6329
6330 spin_unlock_irq(&css_set_lock);
6331
6332
6333 do_each_subsys_mask(ss, i, have_exit_callback) {
6334 ss->exit(tsk);
6335 } while_each_subsys_mask();
6336}
6337
6338void cgroup_release(struct task_struct *task)
6339{
6340 struct cgroup_subsys *ss;
6341 int ssid;
6342
6343 do_each_subsys_mask(ss, ssid, have_release_callback) {
6344 ss->release(task);
6345 } while_each_subsys_mask();
6346
6347 spin_lock_irq(&css_set_lock);
6348 css_set_skip_task_iters(task_css_set(task), task);
6349 list_del_init(&task->cg_list);
6350 spin_unlock_irq(&css_set_lock);
6351}
6352
/* Drop one reference on the css_set that @task points to. */
void cgroup_free(struct task_struct *task)
{
	put_css_set(task_css_set(task));
}
6358
6359static int __init cgroup_disable(char *str)
6360{
6361 struct cgroup_subsys *ss;
6362 char *token;
6363 int i;
6364
6365 while ((token = strsep(&str, ",")) != NULL) {
6366 if (!*token)
6367 continue;
6368
6369 for_each_subsys(ss, i) {
6370 if (strcmp(token, ss->name) &&
6371 strcmp(token, ss->legacy_name))
6372 continue;
6373
6374 static_branch_disable(cgroup_subsys_enabled_key[i]);
6375 pr_info("Disabling %s control group subsystem\n",
6376 ss->name);
6377 }
6378
6379 for (i = 0; i < OPT_FEATURE_COUNT; i++) {
6380 if (strcmp(token, cgroup_opt_feature_names[i]))
6381 continue;
6382 cgroup_feature_disable_mask |= 1 << i;
6383 pr_info("Disabling %s control group feature\n",
6384 cgroup_opt_feature_names[i]);
6385 break;
6386 }
6387 }
6388 return 1;
6389}
6390__setup("cgroup_disable=", cgroup_disable);
6391
6392void __init __weak enable_debug_cgroup(void) { }
6393
6394static int __init enable_cgroup_debug(char *str)
6395{
6396 cgroup_debug = true;
6397 enable_debug_cgroup();
6398 return 1;
6399}
6400__setup("cgroup_debug", enable_cgroup_debug);
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6412 struct cgroup_subsys *ss)
6413{
6414 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6415 struct file_system_type *s_type = dentry->d_sb->s_type;
6416 struct cgroup_subsys_state *css = NULL;
6417 struct cgroup *cgrp;
6418
6419
6420 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6421 !kn || kernfs_type(kn) != KERNFS_DIR)
6422 return ERR_PTR(-EBADF);
6423
6424 rcu_read_lock();
6425
6426
6427
6428
6429
6430
6431 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6432 if (cgrp)
6433 css = cgroup_css(cgrp, ss);
6434
6435 if (!css || !css_tryget_online(css))
6436 css = ERR_PTR(-ENOENT);
6437
6438 rcu_read_unlock();
6439 return css;
6440}
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6451{
6452 WARN_ON_ONCE(!rcu_read_lock_held());
6453 return idr_find(&ss->css_idr, id);
6454}
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465struct cgroup *cgroup_get_from_path(const char *path)
6466{
6467 struct kernfs_node *kn;
6468 struct cgroup *cgrp;
6469
6470 mutex_lock(&cgroup_mutex);
6471
6472 kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
6473 if (kn) {
6474 if (kernfs_type(kn) == KERNFS_DIR) {
6475 cgrp = kn->priv;
6476 cgroup_get_live(cgrp);
6477 } else {
6478 cgrp = ERR_PTR(-ENOTDIR);
6479 }
6480 kernfs_put(kn);
6481 } else {
6482 cgrp = ERR_PTR(-ENOENT);
6483 }
6484
6485 mutex_unlock(&cgroup_mutex);
6486 return cgrp;
6487}
6488EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499struct cgroup *cgroup_get_from_fd(int fd)
6500{
6501 struct cgroup *cgrp;
6502 struct file *f;
6503
6504 f = fget_raw(fd);
6505 if (!f)
6506 return ERR_PTR(-EBADF);
6507
6508 cgrp = cgroup_get_from_file(f);
6509 fput(f);
6510 return cgrp;
6511}
6512EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6513
6514static u64 power_of_ten(int power)
6515{
6516 u64 v = 1;
6517 while (power--)
6518 v *= 10;
6519 return v;
6520}
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6537{
6538 s64 whole, frac = 0;
6539 int fstart = 0, fend = 0, flen;
6540
6541 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6542 return -EINVAL;
6543 if (frac < 0)
6544 return -EINVAL;
6545
6546 flen = fend > fstart ? fend - fstart : 0;
6547 if (flen < dec_shift)
6548 frac *= power_of_ten(dec_shift - flen);
6549 else
6550 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6551
6552 *v = whole * power_of_ten(dec_shift) + frac;
6553 return 0;
6554}
6555
6556
6557
6558
6559
6560#ifdef CONFIG_SOCK_CGROUP_DATA
6561
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
/* Set to true by cgroup_sk_alloc_disable(); nothing here clears it again. */
static bool cgroup_sk_alloc_disabled __read_mostly;

/*
 * Turn off cgroup2 socket matching.  The first call logs a message and
 * latches the flag; later calls do nothing.
 */
void cgroup_sk_alloc_disable(void)
{
	if (!cgroup_sk_alloc_disabled) {
		pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
		cgroup_sk_alloc_disabled = true;
	}
}

#else	/* neither CONFIG_CGROUP_NET_PRIO nor CONFIG_CGROUP_NET_CLASSID */

/* Without net_prio/net_cls the flag is a compile-time constant. */
#define cgroup_sk_alloc_disabled	false

#endif
6580
6581void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6582{
6583 if (cgroup_sk_alloc_disabled) {
6584 skcd->no_refcnt = 1;
6585 return;
6586 }
6587
6588
6589 if (in_interrupt())
6590 return;
6591
6592 rcu_read_lock();
6593
6594 while (true) {
6595 struct css_set *cset;
6596
6597 cset = task_css_set(current);
6598 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6599 skcd->val = (unsigned long)cset->dfl_cgrp;
6600 cgroup_bpf_get(cset->dfl_cgrp);
6601 break;
6602 }
6603 cpu_relax();
6604 }
6605
6606 rcu_read_unlock();
6607}
6608
6609void cgroup_sk_clone(struct sock_cgroup_data *skcd)
6610{
6611 if (skcd->val) {
6612 if (skcd->no_refcnt)
6613 return;
6614
6615
6616
6617
6618
6619 cgroup_get(sock_cgroup_ptr(skcd));
6620 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6621 }
6622}
6623
6624void cgroup_sk_free(struct sock_cgroup_data *skcd)
6625{
6626 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6627
6628 if (skcd->no_refcnt)
6629 return;
6630 cgroup_bpf_put(cgrp);
6631 cgroup_put(cgrp);
6632}
6633
6634#endif
6635
6636#ifdef CONFIG_CGROUP_BPF
6637int cgroup_bpf_attach(struct cgroup *cgrp,
6638 struct bpf_prog *prog, struct bpf_prog *replace_prog,
6639 struct bpf_cgroup_link *link,
6640 enum bpf_attach_type type,
6641 u32 flags)
6642{
6643 int ret;
6644
6645 mutex_lock(&cgroup_mutex);
6646 ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
6647 mutex_unlock(&cgroup_mutex);
6648 return ret;
6649}
6650
6651int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
6652 enum bpf_attach_type type)
6653{
6654 int ret;
6655
6656 mutex_lock(&cgroup_mutex);
6657 ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
6658 mutex_unlock(&cgroup_mutex);
6659 return ret;
6660}
6661
6662int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
6663 union bpf_attr __user *uattr)
6664{
6665 int ret;
6666
6667 mutex_lock(&cgroup_mutex);
6668 ret = __cgroup_bpf_query(cgrp, attr, uattr);
6669 mutex_unlock(&cgroup_mutex);
6670 return ret;
6671}
6672#endif
6673
6674#ifdef CONFIG_SYSFS
6675static ssize_t show_delegatable_files(struct cftype *files, char *buf,
6676 ssize_t size, const char *prefix)
6677{
6678 struct cftype *cft;
6679 ssize_t ret = 0;
6680
6681 for (cft = files; cft && cft->name[0] != '\0'; cft++) {
6682 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
6683 continue;
6684
6685 if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
6686 continue;
6687
6688 if (prefix)
6689 ret += snprintf(buf + ret, size - ret, "%s.", prefix);
6690
6691 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
6692
6693 if (WARN_ON(ret >= size))
6694 break;
6695 }
6696
6697 return ret;
6698}
6699
6700static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
6701 char *buf)
6702{
6703 struct cgroup_subsys *ss;
6704 int ssid;
6705 ssize_t ret = 0;
6706
6707 ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
6708 NULL);
6709
6710 for_each_subsys(ss, ssid)
6711 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
6712 PAGE_SIZE - ret,
6713 cgroup_subsys_name[ssid]);
6714
6715 return ret;
6716}
6717static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6718
6719static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6720 char *buf)
6721{
6722 return snprintf(buf, PAGE_SIZE,
6723 "nsdelegate\n"
6724 "memory_localevents\n"
6725 "memory_recursiveprot\n");
6726}
6727static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6728
6729static struct attribute *cgroup_sysfs_attrs[] = {
6730 &cgroup_delegate_attr.attr,
6731 &cgroup_features_attr.attr,
6732 NULL,
6733};
6734
6735static const struct attribute_group cgroup_sysfs_attr_group = {
6736 .attrs = cgroup_sysfs_attrs,
6737 .name = "cgroup",
6738};
6739
6740static int __init cgroup_sysfs_init(void)
6741{
6742 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6743}
6744subsys_initcall(cgroup_sysfs_init);
6745
6746#endif
6747