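/*
 * Generic process-grouping system (cgroup core).
 *
 * Implements the cgroup hierarchy and css_set machinery, task migration,
 * and the cgroup/cgroup2 filesystem interfaces.
 */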

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

#define CGROUP_FILE_NAME_MAX	(MAX_CGROUP_TYPE_NAMELEN +	\
				 MAX_CFTYPE_NAME + 2)

/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
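
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */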
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up occupying the system_wq.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported on the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are sorted in the ascending serial number
 * order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * Bitmasks identifying subsystems with the corresponding callbacks so
 * that the hot fork/exit paths can skip subsystems without them.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= REFCOUNT_INIT(2),
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];

static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
			       struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
					      struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is slower static_key_enabled() based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
	if (CGROUP_SUBSYS_COUNT == 0)
		return false;

	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
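
/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup.  Several behaviors
 * differ on it: controllers are enabled explicitly through
 * "cgroup.subtree_control", "cgroup.procs" handling is stricter, and the
 * no-internal-process constraint applies to non-root cgroups.
 */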
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static bool cgroup_has_tasks(struct cgroup *cgrp)
{
	return cgrp->nr_populated_csets;
}

bool cgroup_is_threaded(struct cgroup *cgrp)
{
	return cgrp->dom_cgrp != cgrp;
}

/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	/*
	 * Root isn't under domain level resource control exempting it from
	 * the no-internal-process constraint, so it can serve as a thread
	 * root and a parent of resource domains at the same time.
	 */
	return !cgroup_parent(cgrp);
}

/* can @cgrp become a thread root? should always be true for the root cgroup */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return true;

	/* domain roots can't be nested under threaded */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* can only have either domain or threaded children */
	if (cgrp->nr_populated_domain_children)
		return false;

	/* and no domain controllers can be enabled */
	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}

/* is @cgrp root of a threaded subtree? */
bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	/* thread root should be a domain */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* a domain w/ threaded children is a thread root */
	if (cgrp->nr_threaded_children)
		return true;

	/*
	 * A domain which has tasks and explicit threaded controllers
	 * enabled is a thread root.
	 */
	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}

/* a domain which isn't connected to the root w/o breakage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
	/* the cgroup itself can be a thread root */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* but the ancestors can't be unless mixable */
	while ((cgrp = cgroup_parent(cgrp))) {
		if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
			return false;
		if (cgroup_is_threaded(cgrp))
			return false;
	}

	return true;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 root_ss_mask = cgrp->root->subsys_mask;

	if (parent) {
		u16 ss_mask = parent->subtree_control;

		/* threaded cgroups can only have threaded controllers */
		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	if (cgroup_on_dfl(cgrp))
		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				  cgrp_dfl_implicit_ss_mask);
	return root_ss_mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);

	if (parent) {
		u16 ss_mask = parent->subtree_ss_mask;

		/* threaded cgroups can only have threaded controllers */
		if (cgroup_is_threaded(cgrp))
			ss_mask &= cgrp_dfl_threaded_ss_mask;
		return ss_mask;
	}

	return cgrp->root->subsys_mask;
}
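
/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */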
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
 * or is offline, %NULL is returned.
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}

/**
 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
							struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly.  Test ss_mask.
	 */
	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			return NULL;
	}

	return cgroup_css(cgrp, ss);
}
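
/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 *
 * The returned css is not guaranteed to be online, and therefore it is the
 * callers responsibility to try get a reference for it.
 */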
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
					 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	do {
		css = cgroup_css(cgrp, ss);

		if (css)
			return css;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	return init_css_set.subsys[ss->id];
}
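
/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */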
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}

static void cgroup_get_live(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

/**
 * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 * is responsible for taking the css_set_lock.
 * @cgrp: the cgroup in question
 */
int __cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += link->cset->nr_tasks;

	return count;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
	int count;

	spin_lock_irq(&css_set_lock);
	count = __cgroup_task_count(cgrp);
	spin_unlock_irq(&css_set_lock);

	return count;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which takes
	 * an active reference of the file.  This is actually hot path, so
	 * we'd better skip taking cgroup_mutex and RCU read lock here; the
	 * raw dereference is safe because the open file pins the css.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					    \
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
						   cgroup_subsys[(ssid)]))) \
			;						    \
		else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
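
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.  Also used to anchor the list of css_sets.  Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */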
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * The following field is re-initialized when this cset gets linked
	 * in cgroup_init().  However, let's initialize the field
	 * statically for the sake of the readability.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};

static int css_set_count	= 1;	/* 1 for init_css_set */

static bool css_set_threaded(struct css_set *cset)
{
	return cset->dom_cset != cset;
}

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}
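
/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 * count is propagated towards root so that a given cgroup's
 * nr_populated_children is zero iff none of its descendants contain any
 * tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if both
 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 * 1 otherwise.  When the sum changes from or to zero, userland is notified
 * that the content of the interface file has changed.  This can be used to
 * detect when @cgrp and its descendants become populated or empty.
 */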
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			cgrp->nr_populated_csets += adj;
		} else {
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		if (was_populated == cgroup_is_populated(cgrp))
			break;

		cgroup1_check_for_release(cgrp);
		TRACE_CGROUP_PATH(notify_populated, cgrp,
				  cgroup_is_populated(cgrp));
		cgroup_file_notify(&cgrp->events_file);

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * populated counters of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/* advance task iterators which are pointing to @task so they can skip it */
static void css_set_skip_task_iters(struct css_set *cset,
				    struct task_struct *task)
{
	struct css_task_iter *it, *pos;

	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
		css_task_iter_skip(it, task);
}
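
/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated counter updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */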
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (to_cset && !css_set_populated(to_cset))
		css_set_update_populated(to_cset, true);

	if (from_cset) {
		WARN_ON_ONCE(list_empty(&task->cg_list));

		css_set_skip_task_iters(from_cset, task);
		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit()/cgroup_free() dropping the css_set.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		cgroup_move_task(task, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}
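
/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */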
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}

void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!refcount_dec_and_test(&cset->refcount))
		return;

	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	if (css_set_threaded(cset)) {
		list_del(&cset->threaded_csets_node);
		put_css_set_locked(cset->dom_cset);
	}

	kfree_rcu(cset, rcu_head);
}
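
/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */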
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/* @cset's domain should match the default cgroup's */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same set of subsystem states, this comparison is
	 * always necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css_by_mask(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get_live(cgrp);
}
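
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */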
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	spin_lock_irq(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	refcount_set(&cset->refcount, 1);
	cset->dom_cset = cset;
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->dying_tasks);
	INIT_LIST_HEAD(&cset->task_iters);
	INIT_LIST_HEAD(&cset->threaded_csets);
	INIT_HLIST_NODE(&cset->hlist);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	spin_lock_irq(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add @cset to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * If @cset should be threaded, look up the matching dom_cset and
	 * link them up.  We first fully initialize @cset then look for the
	 * dom_cset; dropping css_set_lock in between is safe as @cset is
	 * fully initialized and pinned at this point.
	 */
	if (cgroup_is_threaded(cset->dfl_cgrp)) {
		struct css_set *dcset;

		dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
		if (!dcset) {
			put_css_set(cset);
			return NULL;
		}

		spin_lock_irq(&css_set_lock);
		cset->dom_cset = dcset;
		list_add_tail(&cset->threaded_csets_node,
			      &dcset->threaded_csets);
		spin_unlock_irq(&css_set_lock);
	}

	return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
	struct cgroup *root_cgrp = kf_root->kn->priv;

	return root_cgrp->root;
}

static int cgroup_init_root_id(struct cgroup_root *root)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroup_root *root)
{
	lockdep_assert_held(&cgroup_mutex);

	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}

static void cgroup_destroy_root(struct cgroup_root *root)
{
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;

	trace_cgroup_destroy_root(root);

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	BUG_ON(atomic_read(&root->nr_cgrps));
	BUG_ON(!list_empty(&cgrp->self.children));

	/* Rebind all subsystems back to the default hierarchy */
	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	spin_lock_irq(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}

	spin_unlock_irq(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_mutex);

	kernfs_destroy_root(root->kf_root);
	cgroup_free_root(root);
}

/*
 * look up cgroup associated with current task's cgroup namespace on the
 * specified hierarchy
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
	struct cgroup *res = NULL;
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	rcu_read_lock();

	cset = current->nsproxy->cgroup_ns->root_cset;
	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else if (root == &cgrp_dfl_root) {
		res = cset->dfl_cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	rcu_read_unlock();

	BUG_ON(!res);
	return res;
}

/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else if (root == &cgrp_dfl_root) {
		res = cset->dfl_cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}

/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex and css_set_lock held.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * No need to lock the task - since we hold css_set_lock the
	 * task can't change groups.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}
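
/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, root cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a tasks cgroup pointer by cgroup_attach_task()
 */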
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	struct cgroup_subsys *ss = cft->ss;

	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
		const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";

		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
			 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
			 cft->name);
	} else {
		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	}
	return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
		mode |= S_IRUGO;

	if (cft->write_u64 || cft->write_s64 || cft->write) {
		if (cft->flags & CFTYPE_WORLD_WRITABLE)
			mode |= S_IWUGO;
		else
			mode |= S_IWUSR;
	}

	return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
	u16 cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;

	while (true) {
		u16 new_ss_mask = cur_ss_mask;

		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
			new_ss_mask |= ss->depends_on;
		} while_each_subsys_mask();

		/*
		 * Mask out subsystems which aren't available.  This can
		 * happen only if some depended-upon subsystems were bound
		 * to non-default hierarchies.
		 */
		new_ss_mask &= this_ss_mask;

		if (new_ss_mask == cur_ss_mask)
			break;
		cur_ss_mask = new_ss_mask;
	}

	return cur_ss_mask;
}

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time.  If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
}
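
/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */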
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
	struct cgroup *cgrp;

	if (kernfs_type(kn) == KERNFS_DIR)
		cgrp = kn->priv;
	else
		cgrp = kn->parent->priv;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  cgroup liveliness check alone provides enough
	 * protection against removal.  Ensure @cgrp stays accessible and
	 * break the active_ref protection.
	 */
	if (!cgroup_tryget(cgrp))
		return NULL;
	kernfs_break_active_protection(kn);

	if (drain_offline)
		cgroup_lock_and_drain_offline(cgrp);
	else
		mutex_lock(&cgroup_mutex);

	if (!cgroup_is_dead(cgrp))
		return cgrp;

	cgroup_kn_unlock(kn);
	return NULL;
}

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);

		del_timer_sync(&cfile->notify_timer);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts;

	if (!(css->flags & CSS_VISIBLE))
		return;

	css->flags &= ~CSS_VISIBLE;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		cgroup_addrm_files(css, cgrp, cfts, false);
	} else {
		list_for_each_entry(cfts, &css->ss->cfts, node)
			cgroup_addrm_files(css, cgrp, cfts, false);
	}
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is created.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
		return 0;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_base_files;
		else
			cfts = cgroup1_base_files;

		ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
		if (ret < 0)
			return ret;
	} else {
		list_for_each_entry(cfts, &css->ss->cfts, node) {
			ret = cgroup_addrm_files(css, cgrp, cfts, true);
			if (ret < 0) {
				failed_cfts = cfts;
				goto err;
			}
		}
	}

	css->flags |= CSS_VISIBLE;

	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}

int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	do_each_subsys_mask(ss, ssid, ss_mask) {
		/*
		 * If @ss has non-root csses attached to it, can't move.
		 * If @ss is an implicit controller, it is exempt from this
		 * rule and can be stolen.
		 */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
		    !ss->implicit_on_dfl)
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	} while_each_subsys_mask();

	do_each_subsys_mask(ss, ssid, ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		/* disable from the source */
		src_root->subsys_mask &= ~(1 << ssid);
		WARN_ON(cgroup_apply_control(scgrp));
		cgroup_finalize_control(scgrp, 0);

		/* rebind */
		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		ret = cgroup_apply_control(dcgrp);
		if (ret)
			pr_warn("partial failure to rebind %s controller (err=%d)\n",
				ss->name, ret);

		if (ss->bind)
			ss->bind(css);
	} while_each_subsys_mask();

	kernfs_activate(dcgrp->kn);
	return 0;
}

int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root)
{
	int len = 0;
	char *buf = NULL;
	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
	struct cgroup *ns_cgroup;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock_irq(&css_set_lock);
	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
	spin_unlock_irq(&css_set_lock);

	if (len >= PATH_MAX)
		len = -ERANGE;
	else if (len > 0) {
		seq_escape(sf, buf, " \t\n\\");
		len = 0;
	}
	kfree(buf);
	return len;
}

enum cgroup2_param {
	Opt_nsdelegate,
	Opt_memory_localevents,
	nr__cgroup2_params
};

static const struct fs_parameter_spec cgroup2_param_specs[] = {
	fsparam_flag("nsdelegate",		Opt_nsdelegate),
	fsparam_flag("memory_localevents",	Opt_memory_localevents),
	{}
};

static const struct fs_parameter_description cgroup2_fs_parameters = {
	.name		= "cgroup2",
	.specs		= cgroup2_param_specs,
};

static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_nsdelegate:
		ctx->flags |= CGRP_ROOT_NS_DELEGATE;
		return 0;
	case Opt_memory_localevents:
		ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
		return 0;
	}
	return -EINVAL;
}

static void apply_cgroup_root_flags(unsigned int root_flags)
{
	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
		if (root_flags & CGRP_ROOT_NS_DELEGATE)
			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

		if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
		else
			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
	}
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
		seq_puts(seq, ",nsdelegate");
	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
		seq_puts(seq, ",memory_localevents");
	return 0;
}

static int cgroup_reconfigure(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

	apply_cgroup_root_flags(ctx->flags);
	return 0;
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ssid;

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	cgrp->self.cgroup = cgrp;
	cgrp->self.flags |= CSS_ONLINE;
	cgrp->dom_cgrp = cgrp;
	cgrp->max_descendants = INT_MAX;
	cgrp->max_depth = INT_MAX;
	INIT_LIST_HEAD(&cgrp->rstat_css_list);
	prev_cputime_init(&cgrp->prev_cputime);

	for_each_subsys(ss, ssid)
		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

	init_waitqueue_head(&cgrp->offline_waitq);
	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

void init_cgroup_root(struct cgroup_fs_context *ctx)
{
	struct cgroup_root *root = ctx->root;
	struct cgroup *cgrp = &root->cgrp;

	INIT_LIST_HEAD(&root->root_list);
	atomic_set(&root->nr_cgrps, 1);
	cgrp->root = root;
	init_cgroup_housekeeping(cgrp);

	root->flags = ctx->flags;
	if (ctx->release_agent)
		strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
	if (ctx->name)
		strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
	if (ctx->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct kernfs_syscall_ops *kf_sops;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
			      0, GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us.  Later rebinding may disable
	 * controllers on the default hierarchy and thus create new csets,
	 * which can't be more than the existing ones.  Allocate 2x.
	 */
	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	kf_sops = root == &cgrp_dfl_root ?
		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

	root->kf_root = kernfs_create_root(kf_sops,
					   KERNFS_ROOT_CREATE_DEACTIVATED |
					   KERNFS_ROOT_SUPPORT_EXPORTOP,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;
	WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
	root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);

	ret = css_populate_dir(&root_cgrp->self);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	ret = cgroup_bpf_inherit(root_cgrp);
	WARN_ON_ONCE(ret);

	trace_cgroup_setup_root(root);

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

int cgroup_do_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	ctx->kfc.root = ctx->root->kf_root;
	if (fc->fs_type == &cgroup2_fs_type)
		ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
	else
		ctx->kfc.magic = CGROUP_SUPER_MAGIC;
	ret = kernfs_get_tree(fc);

	/*
	 * In non-init cgroup namespace, instead of root cgroup's dentry,
	 * we return the dentry corresponding to the cgroupns->root_cgrp.
	 */
	if (!ret && ctx->ns != &init_cgroup_ns) {
		struct dentry *nsdentry;
		struct super_block *sb = fc->root->d_sb;
		struct cgroup *cgrp;

		mutex_lock(&cgroup_mutex);
		spin_lock_irq(&css_set_lock);

		cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

		spin_unlock_irq(&css_set_lock);
		mutex_unlock(&cgroup_mutex);

		nsdentry = kernfs_node_dentry(cgrp->kn, sb);
		dput(fc->root);
		if (IS_ERR(nsdentry)) {
			deactivate_locked_super(sb);
			ret = PTR_ERR(nsdentry);
			nsdentry = NULL;
		}
		fc->root = nsdentry;
	}

	if (!ctx->kfc.new_sb_created)
		cgroup_put(&ctx->root->cgrp);

	return ret;
}

/*
 * Destroy a cgroup filesystem context.
 */
static void cgroup_fs_context_free(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

	kfree(ctx->name);
	kfree(ctx->release_agent);
	put_cgroup_ns(ctx->ns);
	kernfs_free_fs_context(fc);
	kfree(ctx);
}

static int cgroup_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	cgrp_dfl_visible = true;
	cgroup_get_live(&cgrp_dfl_root.cgrp);
	ctx->root = &cgrp_dfl_root;

	ret = cgroup_do_get_tree(fc);
	if (!ret)
		apply_cgroup_root_flags(ctx->flags);
	return ret;
}

static const struct fs_context_operations cgroup_fs_context_ops = {
	.free		= cgroup_fs_context_free,
	.parse_param	= cgroup2_parse_param,
	.get_tree	= cgroup_get_tree,
	.reconfigure	= cgroup_reconfigure,
};

static const struct fs_context_operations cgroup1_fs_context_ops = {
	.free		= cgroup_fs_context_free,
	.parse_param	= cgroup1_parse_param,
	.get_tree	= cgroup1_get_tree,
	.reconfigure	= cgroup1_reconfigure,
};

/*
 * Initialise cgroup filesystem creation/reconfiguration context.  Notably,
 * we select the namespace we're going to use.
 */
static int cgroup_init_fs_context(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx;

	ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->ns = current->nsproxy->cgroup_ns;
	get_cgroup_ns(ctx->ns);
	fc->fs_private = &ctx->kfc;
	if (fc->fs_type == &cgroup2_fs_type)
		fc->ops = &cgroup_fs_context_ops;
	else
		fc->ops = &cgroup1_fs_context_ops;
	put_user_ns(fc->user_ns);
	fc->user_ns = get_user_ns(ctx->ns->user_ns);
	fc->global = true;
	return 0;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

	/*
	 * If @root doesn't have any children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 *
	 * And don't kill the default root.
	 */
	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
	    !percpu_ref_is_dying(&root->cgrp.self.refcnt))
		percpu_ref_kill(&root->cgrp.self.refcnt);
	cgroup_put(&root->cgrp);
	kernfs_kill_sb(sb);
}

struct file_system_type cgroup_fs_type = {
	.name			= "cgroup",
	.init_fs_context	= cgroup_init_fs_context,
	.parameters		= &cgroup1_fs_parameters,
	.kill_sb		= cgroup_kill_sb,
	.fs_flags		= FS_USERNS_MOUNT,
};

static struct file_system_type cgroup2_fs_type = {
	.name			= "cgroup2",
	.init_fs_context	= cgroup_init_fs_context,
	.parameters		= &cgroup2_fs_parameters,
	.kill_sb		= cgroup_kill_sb,
	.fs_flags		= FS_USERNS_MOUNT,
};

#ifdef CONFIG_CPUSETS
static const struct fs_context_operations cpuset_fs_context_ops = {
	.get_tree	= cgroup1_get_tree,
	.free		= cgroup_fs_context_free,
};

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static int cpuset_init_fs_context(struct fs_context *fc)
{
	char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
	struct cgroup_fs_context *ctx;
	int err;

	err = cgroup_init_fs_context(fc);
	if (err) {
		kfree(agent);
		return err;
	}

	fc->ops = &cpuset_fs_context_ops;

	ctx = cgroup_fc2context(fc);
	ctx->subsys_mask = 1 << cpuset_cgrp_id;
	ctx->flags |= CGRP_ROOT_NOPREFIX;
	ctx->release_agent = agent;

	get_filesystem(&cgroup_fs_type);
	put_filesystem(fc->fs_type);
	fc->fs_type = &cgroup_fs_type;

	return 0;
}

static struct file_system_type cpuset_fs_type = {
	.name			= "cpuset",
	.init_fs_context	= cpuset_init_fs_context,
	.fs_flags		= FS_USERNS_MOUNT,
};
#endif

int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns)
{
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	int ret;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
		ret = strlcpy(buf, "/", buflen);
	}

	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* cgroup_threadgroup_rwsem protects racing against forks */
	WARN_ON_ONCE(list_empty(&task->cg_list));

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	mgctx->tset.nr_tasks++;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_taskset_migrate().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}
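
/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success or failure.
 */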
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* check that we can legitimately attach to the cgroup */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
				if (ret) {
					failed_ssid = ssid;
					goto out_cancel_attach;
				}
			}
		} while_each_subsys_mask();
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			from_cset->nr_tasks--;
			/*
			 * If the source or destination cgroup is frozen,
			 * the task might require to change its state.
			 */
			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
						    to_cset->dfl_cgrp);
			put_css_set_locked(from_cset);
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->attach) {
				tset->ssid = ssid;
				ss->attach(tset);
			}
		} while_each_subsys_mask();
	}

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ssid == failed_ssid)
				break;
			if (ss->cancel_attach) {
				tset->ssid = ssid;
				ss->cancel_attach(tset);
			}
		} while_each_subsys_mask();
	}
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Re-initialize the cgroup_taskset structure in case it is reused
	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
	 * iteration.
	 */
	tset->nr_tasks = 0;
	tset->csets = &tset->src_csets;
	return ret;
}

/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
	/* v1 doesn't have any restriction */
	if (!cgroup_on_dfl(dst_cgrp))
		return 0;

	/* verify @dst_cgrp can host resources */
	if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
		return -EOPNOTSUPP;

	/* mixables don't care */
	if (cgroup_is_mixable(dst_cgrp))
		return 0;

	/*
	 * If @dst_cgrp is already or can become a thread root or is
	 * threaded, it doesn't matter.
	 */
	if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
		return 0;

	/* apply no-internal-process constraint */
	if (dst_cgrp->subtree_control)
		return -EBUSY;

	return 0;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
	LIST_HEAD(preloaded);
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);

	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);

	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}

	spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->preloaded_src_csets, which should later
 * be cleaned up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
			    struct cgroup *dst_cgrp,
			    struct cgroup_mgctx *mgctx)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	/*
	 * If ->dead, @src_set is associated with one or more dead cgroups
	 * and doesn't contain any migratable tasks.  Ignore it early so
	 * that the rest of migration path doesn't get confused by it.
	 */
	if (src_cset->dead)
		return;

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(src_cset->mg_dst_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	src_cset->mg_dst_cgrp = dst_cgrp;
	get_css_set(src_cset);
	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
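
/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends
 * them to @mgctx->preloaded_dst_csets.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 */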
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/* look up the dst cset for each src cset and link it to the src */
	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				 mg_preload_node) {
		struct css_set *dst_cset;
		struct cgroup_subsys *ss;
		int ssid;

		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
		if (!dst_cset)
			return -ENOMEM;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			src_cset->mg_dst_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add_tail(&dst_cset->mg_preload_node,
				      &mgctx->preloaded_dst_csets);
		else
			put_css_set(dst_cset);

		for_each_subsys(ss, ssid)
			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				mgctx->ss_mask |= 1 << ssid;
	}

	return 0;
}
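
/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 */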
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx)
{
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_task(task, mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct task_struct *task;
	int ret;

	ret = cgroup_migrate_vet_dst(dst_cgrp);
	if (ret)
		return ret;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, &mgctx);

	cgroup_migrate_finish(&mgctx);

	if (!ret)
		TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);

	return ret;
}

struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
					     bool *locked)
	__acquires(&cgroup_threadgroup_rwsem)
{
	struct task_struct *tsk;
	pid_t pid;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return ERR_PTR(-EINVAL);

	/*
	 * Threadgroup stability is only needed when a task other than
	 * current alone is being migrated - that is, when @pid is non-zero
	 * or the whole threadgroup is targeted.  Only then is the
	 * expensive cgroup_threadgroup_rwsem taken.
	 */
	lockdep_assert_held(&cgroup_mutex);
	if (pid || threadgroup) {
		percpu_down_write(&cgroup_threadgroup_rwsem);
		*locked = true;
	} else {
		*locked = false;
	}

	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			tsk = ERR_PTR(-ESRCH);
			goto out_unlock_threadgroup;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset, or RT kthread may be born in a
	 * cgroup with no rt_runtime allocated.  Just say no.
	 */
	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
		tsk = ERR_PTR(-EINVAL);
		goto out_unlock_threadgroup;
	}

	get_task_struct(tsk);
	goto out_unlock_rcu;

out_unlock_threadgroup:
	if (*locked) {
		percpu_up_write(&cgroup_threadgroup_rwsem);
		*locked = false;
	}
out_unlock_rcu:
	rcu_read_unlock();
	return tsk;
}

void cgroup_procs_write_finish(struct task_struct *task, bool locked)
	__releases(&cgroup_threadgroup_rwsem)
{
	struct cgroup_subsys *ss;
	int ssid;

	/* release reference from cgroup_procs_write_start() */
	put_task_struct(task);

	if (locked)
		percpu_up_write(&cgroup_threadgroup_rwsem);
	for_each_subsys(ss, ssid)
		if (ss->post_attach)
			ss->post_attach();
}
2832
2833static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2834{
2835 struct cgroup_subsys *ss;
2836 bool printed = false;
2837 int ssid;
2838
2839 do_each_subsys_mask(ss, ssid, ss_mask) {
2840 if (printed)
2841 seq_putc(seq, ' ');
2842 seq_puts(seq, ss->name);
2843 printed = true;
2844 } while_each_subsys_mask();
2845 if (printed)
2846 seq_putc(seq, '\n');
2847}
2848
2849
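/* show controllers which are enabled from the parent */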
2850static int cgroup_controllers_show(struct seq_file *seq, void *v)
2851{
2852 struct cgroup *cgrp = seq_css(seq)->cgroup;
2853
2854 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2855 return 0;
2856}
2857
2858
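/* show controllers which are enabled for a given cgroup's children */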
2859static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2860{
2861 struct cgroup *cgrp = seq_css(seq)->cgroup;
2862
2863 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2864 return 0;
2865}
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
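/**
 * cgroup_update_dfl_csses - update css assoc of a subtree and migrate
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 */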
2876static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2877{
2878 DEFINE_CGROUP_MGCTX(mgctx);
2879 struct cgroup_subsys_state *d_css;
2880 struct cgroup *dsct;
2881 struct css_set *src_cset;
2882 int ret;
2883
2884 lockdep_assert_held(&cgroup_mutex);
2885
2886 percpu_down_write(&cgroup_threadgroup_rwsem);
2887
2888
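	/* look up all csses which need to be updated */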
2889 spin_lock_irq(&css_set_lock);
2890 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2891 struct cgrp_cset_link *link;
2892
2893 list_for_each_entry(link, &dsct->cset_links, cset_link)
2894 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
2895 }
2896 spin_unlock_irq(&css_set_lock);
2897
2898
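	/* NULL dst indicates self on default hierarchy */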
2899 ret = cgroup_migrate_prepare_dst(&mgctx);
2900 if (ret)
2901 goto out_finish;
2902
2903 spin_lock_irq(&css_set_lock);
2904 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2905 struct task_struct *task, *ntask;
2906
2907
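		/* all tasks in src_csets need to be migrated */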
2908 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2909 cgroup_migrate_add_task(task, &mgctx);
2910 }
2911 spin_unlock_irq(&css_set_lock);
2912
2913 ret = cgroup_migrate_execute(&mgctx);
2914out_finish:
2915 cgroup_migrate_finish(&mgctx);
2916 percpu_up_write(&cgroup_threadgroup_rwsem);
2917 return ret;
2918}
2919
2920
2921
2922
2923
2924
2925
2926
2927
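/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 */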
2928void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
2929 __acquires(&cgroup_mutex)
2930{
2931 struct cgroup *dsct;
2932 struct cgroup_subsys_state *d_css;
2933 struct cgroup_subsys *ss;
2934 int ssid;
2935
2936restart:
2937 mutex_lock(&cgroup_mutex);
2938
2939 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2940 for_each_subsys(ss, ssid) {
2941 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2942 DEFINE_WAIT(wait);
2943
2944 if (!css || !percpu_ref_is_dying(&css->refcnt))
2945 continue;
2946
2947 cgroup_get_live(dsct);
2948 prepare_to_wait(&dsct->offline_waitq, &wait,
2949 TASK_UNINTERRUPTIBLE);
2950
2951 mutex_unlock(&cgroup_mutex);
2952 schedule();
2953 finish_wait(&dsct->offline_waitq, &wait);
2954
2955 cgroup_put(dsct);
2956 goto restart;
2957 }
2958 }
2959}
2960
2961
2962
2963
2964
2965
2966
2967
2968
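/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */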
2969static void cgroup_save_control(struct cgroup *cgrp)
2970{
2971 struct cgroup *dsct;
2972 struct cgroup_subsys_state *d_css;
2973
2974 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2975 dsct->old_subtree_control = dsct->subtree_control;
2976 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
2977 dsct->old_dom_cgrp = dsct->dom_cgrp;
2978 }
2979}
2980
2981
2982
2983
2984
2985
2986
2987
2988
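/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
 * ->subtree_control and propagate controller availability through the
 * subtree so that descendants don't have unavailable controllers enabled.
 */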
2989static void cgroup_propagate_control(struct cgroup *cgrp)
2990{
2991 struct cgroup *dsct;
2992 struct cgroup_subsys_state *d_css;
2993
2994 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2995 dsct->subtree_control &= cgroup_control(dsct);
2996 dsct->subtree_ss_mask =
2997 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
2998 cgroup_ss_mask(dsct));
2999 }
3000}
3001
3002
3003
3004
3005
3006
3007
3008
3009
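/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
 * respective old_ prefixed fields for @cgrp's subtree including @cgrp
 * itself.
 */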
3010static void cgroup_restore_control(struct cgroup *cgrp)
3011{
3012 struct cgroup *dsct;
3013 struct cgroup_subsys_state *d_css;
3014
3015 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3016 dsct->subtree_control = dsct->old_subtree_control;
3017 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3018 dsct->dom_cgrp = dsct->old_dom_cgrp;
3019 }
3020}
3021
3022static bool css_visible(struct cgroup_subsys_state *css)
3023{
3024 struct cgroup_subsys *ss = css->ss;
3025 struct cgroup *cgrp = css->cgroup;
3026
3027 if (cgroup_control(cgrp) & (1 << ss->id))
3028 return true;
3029 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3030 return false;
3031 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3032}
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
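/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */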
3047static int cgroup_apply_control_enable(struct cgroup *cgrp)
3048{
3049 struct cgroup *dsct;
3050 struct cgroup_subsys_state *d_css;
3051 struct cgroup_subsys *ss;
3052 int ssid, ret;
3053
3054 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3055 for_each_subsys(ss, ssid) {
3056 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3057
3058 WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
3059
3060 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3061 continue;
3062
3063 if (!css) {
3064 css = css_create(dsct, ss);
3065 if (IS_ERR(css))
3066 return PTR_ERR(css);
3067 }
3068
3069 if (css_visible(css)) {
3070 ret = css_populate_dir(css);
3071 if (ret)
3072 return ret;
3073 }
3074 }
3075 }
3076
3077 return 0;
3078}
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
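/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and kill and hide csses so that they match
 * cgroup_ss_mask() and cgroup_visible_mask().
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and be in the vanilla state if it's made visible again later.
 * Controllers which may be depended upon should provide ->css_reset() for
 * this purpose.
 */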
3093static void cgroup_apply_control_disable(struct cgroup *cgrp)
3094{
3095 struct cgroup *dsct;
3096 struct cgroup_subsys_state *d_css;
3097 struct cgroup_subsys *ss;
3098 int ssid;
3099
3100 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3101 for_each_subsys(ss, ssid) {
3102 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3103
3104 WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
3105
3106 if (!css)
3107 continue;
3108
3109 if (css->parent &&
3110 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3111 kill_css(css);
3112 } else if (!css_visible(css)) {
3113 css_clear_dir(css);
3114 if (ss->css_reset)
3115 ss->css_reset(css);
3116 }
3117 }
3118 }
3119}
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
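/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 */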
3138static int cgroup_apply_control(struct cgroup *cgrp)
3139{
3140 int ret;
3141
3142 cgroup_propagate_control(cgrp);
3143
3144 ret = cgroup_apply_control_enable(cgrp);
3145 if (ret)
3146 return ret;
3147
3148
3149
3150
3151
3152
3153 ret = cgroup_update_dfl_csses(cgrp);
3154 if (ret)
3155 return ret;
3156
3157 return 0;
3158}
3159
3160
3161
3162
3163
3164
3165
3166
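/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Finalize control mask update.  See cgroup_apply_control() for more info.
 */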
3167static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
3168{
3169 if (ret) {
3170 cgroup_restore_control(cgrp);
3171 cgroup_propagate_control(cgrp);
3172 }
3173
3174 cgroup_apply_control_disable(cgrp);
3175}
3176
3177static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3178{
3179 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3180
3181
3182 if (!enable)
3183 return 0;
3184
3185
3186 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3187 return -EOPNOTSUPP;
3188
3189
3190 if (cgroup_is_mixable(cgrp))
3191 return 0;
3192
3193 if (domain_enable) {
3194
3195 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3196 return -EOPNOTSUPP;
3197 } else {
3198
3199
3200
3201
3202
3203 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3204 return 0;
3205 }
3206
3207
3208
3209
3210
3211 if (cgroup_has_tasks(cgrp))
3212 return -EBUSY;
3213
3214 return 0;
3215}
3216
3217
3218static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3219 char *buf, size_t nbytes,
3220 loff_t off)
3221{
3222 u16 enable = 0, disable = 0;
3223 struct cgroup *cgrp, *child;
3224 struct cgroup_subsys *ss;
3225 char *tok;
3226 int ssid, ret;
3227
3228
3229
3230
3231
3232 buf = strstrip(buf);
3233 while ((tok = strsep(&buf, " "))) {
3234 if (tok[0] == '\0')
3235 continue;
3236 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3237 if (!cgroup_ssid_enabled(ssid) ||
3238 strcmp(tok + 1, ss->name))
3239 continue;
3240
3241 if (*tok == '+') {
3242 enable |= 1 << ssid;
3243 disable &= ~(1 << ssid);
3244 } else if (*tok == '-') {
3245 disable |= 1 << ssid;
3246 enable &= ~(1 << ssid);
3247 } else {
3248 return -EINVAL;
3249 }
3250 break;
3251 } while_each_subsys_mask();
3252 if (ssid == CGROUP_SUBSYS_COUNT)
3253 return -EINVAL;
3254 }
3255
3256 cgrp = cgroup_kn_lock_live(of->kn, true);
3257 if (!cgrp)
3258 return -ENODEV;
3259
3260 for_each_subsys(ss, ssid) {
3261 if (enable & (1 << ssid)) {
3262 if (cgrp->subtree_control & (1 << ssid)) {
3263 enable &= ~(1 << ssid);
3264 continue;
3265 }
3266
3267 if (!(cgroup_control(cgrp) & (1 << ssid))) {
3268 ret = -ENOENT;
3269 goto out_unlock;
3270 }
3271 } else if (disable & (1 << ssid)) {
3272 if (!(cgrp->subtree_control & (1 << ssid))) {
3273 disable &= ~(1 << ssid);
3274 continue;
3275 }
3276
3277
3278 cgroup_for_each_live_child(child, cgrp) {
3279 if (child->subtree_control & (1 << ssid)) {
3280 ret = -EBUSY;
3281 goto out_unlock;
3282 }
3283 }
3284 }
3285 }
3286
3287 if (!enable && !disable) {
3288 ret = 0;
3289 goto out_unlock;
3290 }
3291
3292 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3293 if (ret)
3294 goto out_unlock;
3295
3296
3297 cgroup_save_control(cgrp);
3298
3299 cgrp->subtree_control |= enable;
3300 cgrp->subtree_control &= ~disable;
3301
3302 ret = cgroup_apply_control(cgrp);
3303 cgroup_finalize_control(cgrp, ret);
3304 if (ret)
3305 goto out_unlock;
3306
3307 kernfs_activate(cgrp->kn);
3308out_unlock:
3309 cgroup_kn_unlock(of->kn);
3310 return ret ?: nbytes;
3311}
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
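/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to the cgroup.type interface file and
 * tries to make @cgrp threaded and join the parent's resource domain.
 * This function is never called on the root cgroup as cgroup.type doesn't
 * exist on it.
 */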
3322static int cgroup_enable_threaded(struct cgroup *cgrp)
3323{
3324 struct cgroup *parent = cgroup_parent(cgrp);
3325 struct cgroup *dom_cgrp = parent->dom_cgrp;
3326 struct cgroup *dsct;
3327 struct cgroup_subsys_state *d_css;
3328 int ret;
3329
3330 lockdep_assert_held(&cgroup_mutex);
3331
3332
3333 if (cgroup_is_threaded(cgrp))
3334 return 0;
3335
3336
3337
3338
3339
3340
3341
3342 if (cgroup_is_populated(cgrp) ||
3343 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3344 return -EOPNOTSUPP;
3345
3346
3347 if (!cgroup_is_valid_domain(dom_cgrp) ||
3348 !cgroup_can_be_thread_root(dom_cgrp))
3349 return -EOPNOTSUPP;
3350
3351
3352
3353
3354
3355 cgroup_save_control(cgrp);
3356
3357 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3358 if (dsct == cgrp || cgroup_is_threaded(dsct))
3359 dsct->dom_cgrp = dom_cgrp;
3360
3361 ret = cgroup_apply_control(cgrp);
3362 if (!ret)
3363 parent->nr_threaded_children++;
3364
3365 cgroup_finalize_control(cgrp, ret);
3366 return ret;
3367}
3368
3369static int cgroup_type_show(struct seq_file *seq, void *v)
3370{
3371 struct cgroup *cgrp = seq_css(seq)->cgroup;
3372
3373 if (cgroup_is_threaded(cgrp))
3374 seq_puts(seq, "threaded\n");
3375 else if (!cgroup_is_valid_domain(cgrp))
3376 seq_puts(seq, "domain invalid\n");
3377 else if (cgroup_is_thread_root(cgrp))
3378 seq_puts(seq, "domain threaded\n");
3379 else
3380 seq_puts(seq, "domain\n");
3381
3382 return 0;
3383}
3384
3385static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3386 size_t nbytes, loff_t off)
3387{
3388 struct cgroup *cgrp;
3389 int ret;
3390
3391
3392 if (strcmp(strstrip(buf), "threaded"))
3393 return -EINVAL;
3394
3395 cgrp = cgroup_kn_lock_live(of->kn, false);
3396 if (!cgrp)
3397 return -ENOENT;
3398
3399
3400 ret = cgroup_enable_threaded(cgrp);
3401
3402 cgroup_kn_unlock(of->kn);
3403 return ret ?: nbytes;
3404}
3405
3406static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3407{
3408 struct cgroup *cgrp = seq_css(seq)->cgroup;
3409 int descendants = READ_ONCE(cgrp->max_descendants);
3410
3411 if (descendants == INT_MAX)
3412 seq_puts(seq, "max\n");
3413 else
3414 seq_printf(seq, "%d\n", descendants);
3415
3416 return 0;
3417}
3418
3419static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3420 char *buf, size_t nbytes, loff_t off)
3421{
3422 struct cgroup *cgrp;
3423 int descendants;
3424 ssize_t ret;
3425
3426 buf = strstrip(buf);
3427 if (!strcmp(buf, "max")) {
3428 descendants = INT_MAX;
3429 } else {
3430 ret = kstrtoint(buf, 0, &descendants);
3431 if (ret)
3432 return ret;
3433 }
3434
3435 if (descendants < 0)
3436 return -ERANGE;
3437
3438 cgrp = cgroup_kn_lock_live(of->kn, false);
3439 if (!cgrp)
3440 return -ENOENT;
3441
3442 cgrp->max_descendants = descendants;
3443
3444 cgroup_kn_unlock(of->kn);
3445
3446 return nbytes;
3447}
3448
3449static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3450{
3451 struct cgroup *cgrp = seq_css(seq)->cgroup;
3452 int depth = READ_ONCE(cgrp->max_depth);
3453
3454 if (depth == INT_MAX)
3455 seq_puts(seq, "max\n");
3456 else
3457 seq_printf(seq, "%d\n", depth);
3458
3459 return 0;
3460}
3461
3462static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3463 char *buf, size_t nbytes, loff_t off)
3464{
3465 struct cgroup *cgrp;
3466 ssize_t ret;
3467 int depth;
3468
3469 buf = strstrip(buf);
3470 if (!strcmp(buf, "max")) {
3471 depth = INT_MAX;
3472 } else {
3473 ret = kstrtoint(buf, 0, &depth);
3474 if (ret)
3475 return ret;
3476 }
3477
3478 if (depth < 0)
3479 return -ERANGE;
3480
3481 cgrp = cgroup_kn_lock_live(of->kn, false);
3482 if (!cgrp)
3483 return -ENOENT;
3484
3485 cgrp->max_depth = depth;
3486
3487 cgroup_kn_unlock(of->kn);
3488
3489 return nbytes;
3490}
3491
3492static int cgroup_events_show(struct seq_file *seq, void *v)
3493{
3494 struct cgroup *cgrp = seq_css(seq)->cgroup;
3495
3496 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3497 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3498
3499 return 0;
3500}
3501
3502static int cgroup_stat_show(struct seq_file *seq, void *v)
3503{
3504 struct cgroup *cgroup = seq_css(seq)->cgroup;
3505
3506 seq_printf(seq, "nr_descendants %d\n",
3507 cgroup->nr_descendants);
3508 seq_printf(seq, "nr_dying_descendants %d\n",
3509 cgroup->nr_dying_descendants);
3510
3511 return 0;
3512}
3513
3514static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3515 struct cgroup *cgrp, int ssid)
3516{
3517 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3518 struct cgroup_subsys_state *css;
3519 int ret;
3520
3521 if (!ss->css_extra_stat_show)
3522 return 0;
3523
3524 css = cgroup_tryget_css(cgrp, ss);
3525 if (!css)
3526 return 0;
3527
3528 ret = ss->css_extra_stat_show(seq, css);
3529 css_put(css);
3530 return ret;
3531}
3532
3533static int cpu_stat_show(struct seq_file *seq, void *v)
3534{
3535 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3536 int ret = 0;
3537
3538 cgroup_base_stat_cputime_show(seq);
3539#ifdef CONFIG_CGROUP_SCHED
3540 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3541#endif
3542 return ret;
3543}
3544
3545#ifdef CONFIG_PSI
3546static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3547{
3548 struct cgroup *cgrp = seq_css(seq)->cgroup;
3549 struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
3550
3551 return psi_show(seq, psi, PSI_IO);
3552}
3553static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3554{
3555 struct cgroup *cgrp = seq_css(seq)->cgroup;
3556 struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
3557
3558 return psi_show(seq, psi, PSI_MEM);
3559}
3560static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3561{
3562 struct cgroup *cgrp = seq_css(seq)->cgroup;
3563 struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
3564
3565 return psi_show(seq, psi, PSI_CPU);
3566}
3567
3568static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3569 size_t nbytes, enum psi_res res)
3570{
3571 struct psi_trigger *new;
3572 struct cgroup *cgrp;
3573
3574 cgrp = cgroup_kn_lock_live(of->kn, false);
3575 if (!cgrp)
3576 return -ENODEV;
3577
3578 cgroup_get(cgrp);
3579 cgroup_kn_unlock(of->kn);
3580
3581 new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3582 if (IS_ERR(new)) {
3583 cgroup_put(cgrp);
3584 return PTR_ERR(new);
3585 }
3586
3587 psi_trigger_replace(&of->priv, new);
3588
3589 cgroup_put(cgrp);
3590
3591 return nbytes;
3592}
3593
3594static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3595 char *buf, size_t nbytes,
3596 loff_t off)
3597{
3598 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3599}
3600
3601static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3602 char *buf, size_t nbytes,
3603 loff_t off)
3604{
3605 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3606}
3607
3608static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3609 char *buf, size_t nbytes,
3610 loff_t off)
3611{
3612 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3613}
3614
3615static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3616 poll_table *pt)
3617{
3618 return psi_trigger_poll(&of->priv, of->file, pt);
3619}
3620
3621static void cgroup_pressure_release(struct kernfs_open_file *of)
3622{
3623 psi_trigger_replace(&of->priv, NULL);
3624}
3625#endif
3626
3627static int cgroup_freeze_show(struct seq_file *seq, void *v)
3628{
3629 struct cgroup *cgrp = seq_css(seq)->cgroup;
3630
3631 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3632
3633 return 0;
3634}
3635
3636static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3637 char *buf, size_t nbytes, loff_t off)
3638{
3639 struct cgroup *cgrp;
3640 ssize_t ret;
3641 int freeze;
3642
3643 ret = kstrtoint(strstrip(buf), 0, &freeze);
3644 if (ret)
3645 return ret;
3646
3647 if (freeze < 0 || freeze > 1)
3648 return -ERANGE;
3649
3650 cgrp = cgroup_kn_lock_live(of->kn, false);
3651 if (!cgrp)
3652 return -ENOENT;
3653
3654 cgroup_freeze(cgrp, freeze);
3655
3656 cgroup_kn_unlock(of->kn);
3657
3658 return nbytes;
3659}
3660
3661static int cgroup_file_open(struct kernfs_open_file *of)
3662{
3663 struct cftype *cft = of->kn->priv;
3664
3665 if (cft->open)
3666 return cft->open(of);
3667 return 0;
3668}
3669
3670static void cgroup_file_release(struct kernfs_open_file *of)
3671{
3672 struct cftype *cft = of->kn->priv;
3673
3674 if (cft->release)
3675 cft->release(of);
3676}
3677
3678static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3679 size_t nbytes, loff_t off)
3680{
3681 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3682 struct cgroup *cgrp = of->kn->parent->priv;
3683 struct cftype *cft = of->kn->priv;
3684 struct cgroup_subsys_state *css;
3685 int ret;
3686
3687
3688
3689
3690
3691
3692
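	/*
	 * If namespaces are delegation boundaries, disallow writes to
	 * files in a non-init namespace root from inside the namespace
	 * except for the files explicitly marked delegatable -
	 * "cgroup.procs" and "cgroup.subtree_control".
	 */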
3693 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3694 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3695 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3696 return -EPERM;
3697
3698 if (cft->write)
3699 return cft->write(of, buf, nbytes, off);
3700
3701
3702
3703
3704
3705
3706
3707 rcu_read_lock();
3708 css = cgroup_css(cgrp, cft->ss);
3709 rcu_read_unlock();
3710
3711 if (cft->write_u64) {
3712 unsigned long long v;
3713 ret = kstrtoull(buf, 0, &v);
3714 if (!ret)
3715 ret = cft->write_u64(css, cft, v);
3716 } else if (cft->write_s64) {
3717 long long v;
3718 ret = kstrtoll(buf, 0, &v);
3719 if (!ret)
3720 ret = cft->write_s64(css, cft, v);
3721 } else {
3722 ret = -EINVAL;
3723 }
3724
3725 return ret ?: nbytes;
3726}
3727
3728static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3729{
3730 struct cftype *cft = of->kn->priv;
3731
3732 if (cft->poll)
3733 return cft->poll(of, pt);
3734
3735 return kernfs_generic_poll(of, pt);
3736}
3737
3738static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3739{
3740 return seq_cft(seq)->seq_start(seq, ppos);
3741}
3742
3743static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3744{
3745 return seq_cft(seq)->seq_next(seq, v, ppos);
3746}
3747
3748static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3749{
3750 if (seq_cft(seq)->seq_stop)
3751 seq_cft(seq)->seq_stop(seq, v);
3752}
3753
3754static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3755{
3756 struct cftype *cft = seq_cft(m);
3757 struct cgroup_subsys_state *css = seq_css(m);
3758
3759 if (cft->seq_show)
3760 return cft->seq_show(m, arg);
3761
3762 if (cft->read_u64)
3763 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3764 else if (cft->read_s64)
3765 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3766 else
3767 return -EINVAL;
3768 return 0;
3769}
3770
3771static struct kernfs_ops cgroup_kf_single_ops = {
3772 .atomic_write_len = PAGE_SIZE,
3773 .open = cgroup_file_open,
3774 .release = cgroup_file_release,
3775 .write = cgroup_file_write,
3776 .poll = cgroup_file_poll,
3777 .seq_show = cgroup_seqfile_show,
3778};
3779
3780static struct kernfs_ops cgroup_kf_ops = {
3781 .atomic_write_len = PAGE_SIZE,
3782 .open = cgroup_file_open,
3783 .release = cgroup_file_release,
3784 .write = cgroup_file_write,
3785 .poll = cgroup_file_poll,
3786 .seq_start = cgroup_seqfile_start,
3787 .seq_next = cgroup_seqfile_next,
3788 .seq_stop = cgroup_seqfile_stop,
3789 .seq_show = cgroup_seqfile_show,
3790};
3791
3792
3793static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3794{
3795 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3796 .ia_uid = current_fsuid(),
3797 .ia_gid = current_fsgid(), };
3798
3799 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3800 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3801 return 0;
3802
3803 return kernfs_setattr(kn, &iattr);
3804}
3805
3806static void cgroup_file_notify_timer(struct timer_list *timer)
3807{
3808 cgroup_file_notify(container_of(timer, struct cgroup_file,
3809 notify_timer));
3810}
3811
3812static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3813 struct cftype *cft)
3814{
3815 char name[CGROUP_FILE_NAME_MAX];
3816 struct kernfs_node *kn;
3817 struct lock_class_key *key = NULL;
3818 int ret;
3819
3820#ifdef CONFIG_DEBUG_LOCK_ALLOC
3821 key = &cft->lockdep_key;
3822#endif
3823 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3824 cgroup_file_mode(cft),
3825 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
3826 0, cft->kf_ops, cft,
3827 NULL, key);
3828 if (IS_ERR(kn))
3829 return PTR_ERR(kn);
3830
3831 ret = cgroup_kn_set_ugid(kn);
3832 if (ret) {
3833 kernfs_remove(kn);
3834 return ret;
3835 }
3836
3837 if (cft->file_offset) {
3838 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3839
3840 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
3841
3842 spin_lock_irq(&cgroup_file_kn_lock);
3843 cfile->kn = kn;
3844 spin_unlock_irq(&cgroup_file_kn_lock);
3845 }
3846
3847 return 0;
3848}
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
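/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.
 */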
3860static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3861 struct cgroup *cgrp, struct cftype cfts[],
3862 bool is_add)
3863{
3864 struct cftype *cft, *cft_end = NULL;
3865 int ret = 0;
3866
3867 lockdep_assert_held(&cgroup_mutex);
3868
3869restart:
3870 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3871
3872 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3873 continue;
3874 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3875 continue;
3876 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3877 continue;
3878 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3879 continue;
3880 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
3881 continue;
3882 if (is_add) {
3883 ret = cgroup_add_file(css, cgrp, cft);
3884 if (ret) {
3885 pr_warn("%s: failed to add %s, err=%d\n",
3886 __func__, cft->name, ret);
3887 cft_end = cft;
3888 is_add = false;
3889 goto restart;
3890 }
3891 } else {
3892 cgroup_rm_file(cgrp, cft);
3893 }
3894 }
3895 return ret;
3896}
3897
3898static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3899{
3900 struct cgroup_subsys *ss = cfts[0].ss;
3901 struct cgroup *root = &ss->root->cgrp;
3902 struct cgroup_subsys_state *css;
3903 int ret = 0;
3904
3905 lockdep_assert_held(&cgroup_mutex);
3906
3907
3908 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3909 struct cgroup *cgrp = css->cgroup;
3910
3911 if (!(css->flags & CSS_VISIBLE))
3912 continue;
3913
3914 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3915 if (ret)
3916 break;
3917 }
3918
3919 if (is_add && !ret)
3920 kernfs_activate(root->kn);
3921 return ret;
3922}
3923
3924static void cgroup_exit_cftypes(struct cftype *cfts)
3925{
3926 struct cftype *cft;
3927
3928 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3929
3930 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3931 kfree(cft->kf_ops);
3932 cft->kf_ops = NULL;
3933 cft->ss = NULL;
3934
3935
3936 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3937 }
3938}
3939
3940static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3941{
3942 struct cftype *cft;
3943
3944 for (cft = cfts; cft->name[0] != '\0'; cft++) {
3945 struct kernfs_ops *kf_ops;
3946
3947 WARN_ON(cft->ss || cft->kf_ops);
3948
3949 if (cft->seq_start)
3950 kf_ops = &cgroup_kf_ops;
3951 else
3952 kf_ops = &cgroup_kf_single_ops;
3953
3954
3955
3956
3957
3958 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3959 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3960 if (!kf_ops) {
3961 cgroup_exit_cftypes(cfts);
3962 return -ENOMEM;
3963 }
3964 kf_ops->atomic_write_len = cft->max_write_len;
3965 }
3966
3967 cft->kf_ops = kf_ops;
3968 cft->ss = ss;
3969 }
3970
3971 return 0;
3972}
3973
3974static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3975{
3976 lockdep_assert_held(&cgroup_mutex);
3977
3978 if (!cfts || !cfts[0].ss)
3979 return -ENOENT;
3980
3981 list_del(&cfts->node);
3982 cgroup_apply_cftypes(cfts, false);
3983 cgroup_exit_cftypes(cfts);
3984 return 0;
3985}
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
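/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */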
3998int cgroup_rm_cftypes(struct cftype *cfts)
3999{
4000 int ret;
4001
4002 mutex_lock(&cgroup_mutex);
4003 ret = cgroup_rm_cftypes_locked(cfts);
4004 mutex_unlock(&cgroup_mutex);
4005 return ret;
4006}
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
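/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */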
4022static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4023{
4024 int ret;
4025
4026 if (!cgroup_ssid_enabled(ss->id))
4027 return 0;
4028
4029 if (!cfts || cfts[0].name[0] == '\0')
4030 return 0;
4031
4032 ret = cgroup_init_cftypes(ss, cfts);
4033 if (ret)
4034 return ret;
4035
4036 mutex_lock(&cgroup_mutex);
4037
4038 list_add_tail(&cfts->node, &ss->cfts);
4039 ret = cgroup_apply_cftypes(cfts, true);
4040 if (ret)
4041 cgroup_rm_cftypes_locked(cfts);
4042
4043 mutex_unlock(&cgroup_mutex);
4044 return ret;
4045}
4046
4047
4048
4049
4050
4051
4052
4053
4054
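/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the default hierarchy.
 */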
4055int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4056{
4057 struct cftype *cft;
4058
4059 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4060 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4061 return cgroup_add_cftypes(ss, cfts);
4062}
4063
4064
4065
4066
4067
4068
4069
4070
4071
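/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Similar to cgroup_add_cftypes() but the added files are only used for
 * the legacy hierarchies.
 */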
4072int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4073{
4074 struct cftype *cft;
4075
4076 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4077 cft->flags |= __CFTYPE_NOT_ON_DFL;
4078 return cgroup_add_cftypes(ss, cfts);
4079}
4080
4081
4082
4083
4084
4085
4086
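/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.  Events
 * within CGROUP_FILE_NOTIFY_MIN_INTV of the previous one are deferred
 * through ->notify_timer instead of being delivered immediately.
 */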
4087void cgroup_file_notify(struct cgroup_file *cfile)
4088{
4089 unsigned long flags;
4090
4091 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4092 if (cfile->kn) {
4093 unsigned long last = cfile->notified_at;
4094 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4095
4096 if (time_in_range(jiffies, last, next)) {
4097 timer_reduce(&cfile->notify_timer, next);
4098 } else {
4099 kernfs_notify(cfile->kn);
4100 cfile->notified_at = jiffies;
4101 }
4102 }
4103 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4104}
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
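/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */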
4123struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4124 struct cgroup_subsys_state *parent)
4125{
4126 struct cgroup_subsys_state *next;
4127
4128 cgroup_assert_mutex_or_rcu_locked();
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150 if (!pos) {
4151 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4152 } else if (likely(!(pos->flags & CSS_RELEASED))) {
4153 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
4154 } else {
4155 list_for_each_entry_rcu(next, &parent->children, sibling)
4156 if (next->serial_nr > pos->serial_nr)
4157 break;
4158 }
4159
4160
4161
4162
4163
4164 if (&next->sibling != &parent->children)
4165 return next;
4166 return NULL;
4167}
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
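/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be visited in pre-order traversal, @root itself is returned first,
 * then each node's children are visited before its siblings.  This
 * function should be called under either cgroup_mutex or RCU read lock;
 * the only requirement is that @root and @pos are accessible.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 */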
4190struct cgroup_subsys_state *
4191css_next_descendant_pre(struct cgroup_subsys_state *pos,
4192 struct cgroup_subsys_state *root)
4193{
4194 struct cgroup_subsys_state *next;
4195
4196 cgroup_assert_mutex_or_rcu_locked();
4197
4198
4199 if (!pos)
4200 return root;
4201
4202
4203 next = css_next_child(NULL, pos);
4204 if (next)
4205 return next;
4206
4207
4208 while (pos != root) {
4209 next = css_next_child(pos, pos->parent);
4210 if (next)
4211 return next;
4212 pos = pos->parent;
4213 }
4214
4215 return NULL;
4216}
4217EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
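/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip the
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
 */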
4232struct cgroup_subsys_state *
4233css_rightmost_descendant(struct cgroup_subsys_state *pos)
4234{
4235 struct cgroup_subsys_state *last, *tmp;
4236
4237 cgroup_assert_mutex_or_rcu_locked();
4238
4239 do {
4240 last = pos;
4241
4242 pos = NULL;
4243 css_for_each_child(tmp, last)
4244 pos = tmp;
4245 } while (pos);
4246
4247 return last;
4248}
4249
4250static struct cgroup_subsys_state *
4251css_leftmost_descendant(struct cgroup_subsys_state *pos)
4252{
4253 struct cgroup_subsys_state *last;
4254
4255 do {
4256 last = pos;
4257 pos = css_next_child(NULL, pos);
4258 } while (pos);
4259
4260 return last;
4261}
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
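/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be visited in post-order traversal, each node's children are visited
 * before the node itself, and @root is returned last.  This function
 * should be called under either cgroup_mutex or RCU read lock; the only
 * requirement is that @root and @pos are accessible.
 */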
4285struct cgroup_subsys_state *
4286css_next_descendant_post(struct cgroup_subsys_state *pos,
4287 struct cgroup_subsys_state *root)
4288{
4289 struct cgroup_subsys_state *next;
4290
4291 cgroup_assert_mutex_or_rcu_locked();
4292
4293
4294 if (!pos)
4295 return css_leftmost_descendant(root);
4296
4297
4298 if (pos == root)
4299 return NULL;
4300
4301
4302 next = css_next_child(pos, pos->parent);
4303 if (next)
4304 return css_leftmost_descendant(next);
4305
4306
4307 return pos->parent;
4308}
4309
4310
4311
4312
4313
4314
4315
4316
4317
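/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */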
4318bool css_has_online_children(struct cgroup_subsys_state *css)
4319{
4320 struct cgroup_subsys_state *child;
4321 bool ret = false;
4322
4323 rcu_read_lock();
4324 css_for_each_child(child, css) {
4325 if (child->flags & CSS_ONLINE) {
4326 ret = true;
4327 break;
4328 }
4329 }
4330 rcu_read_unlock();
4331 return ret;
4332}
4333
4334static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4335{
4336 struct list_head *l;
4337 struct cgrp_cset_link *link;
4338 struct css_set *cset;
4339
4340 lockdep_assert_held(&css_set_lock);
4341
4342
4343 if (it->tcset_pos) {
4344 l = it->tcset_pos->next;
4345
4346 if (l != it->tcset_head) {
4347 it->tcset_pos = l;
4348 return container_of(l, struct css_set,
4349 threaded_csets_node);
4350 }
4351
4352 it->tcset_pos = NULL;
4353 }
4354
4355
4356 l = it->cset_pos;
4357 l = l->next;
4358 if (l == it->cset_head) {
4359 it->cset_pos = NULL;
4360 return NULL;
4361 }
4362
4363 if (it->ss) {
4364 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4365 } else {
4366 link = list_entry(l, struct cgrp_cset_link, cset_link);
4367 cset = link->cset;
4368 }
4369
4370 it->cset_pos = l;
4371
4372
4373 if (it->flags & CSS_TASK_ITER_THREADED) {
4374 if (it->cur_dcset)
4375 put_css_set_locked(it->cur_dcset);
4376 it->cur_dcset = cset;
4377 get_css_set(cset);
4378
4379 it->tcset_head = &cset->threaded_csets;
4380 it->tcset_pos = &cset->threaded_csets;
4381 }
4382
4383 return cset;
4384}
4385
4386
4387
4388
4389
4390
4391
4392static void css_task_iter_advance_css_set(struct css_task_iter *it)
4393{
4394 struct css_set *cset;
4395
4396 lockdep_assert_held(&css_set_lock);
4397
4398
4399 do {
4400 cset = css_task_iter_next_css_set(it);
4401 if (!cset) {
4402 it->task_pos = NULL;
4403 return;
4404 }
4405 } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
4406
4407 if (!list_empty(&cset->tasks))
4408 it->task_pos = cset->tasks.next;
4409 else if (!list_empty(&cset->mg_tasks))
4410 it->task_pos = cset->mg_tasks.next;
4411 else
4412 it->task_pos = cset->dying_tasks.next;
4413
4414 it->tasks_head = &cset->tasks;
4415 it->mg_tasks_head = &cset->mg_tasks;
4416 it->dying_tasks_head = &cset->dying_tasks;
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433 if (it->cur_cset) {
4434 list_del(&it->iters_node);
4435 put_css_set_locked(it->cur_cset);
4436 }
4437 get_css_set(cset);
4438 it->cur_cset = cset;
4439 list_add(&it->iters_node, &cset->task_iters);
4440}
4441
4442static void css_task_iter_skip(struct css_task_iter *it,
4443 struct task_struct *task)
4444{
4445 lockdep_assert_held(&css_set_lock);
4446
4447 if (it->task_pos == &task->cg_list) {
4448 it->task_pos = it->task_pos->next;
4449 it->flags |= CSS_TASK_ITER_SKIPPED;
4450 }
4451}
4452
4453static void css_task_iter_advance(struct css_task_iter *it)
4454{
4455 struct task_struct *task;
4456
4457 lockdep_assert_held(&css_set_lock);
4458repeat:
4459 if (it->task_pos) {
4460
4461
4462
4463
4464
4465 if (it->flags & CSS_TASK_ITER_SKIPPED)
4466 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4467 else
4468 it->task_pos = it->task_pos->next;
4469
4470 if (it->task_pos == it->tasks_head)
4471 it->task_pos = it->mg_tasks_head->next;
4472 if (it->task_pos == it->mg_tasks_head)
4473 it->task_pos = it->dying_tasks_head->next;
4474 if (it->task_pos == it->dying_tasks_head)
4475 css_task_iter_advance_css_set(it);
4476 } else {
4477
4478 css_task_iter_advance_css_set(it);
4479 }
4480
4481 if (!it->task_pos)
4482 return;
4483
4484 task = list_entry(it->task_pos, struct task_struct, cg_list);
4485
4486 if (it->flags & CSS_TASK_ITER_PROCS) {
4487
4488 if (!thread_group_leader(task))
4489 goto repeat;
4490
4491
4492 if (!atomic_read(&task->signal->live))
4493 goto repeat;
4494 } else {
4495
4496 if (task->flags & PF_EXITING)
4497 goto repeat;
4498 }
4499}
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
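/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 */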
4512void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4513 struct css_task_iter *it)
4514{
4515 memset(it, 0, sizeof(*it));
4516
4517 spin_lock_irq(&css_set_lock);
4518
4519 it->ss = css->ss;
4520 it->flags = flags;
4521
4522 if (it->ss)
4523 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4524 else
4525 it->cset_pos = &css->cgroup->cset_links;
4526
4527 it->cset_head = it->cset_pos;
4528
4529 css_task_iter_advance(it);
4530
4531 spin_unlock_irq(&css_set_lock);
4532}
4533
4534
4535
4536
4537
4538
4539
4540
4541
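/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */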
4542struct task_struct *css_task_iter_next(struct css_task_iter *it)
4543{
4544 if (it->cur_task) {
4545 put_task_struct(it->cur_task);
4546 it->cur_task = NULL;
4547 }
4548
4549 spin_lock_irq(&css_set_lock);
4550
4551
4552 if (it->flags & CSS_TASK_ITER_SKIPPED)
4553 css_task_iter_advance(it);
4554
4555 if (it->task_pos) {
4556 it->cur_task = list_entry(it->task_pos, struct task_struct,
4557 cg_list);
4558 get_task_struct(it->cur_task);
4559 css_task_iter_advance(it);
4560 }
4561
4562 spin_unlock_irq(&css_set_lock);
4563
4564 return it->cur_task;
4565}
4566
4567
4568
4569
4570
4571
4572
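/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */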
4573void css_task_iter_end(struct css_task_iter *it)
4574{
4575 if (it->cur_cset) {
4576 spin_lock_irq(&css_set_lock);
4577 list_del(&it->iters_node);
4578 put_css_set_locked(it->cur_cset);
4579 spin_unlock_irq(&css_set_lock);
4580 }
4581
4582 if (it->cur_dcset)
4583 put_css_set(it->cur_dcset);
4584
4585 if (it->cur_task)
4586 put_task_struct(it->cur_task);
4587}
4588
4589static void cgroup_procs_release(struct kernfs_open_file *of)
4590{
4591 if (of->priv) {
4592 css_task_iter_end(of->priv);
4593 kfree(of->priv);
4594 }
4595}
4596
4597static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4598{
4599 struct kernfs_open_file *of = s->private;
4600 struct css_task_iter *it = of->priv;
4601
4602 return css_task_iter_next(it);
4603}
4604
4605static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4606 unsigned int iter_flags)
4607{
4608 struct kernfs_open_file *of = s->private;
4609 struct cgroup *cgrp = seq_css(s)->cgroup;
4610 struct css_task_iter *it = of->priv;
4611
4612
4613
4614
4615
4616 if (!it) {
4617 if (WARN_ON_ONCE((*pos)++))
4618 return ERR_PTR(-EINVAL);
4619
4620 it = kzalloc(sizeof(*it), GFP_KERNEL);
4621 if (!it)
4622 return ERR_PTR(-ENOMEM);
4623 of->priv = it;
4624 css_task_iter_start(&cgrp->self, iter_flags, it);
4625 } else if (!(*pos)++) {
4626 css_task_iter_end(it);
4627 css_task_iter_start(&cgrp->self, iter_flags, it);
4628 }
4629
4630 return cgroup_procs_next(s, NULL, NULL);
4631}
4632
4633static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4634{
4635 struct cgroup *cgrp = seq_css(s)->cgroup;
4636
4637
4638
4639
4640
4641
4642
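	/*
	 * All processes of a threaded subtree belong to the domain cgroup
	 * of the subtree.  Only threads can be distributed across the
	 * subtree.  Reject reads on cgroup.procs in the subtree proper.
	 * They're always empty anyway.
	 */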
4643 if (cgroup_is_threaded(cgrp))
4644 return ERR_PTR(-EOPNOTSUPP);
4645
4646 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4647 CSS_TASK_ITER_THREADED);
4648}
4649
4650static int cgroup_procs_show(struct seq_file *s, void *v)
4651{
4652 seq_printf(s, "%d\n", task_pid_vnr(v));
4653 return 0;
4654}
4655
4656static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4657 struct cgroup *dst_cgrp,
4658 struct super_block *sb)
4659{
4660 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
4661 struct cgroup *com_cgrp = src_cgrp;
4662 struct inode *inode;
4663 int ret;
4664
4665 lockdep_assert_held(&cgroup_mutex);
4666
4667
4668 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4669 com_cgrp = cgroup_parent(com_cgrp);
4670
4671
4672 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
4673 if (!inode)
4674 return -ENOMEM;
4675
4676 ret = inode_permission(inode, MAY_WRITE);
4677 iput(inode);
4678 if (ret)
4679 return ret;
4680
4681
4682
4683
4684
4685 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4686 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4687 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4688 return -ENOENT;
4689
4690 return 0;
4691}
4692
4693static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
4694 char *buf, size_t nbytes, loff_t off)
4695{
4696 struct cgroup *src_cgrp, *dst_cgrp;
4697 struct task_struct *task;
4698 ssize_t ret;
4699 bool locked;
4700
4701 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4702 if (!dst_cgrp)
4703 return -ENODEV;
4704
4705 task = cgroup_procs_write_start(buf, true, &locked);
4706 ret = PTR_ERR_OR_ZERO(task);
4707 if (ret)
4708 goto out_unlock;
4709
4710
4711 spin_lock_irq(&css_set_lock);
4712 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4713 spin_unlock_irq(&css_set_lock);
4714
4715 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4716 of->file->f_path.dentry->d_sb);
4717 if (ret)
4718 goto out_finish;
4719
4720 ret = cgroup_attach_task(dst_cgrp, task, true);
4721
4722out_finish:
4723 cgroup_procs_write_finish(task, locked);
4724out_unlock:
4725 cgroup_kn_unlock(of->kn);
4726
4727 return ret ?: nbytes;
4728}
4729
4730static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
4731{
4732 return __cgroup_procs_start(s, pos, 0);
4733}
4734
4735static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
4736 char *buf, size_t nbytes, loff_t off)
4737{
4738 struct cgroup *src_cgrp, *dst_cgrp;
4739 struct task_struct *task;
4740 ssize_t ret;
4741 bool locked;
4742
4743 buf = strstrip(buf);
4744
4745 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
4746 if (!dst_cgrp)
4747 return -ENODEV;
4748
4749 task = cgroup_procs_write_start(buf, false, &locked);
4750 ret = PTR_ERR_OR_ZERO(task);
4751 if (ret)
4752 goto out_unlock;
4753
4754
4755 spin_lock_irq(&css_set_lock);
4756 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
4757 spin_unlock_irq(&css_set_lock);
4758
4759
4760 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4761 of->file->f_path.dentry->d_sb);
4762 if (ret)
4763 goto out_finish;
4764
4765
4766 ret = -EOPNOTSUPP;
4767 if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
4768 goto out_finish;
4769
4770 ret = cgroup_attach_task(dst_cgrp, task, false);
4771
4772out_finish:
4773 cgroup_procs_write_finish(task, locked);
4774out_unlock:
4775 cgroup_kn_unlock(of->kn);
4776
4777 return ret ?: nbytes;
4778}
4779
4780
4781static struct cftype cgroup_base_files[] = {
4782 {
4783 .name = "cgroup.type",
4784 .flags = CFTYPE_NOT_ON_ROOT,
4785 .seq_show = cgroup_type_show,
4786 .write = cgroup_type_write,
4787 },
4788 {
4789 .name = "cgroup.procs",
4790 .flags = CFTYPE_NS_DELEGATABLE,
4791 .file_offset = offsetof(struct cgroup, procs_file),
4792 .release = cgroup_procs_release,
4793 .seq_start = cgroup_procs_start,
4794 .seq_next = cgroup_procs_next,
4795 .seq_show = cgroup_procs_show,
4796 .write = cgroup_procs_write,
4797 },
4798 {
4799 .name = "cgroup.threads",
4800 .flags = CFTYPE_NS_DELEGATABLE,
4801 .release = cgroup_procs_release,
4802 .seq_start = cgroup_threads_start,
4803 .seq_next = cgroup_procs_next,
4804 .seq_show = cgroup_procs_show,
4805 .write = cgroup_threads_write,
4806 },
4807 {
4808 .name = "cgroup.controllers",
4809 .seq_show = cgroup_controllers_show,
4810 },
4811 {
4812 .name = "cgroup.subtree_control",
4813 .flags = CFTYPE_NS_DELEGATABLE,
4814 .seq_show = cgroup_subtree_control_show,
4815 .write = cgroup_subtree_control_write,
4816 },
4817 {
4818 .name = "cgroup.events",
4819 .flags = CFTYPE_NOT_ON_ROOT,
4820 .file_offset = offsetof(struct cgroup, events_file),
4821 .seq_show = cgroup_events_show,
4822 },
4823 {
4824 .name = "cgroup.max.descendants",
4825 .seq_show = cgroup_max_descendants_show,
4826 .write = cgroup_max_descendants_write,
4827 },
4828 {
4829 .name = "cgroup.max.depth",
4830 .seq_show = cgroup_max_depth_show,
4831 .write = cgroup_max_depth_write,
4832 },
4833 {
4834 .name = "cgroup.stat",
4835 .seq_show = cgroup_stat_show,
4836 },
4837 {
4838 .name = "cgroup.freeze",
4839 .flags = CFTYPE_NOT_ON_ROOT,
4840 .seq_show = cgroup_freeze_show,
4841 .write = cgroup_freeze_write,
4842 },
4843 {
4844 .name = "cpu.stat",
4845 .flags = CFTYPE_NOT_ON_ROOT,
4846 .seq_show = cpu_stat_show,
4847 },
4848#ifdef CONFIG_PSI
4849 {
4850 .name = "io.pressure",
4851 .seq_show = cgroup_io_pressure_show,
4852 .write = cgroup_io_pressure_write,
4853 .poll = cgroup_pressure_poll,
4854 .release = cgroup_pressure_release,
4855 },
4856 {
4857 .name = "memory.pressure",
4858 .seq_show = cgroup_memory_pressure_show,
4859 .write = cgroup_memory_pressure_write,
4860 .poll = cgroup_pressure_poll,
4861 .release = cgroup_pressure_release,
4862 },
4863 {
4864 .name = "cpu.pressure",
4865 .seq_show = cgroup_cpu_pressure_show,
4866 .write = cgroup_cpu_pressure_write,
4867 .poll = cgroup_pressure_poll,
4868 .release = cgroup_pressure_release,
4869 },
4870#endif
4871 { }
4872};
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
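/*
 * css destruction is four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_rwork_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex sequence.
 */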
4896static void css_free_rwork_fn(struct work_struct *work)
4897{
4898 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
4899 struct cgroup_subsys_state, destroy_rwork);
4900 struct cgroup_subsys *ss = css->ss;
4901 struct cgroup *cgrp = css->cgroup;
4902
4903 percpu_ref_exit(&css->refcnt);
4904
4905 if (ss) {
4906
4907 struct cgroup_subsys_state *parent = css->parent;
4908 int id = css->id;
4909
4910 ss->css_free(css);
4911 cgroup_idr_remove(&ss->css_idr, id);
4912 cgroup_put(cgrp);
4913
4914 if (parent)
4915 css_put(parent);
4916 } else {
4917
4918 atomic_dec(&cgrp->root->nr_cgrps);
4919 cgroup1_pidlist_destroy_all(cgrp);
4920 cancel_work_sync(&cgrp->release_agent_work);
4921
4922 if (cgroup_parent(cgrp)) {
4923
4924
4925
4926
4927
4928
4929 cgroup_put(cgroup_parent(cgrp));
4930 kernfs_put(cgrp->kn);
4931 psi_cgroup_free(cgrp);
4932 if (cgroup_on_dfl(cgrp))
4933 cgroup_rstat_exit(cgrp);
4934 kfree(cgrp);
4935 } else {
4936
4937
4938
4939
4940
4941 cgroup_destroy_root(cgrp->root);
4942 }
4943 }
4944}
4945
4946static void css_release_work_fn(struct work_struct *work)
4947{
4948 struct cgroup_subsys_state *css =
4949 container_of(work, struct cgroup_subsys_state, destroy_work);
4950 struct cgroup_subsys *ss = css->ss;
4951 struct cgroup *cgrp = css->cgroup;
4952
4953 mutex_lock(&cgroup_mutex);
4954
4955 css->flags |= CSS_RELEASED;
4956 list_del_rcu(&css->sibling);
4957
4958 if (ss) {
4959
4960 if (!list_empty(&css->rstat_css_node)) {
4961 cgroup_rstat_flush(cgrp);
4962 list_del_rcu(&css->rstat_css_node);
4963 }
4964
4965 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4966 if (ss->css_released)
4967 ss->css_released(css);
4968 } else {
4969 struct cgroup *tcgrp;
4970
4971
4972 TRACE_CGROUP_PATH(release, cgrp);
4973
4974 if (cgroup_on_dfl(cgrp))
4975 cgroup_rstat_flush(cgrp);
4976
4977 spin_lock_irq(&css_set_lock);
4978 for (tcgrp = cgroup_parent(cgrp); tcgrp;
4979 tcgrp = cgroup_parent(tcgrp))
4980 tcgrp->nr_dying_descendants--;
4981 spin_unlock_irq(&css_set_lock);
4982
4983
4984
4985
4986
4987
4988
4989
4990 if (cgrp->kn)
4991 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4992 NULL);
4993 }
4994
4995 mutex_unlock(&cgroup_mutex);
4996
4997 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
4998 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
4999}
5000
5001static void css_release(struct percpu_ref *ref)
5002{
5003 struct cgroup_subsys_state *css =
5004 container_of(ref, struct cgroup_subsys_state, refcnt);
5005
5006 INIT_WORK(&css->destroy_work, css_release_work_fn);
5007 queue_work(cgroup_destroy_wq, &css->destroy_work);
5008}
5009
5010static void init_and_link_css(struct cgroup_subsys_state *css,
5011 struct cgroup_subsys *ss, struct cgroup *cgrp)
5012{
5013 lockdep_assert_held(&cgroup_mutex);
5014
5015 cgroup_get_live(cgrp);
5016
5017 memset(css, 0, sizeof(*css));
5018 css->cgroup = cgrp;
5019 css->ss = ss;
5020 css->id = -1;
5021 INIT_LIST_HEAD(&css->sibling);
5022 INIT_LIST_HEAD(&css->children);
5023 INIT_LIST_HEAD(&css->rstat_css_node);
5024 css->serial_nr = css_serial_nr_next++;
5025 atomic_set(&css->online_cnt, 0);
5026
5027 if (cgroup_parent(cgrp)) {
5028 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5029 css_get(css->parent);
5030 }
5031
5032 if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
5033 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5034
5035 BUG_ON(cgroup_css(cgrp, ss));
5036}
5037
5038
5039static int online_css(struct cgroup_subsys_state *css)
5040{
5041 struct cgroup_subsys *ss = css->ss;
5042 int ret = 0;
5043
5044 lockdep_assert_held(&cgroup_mutex);
5045
5046 if (ss->css_online)
5047 ret = ss->css_online(css);
5048 if (!ret) {
5049 css->flags |= CSS_ONLINE;
5050 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5051
5052 atomic_inc(&css->online_cnt);
5053 if (css->parent)
5054 atomic_inc(&css->parent->online_cnt);
5055 }
5056 return ret;
5057}
5058
5059
5060static void offline_css(struct cgroup_subsys_state *css)
5061{
5062 struct cgroup_subsys *ss = css->ss;
5063
5064 lockdep_assert_held(&cgroup_mutex);
5065
5066 if (!(css->flags & CSS_ONLINE))
5067 return;
5068
5069 if (ss->css_offline)
5070 ss->css_offline(css);
5071
5072 css->flags &= ~CSS_ONLINE;
5073 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5074
5075 wake_up_all(&css->cgroup->offline_waitq);
5076}
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
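/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Returns the new css on success or an ERR_PTR() value
 * on failure.
 */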
5087static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5088 struct cgroup_subsys *ss)
5089{
5090 struct cgroup *parent = cgroup_parent(cgrp);
5091 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5092 struct cgroup_subsys_state *css;
5093 int err;
5094
5095 lockdep_assert_held(&cgroup_mutex);
5096
5097 css = ss->css_alloc(parent_css);
5098 if (!css)
5099 css = ERR_PTR(-ENOMEM);
5100 if (IS_ERR(css))
5101 return css;
5102
5103 init_and_link_css(css, ss, cgrp);
5104
5105 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5106 if (err)
5107 goto err_free_css;
5108
5109 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5110 if (err < 0)
5111 goto err_free_css;
5112 css->id = err;
5113
5114
5115 list_add_tail_rcu(&css->sibling, &parent_css->children);
5116 cgroup_idr_replace(&ss->css_idr, css, css->id);
5117
5118 err = online_css(css);
5119 if (err)
5120 goto err_list_del;
5121
5122 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
5123 cgroup_parent(parent)) {
5124 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
5125 current->comm, current->pid, ss->name);
5126 if (!strcmp(ss->name, "memory"))
5127 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
5128 ss->warned_broken_hierarchy = true;
5129 }
5130
5131 return css;
5132
5133err_list_del:
5134 list_del_rcu(&css->sibling);
5135err_free_css:
5136 list_del_rcu(&css->rstat_css_node);
5137 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5138 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5139 return ERR_PTR(err);
5140}
5141
5142
5143
5144
5145
5146
5147static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5148 umode_t mode)
5149{
5150 struct cgroup_root *root = parent->root;
5151 struct cgroup *cgrp, *tcgrp;
5152 struct kernfs_node *kn;
5153 int level = parent->level + 1;
5154 int ret;
5155
5156
5157 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5158 GFP_KERNEL);
5159 if (!cgrp)
5160 return ERR_PTR(-ENOMEM);
5161
5162 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5163 if (ret)
5164 goto out_free_cgrp;
5165
5166 if (cgroup_on_dfl(parent)) {
5167 ret = cgroup_rstat_init(cgrp);
5168 if (ret)
5169 goto out_cancel_ref;
5170 }
5171
5172
5173 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5174 if (IS_ERR(kn)) {
5175 ret = PTR_ERR(kn);
5176 goto out_stat_exit;
5177 }
5178 cgrp->kn = kn;
5179
5180 init_cgroup_housekeeping(cgrp);
5181
5182 cgrp->self.parent = &parent->self;
5183 cgrp->root = root;
5184 cgrp->level = level;
5185
5186 ret = psi_cgroup_alloc(cgrp);
5187 if (ret)
5188 goto out_kernfs_remove;
5189
5190 ret = cgroup_bpf_inherit(cgrp);
5191 if (ret)
5192 goto out_psi_free;
5193
5194
5195
5196
5197
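	/*
	 * New cgroup inherits effective freeze counter, and
	 * if the parent has to be frozen, the child has too.
	 */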
5198 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5199 if (cgrp->freezer.e_freeze) {
5200
5201
5202
5203
5204
5205
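		/*
		 * Set the CGRP_FREEZE flag, so when a process will be
		 * attached to the child cgroup, it will become frozen.
		 * At this point the new cgroup is unpopulated, so we can
		 * consider it frozen immediately.
		 */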
5206 set_bit(CGRP_FREEZE, &cgrp->flags);
5207 set_bit(CGRP_FROZEN, &cgrp->flags);
5208 }
5209
5210 spin_lock_irq(&css_set_lock);
5211 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5212 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
5213
5214 if (tcgrp != cgrp) {
5215 tcgrp->nr_descendants++;
5216
5217
5218
5219
5220
5221
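			/*
			 * If the new cgroup is frozen, all ancestor cgroups
			 * get a new frozen descendant, but their state can't
			 * change because of this.
			 */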
5222 if (cgrp->freezer.e_freeze)
5223 tcgrp->freezer.nr_frozen_descendants++;
5224 }
5225 }
5226 spin_unlock_irq(&css_set_lock);
5227
5228 if (notify_on_release(parent))
5229 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5230
5231 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5232 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5233
5234 cgrp->self.serial_nr = css_serial_nr_next++;
5235
5236
5237 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5238 atomic_inc(&root->nr_cgrps);
5239 cgroup_get_live(parent);
5240
5241
5242
5243
5244
5245 if (!cgroup_on_dfl(cgrp))
5246 cgrp->subtree_control = cgroup_control(cgrp);
5247
5248 cgroup_propagate_control(cgrp);
5249
5250 return cgrp;
5251
5252out_psi_free:
5253 psi_cgroup_free(cgrp);
5254out_kernfs_remove:
5255 kernfs_remove(cgrp->kn);
5256out_stat_exit:
5257 if (cgroup_on_dfl(parent))
5258 cgroup_rstat_exit(cgrp);
5259out_cancel_ref:
5260 percpu_ref_exit(&cgrp->self.refcnt);
5261out_free_cgrp:
5262 kfree(cgrp);
5263 return ERR_PTR(ret);
5264}
5265
5266static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5267{
5268 struct cgroup *cgroup;
5269 int ret = false;
5270 int level = 1;
5271
5272 lockdep_assert_held(&cgroup_mutex);
5273
5274 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5275 if (cgroup->nr_descendants >= cgroup->max_descendants)
5276 goto fail;
5277
5278 if (level > cgroup->max_depth)
5279 goto fail;
5280
5281 level++;
5282 }
5283
5284 ret = true;
5285fail:
5286 return ret;
5287}
5288
5289int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5290{
5291 struct cgroup *parent, *cgrp;
5292 int ret;
5293
5294
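	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */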
5295 if (strchr(name, '\n'))
5296 return -EINVAL;
5297
5298 parent = cgroup_kn_lock_live(parent_kn, false);
5299 if (!parent)
5300 return -ENODEV;
5301
5302 if (!cgroup_check_hierarchy_limits(parent)) {
5303 ret = -EAGAIN;
5304 goto out_unlock;
5305 }
5306
5307 cgrp = cgroup_create(parent, name, mode);
5308 if (IS_ERR(cgrp)) {
5309 ret = PTR_ERR(cgrp);
5310 goto out_unlock;
5311 }
5312
5313
5314
5315
5316
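	/*
	 * This extra ref will be put in cgroup_free_fn() and guarantees
	 * that @cgrp->kn is always accessible.
	 */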
5317 kernfs_get(cgrp->kn);
5318
5319 ret = cgroup_kn_set_ugid(cgrp->kn);
5320 if (ret)
5321 goto out_destroy;
5322
5323 ret = css_populate_dir(&cgrp->self);
5324 if (ret)
5325 goto out_destroy;
5326
5327 ret = cgroup_apply_control_enable(cgrp);
5328 if (ret)
5329 goto out_destroy;
5330
5331 TRACE_CGROUP_PATH(mkdir, cgrp);
5332
5333
5334 kernfs_activate(cgrp->kn);
5335
5336 ret = 0;
5337 goto out_unlock;
5338
5339out_destroy:
5340 cgroup_destroy_locked(cgrp);
5341out_unlock:
5342 cgroup_kn_unlock(parent_kn);
5343 return ret;
5344}
5345
5346
5347
5348
5349
5350
5351static void css_killed_work_fn(struct work_struct *work)
5352{
5353 struct cgroup_subsys_state *css =
5354 container_of(work, struct cgroup_subsys_state, destroy_work);
5355
5356 mutex_lock(&cgroup_mutex);
5357
5358 do {
5359 offline_css(css);
5360 css_put(css);
5361
5362 css = css->parent;
5363 } while (css && atomic_dec_and_test(&css->online_cnt));
5364
5365 mutex_unlock(&cgroup_mutex);
5366}
5367
5368
5369static void css_killed_ref_fn(struct percpu_ref *ref)
5370{
5371 struct cgroup_subsys_state *css =
5372 container_of(ref, struct cgroup_subsys_state, refcnt);
5373
5374 if (atomic_dec_and_test(&css->online_cnt)) {
5375 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5376 queue_work(cgroup_destroy_wq, &css->destroy_work);
5377 }
5378}
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
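/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */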
5389static void kill_css(struct cgroup_subsys_state *css)
5390{
5391 lockdep_assert_held(&cgroup_mutex);
5392
5393 if (css->flags & CSS_DYING)
5394 return;
5395
5396 css->flags |= CSS_DYING;
5397
5398
5399
5400
5401
5402 css_clear_dir(css);
5403
5404
5405
5406
5407
5408 css_get(css);
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5421}
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
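/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  It's the caller's responsibility to
 * perform the rest.  The caller is responsible for grabbing and releasing
 * cgroup_mutex.
 */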
5447static int cgroup_destroy_locked(struct cgroup *cgrp)
5448 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5449{
5450 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5451 struct cgroup_subsys_state *css;
5452 struct cgrp_cset_link *link;
5453 int ssid;
5454
5455 lockdep_assert_held(&cgroup_mutex);
5456
5457
5458
5459
5460
5461 if (cgroup_is_populated(cgrp))
5462 return -EBUSY;
5463
5464
5465
5466
5467
5468
5469 if (css_has_online_children(&cgrp->self))
5470 return -EBUSY;
5471
5472
5473
5474
5475
5476
5477
5478 cgrp->self.flags &= ~CSS_ONLINE;
5479
5480 spin_lock_irq(&css_set_lock);
5481 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5482 link->cset->dead = true;
5483 spin_unlock_irq(&css_set_lock);
5484
5485
5486 for_each_css(css, ssid, cgrp)
5487 kill_css(css);
5488
5489
5490 css_clear_dir(&cgrp->self);
5491 kernfs_remove(cgrp->kn);
5492
5493 if (parent && cgroup_is_threaded(cgrp))
5494 parent->nr_threaded_children--;
5495
5496 spin_lock_irq(&css_set_lock);
5497 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5498 tcgrp->nr_descendants--;
5499 tcgrp->nr_dying_descendants++;
5500
5501
5502
5503
5504 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5505 tcgrp->freezer.nr_frozen_descendants--;
5506 }
5507 spin_unlock_irq(&css_set_lock);
5508
5509 cgroup1_check_for_release(parent);
5510
5511 cgroup_bpf_offline(cgrp);
5512
5513
5514 percpu_ref_kill(&cgrp->self.refcnt);
5515
5516 return 0;
}

int cgroup_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp;
	int ret = 0;

	cgrp = cgroup_kn_lock_live(kn, false);
	if (!cgrp)
		return 0;

	ret = cgroup_destroy_locked(cgrp);
	if (!ret)
		TRACE_CGROUP_PATH(rmdir, cgrp);

	cgroup_kn_unlock(kn);
	return ret;
}

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
	.show_options		= cgroup_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};
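
/*
 * Note: these callbacks are how ordinary VFS operations reach cgroup
 * core.  On a mounted hierarchy, e.g. "mkdir /sys/fs/cgroup/foo" is
 * dispatched by kernfs to cgroup_mkdir() above, and rmdir(2) on the
 * same directory to cgroup_rmdir().
 */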

static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

	/*
	 * Root csses are never destroyed and we can't initialize
	 * percpu_ref during early init.  Disable refcnting.
	 */
	css->flags |= CSS_NO_REF;

	if (early) {
		/* allocation can't be done safely during early init */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's root cgroup. */
	init_css_set.subsys[ss->id] = css;

	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
	have_release_callback |= (bool)ss->release << ss->id;
	have_canfork_callback |= (bool)ss->can_fork << ss->id;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	static struct cgroup_fs_context __initdata ctx;
	struct cgroup_subsys *ss;
	int i;

	ctx.root = &cgrp_dfl_root;
	init_cgroup_root(&ctx);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	for_each_subsys(ss, i) {
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];

		if (ss->early_init)
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

static u16 cgroup_disable_mask __initdata;

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

	cgroup_rstat_boot();

	/*
	 * The latency of the synchronize_rcu() is too high for cgroups,
	 * avoid it at the cost of forcing all readers into the slow path.
	 */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (cgroup_disable_mask & (1 << ssid)) {
			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
			printk(KERN_INFO "Disabling %s control group subsystem\n",
			       ss->name);
			continue;
		}

		if (cgroup1_ssid_disabled(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		/* implicit controllers must be threaded too */
		WARN_ON(ss->implicit_on_dfl && !ss->threaded);

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->threaded)
			cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);

		mutex_lock(&cgroup_mutex);
		css_populate_dir(init_css_set.subsys[ssid]);
		mutex_unlock(&cgroup_mutex);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
	WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

	return 0;
}

static int __init cgroup_wq_init(void)
{
	/*
	 * There isn't much point in executing destruction path in
	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
	 * Use 1 for @max_active.
	 *
	 * We would prefer to do this in cgroup_init() above, but that
	 * is called before init_workqueues(): so leave this until after.
	 */
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
	BUG_ON(!cgroup_destroy_wq);
	return 0;
}
core_initcall(cgroup_wq_init);

/*
 * cgroup_path_from_kernfs_id - fill @buf with the path of the cgroup
 * matching kernfs node @id on the default hierarchy, leaving @buf
 * untouched if the node is gone.
 */
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
	struct kernfs_node *kn;

	kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
	if (!kn)
		return;
	kernfs_path(kn, buf, buflen);
	kernfs_put(kn);
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}
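
/*
 * Illustrative output for a task on a mixed v1/v2 setup; each line is
 * "hierarchy-ID:controller-list:path" and "0::" marks the v2 entry:
 *
 *	4:cpu,cpuacct:/user.slice
 *	1:name=systemd:/user.slice/session-1.scope
 *	0::/user.slice/session-1.scope
 */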

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the task in question.
 *
 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
 * returns an error, the fork aborts with that error code. This allows for
 * a cgroup subsystem to conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i, j, ret;

	do_each_subsys_mask(ss, i, have_canfork_callback) {
		ret = ss->can_fork(child);
		if (ret)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	for_each_subsys(ss, j) {
		if (j >= i)
			break;
		if (ss->cancel_fork)
			ss->cancel_fork(child);
	}

	return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the task in question
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded.
 */
void cgroup_cancel_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		if (ss->cancel_fork)
			ss->cancel_fork(child);
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary and
 * call the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	spin_lock_irq(&css_set_lock);

	WARN_ON_ONCE(!list_empty(&child->cg_list));
	cset = task_css_set(current);	/* current is @child's parent */
	get_css_set(cset);
	cset->nr_tasks++;
	css_set_move_task(child, NULL, cset, false);

	/*
	 * If the cgroup has to be frozen, the new task has too.  Let's set
	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
	 * frozen state.
	 */
	if (unlikely(cgroup_task_freeze(child))) {
		spin_lock(&child->sighand->siglock);
		WARN_ON_ONCE(child->frozen);
		child->jobctl |= JOBCTL_TRAP_FREEZE;
		spin_unlock(&child->sighand->siglock);

		/*
		 * Calling cgroup_update_frozen() isn't required here,
		 * because it will be called anyway a bit later from
		 * do_freezer_trap().  So we avoid cgroup's transient switch
		 * from the frozen state and back.
		 */
	}

	spin_unlock_irq(&css_set_lock);

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	do_each_subsys_mask(ss, i, have_fork_callback) {
		ss->fork(child);
	} while_each_subsys_mask();
}
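
/*
 * Sketch of the expected ordering of the fork hooks above (the actual
 * call sites live in copy_process() in kernel/fork.c):
 *
 *	cgroup_fork(child);
 *	ret = cgroup_can_fork(child);
 *	if (ret)
 *		cgroup_cancel_fork(child);	// fork is being aborted
 *	else
 *		cgroup_post_fork(child);	// task becomes visible
 */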

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	spin_lock_irq(&css_set_lock);

	WARN_ON_ONCE(list_empty(&tsk->cg_list));
	cset = task_css_set(tsk);
	css_set_move_task(tsk, cset, NULL, false);
	list_add_tail(&tsk->cg_list, &cset->dying_tasks);
	cset->nr_tasks--;

	WARN_ON_ONCE(cgroup_task_frozen(tsk));
	if (unlikely(cgroup_task_freeze(tsk)))
		cgroup_update_frozen(task_dfl_cgroup(tsk));

	spin_unlock_irq(&css_set_lock);

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

void cgroup_release(struct task_struct *task)
{
	struct cgroup_subsys *ss;
	int ssid;

	do_each_subsys_mask(ss, ssid, have_release_callback) {
		ss->release(task);
	} while_each_subsys_mask();

	spin_lock_irq(&css_set_lock);
	css_set_skip_task_iters(task_css_set(task), task);
	list_del_init(&task->cg_list);
	spin_unlock_irq(&css_set_lock);
}

void cgroup_free(struct task_struct *task)
{
	struct css_set *cset = task_css_set(task);

	put_css_set(cset);
}

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;
			cgroup_disable_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
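
/*
 * Example: booting with "cgroup_disable=memory" sets the memory
 * controller's bit in cgroup_disable_mask; cgroup_init() then disables
 * its static key and leaves it out of cgrp_dfl_root.subsys_mask.
 */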

void __init __weak enable_debug_cgroup(void) { }

static int __init enable_cgroup_debug(char *str)
{
	cgroup_debug = true;
	enable_debug_cgroup();
	return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}
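
/*
 * Illustrative caller pattern; the returned css is only stable within
 * the RCU section unless the caller pins it:
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, ss);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 */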

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Find the cgroup at @path on the default hierarchy, increment its
 * reference count and return it.  Returns pointer to the found cgroup on
 * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
 * if @path points to a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
	struct kernfs_node *kn;
	struct cgroup *cgrp;

	mutex_lock(&cgroup_mutex);

	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
	if (kn) {
		if (kernfs_type(kn) == KERNFS_DIR) {
			cgrp = kn->priv;
			cgroup_get_live(cgrp);
		} else {
			cgrp = ERR_PTR(-ENOTDIR);
		}
		kernfs_put(kn);
	} else {
		cgrp = ERR_PTR(-ENOENT);
	}

	mutex_unlock(&cgroup_mutex);
	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
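
/*
 * Illustrative use by another kernel subsystem; the reference must be
 * dropped with cgroup_put() once the caller is done:
 *
 *	struct cgroup *cgrp = cgroup_get_from_path("/my.slice/my.scope");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */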

/**
 * cgroup_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2 dir)
 *
 * Find the cgroup from a fd which should be obtained
 * by opening a cgroup directory.  Returns a pointer to the
 * cgroup on success. ERR_PTR is returned if the cgroup
 * cannot be found.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;
	struct file *f;

	f = fget_raw(fd);
	if (!f)
		return ERR_PTR(-EBADF);

	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
	fput(f);
	if (IS_ERR(css))
		return ERR_CAST(css);

	cgrp = css->cgroup;
	if (!cgroup_on_dfl(cgrp)) {
		cgroup_put(cgrp);
		return ERR_PTR(-EBADF);
	}

	return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
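
/*
 * This is, for example, how the bpf(2) attach path resolves a cgroup fd
 * passed in from userspace.  Note that only default-hierarchy (v2)
 * cgroups are accepted; an fd for a v1 hierarchy directory fails with
 * -EBADF after the acquired reference is dropped.
 */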

static u64 power_of_ten(int power)
{
	u64 v = 1;
	while (power--)
		v *= 10;
	return v;
}

/**
 * cgroup_parse_float - parse a floating number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with decimal point right shifted @dec_shift times.  For example,
 * "12.3456" with @dec_shift == 3 becomes 12346 (excess fraction digits
 * are rounded to nearest).  Returns 0 on success, -errno otherwise.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
	s64 whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
		return -EINVAL;
	if (frac < 0)
		return -EINVAL;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < dec_shift)
		frac *= power_of_ten(dec_shift - flen);
	else
		frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));

	*v = whole * power_of_ten(dec_shift) + frac;
	return 0;
}
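
/*
 * Worked examples of the arithmetic above:
 *
 *	s64 v;
 *	cgroup_parse_float("12.345", 2, &v);	// frac=345, flen=3 > 2:
 *						// frac -> 35 (rounded), v == 1235
 *	cgroup_parse_float("0.5", 4, &v);	// flen=1 < 4: frac *= 10^3,
 *						// v == 5000
 */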

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;

void cgroup_sk_alloc_disable(void)
{
	if (cgroup_sk_alloc_disabled)
		return;
	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
	cgroup_sk_alloc_disabled = true;
}

#else

#define cgroup_sk_alloc_disabled	false

#endif

void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
	if (cgroup_sk_alloc_disabled)
		return;

	/* Socket clone path */
	if (skcd->val) {
		/*
		 * We might be cloning a socket which is left in an empty
		 * cgroup and the cgroup might have already been rmdir'd.
		 * Don't use cgroup_get_live().
		 */
		cgroup_get(sock_cgroup_ptr(skcd));
		cgroup_bpf_get(sock_cgroup_ptr(skcd));
		return;
	}

	rcu_read_lock();

	while (true) {
		struct css_set *cset;

		cset = task_css_set(current);
		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
			skcd->val = (unsigned long)cset->dfl_cgrp;
			cgroup_bpf_get(cset->dfl_cgrp);
			break;
		}
		cpu_relax();
	}

	rcu_read_unlock();
}

void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgrp = sock_cgroup_ptr(skcd);

	cgroup_bpf_put(cgrp);
	cgroup_put(cgrp);
}

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type, u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
		      enum bpf_attach_type type, u32 flags)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	/* @flags is unused on the detach path */
	ret = __cgroup_bpf_detach(cgrp, prog, type);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		     union bpf_attr __user *uattr)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = __cgroup_bpf_query(cgrp, attr, uattr);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
#endif /* CONFIG_CGROUP_BPF */

#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
				      ssize_t size, const char *prefix)
{
	struct cftype *cft;
	ssize_t ret = 0;

	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
			continue;

		if (prefix)
			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);

		if (WARN_ON(ret >= size))
			break;
	}

	return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	struct cgroup_subsys *ss;
	int ssid;
	ssize_t ret = 0;

	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
				     NULL);

	for_each_subsys(ss, ssid)
		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
					      PAGE_SIZE - ret,
					      cgroup_subsys_name[ssid]);

	return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
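
/*
 * Reading /sys/kernel/cgroup/delegate yields the delegatable files one
 * per line; the base entries include "cgroup.procs" and
 * "cgroup.subtree_control", followed by "<controller>.<file>" entries
 * for any controller cftypes flagged CFTYPE_NS_DELEGATABLE.
 */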

static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
			     char *buf)
{
	return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

static struct attribute *cgroup_sysfs_attrs[] = {
	&cgroup_delegate_attr.attr,
	&cgroup_features_attr.attr,
	NULL,
};

static const struct attribute_group cgroup_sysfs_attr_group = {
	.attrs = cgroup_sysfs_attrs,
	.name = "cgroup",
};

static int __init cgroup_sysfs_init(void)
{
	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);

#endif /* CONFIG_SYSFS */