1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/cgroup.h>
30#include <linux/cred.h>
31#include <linux/ctype.h>
32#include <linux/errno.h>
33#include <linux/init_task.h>
34#include <linux/kernel.h>
35#include <linux/list.h>
36#include <linux/mm.h>
37#include <linux/mutex.h>
38#include <linux/mount.h>
39#include <linux/pagemap.h>
40#include <linux/proc_fs.h>
41#include <linux/rcupdate.h>
42#include <linux/sched.h>
43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h>
46#include <linux/magic.h>
47#include <linux/spinlock.h>
48#include <linux/string.h>
49#include <linux/sort.h>
50#include <linux/kmod.h>
51#include <linux/module.h>
52#include <linux/delayacct.h>
53#include <linux/cgroupstats.h>
54#include <linux/hashtable.h>
55#include <linux/namei.h>
56#include <linux/pid_namespace.h>
57#include <linux/idr.h>
58#include <linux/vmalloc.h>
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h>
62#include <linux/kthread.h>
63
64#include <linux/atomic.h>
65
66
/* bias added to css->refcnt while a css is being deactivated (see css_unbias_refcnt()) */
#define CSS_DEACT_BIAS		INT_MIN
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/*
 * cgroup_mutex is the master lock for cgroup core state.  It is exported
 * under CONFIG_PROVE_RCU so that RCU lockdep annotations elsewhere can
 * check it; otherwise it stays file-local.
 */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
EXPORT_SYMBOL_GPL(cgroup_mutex);
#else
static DEFINE_MUTEX(cgroup_mutex);
#endif

/* nests inside cgroup_mutex; taken around root option/name handling below */
static DEFINE_MUTEX(cgroup_root_mutex);
93
94
95
96
97
98
99
/* workqueue that runs the deferred destruction work (see cgroup_free_rcu()) */
static struct workqueue_struct *cgroup_destroy_wq;
101
102
103
104
105
106
107
/*
 * Table of cgroup subsystem pointers, generated from cgroup_subsys.h.
 * Only built-in subsystems are populated statically (IS_BUILTIN); other
 * slots may be NULL — every user of subsys[] below checks for that.
 */
#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
#define ENABLE_NETPRIO_NOW
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
#undef ENABLE_NETPRIO_NOW
115
116
117
118
119
120
/* the dummy hierarchy; hosts every subsystem not bound to a real hierarchy */
static struct cgroupfs_root rootnode;
122
123
124
125
/*
 * A cfent (cgroup file entry) ties a cftype to the dentry that
 * instantiates it inside a cgroup's directory.
 */
struct cfent {
	struct list_head	node;	/* anchored on cgroup->files */
	struct dentry		*dentry;
	struct cftype		*type;

	/* file xattrs */
	struct simple_xattrs	xattrs;
};
134
135
136
137
/*
 * A cgroup_event is an eventfd-based notification bound to a cgroup
 * control file.
 */
struct cgroup_event {
	/*
	 * Cgroup which the event belongs to.
	 */
	struct cgroup *cgrp;
	/*
	 * Control file which the event is associated with.
	 */
	struct cftype *cft;
	/*
	 * eventfd used to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Entry in the owning cgroup's event list.
	 */
	struct list_head list;
	/*
	 * The fields below support unregistering the event when
	 * userspace closes the eventfd (poll-table based wakeup plus a
	 * work item for the teardown).
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};
164
165
166
/* list of active hierarchies (cgroupfs_root.root_list) and its length */
static LIST_HEAD(roots);
static int root_count;

/* hierarchy-id allocator; the spinlock guards next_hierarchy_id and the ida */
static DEFINE_IDA(hierarchy_ida);
static int next_hierarchy_id;
static DEFINE_SPINLOCK(hierarchy_id_lock);

/* top cgroup of the dummy hierarchy */
#define dummytop (&rootnode.top_cgroup)

/* every hierarchy's root cgroup is named "/" */
static struct cgroup_name root_cgroup_name = { .name = "/" };
178
179
180
181
182
183
/*
 * NOTE(review): presumably set when a subsystem registers fork/exit
 * callbacks — the code that updates it is outside this chunk; confirm.
 */
static int need_forkexit_callback __read_mostly;

/* forward declarations for routines defined later in this file */
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
			      struct cftype cfts[], bool is_add);
189
190static int css_unbias_refcnt(int refcnt)
191{
192 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
193}
194
195
/* Return the effective (unbiased) reference count of @css. */
static int css_refcnt(struct cgroup_subsys_state *css)
{
	int v = atomic_read(&css->refcnt);

	return css_unbias_refcnt(v);
}
202
203
/* non-zero iff @cgrp has been rmdir'ed (CGRP_REMOVED flag set) */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
	return test_bit(CGRP_REMOVED, &cgrp->flags);
}
208
209
210
211
212
213
214
215
216
217
218bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
219{
220 while (cgrp) {
221 if (cgrp == ancestor)
222 return true;
223 cgrp = cgrp->parent;
224 }
225 return false;
226}
227EXPORT_SYMBOL_GPL(cgroup_is_descendant);
228
229static int cgroup_is_releasable(const struct cgroup *cgrp)
230{
231 const int bits =
232 (1 << CGRP_RELEASABLE) |
233 (1 << CGRP_NOTIFY_ON_RELEASE);
234 return (cgrp->flags & bits) == bits;
235}
236
/* non-zero iff the user enabled notify_on_release for @cgrp */
static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
241
242
243
244
245
/* iterate over the subsystems attached to hierarchy @_root */
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* iterate over all active hierarchies on the global roots list */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
252
/* dentry->d_fsdata holds the cgroup for directories ... */
static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

/* ... and the cfent for regular control files */
static inline struct cfent *__d_cfe(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

/* cftype backing the control file @dentry */
static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return __d_cfe(dentry)->type;
}
267
268
269
270
271
272
273
274
/*
 * cgroup_lock_live_group - take cgroup_mutex and check that @cgrp is alive.
 *
 * On success, returns true with cgroup_mutex held — the caller must
 * unlock it.  On failure (cgroup already removed), returns false with
 * the mutex already released.
 */
static bool cgroup_lock_live_group(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_mutex);
	if (cgroup_is_removed(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		return false;
	}
	return true;
}
284
285
286
/*
 * Cgroups needing a release-agent notification are queued on
 * release_list (guarded by release_list_lock); release_agent_work runs
 * cgroup_release_agent() to process them.  Both functions are defined
 * later in this file.
 */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
292
293
/*
 * A cg_cgroup_link is one edge of the M:N relation between css_sets
 * and cgroups.
 */
struct cg_cgroup_link {
	/*
	 * List running through cg_cgroup_links associated with a
	 * cgroup, anchored on cgroup->css_sets.
	 */
	struct list_head cgrp_link_list;
	struct cgroup *cgrp;
	/*
	 * List running through cg_cgroup_links pointing at a
	 * single css_set, anchored on css_set->cg_links.
	 */
	struct list_head cg_link_list;
	struct css_set *cg;
};
308
309
310
311
312
313
314
315
/*
 * init_css_set is the css_set used by tasks in no non-default cgroup
 * (see task_cgroup_from_root()); init_css_set_link ties it to the
 * dummy hierarchy.
 */
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

/*
 * css_set_lock guards the css_set link lists and the hash table below;
 * css_set_count (number of hashed css_sets) changes only under the
 * write lock.
 */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/*
 * css_sets are hashed by the set of css pointers they contain — see
 * css_set_hash().
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
332
333static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
334{
335 int i;
336 unsigned long key = 0UL;
337
338 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
339 key += (unsigned long)css[i];
340 key = (key >> 16) ^ key;
341
342 return key;
343}
344
345
346
347
348
/*
 * NOTE(review): appears to gate linking tasks onto css_set task lists;
 * the code that sets/reads it is outside this chunk — confirm.
 */
static int use_task_css_set_links __read_mostly;
350
/*
 * Drop a reference on @cg, tearing it down when the last reference is
 * gone.  @taskexit is non-zero when called from the task-exit path, in
 * which case affected cgroups are marked CGRP_RELEASABLE before the
 * release check.
 */
static void __put_css_set(struct css_set *cg, int taskexit)
{
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it.  Similar to atomic_dec_and_lock(), but for an
	 * rwlock: the fast path drops a non-final ref without taking
	 * css_set_lock at all.
	 */
	if (atomic_add_unless(&cg->refcount, -1, 1))
		return;
	write_lock(&css_set_lock);
	if (!atomic_dec_and_test(&cg->refcount)) {
		write_unlock(&css_set_lock);
		return;
	}

	/* This css_set is dead: unhash it and release cgroup refcounts. */
	hash_del(&cg->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
				 cg_link_list) {
		struct cgroup *cgrp = link->cgrp;

		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);

		/*
		 * RCU read-side section so the cgroup stays valid while
		 * we drop its count and possibly queue a release
		 * notification.
		 */
		rcu_read_lock();
		if (atomic_dec_and_test(&cgrp->count) &&
		    notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		rcu_read_unlock();

		kfree(link);
	}

	write_unlock(&css_set_lock);
	kfree_rcu(cg, rcu_head);
}
398
399
400
401
/* take a reference on @cg */
static inline void get_css_set(struct css_set *cg)
{
	atomic_inc(&cg->refcount);
}

/* drop a reference on @cg (normal path) */
static inline void put_css_set(struct css_set *cg)
{
	__put_css_set(cg, 0);
}

/* drop a reference on @cg from the task-exit path (marks cgroups releasable) */
static inline void put_css_set_taskexit(struct css_set *cg)
{
	__put_css_set(cg, 1);
}
416
417
418
419
420
421
422
423
424
425
426
/*
 * compare_css_sets - helper for find_existing_css_set().
 * @cg: candidate css_set being tested
 * @old_cg: existing css_set for a task
 * @new_cgrp: cgroup that the task is moving into
 * @template: desired set of css pointers (pre-calculated by the caller)
 *
 * Returns true if @cg matches @old_cg except for the hierarchy
 * @new_cgrp belongs to, for which it must match @new_cgrp.
 */
static bool compare_css_sets(struct css_set *cg,
			     struct css_set *old_cg,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
		/* Not all subsystems matched */
		return false;
	}

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies with no subsystems.  The
	 * memcmp above is cheap and rejects almost all candidates, so
	 * this more expensive per-link walk runs rarely.
	 */
	l1 = &cg->cg_links;
	l2 = &old_cg->cg_links;
	while (1) {
		struct cg_cgroup_link *cgl1, *cgl2;
		struct cgroup *cg1, *cg2;

		l1 = l1->next;
		l2 = l2->next;
		/* both lists have equal length, so they end together */
		if (l1 == &cg->cg_links) {
			BUG_ON(l2 != &old_cg->cg_links);
			break;
		} else {
			BUG_ON(l2 == &old_cg->cg_links);
		}
		/* Locate the cgroups associated with these links. */
		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
		cg1 = cgl1->cgrp;
		cg2 = cgl2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cg1->root != cg2->root);

		/*
		 * If this hierarchy is the one the cgroup change is in,
		 * the css_set must point to the new cgroup; any other
		 * hierarchy must point to the same cgroup as @old_cg.
		 */
		if (cg1->root == new_cgrp->root) {
			if (cg1 != new_cgrp)
				return false;
		} else {
			if (cg1 != cg2)
				return false;
		}
	}
	return true;
}
488
489
490
491
492
493
494
495
496
497
498
499
500
501
/*
 * find_existing_css_set - helper for find_css_set().
 *
 * Looks for an existing css_set matching @oldcg with @cgrp substituted
 * for the cgroup of @cgrp's hierarchy.  Fills @template with the
 * desired css pointers as a side effect (the caller reuses it).
 * Must be called with css_set_lock held.  Returns NULL if no existing
 * css_set matches.
 */
static struct css_set *find_existing_css_set(
	struct css_set *oldcg,
	struct cgroup *cgrp,
	struct cgroup_subsys_state *template[])
{
	int i;
	struct cgroupfs_root *root = cgrp->root;
	struct css_set *cg;
	unsigned long key;

	/*
	 * Build the set of subsystem state objects that we want to see
	 * in the new css_set.
	 */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * Subsystem is in this hierarchy, so take the
			 * state from the new cgroup.
			 */
			template[i] = cgrp->subsys[i];
		} else {
			/*
			 * Subsystem is not in this hierarchy, so keep
			 * the old css.
			 */
			template[i] = oldcg->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cg, hlist, key) {
		if (!compare_css_sets(cg, oldcg, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cg;
	}

	/* No existing css_set matched */
	return NULL;
}
542
543static void free_cg_links(struct list_head *tmp)
544{
545 struct cg_cgroup_link *link;
546 struct cg_cgroup_link *saved_link;
547
548 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
549 list_del(&link->cgrp_link_list);
550 kfree(link);
551 }
552}
553
554
555
556
557
558
559static int allocate_cg_links(int count, struct list_head *tmp)
560{
561 struct cg_cgroup_link *link;
562 int i;
563 INIT_LIST_HEAD(tmp);
564 for (i = 0; i < count; i++) {
565 link = kmalloc(sizeof(*link), GFP_KERNEL);
566 if (!link) {
567 free_cg_links(tmp);
568 return -ENOMEM;
569 }
570 list_add(&link->cgrp_link_list, tmp);
571 }
572 return 0;
573}
574
575
576
577
578
579
580
/*
 * link_css_set - link a css_set to a cgroup.
 * @tmp_cg_links: cg_cgroup_link objects pre-allocated by allocate_cg_links()
 * @cg: the css_set to be linked
 * @cgrp: the destination cgroup
 *
 * Consumes one link from @tmp_cg_links.  Callers hold css_set_lock for
 * writing.
 */
static void link_css_set(struct list_head *tmp_cg_links,
			 struct css_set *cg, struct cgroup *cgrp)
{
	struct cg_cgroup_link *link;

	BUG_ON(list_empty(tmp_cg_links));
	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
				cgrp_link_list);
	link->cg = cg;
	link->cgrp = cgrp;
	atomic_inc(&cgrp->count);
	list_move(&link->cgrp_link_list, &cgrp->css_sets);
	/*
	 * Always add links to the tail of the list so that the list is
	 * sorted by order of hierarchy creation.
	 */
	list_add_tail(&link->cg_link_list, &cg->cg_links);
}
599
600
601
602
603
604
605
606
/*
 * find_css_set - return the css_set combining @oldcg's cgroups with
 * @cgrp substituted in @cgrp's hierarchy, allocating a new css_set if
 * no existing one matches.  Takes a reference on the returned css_set;
 * returns NULL on allocation failure.
 */
static struct css_set *find_css_set(
	struct css_set *oldcg, struct cgroup *cgrp)
{
	struct css_set *res;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

	struct list_head tmp_cg_links;

	struct cg_cgroup_link *link;
	unsigned long key;

	/* First see if we already have a css_set that matches the
	 * desired set */
	read_lock(&css_set_lock);
	res = find_existing_css_set(oldcg, cgrp, template);
	if (res)
		get_css_set(res);
	read_unlock(&css_set_lock);

	if (res)
		return res;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return NULL;

	/* Allocate all the cg_cgroup_link objects that we'll need */
	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
		kfree(res);
		return NULL;
	}

	atomic_set(&res->refcount, 1);
	INIT_LIST_HEAD(&res->cg_links);
	INIT_LIST_HEAD(&res->tasks);
	INIT_HLIST_NODE(&res->hlist);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(res->subsys, template, sizeof(res->subsys));

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set */
	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
		struct cgroup *c = link->cgrp;
		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_cg_links, res, c);
	}

	BUG_ON(!list_empty(&tmp_cg_links));

	css_set_count++;

	/* Add this css_set to the hash table */
	key = css_set_hash(res->subsys);
	hash_add(css_set_table, &res->hlist, key);

	write_unlock(&css_set_lock);

	return res;
}
669
670
671
672
673
/*
 * Return the cgroup for @task in hierarchy @root.  Must be called with
 * cgroup_mutex held; takes css_set_lock to walk the task's links.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
					    struct cgroupfs_root *root)
{
	struct css_set *css;
	struct cgroup *res = NULL;

	BUG_ON(!mutex_is_locked(&cgroup_mutex));
	read_lock(&css_set_lock);
	/*
	 * No need to lock the task — since we hold cgroup_mutex the
	 * task can't change groups.  The default css_set maps straight
	 * to the root cgroup of any hierarchy.
	 */
	css = task->cgroups;
	if (css == &init_css_set) {
		res = &root->top_cgroup;
	} else {
		struct cg_cgroup_link *link;
		list_for_each_entry(link, &css->cg_links, cg_link_list) {
			struct cgroup *c = link->cgrp;
			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	read_unlock(&css_set_lock);
	/* every css_set has a cgroup in every hierarchy */
	BUG_ON(!res);
	return res;
}
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
/* forward declarations for the inode/file operations defined below */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
			       unsigned long subsys_mask);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;

/* cgroupfs does no dirty accounting or writeback */
static struct backing_dev_info cgroup_backing_dev_info = {
	.name = "cgroup",
	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
774
775static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
776{
777 struct inode *inode = new_inode(sb);
778
779 if (inode) {
780 inode->i_ino = get_next_ino();
781 inode->i_mode = mode;
782 inode->i_uid = current_fsuid();
783 inode->i_gid = current_fsgid();
784 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
785 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
786 }
787 return inode;
788}
789
790static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
791{
792 struct cgroup_name *name;
793
794 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
795 if (!name)
796 return NULL;
797 strcpy(name->name, dentry->d_name.name);
798 return name;
799}
800
/*
 * Workqueue-deferred final stage of cgroup destruction: release
 * subsystem state, drop the references the cgroup held, and free it.
 */
static void cgroup_free_fn(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_mutex);
	/*
	 * Release the subsystem state objects.
	 */
	for_each_subsys(cgrp->root, ss)
		ss->css_free(cgrp);

	cgrp->root->number_of_cgroups--;
	mutex_unlock(&cgroup_mutex);

	/*
	 * Drop the hold on the parent's dentry.
	 * NOTE(review): the matching dget is outside this chunk
	 * (presumably taken at creation time) — confirm.
	 */
	dput(cgrp->parent->dentry);

	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);

	/*
	 * Drop the active superblock reference that the cgroup held on
	 * its hierarchy's superblock.
	 */
	deactivate_super(cgrp->root->sb);

	/*
	 * If we're getting rid of the cgroup, refcounting should ensure
	 * that there are no pidlists left.
	 */
	BUG_ON(!list_empty(&cgrp->pidlists));

	simple_xattrs_free(&cgrp->xattrs);

	kfree(rcu_dereference_raw(cgrp->name));
	kfree(cgrp);
}
843
/*
 * RCU callback: defer the actual freeing to a workqueue because
 * cgroup_free_fn() takes mutexes and may sleep, which isn't allowed
 * in RCU callback context.
 */
static void cgroup_free_rcu(struct rcu_head *head)
{
	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

	queue_work(cgroup_destroy_wq, &cgrp->free_work);
}
850
/*
 * d_iput() op: tear down the object behind @dentry.  Directories free
 * their cgroup via an RCU grace period; regular files free their cfent
 * directly.
 */
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory? if so, free its associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;

		BUG_ON(!(cgroup_is_removed(cgrp)));
		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
	} else {
		struct cfent *cfe = __d_cfe(dentry);
		struct cgroup *cgrp = dentry->d_parent->d_fsdata;

		WARN_ONCE(!list_empty(&cfe->node) &&
			  cgrp != &cgrp->root->top_cgroup,
			  "cfe still linked for %s\n", cfe->type->name);
		simple_xattrs_free(&cfe->xattrs);
		kfree(cfe);
	}
	iput(inode);
}
871
/*
 * d_delete() op: always tell the VFS to unhash cgroup dentries as soon
 * as their refcount drops to zero.
 */
static int cgroup_delete(const struct dentry *d)
{
	return 1;
}
876
/*
 * Delete and rmdir @d, pinning the parent dentry so it stays valid
 * across d_delete()/simple_rmdir().
 */
static void remove_dir(struct dentry *d)
{
	struct dentry *parent = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(parent->d_inode, d);
	dput(parent);
}
885
/*
 * Remove the control file backed by @cft (or, when @cft is NULL, the
 * first remaining file) from @cgrp's directory.  Caller must hold both
 * cgroup_mutex and the directory inode's i_mutex.
 */
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	struct cfent *cfe;

	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
	lockdep_assert_held(&cgroup_mutex);

	/*
	 * The matching cfe may legitimately be absent, so finishing the
	 * loop without a match is not an error.
	 */
	list_for_each_entry(cfe, &cgrp->files, node) {
		struct dentry *d = cfe->dentry;

		if (cft && cfe->type != cft)
			continue;

		dget(d);
		d_delete(d);
		simple_unlink(cgrp->dentry->d_inode, d);
		list_del_init(&cfe->node);
		dput(d);

		break;
	}
}
912
913
914
915
916
917
918
/*
 * Remove the control files of every subsystem in @subsys_mask from
 * @dir; when @base_files is true, additionally unlink all remaining
 * files in the directory.
 */
static void cgroup_clear_directory(struct dentry *dir, bool base_files,
				   unsigned long subsys_mask)
{
	struct cgroup *cgrp = __d_cgrp(dir);
	struct cgroup_subsys *ss;

	for_each_subsys(cgrp->root, ss) {
		struct cftype_set *set;
		if (!test_bit(ss->subsys_id, &subsys_mask))
			continue;
		list_for_each_entry(set, &ss->cftsets, node)
			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
	}
	if (base_files) {
		while (!list_empty(&cgrp->files))
			cgroup_rm_file(cgrp, NULL);
	}
}
937
938
939
940
/*
 * Clear out and remove a cgroup directory.
 * NOTE: the dentry must have been dget()'ed by the caller.
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
	struct dentry *parent;
	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;

	cgroup_clear_directory(dentry, true, root->subsys_mask);

	/* unhook the dentry from its parent's child list under d_lock */
	parent = dentry->d_parent;
	spin_lock(&parent->d_lock);
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	list_del_init(&dentry->d_u.d_child);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);
	remove_dir(dentry);
}
956
957
958
959
960
961
/*
 * rebind_subsystems - bind/unbind subsystems so that exactly the set in
 * @final_subsys_mask is attached to @root.  Called with cgroup_mutex
 * and cgroup_root_mutex held; only supported while @root has no child
 * cgroups.  Returns 0 or -EBUSY.
 */
static int rebind_subsystems(struct cgroupfs_root *root,
			      unsigned long final_subsys_mask)
{
	unsigned long added_mask, removed_mask;
	struct cgroup *cgrp = &root->top_cgroup;
	int i;

	BUG_ON(!mutex_is_locked(&cgroup_mutex));
	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));

	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
	/* Check that any added subsystems are currently free */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		unsigned long bit = 1UL << i;
		struct cgroup_subsys *ss = subsys[i];
		if (!(bit & added_mask))
			continue;
		/*
		 * Nobody should tell us to do a subsys that doesn't
		 * exist: parse_cgroupfs_options catches that case.
		 */
		BUG_ON(ss == NULL);
		if (ss->root != &rootnode) {
			/* Subsystem isn't free */
			return -EBUSY;
		}
	}

	/*
	 * Adding/removing subsystems isn't supported while any child
	 * cgroups exist — it would need complex error handling.
	 */
	if (root->number_of_cgroups > 1)
		return -EBUSY;

	/* Process each subsystem */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		unsigned long bit = 1UL << i;
		if (bit & added_mask) {
			/* We're binding this subsystem to this hierarchy */
			BUG_ON(ss == NULL);
			BUG_ON(cgrp->subsys[i]);
			BUG_ON(!dummytop->subsys[i]);
			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
			cgrp->subsys[i] = dummytop->subsys[i];
			cgrp->subsys[i]->cgroup = cgrp;
			list_move(&ss->sibling, &root->subsys_list);
			ss->root = root;
			if (ss->bind)
				ss->bind(cgrp);
			/* module ref was taken during parsing; keep it */
		} else if (bit & removed_mask) {
			/* We're removing this subsystem */
			BUG_ON(ss == NULL);
			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
			if (ss->bind)
				ss->bind(dummytop);
			dummytop->subsys[i]->cgroup = dummytop;
			cgrp->subsys[i] = NULL;
			subsys[i]->root = &rootnode;
			list_move(&ss->sibling, &rootnode.subsys_list);
			/* subsystem is now free - drop reference on module */
			module_put(ss->module);
		} else if (bit & final_subsys_mask) {
			/* Subsystem state should already exist */
			BUG_ON(ss == NULL);
			BUG_ON(!cgrp->subsys[i]);
			/*
			 * a ref was taken during parsing, but we
			 * already had one, so drop the extra.
			 */
			module_put(ss->module);
#ifdef CONFIG_MODULE_UNLOAD
			BUG_ON(ss->module && !module_refcount(ss->module));
#endif
		} else {
			/* Subsystem state shouldn't exist */
			BUG_ON(cgrp->subsys[i]);
		}
	}
	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;

	return 0;
}
1050
/*
 * show_options super op: emit this hierarchy's mount options.
 * cgroup_root_mutex guards release_agent_path and name against
 * concurrent modification.
 */
static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
{
	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_root_mutex);
	for_each_subsys(root, ss)
		seq_printf(seq, ",%s", ss->name);
	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
		seq_puts(seq, ",sane_behavior");
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");
	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
		seq_puts(seq, ",cpuset_v2_mode");
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	mutex_unlock(&cgroup_root_mutex);
	return 0;
}
1077
/* Parsed mount options for a cgroup hierarchy. */
struct cgroup_sb_opts {
	unsigned long subsys_mask;	/* bitmask of selected subsystems */
	unsigned long flags;		/* CGRP_ROOT_* behavior flags */
	char *release_agent;		/* release agent path, or NULL */
	bool cpuset_clone_children;
	char *name;			/* hierarchy name, or NULL */

	/* user explicitly requested an empty (subsystem-less) hierarchy */
	bool none;

	/* a pre-allocated root, set up for use by the mount path */
	struct cgroupfs_root *new_root;

};
1090
1091
1092
1093
1094
1095
1096
1097static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1098{
1099 char *token, *o = data;
1100 bool all_ss = false, one_ss = false;
1101 unsigned long mask = (unsigned long)-1;
1102 int i;
1103 bool module_pin_failed = false;
1104
1105 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1106
1107#ifdef CONFIG_CPUSETS
1108 mask = ~(1UL << cpuset_subsys_id);
1109#endif
1110
1111 memset(opts, 0, sizeof(*opts));
1112
1113 while ((token = strsep(&o, ",")) != NULL) {
1114 if (!*token)
1115 return -EINVAL;
1116 if (!strcmp(token, "none")) {
1117
1118 opts->none = true;
1119 continue;
1120 }
1121 if (!strcmp(token, "all")) {
1122
1123 if (one_ss)
1124 return -EINVAL;
1125 all_ss = true;
1126 continue;
1127 }
1128 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1129 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1130 continue;
1131 }
1132 if (!strcmp(token, "noprefix")) {
1133 opts->flags |= CGRP_ROOT_NOPREFIX;
1134 continue;
1135 }
1136 if (!strcmp(token, "clone_children")) {
1137 opts->cpuset_clone_children = true;
1138 continue;
1139 }
1140 if (!strcmp(token, "cpuset_v2_mode")) {
1141 opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
1142 continue;
1143 }
1144 if (!strcmp(token, "xattr")) {
1145 opts->flags |= CGRP_ROOT_XATTR;
1146 continue;
1147 }
1148 if (!strncmp(token, "release_agent=", 14)) {
1149
1150 if (opts->release_agent)
1151 return -EINVAL;
1152 opts->release_agent =
1153 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1154 if (!opts->release_agent)
1155 return -ENOMEM;
1156 continue;
1157 }
1158 if (!strncmp(token, "name=", 5)) {
1159 const char *name = token + 5;
1160
1161 if (!strlen(name))
1162 return -EINVAL;
1163
1164 for (i = 0; i < strlen(name); i++) {
1165 char c = name[i];
1166 if (isalnum(c))
1167 continue;
1168 if ((c == '.') || (c == '-') || (c == '_'))
1169 continue;
1170 return -EINVAL;
1171 }
1172
1173 if (opts->name)
1174 return -EINVAL;
1175 opts->name = kstrndup(name,
1176 MAX_CGROUP_ROOT_NAMELEN - 1,
1177 GFP_KERNEL);
1178 if (!opts->name)
1179 return -ENOMEM;
1180
1181 continue;
1182 }
1183
1184 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1185 struct cgroup_subsys *ss = subsys[i];
1186 if (ss == NULL)
1187 continue;
1188 if (strcmp(token, ss->name))
1189 continue;
1190 if (ss->disabled)
1191 continue;
1192
1193
1194 if (all_ss)
1195 return -EINVAL;
1196 set_bit(i, &opts->subsys_mask);
1197 one_ss = true;
1198
1199 break;
1200 }
1201 if (i == CGROUP_SUBSYS_COUNT)
1202 return -ENOENT;
1203 }
1204
1205
1206
1207
1208
1209
1210 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1211 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1212 struct cgroup_subsys *ss = subsys[i];
1213 if (ss == NULL)
1214 continue;
1215 if (ss->disabled)
1216 continue;
1217 set_bit(i, &opts->subsys_mask);
1218 }
1219 }
1220
1221
1222
1223 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1224 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1225
1226 if (opts->flags & CGRP_ROOT_NOPREFIX) {
1227 pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1228 return -EINVAL;
1229 }
1230
1231 if (opts->cpuset_clone_children) {
1232 pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1233 return -EINVAL;
1234 }
1235 }
1236
1237
1238
1239
1240
1241
1242 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1243 return -EINVAL;
1244
1245
1246
1247 if (opts->subsys_mask && opts->none)
1248 return -EINVAL;
1249
1250
1251
1252
1253
1254 if (!opts->subsys_mask && !opts->name)
1255 return -EINVAL;
1256
1257
1258
1259
1260
1261
1262
1263 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1264 unsigned long bit = 1UL << i;
1265
1266 if (!(bit & opts->subsys_mask))
1267 continue;
1268 if (!try_module_get(subsys[i]->module)) {
1269 module_pin_failed = true;
1270 break;
1271 }
1272 }
1273 if (module_pin_failed) {
1274
1275
1276
1277
1278
1279 for (i--; i >= 0; i--) {
1280
1281 unsigned long bit = 1UL << i;
1282
1283 if (!(bit & opts->subsys_mask))
1284 continue;
1285 module_put(subsys[i]->module);
1286 }
1287 return -ENOENT;
1288 }
1289
1290 return 0;
1291}
1292
1293static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1294{
1295 int i;
1296 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1297 unsigned long bit = 1UL << i;
1298
1299 if (!(bit & subsys_mask))
1300 continue;
1301 module_put(subsys[i]->module);
1302 }
1303}
1304
/*
 * remount_fs super op: change the set of bound subsystems and/or the
 * release agent.  Flags and name may not change.  Not allowed at all
 * under sane_behavior.
 */
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
	int ret = 0;
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgroup_sb_opts opts;
	unsigned long added_mask, removed_mask;

	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
		pr_err("cgroup: sane_behavior: remount is not allowed\n");
		return -EINVAL;
	}

	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_root_mutex);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
			   task_tgid_nr(current), current->comm);

	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;

	/* Don't allow flags or name to change at remount */
	if (opts.flags != root->flags ||
	    (opts.name && strcmp(opts.name, root->name))) {
		ret = -EINVAL;
		drop_parsed_module_refcounts(opts.subsys_mask);
		goto out_unlock;
	}

	/*
	 * Clear out the files of subsystems that should be removed; do
	 * this before rebind_subsystems, since rebind_subsystems may
	 * change this hierarchy's subsys_list.
	 */
	cgroup_clear_directory(cgrp->dentry, false, removed_mask);

	ret = rebind_subsystems(root, opts.subsys_mask);
	if (ret) {
		/* rebind_subsystems failed, re-populate the removed files */
		cgroup_populate_dir(cgrp, false, removed_mask);
		drop_parsed_module_refcounts(opts.subsys_mask);
		goto out_unlock;
	}

	/* re-populate subsystem files */
	cgroup_populate_dir(cgrp, false, added_mask);

	if (opts.release_agent)
		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
	return ret;
}
1370
/* superblock operations for cgroupfs */
static const struct super_operations cgroup_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
	.show_options = cgroup_show_options,
	.remount_fs = cgroup_remount,
};
1377
/* initialise the lists, locks and work items embedded in @cgrp */
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->files);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->allcg_node);
	INIT_LIST_HEAD(&cgrp->release_list);
	INIT_LIST_HEAD(&cgrp->pidlists);
	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
	mutex_init(&cgrp->pidlist_mutex);
	INIT_LIST_HEAD(&cgrp->event_list);
	spin_lock_init(&cgrp->event_list_lock);
	simple_xattrs_init(&cgrp->xattrs);
}
1393
/* initialise a freshly allocated hierarchy root and its top cgroup */
static void init_cgroup_root(struct cgroupfs_root *root)
{
	struct cgroup *cgrp = &root->top_cgroup;

	INIT_LIST_HEAD(&root->subsys_list);
	INIT_LIST_HEAD(&root->root_list);
	INIT_LIST_HEAD(&root->allcg_list);
	root->number_of_cgroups = 1;	/* the top cgroup itself */
	cgrp->root = root;
	cgrp->name = &root_cgroup_name;	/* root cgroups are named "/" */
	init_cgroup_housekeeping(cgrp);
	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
}
1407
/*
 * Allocate a hierarchy id for @root from hierarchy_ida, starting at
 * next_hierarchy_id and wrapping to 0 on -ENOSPC.  Returns false only
 * when ida preloading fails (out of memory).
 */
static bool init_root_id(struct cgroupfs_root *root)
{
	int ret = 0;

	do {
		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
			return false;
		spin_lock(&hierarchy_id_lock);
		/* Try to allocate the next unused id */
		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
					&root->hierarchy_id);
		if (ret == -ENOSPC)
			/* Try again starting from 0 */
			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
		if (!ret) {
			next_hierarchy_id = root->hierarchy_id + 1;
		} else if (ret != -EAGAIN) {
			/* any error other than EAGAIN is fatal here */
			BUG_ON(ret);
		}
		spin_unlock(&hierarchy_id_lock);
	} while (ret);	/* -EAGAIN: retry after another ida_pre_get() */
	return true;
}
1432
/*
 * sget() "test" callback: does existing superblock @sb match the mount
 * options carried in @data?
 */
static int cgroup_test_super(struct super_block *sb, void *data)
{
	struct cgroup_sb_opts *opts = data;
	struct cgroupfs_root *root = sb->s_fs_info;

	/* If we asked for a name then it must match */
	if (opts->name && strcmp(opts->name, root->name))
		return 0;

	/*
	 * If we asked for subsystems (or explicitly for no
	 * subsystems) then they must match.
	 */
	if ((opts->subsys_mask || opts->none)
	    && (opts->subsys_mask != root->subsys_mask))
		return 0;

	return 1;
}
1452
/*
 * Allocate and initialise a cgroupfs_root from parsed @opts.  Returns
 * NULL when the options select neither subsystems nor "none" (an
 * existing root should be reused), or ERR_PTR(-ENOMEM) on allocation
 * failure.
 */
static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
	struct cgroupfs_root *root;

	if (!opts->subsys_mask && !opts->none)
		return NULL;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	if (!init_root_id(root)) {
		kfree(root);
		return ERR_PTR(-ENOMEM);
	}
	init_cgroup_root(root);

	root->subsys_mask = opts->subsys_mask;
	root->flags = opts->flags;
	ida_init(&root->cgroup_ida);
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
	return root;
}
1481
/* release @root's hierarchy id and free it; NULL is a no-op */
static void cgroup_drop_root(struct cgroupfs_root *root)
{
	if (!root)
		return;

	BUG_ON(!root->hierarchy_id);
	spin_lock(&hierarchy_id_lock);
	ida_remove(&hierarchy_ida, root->hierarchy_id);
	spin_unlock(&hierarchy_id_lock);
	ida_destroy(&root->cgroup_ida);
	kfree(root);
}
1494
/*
 * sget() "set" callback: bind a brand-new superblock to the
 * pre-allocated root carried in @data (opts->new_root).
 */
static int cgroup_set_super(struct super_block *sb, void *data)
{
	int ret;
	struct cgroup_sb_opts *opts = data;

	/* If we don't have a new root, we can't set up a new sb */
	if (!opts->new_root)
		return -EINVAL;

	BUG_ON(!opts->subsys_mask && !opts->none);

	ret = set_anon_super(sb, NULL);
	if (ret)
		return ret;

	sb->s_fs_info = opts->new_root;
	opts->new_root->sb = sb;

	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = CGROUP_SUPER_MAGIC;
	sb->s_op = &cgroup_ops;

	return 0;
}
1520
/*
 * Create the root dentry/inode for superblock @sb and install the
 * cgroup dentry operations.  Returns 0 or -ENOMEM.
 */
static int cgroup_get_rootdir(struct super_block *sb)
{
	static const struct dentry_operations cgroup_dops = {
		.d_iput = cgroup_diput,
		.d_delete = cgroup_delete,
	};

	struct inode *inode =
		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);

	if (!inode)
		return -ENOMEM;

	inode->i_fop = &simple_dir_operations;
	inode->i_op = &cgroup_dir_inode_operations;
	/* directories start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		return -ENOMEM;
	/* every dentry on this sb gets the cgroup d_ops */
	sb->s_d_op = &cgroup_dops;
	return 0;
}
1545
/*
 * Mount a cgroup hierarchy.  Depending on the parsed options this either
 * creates a brand-new hierarchy (and superblock) or re-attaches to an
 * existing one that was mounted with compatible options.
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_sb_opts opts;
	struct cgroupfs_root *root;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *new_root;
	struct inode *inode;

	/* First find the desired set of subsystems */
	mutex_lock(&cgroup_mutex);
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);
	if (ret)
		goto out_err;

	/*
	 * Allocate a new cgroup root. We may not need it if we're
	 * reusing an existing hierarchy.
	 */
	new_root = cgroup_root_from_opts(&opts);
	if (IS_ERR(new_root)) {
		ret = PTR_ERR(new_root);
		goto drop_modules;
	}
	opts.new_root = new_root;

	/* Locate an existing or create a new sb for this hierarchy */
	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		cgroup_drop_root(opts.new_root);
		goto drop_modules;
	}

	root = sb->s_fs_info;
	BUG_ON(!root);
	if (root == opts.new_root) {
		/* We used the new root structure, so this is a new hierarchy */
		struct list_head tmp_cg_links;
		struct cgroup *root_cgrp = &root->top_cgroup;
		struct cgroupfs_root *existing_root;
		const struct cred *cred;
		int i;
		struct css_set *cg;

		BUG_ON(sb->s_root != NULL);

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		/* Lock order: i_mutex -> cgroup_mutex -> cgroup_root_mutex */
		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		mutex_lock(&cgroup_root_mutex);

		/* Check for name clashes with existing mounts */
		ret = -EBUSY;
		if (strlen(root->name))
			for_each_active_root(existing_root)
				if (!strcmp(existing_root->name, root->name))
					goto unlock_drop;

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over.
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
		if (ret)
			goto unlock_drop;

		ret = rebind_subsystems(root, root->subsys_mask);
		if (ret == -EBUSY) {
			free_cg_links(&tmp_cg_links);
			goto unlock_drop;
		}

		/*
		 * There must be no failure case after here, since rebinding
		 * takes care of subsystems' refcounts, which are explicitly
		 * dropped in the failure exit path.
		 */

		/* EBUSY should be the only error here */
		BUG_ON(ret);

		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = root_cgrp;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		hash_for_each(css_set_table, i, cg, hlist)
			link_css_set(&tmp_cg_links, cg, root_cgrp);
		write_unlock(&css_set_lock);

		free_cg_links(&tmp_cg_links);

		BUG_ON(!list_empty(&root_cgrp->children));
		BUG_ON(root->number_of_cgroups != 1);

		/* Populate base files with full privileges. */
		cred = override_creds(&init_cred);
		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
		revert_creds(cred);
		mutex_unlock(&cgroup_root_mutex);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);
	} else {
		/*
		 * We re-used an existing hierarchy - the new root (if
		 * any) is not needed.
		 */
		cgroup_drop_root(opts.new_root);

		if (root->flags != opts.flags) {
			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
				ret = -EINVAL;
				goto drop_new_super;
			} else {
				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
			}
		}

		/* no subsys rebinding, so refcounts don't change */
		drop_parsed_module_refcounts(opts.subsys_mask);
	}

	kfree(opts.release_agent);
	kfree(opts.name);
	return dget(sb->s_root);

 unlock_drop:
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&inode->i_mutex);
 drop_new_super:
	deactivate_locked_super(sb);
 drop_modules:
	drop_parsed_module_refcounts(opts.subsys_mask);
 out_err:
	kfree(opts.release_agent);
	kfree(opts.name);
	return ERR_PTR(ret);
}
1699
1700static void cgroup_kill_sb(struct super_block *sb) {
1701 struct cgroupfs_root *root = sb->s_fs_info;
1702 struct cgroup *cgrp = &root->top_cgroup;
1703 int ret;
1704 struct cg_cgroup_link *link;
1705 struct cg_cgroup_link *saved_link;
1706
1707 BUG_ON(!root);
1708
1709 BUG_ON(root->number_of_cgroups != 1);
1710 BUG_ON(!list_empty(&cgrp->children));
1711
1712 mutex_lock(&cgroup_mutex);
1713 mutex_lock(&cgroup_root_mutex);
1714
1715
1716 ret = rebind_subsystems(root, 0);
1717
1718 BUG_ON(ret);
1719
1720
1721
1722
1723
1724 write_lock(&css_set_lock);
1725
1726 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1727 cgrp_link_list) {
1728 list_del(&link->cg_link_list);
1729 list_del(&link->cgrp_link_list);
1730 kfree(link);
1731 }
1732 write_unlock(&css_set_lock);
1733
1734 if (!list_empty(&root->root_list)) {
1735 list_del(&root->root_list);
1736 root_count--;
1737 }
1738
1739 mutex_unlock(&cgroup_root_mutex);
1740 mutex_unlock(&cgroup_mutex);
1741
1742 simple_xattrs_free(&cgrp->xattrs);
1743
1744 kill_litter_super(sb);
1745 cgroup_drop_root(root);
1746}
1747
1748static struct file_system_type cgroup_fs_type = {
1749 .name = "cgroup",
1750 .mount = cgroup_mount,
1751 .kill_sb = cgroup_kill_sb,
1752};
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
/**
 * cgroup_path - build the hierarchy path of a cgroup into a buffer
 * @cgrp: the cgroup whose path is wanted
 * @buf: output buffer
 * @buflen: size of @buf
 *
 * Walks parent links from @cgrp to the root, assembling the path
 * right-to-left at the end of @buf, then shifts it to the front.
 * Returns 0 on success or -ENAMETOOLONG if @buf is too small.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
	int ret = -ENAMETOOLONG;
	char *start;

	/* The root cgroup's path is simply "/". */
	if (!cgrp->parent) {
		if (strlcpy(buf, "/", buflen) >= buflen)
			return -ENAMETOOLONG;
		return 0;
	}

	/* Build right-to-left, starting at the NUL terminator. */
	start = buf + buflen - 1;
	*start = '\0';

	rcu_read_lock();
	do {
		const char *name = cgroup_name(cgrp);
		int len;

		len = strlen(name);
		if ((start -= len) < buf)
			goto out;	/* component doesn't fit */
		memcpy(start, name, len);

		if (--start < buf)
			goto out;	/* no room for '/' separator */
		*start = '/';

		cgrp = cgrp->parent;
	} while (cgrp->parent);
	ret = 0;
	/* Shift the assembled path to the front of the buffer. */
	memmove(buf, start, buf + buflen - start);
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path);
1804
1805
1806
1807
/*
 * A (task, source cgroup) pair collected while preparing an attach;
 * @cg is filled in later with the destination css_set.
 */
struct task_and_cgroup {
	struct task_struct *task;
	struct cgroup *cgrp;
	struct css_set *cg;
};

/*
 * The set of tasks being migrated by cgroup_attach_task(): either a
 * single entry (@single) or a flex array (@tc_array) for whole
 * threadgroup moves.  Subsystem callbacks iterate it with
 * cgroup_taskset_first()/cgroup_taskset_next().
 */
struct cgroup_taskset {
	struct task_and_cgroup single;	/* used when tc_array is NULL */
	struct flex_array *tc_array;	/* multi-task case */
	int tc_array_len;
	int idx;			/* iteration cursor */
	struct cgroup *cur_cgrp;	/* cgroup of last task returned */
};
1821
1822
1823
1824
1825
1826
1827
1828struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1829{
1830 if (tset->tc_array) {
1831 tset->idx = 0;
1832 return cgroup_taskset_next(tset);
1833 } else {
1834 tset->cur_cgrp = tset->single.cgrp;
1835 return tset->single.task;
1836 }
1837}
1838EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1839
1840
1841
1842
1843
1844
1845
1846
1847struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1848{
1849 struct task_and_cgroup *tc;
1850
1851 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1852 return NULL;
1853
1854 tc = flex_array_get(tset->tc_array, tset->idx++);
1855 tset->cur_cgrp = tc->cgrp;
1856 return tc->task;
1857}
1858EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1869{
1870 return tset->cur_cgrp;
1871}
1872EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1873
1874
1875
1876
1877
1878int cgroup_taskset_size(struct cgroup_taskset *tset)
1879{
1880 return tset->tc_array ? tset->tc_array_len : 1;
1881}
1882EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1883
1884
1885
1886
1887
1888
1889
/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 *
 * Must be called with cgroup_mutex and the threadgroup lock held.
 * @newcg is a pre-computed css_set carrying an extra reference which
 * this function consumes; the task's old css_set reference is dropped.
 */
static void cgroup_task_migrate(struct cgroup *oldcgrp,
				struct task_struct *tsk, struct css_set *newcg)
{
	struct css_set *oldcg;

	/*
	 * We are synchronized through threadgroup_lock() against
	 * PF_EXITING setting such that we can't race against
	 * cgroup_exit() changing the css_set to init_css_set and
	 * dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	oldcg = tsk->cgroups;

	task_lock(tsk);
	rcu_assign_pointer(tsk->cgroups, newcg);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them. */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list))
		list_move(&tsk->cg_list, &newcg->tasks);
	write_unlock(&css_set_lock);

	/*
	 * We just gained a reference on oldcg by taking it from the
	 * task. As trading it for newcg is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
	put_css_set(oldcg);
}
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @tsk: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
 * task_lock of @tsk or each thread in the threadgroup individually in turn.
 */
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
			      bool threadgroup)
{
	int retval, i, group_size;
	struct cgroup_subsys *ss, *failed_ss = NULL;
	struct cgroupfs_root *root = cgrp->root;
	/* threadgroup list cursor and array */
	struct task_struct *leader = tsk;
	struct task_and_cgroup *tc;
	struct flex_array *group;
	struct cgroup_taskset tset = { };

	/*
	 * step 0: in order to do expensive, possibly blocking operations for
	 * every thread, we cannot iterate the thread group list, since it needs
	 * rcu or tasklist locked. instead, build an array of all threads in the
	 * group - group_rwsem prevents new threads from appearing, and if
	 * threads exit, this will just be an over-estimate.
	 */
	if (threadgroup)
		group_size = get_nr_threads(tsk);
	else
		group_size = 1;
	/* flex_array supports very large thread-groups better than kmalloc. */
	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
	if (!group)
		return -ENOMEM;
	/* pre-allocate to guarantee space while iterating in rcu read-side. */
	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
	if (retval)
		goto out_free_group_list;

	i = 0;
	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	rcu_read_lock();
	do {
		struct task_and_cgroup ent;

		/* @tsk either already exited or can't exit until the end */
		if (tsk->flags & PF_EXITING)
			goto next;

		/* as per above, nr_threads may decrease, but not increase. */
		BUG_ON(i >= group_size);
		ent.task = tsk;
		ent.cgrp = task_cgroup_from_root(tsk, root);
		/* nothing to do if this task is already in the cgroup */
		if (ent.cgrp == cgrp)
			goto next;
		/*
		 * saying GFP_ATOMIC has no effect here because we did prealloc
		 * earlier, but it's good form to communicate our expectations.
		 */
		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
		BUG_ON(retval != 0);
		i++;
	next:
		if (!threadgroup)
			break;
	} while_each_thread(leader, tsk);
	rcu_read_unlock();
	/* remember the number of threads in the array for later. */
	group_size = i;
	tset.tc_array = group;
	tset.tc_array_len = group_size;

	/* methods shouldn't be called if no task is actually migrating */
	retval = 0;
	if (!group_size)
		goto out_free_group_list;

	/*
	 * step 1: check that we can legitimately attach to the cgroup.
	 */
	for_each_subsys(root, ss) {
		if (ss->can_attach) {
			retval = ss->can_attach(cgrp, &tset);
			if (retval) {
				failed_ss = ss;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * step 2: make sure css_sets exist for all threads to be migrated.
	 * we use find_css_set, which allocates a new one if necessary.
	 */
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		tc->cg = find_css_set(tc->task->cgroups, cgrp);
		if (!tc->cg) {
			retval = -ENOMEM;
			goto out_put_css_set_refs;
		}
	}

	/*
	 * step 3: now that we're guaranteed success wrt the css_sets,
	 * proceed to move all tasks to the new cgroup.  There are no
	 * failure cases after here, so this is the commit point.
	 */
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
	}
	/* nothing is sensitive to fork() after this point. */

	/*
	 * step 4: do subsystem attach callbacks.
	 */
	for_each_subsys(root, ss) {
		if (ss->attach)
			ss->attach(cgrp, &tset);
	}

	/*
	 * step 5: success! and cleanup
	 */
	retval = 0;
out_put_css_set_refs:
	if (retval) {
		for (i = 0; i < group_size; i++) {
			tc = flex_array_get(group, i);
			if (!tc->cg)
				break;
			put_css_set(tc->cg);
		}
	}
out_cancel_attach:
	if (retval) {
		for_each_subsys(root, ss) {
			if (ss == failed_ss)
				break;
			if (ss->cancel_attach)
				ss->cancel_attach(cgrp, &tset);
		}
	}
out_free_group_list:
	flex_array_free(group);
	return retval;
}
2077
2078
2079
2080
2081
2082
/*
 * Find the task corresponding to @pid (or use current when @pid is 0),
 * check permissions, and attach it — or its whole threadgroup when
 * @threadgroup — to @cgrp.  Takes cgroup_mutex via
 * cgroup_lock_live_group() and releases it before returning.
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	int ret;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

retry_find_task:
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			rcu_read_unlock();
			ret= -ESRCH;
			goto out_unlock_cgroup;
		}
		/*
		 * even if we're attaching all tasks in the thread group, we
		 * only need to check permissions on one of them.
		 */
		tcred = __task_cred(tsk);
		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
		    !uid_eq(cred->euid, tcred->uid) &&
		    !uid_eq(cred->euid, tcred->suid)) {
			rcu_read_unlock();
			ret = -EACCES;
			goto out_unlock_cgroup;
		}
	} else
		tsk = current;

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
	 * trapped in a cpuset, or RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		rcu_read_unlock();
		goto out_unlock_cgroup;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	threadgroup_lock(tsk);
	if (threadgroup) {
		if (!thread_group_leader(tsk)) {
			/*
			 * a race with de_thread from another thread's exec()
			 * may strip us of our leadership, if this happens,
			 * there is no choice but to throw this task away and
			 * try again; this is
			 * "double-double-toil-and-trouble-check locking".
			 */
			threadgroup_unlock(tsk);
			put_task_struct(tsk);
			goto retry_find_task;
		}
	}

	ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	threadgroup_unlock(tsk);

	put_task_struct(tsk);
out_unlock_cgroup:
	mutex_unlock(&cgroup_mutex);
	return ret;
}
2158
2159
2160
2161
2162
2163
2164int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2165{
2166 struct cgroupfs_root *root;
2167 int retval = 0;
2168
2169 mutex_lock(&cgroup_mutex);
2170 for_each_active_root(root) {
2171 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2172
2173 retval = cgroup_attach_task(from_cg, tsk, false);
2174 if (retval)
2175 break;
2176 }
2177 mutex_unlock(&cgroup_mutex);
2178
2179 return retval;
2180}
2181EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2182
2183static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2184{
2185 return attach_task_by_pid(cgrp, pid, false);
2186}
2187
2188static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2189{
2190 return attach_task_by_pid(cgrp, tgid, true);
2191}
2192
/*
 * Store a new release-agent path for the hierarchy.  Note that
 * cgroup_lock_live_group() acquires cgroup_mutex on success, which is
 * why this function unlocks cgroup_mutex at the end.
 */
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
	if (strlen(buffer) >= PATH_MAX)
		return -EINVAL;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	/* release_agent_path is protected by cgroup_root_mutex */
	mutex_lock(&cgroup_root_mutex);
	strcpy(cgrp->root->release_agent_path, buffer);
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	return 0;
}
2207
2208static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2209 struct seq_file *seq)
2210{
2211 if (!cgroup_lock_live_group(cgrp))
2212 return -ENODEV;
2213 seq_puts(seq, cgrp->root->release_agent_path);
2214 seq_putc(seq, '\n');
2215 mutex_unlock(&cgroup_mutex);
2216 return 0;
2217}
2218
/* Report whether this hierarchy was mounted with sane_behavior. */
static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	int sane = cgroup_sane_behavior(cgrp);

	seq_printf(seq, "%d\n", sane);
	return 0;
}
2225
2226
2227#define CGROUP_LOCAL_BUFFER_SIZE 64
2228
2229static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2230 struct file *file,
2231 const char __user *userbuf,
2232 size_t nbytes, loff_t *unused_ppos)
2233{
2234 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2235 int retval = 0;
2236 char *end;
2237
2238 if (!nbytes)
2239 return -EINVAL;
2240 if (nbytes >= sizeof(buffer))
2241 return -E2BIG;
2242 if (copy_from_user(buffer, userbuf, nbytes))
2243 return -EFAULT;
2244
2245 buffer[nbytes] = 0;
2246 if (cft->write_u64) {
2247 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2248 if (*end)
2249 return -EINVAL;
2250 retval = cft->write_u64(cgrp, cft, val);
2251 } else {
2252 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2253 if (*end)
2254 return -EINVAL;
2255 retval = cft->write_s64(cgrp, cft, val);
2256 }
2257 if (!retval)
2258 retval = nbytes;
2259 return retval;
2260}
2261
2262static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2263 struct file *file,
2264 const char __user *userbuf,
2265 size_t nbytes, loff_t *unused_ppos)
2266{
2267 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2268 int retval = 0;
2269 size_t max_bytes = cft->max_write_len;
2270 char *buffer = local_buffer;
2271
2272 if (!max_bytes)
2273 max_bytes = sizeof(local_buffer) - 1;
2274 if (nbytes >= max_bytes)
2275 return -E2BIG;
2276
2277 if (nbytes >= sizeof(local_buffer)) {
2278 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2279 if (buffer == NULL)
2280 return -ENOMEM;
2281 }
2282 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2283 retval = -EFAULT;
2284 goto out;
2285 }
2286
2287 buffer[nbytes] = 0;
2288 retval = cft->write_string(cgrp, cft, strstrip(buffer));
2289 if (!retval)
2290 retval = nbytes;
2291out:
2292 if (buffer != local_buffer)
2293 kfree(buffer);
2294 return retval;
2295}
2296
2297static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2298 size_t nbytes, loff_t *ppos)
2299{
2300 struct cftype *cft = __d_cft(file->f_dentry);
2301 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2302
2303 if (cgroup_is_removed(cgrp))
2304 return -ENODEV;
2305 if (cft->write)
2306 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
2307 if (cft->write_u64 || cft->write_s64)
2308 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
2309 if (cft->write_string)
2310 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
2311 if (cft->trigger) {
2312 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
2313 return ret ? ret : nbytes;
2314 }
2315 return -EINVAL;
2316}
2317
2318static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
2319 struct file *file,
2320 char __user *buf, size_t nbytes,
2321 loff_t *ppos)
2322{
2323 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2324 u64 val = cft->read_u64(cgrp, cft);
2325 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2326
2327 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2328}
2329
2330static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
2331 struct file *file,
2332 char __user *buf, size_t nbytes,
2333 loff_t *ppos)
2334{
2335 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2336 s64 val = cft->read_s64(cgrp, cft);
2337 int len = sprintf(tmp, "%lld\n", (long long) val);
2338
2339 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2340}
2341
2342static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2343 size_t nbytes, loff_t *ppos)
2344{
2345 struct cftype *cft = __d_cft(file->f_dentry);
2346 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2347
2348 if (cgroup_is_removed(cgrp))
2349 return -ENODEV;
2350
2351 if (cft->read)
2352 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
2353 if (cft->read_u64)
2354 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
2355 if (cft->read_s64)
2356 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
2357 return -EINVAL;
2358}
2359
2360
2361
2362
2363
2364
/*
 * Per-open state for seq_file-backed cgroup files; allocated in
 * cgroup_file_open() and freed in cgroup_seqfile_release().
 */
struct cgroup_seqfile_state {
	struct cftype *cft;
	struct cgroup *cgroup;
};
2369
2370static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2371{
2372 struct seq_file *sf = cb->state;
2373 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2374}
2375
2376static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2377{
2378 struct cgroup_seqfile_state *state = m->private;
2379 struct cftype *cft = state->cft;
2380 if (cft->read_map) {
2381 struct cgroup_map_cb cb = {
2382 .fill = cgroup_map_add,
2383 .state = m,
2384 };
2385 return cft->read_map(state->cgroup, cft, &cb);
2386 }
2387 return cft->read_seq_string(state->cgroup, cft, m);
2388}
2389
2390static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2391{
2392 struct seq_file *seq = file->private_data;
2393 kfree(seq->private);
2394 return single_release(inode, file);
2395}
2396
/* File ops installed by cgroup_file_open() for read_map/read_seq_string
 * cftypes; reads go through seq_file, writes through the normal path. */
static const struct file_operations cgroup_seqfile_operations = {
	.read = seq_read,
	.write = cgroup_file_write,
	.llseek = seq_lseek,
	.release = cgroup_seqfile_release,
};
2403
2404static int cgroup_file_open(struct inode *inode, struct file *file)
2405{
2406 int err;
2407 struct cftype *cft;
2408
2409 err = generic_file_open(inode, file);
2410 if (err)
2411 return err;
2412 cft = __d_cft(file->f_dentry);
2413
2414 if (cft->read_map || cft->read_seq_string) {
2415 struct cgroup_seqfile_state *state =
2416 kzalloc(sizeof(*state), GFP_USER);
2417 if (!state)
2418 return -ENOMEM;
2419 state->cft = cft;
2420 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2421 file->f_op = &cgroup_seqfile_operations;
2422 err = single_open(file, cgroup_seqfile_show, state);
2423 if (err < 0)
2424 kfree(state);
2425 } else if (cft->open)
2426 err = cft->open(inode, file);
2427 else
2428 err = 0;
2429
2430 return err;
2431}
2432
2433static int cgroup_file_release(struct inode *inode, struct file *file)
2434{
2435 struct cftype *cft = __d_cft(file->f_dentry);
2436 if (cft->release)
2437 return cft->release(inode, file);
2438 return 0;
2439}
2440
2441
2442
2443
/*
 * cgroup_rename - Only allow simple rename of directories in place.
 * Renames only the cgroup's name; the dentry move is done by
 * simple_rename() and the cached name is swapped under RCU.
 */
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry)
{
	int ret;
	struct cgroup_name *name, *old_name;
	struct cgroup *cgrp;

	/*
	 * It's convenient to use parent dir's i_mutex to protected
	 * cgrp->name.
	 */
	lockdep_assert_held(&old_dir->i_mutex);

	if (!S_ISDIR(old_dentry->d_inode->i_mode))
		return -ENOTDIR;
	if (new_dentry->d_inode)
		return -EEXIST;
	if (old_dir != new_dir)
		return -EIO;	/* moving between directories not supported */

	cgrp = __d_cgrp(old_dentry);

	name = cgroup_alloc_name(new_dentry);
	if (!name)
		return -ENOMEM;

	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (ret) {
		kfree(name);
		return ret;
	}

	old_name = cgrp->name;
	rcu_assign_pointer(cgrp->name, name);

	/* Free the old name after readers are done with it. */
	kfree_rcu(old_name, rcu_head);
	return 0;
}
2482
2483static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2484{
2485 if (S_ISDIR(dentry->d_inode->i_mode))
2486 return &__d_cgrp(dentry)->xattrs;
2487 else
2488 return &__d_cfe(dentry)->xattrs;
2489}
2490
2491static inline int xattr_enabled(struct dentry *dentry)
2492{
2493 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2494 return root->flags & CGRP_ROOT_XATTR;
2495}
2496
2497static bool is_valid_xattr(const char *name)
2498{
2499 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2500 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2501 return true;
2502 return false;
2503}
2504
2505static int cgroup_setxattr(struct dentry *dentry, const char *name,
2506 const void *val, size_t size, int flags)
2507{
2508 if (!xattr_enabled(dentry))
2509 return -EOPNOTSUPP;
2510 if (!is_valid_xattr(name))
2511 return -EINVAL;
2512 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2513}
2514
2515static int cgroup_removexattr(struct dentry *dentry, const char *name)
2516{
2517 if (!xattr_enabled(dentry))
2518 return -EOPNOTSUPP;
2519 if (!is_valid_xattr(name))
2520 return -EINVAL;
2521 return simple_xattr_remove(__d_xattrs(dentry), name);
2522}
2523
2524static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2525 void *buf, size_t size)
2526{
2527 if (!xattr_enabled(dentry))
2528 return -EOPNOTSUPP;
2529 if (!is_valid_xattr(name))
2530 return -EINVAL;
2531 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2532}
2533
2534static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2535{
2536 if (!xattr_enabled(dentry))
2537 return -EOPNOTSUPP;
2538 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2539}
2540
/* Default file ops for regular cgroup control files. */
static const struct file_operations cgroup_file_operations = {
	.read = cgroup_file_read,
	.write = cgroup_file_write,
	.llseek = generic_file_llseek,
	.open = cgroup_file_open,
	.release = cgroup_file_release,
};

/* Inode ops for regular cgroup files: xattrs only. */
static const struct inode_operations cgroup_file_inode_operations = {
	.setxattr = cgroup_setxattr,
	.getxattr = cgroup_getxattr,
	.listxattr = cgroup_listxattr,
	.removexattr = cgroup_removexattr,
};

/* Inode ops for cgroup directories: mkdir/rmdir create and destroy
 * cgroups; rename only changes a cgroup's name in place. */
static const struct inode_operations cgroup_dir_inode_operations = {
	.lookup = cgroup_lookup,
	.mkdir = cgroup_mkdir,
	.rmdir = cgroup_rmdir,
	.rename = cgroup_rename,
	.setxattr = cgroup_setxattr,
	.getxattr = cgroup_getxattr,
	.listxattr = cgroup_listxattr,
	.removexattr = cgroup_removexattr,
};
2566
2567static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2568{
2569 if (dentry->d_name.len > NAME_MAX)
2570 return ERR_PTR(-ENAMETOOLONG);
2571 d_add(dentry, NULL);
2572 return NULL;
2573}
2574
2575
2576
2577
2578static inline struct cftype *__file_cft(struct file *file)
2579{
2580 if (file_inode(file)->i_fop != &cgroup_file_operations)
2581 return ERR_PTR(-EINVAL);
2582 return __d_cft(file->f_dentry);
2583}
2584
/*
 * Allocate and instantiate an inode for @dentry.  For directories this
 * also bumps the parent's link count and leaves the new inode's
 * i_mutex held (see comment below); for regular files it installs the
 * standard cgroup file ops.
 */
static int cgroup_create_file(struct dentry *dentry, umode_t mode,
			      struct super_block *sb)
{
	struct inode *inode;

	if (!dentry)
		return -ENOENT;
	if (dentry->d_inode)
		return -EEXIST;

	inode = cgroup_new_inode(mode, sb);
	if (!inode)
		return -ENOMEM;

	if (S_ISDIR(mode)) {
		inode->i_op = &cgroup_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;

		/* start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		inc_nlink(dentry->d_parent->d_inode);

		/*
		 * Control reaches here with cgroup_mutex held.
		 * @inode->i_mutex should nest outside cgroup_mutex but we
		 * want to populate it immediately without releasing
		 * cgroup_mutex.  As @inode isn't visible to anyone else
		 * yet, trylock will always succeed without affecting
		 * lockdep checks.
		 */
		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
	} else if (S_ISREG(mode)) {
		inode->i_size = 0;
		inode->i_fop = &cgroup_file_operations;
		inode->i_op = &cgroup_file_inode_operations;
	}
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635static umode_t cgroup_file_mode(const struct cftype *cft)
2636{
2637 umode_t mode = 0;
2638
2639 if (cft->mode)
2640 return cft->mode;
2641
2642 if (cft->read || cft->read_u64 || cft->read_s64 ||
2643 cft->read_map || cft->read_seq_string)
2644 mode |= S_IRUGO;
2645
2646 if (cft->write || cft->write_u64 || cft->write_s64 ||
2647 cft->write_string || cft->trigger)
2648 mode |= S_IWUSR;
2649
2650 return mode;
2651}
2652
/*
 * Create one control file for @cft in @cgrp's directory, prefixing the
 * name with "<subsys>." unless the hierarchy was mounted with noprefix.
 * On success the cfent is linked into the cgroup's file list and owns
 * the dentry (via dentry->d_fsdata).
 */
static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
			   struct cftype *cft)
{
	struct dentry *dir = cgrp->dentry;
	struct cgroup *parent = __d_cgrp(dir);
	struct dentry *dentry;
	struct cfent *cfe;
	int error;
	umode_t mode;
	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };

	/* Build "<subsys>.<name>" unless the hierarchy says noprefix. */
	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
		strcpy(name, subsys->name);
		strcat(name, ".");
	}
	strcat(name, cft->name);

	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));

	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
	if (!cfe)
		return -ENOMEM;

	dentry = lookup_one_len(name, dir, strlen(name));
	if (IS_ERR(dentry)) {
		error = PTR_ERR(dentry);
		goto out;
	}

	cfe->type = (void *)cft;
	cfe->dentry = dentry;
	dentry->d_fsdata = cfe;
	simple_xattrs_init(&cfe->xattrs);

	mode = cgroup_file_mode(cft);
	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
	if (!error) {
		list_add_tail(&cfe->node, &parent->files);
		cfe = NULL;	/* ownership transferred to the cgroup */
	}
	dput(dentry);
out:
	kfree(cfe);	/* no-op on success (cfe is NULL) */
	return error;
}
2698
/*
 * Add or remove a whole cftype array in @cgrp, honouring the
 * CFTYPE_INSANE / *_ON_ROOT visibility flags.  Addition failures are
 * logged but don't stop the loop; the return value is the LAST error
 * seen (so a trailing success resets it to 0 — note for reviewers).
 */
static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
			      struct cftype cfts[], bool is_add)
{
	struct cftype *cft;
	int err, ret = 0;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
			continue;

		if (is_add) {
			err = cgroup_add_file(cgrp, subsys, cft);
			if (err)
				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
					cft->name, err);
			ret = err;
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return ret;
}
2726
2727static DEFINE_MUTEX(cgroup_cft_mutex);
2728
static void cgroup_cfts_prepare(void)
	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
{
	/*
	 * Thanks to the entanglement with vfs inode locking, we can't walk
	 * the existing cgroups under cgroup_mutex and create files.
	 * Instead, we increment reference on all cgroups and build list of
	 * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
	 * exclusive access to the field.  Both locks are released by
	 * cgroup_cfts_commit().
	 */
	mutex_lock(&cgroup_cft_mutex);
	mutex_lock(&cgroup_mutex);
}
2742
/*
 * Apply @cfts (add or remove, per @is_add) to every cgroup in @ss's
 * hierarchy and release the locks taken by cgroup_cfts_prepare().
 * The superblock is pinned via s_active so the hierarchy can't be
 * unmounted while files are being created/removed outside cgroup_mutex.
 */
static void cgroup_cfts_commit(struct cgroup_subsys *ss,
			       struct cftype *cfts, bool is_add)
	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
{
	LIST_HEAD(pending);
	struct cgroup *cgrp, *n;
	struct super_block *sb = ss->root->sb;

	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
	if (cfts && ss->root != &rootnode &&
	    atomic_inc_not_zero(&sb->s_active)) {
		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
			dget(cgrp->dentry);
			list_add_tail(&cgrp->cft_q_node, &pending);
		}
	} else {
		sb = NULL;	/* nothing to do, skip the walk below */
	}

	mutex_unlock(&cgroup_mutex);

	/*
	 * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
	 * files for all cgroups which were created before.
	 */
	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
		struct inode *inode = cgrp->dentry->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		if (!cgroup_is_removed(cgrp))
			cgroup_addrm_files(cgrp, ss, cfts, is_add);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);

		list_del_init(&cgrp->cft_q_node);
		dput(cgrp->dentry);
	}

	if (sb)
		deactivate_super(sb);	/* drop the s_active pin */

	mutex_unlock(&cgroup_cft_mutex);
}
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that
 * this function currently returns 0 as long as @cfts registration is
 * successful even if some file creation attempts on existing cgroups
 * fail.
 */
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype_set *set;

	set = kzalloc(sizeof(*set), GFP_KERNEL);
	if (!set)
		return -ENOMEM;

	cgroup_cfts_prepare();
	set->cfts = cfts;
	list_add_tail(&set->node, &ss->cftsets);
	cgroup_cfts_commit(ss, cfts, true);

	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts from @ss.  Files described by @cfts are removed from
 * all existing cgroups to which @ss is attached and all future cgroups
 * won't have them either.  This function can be called anytime whether
 * @ss is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered with @ss.
 */
int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype_set *set;

	cgroup_cfts_prepare();

	list_for_each_entry(set, &ss->cftsets, node) {
		if (set->cfts == cfts) {
			list_del_init(&set->node);
			cgroup_cfts_commit(ss, cfts, false);
			return 0;
		}
	}

	/* not found; commit with NULL just releases the locks */
	cgroup_cfts_commit(ss, NULL, false);
	return -ENOENT;
}
2849
2850
2851
2852
2853
2854
2855
2856int cgroup_task_count(const struct cgroup *cgrp)
2857{
2858 int count = 0;
2859 struct cg_cgroup_link *link;
2860
2861 read_lock(&css_set_lock);
2862 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
2863 count += atomic_read(&link->cg->refcount);
2864 }
2865 read_unlock(&css_set_lock);
2866 return count;
2867}
2868
2869
2870
2871
2872
/*
 * Advance a task iterator to the next css_set linked to @cgrp that has
 * at least one task, skipping empty css_sets.  Sets it->cg_link to
 * NULL when the list is exhausted.  Must be called with css_set_lock
 * held (taken by cgroup_iter_start()).
 */
static void cgroup_advance_iter(struct cgroup *cgrp,
				struct cgroup_iter *it)
{
	struct list_head *l = it->cg_link;
	struct cg_cgroup_link *link;
	struct css_set *cg;

	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
		if (l == &cgrp->css_sets) {
			it->cg_link = NULL;	/* iteration finished */
			return;
		}
		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
		cg = link->cg;
	} while (list_empty(&cg->tasks));
	it->cg_link = l;
	it->task = cg->tasks.next;
}
2893
2894
2895
2896
2897
2898
2899
2900static void cgroup_enable_task_cg_lists(void)
2901{
2902 struct task_struct *p, *g;
2903 write_lock(&css_set_lock);
2904 use_task_css_set_links = 1;
2905
2906
2907
2908
2909
2910
2911
2912 qread_lock(&tasklist_lock);
2913 do_each_thread(g, p) {
2914 task_lock(p);
2915
2916
2917
2918
2919
2920
2921
2922 spin_lock_irq(&p->sighand->siglock);
2923 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2924 list_add(&p->cg_list, &p->cgroups->tasks);
2925 spin_unlock_irq(&p->sighand->siglock);
2926
2927 task_unlock(p);
2928 } while_each_thread(g, p);
2929 qread_unlock(&tasklist_lock);
2930 write_unlock(&css_set_lock);
2931}
2932
2933
2934
2935
2936
2937
2938
2939
2940
/**
 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @cgroup: cgroup whose descendants to walk
 *
 * To be used by cgroup_for_each_descendant_pre().  Find the next
 * descendant to visit for pre-order traversal of @cgroup's descendants.
 * Must be called under rcu_read_lock().
 */
struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
					  struct cgroup *cgroup)
{
	struct cgroup *next;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* if first iteration, pretend we just visited @cgroup */
	if (!pos)
		pos = cgroup;

	/* visit the first child if exists */
	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
	while (pos != cgroup) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup,
				      sibling);
		if (&next->sibling != &pos->parent->children)
			return next;

		pos = pos->parent;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2970
2971
2972
2973
2974
2975
2976
2977
2978
/**
 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
 * @pos: cgroup of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant,
 * @pos is returned.  Must be called under rcu_read_lock().
 */
struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
{
	struct cgroup *last, *tmp;

	WARN_ON_ONCE(!rcu_read_lock_held());

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
		list_for_each_entry_rcu(tmp, &last->children, sibling)
			pos = tmp;
	} while (pos);

	return last;
}
EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
2996
/*
 * Return the deepest first-child descendant of @pos (or @pos itself if
 * it has no children).  Caller must hold rcu_read_lock().
 */
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
	struct cgroup *last;

	do {
		last = pos;
		pos = list_first_or_null_rcu(&pos->children, struct cgroup,
					     sibling);
	} while (pos);

	return last;
}
3009
3010
3011
3012
3013
3014
3015
3016
3017
/**
 * cgroup_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @cgroup: cgroup whose descendants to walk
 *
 * To be used by cgroup_for_each_descendant_post().  Find the next
 * descendant to visit for post-order traversal of @cgroup's descendants.
 * Must be called under rcu_read_lock().
 */
struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
					   struct cgroup *cgroup)
{
	struct cgroup *next;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* if first iteration, visit the leftmost descendant */
	if (!pos) {
		next = cgroup_leftmost_descendant(cgroup);
		return next != cgroup ? next : NULL;
	}

	/* if there's an unvisited sibling, visit its leftmost descendant */
	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
	if (&next->sibling != &pos->parent->children)
		return cgroup_leftmost_descendant(next);

	/* no sibling left, visit parent */
	next = pos->parent;
	return next != cgroup ? next : NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3041
/*
 * cgroup_iter_start - begin iterating the tasks attached to @cgrp.
 * Takes the read side of css_set_lock, which is only released by the
 * matching cgroup_iter_end(); no blocking between the two.
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
	__acquires(css_set_lock)
{
	/*
	 * The first time anyone tries to iterate across a cgroup,
	 * we need to enable the list linking each css_set to its
	 * tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	read_lock(&css_set_lock);
	it->cg_link = &cgrp->css_sets;
	cgroup_advance_iter(cgrp, it);
}
3057
/*
 * cgroup_iter_next - return the next task in the iteration, or NULL when
 * every css_set linked to @cgrp has been exhausted.  Caller must be
 * between cgroup_iter_start() and cgroup_iter_end().
 */
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
					struct cgroup_iter *it)
{
	struct task_struct *res;
	struct list_head *l = it->task;
	struct cg_cgroup_link *link;

	/* If the iterator cg is NULL, we have no tasks */
	if (!it->cg_link)
		return NULL;
	res = list_entry(l, struct task_struct, cg_list);
	/* Advance iterator to find next entry */
	l = l->next;
	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
	if (l == &link->cg->tasks) {
		/*
		 * We reached the end of this task list - move on to the
		 * next cg_cgroup_link.
		 */
		cgroup_advance_iter(cgrp, it);
	} else {
		it->task = l;
	}
	return res;
}
3081
/* Finish an iteration started with cgroup_iter_start(), dropping css_set_lock. */
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
	__releases(css_set_lock)
{
	read_unlock(&css_set_lock);
}
3087
3088static inline int started_after_time(struct task_struct *t1,
3089 struct timespec *time,
3090 struct task_struct *t2)
3091{
3092 int start_diff = timespec_compare(&t1->start_time, time);
3093 if (start_diff > 0) {
3094 return 1;
3095 } else if (start_diff < 0) {
3096 return 0;
3097 } else {
3098
3099
3100
3101
3102
3103
3104
3105
3106 return t1 > t2;
3107 }
3108}
3109
3110
3111
3112
3113
3114
3115static inline int started_after(void *p1, void *p2)
3116{
3117 struct task_struct *t1 = p1;
3118 struct task_struct *t2 = p2;
3119 return started_after_time(t1, &t2->start_time, t2);
3120}
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
/**
 * cgroup_scan_tasks - iterate over the tasks in a cgroup, applying a callback
 * @scan: struct cgroup_scanner with the cgroup, callbacks and optional heap
 *
 * Gathers qualifying tasks into a bounded priority heap (ordered by task
 * start time) so that scan->process_task() can be invoked without holding
 * css_set_lock, then rescans from the last processed start time until no
 * new tasks appear.  Returns 0 on success, -errno if the temporary heap
 * cannot be allocated.
 */
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
	int retval, i;
	struct cgroup_iter it;
	struct task_struct *p, *dropped;
	/* Never dereference latest_task, since it's not refcounted */
	struct task_struct *latest_task = NULL;
	struct ptr_heap tmp_heap;
	struct ptr_heap *heap;
	struct timespec latest_time = { 0, 0 };

	if (scan->heap) {
		/* The caller supplied our heap and pre-allocated its memory */
		heap = scan->heap;
		heap->gt = &started_after;
	} else {
		/* We need to allocate our own heap memory */
		heap = &tmp_heap;
		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		if (retval)
			/* cannot allocate the heap */
			return retval;
	}

 again:
	/*
	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
	 * to select the ones of interest.  Since we don't want to hold any
	 * locks during the task updates, gather tasks to be processed in a
	 * heap structure; the heap is bounded, so if it overflows we only
	 * process the oldest tasks and rescan for the rest afterwards.
	 */
	heap->size = 0;
	cgroup_iter_start(scan->cg, &it);
	while ((p = cgroup_iter_next(scan->cg, &it))) {
		/*
		 * Only affect tasks that qualify per the caller's callback,
		 * if one was provided.
		 */
		if (scan->test_task && !scan->test_task(p, scan))
			continue;
		/*
		 * Only process tasks that started after the last task
		 * we processed (resume point for repeated passes).
		 */
		if (!started_after_time(p, &latest_time, latest_task))
			continue;
		dropped = heap_insert(heap, p);
		if (dropped == NULL) {
			/*
			 * The new task was inserted; the heap wasn't
			 * previously full.  Hold a ref across the unlocked
			 * process_task() call below.
			 */
			get_task_struct(p);
		} else if (dropped != p) {
			/*
			 * The new task was inserted and pushed out a
			 * different (newer) task - swap the references.
			 */
			get_task_struct(p);
			put_task_struct(dropped);
		}
		/*
		 * Else the new task was newer than anything already in the
		 * heap and wasn't inserted; it will be picked up on the
		 * next pass.
		 */
	}
	cgroup_iter_end(scan->cg, &it);

	if (heap->size) {
		for (i = 0; i < heap->size; i++) {
			struct task_struct *q = heap->ptrs[i];
			if (i == 0) {
				/*
				 * ptrs[0] is the oldest entry - remember it
				 * as the resume point for the next pass.
				 */
				latest_time = q->start_time;
				latest_task = q;
			}
			/* Process the task per the caller's callback */
			scan->process_task(q, scan);
			put_task_struct(q);
		}
		/*
		 * We processed at least one task - scan again in case some
		 * were in the middle of forking children that didn't get
		 * processed, or the heap overflowed.
		 */
		goto again;
	}
	if (heap == &tmp_heap)
		heap_free(&tmp_heap);
	return 0;
}
3247
/*
 * cgroup_scan_tasks() callback: attach @task to the destination cgroup
 * passed via scan->data.  cgroup_attach_task() requires cgroup_mutex.
 */
static void cgroup_transfer_one_task(struct task_struct *task,
				     struct cgroup_scanner *scan)
{
	struct cgroup *new_cgroup = scan->data;

	mutex_lock(&cgroup_mutex);
	cgroup_attach_task(new_cgroup, task, false);
	mutex_unlock(&cgroup_mutex);
}
3257
3258
3259
3260
3261
3262
3263int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3264{
3265 struct cgroup_scanner scan;
3266
3267 scan.cg = from;
3268 scan.test_task = NULL;
3269 scan.process_task = cgroup_transfer_one_task;
3270 scan.heap = NULL;
3271 scan.data = to;
3272
3273 return cgroup_scan_tasks(&scan);
3274}
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,	/* "cgroup.procs": unique tgids */
	CGROUP_FILE_TASKS,	/* "tasks": all pids, duplicates possible */
};
3291
3292
3293
3294
3295
3296
3297
/*
 * A cached sorted array of pids seen through one of the pidlist files,
 * shared by all openers in the same pid namespace.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* how many files are using the current array */
	int use_count;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* protects the other fields */
	struct rw_semaphore mutex;
};
3317
3318
3319
3320
3321
3322
3323#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3324static void *pidlist_allocate(int count)
3325{
3326 if (PIDLIST_TOO_LARGE(count))
3327 return vmalloc(count * sizeof(pid_t));
3328 else
3329 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3330}
/* Free a buffer obtained from pidlist_allocate(), whichever pool it came from. */
static void pidlist_free(void *p)
{
	if (is_vmalloc_addr(p)) {
		vfree(p);
		return;
	}
	kfree(p);
}
3338
3339
3340
3341
3342
/*
 * pidlist_uniq - compact duplicate entries out of a sorted pid array
 * @list: sorted array of pids
 * @length: number of entries in @list
 *
 * Keeps the first occurrence of each value, shifting survivors left
 * in place.  Returns the new (deduplicated) length.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int read, write;

	/* zero or one entry is trivially unique */
	if (length < 2)
		return length;

	write = 1;
	for (read = 1; read < length; read++) {
		/* only keep entries that differ from their predecessor */
		if (list[read] != list[read - 1])
			list[write++] = list[read];
	}

	return write;
}
3368
/*
 * sort() comparator for pid_t values.  Plain subtraction is safe here:
 * pids are small positive ints, so the difference cannot overflow.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t *x = a;
	const pid_t *y = b;

	return *x - *y;
}
3373
3374
3375
3376
3377
3378
3379
/*
 * find the appropriate pidlist for our purpose (given procs vs tasks),
 * creating one if necessary.  Returns with the pidlist's rwsem held for
 * write and its pidlist_mutex dropped, or NULL (no locks held) on
 * allocation failure.
 */
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* pidlists are shared per (type, pid namespace) pair */
	struct pid_namespace *ns = task_active_pid_ns(current);

	/*
	 * We can't drop pidlist_mutex before taking l->mutex: the last
	 * ref-holder could be removing l from the list at the same time
	 * (see cgroup_release_pid_array()).  Holding pidlist_mutex keeps
	 * whichever list we find from vanishing out from under us.
	 */
	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry(l, &cgrp->pidlists, links) {
		if (l->key.type == type && l->key.ns == ns) {
			/* make sure l doesn't vanish out from under us */
			down_write(&l->mutex);
			mutex_unlock(&cgrp->pidlist_mutex);
			return l;
		}
	}
	/* entry not found; create a new one */
	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l) {
		mutex_unlock(&cgrp->pidlist_mutex);
		return l;
	}
	init_rwsem(&l->mutex);
	down_write(&l->mutex);
	l->key.type = type;
	l->key.ns = get_pid_ns(ns); /* pin the namespace while cached */
	l->use_count = 0; /* caller (pidlist_array_load) takes the first ref */
	l->list = NULL;
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	mutex_unlock(&cgrp->pidlist_mutex);
	return l;
}
3419
3420
3421
3422
/*
 * Load a cgroup's pidarray with all the tasks' pids (or, for the procs
 * file, unique tgids).  On success, stores a use-counted pidlist in *@lp
 * and returns 0; returns -ENOMEM on allocation failure.
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct cgroup_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	cgroup_iter_end(cgrp, &it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);
	l = cgroup_pidlist_find(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}
	/* store array, freeing old if necessary - lock already held */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	l->use_count++;
	up_write(&l->mutex);
	*lp = l;
	return 0;
}
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.  Counts the tasks in @dentry's cgroup by scheduler state.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	int ret = -EINVAL;
	struct cgroup *cgrp;
	struct cgroup_iter it;
	struct task_struct *tsk;

	/*
	 * Validate dentry by checking the superblock operations,
	 * and make sure it's a directory.
	 */
	if (dentry->d_sb->s_op != &cgroup_ops ||
	    !S_ISDIR(dentry->d_inode->i_mode))
		 goto err;

	ret = 0;
	cgrp = dentry->d_fsdata;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	cgroup_iter_end(cgrp, &it);

err:
	return ret;
}
3530
3531
3532
3533
3534
3535
3536
3537
/*
 * seq_file ->start() for pidlist files.
 *
 * The seq_file position is the pid we last displayed (0 for a fresh
 * open), NOT an index into the array, so that a seek after pids have
 * been added or removed still resumes at (or just past) the right pid.
 * Binary-search the sorted array for *pos, pin the list with the read
 * side of l->mutex (released in ->stop()), and return a pointer into
 * the array or NULL at EOF.
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start). Once the array index is found,
	 * this pointer is walked through the pidlist entries.
	 */
	struct cgroup_pidlist *l = s->private;
	int index = 0, pid = *pos;
	int *iter;

	down_read(&l->mutex);
	if (pid) {
		int end = l->length;

		/* binary search for the first entry >= pid */
		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}
3573
/* seq_file ->stop(): drop the read lock taken in cgroup_pidlist_start(). */
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct cgroup_pidlist *l = s->private;
	up_read(&l->mutex);
}
3579
3580static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3581{
3582 struct cgroup_pidlist *l = s->private;
3583 pid_t *p = v;
3584 pid_t *end = l->list + l->length;
3585
3586
3587
3588
3589 p++;
3590 if (p >= end) {
3591 return NULL;
3592 } else {
3593 *pos = *p;
3594 return p;
3595 }
3596}
3597
/* seq_file ->show(): print one pid per line. */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	return seq_printf(s, "%d\n", *(int *)v);
}
3602
3603
3604
3605
3606
/*
 * seq_file operations for the "tasks"/"procs" files: a read-locked,
 * pid-position iteration over the cached, sorted pidlist.
 */
static const struct seq_operations cgroup_pidlist_seq_operations = {
	.start = cgroup_pidlist_start,
	.stop = cgroup_pidlist_stop,
	.next = cgroup_pidlist_next,
	.show = cgroup_pidlist_show,
};
3613
/*
 * Drop one use of @l; when the last user goes away, unlink the pidlist
 * from its owning cgroup and free it.
 */
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
	/*
	 * the case where we're the last user of this particular pidlist will
	 * have us remove it from the cgroup's list, which entails taking the
	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
	 * pidlist_mutex, we have to take pidlist_mutex first.
	 */
	mutex_lock(&l->owner->pidlist_mutex);
	down_write(&l->mutex);
	BUG_ON(!l->use_count);
	if (!--l->use_count) {
		/* we're the last user if refcount is 0; remove and free */
		list_del(&l->links);
		mutex_unlock(&l->owner->pidlist_mutex);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		up_write(&l->mutex);
		kfree(l);
		return;
	}
	mutex_unlock(&l->owner->pidlist_mutex);
	up_write(&l->mutex);
}
3638
/* ->release() for pidlist files: drop our pidlist use and tear down the seq_file. */
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
{
	struct cgroup_pidlist *l;
	if (!(file->f_mode & FMODE_READ))
		return 0;
	/*
	 * the seq_file will only be initialized if the file was opened for
	 * reading; hence we check if it's not null only in that case.
	 */
	l = ((struct seq_file *)file->private_data)->private;
	cgroup_release_pid_array(l);
	return seq_release(inode, file);
}
3652
/*
 * file_operations installed by cgroup_pidlist_open() once a pidlist has
 * been loaded; reads go through the seq_file machinery above, writes go
 * through the generic cgroup file write path.
 */
static const struct file_operations cgroup_pidlist_operations = {
	.read = seq_read,
	.llseek = seq_lseek,
	.write = cgroup_file_write,
	.release = cgroup_pidlist_release,
};
3659
3660
3661
3662
3663
3664
3665
/*
 * The following functions handle opens on a file that displays a pidlist
 * (tasks or procs).  Prepare an array of the process/thread IDs of whoever's
 * in the cgroup.
 */
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
{
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
	struct cgroup_pidlist *l;
	int retval;

	/* Nothing to do for write-only files */
	if (!(file->f_mode & FMODE_READ))
		return 0;

	/* have the array populated; takes a use-count on the pidlist */
	retval = pidlist_array_load(cgrp, type, &l);
	if (retval)
		return retval;
	/* configure file information */
	file->f_op = &cgroup_pidlist_operations;

	retval = seq_open(file, &cgroup_pidlist_seq_operations);
	if (retval) {
		cgroup_release_pid_array(l);
		return retval;
	}
	((struct seq_file *)file->private_data)->private = l;
	return 0;
}
/* ->open() for the "tasks" file: per-thread pids. */
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
/* ->open() for the "cgroup.procs" file: unique tgids. */
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}
3699
/* read_u64 handler for "notify_on_release": 1 if the flag is set. */
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
					    struct cftype *cft)
{
	return notify_on_release(cgrp);
}
3705
/*
 * write_u64 handler for "notify_on_release".  Also clears CGRP_RELEASABLE
 * unconditionally, so the write itself never triggers a release.
 */
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
					  struct cftype *cft,
					  u64 val)
{
	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	return 0;
}
3717
3718
3719
3720
3721
3722
3723
3724
3725
/*
 * When dput() is called asynchronously, if umount has been done and
 * then deactivate_super() in cgroup_free_fn() kills the superblock,
 * it might hang.  Grab a superblock reference around the dput() so the
 * final deactivation happens here, in process context.
 */
static void cgroup_dput(struct cgroup *cgrp)
{
	struct super_block *sb = cgrp->root->sb;

	atomic_inc(&sb->s_active);
	dput(cgrp->dentry);
	deactivate_super(sb);
}
3734
3735
3736
3737
3738
3739
/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue context (never from the poll-wakeup path
 * itself, which would deadlock on the waitqueue lock).
 */
static void cgroup_event_remove(struct work_struct *work)
{
	struct cgroup_event *event = container_of(work, struct cgroup_event,
			remove);
	struct cgroup *cgrp = event->cgrp;

	remove_wait_queue(event->wqh, &event->wait);

	event->cft->unregister_event(cgrp, event->cft, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd, 1);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	/* drop the dentry/superblock refs taken at registration time */
	cgroup_dput(cgrp);
}
3757
3758
3759
3760
3761
3762
/*
 * Gets called on POLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled, so the actual
 * teardown is punted to the workqueue via event->remove.
 */
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
		int sync, void *key)
{
	struct cgroup_event *event = container_of(wait,
			struct cgroup_event, wait);
	struct cgroup *cgrp = event->cgrp;
	unsigned long flags = (unsigned long)key;

	if (flags & POLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&cgrp->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&cgrp->event_list_lock);
	}

	return 0;
}
3795
/*
 * poll_table callback: remember the eventfd's waitqueue head and hook
 * our wait entry (whose wakeup fn is cgroup_event_wake) onto it.
 */
static void cgroup_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct cgroup_event *event = container_of(pt,
			struct cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}
3805
3806
3807
3808
3809
3810
3811
/*
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by the control file implementation
 * (the cftype's register_event callback).
 */
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	struct cgroup_event *event = NULL;
	struct cgroup *cgrp_cfile;
	unsigned int efd, cfd;
	struct file *efile = NULL;
	struct file *cfile = NULL;
	char *endp;
	int ret;

	efd = simple_strtoul(buffer, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buffer = endp + 1;

	cfd = simple_strtoul(buffer, &endp, 10);
	if ((*endp != ' ') && (*endp != '\0'))
		return -EINVAL;
	buffer = endp + 1; /* remainder is passed through to register_event */

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;
	event->cgrp = cgrp;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
	INIT_WORK(&event->remove, cgroup_event_remove);

	efile = eventfd_fget(efd);
	if (IS_ERR(efile)) {
		ret = PTR_ERR(efile);
		goto fail;
	}

	event->eventfd = eventfd_ctx_fileget(efile);
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto fail;
	}

	cfile = fget(cfd);
	if (!cfile) {
		ret = -EBADF;
		goto fail;
	}

	/* the process need read permission on control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = inode_permission(file_inode(cfile), MAY_READ);
	if (ret < 0)
		goto fail;

	event->cft = __file_cft(cfile);
	if (IS_ERR(event->cft)) {
		ret = PTR_ERR(event->cft);
		goto fail;
	}

	/*
	 * The file to be monitored must be in the same cgroup as
	 * cgroup.event_control is.
	 */
	cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
	if (cgrp_cfile != cgrp) {
		ret = -EINVAL;
		goto fail;
	}

	if (!event->cft->register_event || !event->cft->unregister_event) {
		ret = -EINVAL;
		goto fail;
	}

	ret = event->cft->register_event(cgrp, event->cft,
			event->eventfd, buffer);
	if (ret)
		goto fail;

	/* hooks event->wait onto the eventfd's waitqueue */
	efile->f_op->poll(efile, &event->pt);

	/*
	 * Events should be removed after rmdir of cgroup directory, but before
	 * destroying subsystem state objects. Let's take reference to cgroup
	 * directory dentry to do that.
	 */
	dget(cgrp->dentry);

	spin_lock(&cgrp->event_list_lock);
	list_add(&event->list, &cgrp->event_list);
	spin_unlock(&cgrp->event_list_lock);

	fput(cfile);
	fput(efile);

	return 0;

fail:
	/* unwind in reverse order; each step guards against partial setup */
	if (cfile)
		fput(cfile);

	if (event && event->eventfd && !IS_ERR(event->eventfd))
		eventfd_ctx_put(event->eventfd);

	if (!IS_ERR_OR_NULL(efile))
		fput(efile);

	kfree(event);

	return ret;
}
3924
/* read_u64 handler for "cgroup.clone_children". */
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
					    struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
}
3930
/* write_u64 handler for "cgroup.clone_children": set/clear the flag. */
static int cgroup_clone_children_write(struct cgroup *cgrp,
				     struct cftype *cft,
				     u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
	return 0;
}
3941
3942
3943
3944
3945
/*
 * for the common functions, 'private' gives the type of file
 */
/* for hysterical raisins, we can't put this on the older files */
#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
static struct cftype files[] = {
	{
		.name = "tasks",
		.open = cgroup_tasks_open,
		.write_u64 = cgroup_tasks_write,
		.release = cgroup_pidlist_release,
		.mode = S_IRUGO | S_IWUSR,
	},
	{
		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
		.open = cgroup_procs_open,
		.write_u64 = cgroup_procs_write,
		.release = cgroup_pidlist_release,
		.mode = S_IRUGO | S_IWUSR,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
		.write_string = cgroup_write_event_control,
		.mode = S_IWUGO,
	},
	{
		.name = "cgroup.clone_children",
		.flags = CFTYPE_INSANE,	/* hidden under sane_behavior */
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_seq_string = cgroup_sane_behavior_show,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_seq_string = cgroup_release_agent_show,
		.write_string = cgroup_release_agent_write,
		.max_write_len = PATH_MAX,
	},
	{ }	/* terminate */
};
3992
3993
3994
3995
3996
3997
3998
/**
 * cgroup_populate_dir - selectively creation of files in a directory
 * @cgrp: target cgroup
 * @base_files: true if the base files should be added
 * @subsys_mask: mask of the subsystem ids whose files should be added
 *
 * NOTE(review): per-subsystem cgroup_addrm_files() failures are ignored
 * here (only base-file failures propagate) - confirm that is intended.
 */
static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
			       unsigned long subsys_mask)
{
	int err;
	struct cgroup_subsys *ss;

	if (base_files) {
		err = cgroup_addrm_files(cgrp, NULL, files, true);
		if (err < 0)
			return err;
	}

	/* process cftsets of each subsystem */
	for_each_subsys(cgrp->root, ss) {
		struct cftype_set *set;
		if (!test_bit(ss->subsys_id, &subsys_mask))
			continue;

		list_for_each_entry(set, &ss->cftsets, node)
			cgroup_addrm_files(cgrp, ss, set->cfts, true);
	}

	return 0;
}
4023
/* deferred dput() of a css's cgroup dentry - see init_cgroup_css() */
static void css_dput_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, dput_work);

	cgroup_dput(css->cgroup);
}
4031
/*
 * Initialize @css for @ss on @cgrp: refcount 1, flags (CSS_ROOT for the
 * dummy top cgroup), and install it in the cgroup's subsys array.
 */
static void init_cgroup_css(struct cgroup_subsys_state *css,
			       struct cgroup_subsys *ss,
			       struct cgroup *cgrp)
{
	css->cgroup = cgrp;
	atomic_set(&css->refcnt, 1);
	css->flags = 0;
	if (cgrp == dummytop)
		css->flags |= CSS_ROOT;
	BUG_ON(cgrp->subsys[ss->subsys_id]);
	cgrp->subsys[ss->subsys_id] = css;

	/*
	 * css holds an extra ref to @cgrp->dentry which is put on the last
	 * css_put().  dput() requires process context, which css_put() may
	 * be called without.  @css->dput_work will be used to invoke
	 * dput() asynchronously from css_put().
	 */
	INIT_WORK(&css->dput_work, css_dput_fn);
}
4052
4053
/* invoke ->css_online() on a new @css if it exists and mark it online */
static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online)
		ret = ss->css_online(cgrp);
	if (!ret)
		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
	return ret;
}
4066
4067
/* if the @css is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

	lockdep_assert_held(&cgroup_mutex);

	/* offlining an already-offline css is a no-op */
	if (!(css->flags & CSS_ONLINE))
		return;

	if (ss->css_offline)
		ss->css_offline(cgrp);

	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
}
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4093 umode_t mode)
4094{
4095 struct cgroup *cgrp;
4096 struct cgroup_name *name;
4097 struct cgroupfs_root *root = parent->root;
4098 int err = 0;
4099 struct cgroup_subsys *ss;
4100 struct super_block *sb = root->sb;
4101
4102
4103 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4104 if (!cgrp)
4105 return -ENOMEM;
4106
4107 name = cgroup_alloc_name(dentry);
4108 if (!name)
4109 goto err_free_cgrp;
4110 rcu_assign_pointer(cgrp->name, name);
4111
4112 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4113 if (cgrp->id < 0)
4114 goto err_free_name;
4115
4116
4117
4118
4119
4120
4121
4122
4123 if (!cgroup_lock_live_group(parent)) {
4124 err = -ENODEV;
4125 goto err_free_id;
4126 }
4127
4128
4129
4130
4131
4132
4133 atomic_inc(&sb->s_active);
4134
4135 init_cgroup_housekeeping(cgrp);
4136
4137 dentry->d_fsdata = cgrp;
4138 cgrp->dentry = dentry;
4139
4140 cgrp->parent = parent;
4141 cgrp->root = parent->root;
4142
4143 if (notify_on_release(parent))
4144 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4145
4146 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4147 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4148
4149 for_each_subsys(root, ss) {
4150 struct cgroup_subsys_state *css;
4151
4152 css = ss->css_alloc(cgrp);
4153 if (IS_ERR(css)) {
4154 err = PTR_ERR(css);
4155 goto err_free_all;
4156 }
4157 init_cgroup_css(css, ss, cgrp);
4158 }
4159
4160
4161
4162
4163
4164
4165 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4166 if (err < 0)
4167 goto err_free_all;
4168 lockdep_assert_held(&dentry->d_inode->i_mutex);
4169
4170
4171 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4172 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4173 root->number_of_cgroups++;
4174
4175
4176 for_each_subsys(root, ss)
4177 dget(dentry);
4178
4179
4180 dget(parent->dentry);
4181
4182
4183 for_each_subsys(root, ss) {
4184 err = online_css(ss, cgrp);
4185 if (err)
4186 goto err_destroy;
4187
4188 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4189 parent->parent) {
4190 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4191 current->comm, current->pid, ss->name);
4192 if (!strcmp(ss->name, "memory"))
4193 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4194 ss->warned_broken_hierarchy = true;
4195 }
4196 }
4197
4198 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4199 if (err)
4200 goto err_destroy;
4201
4202 mutex_unlock(&cgroup_mutex);
4203 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4204
4205 return 0;
4206
4207err_free_all:
4208 for_each_subsys(root, ss) {
4209 if (cgrp->subsys[ss->subsys_id])
4210 ss->css_free(cgrp);
4211 }
4212 mutex_unlock(&cgroup_mutex);
4213
4214 deactivate_super(sb);
4215err_free_id:
4216 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4217err_free_name:
4218 kfree(rcu_dereference_raw(cgrp->name));
4219err_free_cgrp:
4220 kfree(cgrp);
4221 return err;
4222
4223err_destroy:
4224 cgroup_destroy_locked(cgrp);
4225 mutex_unlock(&cgroup_mutex);
4226 mutex_unlock(&dentry->d_inode->i_mutex);
4227 return err;
4228}
4229
/* VFS ->mkdir(): create a child cgroup under the dentry's parent cgroup. */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct cgroup *c_parent = dentry->d_parent->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
4237
/*
 * Tear down @cgrp: deactivate and offline its css's, unlink it from the
 * hierarchy, remove its directory and flush pending userspace events.
 * Caller holds cgroup_mutex and the cgroup directory's i_mutex.  Returns
 * -EBUSY if the cgroup is still populated or has children.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct dentry *d = cgrp->dentry;
	struct cgroup *parent = cgrp->parent;
	struct cgroup_event *event, *tmp;
	struct cgroup_subsys *ss;

	lockdep_assert_held(&d->d_inode->i_mutex);
	lockdep_assert_held(&cgroup_mutex);

	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
		return -EBUSY;

	/*
	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
	 * removed.  This makes future css_tryget() and child creation
	 * attempts fail thus maintaining the removal conditions verified
	 * above.
	 */
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

		WARN_ON(atomic_read(&css->refcnt) < 0);
		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
	}
	set_bit(CGRP_REMOVED, &cgrp->flags);

	/* tell subsystems to initiate destruction */
	for_each_subsys(cgrp->root, ss)
		offline_css(ss, cgrp);

	/*
	 * Put all the base refs.  Each css holds an extra reference to the
	 * cgroup's dentry and cgroup removal proceeds regardless of css
	 * refs.  On the last put of each css, whenever that may be, the
	 * extra dentry ref is put so that dentry destruction happens only
	 * after all css's are released.
	 */
	for_each_subsys(cgrp->root, ss)
		css_put(cgrp->subsys[ss->subsys_id]);

	raw_spin_lock(&release_list_lock);
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
	raw_spin_unlock(&release_list_lock);

	/* delete this cgroup from parent->children */
	list_del_rcu(&cgrp->sibling);
	list_del_init(&cgrp->allcg_node);

	/* hold @d across the directory removal */
	dget(d);
	cgroup_d_remove_dir(d);
	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
	spin_lock(&cgrp->event_list_lock);
	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);

	return 0;
}
4310
/* VFS ->rmdir(): destroy the cgroup under cgroup_mutex. */
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = cgroup_destroy_locked(dentry->d_fsdata);
	mutex_unlock(&cgroup_mutex);

	return ret;
}
4321
/* initialize @ss's cftype set list and register its base cftypes, if any */
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
	INIT_LIST_HEAD(&ss->cftsets);

	/*
	 * base_cftset is embedded in subsys itself, no need to worry about
	 * deregistration.
	 */
	if (ss->base_cftypes) {
		ss->base_cftset.cfts = ss->base_cftypes;
		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
	}
}
4335
/*
 * Boot-time initialization of a built-in subsystem: hook it up to the
 * dummy hierarchy, allocate and online its root css, and wire that css
 * into init_css_set.
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	/* init base cftset */
	cgroup_init_cftsets(ss);

	/* Create the top cgroup state for this subsystem */
	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;
	css = ss->css_alloc(dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);

	/*
	 * Update the init_css_set to contain a subsys pointer to this state
	 * - since the subsystem is newly registered, all tasks and hence
	 * the init_css_set is in the subsystem's top cgroup.
	 */
	init_css_set.subsys[ss->subsys_id] = css;

	need_forkexit_callback |= ss->fork || ss->exit;

	/*
	 * At system boot, before all subsystems have been registered, no
	 * tasks have been forked, so we don't need to call the fork
	 * callbacks.
	 */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(ss, dummytop));

	mutex_unlock(&cgroup_mutex);

	/*
	 * this function shouldn't be used with modular subsystems, since
	 * they need to register a subsys_id, among other things.
	 */
	BUG_ON(ss->module);
}
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
/**
 * cgroup_load_subsys: load and register a modular subsystem at runtime
 * @ss: the subsystem to load
 *
 * This function should be called in a modular subsystem's initcall.  If
 * the subsystem is built as a module, it will be assigned a new
 * subsys_id and set up for use.  The subsystem is hooked up to the dummy
 * hierarchy and every existing css_set gets a pointer to its new root
 * css.  Returns 0 on success or -errno on failure.
 */
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;
	int i, ret;
	struct hlist_node *tmp;
	struct css_set *cg;
	unsigned long key;

	/* check name and function validity */
	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
	    ss->css_alloc == NULL || ss->css_free == NULL)
		return -EINVAL;

	/*
	 * we don't support callbacks in modular subsystems. this check is
	 * before the ss->module check for consistency; a subsystem that could
	 * be a module should still have no callbacks even if the user isn't
	 * compiling it as one.
	 */
	if (ss->fork || ss->exit)
		return -EINVAL;

	/*
	 * an optionally modular subsystem is built-in: we want to do nothing,
	 * since cgroup_init_subsys will have already taken care of it.
	 */
	if (ss->module == NULL) {
		/* a sanity check */
		BUG_ON(subsys[ss->subsys_id] != ss);
		return 0;
	}

	/* init base cftset */
	cgroup_init_cftsets(ss);

	mutex_lock(&cgroup_mutex);
	subsys[ss->subsys_id] = ss;

	/*
	 * no ss->css_alloc seems to need anything important in the ss
	 * struct, so this can happen first (i.e. before the rootnode
	 * attachment).
	 */
	css = ss->css_alloc(dummytop);
	if (IS_ERR(css)) {
		/* failure case - need to deassign the subsys[] slot. */
		subsys[ss->subsys_id] = NULL;
		mutex_unlock(&cgroup_mutex);
		return PTR_ERR(css);
	}

	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;

	/* our new subsystem will be attached to the dummy hierarchy. */
	init_cgroup_css(css, ss, dummytop);

	/*
	 * Now we need to entangle the css into the existing css_sets. unlike
	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
	 * will need a new pointer to it; done by iterating the css_set_table.
	 * furthermore, modifying the existing css_sets will corrupt the hash
	 * table state, so each changed css_set will need its hash recomputed.
	 * this is all done under the css_set_lock.
	 */
	write_lock(&css_set_lock);
	hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
		/* skip entries that we already rehashed */
		if (cg->subsys[ss->subsys_id])
			continue;
		/* remove existing entry */
		hash_del(&cg->hlist);
		/* set new value */
		cg->subsys[ss->subsys_id] = css;
		/* recompute hash and restore entry */
		key = css_set_hash(cg->subsys);
		hash_add(css_set_table, &cg->hlist, key);
	}
	write_unlock(&css_set_lock);

	ret = online_css(ss, dummytop);
	if (ret)
		goto err_unload;

	/* success! */
	mutex_unlock(&cgroup_mutex);
	return 0;

err_unload:
	mutex_unlock(&cgroup_mutex);
	/* @ss can't be mounted here as try_module_get() would fail */
	cgroup_unload_subsys(ss);
	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4481
4482
4483
4484
4485
4486
4487
4488
4489
/**
 * cgroup_unload_subsys: unload a modular subsystem
 * @ss: the subsystem to unload
 *
 * This function should be called in a modular subsystem's exitcall.
 * When this function is invoked, the refcount on the subsystem's module
 * will be 0, so the subsystem will not be attached to any hierarchy.
 */
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
	struct cg_cgroup_link *link;

	BUG_ON(ss->module == NULL);

	/*
	 * we shouldn't be called if the subsystem is in use, and the use of
	 * try_module_get in parse_cgroupfs_options should ensure that it
	 * doesn't start being used while we're killing it off.
	 */
	BUG_ON(ss->root != &rootnode);

	mutex_lock(&cgroup_mutex);

	offline_css(ss, dummytop);

	/* deassign the subsys_id */
	subsys[ss->subsys_id] = NULL;

	/* remove subsystem from rootnode's list of subsystems */
	list_del_init(&ss->sibling);

	/*
	 * disentangle the css from all css_sets attached to the dummytop.  as
	 * in loading, we need to pay our respects to the hashtable gods.
	 */
	write_lock(&css_set_lock);
	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
		struct css_set *cg = link->cg;
		unsigned long key;

		hash_del(&cg->hlist);
		cg->subsys[ss->subsys_id] = NULL;
		key = css_set_hash(cg->subsys);
		hash_add(css_set_table, &cg->hlist, key);
	}
	write_unlock(&css_set_lock);

	/*
	 * remove subsystem's css from the dummytop and free it - need to
	 * free before marking as null because ss->css_free needs the
	 * cgrp->subsys pointer to find their state.
	 */
	ss->css_free(dummytop);
	dummytop->subsys[ss->subsys_id] = NULL;

	mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4540
4541
4542
4543
4544
4545
4546
/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	int i;
	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&rootnode);
	root_count = 1;
	init_task.cgroups = &init_css_set;

	/* link init_css_set to the dummy top cgroup */
	init_css_set_link.cg = &init_css_set;
	init_css_set_link.cgrp = dummytop;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		/* at bootup time, we don't worry about modular subsystems */
		if (!ss || ss->module)
			continue;

		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->css_alloc);
		BUG_ON(!ss->css_free);
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}
4588
4589
4590
4591
4592
4593
4594
/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	int err;
	int i;
	unsigned long key;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		/* at bootup time, we don't worry about modular subsystems */
		if (!ss || ss->module)
			continue;
		if (!ss->early_init)
			cgroup_init_subsys(ss);
	}

	/* Add init_css_set to the hash table */
	key = css_set_hash(init_css_set.subsys);
	hash_add(css_set_table, &init_css_set.hlist, key);
	BUG_ON(!init_root_id(&rootnode));

	err = sysfs_create_mount_point(fs_kobj, "cgroup");
	if (err)
		goto out;

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		sysfs_remove_mount_point(fs_kobj, "cgroup");
		goto out;
	}

	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

out:
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);

	return err;
}
4638
/*
 * cgroup_wq_init - allocate the workqueue used for deferred css dput.
 *
 * max_active is explicitly 1, so destruction work items execute one at
 * a time in queueing order -- presumably required so that a parent is
 * destroyed only after its children (TODO confirm against memcg
 * offline requirements).
 */
static int __init cgroup_wq_init(void)
{
	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
	BUG_ON(!cgroup_destroy_wq);
	return 0;
}
core_initcall(cgroup_wq_init);
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
/*
 * proc_cgroup_show - show /proc/<pid>/cgroup for a task.
 *
 * For each active hierarchy prints one line of the form
 * "<hierarchy_id>:<comma-separated subsystems>[,name=<name>]:<path>".
 * Returns 0 on success, -ENOMEM/-ESRCH or a cgroup_path() error.
 */
int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	/* cgroup_mutex keeps the hierarchy list and cgroup paths stable */
	mutex_lock(&cgroup_mutex);

	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int count = 0;

		seq_printf(m, "%d:", root->hierarchy_id);
		for_each_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		/* a named hierarchy additionally shows "name=<name>" */
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');
		cgrp = task_cgroup_from_root(tsk, root);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}
4719
4720
4721static int proc_cgroupstats_show(struct seq_file *m, void *v)
4722{
4723 int i;
4724
4725 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4726
4727
4728
4729
4730
4731 mutex_lock(&cgroup_mutex);
4732 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4733 struct cgroup_subsys *ss = subsys[i];
4734 if (ss == NULL)
4735 continue;
4736 seq_printf(m, "%s\t%d\t%d\t%d\n",
4737 ss->name, ss->root->hierarchy_id,
4738 ss->root->number_of_cgroups, !ss->disabled);
4739 }
4740 mutex_unlock(&cgroup_mutex);
4741 return 0;
4742}
4743
/* seq_file open hook for /proc/cgroups; single-record output. */
static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}
4748
/* File operations for /proc/cgroups (backed by a single_open seq_file). */
static const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
4755
4756static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
4757{
4758 if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
4759 return &ss_priv[i - CGROUP_CANFORK_START];
4760 return NULL;
4761}
4762
4763static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
4764{
4765 void **private = subsys_canfork_priv_p(ss_priv, i);
4766 return private ? *private : NULL;
4767}
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
/*
 * cgroup_fork - attach a newly forked task to its parent's css_set
 * @child: the new child task
 *
 * Point @child at current's css_set and take a reference on it.
 * task_lock(current) keeps current->cgroups stable across the copy
 * and the refcount bump.
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	child->cgroups = current->cgroups;
	get_css_set(child->cgroups);
	task_unlock(current);
	/* not yet linked into any css_set's task list */
	INIT_LIST_HEAD(&child->cg_list);
}
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803int cgroup_can_fork(struct task_struct *child,
4804 void *ss_priv[CGROUP_CANFORK_COUNT])
4805{
4806 struct cgroup_subsys *ss;
4807 int i, j, ret;
4808
4809 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4810 ss = subsys[i];
4811 if (!ss->can_fork)
4812 continue;
4813 ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
4814 if (ret)
4815 goto out_revert;
4816 }
4817
4818 return 0;
4819
4820out_revert:
4821 for (j = 0; j < CGROUP_BUILTIN_SUBSYS_COUNT; j++) {
4822 ss = subsys[j];
4823 if (j >= i)
4824 break;
4825 if (ss->cancel_fork)
4826 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
4827 }
4828
4829 return ret;
4830}
4831
4832
4833
4834
4835
4836
4837
4838
4839void cgroup_cancel_fork(struct task_struct *child,
4840 void *ss_priv[CGROUP_CANFORK_COUNT])
4841{
4842 struct cgroup_subsys *ss;
4843 int i;
4844
4845 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4846 ss = subsys[i];
4847 if (ss->cancel_fork)
4848 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
4849 }
4850}
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
/*
 * cgroup_post_fork - finalize cgroup setup for a newly forked task
 * @child: the task in question
 * @old_ss_priv: can_fork state captured earlier by cgroup_can_fork()
 *
 * Links @child onto its css_set's task list (only once css_set links
 * are in use) and runs each builtin subsystem's ->fork() callback.
 */
void cgroup_post_fork(struct task_struct *child,
		      void *old_ss_priv[CGROUP_CANFORK_COUNT])
{
	int i;

	/*
	 * use_task_css_set_links is read locklessly here -- presumably
	 * a concurrent transition to true is benign since tasks forked
	 * before it are migrated separately (TODO confirm).
	 * css_set_lock protects the task list; task_lock keeps
	 * child->cgroups from changing underneath us.
	 */
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		/* skip if some other path already linked the task */
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &child->cgroups->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}

	/*
	 * Run ->fork() callbacks.  Only builtin subsystems are walked,
	 * so the subsys[] slots visited here cannot be NULL.
	 */
	if (need_forkexit_callback) {
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];

			if (ss->fork)
				ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
		}
	}
}
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4947{
4948 struct css_set *cg;
4949 int i;
4950
4951
4952
4953
4954
4955
4956 if (!list_empty(&tsk->cg_list)) {
4957 write_lock(&css_set_lock);
4958 if (!list_empty(&tsk->cg_list))
4959 list_del_init(&tsk->cg_list);
4960 write_unlock(&css_set_lock);
4961 }
4962
4963
4964 task_lock(tsk);
4965 cg = tsk->cgroups;
4966 tsk->cgroups = &init_css_set;
4967
4968 if (run_callbacks && need_forkexit_callback) {
4969
4970
4971
4972
4973 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4974 struct cgroup_subsys *ss = subsys[i];
4975
4976 if (ss->exit) {
4977 struct cgroup *old_cgrp =
4978 rcu_dereference_raw(cg->subsys[i])->cgroup;
4979 struct cgroup *cgrp = task_cgroup(tsk, i);
4980 ss->exit(cgrp, old_cgrp, tsk);
4981 }
4982 }
4983 }
4984 task_unlock(tsk);
4985
4986 put_css_set_taskexit(cg);
4987}
4988
/*
 * check_for_release - queue @cgrp for release-agent notification
 * @cgrp: the cgroup to test
 *
 * If @cgrp is releasable, has no refs and no children, put it on
 * release_list and schedule release_agent_work to run the userspace
 * release agent.
 */
static void check_for_release(struct cgroup *cgrp)
{
	if (cgroup_is_releasable(cgrp) &&
	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
		/*
		 * Queue under release_list_lock, but kick the work item
		 * only after dropping the spinlock.  Skip cgroups that
		 * are already queued or being removed.
		 */
		int need_schedule_work = 0;

		raw_spin_lock(&release_list_lock);
		if (!cgroup_is_removed(cgrp) &&
		    list_empty(&cgrp->release_list)) {
			list_add(&cgrp->release_list, &release_list);
			need_schedule_work = 1;
		}
		raw_spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}
5013
5014
/*
 * __css_tryget - try to take a reference on a css
 *
 * Increment css->refcnt via cmpxchg unless the count has gone
 * negative (CSS_DEACT_BIAS applied, i.e. the css is being
 * deactivated).  Returns true on success, false once deactivated.
 */
bool __css_tryget(struct cgroup_subsys_state *css)
{
	while (true) {
		int t, v;

		/* css_refcnt() yields the value we race the cmpxchg against */
		v = css_refcnt(css);
		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
		if (likely(t == v))
			return true;
		else if (t < 0)
			/* deactivated: no new references allowed */
			return false;
		/* lost the race with another updater; retry */
		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(__css_tryget);
5030
5031
/*
 * __css_put - drop a reference on a css
 *
 * When the unbiased refcount reaches zero, defer the final dput to
 * cgroup_destroy_wq -- presumably because callers may not be in a
 * context that can block on vfs locks (TODO confirm).
 */
void __css_put(struct cgroup_subsys_state *css)
{
	int v;

	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
	if (v == 0)
		queue_work(cgroup_destroy_wq, &css->dput_work);
}
EXPORT_SYMBOL_GPL(__css_put);
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
/*
 * cgroup_release_agent - notify userspace about released cgroups
 * @work: must be &release_agent_work
 *
 * Drains release_list, invoking the hierarchy's release agent binary
 * with the cgroup's path as its single argument.  release_list_lock
 * is dropped while allocating buffers and running the helper, and
 * cgroup_mutex is dropped around call_usermodehelper() since the
 * helper may itself operate on cgroups.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	raw_spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
		list_del_init(&cgrp->release_list);
		/* drop the spinlock before allocating / sleeping */
		raw_spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal environment for the usermode helper */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/*
		 * Drop cgroup_mutex across the (possibly slow) exec of
		 * the release agent.
		 */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		raw_spin_lock(&release_list_lock);
	}
	raw_spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}
5113
5114static int __init cgroup_disable(char *str)
5115{
5116 int i;
5117 char *token;
5118
5119 while ((token = strsep(&str, ",")) != NULL) {
5120 if (!*token)
5121 continue;
5122 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5123 struct cgroup_subsys *ss = subsys[i];
5124
5125
5126
5127
5128
5129
5130 if (!ss || ss->module)
5131 continue;
5132
5133 if (!strcmp(token, ss->name)) {
5134 ss->disabled = 1;
5135 printk(KERN_INFO "Disabling %s control group"
5136 " subsystem\n", ss->name);
5137 break;
5138 }
5139 }
5140 }
5141 return 1;
5142}
5143__setup("cgroup_disable=", cgroup_disable);
5144
5145
5146
5147
5148
5149struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5150{
5151 struct cgroup *cgrp;
5152 struct inode *inode;
5153 struct cgroup_subsys_state *css;
5154
5155 inode = file_inode(f);
5156
5157 if (inode->i_op != &cgroup_dir_inode_operations)
5158 return ERR_PTR(-EBADF);
5159
5160 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
5161 return ERR_PTR(-EINVAL);
5162
5163
5164 cgrp = __d_cgrp(f->f_dentry);
5165 css = cgrp->subsys[id];
5166 return css ? css : ERR_PTR(-ENOENT);
5167}
5168
5169#ifdef CONFIG_CGROUP_DEBUG
5170static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5171{
5172 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5173
5174 if (!css)
5175 return ERR_PTR(-ENOMEM);
5176
5177 return css;
5178}
5179
/* Free the css allocated by debug_css_alloc(). */
static void debug_css_free(struct cgroup *cont)
{
	kfree(cont->subsys[debug_subsys_id]);
}
5184
/* Debug file: report the cgroup's raw reference count. */
static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
{
	return atomic_read(&cont->count);
}
5189
/* Debug file: report the number of tasks in the cgroup. */
static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
{
	return cgroup_task_count(cont);
}
5194
/* Debug file: expose the address of current's css_set as a number. */
static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}
5199
5200static u64 current_css_set_refcount_read(struct cgroup *cont,
5201 struct cftype *cft)
5202{
5203 u64 count;
5204
5205 rcu_read_lock();
5206 count = atomic_read(¤t->cgroups->refcount);
5207 rcu_read_unlock();
5208 return count;
5209}
5210
/*
 * Debug file: for each hierarchy, print the cgroup that current's
 * css_set links to.
 */
static int current_css_set_cg_links_read(struct cgroup *cont,
					 struct cftype *cft,
					 struct seq_file *seq)
{
	struct cg_cgroup_link *link;
	struct css_set *cg;

	/* css_set_lock stabilizes cg_links; RCU protects the deref */
	read_lock(&css_set_lock);
	rcu_read_lock();
	cg = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
		struct cgroup *c = link->cgrp;
		const char *name;

		/* a cgroup may not have a dentry (e.g. not mounted) */
		if (c->dentry)
			name = c->dentry->d_name.name;
		else
			name = "?";
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name);
	}
	rcu_read_unlock();
	read_unlock(&css_set_lock);
	return 0;
}
5236
5237#define MAX_TASKS_SHOWN_PER_CSS 25
5238static int cgroup_css_links_read(struct cgroup *cont,
5239 struct cftype *cft,
5240 struct seq_file *seq)
5241{
5242 struct cg_cgroup_link *link;
5243
5244 read_lock(&css_set_lock);
5245 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
5246 struct css_set *cg = link->cg;
5247 struct task_struct *task;
5248 int count = 0;
5249 seq_printf(seq, "css_set %p\n", cg);
5250 list_for_each_entry(task, &cg->tasks, cg_list) {
5251 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5252 seq_puts(seq, " ...\n");
5253 break;
5254 } else {
5255 seq_printf(seq, " task %d\n",
5256 task_pid_vnr(task));
5257 }
5258 }
5259 }
5260 read_unlock(&css_set_lock);
5261 return 0;
5262}
5263
/* Debug file: report whether CGRP_RELEASABLE is set on the cgroup. */
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}
5268
/* Control files exported by the debug subsystem; NULL-terminated. */
static struct cftype debug_files[] =  {
	{
		.name = "cgroup_refcount",
		.read_u64 = cgroup_refcount_read,
	},
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.read_seq_string = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.read_seq_string = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }	/* terminate */
};
5306
/* The "debug" cgroup subsystem: exposes internal state via debug_files. */
struct cgroup_subsys debug_subsys = {
	.name = "debug",
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.subsys_id = debug_subsys_id,
	.base_cftypes = debug_files,
};
5314#endif
5315