/*
 *  kernel/cpuset.c
 *
 *  Processor and memory node placement constraints for sets of tasks,
 *  exposed through the "cpuset" cgroup subsystem.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

/* Tracks how many cpusets are currently defined in the system. */
int number_of_cpusets __read_mostly;

/* Forward declarations; cpuset_subsys is defined further down. */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments above fmeter_init(), further down. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	/*
	 * This is the old Memory Nodes tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
{
	return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Return the parent cpuset, or NULL for the top cpuset */
static inline struct cpuset *parent_cs(const struct cpuset *cs)
{
	struct cgroup *pcgrp = cs->css.cgroup->parent;

	if (pcgrp)
		return cgroup_cs(pcgrp);
	return NULL;
}

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
#else
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
#endif

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(const struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_cgrp: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs)		\
	cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup)	\
		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_cgrp: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_cgrp by calling
 * cgroup_rightmost_descendant() to skip a subtree.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs)	\
	cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
		if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))

/*
 * There are two global mutexes guarding cpuset structures - cpuset_mutex
 * and callback_mutex.  The latter may nest inside the former.  A task
 * must hold both mutexes to modify cpusets.  Callback routines that only
 * need a stable view of cpus_allowed or mems_allowed can take just
 * callback_mutex, which blocks modifications without the heavier
 * cpuset_mutex.  Accessing a task's cpuset pointer additionally requires
 * task_lock() so it cannot change during the access.
 */
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);

/*
 * CPU / memory hotplug is handled asynchronously
 * via this work item, see cpuset_hotplug_workfn().
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

/* Woken up in cpuset_attach() when the last attacher finishes. */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * The legacy "cpuset" filesystem is provided for backward compatibility:
 * it is implemented by mounting the "cgroup" filesystem with the cpuset
 * subsystem enabled and the noprefix option set.
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
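
/*
 * Return in pmask the portion of a cpuset's cpus_allowed that are online.
 * If none are online, walk up the cpuset hierarchy until we find one that
 * does have some online cpus.  The top cpuset always has some cpus online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_mutex held.
 */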
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = parent_cs(cs);
	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
}
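
/*
 * Return in *pmask the portion of a cpuset's mems_allowed that are online
 * and have memory.  If none are online with memory, walk up the cpuset
 * hierarchy until we find one that does.  The top cpuset always has some
 * mems online with memory.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_mutex held.
 */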
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
}
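
/*
 * Update task's spread flags (PF_SPREAD_PAGE and PF_SPREAD_SLAB) to match
 * its cpuset's settings.
 *
 * Call with callback_mutex/cpuset_mutex held.
 */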
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		tsk->flags |= PF_SPREAD_PAGE;
	else
		tsk->flags &= ~PF_SPREAD_PAGE;
	if (is_spread_slab(cs))
		tsk->flags |= PF_SPREAD_SLAB;
	else
		tsk->flags &= ~PF_SPREAD_SLAB;
}
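
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */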
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
		kfree(trial);
		return NULL;
	}
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);

	return trial;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
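
/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset (cur)
 * with those values in the trial cpuset (trial), would our various
 * subset and exclusive rules still be valid?  Presumes cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  'trial' is the
 * address of a bulk structure copy of cur, with perhaps one or more of
 * the fields cpus_allowed, mems_allowed or flags changed to new, trial
 * values.
 *
 * Return 0 if valid, -errno if not.
 */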
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cgrp;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, cgrp, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* We must be a subset of our parent cpuset */
	ret = -EACCES;
	if (!is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, cgrp, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup *pos_cgrp;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}
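
/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs: a set
 * of non-overlapping cpumasks, one per scheduler load-balancing domain.
 * The output is meant to be passed to the partition_sched_domains()
 * routine in kernel/sched/core.c, which rebuilds the scheduler's load
 * balancing domains (sched domains) accordingly.
 *
 * Cpusets with the flag 'sched_load_balance' enabled request balancing
 * across their cpus_allowed.  Two such cpusets whose cpus_allowed
 * overlap must end up in the same sched domain, so overlapping cpusets
 * are merged; the 'pn' (partition number) fields track the merging.
 *
 * Must be called with cpuset_mutex held.
 *
 * Returns the number of sched domains generated, filling *domains with
 * the cpumask of each domain and *attributes with the per-domain
 * attributes (or NULL).  The caller takes ownership of both allocations.
 */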
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup *pos_cgrp;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_copy(doms[0], top_cpuset.cpus_allowed);

		goto done;
	}

	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  Otherwise, record @cp if it is
		 * balancing, and in either case prune its subtree: a
		 * balancing cpuset's cpus_allowed already covers its
		 * descendants, and a cpuset with no CPUs cannot have
		 * children with CPUs (children are subsets of the parent).
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !is_sched_load_balance(cp))
			continue;

		if (is_sched_load_balance(cp))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less domain */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms>.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				printk(KERN_WARNING
				 "rebuild_sched_domains confused:"
				  " nslot %d, ndoms %d, csn %d, i %d,"
				  " apn %d\n",
				  nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->cpus_allowed);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
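
/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */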
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/*
	 * We have raced with CPU hotplug.  Don't do anything to avoid
	 * passing doms with an offlined cpu to partition_sched_domains().
	 * The hotplug work item will rebuild sched domains anyway.
	 */
	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
		goto out;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
out:
	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}
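
/*
 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
 * @cs: the cpuset in interest
 *
 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
 * with non-empty cpus.  Returns @cs itself if its cpus is non-empty.
 * The top cpuset always has some cpus, so the walk terminates.
 */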
static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
{
	while (cpumask_empty(cs->cpus_allowed))
		cs = parent_cs(cs);
	return cs;
}
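
/*
 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
 * @cs: the cpuset in interest
 *
 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
 * with non-empty mems.  Returns @cs itself if its mems is non-empty.
 * The top cpuset always has some mems, so the walk terminates.
 */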
static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
{
	while (nodes_empty(cs->mems_allowed))
		cs = parent_cs(cs);
	return cs;
}
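
/**
 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cpuset_mutex at this point.
 */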
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
{
	struct cpuset *cpus_cs;

	cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
	set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
}
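
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cpuset_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */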
static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_cpumask;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}
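
/*
 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the subtree.
 * @root_cs: the root cpuset of the hierarchy
 * @update_root: update root cpuset or not?
 * @heap: the heap used by cgroup_scan_tasks()
 *
 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
 * which take on cpumask of @root_cs.
 *
 * Called with cpuset_mutex held
 */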
static void update_tasks_cpumask_hier(struct cpuset *root_cs,
				      bool update_root, struct ptr_heap *heap)
{
	struct cpuset *cp;
	struct cgroup *pos_cgrp;

	if (update_root)
		update_tasks_cpumask(root_cs, heap);

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
		/* skip the whole subtree if @cp have some CPU */
		if (!cpumask_empty(cp->cpus_allowed)) {
			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
			continue;
		}
		if (!css_tryget(&cp->css))
			continue;
		rcu_read_unlock();

		update_tasks_cpumask(cp, heap);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	struct ptr_heap heap;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	is_load_balanced = is_sched_load_balance(trialcs);

	mutex_lock(&callback_mutex);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	update_tasks_cpumask_hier(cs, true, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		rebuild_sched_domains_locked();
	return 0;
}
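
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */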
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;
	struct cpuset *mems_cs;

	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
	guarantee_online_mems(mems_cs, &tsk->mems_allowed);
}
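
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing
 * newly disallowed ones.
 */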
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	/*
	 * Determine if a loop is necessary if another thread is doing
	 * get_mems_allowed().  If at least one node remains unchanged and
	 * tsk does not have a mempolicy, then an empty nodemask will not be
	 * possible when mems_allowed is larger than a word.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);

	if (need_loop)
		write_seqcount_begin(&tsk->mems_allowed_seq);

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;

	if (need_loop)
		write_seqcount_end(&tsk->mems_allowed_seq);

	task_unlock(tsk);
}

/*
 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
 * memory_migrate flag is set. Called with cpuset_mutex held.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	struct cpuset *cs = cgroup_cs(scan->cg);
	struct mm_struct *mm;
	int migrate;
	nodemask_t *newmems = scan->data;

	cpuset_change_task_nodemask(p, newmems);

	mm = get_task_mm(p);
	if (!mm)
		return;

	migrate = is_memory_migrate(cs);

	mpol_rebind_mm(mm, &cs->mems_allowed);
	if (migrate)
		cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
	mmput(mm);
}

static void *cpuset_being_rebound;
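
/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cpuset_mutex held
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */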
static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
{
	static nodemask_t newmems;	/* protected by cpuset_mutex */
	struct cgroup_scanner scan;
	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	guarantee_online_mems(mems_cs, &newmems);

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_nodemask;
	scan.heap = heap;
	scan.data = &newmems;

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.
	 */
	cgroup_scan_tasks(&scan);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the subtree.
 * @root_cs: the root cpuset of the hierarchy
 * @update_root: update the root cpuset or not?
 * @heap: the heap used by cgroup_scan_tasks()
 *
 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
 * which take on nodemask of @root_cs.
 *
 * Called with cpuset_mutex held
 */
static void update_tasks_nodemask_hier(struct cpuset *root_cs,
				       bool update_root, struct ptr_heap *heap)
{
	struct cpuset *cp;
	struct cgroup *pos_cgrp;

	if (update_root)
		update_tasks_nodemask(root_cs, heap);

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
		/* skip the whole subtree if @cp have some memory */
		if (!nodes_empty(cp->mems_allowed)) {
			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
			continue;
		}
		if (!css_tryget(&cp->css))
			continue;
		rcu_read_unlock();

		update_tasks_nodemask(cp, heap);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cpuset_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;
	struct ptr_heap heap;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				node_states[N_MEMORY])) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs->mems_allowed;
	mutex_unlock(&callback_mutex);

	update_tasks_nodemask_hier(cs, true, &heap);

	heap_free(&heap);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/*
 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
 * @tsk: task to be updated
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cpuset_mutex at this point.
 */
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
}

/*
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cpuset_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_flag;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}
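
/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
 */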
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	struct ptr_heap heap;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs->flags;
	mutex_unlock(&callback_mutex);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs, &heap);
	heap_free(&heap);
out:
	free_trial_cpuset(trialcs);
	return err;
}
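
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 */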
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	struct task_struct *task;
	int ret;

	mutex_lock(&cpuset_mutex);

	/*
	 * We allow to move tasks into an empty cpuset if sane_behavior
	 * flag is set.
	 */
	ret = -ENOSPC;
	if (!cgroup_sane_behavior(cgrp) &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/*
		 * Kthreads which disallow setaffinity shouldn't be moved
		 * to a new cpuset; we don't want to change their cpu
		 * affinity and isolating such threads by their set of
		 * allowed nodes is unnecessary.  Thus, cpusets are not
		 * applicable for such threads.  This prevents checking for
		 * success of set_cpus_allowed_ptr() on all attached tasks
		 * before cpus_allowed may be changed.
		 */
		ret = -EINVAL;
		if (task->flags & PF_NO_SETAFFINITY)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}

static void cpuset_cancel_attach(struct cgroup *cgrp,
				 struct cgroup_taskset *tset)
{
	mutex_lock(&cpuset_mutex);
	cgroup_cs(cgrp)->attach_in_progress--;
	mutex_unlock(&cpuset_mutex);
}

/*
 * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_to;
	struct mm_struct *mm;
	struct task_struct *task;
	struct task_struct *leader = cgroup_taskset_first(tset);
	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *oldcs = cgroup_cs(oldcgrp);
	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);

	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cpus_cs, cpus_attach);

	guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, cgrp, tset) {
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm, possibly for multiple threads in a threadgroup. This is
	 * expensive and may sleep.
	 */
	cpuset_attach_nodemask_to = cs->mems_allowed;
	mm = get_task_mm(leader);
	if (mm) {
		struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);

		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

		/*
		 * old_mems_allowed is the same with mems_allowed here, except
		 * if this task is being moved automatically due to hotplug.
		 * In that case @mems_allowed has been updated and is empty,
		 * so @old_mems_allowed is the right nodesets that we migrate
		 * mm from.
		 */
		if (is_memory_migrate(cs)) {
			cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
					  &cpuset_attach_nodemask_to);
		}
		mmput(mm);
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	mutex_unlock(&cpuset_mutex);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

/*
 * Common handler for the "cpus" and "mems" files of a cpuset.
 */
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;
	int retval = -ENODEV;

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 */
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer larger than the cpuset file.  The helpers below take
 * callback_mutex so a reader sees a consistent snapshot of the mask.
 */
static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
	size_t count;

	mutex_lock(&callback_mutex);
	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	return count;
}

static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
	size_t count;

	mutex_lock(&callback_mutex);
	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
	mutex_unlock(&callback_mutex);

	return count;
}

static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
				       struct cftype *cft,
				       struct file *file,
				       char __user *buf,
				       size_t nbytes, loff_t *ppos)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_CPULIST:
		s += cpuset_sprintf_cpulist(s, cs);
		break;
	case FILE_MEMLIST:
		s += cpuset_sprintf_memlist(s, cs);
		break;
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}

static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};

/*
 *	cpuset_css_alloc - allocate a cpuset css
 *	cgrp:	control group that the new cpuset will be part of
 */
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
{
	struct cpuset *cs;

	if (!cgrp->parent)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	return &cs->css;
}

static int cpuset_css_online(struct cgroup *cgrp)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup *pos_cg;

	if (!parent)
		return 0;

	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	number_of_cpusets++;

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a serious problem
	 * for some users who wish to allow that scenario, then this could
	 * be changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_cg, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	mutex_lock(&callback_mutex);
	cs->mems_allowed = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	mutex_unlock(&callback_mutex);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().
 */
static void cpuset_css_offline(struct cgroup *cgrp)
{
	struct cpuset *cs = cgroup_cs(cgrp);

	mutex_lock(&cpuset_mutex);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	clear_bit(CS_ONLINE, &cs->flags);

	mutex_unlock(&cpuset_mutex);
}

static void cpuset_css_free(struct cgroup *cgrp)
{
	struct cpuset *cs = cgroup_cs(cgrp);

	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}

struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.css_alloc = cpuset_css_alloc,
	.css_online = cpuset_css_online,
	.css_offline = cpuset_css_offline,
	.css_free = cpuset_css_free,
	.can_attach = cpuset_can_attach,
	.cancel_attach = cpuset_cancel_attach,
	.attach = cpuset_attach,
	.subsys_id = cpuset_subsys_id,
	.base_cftypes = files,
	.early_init = 1,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system
 **/
int __init cpuset_init(void)
{
	int err = 0;

	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();

	number_of_cpusets = 1;
	return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		rcu_read_lock();
		printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
		       cgroup_name(cs->css.cgroup));
		rcu_read_unlock();
	}
}
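
/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */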
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
	static cpumask_t off_cpus;
	static nodemask_t off_mems;
	bool is_empty;
	bool sane = cgroup_sane_behavior(cs->css.cgroup);

retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching.  We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);

	mutex_lock(&callback_mutex);
	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
	mutex_unlock(&callback_mutex);

	/*
	 * If sane_behavior flag is set, we need to update tasks' cpumask
	 * for empty cpuset to take on ancestor's cpumask.  Otherwise, don't
	 * call update_tasks_cpumask() if the cpuset becomes empty, as
	 * the tasks in it will be migrated to an ancestor.
	 */
	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
		update_tasks_cpumask(cs, NULL);

	mutex_lock(&callback_mutex);
	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
	mutex_unlock(&callback_mutex);

	/*
	 * If sane_behavior flag is set, we need to update tasks' nodemask
	 * for empty cpuset to take on ancestor's nodemask.  Otherwise, don't
	 * call update_tasks_nodemask() if the cpuset becomes empty, as
	 * the tasks in it will be migrated to an ancestor.
	 */
	if ((sane && nodes_empty(cs->mems_allowed)) ||
	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
		update_tasks_nodemask(cs, NULL);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		nodes_empty(cs->mems_allowed);

	mutex_unlock(&cpuset_mutex);

	/*
	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
	 *
	 * Otherwise move tasks to the nearest ancestor with execution
	 * resources.  This is a full cgroup operation which will
	 * also call back into cpuset.  Should be done outside any lock.
	 */
	if (!sane && is_empty)
		remove_tasks_in_empty_cpuset(cs);
}
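
/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */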
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;

	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		mutex_lock(&callback_mutex);
		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		mutex_unlock(&callback_mutex);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = new_mems;
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, NULL);
	}

	mutex_unlock(&cpuset_mutex);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup *pos_cgrp;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
			if (!css_tryget(&cs->css))
				continue;
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated)
		rebuild_sched_domains();
}

void cpuset_update_active_cpus(bool cpu_online)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 *
	 * We still need to do partition_sched_domains() synchronously;
	 * otherwise, the scheduler will get confused and put tasks to the
	 * dead CPU.  Fall back to the default single domain.
	 * cpuset_hotplug_workfn() will rebuild it as necessary.
	 */
	partition_sched_domains(1, NULL, NULL);
	schedule_work(&cpuset_hotplug_work);
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}

static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,
};

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_MEMORY];
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
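
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/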
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	struct cpuset *cpus_cs;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
	guarantee_online_cpus(cpus_cs, pmask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpuset *cpus_cs;

	rcu_read_lock();
	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed.  However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
}

void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	struct cpuset *mems_cs;
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
	guarantee_online_mems(mems_cs, &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}
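
/**
 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
 * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has
 * been OOM killed and has access to memory reserves as specified by the
 * TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
 * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
 * might sleep, and might allow a node from an enclosing cpuset.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, and do not
 * allow allocations outside the current task's cpuset unless the task has
 * been OOM killed.  GFP_KERNEL allocations are not so marked, so can escape
 * to the nearest enclosing hardwalled ancestor cpuset.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere, by
 * forcibly using a zonelist starting at a specified node.  By the time any
 * such calls get to this routine, we should just shut up and say 'yes'.
 *
 * Rule: don't call cpuset_node_allowed_softwall if you can't sleep, unless
 * you pass in the __GFP_HARDWALL flag set in gfp_mask, which disables the
 * code that might scan up ancestor cpusets and sleep.
 */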
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}

/*
 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If the task has been OOM killed and has access to memory reserves
 * as specified by the TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * Unlike the cpuset_node_allowed_softwall() variant, above,
 * this variant requires that the node be in the current task's
 * mems_allowed or that we're in interrupt.  It does not scan up the
 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
 * It never sleeps.
 */
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	return 0;
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.
 */
static int cpuset_spread_node(int *rotor)
{
	int node;

	node = next_node(*rotor, current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	*rotor = node;
	return node;
}

int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

#define CPUSET_NODELIST_LEN	(256)

/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
	 /* Statically allocated to prevent using excess stack. */
	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
	static DEFINE_SPINLOCK(cpuset_buffer_lock);

	struct cgroup *cgrp = task_cs(tsk)->css.cgroup;

	rcu_read_lock();
	spin_lock(&cpuset_buffer_lock);

	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
			   tsk->mems_allowed);
	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
	       tsk->comm, cgroup_name(cgrp), cpuset_nodelist);

	spin_unlock(&cpuset_buffer_lock);
	rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/
void __cpuset_memory_pressure_bump(void)
{
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */
int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	rcu_read_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	rcu_read_unlock();
	if (retval < 0)
		goto out_put_task;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_put_task:
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t");
	seq_nodemask(m, &task->mems_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed_list:\t");
	seq_nodemask_list(m, &task->mems_allowed);
	seq_printf(m, "\n");
}
2760