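/*
 *  kernel/cpuset.c
 *
 *  Processor and memory placement constraints for sets of tasks
 *  (cpusets), exposed to user space through the cgroup "cpuset"
 *  subsystem and the legacy "cpuset" filesystem wrapper.
 */
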
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/* Workqueue used by async_rebuild_sched_domains() to rebuild sched domains. */
static struct workqueue_struct *cpuset_wq;

/*
 * Number of cpusets in the system (the top cpuset counts as one);
 * incremented in cpuset_create() and decremented in cpuset_destroy().
 */
int number_of_cpusets __read_mostly;

/* Forward declarations */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See the "Frequency meter" comment further below */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
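
/*
 * Two global mutexes guard cpuset structures: cgroup_mutex (taken via
 * cgroup_lock()/cgroup_unlock()) serializes cpuset modifications, while
 * callback_mutex below protects quick readers and updaters of
 * cpus_allowed, mems_allowed and flags; paths that rewrite those fields
 * take callback_mutex around the update.
 */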
static DEFINE_MUTEX(callback_mutex);

#define CPUSET_NAME_LEN		(128)
#define CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);

/*
 * The "cpuset" pseudo filesystem is a legacy interface: mounting it just
 * mounts the cgroup filesystem with the cpuset subsystem enabled, the
 * noprefix option set and /sbin/cpuset_release_agent as release agent.
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
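
/*
 * Return in pmask the portion of a cpuset's cpus_allowed that is online.
 * If the cpuset has no online CPUs, walk up the hierarchy until an
 * ancestor with online CPUs is found; if even that fails, fall back to
 * cpu_online_mask.  The result is guaranteed to be non-empty as long as
 * at least one CPU is online.
 */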
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}
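
/*
 * Return in *pmask the portion of a cpuset's mems_allowed that has
 * memory online (is in node_states[N_HIGH_MEMORY]), walking up the
 * hierarchy as above if needed and falling back to all nodes with
 * memory.  The result is guaranteed to be non-empty.
 */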
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
				node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}

/*
 * Update the PF_SPREAD_PAGE/PF_SPREAD_SLAB flags of a task to match
 * its cpuset's memory_spread_page/memory_spread_slab settings.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		tsk->flags |= PF_SPREAD_PAGE;
	else
		tsk->flags &= ~PF_SPREAD_PAGE;
	if (is_spread_slab(cs))
		tsk->flags |= PF_SPREAD_SLAB;
	else
		tsk->flags &= ~PF_SPREAD_SLAB;
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and Memory
 * Nodes are a subset of the other's, and its exclusive flags are only
 * set if the other's are set.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
		kfree(trial);
		return NULL;
	}
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);

	return trial;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
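
/*
 * validate_change() - check that a proposed change of cpuset 'cur' to
 * the values held in 'trial' obeys the structural rules: every child
 * stays a subset of 'cur', 'trial' remains a subset of its parent,
 * cpu/mem exclusive siblings do not overlap, and a cpuset with attached
 * tasks is not left without CPUs or Memory Nodes.
 *
 * Return 0 if valid, a negative errno otherwise.  Callers (the cpuset
 * file write handlers) hold cgroup_mutex.
 */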
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpumask_empty(trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

/*
 * Walk the subtree rooted at cpuset 'c' (breadth first, using
 * stack_list as the queue linkage) and fold each load-balanced,
 * non-empty cpuset's relax_domain_level into *dattr.
 */
static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}
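
/*
 * generate_sched_domains()
 *
 * Scan the cpuset hierarchy and build the set of non-overlapping CPU
 * masks (sched domain partitions) over which load balancing should be
 * done, plus a matching sched_domain_attr for each partition.  Returns
 * the number of partitions and passes the freshly allocated cpumask and
 * attribute arrays back through 'domains' and 'attributes' for the
 * caller to hand to partition_sched_domains().
 *
 * Callers hold cgroup_mutex (cgroup_lock()).
 */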
539static int generate_sched_domains(cpumask_var_t **domains,
540 struct sched_domain_attr **attributes)
541{
542 LIST_HEAD(q);
543 struct cpuset *cp;
544 struct cpuset **csa;
545 int csn;
546 int i, j, k;
547 cpumask_var_t *doms;
548 struct sched_domain_attr *dattr;
549 int ndoms = 0;
550 int nslot;
551
552 doms = NULL;
553 dattr = NULL;
554 csa = NULL;
555
556
557 if (is_sched_load_balance(&top_cpuset)) {
558 ndoms = 1;
559 doms = alloc_sched_domains(ndoms);
560 if (!doms)
561 goto done;
562
563 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
564 if (dattr) {
565 *dattr = SD_ATTR_INIT;
566 update_domain_attr_tree(dattr, &top_cpuset);
567 }
568 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
569
570 goto done;
571 }
572
573 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
574 if (!csa)
575 goto done;
576 csn = 0;
577
578 list_add(&top_cpuset.stack_list, &q);
579 while (!list_empty(&q)) {
580 struct cgroup *cont;
581 struct cpuset *child;
582
583 cp = list_first_entry(&q, struct cpuset, stack_list);
584 list_del(q.next);
585
586 if (cpumask_empty(cp->cpus_allowed))
587 continue;
588
589
590
591
592
593
594
595 if (is_sched_load_balance(cp)) {
596 csa[csn++] = cp;
597 continue;
598 }
599
600 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
601 child = cgroup_cs(cont);
602 list_add_tail(&child->stack_list, &q);
603 }
604 }
605
606 for (i = 0; i < csn; i++)
607 csa[i]->pn = i;
608 ndoms = csn;
609
610restart:
611
612 for (i = 0; i < csn; i++) {
613 struct cpuset *a = csa[i];
614 int apn = a->pn;
615
616 for (j = 0; j < csn; j++) {
617 struct cpuset *b = csa[j];
618 int bpn = b->pn;
619
620 if (apn != bpn && cpusets_overlap(a, b)) {
621 for (k = 0; k < csn; k++) {
622 struct cpuset *c = csa[k];
623
624 if (c->pn == bpn)
625 c->pn = apn;
626 }
627 ndoms--;
628 goto restart;
629 }
630 }
631 }
632
633
634
635
636
637 doms = alloc_sched_domains(ndoms);
638 if (!doms)
639 goto done;
640
641
642
643
644
645 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
646
647 for (nslot = 0, i = 0; i < csn; i++) {
648 struct cpuset *a = csa[i];
649 struct cpumask *dp;
650 int apn = a->pn;
651
652 if (apn < 0) {
653
654 continue;
655 }
656
657 dp = doms[nslot];
658
659 if (nslot == ndoms) {
660 static int warnings = 10;
661 if (warnings) {
662 printk(KERN_WARNING
663 "rebuild_sched_domains confused:"
664 " nslot %d, ndoms %d, csn %d, i %d,"
665 " apn %d\n",
666 nslot, ndoms, csn, i, apn);
667 warnings--;
668 }
669 continue;
670 }
671
672 cpumask_clear(dp);
673 if (dattr)
674 *(dattr + nslot) = SD_ATTR_INIT;
675 for (j = i; j < csn; j++) {
676 struct cpuset *b = csa[j];
677
678 if (apn == b->pn) {
679 cpumask_or(dp, dp, b->cpus_allowed);
680 if (dattr)
681 update_domain_attr_tree(dattr + nslot, b);
682
683
684 b->pn = -1;
685 }
686 }
687 nslot++;
688 }
689 BUG_ON(nslot != ndoms);
690
691done:
692 kfree(csa);
693
694
695
696
697
698 if (doms == NULL)
699 ndoms = 1;
700
701 *domains = doms;
702 *attributes = dattr;
703 return ndoms;
704}

/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both, regenerates the partitioning from the cpuset tree, then
 * hands the result to the scheduler via partition_sched_domains().
 * Run from a workqueue (see async_rebuild_sched_domains()) so callers
 * that already hold cgroup_mutex never take these locks directly.
 */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}

static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	*domains = NULL;
	return 1;
}
#endif /* CONFIG_SMP */

static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);

/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * The cpuset write paths run with cgroup_mutex held, while
 * do_rebuild_sched_domains() needs get_online_cpus() taken outside
 * cgroup_mutex; deferring the rebuild to cpuset_wq keeps the lock
 * ordering consistent.
 */
static void async_rebuild_sched_domains(void)
{
	queue_work(cpuset_wq, &rebuild_sched_domains_work);
}

/*
 * Accomplishes the same scheduler domain rebuild as
 * async_rebuild_sched_domains(), but calls the rebuild routine
 * synchronously rather than via the work queue.  May only be called
 * from code that is not holding cgroup_mutex.
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}
786
787
788
789
790
791
792
793
794
795
796
797static int cpuset_test_cpumask(struct task_struct *tsk,
798 struct cgroup_scanner *scan)
799{
800 return !cpumask_equal(&tsk->cpus_allowed,
801 (cgroup_cs(scan->cg))->cpus_allowed);
802}
803
804
805
806
807
808
809
810
811
812
813
814
815static void cpuset_change_cpumask(struct task_struct *tsk,
816 struct cgroup_scanner *scan)
817{
818 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
819}
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
835{
836 struct cgroup_scanner scan;
837
838 scan.cg = cs->css.cgroup;
839 scan.test_task = cpuset_test_cpumask;
840 scan.process_task = cpuset_change_cpumask;
841 scan.heap = heap;
842 cgroup_scan_tasks(&scan);
843}
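
/*
 * update_cpumask - handle a write to a cpuset's "cpus" file: parse the
 * cpulist in 'buf' into 'trialcs', validate it (it must be a subset of
 * cpu_active_mask and pass validate_change()), copy it into
 * cs->cpus_allowed under callback_mutex, update the cpumasks of all
 * tasks in the cpuset and, if the cpuset is load balanced, trigger an
 * asynchronous sched domain rebuild.  The top cpuset is read-only here
 * (-EACCES).
 */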
850static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
851 const char *buf)
852{
853 struct ptr_heap heap;
854 int retval;
855 int is_load_balanced;
856
857
858 if (cs == &top_cpuset)
859 return -EACCES;
860
861
862
863
864
865
866
867 if (!*buf) {
868 cpumask_clear(trialcs->cpus_allowed);
869 } else {
870 retval = cpulist_parse(buf, trialcs->cpus_allowed);
871 if (retval < 0)
872 return retval;
873
874 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
875 return -EINVAL;
876 }
877 retval = validate_change(cs, trialcs);
878 if (retval < 0)
879 return retval;
880
881
882 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
883 return 0;
884
885 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
886 if (retval)
887 return retval;
888
889 is_load_balanced = is_sched_load_balance(trialcs);
890
891 mutex_lock(&callback_mutex);
892 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
893 mutex_unlock(&callback_mutex);
894
895
896
897
898
899 update_tasks_cpumask(cs, &heap);
900
901 heap_free(&heap);
902
903 if (is_load_balanced)
904 async_rebuild_sched_domains();
905 return 0;
906}
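
/*
 * cpuset_migrate_mm - migrate mm's pages from the nodes in 'from' to
 * the nodes in 'to'.  current->mems_allowed is set to *to so that pages
 * allocated while migrating land on the destination nodes, then
 * restored to the online mems of current's cpuset.
 */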
928static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
929 const nodemask_t *to)
930{
931 struct task_struct *tsk = current;
932
933 tsk->mems_allowed = *to;
934
935 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
936
937 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
938}
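
/*
 * cpuset_change_task_nodemask - rebind a task's mems_allowed and
 * mempolicy to 'newmems' in two steps (grow the mask, rebind, then
 * shrink to the final mask) so lockless readers of mems_allowed never
 * observe an empty intersection, waiting out any section that has
 * mems_allowed_change_disable raised.  Exiting or OOM-killed tasks are
 * skipped.
 */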
949static void cpuset_change_task_nodemask(struct task_struct *tsk,
950 nodemask_t *newmems)
951{
952repeat:
953
954
955
956
957 if (unlikely(test_thread_flag(TIF_MEMDIE)))
958 return;
959 if (current->flags & PF_EXITING)
960 return;
961
962 task_lock(tsk);
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979 smp_mb();
980
981
982
983
984
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk);
987 if (!task_curr(tsk))
988 yield();
989 goto repeat;
990 }
991
992
993
994
995
996
997
998
999 smp_mb();
1000
1001 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1002 tsk->mems_allowed = *newmems;
1003 task_unlock(tsk);
1004}
1005
1006
1007
1008
1009
1010
1011static void cpuset_change_nodemask(struct task_struct *p,
1012 struct cgroup_scanner *scan)
1013{
1014 struct mm_struct *mm;
1015 struct cpuset *cs;
1016 int migrate;
1017 const nodemask_t *oldmem = scan->data;
1018 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
1019
1020 if (!newmems)
1021 return;
1022
1023 cs = cgroup_cs(scan->cg);
1024 guarantee_online_mems(cs, newmems);
1025
1026 cpuset_change_task_nodemask(p, newmems);
1027
1028 NODEMASK_FREE(newmems);
1029
1030 mm = get_task_mm(p);
1031 if (!mm)
1032 return;
1033
1034 migrate = is_memory_migrate(cs);
1035
1036 mpol_rebind_mm(mm, &cs->mems_allowed);
1037 if (migrate)
1038 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1039 mmput(mm);
1040}
1041
1042static void *cpuset_being_rebound;
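
/*
 * update_tasks_nodemask - scan every task in cpuset 'cs' and rebind its
 * mems_allowed and memory policies to cs->mems_allowed; 'oldmem' is the
 * previous mask, used when memory_migrate is set to migrate each mm's
 * pages to the new nodes.  cpuset_being_rebound is set for the duration
 * of the scan.
 */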
1054static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1055 struct ptr_heap *heap)
1056{
1057 struct cgroup_scanner scan;
1058
1059 cpuset_being_rebound = cs;
1060
1061 scan.cg = cs->css.cgroup;
1062 scan.test_task = NULL;
1063 scan.process_task = cpuset_change_nodemask;
1064 scan.heap = heap;
1065 scan.data = (nodemask_t *)oldmem;
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077 cgroup_scan_tasks(&scan);
1078
1079
1080 cpuset_being_rebound = NULL;
1081}
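
/*
 * update_nodemask - handle a write to a cpuset's "mems" file: parse the
 * nodelist in 'buf' into 'trialcs' (it must be a subset of
 * node_states[N_HIGH_MEMORY] and pass validate_change()), copy it into
 * cs->mems_allowed under callback_mutex and rebind all tasks in the
 * cpuset.  The top cpuset's mems are managed by hotplug and cannot be
 * written (-EACCES).
 */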
1096static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1097 const char *buf)
1098{
1099 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1100 int retval;
1101 struct ptr_heap heap;
1102
1103 if (!oldmem)
1104 return -ENOMEM;
1105
1106
1107
1108
1109
1110 if (cs == &top_cpuset) {
1111 retval = -EACCES;
1112 goto done;
1113 }
1114
1115
1116
1117
1118
1119
1120
1121 if (!*buf) {
1122 nodes_clear(trialcs->mems_allowed);
1123 } else {
1124 retval = nodelist_parse(buf, trialcs->mems_allowed);
1125 if (retval < 0)
1126 goto done;
1127
1128 if (!nodes_subset(trialcs->mems_allowed,
1129 node_states[N_HIGH_MEMORY])) {
1130 retval = -EINVAL;
1131 goto done;
1132 }
1133 }
1134 *oldmem = cs->mems_allowed;
1135 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1136 retval = 0;
1137 goto done;
1138 }
1139 retval = validate_change(cs, trialcs);
1140 if (retval < 0)
1141 goto done;
1142
1143 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1144 if (retval < 0)
1145 goto done;
1146
1147 mutex_lock(&callback_mutex);
1148 cs->mems_allowed = trialcs->mems_allowed;
1149 mutex_unlock(&callback_mutex);
1150
1151 update_tasks_nodemask(cs, oldmem, &heap);
1152
1153 heap_free(&heap);
1154done:
1155 NODEMASK_FREE(oldmem);
1156 return retval;
1157}
1158
1159int current_cpuset_is_being_rebound(void)
1160{
1161 return task_cs(current) == cpuset_being_rebound;
1162}
1163
1164static int update_relax_domain_level(struct cpuset *cs, s64 val)
1165{
1166#ifdef CONFIG_SMP
1167 if (val < -1 || val >= SD_LV_MAX)
1168 return -EINVAL;
1169#endif
1170
1171 if (val != cs->relax_domain_level) {
1172 cs->relax_domain_level = val;
1173 if (!cpumask_empty(cs->cpus_allowed) &&
1174 is_sched_load_balance(cs))
1175 async_rebuild_sched_domains();
1176 }
1177
1178 return 0;
1179}
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191static void cpuset_change_flag(struct task_struct *tsk,
1192 struct cgroup_scanner *scan)
1193{
1194 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1195}
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1211{
1212 struct cgroup_scanner scan;
1213
1214 scan.cg = cs->css.cgroup;
1215 scan.test_task = NULL;
1216 scan.process_task = cpuset_change_flag;
1217 scan.heap = heap;
1218 cgroup_scan_tasks(&scan);
1219}
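
/*
 * update_flag - turn the named flag bit on or off on cpuset 'cs' after
 * validating the change on a trial cpuset.  Rebuilds sched domains
 * asynchronously if the load-balance setting changed, and refreshes the
 * tasks' spread flags if a memory-spread setting changed.
 */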
1230static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 int turning_on)
1232{
1233 struct cpuset *trialcs;
1234 int balance_flag_changed;
1235 int spread_flag_changed;
1236 struct ptr_heap heap;
1237 int err;
1238
1239 trialcs = alloc_trial_cpuset(cs);
1240 if (!trialcs)
1241 return -ENOMEM;
1242
1243 if (turning_on)
1244 set_bit(bit, &trialcs->flags);
1245 else
1246 clear_bit(bit, &trialcs->flags);
1247
1248 err = validate_change(cs, trialcs);
1249 if (err < 0)
1250 goto out;
1251
1252 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1253 if (err < 0)
1254 goto out;
1255
1256 balance_flag_changed = (is_sched_load_balance(cs) !=
1257 is_sched_load_balance(trialcs));
1258
1259 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1260 || (is_spread_page(cs) != is_spread_page(trialcs)));
1261
1262 mutex_lock(&callback_mutex);
1263 cs->flags = trialcs->flags;
1264 mutex_unlock(&callback_mutex);
1265
1266 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1267 async_rebuild_sched_domains();
1268
1269 if (spread_flag_changed)
1270 update_tasks_flags(cs, &heap);
1271 heap_free(&heap);
1272out:
1273 free_trial_cpuset(trialcs);
1274 return err;
1275}
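
/*
 * Frequency meter - how often is some event occurring?
 *
 * fmeter_markevent() counts an event; fmeter_getrate() reports a
 * digitally filtered rate of roughly events per second, scaled by
 * FM_SCALE.  Each second the value decays by FM_COEF/FM_SCALE
 * (933/1000, i.e. it halves in about ten seconds of inactivity); at
 * most FM_MAXTICKS seconds of decay are applied per update and the
 * pending count is capped at FM_MAXCNT.
 */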
1322#define FM_COEF 933
1323#define FM_MAXTICKS ((time_t)99)
1324#define FM_MAXCNT 1000000
1325#define FM_SCALE 1000
1326
1327
1328static void fmeter_init(struct fmeter *fmp)
1329{
1330 fmp->cnt = 0;
1331 fmp->val = 0;
1332 fmp->time = 0;
1333 spin_lock_init(&fmp->lock);
1334}
1335
1336
1337static void fmeter_update(struct fmeter *fmp)
1338{
1339 time_t now = get_seconds();
1340 time_t ticks = now - fmp->time;
1341
1342 if (ticks == 0)
1343 return;
1344
1345 ticks = min(FM_MAXTICKS, ticks);
1346 while (ticks-- > 0)
1347 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1348 fmp->time = now;
1349
1350 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1351 fmp->cnt = 0;
1352}
1353
1354
1355static void fmeter_markevent(struct fmeter *fmp)
1356{
1357 spin_lock(&fmp->lock);
1358 fmeter_update(fmp);
1359 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1360 spin_unlock(&fmp->lock);
1361}
1362
1363
1364static int fmeter_getrate(struct fmeter *fmp)
1365{
1366 int val;
1367
1368 spin_lock(&fmp->lock);
1369 fmeter_update(fmp);
1370 val = fmp->val;
1371 spin_unlock(&fmp->lock);
1372 return val;
1373}
1374
1375
1376static cpumask_var_t cpus_attach;
1377
1378
1379static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1380 struct task_struct *tsk, bool threadgroup)
1381{
1382 int ret;
1383 struct cpuset *cs = cgroup_cs(cont);
1384
1385 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1386 return -ENOSPC;
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396 if (tsk->flags & PF_THREAD_BOUND)
1397 return -EINVAL;
1398
1399 ret = security_task_setscheduler(tsk);
1400 if (ret)
1401 return ret;
1402 if (threadgroup) {
1403 struct task_struct *c;
1404
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 ret = security_task_setscheduler(c);
1408 if (ret) {
1409 rcu_read_unlock();
1410 return ret;
1411 }
1412 }
1413 rcu_read_unlock();
1414 }
1415 return 0;
1416}
1417
1418static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1419 struct cpuset *cs)
1420{
1421 int err;
1422
1423
1424
1425
1426 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1427 WARN_ON_ONCE(err);
1428
1429 cpuset_change_task_nodemask(tsk, to);
1430 cpuset_update_task_spread_flag(cs, tsk);
1431
1432}
1433
1434static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1435 struct cgroup *oldcont, struct task_struct *tsk,
1436 bool threadgroup)
1437{
1438 struct mm_struct *mm;
1439 struct cpuset *cs = cgroup_cs(cont);
1440 struct cpuset *oldcs = cgroup_cs(oldcont);
1441 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1442 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1443
1444 if (from == NULL || to == NULL)
1445 goto alloc_fail;
1446
1447 if (cs == &top_cpuset) {
1448 cpumask_copy(cpus_attach, cpu_possible_mask);
1449 } else {
1450 guarantee_online_cpus(cs, cpus_attach);
1451 }
1452 guarantee_online_mems(cs, to);
1453
1454
1455 cpuset_attach_task(tsk, to, cs);
1456 if (threadgroup) {
1457 struct task_struct *c;
1458 rcu_read_lock();
1459 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1460 cpuset_attach_task(c, to, cs);
1461 }
1462 rcu_read_unlock();
1463 }
1464
1465
1466 *from = oldcs->mems_allowed;
1467 *to = cs->mems_allowed;
1468 mm = get_task_mm(tsk);
1469 if (mm) {
1470 mpol_rebind_mm(mm, to);
1471 if (is_memory_migrate(cs))
1472 cpuset_migrate_mm(mm, from, to);
1473 mmput(mm);
1474 }
1475
1476alloc_fail:
1477 NODEMASK_FREE(from);
1478 NODEMASK_FREE(to);
1479}
1480
1481
1482
1483typedef enum {
1484 FILE_MEMORY_MIGRATE,
1485 FILE_CPULIST,
1486 FILE_MEMLIST,
1487 FILE_CPU_EXCLUSIVE,
1488 FILE_MEM_EXCLUSIVE,
1489 FILE_MEM_HARDWALL,
1490 FILE_SCHED_LOAD_BALANCE,
1491 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1492 FILE_MEMORY_PRESSURE_ENABLED,
1493 FILE_MEMORY_PRESSURE,
1494 FILE_SPREAD_PAGE,
1495 FILE_SPREAD_SLAB,
1496} cpuset_filetype_t;
1497
1498static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1499{
1500 int retval = 0;
1501 struct cpuset *cs = cgroup_cs(cgrp);
1502 cpuset_filetype_t type = cft->private;
1503
1504 if (!cgroup_lock_live_group(cgrp))
1505 return -ENODEV;
1506
1507 switch (type) {
1508 case FILE_CPU_EXCLUSIVE:
1509 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1510 break;
1511 case FILE_MEM_EXCLUSIVE:
1512 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1513 break;
1514 case FILE_MEM_HARDWALL:
1515 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1516 break;
1517 case FILE_SCHED_LOAD_BALANCE:
1518 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1519 break;
1520 case FILE_MEMORY_MIGRATE:
1521 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1522 break;
1523 case FILE_MEMORY_PRESSURE_ENABLED:
1524 cpuset_memory_pressure_enabled = !!val;
1525 break;
1526 case FILE_MEMORY_PRESSURE:
1527 retval = -EACCES;
1528 break;
1529 case FILE_SPREAD_PAGE:
1530 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1531 break;
1532 case FILE_SPREAD_SLAB:
1533 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1534 break;
1535 default:
1536 retval = -EINVAL;
1537 break;
1538 }
1539 cgroup_unlock();
1540 return retval;
1541}
1542
1543static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1544{
1545 int retval = 0;
1546 struct cpuset *cs = cgroup_cs(cgrp);
1547 cpuset_filetype_t type = cft->private;
1548
1549 if (!cgroup_lock_live_group(cgrp))
1550 return -ENODEV;
1551
1552 switch (type) {
1553 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1554 retval = update_relax_domain_level(cs, val);
1555 break;
1556 default:
1557 retval = -EINVAL;
1558 break;
1559 }
1560 cgroup_unlock();
1561 return retval;
1562}
1563
1564
1565
1566
1567static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1568 const char *buf)
1569{
1570 int retval = 0;
1571 struct cpuset *cs = cgroup_cs(cgrp);
1572 struct cpuset *trialcs;
1573
1574 if (!cgroup_lock_live_group(cgrp))
1575 return -ENODEV;
1576
1577 trialcs = alloc_trial_cpuset(cs);
1578 if (!trialcs) {
1579 retval = -ENOMEM;
1580 goto out;
1581 }
1582
1583 switch (cft->private) {
1584 case FILE_CPULIST:
1585 retval = update_cpumask(cs, trialcs, buf);
1586 break;
1587 case FILE_MEMLIST:
1588 retval = update_nodemask(cs, trialcs, buf);
1589 break;
1590 default:
1591 retval = -EINVAL;
1592 break;
1593 }
1594
1595 free_trial_cpuset(trialcs);
1596out:
1597 cgroup_unlock();
1598 return retval;
1599}
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1614{
1615 int ret;
1616
1617 mutex_lock(&callback_mutex);
1618 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1619 mutex_unlock(&callback_mutex);
1620
1621 return ret;
1622}
1623
1624static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1625{
1626 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1627 int retval;
1628
1629 if (mask == NULL)
1630 return -ENOMEM;
1631
1632 mutex_lock(&callback_mutex);
1633 *mask = cs->mems_allowed;
1634 mutex_unlock(&callback_mutex);
1635
1636 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1637
1638 NODEMASK_FREE(mask);
1639
1640 return retval;
1641}
1642
1643static ssize_t cpuset_common_file_read(struct cgroup *cont,
1644 struct cftype *cft,
1645 struct file *file,
1646 char __user *buf,
1647 size_t nbytes, loff_t *ppos)
1648{
1649 struct cpuset *cs = cgroup_cs(cont);
1650 cpuset_filetype_t type = cft->private;
1651 char *page;
1652 ssize_t retval = 0;
1653 char *s;
1654
1655 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1656 return -ENOMEM;
1657
1658 s = page;
1659
1660 switch (type) {
1661 case FILE_CPULIST:
1662 s += cpuset_sprintf_cpulist(s, cs);
1663 break;
1664 case FILE_MEMLIST:
1665 s += cpuset_sprintf_memlist(s, cs);
1666 break;
1667 default:
1668 retval = -EINVAL;
1669 goto out;
1670 }
1671 *s++ = '\n';
1672
1673 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1674out:
1675 free_page((unsigned long)page);
1676 return retval;
1677}
1678
1679static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1680{
1681 struct cpuset *cs = cgroup_cs(cont);
1682 cpuset_filetype_t type = cft->private;
1683 switch (type) {
1684 case FILE_CPU_EXCLUSIVE:
1685 return is_cpu_exclusive(cs);
1686 case FILE_MEM_EXCLUSIVE:
1687 return is_mem_exclusive(cs);
1688 case FILE_MEM_HARDWALL:
1689 return is_mem_hardwall(cs);
1690 case FILE_SCHED_LOAD_BALANCE:
1691 return is_sched_load_balance(cs);
1692 case FILE_MEMORY_MIGRATE:
1693 return is_memory_migrate(cs);
1694 case FILE_MEMORY_PRESSURE_ENABLED:
1695 return cpuset_memory_pressure_enabled;
1696 case FILE_MEMORY_PRESSURE:
1697 return fmeter_getrate(&cs->fmeter);
1698 case FILE_SPREAD_PAGE:
1699 return is_spread_page(cs);
1700 case FILE_SPREAD_SLAB:
1701 return is_spread_slab(cs);
1702 default:
1703 BUG();
1704 }
1705
1706
1707 return 0;
1708}
1709
1710static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1711{
1712 struct cpuset *cs = cgroup_cs(cont);
1713 cpuset_filetype_t type = cft->private;
1714 switch (type) {
1715 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1716 return cs->relax_domain_level;
1717 default:
1718 BUG();
1719 }
1720
1721
1722 return 0;
1723}
1724
1725
1726
1727
1728
1729
1730static struct cftype files[] = {
1731 {
1732 .name = "cpus",
1733 .read = cpuset_common_file_read,
1734 .write_string = cpuset_write_resmask,
1735 .max_write_len = (100U + 6 * NR_CPUS),
1736 .private = FILE_CPULIST,
1737 },
1738
1739 {
1740 .name = "mems",
1741 .read = cpuset_common_file_read,
1742 .write_string = cpuset_write_resmask,
1743 .max_write_len = (100U + 6 * MAX_NUMNODES),
1744 .private = FILE_MEMLIST,
1745 },
1746
1747 {
1748 .name = "cpu_exclusive",
1749 .read_u64 = cpuset_read_u64,
1750 .write_u64 = cpuset_write_u64,
1751 .private = FILE_CPU_EXCLUSIVE,
1752 },
1753
1754 {
1755 .name = "mem_exclusive",
1756 .read_u64 = cpuset_read_u64,
1757 .write_u64 = cpuset_write_u64,
1758 .private = FILE_MEM_EXCLUSIVE,
1759 },
1760
1761 {
1762 .name = "mem_hardwall",
1763 .read_u64 = cpuset_read_u64,
1764 .write_u64 = cpuset_write_u64,
1765 .private = FILE_MEM_HARDWALL,
1766 },
1767
1768 {
1769 .name = "sched_load_balance",
1770 .read_u64 = cpuset_read_u64,
1771 .write_u64 = cpuset_write_u64,
1772 .private = FILE_SCHED_LOAD_BALANCE,
1773 },
1774
1775 {
1776 .name = "sched_relax_domain_level",
1777 .read_s64 = cpuset_read_s64,
1778 .write_s64 = cpuset_write_s64,
1779 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1780 },
1781
1782 {
1783 .name = "memory_migrate",
1784 .read_u64 = cpuset_read_u64,
1785 .write_u64 = cpuset_write_u64,
1786 .private = FILE_MEMORY_MIGRATE,
1787 },
1788
1789 {
1790 .name = "memory_pressure",
1791 .read_u64 = cpuset_read_u64,
1792 .write_u64 = cpuset_write_u64,
1793 .private = FILE_MEMORY_PRESSURE,
1794 .mode = S_IRUGO,
1795 },
1796
1797 {
1798 .name = "memory_spread_page",
1799 .read_u64 = cpuset_read_u64,
1800 .write_u64 = cpuset_write_u64,
1801 .private = FILE_SPREAD_PAGE,
1802 },
1803
1804 {
1805 .name = "memory_spread_slab",
1806 .read_u64 = cpuset_read_u64,
1807 .write_u64 = cpuset_write_u64,
1808 .private = FILE_SPREAD_SLAB,
1809 },
1810};
1811
1812static struct cftype cft_memory_pressure_enabled = {
1813 .name = "memory_pressure_enabled",
1814 .read_u64 = cpuset_read_u64,
1815 .write_u64 = cpuset_write_u64,
1816 .private = FILE_MEMORY_PRESSURE_ENABLED,
1817};
1818
1819static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1820{
1821 int err;
1822
1823 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1824 if (err)
1825 return err;
1826
1827 if (!cont->parent)
1828 err = cgroup_add_file(cont, ss,
1829 &cft_memory_pressure_enabled);
1830 return err;
1831}
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850static void cpuset_post_clone(struct cgroup_subsys *ss,
1851 struct cgroup *cgroup)
1852{
1853 struct cgroup *parent, *child;
1854 struct cpuset *cs, *parent_cs;
1855
1856 parent = cgroup->parent;
1857 list_for_each_entry(child, &parent->children, sibling) {
1858 cs = cgroup_cs(child);
1859 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1860 return;
1861 }
1862 cs = cgroup_cs(cgroup);
1863 parent_cs = cgroup_cs(parent);
1864
1865 cs->mems_allowed = parent_cs->mems_allowed;
1866 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1867 return;
1868}
1869
1870
1871
1872
1873
1874
1875
1876static struct cgroup_subsys_state *cpuset_create(
1877 struct cgroup_subsys *ss,
1878 struct cgroup *cont)
1879{
1880 struct cpuset *cs;
1881 struct cpuset *parent;
1882
1883 if (!cont->parent) {
1884 return &top_cpuset.css;
1885 }
1886 parent = cgroup_cs(cont->parent);
1887 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1888 if (!cs)
1889 return ERR_PTR(-ENOMEM);
1890 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1891 kfree(cs);
1892 return ERR_PTR(-ENOMEM);
1893 }
1894
1895 cs->flags = 0;
1896 if (is_spread_page(parent))
1897 set_bit(CS_SPREAD_PAGE, &cs->flags);
1898 if (is_spread_slab(parent))
1899 set_bit(CS_SPREAD_SLAB, &cs->flags);
1900 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1901 cpumask_clear(cs->cpus_allowed);
1902 nodes_clear(cs->mems_allowed);
1903 fmeter_init(&cs->fmeter);
1904 cs->relax_domain_level = -1;
1905
1906 cs->parent = parent;
1907 number_of_cpusets++;
1908 return &cs->css ;
1909}
1910
1911
1912
1913
1914
1915
1916
1917static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1918{
1919 struct cpuset *cs = cgroup_cs(cont);
1920
1921 if (is_sched_load_balance(cs))
1922 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1923
1924 number_of_cpusets--;
1925 free_cpumask_var(cs->cpus_allowed);
1926 kfree(cs);
1927}
1928
1929struct cgroup_subsys cpuset_subsys = {
1930 .name = "cpuset",
1931 .create = cpuset_create,
1932 .destroy = cpuset_destroy,
1933 .can_attach = cpuset_can_attach,
1934 .attach = cpuset_attach,
1935 .populate = cpuset_populate,
1936 .post_clone = cpuset_post_clone,
1937 .subsys_id = cpuset_subsys_id,
1938 .early_init = 1,
1939};
1940
1941
1942
1943
1944
1945
1946
1947int __init cpuset_init(void)
1948{
1949 int err = 0;
1950
1951 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1952 BUG();
1953
1954 cpumask_setall(top_cpuset.cpus_allowed);
1955 nodes_setall(top_cpuset.mems_allowed);
1956
1957 fmeter_init(&top_cpuset.fmeter);
1958 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1959 top_cpuset.relax_domain_level = -1;
1960
1961 err = register_filesystem(&cpuset_fs_type);
1962 if (err < 0)
1963 return err;
1964
1965 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1966 BUG();
1967
1968 number_of_cpusets = 1;
1969 return 0;
1970}
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980static void cpuset_do_move_task(struct task_struct *tsk,
1981 struct cgroup_scanner *scan)
1982{
1983 struct cgroup *new_cgroup = scan->data;
1984
1985 cgroup_attach_task(new_cgroup, tsk);
1986}
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2000{
2001 struct cgroup_scanner scan;
2002
2003 scan.cg = from->css.cgroup;
2004 scan.test_task = NULL;
2005 scan.process_task = cpuset_do_move_task;
2006 scan.heap = NULL;
2007 scan.data = to->css.cgroup;
2008
2009 if (cgroup_scan_tasks(&scan))
2010 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2011 "cgroup_scan_tasks failed\n");
2012}
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2025{
2026 struct cpuset *parent;
2027
2028
2029
2030
2031
2032
2033 if (list_empty(&cs->css.cgroup->css_sets))
2034 return;
2035
2036
2037
2038
2039
2040 parent = cs->parent;
2041 while (cpumask_empty(parent->cpus_allowed) ||
2042 nodes_empty(parent->mems_allowed))
2043 parent = parent->parent;
2044
2045 move_member_tasks_to_cpuset(cs, parent);
2046}
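
/*
 * scan_for_empty_cpusets - walk the subtree below 'root' and trim each
 * cpuset's cpus_allowed/mems_allowed to what is still active/online.
 * If a cpuset ends up with no CPUs or no Memory Nodes, its tasks are
 * moved to the nearest non-empty ancestor; otherwise the tasks'
 * cpumasks and nodemasks are updated in place.
 */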
2063static void scan_for_empty_cpusets(struct cpuset *root)
2064{
2065 LIST_HEAD(queue);
2066 struct cpuset *cp;
2067 struct cpuset *child;
2068 struct cgroup *cont;
2069 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2070
2071 if (oldmems == NULL)
2072 return;
2073
2074 list_add_tail((struct list_head *)&root->stack_list, &queue);
2075
2076 while (!list_empty(&queue)) {
2077 cp = list_first_entry(&queue, struct cpuset, stack_list);
2078 list_del(queue.next);
2079 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2080 child = cgroup_cs(cont);
2081 list_add_tail(&child->stack_list, &queue);
2082 }
2083
2084
2085 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2086 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2087 continue;
2088
2089 *oldmems = cp->mems_allowed;
2090
2091
2092 mutex_lock(&callback_mutex);
2093 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2094 cpu_active_mask);
2095 nodes_and(cp->mems_allowed, cp->mems_allowed,
2096 node_states[N_HIGH_MEMORY]);
2097 mutex_unlock(&callback_mutex);
2098
2099
2100 if (cpumask_empty(cp->cpus_allowed) ||
2101 nodes_empty(cp->mems_allowed))
2102 remove_tasks_in_empty_cpuset(cp);
2103 else {
2104 update_tasks_cpumask(cp, NULL);
2105 update_tasks_nodemask(cp, oldmems, NULL);
2106 }
2107 }
2108 NODEMASK_FREE(oldmems);
2109}
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123void cpuset_update_active_cpus(void)
2124{
2125 struct sched_domain_attr *attr;
2126 cpumask_var_t *doms;
2127 int ndoms;
2128
2129 cgroup_lock();
2130 mutex_lock(&callback_mutex);
2131 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2132 mutex_unlock(&callback_mutex);
2133 scan_for_empty_cpusets(&top_cpuset);
2134 ndoms = generate_sched_domains(&doms, &attr);
2135 cgroup_unlock();
2136
2137
2138 partition_sched_domains(ndoms, doms, attr);
2139}
2140
2141#ifdef CONFIG_MEMORY_HOTPLUG
2142
2143
2144
2145
2146
2147static int cpuset_track_online_nodes(struct notifier_block *self,
2148 unsigned long action, void *arg)
2149{
2150 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2151
2152 if (oldmems == NULL)
2153 return NOTIFY_DONE;
2154
2155 cgroup_lock();
2156 switch (action) {
2157 case MEM_ONLINE:
2158 *oldmems = top_cpuset.mems_allowed;
2159 mutex_lock(&callback_mutex);
2160 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2161 mutex_unlock(&callback_mutex);
2162 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2163 break;
2164 case MEM_OFFLINE:
2165
2166
2167
2168
2169 scan_for_empty_cpusets(&top_cpuset);
2170 break;
2171 default:
2172 break;
2173 }
2174 cgroup_unlock();
2175
2176 NODEMASK_FREE(oldmems);
2177 return NOTIFY_OK;
2178}
2179#endif
2180
2181
2182
2183
2184
2185
2186
2187void __init cpuset_init_smp(void)
2188{
2189 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2190 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2191
2192 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2193
2194 cpuset_wq = create_singlethread_workqueue("cpuset");
2195 BUG_ON(!cpuset_wq);
2196}
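
/*
 * cpuset_cpus_allowed - return in pmask the online CPUs that 'tsk' is
 * allowed by its cpuset (falling back to an ancestor's CPUs if its own
 * cpuset has none online).  Takes callback_mutex and task_lock.
 */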
2209void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2210{
2211 mutex_lock(&callback_mutex);
2212 task_lock(tsk);
2213 guarantee_online_cpus(task_cs(tsk), pmask);
2214 task_unlock(tsk);
2215 mutex_unlock(&callback_mutex);
2216}
2217
2218int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2219{
2220 const struct cpuset *cs;
2221 int cpu;
2222
2223 rcu_read_lock();
2224 cs = task_cs(tsk);
2225 if (cs)
2226 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2227 rcu_read_unlock();
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2245 if (cpu >= nr_cpu_ids) {
2246
2247
2248
2249
2250
2251
2252
2253 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2254 cpu = cpumask_any(cpu_active_mask);
2255 }
2256
2257 return cpu;
2258}
2259
2260void cpuset_init_current_mems_allowed(void)
2261{
2262 nodes_setall(current->mems_allowed);
2263}
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2276{
2277 nodemask_t mask;
2278
2279 mutex_lock(&callback_mutex);
2280 task_lock(tsk);
2281 guarantee_online_mems(task_cs(tsk), &mask);
2282 task_unlock(tsk);
2283 mutex_unlock(&callback_mutex);
2284
2285 return mask;
2286}
2287
2288
2289
2290
2291
2292
2293
2294int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2295{
2296 return nodes_intersects(*nodemask, current->mems_allowed);
2297}
2298
2299
2300
2301
2302
2303
2304
2305static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2306{
2307 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2308 cs = cs->parent;
2309 return cs;
2310}
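
/*
 * cpuset_node_allowed_softwall - may the current task allocate on 'node'?
 *
 * Allocation is allowed if in interrupt, if __GFP_THISNODE is set, if
 * the node is in current->mems_allowed, if the task is being OOM killed
 * (TIF_MEMDIE) or is exiting.  With __GFP_HARDWALL the answer stops
 * there; otherwise the nearest mem_exclusive or mem_hardwall ancestor
 * cpuset is consulted under callback_mutex, so this variant may sleep
 * unless __GFP_HARDWALL is passed.
 */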
2373int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2374{
2375 const struct cpuset *cs;
2376 int allowed;
2377
2378 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2379 return 1;
2380 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2381 if (node_isset(node, current->mems_allowed))
2382 return 1;
2383
2384
2385
2386
2387 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2388 return 1;
2389 if (gfp_mask & __GFP_HARDWALL)
2390 return 0;
2391
2392 if (current->flags & PF_EXITING)
2393 return 1;
2394
2395
2396 mutex_lock(&callback_mutex);
2397
2398 task_lock(current);
2399 cs = nearest_hardwall_ancestor(task_cs(current));
2400 task_unlock(current);
2401
2402 allowed = node_isset(node, cs->mems_allowed);
2403 mutex_unlock(&callback_mutex);
2404 return allowed;
2405}
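
/*
 * cpuset_node_allowed_hardwall - as above, but only the current task's
 * own mems_allowed (plus the interrupt, __GFP_THISNODE and TIF_MEMDIE
 * exceptions) is consulted; no ancestor cpusets, no locking, never
 * sleeps, so it is usable from atomic context.
 */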
2430int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2431{
2432 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2433 return 1;
2434 if (node_isset(node, current->mems_allowed))
2435 return 1;
2436
2437
2438
2439
2440 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2441 return 1;
2442 return 0;
2443}
2444
2445
2446
2447
2448
2449
2450
2451void cpuset_unlock(void)
2452{
2453 mutex_unlock(&callback_mutex);
2454}
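
/*
 * cpuset_mem_spread_node()/cpuset_slab_spread_node() - for tasks whose
 * cpuset has memory spreading enabled, rotate a per-task rotor over
 * current->mems_allowed to pick the node for the next page cache or
 * slab page.  cpuset_spread_node() advances the given rotor round-robin.
 */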
static int cpuset_spread_node(int *rotor)
{
	int node;

	node = next_node(*rotor, current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	*rotor = node;
	return node;
}

int cpuset_mem_spread_node(void)
{
	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2518 const struct task_struct *tsk2)
2519{
2520 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2521}
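
/*
 * cpuset_print_task_mems_allowed - print the name of a task's cpuset
 * and its mems_allowed to the kernel log (e.g. on OOM), serialized by
 * cpuset_buffer_lock.
 */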
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
	struct dentry *dentry;

	dentry = task_cs(tsk)->css.cgroup->dentry;
	spin_lock(&cpuset_buffer_lock);
	snprintf(cpuset_name, CPUSET_NAME_LEN, "%s",
		 dentry ? (const char *)dentry->d_name.name : "/");
	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
			   tsk->mems_allowed);
	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
	       tsk->comm, cpuset_name, cpuset_nodelist);
	spin_unlock(&cpuset_buffer_lock);
}

/*
 * Collection of memory_pressure is suppressed unless this flag is
 * enabled by writing "1" to the special read/write file
 * "memory_pressure_enabled" in the root cpuset.
 */
int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - bump the memory_pressure frequency
 * meter of current's cpuset; called from the page reclaim path via
 * cpuset_memory_pressure_bump() when the above flag is enabled.
 */
void __cpuset_memory_pressure_bump(void)
{
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}
2578
2579#ifdef CONFIG_PROC_PID_CPUSET
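
/*
 * proc_cpuset_show - implements /proc/<pid>/cpuset: prints one line,
 * the cgroup path of the task's cpuset relative to the cgroup mount.
 */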
2589static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2590{
2591 struct pid *pid;
2592 struct task_struct *tsk;
2593 char *buf;
2594 struct cgroup_subsys_state *css;
2595 int retval;
2596
2597 retval = -ENOMEM;
2598 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2599 if (!buf)
2600 goto out;
2601
2602 retval = -ESRCH;
2603 pid = m->private;
2604 tsk = get_pid_task(pid, PIDTYPE_PID);
2605 if (!tsk)
2606 goto out_free;
2607
2608 retval = -EINVAL;
2609 cgroup_lock();
2610 css = task_subsys_state(tsk, cpuset_subsys_id);
2611 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2612 if (retval < 0)
2613 goto out_unlock;
2614 seq_puts(m, buf);
2615 seq_putc(m, '\n');
2616out_unlock:
2617 cgroup_unlock();
2618 put_task_struct(tsk);
2619out_free:
2620 kfree(buf);
2621out:
2622 return retval;
2623}
2624
2625static int cpuset_open(struct inode *inode, struct file *file)
2626{
2627 struct pid *pid = PROC_I(inode)->pid;
2628 return single_open(file, proc_cpuset_show, pid);
2629}
2630
2631const struct file_operations proc_cpuset_operations = {
2632 .open = cpuset_open,
2633 .read = seq_read,
2634 .llseek = seq_lseek,
2635 .release = single_release,
2636};
2637#endif
2638
2639
2640void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2641{
2642 seq_printf(m, "Mems_allowed:\t");
2643 seq_nodemask(m, &task->mems_allowed);
2644 seq_printf(m, "\n");
2645 seq_printf(m, "Mems_allowed_list:\t");
2646 seq_nodemask_list(m, &task->mems_allowed);
2647 seq_printf(m, "\n");
2648}
2649