/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more information.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/*
 * Workqueue for cpuset related tasks.
 *
 * Using the kevent workqueue may cause deadlock when memory_migrate
 * is set, so we create a separate workqueue thread for cpuset.
 */
static struct workqueue_struct *cpuset_wq;

/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
int number_of_cpusets __read_mostly;

/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};

/*
 * There are two global mutexes guarding cpuset structures.  The first
 * is the main control groups cgroup_mutex, accessed via
 * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
 * callback_mutex, below.  They can nest: it is ok to first take
 * cgroup_mutex, then nest callback_mutex.  We also require taking
 * task_lock() when dereferencing a task's cpuset pointer.
 *
 * A task must hold both mutexes to modify cpusets.  If a task holds
 * cgroup_mutex, it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_mutex and be able to
 * modify cpusets.  It can perform various checks on the cpuset
 * structure first, knowing nothing will change, and can also allocate
 * memory while just holding cgroup_mutex.  While it is performing
 * these checks, various callback routines can briefly acquire
 * callback_mutex to query cpusets.  Once it is ready to make the
 * changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed
 * by another task; we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex
 * across small pieces of code, such as when reading out possibly
 * multi-word cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(callback_mutex);

/*
 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
 * buffers.  They are statically allocated to prevent using excess stack
 * when calling cpuset_print_task_mems_allowed().
 */
#define CPUSET_NAME_LEN		(128)
#define CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead.
 */
static int cpuset_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	int ret = -ENODEV;
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->get_sb(cgroup_fs, flags,
					unused_dev_name, mountopts, mnt);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.get_sb = cpuset_get_sb,
};
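
/*
 * Illustration (not from the original source): with the compatibility
 * hook above, the legacy mount
 *
 *	mount -t cpuset cpuset /dev/cpuset
 *
 * behaves as if the user had mounted the cgroup filesystem with the
 * cpuset subsystem enabled and no "cpuset." name prefix, roughly:
 *
 *	mount -t cgroup -o cpuset,noprefix cpuset /dev/cpuset
 *
 * plus the release_agent option hard-wired above.
 */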

/*
 * Return in *pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_mask.  Or if passed a NULL cs from an exiting
 * task, return cpu_online_mask.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}

/*
 * Update task's spread flags if cpuset's page/slab spread flag is set.
 *
 * Called with callback_mutex/cgroup_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					   struct task_struct *tsk)
{
	if (is_spread_page(cs))
		tsk->flags |= PF_SPREAD_PAGE;
	else
		tsk->flags &= ~PF_SPREAD_PAGE;
	if (is_spread_slab(cs))
		tsk->flags |= PF_SPREAD_SLAB;
	else
		tsk->flags &= ~PF_SPREAD_SLAB;
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cgroup_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
		kfree(trial);
		return NULL;
	}
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);

	return trial;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
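
/*
 * Usage sketch (for illustration; the real callers are update_cpumask(),
 * update_nodemask() and update_flag() below):
 *
 *	trialcs = alloc_trial_cpuset(cs);	   -- duplicate of cs
 *	...mutate trialcs->cpus_allowed, mems_allowed or flags...
 *	err = validate_change(cs, trialcs);	   -- check the rules
 *	if (!err)
 *		commit trialcs back into cs under callback_mutex;
 *	free_trial_cpuset(trialcs);
 *
 * The trial copy lets all structural checks run against the proposed
 * state without ever exposing a half-updated cpuset.
 */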

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cgroup_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpumask_empty(trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cgroup_lock held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *	   top-down scan of all cpusets.  This scan loads a pointer
 *	   to each cpuset marked is_sched_load_balance into the
 *	   array 'csa'.  For our purposes, rebuilding the schedulers
 *	   sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e the set of domains (subsets) of CPUs such that the
 *	   union of the subsets is a subset of that set, and no two
 *	   subsets overlap.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the scheduler's partition_sched_domains() routine.
 *
 * Finding the best partition: the nested loops over i, j, k below
 * scan the load balanced cpusets in csa[] looking for pairs that
 * have overlapping cpus_allowed but different 'pn' partition numbers,
 * and merge them into the same partition number.  This repeats until
 * all overlapping pairs share a partition number, at which point each
 * set of cpusets with the same partition number forms one
 * non-overlapping element of the overall partition.
 */
static int generate_sched_domains(struct cpumask **domains,
			struct sched_domain_attr **attributes)
{
	LIST_HEAD(q);		/* queue of cpusets to be scanned */
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		doms = kmalloc(cpumask_size(), GFP_KERNEL);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_copy(doms, top_cpuset.cpus_allowed);

		ndoms = 1;
		goto done;
	}

	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	list_add(&top_cpuset.stack_list, &q);
	while (!list_empty(&q)) {
		struct cgroup *cont;
		struct cpuset *child;	/* scans child cpusets of cp */

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		/*
		 * All child cpusets contain a subset of the parent's cpus, so
		 * just skip them, and then we call update_domain_attr_tree()
		 * to calc relax_domain_level of the corresponding sched
		 * domain.
		 */
		if (is_sched_load_balance(cp)) {
			csa[csn++] = cp;
			continue;
		}

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms + nslot;

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				printk(KERN_WARNING
				 "rebuild_sched_domains confused:"
				  " nslot %d, ndoms %d, csn %d, i %d,"
				  " apn %d\n",
				  nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->cpus_allowed);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
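
/*
 * Worked example of the partition merge above (illustrative, not from
 * the original source).  Suppose three load-balanced cpusets have
 * cpus_allowed {0,1}, {1,2} and {3}, initially pn = 0, 1, 2 and
 * ndoms = 3.  {0,1} and {1,2} overlap, so partition 1 is renumbered
 * to 0 and ndoms drops to 2.  No remaining pair overlaps, so the scan
 * terminates with two sched domains: {0,1,2} (from cpusets 0 and 1)
 * and {3}.  Each resulting domain is the union of the cpus_allowed of
 * the cpusets sharing one partition number.
 */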

/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both cgroup_mutex and get_online_cpus().
 *
 * Cannot be directly called from cpuset code handling changes to the
 * cpuset pseudo-filesystem; such code must instead go through
 * async_rebuild_sched_domains(), below, to avoid lock ordering
 * problems.
 */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	struct cpumask *doms;
	int ndoms;

	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}

static int generate_sched_domains(struct cpumask **domains,
			struct sched_domain_attr **attributes)
{
	*domains = NULL;
	return 1;
}
#endif /* CONFIG_SMP */

static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);

/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * The rebuild_sched_domains() and partition_sched_domains()
 * routines must nest cgroup_lock() inside get_online_cpus(),
 * but such cpuset changes as these must nest that locking the
 * other way, holding cgroup_lock() for much of the code.
 *
 * So in order to avoid an ABBA deadlock, the cpuset code handling
 * these user changes delegates the actual sched domain rebuilding
 * to a separate workqueue thread, which ends up processing the
 * above do_rebuild_sched_domains() function.
 */
static void async_rebuild_sched_domains(void)
{
	queue_work(cpuset_wq, &rebuild_sched_domains_work);
}

/*
 * Accomplishes the same scheduler domain rebuild as the above
 * async_rebuild_sched_domains(), however it directly calls the
 * rebuild routine synchronously rather than calling it via an
 * asynchronous work thread.
 *
 * This can only be called from code that is not holding
 * cgroup_mutex (not nested in a cgroup_lock() call.)
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}

/**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Called for each task in a cgroup by cgroup_scan_tasks().
 * Return nonzero if this task's cpus_allowed mask should be changed (in
 * other words, if its mask is not equal to its cpuset's mask).
 */
static int cpuset_test_cpumask(struct task_struct *tsk,
			       struct cgroup_scanner *scan)
{
	return !cpumask_equal(&tsk->cpus_allowed,
			(cgroup_cs(scan->cg))->cpus_allowed);
}

/**
 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
{
	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = cpuset_test_cpumask;
	scan.process_task = cpuset_change_cpumask;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset carrying the proposed new mask
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	struct ptr_heap heap;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
			return -EINVAL;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	is_load_balanced = is_sched_load_balance(trialcs);

	mutex_lock(&callback_mutex);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}

/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    Call holding cgroup_mutex, so current's cpuset won't change
 *    during this call, as cgroup_mutex holds off any cpuset_attach()
 *    calls.  So we don't need to take task_lock of this task.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 *
 * Called with task's alloc_lock held
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, &tsk->mems_allowed);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;
}
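
/*
 * Example of why the two-step update above matters (illustrative, not
 * from the original source): moving a task from mems {0} to mems {1}.
 * Writing tsk->mems_allowed = {1} directly could leave a concurrent
 * observer seeing no allowed nodes at all, since the old and new sets
 * are disjoint.  Instead we first widen the mask to the union {0,1},
 * rebind the mempolicy, rebind again to the new nodes, and only then
 * shrink the mask to {1}; at no point is the visible mask empty.
 */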

/*
 * Update task's mems_allowed, rebind its mempolicy and its vmas'
 * mempolicies to the cpuset's new mems_allowed, and migrate pages to
 * new nodes if the memory_migrate flag is set.  Called with
 * cgroup_mutex held.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	struct mm_struct *mm;
	struct cpuset *cs;
	int migrate;
	const nodemask_t *oldmem = scan->data;
	nodemask_t newmems;

	cs = cgroup_cs(scan->cg);
	guarantee_online_mems(cs, &newmems);

	task_lock(p);
	cpuset_change_task_nodemask(p, &newmems);
	task_unlock(p);

	mm = get_task_mm(p);
	if (!mm)
		return;

	migrate = is_memory_migrate(cs);

	mpol_rebind_mm(mm, &cs->mems_allowed);
	if (migrate)
		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
	mmput(mm);
}

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
				 struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_nodemask;
	scan.heap = heap;
	scan.data = (nodemask_t *)oldmem;

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cgroup_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Leave it to the caller to take locks if desired.
	 */
	cgroup_scan_tasks(&scan);

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	nodemask_t oldmem;
	int retval;
	struct ptr_heap heap;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				node_states[N_HIGH_MEMORY]))
			return -EINVAL;
	}
	oldmem = cs->mems_allowed;
	if (nodes_equal(oldmem, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs->mems_allowed;
	mutex_unlock(&callback_mutex);

	update_tasks_nodemask(cs, &oldmem, &heap);

	heap_free(&heap);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}
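
/*
 * For reference (from Documentation/cgroups/cpusets.txt, paraphrased):
 * the sched_relax_domain_level values accepted above mean roughly
 *   -1 : no request; use system default or follow request of others
 *    0 : no search
 *    1 : search siblings (hyperthreads in a core)
 *    2 : search cores in a package
 *    3 : search cpus in a node [= system wide on non-NUMA systems]
 *    4 : search nodes in a chunk of node [on NUMA systems]
 *    5 : search system wide [on NUMA systems]
 */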

/*
 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
 * @tsk: task to be updated
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
}

/*
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_flag;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	struct ptr_heap heap;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs->flags;
	mutex_unlock(&callback_mutex);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		async_rebuild_sched_domains();

	if (spread_flag_changed)
		update_tasks_flags(cs, &heap);
	heap_free(&heap);
out:
	free_trial_cpuset(trialcs);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stuck at zero.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * overflowing 32-bit integers with too many events.
 */

/* Params for running digital filter, see above */
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}
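
/*
 * Worked example of the filter arithmetic (illustrative, not from the
 * original source).  Suppose fmp->val == 1000 (a rate of 1.000 events
 * per second) and no further events occur.  Each elapsed second in
 * fmeter_update() scales val by FM_COEF/FM_SCALE = 0.933, so after
 * one second val is 933, after two it is 870, and after ten seconds
 * it is 1000 * 0.933^10, roughly 500: the documented 10 second
 * half-life.  Each marked event adds (FM_SCALE - FM_COEF)/FM_SCALE
 * = 0.067 of the pending cnt to val, which is what makes the filter
 * converge on the steady-state event rate instead of growing without
 * bound.
 */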

/* Protected by cgroup_lock */
static cpumask_var_t cpus_attach;

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			     struct task_struct *tsk, bool threadgroup)
{
	int ret;
	struct cpuset *cs = cgroup_cs(cont);

	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	/*
	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
	 * cannot change their cpu affinity and isolating such threads by their
	 * set of allowed nodes is unnecessary.  Thus, cpusets are not
	 * applicable for such threads.  This prevents checking for success of
	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
	 * be changed.
	 */
	if (tsk->flags & PF_THREAD_BOUND)
		return -EINVAL;

	ret = security_task_setscheduler(tsk, 0, NULL);
	if (ret)
		return ret;
	if (threadgroup) {
		struct task_struct *c;

		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			ret = security_task_setscheduler(c, 0, NULL);
			if (ret) {
				rcu_read_unlock();
				return ret;
			}
		}
		rcu_read_unlock();
	}
	return 0;
}

static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
			       struct cpuset *cs)
{
	int err;
	/*
	 * can_attach beforehand should guarantee that this doesn't fail.
	 * TODO: have a better way to handle failure here
	 */
	err = set_cpus_allowed_ptr(tsk, cpus_attach);
	WARN_ON_ONCE(err);

	task_lock(tsk);
	cpuset_change_task_nodemask(tsk, to);
	task_unlock(tsk);
	cpuset_update_task_spread_flag(cs, tsk);
}

static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			  struct cgroup *oldcont, struct task_struct *tsk,
			  bool threadgroup)
{
	nodemask_t from, to;
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);

	if (cs == &top_cpuset) {
		cpumask_copy(cpus_attach, cpu_possible_mask);
		to = node_possible_map;
	} else {
		guarantee_online_cpus(cs, cpus_attach);
		guarantee_online_mems(cs, &to);
	}

	/* do per-task migration stuff possibly for each in the threadgroup */
	cpuset_attach_task(tsk, &to, cs);
	if (threadgroup) {
		struct task_struct *c;
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			cpuset_attach_task(c, &to, cs);
		}
		rcu_read_unlock();
	}

	/* change mm; only needs to be done once even if threadgroup */
	from = oldcs->mems_allowed;
	to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &from, &to);
		mmput(mm);
	}
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		/* don't leak cgroup_lock on the allocation-failure path */
		retval = -ENOMEM;
		goto out;
	}

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out:
	cgroup_unlock();
	return retval;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 * A single large read to a buffer that crosses a page boundary is
 * ok, because the result being copied to user land is not recomputed
 * until a new read is performed.
 */
static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
	int ret;

	mutex_lock(&callback_mutex);
	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	return ret;
}

static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	mask = cs->mems_allowed;
	mutex_unlock(&callback_mutex);

	return nodelist_scnprintf(page, PAGE_SIZE, mask);
}

static ssize_t cpuset_common_file_read(struct cgroup *cont,
				       struct cftype *cft,
				       struct file *file,
				       char __user *buf,
				       size_t nbytes, loff_t *ppos)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_CPULIST:
		s += cpuset_sprintf_cpulist(s, cs);
		break;
	case FILE_MEMLIST:
		s += cpuset_sprintf_memlist(s, cs);
		break;
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}
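
/*
 * From userspace the handlers above look like ordinary files
 * (illustrative, assuming the legacy /dev/cpuset mount point):
 *
 *	# cat /dev/cpuset/cpus
 *	0-3
 *	# echo 2-3 > /dev/cpuset/my_set/cpus
 *
 * Writes land in cpuset_write_resmask() and reads come through
 * cpuset_common_file_read(), each using the list format handled by
 * cpulist_parse()/nodelist_parse() and the scnprintf helpers.
 */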

static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};

static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};
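
/*
 * Usage note (illustrative, not from the original source): collection
 * of per-cpuset memory pressure is off by default.  An administrator
 * enables it once in the root cpuset and then polls any cpuset:
 *
 *	echo 1 > /dev/cpuset/memory_pressure_enabled
 *	cat /dev/cpuset/my_set/memory_pressure
 *
 * The value read is the fmeter rate: recent direct-reclaim entries
 * per second by tasks in that cpuset, scaled by 1000.
 */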

static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	int err;

	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
	if (err)
		return err;
	/* memory_pressure_enabled is in root cpuset only */
	if (!cont->parent)
		err = cgroup_add_file(cont, ss,
				      &cft_memory_pressure_enabled);
	return err;
}

/*
 * post_clone() is called at the end of cgroup_clone().
 * 'cgroup' was just created automatically as a result of
 * a cgroup_clone(), and the current task is about to
 * be moved into 'cgroup'.
 *
 * Currently we refuse to set up the cgroup - thereby
 * refusing the task to be entered, and as a result refusing
 * the sys_unshare() or clone() which initiated it - if any
 * sibling cpusets have exclusive cpus or mem.
 */
static void cpuset_post_clone(struct cgroup_subsys *ss,
			      struct cgroup *cgroup)
{
	struct cgroup *parent, *child;
	struct cpuset *cs, *parent_cs;

	parent = cgroup->parent;
	list_for_each_entry(child, &parent->children, sibling) {
		cs = cgroup_cs(child);
		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
			return;
	}
	cs = cgroup_cs(cgroup);
	parent_cs = cgroup_cs(parent);

	cs->mems_allowed = parent_cs->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
	return;
}

/*
 *	cpuset_create - create a cpuset
 *	ss:	cpuset cgroup subsystem
 *	cont:	control group that the new cpuset will be part of
 */
static struct cgroup_subsys_state *cpuset_create(
	struct cgroup_subsys *ss,
	struct cgroup *cont)
{
	struct cpuset *cs;
	struct cpuset *parent;

	if (!cont->parent) {
		return &top_cpuset.css;
	}
	parent = cgroup_cs(cont->parent);
	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	cs->flags = 0;
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);
	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	cs->parent = parent;
	number_of_cpusets++;
	return &cs->css;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call async_rebuild_sched_domains().
 */
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}

struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.create = cpuset_create,
	.destroy = cpuset_destroy,
	.can_attach = cpuset_can_attach,
	.attach = cpuset_attach,
	.populate = cpuset_populate,
	.post_clone = cpuset_post_clone,
	.subsys_id = cpuset_subsys_id,
	.early_init = 1,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 **/
int __init cpuset_init(void)
{
	int err = 0;

	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();

	number_of_cpusets = 1;
	return 0;
}

/**
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct the task to move
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 */
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	struct cgroup *new_cgroup = scan->data;

	cgroup_attach_task(new_cgroup, tsk);
}

/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
	struct cgroup_scanner scan;

	scan.cg = from->css.cgroup;
	scan.test_task = NULL; /* select all tasks in cgroup */
	scan.process_task = cpuset_do_move_task;
	scan.heap = NULL;
	scan.data = to->css.cgroup;

	if (cgroup_scan_tasks(&scan))
		printk(KERN_ERR "move_member_tasks_to_cpuset: "
				"cgroup_scan_tasks failed\n");
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * The cgroup's css_sets list is in use if there are tasks
	 * in the cpuset; the list is empty if there are none.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
	 */
	parent = cs->parent;
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}

/*
 * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpusets must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 */
static void scan_for_empty_cpusets(struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	nodemask_t oldmems;

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
			    cpu_online_mask);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpumask_empty(cp->cpus_allowed) ||
		     nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems, NULL);
		}
	}
}

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
				    unsigned long phase, void *unused_cpu)
{
	struct sched_domain_attr *attr;
	struct cpumask *doms;
	int ndoms;

	switch (phase) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;

	default:
		return NOTIFY_DONE;
	}

	cgroup_lock();
	mutex_lock(&callback_mutex);
	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
	mutex_unlock(&callback_mutex);
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	return NOTIFY_OK;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_track_online_cpus().
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	cgroup_lock();
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
		mutex_unlock(&callback_mutex);
		if (action == MEM_OFFLINE)
			scan_for_empty_cpusets(&top_cpuset);
		break;
	default:
		break;
	}
	cgroup_unlock();
	return NOTIFY_OK;
}
#endif

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized.
 **/
void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	hotcpu_notifier(cpuset_track_online_cpus, 0);
	hotplug_memory_notifier(cpuset_track_online_nodes, 10);

	cpuset_wq = create_singlethread_workqueue("cpuset");
	BUG_ON(!cpuset_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * tasks cpuset.
 **/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	mutex_lock(&callback_mutex);
	cpuset_cpus_allowed_locked(tsk, pmask);
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
 * Must be called with callback_mutex held.
 **/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
}

void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside
 * the tasks cpuset.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
		cs = cs->parent;
	return cs;
}

/**
 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
 * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has
 * been OOM killed and has access to memory reserves as specified by the
 * TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
 * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
 * might sleep, and might allow a node from an enclosing cpuset.
 *
 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
 * cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex.  The
 * __alloc_pages() routine only calls here with the __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking callback_mutex.
 *
 * Rule:
 *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
 *    pass in the __GFP_HARDWALL flag set in gfp_mask, which disables
 *    the code that might scan up ancestor cpusets and sleep.
 */
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node nearest mems_allowed ancestor */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}
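
/*
 * Summary of the combined effect of the checks above and their callers
 * in mm/page_alloc.c (paraphrased from the historical comment on this
 * routine):
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok
 */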

/**
 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.
 * If __GFP_THISNODE is set, yes, we can always allocate.  If node is in
 * our task's mems_allowed, yes.  If the task has been OOM killed and has
 * access to memory reserves as specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * Unlike the cpuset_node_allowed_softwall() variant, above,
 * this variant requires that the node be in the current task's
 * mems_allowed or that we're in interrupt.  It does not scan up the
 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
 * It never sleeps.
 */
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	return 0;
}

/**
 * cpuset_lock - lock out any changes to cpuset structures
 *
 * The out of memory (oom) code needs to mutex_lock cpusets
 * from being changed while it scans the tasklist looking for a
 * task in an overlapping cpuset.  Expose callback_mutex via this
 * cpuset_lock() routine, so the oom code can lock it, before
 * locking the task list.  The tasklist_lock is a spinlock, so
 * must be taken inside callback_mutex.
 */
void cpuset_lock(void)
{
	mutex_lock(&callback_mutex);
}

/**
 * cpuset_unlock - release lock on cpuset changes
 *
 * Undo the lock taken in a previous cpuset_lock() call.
 */
void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for a task to end up in a cpuset with
 * no online mems, but if it did, that would be ok.
 */
int cpuset_mem_spread_node(void)
{
	int node;

	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	current->cpuset_mem_spread_rotor = node;
	return node;
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
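
/*
 * Rotor example (illustrative, not from the original source): with
 * mems_allowed = {0,2,3} and the rotor starting at 0, successive
 * calls return 2, 3, 0, 2, 3, 0, ... -- the wrap happens when
 * next_node() runs off the end at MAX_NUMNODES and we fall back to
 * first_node().  Spreading the starting node this way keeps file
 * system buffers and inode caches from piling up on a single node.
 */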

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the two tasks might be killed to free memory in the other
 * task's cpuset.
 **/
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
	struct dentry *dentry;

	dentry = task_cs(tsk)->css.cgroup->dentry;
	spin_lock(&cpuset_buffer_lock);
	/* never use the dentry name as a format string */
	snprintf(cpuset_name, CPUSET_NAME_LEN, "%s",
		 dentry ? (const char *)dentry->d_name.name : "/");
	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
			   tsk->mems_allowed);
	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
	       tsk->comm, cpuset_name, cpuset_nodelist);
	spin_unlock(&cpuset_buffer_lock);
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/
void __cpuset_memory_pressure_bump(void)
{
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cgroup_mutex, keeping cpuset_attach() from changing
 *    it anyway.
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = -EINVAL;
	cgroup_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	if (retval < 0)
		goto out_unlock;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_unlock:
	cgroup_unlock();
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}

static int cpuset_open(struct inode *inode, struct file *file)
{
	struct pid *pid = PROC_I(inode)->pid;
	return single_open(file, proc_cpuset_show, pid);
}

const struct file_operations proc_cpuset_operations = {
	.open		= cpuset_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Cpus_allowed:\t");
	seq_cpumask(m, &task->cpus_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Cpus_allowed_list:\t");
	seq_cpumask_list(m, &task->cpus_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed:\t");
	seq_nodemask(m, &task->mems_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed_list:\t");
	seq_nodemask_list(m, &task->mems_allowed);
	seq_printf(m, "\n");
}