/*
 *  kernel/cgroup/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * CPUs allocated to child sub-partitions (default hierarchy only)
	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
	 * - effective_cpus and subparts_cpus are mutually exclusive.
	 *
	 * effective_cpus contains only onlined CPUs, but subparts_cpus
	 * may have offlined ones.
	 */
	cpumask_var_t subparts_cpus;

	/*
	 * This is old Memory Nodes tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemasks updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of CPUs in subparts_cpus */
	int nr_subparts_cpus;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;
};

/*
 * Partition root states:
 *
 *   0 - not a partition root
 *
 *   1 - partition root
 *
 *  -1 - invalid partition root
 *       None of the cpus in cpus_allowed can be put into the parent's
 *       subparts_cpus.  In this case, the cpuset is not a real partition
 *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set.
 *       The cpuset can be reverted back to a partition root if the
 *       parent cpuset can give more CPUs back to this child cpuset.
 */
#define PRS_DISABLED		0
#define PRS_ENABLED		1
#define PRS_ERROR		-1

/*
 * Temporary cpumasks for working with partitions that are passed among
 * function arguments to avoid allocating lots of them on the stack.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_root(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ENABLED,
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
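
/*
 * Illustrative usage sketch (added; not from the original source):
 * the walk must run under rcu_read_lock(), e.g.
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos, parent)
 *		visit(child);		(visit() is a hypothetical helper)
 *	rcu_read_unlock();
 */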

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk ancestors of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in
 * the iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_rwsem and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_rwsem for write, it blocks others wanting that rwsem, ensures
 * the cpuset it wants to modify won't go away, and also blocks other
 * writers of callback_lock.  It can then take callback_lock as needed.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed
 * by other tasks, so alloc_lock in the task_struct is used to protect
 * them.
 *
 * The cpuset file handlers only hold callback_lock across small pieces
 * of code, such as when reading out possibly multi-word cpumasks and
 * nodemasks.
 */

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);

void cpuset_read_lock(void)
{
	percpu_down_read(&cpuset_rwsem);
}

void cpuset_read_unlock(void)
{
	percpu_up_read(&cpuset_rwsem);
}

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 hierarchies with the "cpuset_v2_mode" mount option.  Otherwise,
 * cgroup v1 behaviors are used on those control files.  With v2 behavior,
 * "cpus" and "mems" are always what the users have requested and won't
 * change by hotplug events.  Only the effective cpus or mems will be
 * affected.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flags if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_rwsem.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs:  the cpuset that has three cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->subparts_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	return 0;

free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset or a tmpmasks structure
 * @cs:  the cpuset that has three cpumasks to be freed.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->subparts_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules within cpuset hierarchies.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_rwsem held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_rwsem held.  */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs: a set of
 * non-overlapping subsets whose union is a subset of all CPUs.  The result
 * is passed to the kernel/sched/core.c routine partition_sched_domains(),
 * which rebuilds the scheduler's load balancing domains (sched domains)
 * as specified by that partial partition.
 *
 * The key local variables are:
 *    cp  - cpuset pointer, used (together with pos_css) to perform a
 *	    top-down scan of all cpusets; cpusets that aren't load
 *	    balanced can be ignored.
 *    csa - (for CpuSet Array) an array of pointers to all the cpusets
 *	    that need to be load balanced.
 *   doms - the conversion of csa to an array of cpumasks, in the format
 *	    partition_sched_domains() expects.
 *
 * Finding the best partition: each cpuset in csa[] starts with a distinct
 * partition number 'pn'.  The nested loops below repeatedly merge the
 * partition numbers of any two cpusets with overlapping effective CPUs,
 * restarting after each merge, until no overlapping pair with different
 * pn values remains.  Each remaining pn value then yields one sched
 * domain: the union of the effective CPUs of the cpusets sharing it.
 *
 * Must be called with cpuset_rwsem held.
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 *
		 * If root is load-balancing, we can skip @cp if it
		 * is a subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
			continue;

		if (root_load_balance &&
		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree if not a partition root */
		if (!is_partition_root(cp))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element in partition */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case.  No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
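
/*
 * Worked example (illustrative; not from the original source): given
 * load-balanced cpusets A {0-1}, B {1-2} and C {4-5}, A and B overlap,
 * so the merge loop above folds B's partition number into A's and
 * decrements ndoms once.  The result is ndoms == 2 with domains {0-2}
 * and {4-5}; CPU 3 belongs to neither and is not load balanced.
 */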

static void update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

static void rebuild_root_domains(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;

	percpu_rwsem_assert_held(&cpuset_rwsem);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	/*
	 * Clear default root domain DL accounting, it will be computed again
	 * if a task belongs to it.
	 */
	dl_clear_root_domain(&def_root_domain);

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	rebuild_root_domains();
	mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_rwsem held; the caller must also have taken
 * get_online_cpus() already (see the lockdep assertions below).
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_cpus_held();
	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * We have raced with CPU hotplug.  Don't do anything to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, the hotplug work item will rebuild sched domains.
	 */
	if (!top_cpuset.nr_subparts_cpus &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	if (top_cpuset.nr_subparts_cpus &&
	   !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	rebuild_sched_domains_locked();
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

/*
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset whose effective_cpus mask needs recomputing
 * @parent: the parent cpuset
 *
 * If the parent has subpartition CPUs, include them in the list of
 * allowable CPUs in computing the new effective_cpus mask.  Since offlined
 * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
 * to make sure there are no offlined cpus in the effective_cpus.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	if (parent->nr_subparts_cpus) {
		cpumask_or(new_cpus, parent->effective_cpus,
			   parent->subparts_cpus);
		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
	} else {
		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
	}
}
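
/*
 * In effect (illustrative summary; not from the original source):
 *
 *	new_cpus = cpus_allowed & (parent->effective_cpus |
 *		   parent->subparts_cpus) & cpu_active_mask
 *
 * when the parent has subpartition CPUs, and otherwise simply
 *
 *	new_cpus = cpus_allowed & parent->effective_cpus
 */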

/*
 * Commands for update_parent_subparts_cpumask
 */
enum subparts_cmd {
	partcmd_enable,		/* Enable partition root	 */
	partcmd_disable,	/* Disable partition root	 */
	partcmd_update,		/* Update parent's subparts_cpus */
};

/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root.  The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus.  The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root.  Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
 * into parent's effective_cpus.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list
 * is to be changed from cpus_allowed to newmask.  Otherwise, cpus_allowed
 * is assumed to remain the same.  The cpuset should either be a partition
 * root or an invalid partition root.  The partition root state may change
 * if newmask is NULL and none of the requested CPUs can be granted by the
 * parent.  The function will return 1 if changes to parent's subparts_cpus
 * and effective_cpus happen or 0 otherwise.  An error code should only be
 * returned when newmask is non-NULL.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate().  The partcmd_update command is used by
 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
 * newmask set.
 *
 * The checking is more strict when enabling partition root than the
 * other two commands.
 *
 * Because of the implicit cpu exclusive nature of a partition root,
 * cpumask changes that violate the cpu exclusivity rule will not be
 * permitted when checked by validate_change().
 */
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
					  struct cpumask *newmask,
					  struct tmpmasks *tmp)
{
	struct cpuset *parent = parent_cs(cpuset);
	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
	bool part_error = false;	/* Partition error? */

	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * The parent must be a partition root.
	 * The new cpumask, if present, or the current cpus_allowed must
	 * not be empty.
	 */
	if (!is_partition_root(parent) ||
	   (newmask && cpumask_empty(newmask)) ||
	   (!newmask && cpumask_empty(cpuset->cpus_allowed)))
		return -EINVAL;

	/*
	 * Enabling/disabling partition root is not allowed if there are
	 * online children.
	 */
	if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
		return -EBUSY;

	/*
	 * Enabling partition root is not allowed if not all the CPUs
	 * can be granted from parent's effective_cpus or at least one
	 * CPU will be left after that.
	 */
	if ((cmd == partcmd_enable) &&
	   (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
	     cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
		return -EINVAL;

	/*
	 * A cpumask update cannot make parent's effective_cpus become empty.
	 */
	adding = deleting = false;
	if (cmd == partcmd_enable) {
		cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
		adding = true;
	} else if (cmd == partcmd_disable) {
		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				       parent->subparts_cpus);
	} else if (newmask) {
		/*
		 * partcmd_update with newmask:
		 *
		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
		 * addmask = newmask & parent->effective_cpus
		 *		     & ~parent->subparts_cpus
		 */
		cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
		deleting = cpumask_and(tmp->delmask, tmp->delmask,
				       parent->subparts_cpus);

		cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
					parent->subparts_cpus);
		/*
		 * Return error if the new effective_cpus could become empty.
		 */
		if (adding &&
		    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
			if (!deleting)
				return -EINVAL;
			/*
			 * As some of the CPUs in subparts_cpus might have
			 * been offlined, we need to compute the real delmask
			 * to confirm that.
			 */
			if (!cpumask_and(tmp->addmask, tmp->delmask,
					 cpu_active_mask))
				return -EINVAL;
			cpumask_copy(tmp->addmask, parent->effective_cpus);
		}
	} else {
		/*
		 * partcmd_update w/o newmask:
		 *
		 * addmask = cpus_allowed & parent->effective_cpus
		 *
		 * Note that parent's subparts_cpus may have been
		 * pre-shrunk in case there is a change in the cpu list.
		 * So no deletion is needed.
		 */
		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
				     parent->effective_cpus);
		part_error = cpumask_equal(tmp->addmask,
					   parent->effective_cpus);
	}

	if (cmd == partcmd_update) {
		int prev_prs = cpuset->partition_root_state;

		/*
		 * Check for possible transition between PRS_ENABLED
		 * and PRS_ERROR.
		 */
		switch (cpuset->partition_root_state) {
		case PRS_ENABLED:
			if (part_error)
				cpuset->partition_root_state = PRS_ERROR;
			break;
		case PRS_ERROR:
			if (!part_error)
				cpuset->partition_root_state = PRS_ENABLED;
			break;
		}
		/*
		 * Set part_error if previously in an invalid state.
		 */
		part_error = (prev_prs == PRS_ERROR);
	}

	if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
		return 0;	/* Nothing needs to be done */

	if (cpuset->partition_root_state == PRS_ERROR) {
		/*
		 * Remove all its cpus from parent's subparts_cpus.
		 */
		adding = false;
		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				       parent->subparts_cpus);
	}

	if (!adding && !deleting)
		return 0;

	/*
	 * Change the parent's subparts_cpus.
	 * Newly added CPUs will be removed from effective_cpus and
	 * newly deleted ones will be added back to effective_cpus.
	 */
	spin_lock_irq(&callback_lock);
	if (adding) {
		cpumask_or(parent->subparts_cpus,
			   parent->subparts_cpus, tmp->addmask);
		cpumask_andnot(parent->effective_cpus,
			       parent->effective_cpus, tmp->addmask);
	}
	if (deleting) {
		cpumask_andnot(parent->subparts_cpus,
			       parent->subparts_cpus, tmp->delmask);
		/*
		 * Some of the CPUs in subparts_cpus might have been offlined.
		 */
		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
		cpumask_or(parent->effective_cpus,
			   parent->effective_cpus, tmp->delmask);
	}

	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
	spin_unlock_irq(&callback_lock);

	return cmd == partcmd_update;
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When a configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_rwsem held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		compute_effective_cpumask(tmp->new_cpus, cp, parent);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
			if (!cp->use_parent_ecpus) {
				cp->use_parent_ecpus = true;
				parent->child_ecpus_count++;
			}
		} else if (cp->use_parent_ecpus) {
			cp->use_parent_ecpus = false;
			WARN_ON_ONCE(!parent->child_ecpus_count);
			parent->child_ecpus_count--;
		}

		/*
		 * Skip the whole subtree if the cpumask remains the same
		 * and there is no partition root state.
		 */
		if (!cp->partition_root_state &&
		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		/*
		 * update_parent_subparts_cpumask() should have been called
		 * for cs already in update_cpumask().  We should also call
		 * update_tasks_cpumask() again for tasks in the parent
		 * cpuset if the parent's subparts_cpus changes.
		 */
		if ((cp != cs) && cp->partition_root_state) {
			switch (parent->partition_root_state) {
			case PRS_DISABLED:
				/*
				 * If parent is not a partition root or an
				 * invalid partition root, clear the state
				 * state and the CS_CPU_EXCLUSIVE flag.
				 */
				WARN_ON_ONCE(cp->partition_root_state
					    != PRS_ERROR);
				cp->partition_root_state = 0;

				/*
				 * clear_bit() is an atomic operation and
				 * readers aren't interested in the state
				 * of the CS_CPU_EXCLUSIVE bit.  So we can
				 * just update the flag without holding
				 * the callback_lock.
				 */
				clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
				break;

			case PRS_ENABLED:
				if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
					update_tasks_cpumask(parent);
				break;

			case PRS_ERROR:
				/*
				 * When parent is invalid, it has to be too.
				 */
				cp->partition_root_state = PRS_ERROR;
				if (cp->nr_subparts_cpus) {
					cp->nr_subparts_cpus = 0;
					cpumask_clear(cp->subparts_cpus);
				}
				break;
			}
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);

		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		if (cp->nr_subparts_cpus &&
		   (cp->partition_root_state != PRS_ENABLED)) {
			cp->nr_subparts_cpus = 0;
			cpumask_clear(cp->subparts_cpus);
		} else if (cp->nr_subparts_cpus) {
			/*
			 * Make sure that effective_cpus & subparts_cpus
			 * are mutually exclusive.
			 *
			 * In the unlikely event that effective_cpus
			 * becomes empty, we clear cp->nr_subparts_cpus and
			 * let its child partition roots compete for
			 * CPUs again.
			 */
			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
				       cp->subparts_cpus);
			if (cpumask_empty(cp->effective_cpus)) {
				cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				cpumask_clear(cp->subparts_cpus);
				cp->nr_subparts_cpus = 0;
			} else if (!cpumask_subset(cp->subparts_cpus,
						   tmp->new_cpus)) {
				cpumask_andnot(cp->subparts_cpus,
					cp->subparts_cpus, tmp->new_cpus);
				cp->nr_subparts_cpus
					= cpumask_weight(cp->subparts_cpus);
			}
		}
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * On legacy hierarchy, if the effective cpumask of any non-
		 * empty cpuset is changed, we need to rebuild sched domains.
		 * On default hierarchy, the cpuset needs to be a partition
		 * root as well.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp) &&
		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
		    is_partition_root(cp)))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp)
{
	struct cpuset *sibling;
	struct cgroup_subsys_state *pos_css;

	/*
	 * Check all its siblings and call update_cpumasks_hier()
	 * if their use_parent_ecpus flag is set in order for them
	 * to use the right effective_cpus value.
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, pos_css, parent) {
		if (sibling == cs)
			continue;
		if (!sibling->use_parent_ecpus)
			continue;

		update_cpumasks_hier(sibling, tmp);
	}
	rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;
	struct tmpmasks tmp;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

#ifdef CONFIG_CPUMASK_OFFSTACK
	/*
	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
	 * to allocated cpumasks.
	 */
	tmp.addmask  = trialcs->subparts_cpus;
	tmp.delmask  = trialcs->effective_cpus;
	tmp.new_cpus = trialcs->cpus_allowed;
#endif

	if (cs->partition_root_state) {
		/* Cpumask of a partition root cannot be empty */
		if (cpumask_empty(trialcs->cpus_allowed))
			return -EINVAL;
		if (update_parent_subparts_cpumask(cs, partcmd_update,
					trialcs->cpus_allowed, &tmp) < 0)
			return -EINVAL;
	}

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

	/*
	 * Adjust cs->subparts_cpus for the newly written cpus_allowed and
	 * recompute its size.
	 */
	if (cs->nr_subparts_cpus) {
		cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
			       cs->cpus_allowed);
		cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
	}
	spin_unlock_irq(&callback_lock);

	update_cpumasks_hier(cs, &tmp);

	if (cs->partition_root_state) {
		struct cpuset *parent = parent_cs(cs);

		/*
		 * For partition root, update the cpumasks of sibling
		 * cpusets if they use parent's effective_cpus.
		 */
		if (parent->child_ecpus_count)
			update_sibling_cpumasks(parent, cs, &tmp);
	}
	return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */
struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *mwork;

	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set to
 *
 * We use the mems_allowed_seq seqcount to safely update both
 * tsk->mems_allowed and the task's mempolicy.  If the task is allocating
 * in parallel, it might temporarily see an empty intersection, which
 * results in a seqcount check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_rwsem */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_rwsem, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When a configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_rwsem held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (is_in_v2_mode() && nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies and, if the cpuset is marked 'memory_migrate',
 * migrate the tasks' pages to the new memory.
 *
 * Call with cpuset_rwsem held.  May take callback_lock during call.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/* use trialcs->mems_allowed as a temp variable */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
	return retval;
}

bool current_cpuset_is_being_rebound(void)
{
	bool ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags need to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_rwsem held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_rwsem held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs);
out:
	free_cpuset(trialcs);
	return err;
}

/*
 * update_prstate - update partition_root_state
 * cs:	the cpuset to update
 * val: 0 - disabled, 1 - enabled
 *
 * Call with cpuset_rwsem held.
 */
static int update_prstate(struct cpuset *cs, int val)
{
	int err;
	struct cpuset *parent = parent_cs(cs);
	struct tmpmasks tmp;

	if ((val != 0) && (val != 1))
		return -EINVAL;
	if (val == cs->partition_root_state)
		return 0;

	/*
	 * Cannot force a partial or invalid partition root to a full
	 * partition root.
	 */
	if (val && cs->partition_root_state)
		return -EINVAL;

	if (alloc_cpumasks(NULL, &tmp))
		return -ENOMEM;

	err = -EINVAL;
	if (!cs->partition_root_state) {
		/*
		 * Turning on partition root requires setting the
		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
		 * cannot be NULL.
		 */
		if (cpumask_empty(cs->cpus_allowed))
			goto out;

		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
		if (err)
			goto out;

		err = update_parent_subparts_cpumask(cs, partcmd_enable,
						     NULL, &tmp);
		if (err) {
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
			goto out;
		}
		cs->partition_root_state = PRS_ENABLED;
	} else {
		/*
		 * Turning off partition root will clear the
		 * CS_CPU_EXCLUSIVE bit.
		 */
		if (cs->partition_root_state == PRS_ERROR) {
			cs->partition_root_state = 0;
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
			err = 0;
			goto out;
		}

		err = update_parent_subparts_cpumask(cs, partcmd_disable,
						     NULL, &tmp);
		if (err)
			goto out;

		cs->partition_root_state = 0;

		/* Turning off CS_CPU_EXCLUSIVE will not return error */
		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
	}

	/*
	 * Update cpumask of parent's tasks except when it is the top
	 * cpuset as some system daemons cannot be mapped to other CPUs.
	 */
	if (parent != &top_cpuset)
		update_tasks_cpumask(parent);

	if (parent->child_ecpus_count)
		update_sibling_cpumasks(parent, cs, &tmp);

	rebuild_sched_domains_locked();
out:
	free_cpumasks(NULL, &tmp);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * The filter works on the number of events marked per unit time.
 * It is a single-pole low-pass recursive (IIR) digital filter with
 * a time unit of 1 second.  Arithmetic is done using 32-bit integers
 * scaled to simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from fmeter_getrate() will be
 * cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stuck at zero anyway.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * overflowing 32-bit integers with excessively large event counts.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!is_in_v2_mode() &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	cgroup_taskset_for_each(task, css, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	return ret;
}

static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;

	cgroup_taskset_first(tset, &css);

	percpu_down_write(&cpuset_rwsem);
	css_cs(css)->attach_in_progress--;
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_rwsem */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders.  This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodesets that we
			 * migrate mm from.
			 */
			if (is_memory_migrate(cs))
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
			else
				mmput(mm);
		}
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	percpu_up_write(&cpuset_rwsem);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_SUBPARTS_CPULIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_PARTITION_ROOT,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	return retval;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_rwsem anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_cpuset(trialcs);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, a list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	case FILE_SUBPARTS_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int sched_partition_show(struct seq_file *seq, void *v)
{
	struct cpuset *cs = css_cs(seq_css(seq));

	switch (cs->partition_root_state) {
	case PRS_ENABLED:
		seq_puts(seq, "root\n");
		break;
	case PRS_DISABLED:
		seq_puts(seq, "member\n");
		break;
	case PRS_ERROR:
		seq_puts(seq, "root invalid\n");
		break;
	}
	return 0;
}

static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	int val;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
	 */
	if (!strcmp(buf, "root"))
		val = PRS_ENABLED;
	else if (!strcmp(buf, "member"))
		val = PRS_DISABLED;
	else
		return -EINVAL;

	css_get(&cs->css);
	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	retval = update_prstate(cs, val);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	css_put(&cs->css);
	return retval ?: nbytes;
}
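
/*
 * Illustrative cgroup v2 usage of the interface above from user space
 * (not from the original source):
 *
 *	# echo 2-3 > cpuset.cpus
 *	# echo root > cpuset.cpus.partition
 *	# cat cpuset.cpus.partition
 *	root
 *
 * Writing "member" reverts the cpuset to an ordinary, non-partition
 * member of its parent.
 */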

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype legacy_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};

/*
 * This is currently a minimal set for the default hierarchy.  It can be
 * expanded later on by migrating more features and control files from v1.
 */
static struct cftype dfl_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "mems.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpus.partition",
		.seq_show = sched_partition_show,
		.write = sched_partition_write,
		.private = FILE_PARTITION_ROOT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.subpartitions",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_SUBPARTS_CPULIST,
		.flags = CFTYPE_DEBUG,
	},

	{ }	/* terminate */
};

/*
 *	cpuset_css_alloc - allocate a cpuset css
 *	cgrp:	control group that the new cpuset will be part of
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *cs;

	if (!parent_css)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);

	if (alloc_cpumasks(cs, NULL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	nodes_clear(cs->mems_allowed);
	nodes_clear(cs->effective_mems);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	return &cs->css;
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	if (!parent)
		return 0;

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	cpuset_inc();

	spin_lock_irq(&callback_lock);
	if (is_in_v2_mode()) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
		cs->use_parent_ecpus = true;
		parent->child_ecpus_count++;
	}
	spin_unlock_irq(&callback_lock);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().  That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition' off.
 */
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);

	if (is_partition_root(cs))
		update_prstate(cs, 0);

	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	if (cs->use_parent_ecpus) {
		struct cpuset *parent = parent_cs(cs);

		cs->use_parent_ecpus = false;
		parent->child_ecpus_count--;
	}

	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
}

static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	free_cpuset(cs);
}

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
	percpu_down_write(&cpuset_rwsem);
	spin_lock_irq(&callback_lock);

	if (is_in_v2_mode()) {
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
		top_cpuset.mems_allowed = node_possible_map;
	} else {
		cpumask_copy(top_cpuset.cpus_allowed,
			     top_cpuset.effective_cpus);
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
	}

	spin_unlock_irq(&callback_lock);
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{
	if (task_css_is_root(task, cpuset_cgrp_id))
		return;

	set_cpus_allowed_ptr(task, current->cpus_ptr);
	task->mems_allowed = current->mems_allowed;
}

struct cgroup_subsys cpuset_cgrp_subsys = {
	.css_alloc	= cpuset_css_alloc,
	.css_online	= cpuset_css_online,
	.css_offline	= cpuset_css_offline,
	.css_free	= cpuset_css_free,
	.can_attach	= cpuset_can_attach,
	.cancel_attach	= cpuset_cancel_attach,
	.attach		= cpuset_attach,
	.post_attach	= cpuset_post_attach,
	.bind		= cpuset_bind,
	.fork		= cpuset_fork,
	.legacy_cftypes	= legacy_files,
	.dfl_cftypes	= dfl_files,
	.early_init	= true,
	.threaded	= true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 */
int __init cpuset_init(void)
{
	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));

	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);
	cpumask_setall(top_cpuset.effective_cpus);
	nodes_setall(top_cpuset.effective_mems);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

	return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}
2942
2943static void
2944hotplug_update_tasks_legacy(struct cpuset *cs,
2945 struct cpumask *new_cpus, nodemask_t *new_mems,
2946 bool cpus_updated, bool mems_updated)
2947{
2948 bool is_empty;
2949
2950 spin_lock_irq(&callback_lock);
2951 cpumask_copy(cs->cpus_allowed, new_cpus);
2952 cpumask_copy(cs->effective_cpus, new_cpus);
2953 cs->mems_allowed = *new_mems;
2954 cs->effective_mems = *new_mems;
2955 spin_unlock_irq(&callback_lock);
2956
2957
2958
2959
2960
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		update_tasks_cpumask(cs);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	percpu_up_write(&cpuset_rwsem);

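	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation that will also call back into
	 * cpuset, so it must be done outside any lock.
	 */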
	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);

	percpu_down_write(&cpuset_rwsem);
}

static void
hotplug_update_tasks(struct cpuset *cs,
		     struct cpumask *new_cpus, nodemask_t *new_mems,
		     bool cpus_updated, bool mems_updated)
{
	if (cpumask_empty(new_cpus))
		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
	if (nodes_empty(*new_mems))
		*new_mems = parent_cs(cs)->effective_mems;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	if (cpus_updated)
		update_tasks_cpumask(cs);
	if (mems_updated)
		update_tasks_nodemask(cs);
}

static bool force_rebuild;

void cpuset_force_rebuild(void)
{
	force_rebuild = true;
}

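/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */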
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
	struct cpuset *parent;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	percpu_down_write(&cpuset_rwsem);

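	/*
	 * An attach raced with us after the wait_event() above;
	 * drop the lock and wait again until attaching is done.
	 */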
	if (cs->attach_in_progress) {
		percpu_up_write(&cpuset_rwsem);
		goto retry;
	}

	parent = parent_cs(cs);
	compute_effective_cpumask(&new_cpus, cs, parent);
	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

	if (cs->nr_subparts_cpus)
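		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus.
		 */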
		cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);

	if (!tmp || !cs->partition_root_state)
		goto update_tasks;

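	/*
	 * In the unlikely event that a partition root has empty
	 * effective_cpus or its parent becomes erroneous, we have to
	 * transition it to the erroneous state.
	 */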
	if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
	   (parent->partition_root_state == PRS_ERROR))) {
		if (cs->nr_subparts_cpus) {
			cs->nr_subparts_cpus = 0;
			cpumask_clear(cs->subparts_cpus);
			compute_effective_cpumask(&new_cpus, cs, parent);
		}

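		/*
		 * If effective_cpus is empty because the child
		 * partitions take away all the CPUs, we can keep
		 * the current partition and let the child partitions
		 * fight for the available CPUs.
		 */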
		if ((parent->partition_root_state == PRS_ERROR) ||
		     cpumask_empty(&new_cpus)) {
			update_parent_subparts_cpumask(cs, partcmd_disable,
						       NULL, tmp);
			cs->partition_root_state = PRS_ERROR;
		}
		cpuset_force_rebuild();
	}

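	/*
	 * On the other hand, an erroneous partition root may be transitioned
	 * back to a regular one, or a partition root with no CPU allocated
	 * from the parent may change to erroneous.
	 */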
	if (is_partition_root(parent) &&
	   ((cs->partition_root_state == PRS_ERROR) ||
	    !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
	     update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
		cpuset_force_rebuild();

update_tasks:
	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

	percpu_up_write(&cpuset_rwsem);
}

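/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: unused
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpusets accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 */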
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	percpu_down_write(&cpuset_rwsem);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

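	/*
	 * If subparts_cpus is populated, it is likely that the check
	 * below will produce a false positive on cpus_updated when the
	 * cpu list isn't changed.  It is extra work, but it is better
	 * to be safe.
	 */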
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
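		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus.  If no CPU is left,
		 * we clear subparts_cpus and let the child partitions
		 * fight for the CPUs again.
		 */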
		if (top_cpuset.nr_subparts_cpus) {
			if (cpumask_subset(&new_cpus,
					   top_cpuset.subparts_cpus)) {
				top_cpuset.nr_subparts_cpus = 0;
				cpumask_clear(top_cpuset.subparts_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       top_cpuset.subparts_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
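		/* we don't mess with cpumasks of tasks in top_cpuset */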
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	percpu_up_write(&cpuset_rwsem);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
	}

	free_cpumasks(NULL, ptmp);
}

void cpuset_update_active_cpus(void)
{
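	/*
	 * We're inside the cpu hotplug critical region, which usually
	 * nests inside cgroup synchronization.  Bounce actual hotplug
	 * processing to a work item to avoid reverse locking order.
	 */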
	schedule_work(&cpuset_hotplug_work);
}

void cpuset_wait_for_hotplug(void)
{
	flush_work(&cpuset_hotplug_work);
}

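/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */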
static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}

static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,
};

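/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */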
void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_MEMORY];
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
	top_cpuset.effective_mems = node_states[N_MEMORY];

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}

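/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * task's cpuset.
 **/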
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_cpus(task_cs(tsk), pmask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);
}

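/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed CPU
 * in the task's cpumask, fall back to the cpuset's cpus_allowed on the v2
 * (default) hierarchy, where it is not modified by hotplug; in legacy mode
 * fall back to cpu_possible_mask, since v1 cpus_allowed may not contain a
 * sane cpumask during CPU hotplug.  This is the absolute last resort for
 * the scheduler and is only used if _every_ other avenue has been tried.
 **/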
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	rcu_read_lock();
	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
	rcu_read_unlock();

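	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed.  However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr(),
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */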
}

void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

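/**
 * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * task's cpuset.
 **/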
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &mask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return mask;
}

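/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */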
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

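/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */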
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

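/*
 * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set
 * in current's mems_allowed, yes.  If the task has been OOM killed and
 * has access to memory reserves, yes.  If it's a __GFP_HARDWALL request,
 * no.  If the task is exiting, yes.  Otherwise, scan up the cpuset
 * hierarchy and allow the allocation if @node is set in the mems_allowed
 * of the nearest mem_exclusive or mem_hardwall ancestor.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, and do
 * not allow allocations outside the current task's cpuset.  GFP_KERNEL
 * allocations are not so marked, so can escape to the nearest enclosing
 * hardwalled ancestor cpuset.  The slow path that scans up parent
 * cpusets requires callback_lock.
 */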
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;
	int allowed;
	unsigned long flags;

	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
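	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */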
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

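	/* Not hardwall and node outside mems_allowed: scan up cpusets */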
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}

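/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for tasks
 * in a cpuset with is_spread_page or is_spread_slab set), then instead
 * of starting the search for a free page on the local node, spread the
 * starting node around the task's mems_allowed nodes.  A per-task rotor,
 * initialized to a random allowed node, is advanced round-robin through
 * current->mems_allowed.
 */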
static int cpuset_spread_node(int *rotor)
{
	return *rotor = next_node_in(*rotor, current->mems_allowed);
}

int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

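/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used, e.g., by the OOM killer to determine if
 * one of the two tasks might be able to steal memory from the other.
 **/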
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

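/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's cpuset name and cached copy of its
 * mems_allowed to the kernel log.
 */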
void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	pr_cont(",cpuset=");
	pr_cont_cgroup_name(cgrp);
	pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

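/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */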
int cpuset_memory_pressure_enabled __read_mostly;

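/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset, by
 * marking an event on the fmeter of the cpuset the current task
 * is attached to.  Intended to be called from the page reclaim path.
 */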
void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
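/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */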
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	css = task_get_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				current->nsproxy->cgroup_ns);
	css_put(css);
	if (retval >= PATH_MAX)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif

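/* Display task mems_allowed in /proc/<pid>/status file. */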
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}