/*
 *  kernel/cgroup/cpuset.c
 *
 *  Processor and memory placement constraints for sets of tasks
 *  (the cpuset cgroup controller).
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * Set once a cpuset mems configuration consisting only of nodes with
 * movable memory is detected; tested via cpusets_insane_config().
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/* See "Frequency meter" comments, below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * CPUs allocated to child sub-partitions (subparts_cpus)
	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
	 * - effective_cpus and subparts_cpus are mutually exclusive.
	 *
	 * effective_cpus contains only onlined CPUs, but subparts_cpus
	 * may have offlined ones.
	 */
	cpumask_var_t subparts_cpus;

	/*
	 * This is old Memory Nodes tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of CPUs in subparts_cpus */
	int nr_subparts_cpus;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus  - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;

	/* Handle for cpuset.cpus.partition */
	struct cgroup_file partition_file;
};

/*
 * Partition root states:
 *
 *   0 - not a partition root
 *
 *   1 - partition root
 *
 *  -1 - invalid partition root
 *       None of the cpus in cpus_allowed can be put into the parent's
 *       subparts_cpus.  In this case, the cpuset is not a real partition
 *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set.
 *       The cpuset can be reverted back to a partition root if the parent
 *       cpuset can give more CPUs back to this child cpuset.
 */
#define PRS_DISABLED		0
#define PRS_ENABLED		1
#define PRS_ERROR		-1

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid having to allocate them on the stack.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_root(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

/*
 * Send notification event whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs,
					   int old_prs, int new_prs)
{
	if (old_prs != new_prs)
		cgroup_file_notify(&cs->partition_file);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ENABLED,
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current cpuset
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in
 * the iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_rwsem and
 * callback_lock.  Modifying a cpuset requires holding cpuset_rwsem for
 * write plus taking callback_lock (with interrupts disabled) around the
 * actual mask and flag updates.  Readers that only need a stable snapshot
 * of cpus_allowed/mems_allowed and friends may take just callback_lock.
 * cpuset_read_lock()/cpuset_read_unlock() let other kernel subsystems take
 * cpuset_rwsem for read to hold off cpuset changes.
 */

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);

void cpuset_read_lock(void)
{
	percpu_down_read(&cpuset_rwsem);
}

void cpuset_read_unlock(void)
{
	percpu_up_read(&cpuset_rwsem);
}

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

static inline void check_insane_mems_config(nodemask_t *nodes)
{
	if (!cpusets_insane_config() &&
	    movable_only_nodes(nodes)) {
		static_branch_enable(&cpusets_insane_config_key);
		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
			"Cpuset allocations might fail even with a lot of memory available.\n",
			nodemask_pr_args(nodes));
	}
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on the default hierarchy or when the cpuset_v2_mode flag is set by
 * mounting the "cpuset" cgroupfs with the "cpuset_v2_mode" option.  In
 * this mode, the user-configured and effective masks are tracked
 * separately.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/*
 * Return in pmask the portion of a task's cpusets's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
		cpumask_copy(pmask, cpu_online_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset may not have any online cpu due to
			 * a race between cpuset_hotplug_work and the cpu
			 * hotplug callbacks, but its effective_cpus is on
			 * its way to becoming identical to cpu_online_mask.
			 */
			goto out_unlock;
		}
	}
	cpumask_and(pmask, pmask, cs->effective_cpus);

out_unlock:
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flags if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_rwsem.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three cpumasks for a cpuset or a tmpmasks
 * @cs:  the cpuset whose cpumasks are to be allocated
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->subparts_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	return 0;

free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset or a tmpmasks structure
 * @cs:  the cpuset whose cpumasks are to be freed
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->subparts_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_rwsem held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs: a set of
 * non-overlapping cpumasks, one per scheduler domain to be load balanced.
 *
 * Each cpuset that has sched_load_balance enabled, a non-empty set of
 * effective CPUs and some housekeeping (non-isolated) CPUs among its
 * allowed CPUs contributes a candidate domain; candidates whose CPU sets
 * overlap are merged into the same domain.  In the common case where the
 * root cpuset is load balanced and has no sub-partitions, a single domain
 * covering top_cpuset's effective housekeeping CPUs is generated.
 *
 * The key local variables below are:
 *    doms  - ordered array of cpumasks, one per sched domain
 *    csa   - array of pointers to the cpusets considered for domains
 *    csn   - number of entries in csa[]
 *    dattr - per-domain attributes (relax_domain_level)
 *
 * The result is meant to be handed to partition_sched_domains() (via
 * partition_and_rebuild_sched_domains()) to rebuild the scheduler's load
 * balancing domains.  Must be called with cpuset_rwsem held.
 */
783static int generate_sched_domains(cpumask_var_t **domains,
784 struct sched_domain_attr **attributes)
785{
786 struct cpuset *cp;
787 struct cpuset **csa;
788 int csn;
789 int i, j, k;
790 cpumask_var_t *doms;
791 struct sched_domain_attr *dattr;
792 int ndoms = 0;
793 int nslot;
794 struct cgroup_subsys_state *pos_css;
795 bool root_load_balance = is_sched_load_balance(&top_cpuset);
796
797 doms = NULL;
798 dattr = NULL;
799 csa = NULL;
800
801
802 if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
803 ndoms = 1;
804 doms = alloc_sched_domains(ndoms);
805 if (!doms)
806 goto done;
807
808 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
809 if (dattr) {
810 *dattr = SD_ATTR_INIT;
811 update_domain_attr_tree(dattr, &top_cpuset);
812 }
813 cpumask_and(doms[0], top_cpuset.effective_cpus,
814 housekeeping_cpumask(HK_FLAG_DOMAIN));
815
816 goto done;
817 }
818
819 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
820 if (!csa)
821 goto done;
822 csn = 0;
823
824 rcu_read_lock();
825 if (root_load_balance)
826 csa[csn++] = &top_cpuset;
827 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
828 if (cp == &top_cpuset)
829 continue;
830
831
832
833
834
835
836
837
838
839
840
841 if (!cpumask_empty(cp->cpus_allowed) &&
842 !(is_sched_load_balance(cp) &&
843 cpumask_intersects(cp->cpus_allowed,
844 housekeeping_cpumask(HK_FLAG_DOMAIN))))
845 continue;
846
847 if (root_load_balance &&
848 cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
849 continue;
850
851 if (is_sched_load_balance(cp) &&
852 !cpumask_empty(cp->effective_cpus))
853 csa[csn++] = cp;
854
855
856 if (!is_partition_root(cp))
857 pos_css = css_rightmost_descendant(pos_css);
858 }
859 rcu_read_unlock();
860
861 for (i = 0; i < csn; i++)
862 csa[i]->pn = i;
863 ndoms = csn;
864
865restart:
866
867 for (i = 0; i < csn; i++) {
868 struct cpuset *a = csa[i];
869 int apn = a->pn;
870
871 for (j = 0; j < csn; j++) {
872 struct cpuset *b = csa[j];
873 int bpn = b->pn;
874
875 if (apn != bpn && cpusets_overlap(a, b)) {
876 for (k = 0; k < csn; k++) {
877 struct cpuset *c = csa[k];
878
879 if (c->pn == bpn)
880 c->pn = apn;
881 }
882 ndoms--;
883 goto restart;
884 }
885 }
886 }
887
888
889
890
891
892 doms = alloc_sched_domains(ndoms);
893 if (!doms)
894 goto done;
895
896
897
898
899
900 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
901 GFP_KERNEL);
902
903 for (nslot = 0, i = 0; i < csn; i++) {
904 struct cpuset *a = csa[i];
905 struct cpumask *dp;
906 int apn = a->pn;
907
908 if (apn < 0) {
909
910 continue;
911 }
912
913 dp = doms[nslot];
914
915 if (nslot == ndoms) {
916 static int warnings = 10;
917 if (warnings) {
918 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
919 nslot, ndoms, csn, i, apn);
920 warnings--;
921 }
922 continue;
923 }
924
925 cpumask_clear(dp);
926 if (dattr)
927 *(dattr + nslot) = SD_ATTR_INIT;
928 for (j = i; j < csn; j++) {
929 struct cpuset *b = csa[j];
930
931 if (apn == b->pn) {
932 cpumask_or(dp, dp, b->effective_cpus);
933 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
934 if (dattr)
935 update_domain_attr_tree(dattr + nslot, b);
936
937
938 b->pn = -1;
939 }
940 }
941 nslot++;
942 }
943 BUG_ON(nslot != ndoms);
944
945done:
946 kfree(csa);
947
948
949
950
951
952 if (doms == NULL)
953 ndoms = 1;
954
955 *domains = doms;
956 *attributes = dattr;
957 return ndoms;
958}
959
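/*
 * Re-add the SCHED_DEADLINE bandwidth of every task in @cs to the root
 * domain its CPUs currently belong to.
 */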
960static void update_tasks_root_domain(struct cpuset *cs)
961{
962 struct css_task_iter it;
963 struct task_struct *task;
964
965 css_task_iter_start(&cs->css, 0, &it);
966
967 while ((task = css_task_iter_next(&it)))
968 dl_add_task_root_domain(task);
969
970 css_task_iter_end(&it);
971}
972
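/*
 * Clear the SCHED_DEADLINE accounting of the default root domain and
 * re-add the bandwidth of the deadline tasks in every non-empty cpuset.
 * Called while rebuilding sched domains, with cpuset_rwsem, the cpu
 * hotplug lock and sched_domains_mutex held (see the asserts below).
 */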
973static void rebuild_root_domains(void)
974{
975 struct cpuset *cs = NULL;
976 struct cgroup_subsys_state *pos_css;
977
978 percpu_rwsem_assert_held(&cpuset_rwsem);
979 lockdep_assert_cpus_held();
980 lockdep_assert_held(&sched_domains_mutex);
981
982 rcu_read_lock();
983
984
985
986
987
988 dl_clear_root_domain(&def_root_domain);
989
990 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
991
992 if (cpumask_empty(cs->effective_cpus)) {
993 pos_css = css_rightmost_descendant(pos_css);
994 continue;
995 }
996
997 css_get(&cs->css);
998
999 rcu_read_unlock();
1000
1001 update_tasks_root_domain(cs);
1002
1003 rcu_read_lock();
1004 css_put(&cs->css);
1005 }
1006 rcu_read_unlock();
1007}
1008
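/*
 * Repartition the sched domains and then rebuild the root-domain
 * SCHED_DEADLINE accounting, both under sched_domains_mutex.
 */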
1009static void
1010partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1011 struct sched_domain_attr *dattr_new)
1012{
1013 mutex_lock(&sched_domains_mutex);
1014 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
1015 rebuild_root_domains();
1016 mutex_unlock(&sched_domains_mutex);
1017}
1018
/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_rwsem held; the caller must also hold cpus_read_lock().
 */
1030static void rebuild_sched_domains_locked(void)
1031{
1032 struct cgroup_subsys_state *pos_css;
1033 struct sched_domain_attr *attr;
1034 cpumask_var_t *doms;
1035 struct cpuset *cs;
1036 int ndoms;
1037
1038 lockdep_assert_cpus_held();
1039 percpu_rwsem_assert_held(&cpuset_rwsem);
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050 if (!top_cpuset.nr_subparts_cpus &&
1051 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1052 return;
1053
1054
1055
1056
1057
1058
1059 if (top_cpuset.nr_subparts_cpus) {
1060 rcu_read_lock();
1061 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1062 if (!is_partition_root(cs)) {
1063 pos_css = css_rightmost_descendant(pos_css);
1064 continue;
1065 }
1066 if (!cpumask_subset(cs->effective_cpus,
1067 cpu_active_mask)) {
1068 rcu_read_unlock();
1069 return;
1070 }
1071 }
1072 rcu_read_unlock();
1073 }
1074
1075
1076 ndoms = generate_sched_domains(&doms, &attr);
1077
1078
1079 partition_and_rebuild_sched_domains(ndoms, doms, attr);
1080}
1081#else
1082static void rebuild_sched_domains_locked(void)
1083{
1084}
1085#endif
1086
1087void rebuild_sched_domains(void)
1088{
1089 cpus_read_lock();
1090 percpu_down_write(&cpuset_rwsem);
1091 rebuild_sched_domains_locked();
1092 percpu_up_write(&cpuset_rwsem);
1093 cpus_read_unlock();
1094}
1095
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
1104static void update_tasks_cpumask(struct cpuset *cs)
1105{
1106 struct css_task_iter it;
1107 struct task_struct *task;
1108
1109 css_task_iter_start(&cs->css, 0, &it);
1110 while ((task = css_task_iter_next(&it)))
1111 set_cpus_allowed_ptr(task, cs->effective_cpus);
1112 css_task_iter_end(&it);
1113}
1114
/*
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective cpumask
 * @cs: the cpuset that needs to recompute its new effective cpumask
 * @parent: the parent cpuset
 *
 * If the parent has subpartition CPUs, include them in the list of
 * allowable CPUs when computing the new effective_cpus mask.  Since
 * offlined CPUs are not removed from subparts_cpus, we have to use
 * cpu_active_mask to mask those out.
 */
1126static void compute_effective_cpumask(struct cpumask *new_cpus,
1127 struct cpuset *cs, struct cpuset *parent)
1128{
1129 if (parent->nr_subparts_cpus) {
1130 cpumask_or(new_cpus, parent->effective_cpus,
1131 parent->subparts_cpus);
1132 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
1133 cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1134 } else {
1135 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1136 }
1137}
1138
1139
1140
1141
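/* Commands handled by update_parent_subparts_cpumask(). */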
1142enum subparts_cmd {
1143 partcmd_enable,
1144 partcmd_disable,
1145 partcmd_update,
1146};
1147
/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root.  The CPUs in the cpuset's cpus_allowed are
 * taken away from the parent's effective_cpus and added to the parent's
 * subparts_cpus.  An error code is returned if the CPUs cannot be granted.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root.  Any CPUs in cpus_allowed that are
 * also in the parent's subparts_cpus are removed from that mask and given
 * back to the parent's effective_cpus.
 *
 * For partcmd_update, if @newmask is set, the cpu list is to be changed
 * from cpus_allowed to @newmask.  Otherwise, cpus_allowed is assumed to
 * remain the same.  The partition root state may change (PRS_ENABLED <->
 * PRS_ERROR) if the update makes the partition invalid or valid again;
 * in that case 1 is returned so that the caller can refresh the parent's
 * tasks.
 *
 * partcmd_enable and partcmd_disable are used by update_prstate();
 * partcmd_update is used by update_cpumasks_hier() (with a NULL newmask)
 * and update_cpumask() (with newmask set).
 *
 * Because of the implicit cpu-exclusive nature of a partition root,
 * cpumask changes that violate the cpu exclusivity rule are not permitted
 * when checked by validate_change().
 */
1190static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1191 struct cpumask *newmask,
1192 struct tmpmasks *tmp)
1193{
1194 struct cpuset *parent = parent_cs(cpuset);
1195 int adding;
1196 int deleting;
1197 int old_prs, new_prs;
1198 bool part_error = false;
1199
1200 percpu_rwsem_assert_held(&cpuset_rwsem);
1201
1202
1203
1204
1205
1206
1207 if (!is_partition_root(parent) ||
1208 (newmask && cpumask_empty(newmask)) ||
1209 (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1210 return -EINVAL;
1211
1212
1213
1214
1215
1216 if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1217 return -EBUSY;
1218
1219
1220
1221
1222
1223
1224 if ((cmd == partcmd_enable) &&
1225 (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1226 cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1227 return -EINVAL;
1228
1229
1230
1231
1232 adding = deleting = false;
1233 old_prs = new_prs = cpuset->partition_root_state;
1234 if (cmd == partcmd_enable) {
1235 cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1236 adding = true;
1237 } else if (cmd == partcmd_disable) {
1238 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1239 parent->subparts_cpus);
1240 } else if (newmask) {
1241
1242
1243
1244
1245
1246
1247
1248 cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1249 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1250 parent->subparts_cpus);
1251
1252 cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1253 adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1254 parent->subparts_cpus);
1255
1256
1257
1258 if (adding &&
1259 cpumask_equal(parent->effective_cpus, tmp->addmask)) {
1260 if (!deleting)
1261 return -EINVAL;
1262
1263
1264
1265
1266
1267 if (!cpumask_and(tmp->addmask, tmp->delmask,
1268 cpu_active_mask))
1269 return -EINVAL;
1270 cpumask_copy(tmp->addmask, parent->effective_cpus);
1271 }
1272 } else {
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282 adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1283 parent->effective_cpus);
1284 part_error = cpumask_equal(tmp->addmask,
1285 parent->effective_cpus);
1286 }
1287
1288 if (cmd == partcmd_update) {
1289 int prev_prs = cpuset->partition_root_state;
1290
1291
1292
1293
1294
1295 switch (cpuset->partition_root_state) {
1296 case PRS_ENABLED:
1297 if (part_error)
1298 new_prs = PRS_ERROR;
1299 break;
1300 case PRS_ERROR:
1301 if (!part_error)
1302 new_prs = PRS_ENABLED;
1303 break;
1304 }
1305
1306
1307
1308 part_error = (prev_prs == PRS_ERROR);
1309 }
1310
1311 if (!part_error && (new_prs == PRS_ERROR))
1312 return 0;
1313
1314 if (new_prs == PRS_ERROR) {
1315
1316
1317
1318 adding = false;
1319 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1320 parent->subparts_cpus);
1321 }
1322
1323 if (!adding && !deleting && (new_prs == old_prs))
1324 return 0;
1325
1326
1327
1328
1329
1330
1331 spin_lock_irq(&callback_lock);
1332 if (adding) {
1333 cpumask_or(parent->subparts_cpus,
1334 parent->subparts_cpus, tmp->addmask);
1335 cpumask_andnot(parent->effective_cpus,
1336 parent->effective_cpus, tmp->addmask);
1337 }
1338 if (deleting) {
1339 cpumask_andnot(parent->subparts_cpus,
1340 parent->subparts_cpus, tmp->delmask);
1341
1342
1343
1344 cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
1345 cpumask_or(parent->effective_cpus,
1346 parent->effective_cpus, tmp->delmask);
1347 }
1348
1349 parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1350
1351 if (old_prs != new_prs)
1352 cpuset->partition_root_state = new_prs;
1353
1354 spin_unlock_irq(&callback_lock);
1355 notify_partition_change(cpuset, old_prs, new_prs);
1356
1357 return cmd == partcmd_update;
1358}
1359
/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_rwsem held
 */
1372static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
1373{
1374 struct cpuset *cp;
1375 struct cgroup_subsys_state *pos_css;
1376 bool need_rebuild_sched_domains = false;
1377 int old_prs, new_prs;
1378
1379 rcu_read_lock();
1380 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1381 struct cpuset *parent = parent_cs(cp);
1382
1383 compute_effective_cpumask(tmp->new_cpus, cp, parent);
1384
1385
1386
1387
1388
1389 if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1390 cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1391 if (!cp->use_parent_ecpus) {
1392 cp->use_parent_ecpus = true;
1393 parent->child_ecpus_count++;
1394 }
1395 } else if (cp->use_parent_ecpus) {
1396 cp->use_parent_ecpus = false;
1397 WARN_ON_ONCE(!parent->child_ecpus_count);
1398 parent->child_ecpus_count--;
1399 }
1400
1401
1402
1403
1404
1405 if (!cp->partition_root_state &&
1406 cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
1407 pos_css = css_rightmost_descendant(pos_css);
1408 continue;
1409 }
1410
1411
1412
1413
1414
1415
1416
1417 old_prs = new_prs = cp->partition_root_state;
1418 if ((cp != cs) && old_prs) {
1419 switch (parent->partition_root_state) {
1420 case PRS_DISABLED:
1421
1422
1423
1424
1425
1426 WARN_ON_ONCE(cp->partition_root_state
1427 != PRS_ERROR);
1428 new_prs = PRS_DISABLED;
1429
1430
1431
1432
1433
1434
1435
1436
1437 clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1438 break;
1439
1440 case PRS_ENABLED:
1441 if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1442 update_tasks_cpumask(parent);
1443 break;
1444
1445 case PRS_ERROR:
1446
1447
1448
1449 new_prs = PRS_ERROR;
1450 break;
1451 }
1452 }
1453
1454 if (!css_tryget_online(&cp->css))
1455 continue;
1456 rcu_read_unlock();
1457
1458 spin_lock_irq(&callback_lock);
1459
1460 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1461 if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
1462 cp->nr_subparts_cpus = 0;
1463 cpumask_clear(cp->subparts_cpus);
1464 } else if (cp->nr_subparts_cpus) {
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474 cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1475 cp->subparts_cpus);
1476 if (cpumask_empty(cp->effective_cpus)) {
1477 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1478 cpumask_clear(cp->subparts_cpus);
1479 cp->nr_subparts_cpus = 0;
1480 } else if (!cpumask_subset(cp->subparts_cpus,
1481 tmp->new_cpus)) {
1482 cpumask_andnot(cp->subparts_cpus,
1483 cp->subparts_cpus, tmp->new_cpus);
1484 cp->nr_subparts_cpus
1485 = cpumask_weight(cp->subparts_cpus);
1486 }
1487 }
1488
1489 if (new_prs != old_prs)
1490 cp->partition_root_state = new_prs;
1491
1492 spin_unlock_irq(&callback_lock);
1493 notify_partition_change(cp, old_prs, new_prs);
1494
1495 WARN_ON(!is_in_v2_mode() &&
1496 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
1497
1498 update_tasks_cpumask(cp);
1499
1500
1501
1502
1503
1504
1505
1506 if (!cpumask_empty(cp->cpus_allowed) &&
1507 is_sched_load_balance(cp) &&
1508 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1509 is_partition_root(cp)))
1510 need_rebuild_sched_domains = true;
1511
1512 rcu_read_lock();
1513 css_put(&cp->css);
1514 }
1515 rcu_read_unlock();
1516
1517 if (need_rebuild_sched_domains)
1518 rebuild_sched_domains_locked();
1519}
1520
1521
1522
1523
1524
1525
1526
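/*
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 *
 * Siblings that inherit the parent's effective_cpus need their effective
 * cpumasks recomputed when the parent's partition CPUs change.  Called
 * with cpuset_rwsem held.
 */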
1527static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1528 struct tmpmasks *tmp)
1529{
1530 struct cpuset *sibling;
1531 struct cgroup_subsys_state *pos_css;
1532
1533
1534
1535
1536
1537
1538 rcu_read_lock();
1539 cpuset_for_each_child(sibling, pos_css, parent) {
1540 if (sibling == cs)
1541 continue;
1542 if (!sibling->use_parent_ecpus)
1543 continue;
1544
1545 update_cpumasks_hier(sibling, tmp);
1546 }
1547 rcu_read_unlock();
1548}
1549
1550
1551
1552
1553
1554
1555
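/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */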
1556static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1557 const char *buf)
1558{
1559 int retval;
1560 struct tmpmasks tmp;
1561
1562
1563 if (cs == &top_cpuset)
1564 return -EACCES;
1565
1566
1567
1568
1569
1570
1571
1572 if (!*buf) {
1573 cpumask_clear(trialcs->cpus_allowed);
1574 } else {
1575 retval = cpulist_parse(buf, trialcs->cpus_allowed);
1576 if (retval < 0)
1577 return retval;
1578
1579 if (!cpumask_subset(trialcs->cpus_allowed,
1580 top_cpuset.cpus_allowed))
1581 return -EINVAL;
1582 }
1583
1584
1585 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
1586 return 0;
1587
1588 retval = validate_change(cs, trialcs);
1589 if (retval < 0)
1590 return retval;
1591
1592#ifdef CONFIG_CPUMASK_OFFSTACK
1593
1594
1595
1596
1597 tmp.addmask = trialcs->subparts_cpus;
1598 tmp.delmask = trialcs->effective_cpus;
1599 tmp.new_cpus = trialcs->cpus_allowed;
1600#endif
1601
1602 if (cs->partition_root_state) {
1603
1604 if (cpumask_empty(trialcs->cpus_allowed))
1605 return -EINVAL;
1606 if (update_parent_subparts_cpumask(cs, partcmd_update,
1607 trialcs->cpus_allowed, &tmp) < 0)
1608 return -EINVAL;
1609 }
1610
1611 spin_lock_irq(&callback_lock);
1612 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
1613
1614
1615
1616
1617 if (cs->nr_subparts_cpus) {
1618 cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
1619 cs->cpus_allowed);
1620 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1621 }
1622 spin_unlock_irq(&callback_lock);
1623
1624 update_cpumasks_hier(cs, &tmp);
1625
1626 if (cs->partition_root_state) {
1627 struct cpuset *parent = parent_cs(cs);
1628
1629
1630
1631
1632
1633 if (parent->child_ecpus_count)
1634 update_sibling_cpumasks(parent, cs, &tmp);
1635 }
1636 return 0;
1637}
1638
1639
1640
1641
1642
1643
1644
1645
1646
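/*
 * Migrate memory region from one set of nodes to another.  The migration
 * itself is done asynchronously on cpuset_migrate_mm_wq so that the caller
 * doesn't block on do_migrate_pages(); the mm reference handed to
 * cpuset_migrate_mm() is dropped once the work item completes.
 */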
1647struct cpuset_migrate_mm_work {
1648 struct work_struct work;
1649 struct mm_struct *mm;
1650 nodemask_t from;
1651 nodemask_t to;
1652};
1653
1654static void cpuset_migrate_mm_workfn(struct work_struct *work)
1655{
1656 struct cpuset_migrate_mm_work *mwork =
1657 container_of(work, struct cpuset_migrate_mm_work, work);
1658
1659
1660 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1661 mmput(mwork->mm);
1662 kfree(mwork);
1663}
1664
1665static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1666 const nodemask_t *to)
1667{
1668 struct cpuset_migrate_mm_work *mwork;
1669
1670 if (nodes_equal(*from, *to)) {
1671 mmput(mm);
1672 return;
1673 }
1674
1675 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1676 if (mwork) {
1677 mwork->mm = mm;
1678 mwork->from = *from;
1679 mwork->to = *to;
1680 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1681 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1682 } else {
1683 mmput(mm);
1684 }
1685}
1686
1687static void cpuset_post_attach(void)
1688{
1689 flush_workqueue(cpuset_migrate_mm_wq);
1690}
1691
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqcount to safely update both
 * tsk->mems_allowed and the task's mempolicy.  A page allocation racing
 * with the rebind retries on the seqcount instead of acting on a
 * transiently inconsistent nodemask.
 */
1702static void cpuset_change_task_nodemask(struct task_struct *tsk,
1703 nodemask_t *newmems)
1704{
1705 task_lock(tsk);
1706
1707 local_irq_disable();
1708 write_seqcount_begin(&tsk->mems_allowed_seq);
1709
1710 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1711 mpol_rebind_task(tsk, newmems);
1712 tsk->mems_allowed = *newmems;
1713
1714 write_seqcount_end(&tsk->mems_allowed_seq);
1715 local_irq_enable();
1716
1717 task_unlock(tsk);
1718}
1719
1720static void *cpuset_being_rebound;
1721
/*
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
1730static void update_tasks_nodemask(struct cpuset *cs)
1731{
1732 static nodemask_t newmems;
1733 struct css_task_iter it;
1734 struct task_struct *task;
1735
1736 cpuset_being_rebound = cs;
1737
1738 guarantee_online_mems(cs, &newmems);
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750 css_task_iter_start(&cs->css, 0, &it);
1751 while ((task = css_task_iter_next(&it))) {
1752 struct mm_struct *mm;
1753 bool migrate;
1754
1755 cpuset_change_task_nodemask(task, &newmems);
1756
1757 mm = get_task_mm(task);
1758 if (!mm)
1759 continue;
1760
1761 migrate = is_memory_migrate(cs);
1762
1763 mpol_rebind_mm(mm, &cs->mems_allowed);
1764 if (migrate)
1765 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1766 else
1767 mmput(mm);
1768 }
1769 css_task_iter_end(&it);
1770
1771
1772
1773
1774
1775 cs->old_mems_allowed = newmems;
1776
1777
1778 cpuset_being_rebound = NULL;
1779}
1780
/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_rwsem held
 */
1793static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1794{
1795 struct cpuset *cp;
1796 struct cgroup_subsys_state *pos_css;
1797
1798 rcu_read_lock();
1799 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1800 struct cpuset *parent = parent_cs(cp);
1801
1802 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1803
1804
1805
1806
1807
1808 if (is_in_v2_mode() && nodes_empty(*new_mems))
1809 *new_mems = parent->effective_mems;
1810
1811
1812 if (nodes_equal(*new_mems, cp->effective_mems)) {
1813 pos_css = css_rightmost_descendant(pos_css);
1814 continue;
1815 }
1816
1817 if (!css_tryget_online(&cp->css))
1818 continue;
1819 rcu_read_unlock();
1820
1821 spin_lock_irq(&callback_lock);
1822 cp->effective_mems = *new_mems;
1823 spin_unlock_irq(&callback_lock);
1824
1825 WARN_ON(!is_in_v2_mode() &&
1826 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1827
1828 update_tasks_nodemask(cp);
1829
1830 rcu_read_lock();
1831 css_put(&cp->css);
1832 }
1833 rcu_read_unlock();
1834}
1835
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies; and if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_rwsem held.  May take callback_lock during call.
 */
1849static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1850 const char *buf)
1851{
1852 int retval;
1853
1854
1855
1856
1857
1858 if (cs == &top_cpuset) {
1859 retval = -EACCES;
1860 goto done;
1861 }
1862
1863
1864
1865
1866
1867
1868
1869 if (!*buf) {
1870 nodes_clear(trialcs->mems_allowed);
1871 } else {
1872 retval = nodelist_parse(buf, trialcs->mems_allowed);
1873 if (retval < 0)
1874 goto done;
1875
1876 if (!nodes_subset(trialcs->mems_allowed,
1877 top_cpuset.mems_allowed)) {
1878 retval = -EINVAL;
1879 goto done;
1880 }
1881 }
1882
1883 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1884 retval = 0;
1885 goto done;
1886 }
1887 retval = validate_change(cs, trialcs);
1888 if (retval < 0)
1889 goto done;
1890
1891 check_insane_mems_config(&trialcs->mems_allowed);
1892
1893 spin_lock_irq(&callback_lock);
1894 cs->mems_allowed = trialcs->mems_allowed;
1895 spin_unlock_irq(&callback_lock);
1896
1897
1898 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1899done:
1900 return retval;
1901}
1902
1903bool current_cpuset_is_being_rebound(void)
1904{
1905 bool ret;
1906
1907 rcu_read_lock();
1908 ret = task_cs(current) == cpuset_being_rebound;
1909 rcu_read_unlock();
1910
1911 return ret;
1912}
1913
1914static int update_relax_domain_level(struct cpuset *cs, s64 val)
1915{
1916#ifdef CONFIG_SMP
1917 if (val < -1 || val >= sched_domain_level_max)
1918 return -EINVAL;
1919#endif
1920
1921 if (val != cs->relax_domain_level) {
1922 cs->relax_domain_level = val;
1923 if (!cpumask_empty(cs->cpus_allowed) &&
1924 is_sched_load_balance(cs))
1925 rebuild_sched_domains_locked();
1926 }
1927
1928 return 0;
1929}
1930
/*
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_rwsem held, cpuset membership stays
 * stable.
 */
1939static void update_tasks_flags(struct cpuset *cs)
1940{
1941 struct css_task_iter it;
1942 struct task_struct *task;
1943
1944 css_task_iter_start(&cs->css, 0, &it);
1945 while ((task = css_task_iter_next(&it)))
1946 cpuset_update_task_spread_flag(cs, task);
1947 css_task_iter_end(&it);
1948}
1949
/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_rwsem held.
 */
1959static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1960 int turning_on)
1961{
1962 struct cpuset *trialcs;
1963 int balance_flag_changed;
1964 int spread_flag_changed;
1965 int err;
1966
1967 trialcs = alloc_trial_cpuset(cs);
1968 if (!trialcs)
1969 return -ENOMEM;
1970
1971 if (turning_on)
1972 set_bit(bit, &trialcs->flags);
1973 else
1974 clear_bit(bit, &trialcs->flags);
1975
1976 err = validate_change(cs, trialcs);
1977 if (err < 0)
1978 goto out;
1979
1980 balance_flag_changed = (is_sched_load_balance(cs) !=
1981 is_sched_load_balance(trialcs));
1982
1983 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1984 || (is_spread_page(cs) != is_spread_page(trialcs)));
1985
1986 spin_lock_irq(&callback_lock);
1987 cs->flags = trialcs->flags;
1988 spin_unlock_irq(&callback_lock);
1989
1990 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1991 rebuild_sched_domains_locked();
1992
1993 if (spread_flag_changed)
1994 update_tasks_flags(cs);
1995out:
1996 free_cpuset(trialcs);
1997 return err;
1998}
1999
/*
 * update_prstate - update partition_root_state
 * @cs: the cpuset to update
 * @new_prs: new partition root state
 *
 * Call with cpuset_rwsem held.
 */
2007static int update_prstate(struct cpuset *cs, int new_prs)
2008{
2009 int err, old_prs = cs->partition_root_state;
2010 struct cpuset *parent = parent_cs(cs);
2011 struct tmpmasks tmpmask;
2012
2013 if (old_prs == new_prs)
2014 return 0;
2015
2016
2017
2018
2019
2020 if (new_prs && (old_prs == PRS_ERROR))
2021 return -EINVAL;
2022
2023 if (alloc_cpumasks(NULL, &tmpmask))
2024 return -ENOMEM;
2025
2026 err = -EINVAL;
2027 if (!old_prs) {
2028
2029
2030
2031
2032
2033 if (cpumask_empty(cs->cpus_allowed))
2034 goto out;
2035
2036 err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
2037 if (err)
2038 goto out;
2039
2040 err = update_parent_subparts_cpumask(cs, partcmd_enable,
2041 NULL, &tmpmask);
2042 if (err) {
2043 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2044 goto out;
2045 }
2046 } else {
2047
2048
2049
2050
2051 if (old_prs == PRS_ERROR) {
2052 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2053 err = 0;
2054 goto out;
2055 }
2056
2057 err = update_parent_subparts_cpumask(cs, partcmd_disable,
2058 NULL, &tmpmask);
2059 if (err)
2060 goto out;
2061
2062
2063 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2064 }
2065
2066
2067
2068
2069
2070 if (parent != &top_cpuset)
2071 update_tasks_cpumask(parent);
2072
2073 if (parent->child_ecpus_count)
2074 update_sibling_cpumasks(parent, cs, &tmpmask);
2075
2076 rebuild_sched_domains_locked();
2077out:
2078 if (!err) {
2079 spin_lock_irq(&callback_lock);
2080 cs->partition_root_state = new_prs;
2081 spin_unlock_irq(&callback_lock);
2082 notify_partition_change(cs, old_prs, new_prs);
2083 }
2084
2085 free_cpumasks(NULL, &tmpmask);
2086 return err;
2087}
2088
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure (struct fmeter) is passed to each of these
 * routines to keep track of the state needed by the filter.
 *
 * The filter works on the number of events marked per unit time.
 * It is a single-pole low-pass recursive (IIR) filter with a time
 * unit of 1 second.  Arithmetic is done using 32-bit integers scaled
 * by FM_SCALE (1000) to simulate 3 decimal digits of precision.
 *
 * With an FM_COEF of 933 and a time base of 1 second, the filter has
 * a half-life of about 10 seconds: if the events quit happening, the
 * rate returned by fmeter_getrate() is cut roughly in half every 10
 * seconds until it converges to zero.  FM_MAXTICKS bounds the decay
 * loop and FM_MAXCNT caps the pending event count.
 */
2134#define FM_COEF 933
2135#define FM_MAXTICKS ((u32)99)
2136#define FM_MAXCNT 1000000
2137#define FM_SCALE 1000
2138
2139
2140static void fmeter_init(struct fmeter *fmp)
2141{
2142 fmp->cnt = 0;
2143 fmp->val = 0;
2144 fmp->time = 0;
2145 spin_lock_init(&fmp->lock);
2146}
2147
2148
2149static void fmeter_update(struct fmeter *fmp)
2150{
2151 time64_t now;
2152 u32 ticks;
2153
2154 now = ktime_get_seconds();
2155 ticks = now - fmp->time;
2156
2157 if (ticks == 0)
2158 return;
2159
2160 ticks = min(FM_MAXTICKS, ticks);
2161 while (ticks-- > 0)
2162 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
2163 fmp->time = now;
2164
2165 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
2166 fmp->cnt = 0;
2167}
2168
2169
2170static void fmeter_markevent(struct fmeter *fmp)
2171{
2172 spin_lock(&fmp->lock);
2173 fmeter_update(fmp);
2174 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
2175 spin_unlock(&fmp->lock);
2176}
2177
2178
2179static int fmeter_getrate(struct fmeter *fmp)
2180{
2181 int val;
2182
2183 spin_lock(&fmp->lock);
2184 fmeter_update(fmp);
2185 val = fmp->val;
2186 spin_unlock(&fmp->lock);
2187 return val;
2188}
2189
2190static struct cpuset *cpuset_attach_old_cs;
2191
2192
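/*
 * Called by the cgroup core to check whether the tasks in @tset may be
 * moved into this cpuset.  Fails if the cpuset has no cpus or mems (on
 * the legacy hierarchy) or if any task cannot be migrated.
 */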
2193static int cpuset_can_attach(struct cgroup_taskset *tset)
2194{
2195 struct cgroup_subsys_state *css;
2196 struct cpuset *cs;
2197 struct task_struct *task;
2198 int ret;
2199
2200
2201 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2202 cs = css_cs(css);
2203
2204 percpu_down_write(&cpuset_rwsem);
2205
2206
2207 ret = -ENOSPC;
2208 if (!is_in_v2_mode() &&
2209 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
2210 goto out_unlock;
2211
2212 cgroup_taskset_for_each(task, css, tset) {
2213 ret = task_can_attach(task, cs->cpus_allowed);
2214 if (ret)
2215 goto out_unlock;
2216 ret = security_task_setscheduler(task);
2217 if (ret)
2218 goto out_unlock;
2219 }
2220
2221
2222
2223
2224
2225 cs->attach_in_progress++;
2226 ret = 0;
2227out_unlock:
2228 percpu_up_write(&cpuset_rwsem);
2229 return ret;
2230}
2231
2232static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2233{
2234 struct cgroup_subsys_state *css;
2235
2236 cgroup_taskset_first(tset, &css);
2237
2238 percpu_down_write(&cpuset_rwsem);
2239 css_cs(css)->attach_in_progress--;
2240 percpu_up_write(&cpuset_rwsem);
2241}
2242
2243
2244
2245
2246
2247
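/*
 * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
 * but is allocated once in cpuset_init() rather than per attach.
 */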
2248static cpumask_var_t cpus_attach;
2249
2250static void cpuset_attach(struct cgroup_taskset *tset)
2251{
2252
2253 static nodemask_t cpuset_attach_nodemask_to;
2254 struct task_struct *task;
2255 struct task_struct *leader;
2256 struct cgroup_subsys_state *css;
2257 struct cpuset *cs;
2258 struct cpuset *oldcs = cpuset_attach_old_cs;
2259
2260 cgroup_taskset_first(tset, &css);
2261 cs = css_cs(css);
2262
2263 percpu_down_write(&cpuset_rwsem);
2264
2265 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
2266
2267 cgroup_taskset_for_each(task, css, tset) {
2268 if (cs != &top_cpuset)
2269 guarantee_online_cpus(task, cpus_attach);
2270 else
2271 cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
2272
2273
2274
2275
2276 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2277
2278 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
2279 cpuset_update_task_spread_flag(cs, task);
2280 }
2281
2282
2283
2284
2285
2286 cpuset_attach_nodemask_to = cs->effective_mems;
2287 cgroup_taskset_for_each_leader(leader, css, tset) {
2288 struct mm_struct *mm = get_task_mm(leader);
2289
2290 if (mm) {
2291 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301 if (is_memory_migrate(cs))
2302 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
2303 &cpuset_attach_nodemask_to);
2304 else
2305 mmput(mm);
2306 }
2307 }
2308
2309 cs->old_mems_allowed = cpuset_attach_nodemask_to;
2310
2311 cs->attach_in_progress--;
2312 if (!cs->attach_in_progress)
2313 wake_up(&cpuset_attach_wq);
2314
2315 percpu_up_write(&cpuset_rwsem);
2316}
2317
2318
2319
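/* The type of cpuset file, stored in the cftype ->private field. */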
2320typedef enum {
2321 FILE_MEMORY_MIGRATE,
2322 FILE_CPULIST,
2323 FILE_MEMLIST,
2324 FILE_EFFECTIVE_CPULIST,
2325 FILE_EFFECTIVE_MEMLIST,
2326 FILE_SUBPARTS_CPULIST,
2327 FILE_CPU_EXCLUSIVE,
2328 FILE_MEM_EXCLUSIVE,
2329 FILE_MEM_HARDWALL,
2330 FILE_SCHED_LOAD_BALANCE,
2331 FILE_PARTITION_ROOT,
2332 FILE_SCHED_RELAX_DOMAIN_LEVEL,
2333 FILE_MEMORY_PRESSURE_ENABLED,
2334 FILE_MEMORY_PRESSURE,
2335 FILE_SPREAD_PAGE,
2336 FILE_SPREAD_SLAB,
2337} cpuset_filetype_t;
2338
2339static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2340 u64 val)
2341{
2342 struct cpuset *cs = css_cs(css);
2343 cpuset_filetype_t type = cft->private;
2344 int retval = 0;
2345
2346 cpus_read_lock();
2347 percpu_down_write(&cpuset_rwsem);
2348 if (!is_cpuset_online(cs)) {
2349 retval = -ENODEV;
2350 goto out_unlock;
2351 }
2352
2353 switch (type) {
2354 case FILE_CPU_EXCLUSIVE:
2355 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
2356 break;
2357 case FILE_MEM_EXCLUSIVE:
2358 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
2359 break;
2360 case FILE_MEM_HARDWALL:
2361 retval = update_flag(CS_MEM_HARDWALL, cs, val);
2362 break;
2363 case FILE_SCHED_LOAD_BALANCE:
2364 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
2365 break;
2366 case FILE_MEMORY_MIGRATE:
2367 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
2368 break;
2369 case FILE_MEMORY_PRESSURE_ENABLED:
2370 cpuset_memory_pressure_enabled = !!val;
2371 break;
2372 case FILE_SPREAD_PAGE:
2373 retval = update_flag(CS_SPREAD_PAGE, cs, val);
2374 break;
2375 case FILE_SPREAD_SLAB:
2376 retval = update_flag(CS_SPREAD_SLAB, cs, val);
2377 break;
2378 default:
2379 retval = -EINVAL;
2380 break;
2381 }
2382out_unlock:
2383 percpu_up_write(&cpuset_rwsem);
2384 cpus_read_unlock();
2385 return retval;
2386}
2387
2388static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2389 s64 val)
2390{
2391 struct cpuset *cs = css_cs(css);
2392 cpuset_filetype_t type = cft->private;
2393 int retval = -ENODEV;
2394
2395 cpus_read_lock();
2396 percpu_down_write(&cpuset_rwsem);
2397 if (!is_cpuset_online(cs))
2398 goto out_unlock;
2399
2400 switch (type) {
2401 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2402 retval = update_relax_domain_level(cs, val);
2403 break;
2404 default:
2405 retval = -EINVAL;
2406 break;
2407 }
2408out_unlock:
2409 percpu_up_write(&cpuset_rwsem);
2410 cpus_read_unlock();
2411 return retval;
2412}
2413
2414
2415
2416
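/*
 * Common write handler for cpuset.cpus and cpuset.mems.  The async hotplug
 * work is flushed first so the write operates on up-to-date cpuset state;
 * kernfs active protection is temporarily broken around the flush to avoid
 * deadlocking against the hotplug path.
 */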
2417static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2418 char *buf, size_t nbytes, loff_t off)
2419{
2420 struct cpuset *cs = css_cs(of_css(of));
2421 struct cpuset *trialcs;
2422 int retval = -ENODEV;
2423
2424 buf = strstrip(buf);
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445 css_get(&cs->css);
2446 kernfs_break_active_protection(of->kn);
2447 flush_work(&cpuset_hotplug_work);
2448
2449 cpus_read_lock();
2450 percpu_down_write(&cpuset_rwsem);
2451 if (!is_cpuset_online(cs))
2452 goto out_unlock;
2453
2454 trialcs = alloc_trial_cpuset(cs);
2455 if (!trialcs) {
2456 retval = -ENOMEM;
2457 goto out_unlock;
2458 }
2459
2460 switch (of_cft(of)->private) {
2461 case FILE_CPULIST:
2462 retval = update_cpumask(cs, trialcs, buf);
2463 break;
2464 case FILE_MEMLIST:
2465 retval = update_nodemask(cs, trialcs, buf);
2466 break;
2467 default:
2468 retval = -EINVAL;
2469 break;
2470 }
2471
2472 free_cpuset(trialcs);
2473out_unlock:
2474 percpu_up_write(&cpuset_rwsem);
2475 cpus_read_unlock();
2476 kernfs_unbreak_active_protection(of->kn);
2477 css_put(&cs->css);
2478 flush_workqueue(cpuset_migrate_mm_wq);
2479 return retval ?: nbytes;
2480}
2481
2482
2483
2484
2485
2486
2487
2488
2489
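/*
 * These ascii lists should be read and written with a single system call.
 * Print the cpuset's cpus/mems (configured, effective or subpartition)
 * as a comma-separated ASCII range list, under callback_lock so the masks
 * are stable while being formatted.
 */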
2490static int cpuset_common_seq_show(struct seq_file *sf, void *v)
2491{
2492 struct cpuset *cs = css_cs(seq_css(sf));
2493 cpuset_filetype_t type = seq_cft(sf)->private;
2494 int ret = 0;
2495
2496 spin_lock_irq(&callback_lock);
2497
2498 switch (type) {
2499 case FILE_CPULIST:
2500 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
2501 break;
2502 case FILE_MEMLIST:
2503 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
2504 break;
2505 case FILE_EFFECTIVE_CPULIST:
2506 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
2507 break;
2508 case FILE_EFFECTIVE_MEMLIST:
2509 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
2510 break;
2511 case FILE_SUBPARTS_CPULIST:
2512 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2513 break;
2514 default:
2515 ret = -EINVAL;
2516 }
2517
2518 spin_unlock_irq(&callback_lock);
2519 return ret;
2520}
2521
2522static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
2523{
2524 struct cpuset *cs = css_cs(css);
2525 cpuset_filetype_t type = cft->private;
2526 switch (type) {
2527 case FILE_CPU_EXCLUSIVE:
2528 return is_cpu_exclusive(cs);
2529 case FILE_MEM_EXCLUSIVE:
2530 return is_mem_exclusive(cs);
2531 case FILE_MEM_HARDWALL:
2532 return is_mem_hardwall(cs);
2533 case FILE_SCHED_LOAD_BALANCE:
2534 return is_sched_load_balance(cs);
2535 case FILE_MEMORY_MIGRATE:
2536 return is_memory_migrate(cs);
2537 case FILE_MEMORY_PRESSURE_ENABLED:
2538 return cpuset_memory_pressure_enabled;
2539 case FILE_MEMORY_PRESSURE:
2540 return fmeter_getrate(&cs->fmeter);
2541 case FILE_SPREAD_PAGE:
2542 return is_spread_page(cs);
2543 case FILE_SPREAD_SLAB:
2544 return is_spread_slab(cs);
2545 default:
2546 BUG();
2547 }
2548
2549
2550 return 0;
2551}
2552
2553static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
2554{
2555 struct cpuset *cs = css_cs(css);
2556 cpuset_filetype_t type = cft->private;
2557 switch (type) {
2558 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2559 return cs->relax_domain_level;
2560 default:
2561 BUG();
2562 }
2563
2564
2565 return 0;
2566}
2567
2568static int sched_partition_show(struct seq_file *seq, void *v)
2569{
2570 struct cpuset *cs = css_cs(seq_css(seq));
2571
2572 switch (cs->partition_root_state) {
2573 case PRS_ENABLED:
2574 seq_puts(seq, "root\n");
2575 break;
2576 case PRS_DISABLED:
2577 seq_puts(seq, "member\n");
2578 break;
2579 case PRS_ERROR:
2580 seq_puts(seq, "root invalid\n");
2581 break;
2582 }
2583 return 0;
2584}
2585
2586static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2587 size_t nbytes, loff_t off)
2588{
2589 struct cpuset *cs = css_cs(of_css(of));
2590 int val;
2591 int retval = -ENODEV;
2592
2593 buf = strstrip(buf);
2594
2595
2596
2597
2598 if (!strcmp(buf, "root"))
2599 val = PRS_ENABLED;
2600 else if (!strcmp(buf, "member"))
2601 val = PRS_DISABLED;
2602 else
2603 return -EINVAL;
2604
2605 css_get(&cs->css);
2606 cpus_read_lock();
2607 percpu_down_write(&cpuset_rwsem);
2608 if (!is_cpuset_online(cs))
2609 goto out_unlock;
2610
2611 retval = update_prstate(cs, val);
2612out_unlock:
2613 percpu_up_write(&cpuset_rwsem);
2614 cpus_read_unlock();
2615 css_put(&cs->css);
2616 return retval ?: nbytes;
2617}
2618
2619
2620
2621
2622
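/*
 * Control files on the legacy (v1) cpuset hierarchy.
 * For the common handlers, 'private' gives the type of file.
 */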
2623static struct cftype legacy_files[] = {
2624 {
2625 .name = "cpus",
2626 .seq_show = cpuset_common_seq_show,
2627 .write = cpuset_write_resmask,
2628 .max_write_len = (100U + 6 * NR_CPUS),
2629 .private = FILE_CPULIST,
2630 },
2631
2632 {
2633 .name = "mems",
2634 .seq_show = cpuset_common_seq_show,
2635 .write = cpuset_write_resmask,
2636 .max_write_len = (100U + 6 * MAX_NUMNODES),
2637 .private = FILE_MEMLIST,
2638 },
2639
2640 {
2641 .name = "effective_cpus",
2642 .seq_show = cpuset_common_seq_show,
2643 .private = FILE_EFFECTIVE_CPULIST,
2644 },
2645
2646 {
2647 .name = "effective_mems",
2648 .seq_show = cpuset_common_seq_show,
2649 .private = FILE_EFFECTIVE_MEMLIST,
2650 },
2651
2652 {
2653 .name = "cpu_exclusive",
2654 .read_u64 = cpuset_read_u64,
2655 .write_u64 = cpuset_write_u64,
2656 .private = FILE_CPU_EXCLUSIVE,
2657 },
2658
2659 {
2660 .name = "mem_exclusive",
2661 .read_u64 = cpuset_read_u64,
2662 .write_u64 = cpuset_write_u64,
2663 .private = FILE_MEM_EXCLUSIVE,
2664 },
2665
2666 {
2667 .name = "mem_hardwall",
2668 .read_u64 = cpuset_read_u64,
2669 .write_u64 = cpuset_write_u64,
2670 .private = FILE_MEM_HARDWALL,
2671 },
2672
2673 {
2674 .name = "sched_load_balance",
2675 .read_u64 = cpuset_read_u64,
2676 .write_u64 = cpuset_write_u64,
2677 .private = FILE_SCHED_LOAD_BALANCE,
2678 },
2679
2680 {
2681 .name = "sched_relax_domain_level",
2682 .read_s64 = cpuset_read_s64,
2683 .write_s64 = cpuset_write_s64,
2684 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
2685 },
2686
2687 {
2688 .name = "memory_migrate",
2689 .read_u64 = cpuset_read_u64,
2690 .write_u64 = cpuset_write_u64,
2691 .private = FILE_MEMORY_MIGRATE,
2692 },
2693
2694 {
2695 .name = "memory_pressure",
2696 .read_u64 = cpuset_read_u64,
2697 .private = FILE_MEMORY_PRESSURE,
2698 },
2699
2700 {
2701 .name = "memory_spread_page",
2702 .read_u64 = cpuset_read_u64,
2703 .write_u64 = cpuset_write_u64,
2704 .private = FILE_SPREAD_PAGE,
2705 },
2706
2707 {
2708 .name = "memory_spread_slab",
2709 .read_u64 = cpuset_read_u64,
2710 .write_u64 = cpuset_write_u64,
2711 .private = FILE_SPREAD_SLAB,
2712 },
2713
2714 {
2715 .name = "memory_pressure_enabled",
2716 .flags = CFTYPE_ONLY_ON_ROOT,
2717 .read_u64 = cpuset_read_u64,
2718 .write_u64 = cpuset_write_u64,
2719 .private = FILE_MEMORY_PRESSURE_ENABLED,
2720 },
2721
2722 { }
2723};
2724
2725
2726
2727
2728
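/*
 * Control files exposed on the default (v2) hierarchy; a smaller set than
 * the legacy_files above.
 */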
2729static struct cftype dfl_files[] = {
2730 {
2731 .name = "cpus",
2732 .seq_show = cpuset_common_seq_show,
2733 .write = cpuset_write_resmask,
2734 .max_write_len = (100U + 6 * NR_CPUS),
2735 .private = FILE_CPULIST,
2736 .flags = CFTYPE_NOT_ON_ROOT,
2737 },
2738
2739 {
2740 .name = "mems",
2741 .seq_show = cpuset_common_seq_show,
2742 .write = cpuset_write_resmask,
2743 .max_write_len = (100U + 6 * MAX_NUMNODES),
2744 .private = FILE_MEMLIST,
2745 .flags = CFTYPE_NOT_ON_ROOT,
2746 },
2747
2748 {
2749 .name = "cpus.effective",
2750 .seq_show = cpuset_common_seq_show,
2751 .private = FILE_EFFECTIVE_CPULIST,
2752 },
2753
2754 {
2755 .name = "mems.effective",
2756 .seq_show = cpuset_common_seq_show,
2757 .private = FILE_EFFECTIVE_MEMLIST,
2758 },
2759
2760 {
2761 .name = "cpus.partition",
2762 .seq_show = sched_partition_show,
2763 .write = sched_partition_write,
2764 .private = FILE_PARTITION_ROOT,
2765 .flags = CFTYPE_NOT_ON_ROOT,
2766 .file_offset = offsetof(struct cpuset, partition_file),
2767 },
2768
2769 {
2770 .name = "cpus.subpartitions",
2771 .seq_show = cpuset_common_seq_show,
2772 .private = FILE_SUBPARTS_CPULIST,
2773 .flags = CFTYPE_DEBUG,
2774 },
2775
2776 { }
2777};
2778
2779
2780
2781
2782
2783
2784
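/*
 * cpuset_css_alloc - Allocate a cpuset css for a new cgroup.
 * @parent_css: the parent's css; NULL means the root cgroup, for which the
 *		statically allocated top_cpuset is returned.
 */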
2785static struct cgroup_subsys_state *
2786cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2787{
2788 struct cpuset *cs;
2789
2790 if (!parent_css)
2791 return &top_cpuset.css;
2792
2793 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2794 if (!cs)
2795 return ERR_PTR(-ENOMEM);
2796
2797 if (alloc_cpumasks(cs, NULL)) {
2798 kfree(cs);
2799 return ERR_PTR(-ENOMEM);
2800 }
2801
2802 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2803 nodes_clear(cs->mems_allowed);
2804 nodes_clear(cs->effective_mems);
2805 fmeter_init(&cs->fmeter);
2806 cs->relax_domain_level = -1;
2807
2808
2809 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2810 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
2811
2812 return &cs->css;
2813}
2814
2815static int cpuset_css_online(struct cgroup_subsys_state *css)
2816{
2817 struct cpuset *cs = css_cs(css);
2818 struct cpuset *parent = parent_cs(cs);
2819 struct cpuset *tmp_cs;
2820 struct cgroup_subsys_state *pos_css;
2821
2822 if (!parent)
2823 return 0;
2824
2825 cpus_read_lock();
2826 percpu_down_write(&cpuset_rwsem);
2827
2828 set_bit(CS_ONLINE, &cs->flags);
2829 if (is_spread_page(parent))
2830 set_bit(CS_SPREAD_PAGE, &cs->flags);
2831 if (is_spread_slab(parent))
2832 set_bit(CS_SPREAD_SLAB, &cs->flags);
2833
2834 cpuset_inc();
2835
2836 spin_lock_irq(&callback_lock);
2837 if (is_in_v2_mode()) {
2838 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
2839 cs->effective_mems = parent->effective_mems;
2840 cs->use_parent_ecpus = true;
2841 parent->child_ecpus_count++;
2842 }
2843 spin_unlock_irq(&callback_lock);
2844
2845 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2846 goto out_unlock;
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861 rcu_read_lock();
2862 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2863 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2864 rcu_read_unlock();
2865 goto out_unlock;
2866 }
2867 }
2868 rcu_read_unlock();
2869
2870 spin_lock_irq(&callback_lock);
2871 cs->mems_allowed = parent->mems_allowed;
2872 cs->effective_mems = parent->mems_allowed;
2873 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2874 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2875 spin_unlock_irq(&callback_lock);
2876out_unlock:
2877 percpu_up_write(&cpuset_rwsem);
2878 cpus_read_unlock();
2879 return 0;
2880}
2881
/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().  That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate it
 * being turned off first.
 */
2893static void cpuset_css_offline(struct cgroup_subsys_state *css)
2894{
2895 struct cpuset *cs = css_cs(css);
2896
2897 cpus_read_lock();
2898 percpu_down_write(&cpuset_rwsem);
2899
2900 if (is_partition_root(cs))
2901 update_prstate(cs, 0);
2902
2903 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2904 is_sched_load_balance(cs))
2905 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2906
2907 if (cs->use_parent_ecpus) {
2908 struct cpuset *parent = parent_cs(cs);
2909
2910 cs->use_parent_ecpus = false;
2911 parent->child_ecpus_count--;
2912 }
2913
2914 cpuset_dec();
2915 clear_bit(CS_ONLINE, &cs->flags);
2916
2917 percpu_up_write(&cpuset_rwsem);
2918 cpus_read_unlock();
2919}
2920
2921static void cpuset_css_free(struct cgroup_subsys_state *css)
2922{
2923 struct cpuset *cs = css_cs(css);
2924
2925 free_cpuset(cs);
2926}
2927
2928static void cpuset_bind(struct cgroup_subsys_state *root_css)
2929{
2930 percpu_down_write(&cpuset_rwsem);
2931 spin_lock_irq(&callback_lock);
2932
2933 if (is_in_v2_mode()) {
2934 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2935 top_cpuset.mems_allowed = node_possible_map;
2936 } else {
2937 cpumask_copy(top_cpuset.cpus_allowed,
2938 top_cpuset.effective_cpus);
2939 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2940 }
2941
2942 spin_unlock_irq(&callback_lock);
2943 percpu_up_write(&cpuset_rwsem);
2944}
2945
2946
2947
2948
2949
2950
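/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */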
2951static void cpuset_fork(struct task_struct *task)
2952{
2953 if (task_css_is_root(task, cpuset_cgrp_id))
2954 return;
2955
2956 set_cpus_allowed_ptr(task, current->cpus_ptr);
2957 task->mems_allowed = current->mems_allowed;
2958}
2959
2960struct cgroup_subsys cpuset_cgrp_subsys = {
2961 .css_alloc = cpuset_css_alloc,
2962 .css_online = cpuset_css_online,
2963 .css_offline = cpuset_css_offline,
2964 .css_free = cpuset_css_free,
2965 .can_attach = cpuset_can_attach,
2966 .cancel_attach = cpuset_cancel_attach,
2967 .attach = cpuset_attach,
2968 .post_attach = cpuset_post_attach,
2969 .bind = cpuset_bind,
2970 .fork = cpuset_fork,
2971 .legacy_cftypes = legacy_files,
2972 .dfl_cftypes = dfl_files,
2973 .early_init = true,
2974 .threaded = true,
2975};
2976
2977
2978
2979
2980
2981
2982
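/*
 * cpuset_init - initialize cpusets at system boot
 *
 * Allocate and populate the masks of top_cpuset and the global
 * cpus_attach mask used when attaching tasks.
 */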
2983int __init cpuset_init(void)
2984{
2985 BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
2986
2987 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2988 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2989 BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
2990
2991 cpumask_setall(top_cpuset.cpus_allowed);
2992 nodes_setall(top_cpuset.mems_allowed);
2993 cpumask_setall(top_cpuset.effective_cpus);
2994 nodes_setall(top_cpuset.effective_mems);
2995
2996 fmeter_init(&top_cpuset.fmeter);
2997 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2998 top_cpuset.relax_domain_level = -1;
2999
3000 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
3001
3002 return 0;
3003}
3004
3005
3006
3007
3008
3009
3010
3011
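/*
 * If CPU and/or memory hot-unplug has left a legacy-hierarchy cpuset with
 * no CPUs or no memory nodes, move its tasks to the nearest ancestor that
 * still has both.
 */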
3012static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
3013{
3014 struct cpuset *parent;
3015
3016
3017
3018
3019
3020 parent = parent_cs(cs);
3021 while (cpumask_empty(parent->cpus_allowed) ||
3022 nodes_empty(parent->mems_allowed))
3023 parent = parent_cs(parent);
3024
3025 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
3026 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
3027 pr_cont_cgroup_name(cs->css.cgroup);
3028 pr_cont("\n");
3029 }
3030}
3031
3032static void
3033hotplug_update_tasks_legacy(struct cpuset *cs,
3034 struct cpumask *new_cpus, nodemask_t *new_mems,
3035 bool cpus_updated, bool mems_updated)
3036{
3037 bool is_empty;
3038
3039 spin_lock_irq(&callback_lock);
3040 cpumask_copy(cs->cpus_allowed, new_cpus);
3041 cpumask_copy(cs->effective_cpus, new_cpus);
3042 cs->mems_allowed = *new_mems;
3043 cs->effective_mems = *new_mems;
3044 spin_unlock_irq(&callback_lock);
3045
3046
3047
3048
3049
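	/*
	 * Don't update the tasks' masks if the cpuset has become empty;
	 * its tasks are about to be moved to an ancestor by
	 * remove_tasks_in_empty_cpuset() below.
	 */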
3050 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
3051 update_tasks_cpumask(cs);
3052 if (mems_updated && !nodes_empty(cs->mems_allowed))
3053 update_tasks_nodemask(cs);
3054
3055 is_empty = cpumask_empty(cs->cpus_allowed) ||
3056 nodes_empty(cs->mems_allowed);
3057
3058 percpu_up_write(&cpuset_rwsem);
3059
3060
3061
3062
3063
3064
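	/*
	 * cpuset_rwsem was dropped above because transferring tasks out of
	 * an empty cpuset is a full cgroup operation that may call back
	 * into cpuset code; it is re-taken afterwards.
	 */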
3065 if (is_empty)
3066 remove_tasks_in_empty_cpuset(cs);
3067
3068 percpu_down_write(&cpuset_rwsem);
3069}
3070
3071static void
3072hotplug_update_tasks(struct cpuset *cs,
3073 struct cpumask *new_cpus, nodemask_t *new_mems,
3074 bool cpus_updated, bool mems_updated)
3075{
3076 if (cpumask_empty(new_cpus))
3077 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3078 if (nodes_empty(*new_mems))
3079 *new_mems = parent_cs(cs)->effective_mems;
3080
3081 spin_lock_irq(&callback_lock);
3082 cpumask_copy(cs->effective_cpus, new_cpus);
3083 cs->effective_mems = *new_mems;
3084 spin_unlock_irq(&callback_lock);
3085
3086 if (cpus_updated)
3087 update_tasks_cpumask(cs);
3088 if (mems_updated)
3089 update_tasks_nodemask(cs);
3090}
3091
3092static bool force_rebuild;
3093
3094void cpuset_force_rebuild(void)
3095{
3096 force_rebuild = true;
3097}
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
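/*
 * cpuset_hotplug_update_tasks - update tasks in a cpuset after hot-unplug
 * @cs:  cpuset of interest
 * @tmp: scratch cpumasks, or NULL when not available
 *
 * Recompute @cs's effective CPUs and memory nodes from its parent, handle
 * a partition root that has lost its CPUs or whose parent is in the error
 * state, and propagate the result to the tasks in @cs.
 */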
3108static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3109{
3110 static cpumask_t new_cpus;
3111 static nodemask_t new_mems;
3112 bool cpus_updated;
3113 bool mems_updated;
3114 struct cpuset *parent;
3115retry:
3116 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3117
3118 percpu_down_write(&cpuset_rwsem);
3119
3120
3121
3122
3123
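	/*
	 * A task attach may have started after the wait_event() above and
	 * before cpuset_rwsem was taken; if so, drop the lock and wait
	 * again.
	 */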
3124 if (cs->attach_in_progress) {
3125 percpu_up_write(&cpuset_rwsem);
3126 goto retry;
3127 }
3128
3129 parent = parent_cs(cs);
3130 compute_effective_cpumask(&new_cpus, cs, parent);
3131 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3132
3133 if (cs->nr_subparts_cpus)
3134
3135
3136
3137
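		/*
		 * CPUs granted to child partitions must not appear in this
		 * cpuset's effective_cpus.
		 */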
3138 cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3139
3140 if (!tmp || !cs->partition_root_state)
3141 goto update_tasks;
3142
3143
3144
3145
3146
3147
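	/*
	 * A partition root that has lost all its effective CPUs, or whose
	 * parent is in the error state, first reclaims any CPUs it had
	 * granted to its own child partitions.
	 */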
3148 if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3149 (parent->partition_root_state == PRS_ERROR))) {
3150 if (cs->nr_subparts_cpus) {
3151 spin_lock_irq(&callback_lock);
3152 cs->nr_subparts_cpus = 0;
3153 cpumask_clear(cs->subparts_cpus);
3154 spin_unlock_irq(&callback_lock);
3155 compute_effective_cpumask(&new_cpus, cs, parent);
3156 }
3157
3158
3159
3160
3161
3162
3163
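		/*
		 * If no CPUs are left even after reclaiming them from the
		 * child partitions, or the parent is erroneous, disable the
		 * partition in the parent and move this cpuset to PRS_ERROR,
		 * notifying user space of the state change.
		 */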
3164 if ((parent->partition_root_state == PRS_ERROR) ||
3165 cpumask_empty(&new_cpus)) {
3166 int old_prs;
3167
3168 update_parent_subparts_cpumask(cs, partcmd_disable,
3169 NULL, tmp);
3170 old_prs = cs->partition_root_state;
3171 if (old_prs != PRS_ERROR) {
3172 spin_lock_irq(&callback_lock);
3173 cs->partition_root_state = PRS_ERROR;
3174 spin_unlock_irq(&callback_lock);
3175 notify_partition_change(cs, old_prs, PRS_ERROR);
3176 }
3177 }
3178 cpuset_force_rebuild();
3179 }
3180
3181
3182
3183
3184
3185
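	/*
	 * Conversely, when the parent is a valid partition root, an
	 * erroneous child partition may be able to recover, or a child
	 * whose CPUs no longer overlap the parent's subparts_cpus may need
	 * to be re-evaluated; ask the parent to update and force a sched
	 * domain rebuild if anything changed.
	 */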
3186 if (is_partition_root(parent) &&
3187 ((cs->partition_root_state == PRS_ERROR) ||
3188 !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3189 update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3190 cpuset_force_rebuild();
3191
3192update_tasks:
3193 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3194 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3195
3196 if (mems_updated)
3197 check_insane_mems_config(&new_mems);
3198
3199 if (is_in_v2_mode())
3200 hotplug_update_tasks(cs, &new_cpus, &new_mems,
3201 cpus_updated, mems_updated);
3202 else
3203 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
3204 cpus_updated, mems_updated);
3205
3206 percpu_up_write(&cpuset_rwsem);
3207}
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
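/*
 * cpuset_hotplug_workfn - handle CPU/memory hot-unplug for cpusets
 *
 * Synchronize top_cpuset to cpu_active_mask and node_states[N_MEMORY],
 * then walk all descendant cpusets and let cpuset_hotplug_update_tasks()
 * adjust their masks and tasks.  The scheduler domains are rebuilt at the
 * end if the set of active CPUs changed or a rebuild was forced.
 */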
3225static void cpuset_hotplug_workfn(struct work_struct *work)
3226{
3227 static cpumask_t new_cpus;
3228 static nodemask_t new_mems;
3229 bool cpus_updated, mems_updated;
3230 bool on_dfl = is_in_v2_mode();
3231 struct tmpmasks tmp, *ptmp = NULL;
3232
3233 if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3234 ptmp = &tmp;
3235
3236 percpu_down_write(&cpuset_rwsem);
3237
3238
3239 cpumask_copy(&new_cpus, cpu_active_mask);
3240 new_mems = node_states[N_MEMORY];
3241
3242
3243
3244
3245
3246
3247 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
3248 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3249
3250
3251
3252
3253
3254 if (!cpus_updated && top_cpuset.nr_subparts_cpus)
3255 cpus_updated = true;
3256
3257
3258 if (cpus_updated) {
3259 spin_lock_irq(&callback_lock);
3260 if (!on_dfl)
3261 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3262
3263
3264
3265
3266
3267
3268 if (top_cpuset.nr_subparts_cpus) {
3269 if (cpumask_subset(&new_cpus,
3270 top_cpuset.subparts_cpus)) {
3271 top_cpuset.nr_subparts_cpus = 0;
3272 cpumask_clear(top_cpuset.subparts_cpus);
3273 } else {
3274 cpumask_andnot(&new_cpus, &new_cpus,
3275 top_cpuset.subparts_cpus);
3276 }
3277 }
3278 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3279 spin_unlock_irq(&callback_lock);
3280
3281 }
3282
3283
3284 if (mems_updated) {
3285 spin_lock_irq(&callback_lock);
3286 if (!on_dfl)
3287 top_cpuset.mems_allowed = new_mems;
3288 top_cpuset.effective_mems = new_mems;
3289 spin_unlock_irq(&callback_lock);
3290 update_tasks_nodemask(&top_cpuset);
3291 }
3292
3293 percpu_up_write(&cpuset_rwsem);
3294
3295
3296 if (cpus_updated || mems_updated) {
3297 struct cpuset *cs;
3298 struct cgroup_subsys_state *pos_css;
3299
3300 rcu_read_lock();
3301 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3302 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3303 continue;
3304 rcu_read_unlock();
3305
3306 cpuset_hotplug_update_tasks(cs, ptmp);
3307
3308 rcu_read_lock();
3309 css_put(&cs->css);
3310 }
3311 rcu_read_unlock();
3312 }
3313
3314
3315 if (cpus_updated || force_rebuild) {
3316 force_rebuild = false;
3317 rebuild_sched_domains();
3318 }
3319
3320 free_cpumasks(NULL, ptmp);
3321}
3322
3323void cpuset_update_active_cpus(void)
3324{
3325
3326
3327
3328
3329
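	/*
	 * Defer the actual processing to a work item so that the cpuset
	 * and sched-domain updates run outside the CPU hotplug path that
	 * invokes this function.
	 */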
3330 schedule_work(&cpuset_hotplug_work);
3331}
3332
3333void cpuset_wait_for_hotplug(void)
3334{
3335 flush_work(&cpuset_hotplug_work);
3336}
3337
3338
3339
3340
3341
3342
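/*
 * Memory hotplug notifier: keep top_cpuset.mems_allowed tracking
 * node_states[N_MEMORY] by kicking the same hotplug work item.
 */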
3343static int cpuset_track_online_nodes(struct notifier_block *self,
3344 unsigned long action, void *arg)
3345{
3346 schedule_work(&cpuset_hotplug_work);
3347 return NOTIFY_OK;
3348}
3349
3350static struct notifier_block cpuset_track_online_nodes_nb = {
3351 .notifier_call = cpuset_track_online_nodes,
3352 .priority = 10,
3353};
3354
3355
3356
3357
3358
3359
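/*
 * cpuset_init_smp - initialize CPUs and memory nodes once they are known
 *
 * Re-seed top_cpuset from cpu_active_mask and node_states[N_MEMORY],
 * register the memory hotplug notifier and create the workqueue used for
 * asynchronous mm migration.
 */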
3360void __init cpuset_init_smp(void)
3361{
3362 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
3363 top_cpuset.mems_allowed = node_states[N_MEMORY];
3364 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3365
3366 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3367 top_cpuset.effective_mems = node_states[N_MEMORY];
3368
3369 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
3370
3371 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3372 BUG_ON(!cpuset_migrate_mm_wq);
3373}
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
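/*
 * cpuset_cpus_allowed - return the CPUs a task is allowed to run on
 * @tsk:   task whose cpuset is consulted
 * @pmask: result cpumask
 *
 * Return in *pmask the online CPUs that @tsk may run on according to its
 * cpuset, walking up the hierarchy if the cpuset itself has no online
 * CPUs (see guarantee_online_cpus()).
 */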
3386void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3387{
3388 unsigned long flags;
3389
3390 spin_lock_irqsave(&callback_lock, flags);
3391 guarantee_online_cpus(tsk, pmask);
3392 spin_unlock_irqrestore(&callback_lock, flags);
3393}
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
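/*
 * cpuset_cpus_allowed_fallback - last-resort affinity fixup
 * @tsk: task that has run out of usable CPUs
 *
 * Used by the scheduler when @tsk has no runnable CPU left in its
 * affinity mask.  When running in v2 mode (is_in_v2_mode()), the affinity
 * is reset to the cpuset's cpus_allowed, provided that mask is possible
 * for the task.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 */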
3409bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3410{
3411 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3412 const struct cpumask *cs_mask;
3413 bool changed = false;
3414
3415 rcu_read_lock();
3416 cs_mask = task_cs(tsk)->cpus_allowed;
3417 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
3418 do_set_cpus_allowed(tsk, cs_mask);
3419 changed = true;
3420 }
3421 rcu_read_unlock();
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440 return changed;
3441}
3442
3443void __init cpuset_init_current_mems_allowed(void)
3444{
3445 nodes_setall(current->mems_allowed);
3446}
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
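/*
 * cpuset_mems_allowed - return the memory nodes allowed to a task
 * @tsk: task whose cpuset is consulted
 *
 * Return the memory nodes that @tsk's cpuset allows, restricted to nodes
 * with memory and walking up the hierarchy if necessary (see
 * guarantee_online_mems()).
 */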
3458nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
3459{
3460 nodemask_t mask;
3461 unsigned long flags;
3462
3463 spin_lock_irqsave(&callback_lock, flags);
3464 rcu_read_lock();
3465 guarantee_online_mems(task_cs(tsk), &mask);
3466 rcu_read_unlock();
3467 spin_unlock_irqrestore(&callback_lock, flags);
3468
3469 return mask;
3470}
3471
3472
3473
3474
3475
3476
3477
3478int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
3479{
3480 return nodes_intersects(*nodemask, current->mems_allowed);
3481}
3482
3483
3484
3485
3486
3487
3488
3489static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
3490{
3491 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
3492 cs = parent_cs(cs);
3493 return cs;
3494}
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
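/*
 * __cpuset_node_allowed - may the current task allocate on @node?
 * @node:     memory node in question
 * @gfp_mask: allocation flags
 *
 * Allocation is allowed if we are in interrupt, if @node is in
 * current->mems_allowed, or if the task is an OOM victim.  Otherwise
 * __GFP_HARDWALL allocations are refused, allocations from an exiting
 * task are allowed, and everything else is allowed only if the nearest
 * mem_exclusive or mem_hardwall ancestor cpuset permits @node.
 */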
3536bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
3537{
3538 struct cpuset *cs;
3539 int allowed;
3540 unsigned long flags;
3541
3542 if (in_interrupt())
3543 return true;
3544 if (node_isset(node, current->mems_allowed))
3545 return true;
3546
3547
3548
3549
3550 if (unlikely(tsk_is_oom_victim(current)))
3551 return true;
3552 if (gfp_mask & __GFP_HARDWALL)
3553 return false;
3554
3555 if (current->flags & PF_EXITING)
3556 return true;
3557
3558
3559 spin_lock_irqsave(&callback_lock, flags);
3560
3561 rcu_read_lock();
3562 cs = nearest_hardwall_ancestor(task_cs(current));
3563 allowed = node_isset(node, cs->mems_allowed);
3564 rcu_read_unlock();
3565
3566 spin_unlock_irqrestore(&callback_lock, flags);
3567 return allowed;
3568}
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
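/*
 * For tasks with memory spreading enabled, pick the next node for page or
 * slab allocations from current->mems_allowed in round-robin fashion,
 * seeding the per-task rotor at a random allowed node on first use.
 */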
3597static int cpuset_spread_node(int *rotor)
3598{
3599 return *rotor = next_node_in(*rotor, current->mems_allowed);
3600}
3601
3602int cpuset_mem_spread_node(void)
3603{
3604 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
3605 current->cpuset_mem_spread_rotor =
3606 node_random(&current->mems_allowed);
3607
3608 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
3609}
3610
3611int cpuset_slab_spread_node(void)
3612{
3613 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
3614 current->cpuset_slab_spread_rotor =
3615 node_random(&current->mems_allowed);
3616
3617 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
3618}
3619
3620EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
3634 const struct task_struct *tsk2)
3635{
3636 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
3637}
3638
3639
3640
3641
3642
3643
3644
3645void cpuset_print_current_mems_allowed(void)
3646{
3647 struct cgroup *cgrp;
3648
3649 rcu_read_lock();
3650
3651 cgrp = task_cs(current)->css.cgroup;
3652 pr_cont(",cpuset=");
3653 pr_cont_cgroup_name(cgrp);
3654 pr_cont(",mems_allowed=%*pbl",
3655 nodemask_pr_args(&current->mems_allowed));
3656
3657 rcu_read_unlock();
3658}
3659
3660
3661
3662
3663
3664
3665
3666int cpuset_memory_pressure_enabled __read_mostly;
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
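/*
 * Tick the memory-pressure frequency meter (fmeter) of the current task's
 * cpuset; the resulting rate is what the cpuset memory_pressure file
 * reports when cpuset_memory_pressure_enabled is set.
 */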
3686void __cpuset_memory_pressure_bump(void)
3687{
3688 rcu_read_lock();
3689 fmeter_markevent(&task_cs(current)->fmeter);
3690 rcu_read_unlock();
3691}
3692
3693#ifdef CONFIG_PROC_PID_CPUSET
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
3704 struct pid *pid, struct task_struct *tsk)
3705{
3706 char *buf;
3707 struct cgroup_subsys_state *css;
3708 int retval;
3709
3710 retval = -ENOMEM;
3711 buf = kmalloc(PATH_MAX, GFP_KERNEL);
3712 if (!buf)
3713 goto out;
3714
3715 css = task_get_css(tsk, cpuset_cgrp_id);
3716 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
3717 current->nsproxy->cgroup_ns);
3718 css_put(css);
3719 if (retval >= PATH_MAX)
3720 retval = -ENAMETOOLONG;
3721 if (retval < 0)
3722 goto out_free;
3723 seq_puts(m, buf);
3724 seq_putc(m, '\n');
3725 retval = 0;
3726out_free:
3727 kfree(buf);
3728out:
3729 return retval;
3730}
3731#endif
3732
3733
3734void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
3735{
3736 seq_printf(m, "Mems_allowed:\t%*pb\n",
3737 nodemask_pr_args(&task->mems_allowed));
3738 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
3739 nodemask_pr_args(&task->mems_allowed));
3740}
3741