/*
 *  kernel/cgroup/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks
 *  (the cpuset cgroup controller).
 */
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/export.h>
41#include <linux/mount.h>
42#include <linux/fs_context.h>
43#include <linux/namei.h>
44#include <linux/pagemap.h>
45#include <linux/proc_fs.h>
46#include <linux/rcupdate.h>
47#include <linux/sched.h>
48#include <linux/sched/deadline.h>
49#include <linux/sched/mm.h>
50#include <linux/sched/task.h>
51#include <linux/seq_file.h>
52#include <linux/security.h>
53#include <linux/slab.h>
54#include <linux/spinlock.h>
55#include <linux/stat.h>
56#include <linux/string.h>
57#include <linux/time.h>
58#include <linux/time64.h>
59#include <linux/backing-dev.h>
60#include <linux/sort.h>
61#include <linux/oom.h>
62#include <linux/sched/isolation.h>
63#include <linux/uaccess.h>
64#include <linux/atomic.h>
65#include <linux/mutex.h>
66#include <linux/cgroup.h>
67#include <linux/wait.h>
68
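/*
 * Static branch keys: they let hot paths skip cpuset handling entirely
 * while only the root cpuset is in use.
 */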
69DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
70DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
71
72
73
74struct fmeter {
75 int cnt;
76 int val;
77 time64_t time;
78 spinlock_t lock;
79};
80
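/*
 * Per-cgroup cpuset state.  cpus_allowed/mems_allowed are the masks the
 * user configured; effective_cpus/effective_mems are what tasks actually
 * see once ancestor constraints and CPU/memory hotplug are applied.
 * subparts_cpus holds CPUs that have been handed out to child partitions.
 */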
81struct cpuset {
82 struct cgroup_subsys_state css;
83
84 unsigned long flags;
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107 cpumask_var_t cpus_allowed;
108 nodemask_t mems_allowed;
109
110
111 cpumask_var_t effective_cpus;
112 nodemask_t effective_mems;
113
114
115
116
117
118
119
120
121
122 cpumask_var_t subparts_cpus;
123
124
125
126
127
128
129
130
131
132
133
134 nodemask_t old_mems_allowed;
135
136 struct fmeter fmeter;
137
138
139
140
141
142 int attach_in_progress;
143
144
145 int pn;
146
147
148 int relax_domain_level;
149
150
151 int nr_subparts_cpus;
152
153
154 int partition_root_state;
155
156
157
158
159
160
161 int use_parent_ecpus;
162 int child_ecpus_count;
163};
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
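/*
 * Partition root states: an ordinary member of its parent (PRS_DISABLED),
 * a valid partition root (PRS_ENABLED), or an invalid one (PRS_ERROR),
 * e.g. because the parent can no longer supply the requested CPUs.
 */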
179#define PRS_DISABLED 0
180#define PRS_ENABLED 1
181#define PRS_ERROR -1
182
183
184
185
186
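/* Temporary cpumasks used while updating partitions and effective CPUs. */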
187struct tmpmasks {
188 cpumask_var_t addmask, delmask;
189 cpumask_var_t new_cpus;
190};
191
192static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
193{
194 return css ? container_of(css, struct cpuset, css) : NULL;
195}
196
197
198static inline struct cpuset *task_cs(struct task_struct *task)
199{
200 return css_cs(task_css(task, cpuset_cgrp_id));
201}
202
203static inline struct cpuset *parent_cs(struct cpuset *cs)
204{
205 return css_cs(cs->css.parent);
206}
207
208
209typedef enum {
210 CS_ONLINE,
211 CS_CPU_EXCLUSIVE,
212 CS_MEM_EXCLUSIVE,
213 CS_MEM_HARDWALL,
214 CS_MEMORY_MIGRATE,
215 CS_SCHED_LOAD_BALANCE,
216 CS_SPREAD_PAGE,
217 CS_SPREAD_SLAB,
218} cpuset_flagbits_t;
219
220
221static inline bool is_cpuset_online(struct cpuset *cs)
222{
223 return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
224}
225
226static inline int is_cpu_exclusive(const struct cpuset *cs)
227{
228 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
229}
230
231static inline int is_mem_exclusive(const struct cpuset *cs)
232{
233 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
234}
235
236static inline int is_mem_hardwall(const struct cpuset *cs)
237{
238 return test_bit(CS_MEM_HARDWALL, &cs->flags);
239}
240
241static inline int is_sched_load_balance(const struct cpuset *cs)
242{
243 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
244}
245
246static inline int is_memory_migrate(const struct cpuset *cs)
247{
248 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
249}
250
251static inline int is_spread_page(const struct cpuset *cs)
252{
253 return test_bit(CS_SPREAD_PAGE, &cs->flags);
254}
255
256static inline int is_spread_slab(const struct cpuset *cs)
257{
258 return test_bit(CS_SPREAD_SLAB, &cs->flags);
259}
260
261static inline int is_partition_root(const struct cpuset *cs)
262{
263 return cs->partition_root_state > 0;
264}
265
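/* The root cpuset: online, CPU and memory exclusive, and always a partition root. */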
266static struct cpuset top_cpuset = {
267 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
268 (1 << CS_MEM_EXCLUSIVE)),
269 .partition_root_state = PRS_ENABLED,
270};
271
272
273
274
275
276
277
278
279
280
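/*
 * cpuset_for_each_child - walk the online children of @parent_cs.
 * Must be called under rcu_read_lock().
 */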
281#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
282 css_for_each_child((pos_css), &(parent_cs)->css) \
283 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
284
285
286
287
288
289
290
291
292
293
294
295
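/*
 * cpuset_for_each_descendant_pre - pre-order walk of @root_cs and its
 * online descendants.  Must be called under rcu_read_lock().
 */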
296#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
297 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
298 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
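/*
 * Two locks protect cpuset structures.  All modifications are made with
 * cpuset_rwsem held for write and callback_lock held; sleepable readers
 * take cpuset_rwsem for read, while atomic-context readers (e.g. the page
 * allocator) take the callback_lock spinlock.
 */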
336DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
337
338void cpuset_read_lock(void)
339{
340 percpu_down_read(&cpuset_rwsem);
341}
342
343void cpuset_read_unlock(void)
344{
345 percpu_up_read(&cpuset_rwsem);
346}
347
348static DEFINE_SPINLOCK(callback_lock);
349
350static struct workqueue_struct *cpuset_migrate_mm_wq;
351
352
353
354
355static void cpuset_hotplug_workfn(struct work_struct *work);
356static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
357
358static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
359
360
361
362
363
364
365
366
367
368static inline bool is_in_v2_mode(void)
369{
370 return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
371 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
372}
373
374
375
376
377
378
379
380
381
382
383
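/*
 * Return in pmask a non-empty subset of cs's effective CPUs that are
 * online, walking up the hierarchy as needed.  Falls back to
 * cpu_online_mask if even the top cpuset has no online effective CPUs.
 * Call with callback_lock or cpuset_rwsem held.
 */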
384static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
385{
386 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
387 cs = parent_cs(cs);
388 if (unlikely(!cs)) {
389
390
391
392
393
394
395
396 cpumask_copy(pmask, cpu_online_mask);
397 return;
398 }
399 }
400 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
401}
402
403
404
405
406
407
408
409
410
411
412
413
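/*
 * Return in *pmask a non-empty subset of cs's effective memory nodes that
 * have memory, walking up the hierarchy as needed (the top cpuset always
 * intersects node_states[N_MEMORY]).  Call with callback_lock or
 * cpuset_rwsem held.
 */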
414static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
415{
416 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
417 cs = parent_cs(cs);
418 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
419}
420
421
422
423
424
425
426static void cpuset_update_task_spread_flag(struct cpuset *cs,
427 struct task_struct *tsk)
428{
429 if (is_spread_page(cs))
430 task_set_spread_page(tsk);
431 else
432 task_clear_spread_page(tsk);
433
434 if (is_spread_slab(cs))
435 task_set_spread_slab(tsk);
436 else
437 task_clear_spread_slab(tsk);
438}
439
440
441
442
443
444
445
446
447
448static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
449{
450 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
451 nodes_subset(p->mems_allowed, q->mems_allowed) &&
452 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
453 is_mem_exclusive(p) <= is_mem_exclusive(q);
454}
455
456
457
458
459
460
461
462
463
464static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
465{
466 cpumask_var_t *pmask1, *pmask2, *pmask3;
467
468 if (cs) {
469 pmask1 = &cs->cpus_allowed;
470 pmask2 = &cs->effective_cpus;
471 pmask3 = &cs->subparts_cpus;
472 } else {
473 pmask1 = &tmp->new_cpus;
474 pmask2 = &tmp->addmask;
475 pmask3 = &tmp->delmask;
476 }
477
478 if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
479 return -ENOMEM;
480
481 if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
482 goto free_one;
483
484 if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
485 goto free_two;
486
487 return 0;
488
489free_two:
490 free_cpumask_var(*pmask2);
491free_one:
492 free_cpumask_var(*pmask1);
493 return -ENOMEM;
494}
495
496
497
498
499
500
501static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
502{
503 if (cs) {
504 free_cpumask_var(cs->cpus_allowed);
505 free_cpumask_var(cs->effective_cpus);
506 free_cpumask_var(cs->subparts_cpus);
507 }
508 if (tmp) {
509 free_cpumask_var(tmp->new_cpus);
510 free_cpumask_var(tmp->addmask);
511 free_cpumask_var(tmp->delmask);
512 }
513}
514
515
516
517
518
519static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
520{
521 struct cpuset *trial;
522
523 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
524 if (!trial)
525 return NULL;
526
527 if (alloc_cpumasks(trial, NULL)) {
528 kfree(trial);
529 return NULL;
530 }
531
532 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
533 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
534 return trial;
535}
536
537
538
539
540
541static inline void free_cpuset(struct cpuset *cs)
542{
543 free_cpumasks(cs, NULL);
544 kfree(cs);
545}
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
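/*
 * validate_change - check that changing cpuset @cur into @trial keeps the
 * hierarchy consistent: children remain subsets, exclusive siblings stay
 * disjoint, and a populated cpuset is not left without CPUs or memory
 * nodes.  Returns 0 on success, -errno otherwise.  Call with cpuset_rwsem
 * held.
 */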
567static int validate_change(struct cpuset *cur, struct cpuset *trial)
568{
569 struct cgroup_subsys_state *css;
570 struct cpuset *c, *par;
571 int ret;
572
573 rcu_read_lock();
574
575
576 ret = -EBUSY;
577 cpuset_for_each_child(c, css, cur)
578 if (!is_cpuset_subset(c, trial))
579 goto out;
580
581
582 ret = 0;
583 if (cur == &top_cpuset)
584 goto out;
585
586 par = parent_cs(cur);
587
588
589 ret = -EACCES;
590 if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
591 goto out;
592
593
594
595
596
597 ret = -EINVAL;
598 cpuset_for_each_child(c, css, par) {
599 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
600 c != cur &&
601 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
602 goto out;
603 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
604 c != cur &&
605 nodes_intersects(trial->mems_allowed, c->mems_allowed))
606 goto out;
607 }
608
609
610
611
612
613 ret = -ENOSPC;
614 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
615 if (!cpumask_empty(cur->cpus_allowed) &&
616 cpumask_empty(trial->cpus_allowed))
617 goto out;
618 if (!nodes_empty(cur->mems_allowed) &&
619 nodes_empty(trial->mems_allowed))
620 goto out;
621 }
622
623
624
625
626
627 ret = -EBUSY;
628 if (is_cpu_exclusive(cur) &&
629 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
630 trial->cpus_allowed))
631 goto out;
632
633 ret = 0;
634out:
635 rcu_read_unlock();
636 return ret;
637}
638
639#ifdef CONFIG_SMP
640
641
642
643
644static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
645{
646 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
647}
648
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
}
656
657static void update_domain_attr_tree(struct sched_domain_attr *dattr,
658 struct cpuset *root_cs)
659{
660 struct cpuset *cp;
661 struct cgroup_subsys_state *pos_css;
662
663 rcu_read_lock();
664 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
665
666 if (cpumask_empty(cp->cpus_allowed)) {
667 pos_css = css_rightmost_descendant(pos_css);
668 continue;
669 }
670
671 if (is_sched_load_balance(cp))
672 update_domain_attr(dattr, cp);
673 }
674 rcu_read_unlock();
675}
676
677
678static inline int nr_cpusets(void)
679{
680
681 return static_key_count(&cpusets_enabled_key.key) + 1;
682}
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
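/*
 * generate_sched_domains - build the array of CPU masks (and matching
 * sched_domain_attr entries) describing the load-balanced partitions
 * implied by the current cpuset configuration.  Returns the number of
 * domains and hands the arrays back through @domains/@attributes.
 * Call with cpuset_rwsem held.
 */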
737static int generate_sched_domains(cpumask_var_t **domains,
738 struct sched_domain_attr **attributes)
739{
740 struct cpuset *cp;
741 struct cpuset **csa;
742 int csn;
743 int i, j, k;
744 cpumask_var_t *doms;
745 struct sched_domain_attr *dattr;
746 int ndoms = 0;
747 int nslot;
748 struct cgroup_subsys_state *pos_css;
749 bool root_load_balance = is_sched_load_balance(&top_cpuset);
750
751 doms = NULL;
752 dattr = NULL;
753 csa = NULL;
754
755
756 if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
757 ndoms = 1;
758 doms = alloc_sched_domains(ndoms);
759 if (!doms)
760 goto done;
761
762 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
763 if (dattr) {
764 *dattr = SD_ATTR_INIT;
765 update_domain_attr_tree(dattr, &top_cpuset);
766 }
767 cpumask_and(doms[0], top_cpuset.effective_cpus,
768 housekeeping_cpumask(HK_FLAG_DOMAIN));
769
770 goto done;
771 }
772
773 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
774 if (!csa)
775 goto done;
776 csn = 0;
777
778 rcu_read_lock();
779 if (root_load_balance)
780 csa[csn++] = &top_cpuset;
781 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
782 if (cp == &top_cpuset)
783 continue;
784
785
786
787
788
789
790
791
792
793
794
795 if (!cpumask_empty(cp->cpus_allowed) &&
796 !(is_sched_load_balance(cp) &&
797 cpumask_intersects(cp->cpus_allowed,
798 housekeeping_cpumask(HK_FLAG_DOMAIN))))
799 continue;
800
801 if (root_load_balance &&
802 cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
803 continue;
804
805 if (is_sched_load_balance(cp) &&
806 !cpumask_empty(cp->effective_cpus))
807 csa[csn++] = cp;
808
809
810 if (!is_partition_root(cp))
811 pos_css = css_rightmost_descendant(pos_css);
812 }
813 rcu_read_unlock();
814
815 for (i = 0; i < csn; i++)
816 csa[i]->pn = i;
817 ndoms = csn;
818
819restart:
820
821 for (i = 0; i < csn; i++) {
822 struct cpuset *a = csa[i];
823 int apn = a->pn;
824
825 for (j = 0; j < csn; j++) {
826 struct cpuset *b = csa[j];
827 int bpn = b->pn;
828
829 if (apn != bpn && cpusets_overlap(a, b)) {
830 for (k = 0; k < csn; k++) {
831 struct cpuset *c = csa[k];
832
833 if (c->pn == bpn)
834 c->pn = apn;
835 }
836 ndoms--;
837 goto restart;
838 }
839 }
840 }
841
842
843
844
845
846 doms = alloc_sched_domains(ndoms);
847 if (!doms)
848 goto done;
849
850
851
852
853
854 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
855 GFP_KERNEL);
856
857 for (nslot = 0, i = 0; i < csn; i++) {
858 struct cpuset *a = csa[i];
859 struct cpumask *dp;
860 int apn = a->pn;
861
862 if (apn < 0) {
863
864 continue;
865 }
866
867 dp = doms[nslot];
868
869 if (nslot == ndoms) {
870 static int warnings = 10;
871 if (warnings) {
872 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
873 nslot, ndoms, csn, i, apn);
874 warnings--;
875 }
876 continue;
877 }
878
879 cpumask_clear(dp);
880 if (dattr)
881 *(dattr + nslot) = SD_ATTR_INIT;
882 for (j = i; j < csn; j++) {
883 struct cpuset *b = csa[j];
884
885 if (apn == b->pn) {
886 cpumask_or(dp, dp, b->effective_cpus);
887 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
888 if (dattr)
889 update_domain_attr_tree(dattr + nslot, b);
890
891
892 b->pn = -1;
893 }
894 }
895 nslot++;
896 }
897 BUG_ON(nslot != ndoms);
898
899done:
900 kfree(csa);
901
902
903
904
905
906 if (doms == NULL)
907 ndoms = 1;
908
909 *domains = doms;
910 *attributes = dattr;
911 return ndoms;
912}
913
914static void update_tasks_root_domain(struct cpuset *cs)
915{
916 struct css_task_iter it;
917 struct task_struct *task;
918
919 css_task_iter_start(&cs->css, 0, &it);
920
921 while ((task = css_task_iter_next(&it)))
922 dl_add_task_root_domain(task);
923
924 css_task_iter_end(&it);
925}
926
927static void rebuild_root_domains(void)
928{
929 struct cpuset *cs = NULL;
930 struct cgroup_subsys_state *pos_css;
931
932 percpu_rwsem_assert_held(&cpuset_rwsem);
933 lockdep_assert_cpus_held();
934 lockdep_assert_held(&sched_domains_mutex);
935
936 rcu_read_lock();
937
938
939
940
941
942 dl_clear_root_domain(&def_root_domain);
943
944 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
945
946 if (cpumask_empty(cs->effective_cpus)) {
947 pos_css = css_rightmost_descendant(pos_css);
948 continue;
949 }
950
951 css_get(&cs->css);
952
953 rcu_read_unlock();
954
955 update_tasks_root_domain(cs);
956
957 rcu_read_lock();
958 css_put(&cs->css);
959 }
960 rcu_read_unlock();
961}
962
963static void
964partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
965 struct sched_domain_attr *dattr_new)
966{
967 mutex_lock(&sched_domains_mutex);
968 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
969 rebuild_root_domains();
970 mutex_unlock(&sched_domains_mutex);
971}
972
973
974
975
976
977
978
979
980
981
982
983
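/*
 * rebuild_sched_domains_locked - regenerate the scheduler's domain
 * partitioning from the current cpuset configuration.  Bails out early if
 * a hotplug event has left the cpuset state temporarily out of sync with
 * cpu_active_mask, since the pending hotplug work will rebuild the
 * domains anyway.  Call with cpuset_rwsem and the CPU hotplug lock held.
 */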
984static void rebuild_sched_domains_locked(void)
985{
986 struct cgroup_subsys_state *pos_css;
987 struct sched_domain_attr *attr;
988 cpumask_var_t *doms;
989 struct cpuset *cs;
990 int ndoms;
991
992 lockdep_assert_cpus_held();
993 percpu_rwsem_assert_held(&cpuset_rwsem);
994
995
996
997
998
999
1000
1001
1002
1003
1004 if (!top_cpuset.nr_subparts_cpus &&
1005 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1006 return;
1007
1008
1009
1010
1011
1012
1013 if (top_cpuset.nr_subparts_cpus) {
1014 rcu_read_lock();
1015 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1016 if (!is_partition_root(cs)) {
1017 pos_css = css_rightmost_descendant(pos_css);
1018 continue;
1019 }
1020 if (!cpumask_subset(cs->effective_cpus,
1021 cpu_active_mask)) {
1022 rcu_read_unlock();
1023 return;
1024 }
1025 }
1026 rcu_read_unlock();
1027 }
1028
1029
1030 ndoms = generate_sched_domains(&doms, &attr);
1031
1032
1033 partition_and_rebuild_sched_domains(ndoms, doms, attr);
1034}
1035#else
1036static void rebuild_sched_domains_locked(void)
1037{
1038}
1039#endif
1040
1041void rebuild_sched_domains(void)
1042{
1043 get_online_cpus();
1044 percpu_down_write(&cpuset_rwsem);
1045 rebuild_sched_domains_locked();
1046 percpu_up_write(&cpuset_rwsem);
1047 put_online_cpus();
1048}
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058static void update_tasks_cpumask(struct cpuset *cs)
1059{
1060 struct css_task_iter it;
1061 struct task_struct *task;
1062
1063 css_task_iter_start(&cs->css, 0, &it);
1064 while ((task = css_task_iter_next(&it)))
1065 set_cpus_allowed_ptr(task, cs->effective_cpus);
1066 css_task_iter_end(&it);
1067}
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
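/*
 * compute_effective_cpumask - work out cs's effective CPUs from its own
 * cpus_allowed and its parent's effective CPUs, adding back the parent's
 * subpartition CPUs first when the parent has any.
 */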
1080static void compute_effective_cpumask(struct cpumask *new_cpus,
1081 struct cpuset *cs, struct cpuset *parent)
1082{
1083 if (parent->nr_subparts_cpus) {
1084 cpumask_or(new_cpus, parent->effective_cpus,
1085 parent->subparts_cpus);
1086 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
1087 cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1088 } else {
1089 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1090 }
1091}
1092
1093
1094
1095
1096enum subparts_cmd {
1097 partcmd_enable,
1098 partcmd_disable,
1099 partcmd_update,
1100};
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
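/*
 * update_parent_subparts_cpumask - add to or remove from the parent's
 * subparts_cpus on behalf of @cpuset, for partcmd_enable, partcmd_disable
 * or partcmd_update.  Returns a negative errno on failure, 0 if nothing
 * more needs doing, or 1 when a partcmd_update changed the parent's CPU
 * masks and its tasks need their cpumasks refreshed.  Call with
 * cpuset_rwsem held.
 */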
1144static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1145 struct cpumask *newmask,
1146 struct tmpmasks *tmp)
1147{
1148 struct cpuset *parent = parent_cs(cpuset);
1149 int adding;
1150 int deleting;
1151 bool part_error = false;
1152
1153 percpu_rwsem_assert_held(&cpuset_rwsem);
1154
1155
1156
1157
1158
1159
1160 if (!is_partition_root(parent) ||
1161 (newmask && cpumask_empty(newmask)) ||
1162 (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1163 return -EINVAL;
1164
1165
1166
1167
1168
1169 if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1170 return -EBUSY;
1171
1172
1173
1174
1175
1176
1177 if ((cmd == partcmd_enable) &&
1178 (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1179 cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1180 return -EINVAL;
1181
1182
1183
1184
1185 adding = deleting = false;
1186 if (cmd == partcmd_enable) {
1187 cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1188 adding = true;
1189 } else if (cmd == partcmd_disable) {
1190 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1191 parent->subparts_cpus);
1192 } else if (newmask) {
1193
1194
1195
1196
1197
1198
1199
1200 cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1201 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1202 parent->subparts_cpus);
1203
1204 cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1205 adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1206 parent->subparts_cpus);
1207
1208
1209
1210 if (adding &&
1211 cpumask_equal(parent->effective_cpus, tmp->addmask)) {
1212 if (!deleting)
1213 return -EINVAL;
1214
1215
1216
1217
1218
1219 if (!cpumask_and(tmp->addmask, tmp->delmask,
1220 cpu_active_mask))
1221 return -EINVAL;
1222 cpumask_copy(tmp->addmask, parent->effective_cpus);
1223 }
1224 } else {
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234 adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1235 parent->effective_cpus);
1236 part_error = cpumask_equal(tmp->addmask,
1237 parent->effective_cpus);
1238 }
1239
1240 if (cmd == partcmd_update) {
1241 int prev_prs = cpuset->partition_root_state;
1242
1243
1244
1245
1246
1247 switch (cpuset->partition_root_state) {
1248 case PRS_ENABLED:
1249 if (part_error)
1250 cpuset->partition_root_state = PRS_ERROR;
1251 break;
1252 case PRS_ERROR:
1253 if (!part_error)
1254 cpuset->partition_root_state = PRS_ENABLED;
1255 break;
1256 }
1257
1258
1259
1260 part_error = (prev_prs == PRS_ERROR);
1261 }
1262
1263 if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
1264 return 0;
1265
1266 if (cpuset->partition_root_state == PRS_ERROR) {
1267
1268
1269
1270 adding = false;
1271 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1272 parent->subparts_cpus);
1273 }
1274
1275 if (!adding && !deleting)
1276 return 0;
1277
1278
1279
1280
1281
1282
1283 spin_lock_irq(&callback_lock);
1284 if (adding) {
1285 cpumask_or(parent->subparts_cpus,
1286 parent->subparts_cpus, tmp->addmask);
1287 cpumask_andnot(parent->effective_cpus,
1288 parent->effective_cpus, tmp->addmask);
1289 }
1290 if (deleting) {
1291 cpumask_andnot(parent->subparts_cpus,
1292 parent->subparts_cpus, tmp->delmask);
1293
1294
1295
1296 cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
1297 cpumask_or(parent->effective_cpus,
1298 parent->effective_cpus, tmp->delmask);
1299 }
1300
1301 parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1302 spin_unlock_irq(&callback_lock);
1303
1304 return cmd == partcmd_update;
1305}
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
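/*
 * update_cpumasks_hier - walk @cs and its descendants, recomputing each
 * effective_cpus (and partition state) after a cpus_allowed change, and
 * update the affected tasks.  Rebuilds sched domains if needed.
 * Call with cpuset_rwsem held.
 */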
1319static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
1320{
1321 struct cpuset *cp;
1322 struct cgroup_subsys_state *pos_css;
1323 bool need_rebuild_sched_domains = false;
1324
1325 rcu_read_lock();
1326 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1327 struct cpuset *parent = parent_cs(cp);
1328
1329 compute_effective_cpumask(tmp->new_cpus, cp, parent);
1330
1331
1332
1333
1334
1335 if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1336 cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1337 if (!cp->use_parent_ecpus) {
1338 cp->use_parent_ecpus = true;
1339 parent->child_ecpus_count++;
1340 }
1341 } else if (cp->use_parent_ecpus) {
1342 cp->use_parent_ecpus = false;
1343 WARN_ON_ONCE(!parent->child_ecpus_count);
1344 parent->child_ecpus_count--;
1345 }
1346
1347
1348
1349
1350
1351 if (!cp->partition_root_state &&
1352 cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
1353 pos_css = css_rightmost_descendant(pos_css);
1354 continue;
1355 }
1356
1357
1358
1359
1360
1361
1362
1363 if ((cp != cs) && cp->partition_root_state) {
1364 switch (parent->partition_root_state) {
1365 case PRS_DISABLED:
1366
1367
1368
1369
1370
1371 WARN_ON_ONCE(cp->partition_root_state
1372 != PRS_ERROR);
1373 cp->partition_root_state = 0;
1374
1375
1376
1377
1378
1379
1380
1381
1382 clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1383 break;
1384
1385 case PRS_ENABLED:
1386 if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1387 update_tasks_cpumask(parent);
1388 break;
1389
1390 case PRS_ERROR:
1391
1392
1393
1394 cp->partition_root_state = PRS_ERROR;
1395 if (cp->nr_subparts_cpus) {
1396 cp->nr_subparts_cpus = 0;
1397 cpumask_clear(cp->subparts_cpus);
1398 }
1399 break;
1400 }
1401 }
1402
1403 if (!css_tryget_online(&cp->css))
1404 continue;
1405 rcu_read_unlock();
1406
1407 spin_lock_irq(&callback_lock);
1408
1409 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1410 if (cp->nr_subparts_cpus &&
1411 (cp->partition_root_state != PRS_ENABLED)) {
1412 cp->nr_subparts_cpus = 0;
1413 cpumask_clear(cp->subparts_cpus);
1414 } else if (cp->nr_subparts_cpus) {
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424 cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1425 cp->subparts_cpus);
1426 if (cpumask_empty(cp->effective_cpus)) {
1427 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1428 cpumask_clear(cp->subparts_cpus);
1429 cp->nr_subparts_cpus = 0;
1430 } else if (!cpumask_subset(cp->subparts_cpus,
1431 tmp->new_cpus)) {
1432 cpumask_andnot(cp->subparts_cpus,
1433 cp->subparts_cpus, tmp->new_cpus);
1434 cp->nr_subparts_cpus
1435 = cpumask_weight(cp->subparts_cpus);
1436 }
1437 }
1438 spin_unlock_irq(&callback_lock);
1439
1440 WARN_ON(!is_in_v2_mode() &&
1441 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
1442
1443 update_tasks_cpumask(cp);
1444
1445
1446
1447
1448
1449
1450
1451 if (!cpumask_empty(cp->cpus_allowed) &&
1452 is_sched_load_balance(cp) &&
1453 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1454 is_partition_root(cp)))
1455 need_rebuild_sched_domains = true;
1456
1457 rcu_read_lock();
1458 css_put(&cp->css);
1459 }
1460 rcu_read_unlock();
1461
1462 if (need_rebuild_sched_domains)
1463 rebuild_sched_domains_locked();
1464}
1465
1466
1467
1468
1469
1470
1471
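/*
 * update_sibling_cpumasks - propagate a partition change to siblings of
 * @cs that are currently borrowing the parent's effective_cpus.
 */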
1472static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1473 struct tmpmasks *tmp)
1474{
1475 struct cpuset *sibling;
1476 struct cgroup_subsys_state *pos_css;
1477
1478 percpu_rwsem_assert_held(&cpuset_rwsem);
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488 rcu_read_lock();
1489 cpuset_for_each_child(sibling, pos_css, parent) {
1490 if (sibling == cs)
1491 continue;
1492 if (!sibling->use_parent_ecpus)
1493 continue;
1494 if (!css_tryget_online(&sibling->css))
1495 continue;
1496
1497 rcu_read_unlock();
1498 update_cpumasks_hier(sibling, tmp);
1499 rcu_read_lock();
1500 css_put(&sibling->css);
1501 }
1502 rcu_read_unlock();
1503}
1504
1505
1506
1507
1508
1509
1510
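/*
 * update_cpumask - parse @buf and update cs->cpus_allowed (and, for a
 * partition root, the parent's subparts_cpus) accordingly, then propagate
 * the change down the hierarchy.
 */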
1511static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1512 const char *buf)
1513{
1514 int retval;
1515 struct tmpmasks tmp;
1516
1517
1518 if (cs == &top_cpuset)
1519 return -EACCES;
1520
1521
1522
1523
1524
1525
1526
1527 if (!*buf) {
1528 cpumask_clear(trialcs->cpus_allowed);
1529 } else {
1530 retval = cpulist_parse(buf, trialcs->cpus_allowed);
1531 if (retval < 0)
1532 return retval;
1533
1534 if (!cpumask_subset(trialcs->cpus_allowed,
1535 top_cpuset.cpus_allowed))
1536 return -EINVAL;
1537 }
1538
1539
1540 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
1541 return 0;
1542
1543 retval = validate_change(cs, trialcs);
1544 if (retval < 0)
1545 return retval;
1546
1547#ifdef CONFIG_CPUMASK_OFFSTACK
1548
1549
1550
1551
1552 tmp.addmask = trialcs->subparts_cpus;
1553 tmp.delmask = trialcs->effective_cpus;
1554 tmp.new_cpus = trialcs->cpus_allowed;
1555#endif
1556
1557 if (cs->partition_root_state) {
1558
1559 if (cpumask_empty(trialcs->cpus_allowed))
1560 return -EINVAL;
1561 if (update_parent_subparts_cpumask(cs, partcmd_update,
1562 trialcs->cpus_allowed, &tmp) < 0)
1563 return -EINVAL;
1564 }
1565
1566 spin_lock_irq(&callback_lock);
1567 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
1568
1569
1570
1571
1572 if (cs->nr_subparts_cpus) {
1573 cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
1574 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1575 }
1576 spin_unlock_irq(&callback_lock);
1577
1578 update_cpumasks_hier(cs, &tmp);
1579
1580 if (cs->partition_root_state) {
1581 struct cpuset *parent = parent_cs(cs);
1582
1583
1584
1585
1586
1587 if (parent->child_ecpus_count)
1588 update_sibling_cpumasks(parent, cs, &tmp);
1589 }
1590 return 0;
1591}
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601struct cpuset_migrate_mm_work {
1602 struct work_struct work;
1603 struct mm_struct *mm;
1604 nodemask_t from;
1605 nodemask_t to;
1606};
1607
1608static void cpuset_migrate_mm_workfn(struct work_struct *work)
1609{
1610 struct cpuset_migrate_mm_work *mwork =
1611 container_of(work, struct cpuset_migrate_mm_work, work);
1612
1613
1614 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1615 mmput(mwork->mm);
1616 kfree(mwork);
1617}
1618
1619static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1620 const nodemask_t *to)
1621{
1622 struct cpuset_migrate_mm_work *mwork;
1623
1624 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1625 if (mwork) {
1626 mwork->mm = mm;
1627 mwork->from = *from;
1628 mwork->to = *to;
1629 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1630 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1631 } else {
1632 mmput(mm);
1633 }
1634}
1635
1636static void cpuset_post_attach(void)
1637{
1638 flush_workqueue(cpuset_migrate_mm_wq);
1639}
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
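/*
 * cpuset_change_task_nodemask - switch @tsk to the new mems_allowed.
 * The intermediate nodes_or() temporarily widens mems_allowed to the
 * union of old and new nodes, so concurrent allocations always see a
 * usable nodemask while the mempolicy is being rebound.
 */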
1651static void cpuset_change_task_nodemask(struct task_struct *tsk,
1652 nodemask_t *newmems)
1653{
1654 task_lock(tsk);
1655
1656 local_irq_disable();
1657 write_seqcount_begin(&tsk->mems_allowed_seq);
1658
1659 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1660 mpol_rebind_task(tsk, newmems);
1661 tsk->mems_allowed = *newmems;
1662
1663 write_seqcount_end(&tsk->mems_allowed_seq);
1664 local_irq_enable();
1665
1666 task_unlock(tsk);
1667}
1668
1669static void *cpuset_being_rebound;
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679static void update_tasks_nodemask(struct cpuset *cs)
1680{
1681 static nodemask_t newmems;
1682 struct css_task_iter it;
1683 struct task_struct *task;
1684
1685 cpuset_being_rebound = cs;
1686
1687 guarantee_online_mems(cs, &newmems);
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699 css_task_iter_start(&cs->css, 0, &it);
1700 while ((task = css_task_iter_next(&it))) {
1701 struct mm_struct *mm;
1702 bool migrate;
1703
1704 cpuset_change_task_nodemask(task, &newmems);
1705
1706 mm = get_task_mm(task);
1707 if (!mm)
1708 continue;
1709
1710 migrate = is_memory_migrate(cs);
1711
1712 mpol_rebind_mm(mm, &cs->mems_allowed);
1713 if (migrate)
1714 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1715 else
1716 mmput(mm);
1717 }
1718 css_task_iter_end(&it);
1719
1720
1721
1722
1723
1724 cs->old_mems_allowed = newmems;
1725
1726
1727 cpuset_being_rebound = NULL;
1728}
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
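/*
 * update_nodemasks_hier - walk @cs and its descendants, recomputing each
 * effective_mems after a mems_allowed change and rebinding the affected
 * tasks' mempolicies.  Call with cpuset_rwsem held.
 */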
1742static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1743{
1744 struct cpuset *cp;
1745 struct cgroup_subsys_state *pos_css;
1746
1747 rcu_read_lock();
1748 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1749 struct cpuset *parent = parent_cs(cp);
1750
1751 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1752
1753
1754
1755
1756
1757 if (is_in_v2_mode() && nodes_empty(*new_mems))
1758 *new_mems = parent->effective_mems;
1759
1760
1761 if (nodes_equal(*new_mems, cp->effective_mems)) {
1762 pos_css = css_rightmost_descendant(pos_css);
1763 continue;
1764 }
1765
1766 if (!css_tryget_online(&cp->css))
1767 continue;
1768 rcu_read_unlock();
1769
1770 spin_lock_irq(&callback_lock);
1771 cp->effective_mems = *new_mems;
1772 spin_unlock_irq(&callback_lock);
1773
1774 WARN_ON(!is_in_v2_mode() &&
1775 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1776
1777 update_tasks_nodemask(cp);
1778
1779 rcu_read_lock();
1780 css_put(&cp->css);
1781 }
1782 rcu_read_unlock();
1783}
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1799 const char *buf)
1800{
1801 int retval;
1802
1803
1804
1805
1806
1807 if (cs == &top_cpuset) {
1808 retval = -EACCES;
1809 goto done;
1810 }
1811
1812
1813
1814
1815
1816
1817
1818 if (!*buf) {
1819 nodes_clear(trialcs->mems_allowed);
1820 } else {
1821 retval = nodelist_parse(buf, trialcs->mems_allowed);
1822 if (retval < 0)
1823 goto done;
1824
1825 if (!nodes_subset(trialcs->mems_allowed,
1826 top_cpuset.mems_allowed)) {
1827 retval = -EINVAL;
1828 goto done;
1829 }
1830 }
1831
1832 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1833 retval = 0;
1834 goto done;
1835 }
1836 retval = validate_change(cs, trialcs);
1837 if (retval < 0)
1838 goto done;
1839
1840 spin_lock_irq(&callback_lock);
1841 cs->mems_allowed = trialcs->mems_allowed;
1842 spin_unlock_irq(&callback_lock);
1843
1844
1845 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1846done:
1847 return retval;
1848}
1849
1850bool current_cpuset_is_being_rebound(void)
1851{
1852 bool ret;
1853
1854 rcu_read_lock();
1855 ret = task_cs(current) == cpuset_being_rebound;
1856 rcu_read_unlock();
1857
1858 return ret;
1859}
1860
1861static int update_relax_domain_level(struct cpuset *cs, s64 val)
1862{
1863#ifdef CONFIG_SMP
1864 if (val < -1 || val >= sched_domain_level_max)
1865 return -EINVAL;
1866#endif
1867
1868 if (val != cs->relax_domain_level) {
1869 cs->relax_domain_level = val;
1870 if (!cpumask_empty(cs->cpus_allowed) &&
1871 is_sched_load_balance(cs))
1872 rebuild_sched_domains_locked();
1873 }
1874
1875 return 0;
1876}
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886static void update_tasks_flags(struct cpuset *cs)
1887{
1888 struct css_task_iter it;
1889 struct task_struct *task;
1890
1891 css_task_iter_start(&cs->css, 0, &it);
1892 while ((task = css_task_iter_next(&it)))
1893 cpuset_update_task_spread_flag(cs, task);
1894 css_task_iter_end(&it);
1895}
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
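/*
 * update_flag - set or clear one cpuset flag bit, validating the change
 * and rebuilding sched domains / updating tasks when the load-balance or
 * spread flags actually change.  Call with cpuset_rwsem held.
 */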
1906static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1907 int turning_on)
1908{
1909 struct cpuset *trialcs;
1910 int balance_flag_changed;
1911 int spread_flag_changed;
1912 int err;
1913
1914 trialcs = alloc_trial_cpuset(cs);
1915 if (!trialcs)
1916 return -ENOMEM;
1917
1918 if (turning_on)
1919 set_bit(bit, &trialcs->flags);
1920 else
1921 clear_bit(bit, &trialcs->flags);
1922
1923 err = validate_change(cs, trialcs);
1924 if (err < 0)
1925 goto out;
1926
1927 balance_flag_changed = (is_sched_load_balance(cs) !=
1928 is_sched_load_balance(trialcs));
1929
1930 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1931 || (is_spread_page(cs) != is_spread_page(trialcs)));
1932
1933 spin_lock_irq(&callback_lock);
1934 cs->flags = trialcs->flags;
1935 spin_unlock_irq(&callback_lock);
1936
1937 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1938 rebuild_sched_domains_locked();
1939
1940 if (spread_flag_changed)
1941 update_tasks_flags(cs);
1942out:
1943 free_cpuset(trialcs);
1944 return err;
1945}
1946
1947
1948
1949
1950
1951
1952
1953
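/*
 * update_prstate - switch this cpuset between being an ordinary member
 * (val == 0) and a partition root (val == 1) of its parent.
 */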
1954static int update_prstate(struct cpuset *cs, int val)
1955{
1956 int err;
1957 struct cpuset *parent = parent_cs(cs);
1958 struct tmpmasks tmp;
1959
1960 if ((val != 0) && (val != 1))
1961 return -EINVAL;
1962 if (val == cs->partition_root_state)
1963 return 0;
1964
1965
1966
1967
1968
1969 if (val && cs->partition_root_state)
1970 return -EINVAL;
1971
1972 if (alloc_cpumasks(NULL, &tmp))
1973 return -ENOMEM;
1974
1975 err = -EINVAL;
1976 if (!cs->partition_root_state) {
1977
1978
1979
1980
1981
1982 if (cpumask_empty(cs->cpus_allowed))
1983 goto out;
1984
1985 err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
1986 if (err)
1987 goto out;
1988
1989 err = update_parent_subparts_cpumask(cs, partcmd_enable,
1990 NULL, &tmp);
1991 if (err) {
1992 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1993 goto out;
1994 }
1995 cs->partition_root_state = PRS_ENABLED;
1996 } else {
1997
1998
1999
2000
2001 if (cs->partition_root_state == PRS_ERROR) {
2002 cs->partition_root_state = 0;
2003 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2004 err = 0;
2005 goto out;
2006 }
2007
2008 err = update_parent_subparts_cpumask(cs, partcmd_disable,
2009 NULL, &tmp);
2010 if (err)
2011 goto out;
2012
2013 cs->partition_root_state = 0;
2014
2015
2016 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2017 }
2018
2019
2020
2021
2022
2023 if (parent != &top_cpuset)
2024 update_tasks_cpumask(parent);
2025
2026 if (parent->child_ecpus_count)
2027 update_sibling_cpumasks(parent, cs, &tmp);
2028
2029 rebuild_sched_domains_locked();
2030out:
2031 free_cpumasks(NULL, &tmp);
2032 return err;
2033}
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
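/*
 * Frequency meter: events are counted via fmeter_markevent() and decayed
 * exponentially (by FM_COEF/FM_SCALE per second, roughly a ten second
 * half-life); fmeter_getrate() reports the smoothed events-per-second.
 */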
2080#define FM_COEF 933
2081#define FM_MAXTICKS ((u32)99)
2082#define FM_MAXCNT 1000000
2083#define FM_SCALE 1000
2084
2085
2086static void fmeter_init(struct fmeter *fmp)
2087{
2088 fmp->cnt = 0;
2089 fmp->val = 0;
2090 fmp->time = 0;
2091 spin_lock_init(&fmp->lock);
2092}
2093
2094
2095static void fmeter_update(struct fmeter *fmp)
2096{
2097 time64_t now;
2098 u32 ticks;
2099
2100 now = ktime_get_seconds();
2101 ticks = now - fmp->time;
2102
2103 if (ticks == 0)
2104 return;
2105
2106 ticks = min(FM_MAXTICKS, ticks);
2107 while (ticks-- > 0)
2108 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
2109 fmp->time = now;
2110
2111 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
2112 fmp->cnt = 0;
2113}
2114
2115
2116static void fmeter_markevent(struct fmeter *fmp)
2117{
2118 spin_lock(&fmp->lock);
2119 fmeter_update(fmp);
2120 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
2121 spin_unlock(&fmp->lock);
2122}
2123
2124
2125static int fmeter_getrate(struct fmeter *fmp)
2126{
2127 int val;
2128
2129 spin_lock(&fmp->lock);
2130 fmeter_update(fmp);
2131 val = fmp->val;
2132 spin_unlock(&fmp->lock);
2133 return val;
2134}
2135
2136static struct cpuset *cpuset_attach_old_cs;
2137
2138
2139static int cpuset_can_attach(struct cgroup_taskset *tset)
2140{
2141 struct cgroup_subsys_state *css;
2142 struct cpuset *cs;
2143 struct task_struct *task;
2144 int ret;
2145
2146
2147 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2148 cs = css_cs(css);
2149
2150 percpu_down_write(&cpuset_rwsem);
2151
2152
2153 ret = -ENOSPC;
2154 if (!is_in_v2_mode() &&
2155 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
2156 goto out_unlock;
2157
2158 cgroup_taskset_for_each(task, css, tset) {
2159 ret = task_can_attach(task, cs->cpus_allowed);
2160 if (ret)
2161 goto out_unlock;
2162 ret = security_task_setscheduler(task);
2163 if (ret)
2164 goto out_unlock;
2165 }
2166
2167
2168
2169
2170
2171 cs->attach_in_progress++;
2172 ret = 0;
2173out_unlock:
2174 percpu_up_write(&cpuset_rwsem);
2175 return ret;
2176}
2177
2178static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2179{
2180 struct cgroup_subsys_state *css;
2181
2182 cgroup_taskset_first(tset, &css);
2183
2184 percpu_down_write(&cpuset_rwsem);
2185 css_cs(css)->attach_in_progress--;
2186 percpu_up_write(&cpuset_rwsem);
2187}
2188
2189
2190
2191
2192
2193
2194static cpumask_var_t cpus_attach;
2195
2196static void cpuset_attach(struct cgroup_taskset *tset)
2197{
2198
2199 static nodemask_t cpuset_attach_nodemask_to;
2200 struct task_struct *task;
2201 struct task_struct *leader;
2202 struct cgroup_subsys_state *css;
2203 struct cpuset *cs;
2204 struct cpuset *oldcs = cpuset_attach_old_cs;
2205
2206 cgroup_taskset_first(tset, &css);
2207 cs = css_cs(css);
2208
2209 percpu_down_write(&cpuset_rwsem);
2210
2211
2212 if (cs == &top_cpuset)
2213 cpumask_copy(cpus_attach, cpu_possible_mask);
2214 else
2215 guarantee_online_cpus(cs, cpus_attach);
2216
2217 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
2218
2219 cgroup_taskset_for_each(task, css, tset) {
2220
2221
2222
2223
2224 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2225
2226 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
2227 cpuset_update_task_spread_flag(cs, task);
2228 }
2229
2230
2231
2232
2233
2234 cpuset_attach_nodemask_to = cs->effective_mems;
2235 cgroup_taskset_for_each_leader(leader, css, tset) {
2236 struct mm_struct *mm = get_task_mm(leader);
2237
2238 if (mm) {
2239 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249 if (is_memory_migrate(cs))
2250 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
2251 &cpuset_attach_nodemask_to);
2252 else
2253 mmput(mm);
2254 }
2255 }
2256
2257 cs->old_mems_allowed = cpuset_attach_nodemask_to;
2258
2259 cs->attach_in_progress--;
2260 if (!cs->attach_in_progress)
2261 wake_up(&cpuset_attach_wq);
2262
2263 percpu_up_write(&cpuset_rwsem);
2264}
2265
2266
2267
2268typedef enum {
2269 FILE_MEMORY_MIGRATE,
2270 FILE_CPULIST,
2271 FILE_MEMLIST,
2272 FILE_EFFECTIVE_CPULIST,
2273 FILE_EFFECTIVE_MEMLIST,
2274 FILE_SUBPARTS_CPULIST,
2275 FILE_CPU_EXCLUSIVE,
2276 FILE_MEM_EXCLUSIVE,
2277 FILE_MEM_HARDWALL,
2278 FILE_SCHED_LOAD_BALANCE,
2279 FILE_PARTITION_ROOT,
2280 FILE_SCHED_RELAX_DOMAIN_LEVEL,
2281 FILE_MEMORY_PRESSURE_ENABLED,
2282 FILE_MEMORY_PRESSURE,
2283 FILE_SPREAD_PAGE,
2284 FILE_SPREAD_SLAB,
2285} cpuset_filetype_t;
2286
2287static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2288 u64 val)
2289{
2290 struct cpuset *cs = css_cs(css);
2291 cpuset_filetype_t type = cft->private;
2292 int retval = 0;
2293
2294 get_online_cpus();
2295 percpu_down_write(&cpuset_rwsem);
2296 if (!is_cpuset_online(cs)) {
2297 retval = -ENODEV;
2298 goto out_unlock;
2299 }
2300
2301 switch (type) {
2302 case FILE_CPU_EXCLUSIVE:
2303 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
2304 break;
2305 case FILE_MEM_EXCLUSIVE:
2306 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
2307 break;
2308 case FILE_MEM_HARDWALL:
2309 retval = update_flag(CS_MEM_HARDWALL, cs, val);
2310 break;
2311 case FILE_SCHED_LOAD_BALANCE:
2312 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
2313 break;
2314 case FILE_MEMORY_MIGRATE:
2315 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
2316 break;
2317 case FILE_MEMORY_PRESSURE_ENABLED:
2318 cpuset_memory_pressure_enabled = !!val;
2319 break;
2320 case FILE_SPREAD_PAGE:
2321 retval = update_flag(CS_SPREAD_PAGE, cs, val);
2322 break;
2323 case FILE_SPREAD_SLAB:
2324 retval = update_flag(CS_SPREAD_SLAB, cs, val);
2325 break;
2326 default:
2327 retval = -EINVAL;
2328 break;
2329 }
2330out_unlock:
2331 percpu_up_write(&cpuset_rwsem);
2332 put_online_cpus();
2333 return retval;
2334}
2335
2336static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2337 s64 val)
2338{
2339 struct cpuset *cs = css_cs(css);
2340 cpuset_filetype_t type = cft->private;
2341 int retval = -ENODEV;
2342
2343 get_online_cpus();
2344 percpu_down_write(&cpuset_rwsem);
2345 if (!is_cpuset_online(cs))
2346 goto out_unlock;
2347
2348 switch (type) {
2349 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2350 retval = update_relax_domain_level(cs, val);
2351 break;
2352 default:
2353 retval = -EINVAL;
2354 break;
2355 }
2356out_unlock:
2357 percpu_up_write(&cpuset_rwsem);
2358 put_online_cpus();
2359 return retval;
2360}
2361
2362
2363
2364
2365static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2366 char *buf, size_t nbytes, loff_t off)
2367{
2368 struct cpuset *cs = css_cs(of_css(of));
2369 struct cpuset *trialcs;
2370 int retval = -ENODEV;
2371
2372 buf = strstrip(buf);
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393 css_get(&cs->css);
2394 kernfs_break_active_protection(of->kn);
2395 flush_work(&cpuset_hotplug_work);
2396
2397 get_online_cpus();
2398 percpu_down_write(&cpuset_rwsem);
2399 if (!is_cpuset_online(cs))
2400 goto out_unlock;
2401
2402 trialcs = alloc_trial_cpuset(cs);
2403 if (!trialcs) {
2404 retval = -ENOMEM;
2405 goto out_unlock;
2406 }
2407
2408 switch (of_cft(of)->private) {
2409 case FILE_CPULIST:
2410 retval = update_cpumask(cs, trialcs, buf);
2411 break;
2412 case FILE_MEMLIST:
2413 retval = update_nodemask(cs, trialcs, buf);
2414 break;
2415 default:
2416 retval = -EINVAL;
2417 break;
2418 }
2419
2420 free_cpuset(trialcs);
2421out_unlock:
2422 percpu_up_write(&cpuset_rwsem);
2423 put_online_cpus();
2424 kernfs_unbreak_active_protection(of->kn);
2425 css_put(&cs->css);
2426 flush_workqueue(cpuset_migrate_mm_wq);
2427 return retval ?: nbytes;
2428}
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438static int cpuset_common_seq_show(struct seq_file *sf, void *v)
2439{
2440 struct cpuset *cs = css_cs(seq_css(sf));
2441 cpuset_filetype_t type = seq_cft(sf)->private;
2442 int ret = 0;
2443
2444 spin_lock_irq(&callback_lock);
2445
2446 switch (type) {
2447 case FILE_CPULIST:
2448 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
2449 break;
2450 case FILE_MEMLIST:
2451 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
2452 break;
2453 case FILE_EFFECTIVE_CPULIST:
2454 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
2455 break;
2456 case FILE_EFFECTIVE_MEMLIST:
2457 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
2458 break;
2459 case FILE_SUBPARTS_CPULIST:
2460 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2461 break;
2462 default:
2463 ret = -EINVAL;
2464 }
2465
2466 spin_unlock_irq(&callback_lock);
2467 return ret;
2468}
2469
2470static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
2471{
2472 struct cpuset *cs = css_cs(css);
2473 cpuset_filetype_t type = cft->private;
2474 switch (type) {
2475 case FILE_CPU_EXCLUSIVE:
2476 return is_cpu_exclusive(cs);
2477 case FILE_MEM_EXCLUSIVE:
2478 return is_mem_exclusive(cs);
2479 case FILE_MEM_HARDWALL:
2480 return is_mem_hardwall(cs);
2481 case FILE_SCHED_LOAD_BALANCE:
2482 return is_sched_load_balance(cs);
2483 case FILE_MEMORY_MIGRATE:
2484 return is_memory_migrate(cs);
2485 case FILE_MEMORY_PRESSURE_ENABLED:
2486 return cpuset_memory_pressure_enabled;
2487 case FILE_MEMORY_PRESSURE:
2488 return fmeter_getrate(&cs->fmeter);
2489 case FILE_SPREAD_PAGE:
2490 return is_spread_page(cs);
2491 case FILE_SPREAD_SLAB:
2492 return is_spread_slab(cs);
2493 default:
2494 BUG();
2495 }
2496
2497
2498 return 0;
2499}
2500
2501static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
2502{
2503 struct cpuset *cs = css_cs(css);
2504 cpuset_filetype_t type = cft->private;
2505 switch (type) {
2506 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2507 return cs->relax_domain_level;
2508 default:
2509 BUG();
2510 }
2511
2512
2513 return 0;
2514}
2515
2516static int sched_partition_show(struct seq_file *seq, void *v)
2517{
2518 struct cpuset *cs = css_cs(seq_css(seq));
2519
2520 switch (cs->partition_root_state) {
2521 case PRS_ENABLED:
2522 seq_puts(seq, "root\n");
2523 break;
2524 case PRS_DISABLED:
2525 seq_puts(seq, "member\n");
2526 break;
2527 case PRS_ERROR:
2528 seq_puts(seq, "root invalid\n");
2529 break;
2530 }
2531 return 0;
2532}
2533
2534static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2535 size_t nbytes, loff_t off)
2536{
2537 struct cpuset *cs = css_cs(of_css(of));
2538 int val;
2539 int retval = -ENODEV;
2540
2541 buf = strstrip(buf);
2542
2543
2544
2545
2546 if (!strcmp(buf, "root"))
2547 val = PRS_ENABLED;
2548 else if (!strcmp(buf, "member"))
2549 val = PRS_DISABLED;
2550 else
2551 return -EINVAL;
2552
2553 css_get(&cs->css);
2554 get_online_cpus();
2555 percpu_down_write(&cpuset_rwsem);
2556 if (!is_cpuset_online(cs))
2557 goto out_unlock;
2558
2559 retval = update_prstate(cs, val);
2560out_unlock:
2561 percpu_up_write(&cpuset_rwsem);
2562 put_online_cpus();
2563 css_put(&cs->css);
2564 return retval ?: nbytes;
2565}
2566
2567
2568
2569
2570
2571static struct cftype legacy_files[] = {
2572 {
2573 .name = "cpus",
2574 .seq_show = cpuset_common_seq_show,
2575 .write = cpuset_write_resmask,
2576 .max_write_len = (100U + 6 * NR_CPUS),
2577 .private = FILE_CPULIST,
2578 },
2579
2580 {
2581 .name = "mems",
2582 .seq_show = cpuset_common_seq_show,
2583 .write = cpuset_write_resmask,
2584 .max_write_len = (100U + 6 * MAX_NUMNODES),
2585 .private = FILE_MEMLIST,
2586 },
2587
2588 {
2589 .name = "effective_cpus",
2590 .seq_show = cpuset_common_seq_show,
2591 .private = FILE_EFFECTIVE_CPULIST,
2592 },
2593
2594 {
2595 .name = "effective_mems",
2596 .seq_show = cpuset_common_seq_show,
2597 .private = FILE_EFFECTIVE_MEMLIST,
2598 },
2599
2600 {
2601 .name = "cpu_exclusive",
2602 .read_u64 = cpuset_read_u64,
2603 .write_u64 = cpuset_write_u64,
2604 .private = FILE_CPU_EXCLUSIVE,
2605 },
2606
2607 {
2608 .name = "mem_exclusive",
2609 .read_u64 = cpuset_read_u64,
2610 .write_u64 = cpuset_write_u64,
2611 .private = FILE_MEM_EXCLUSIVE,
2612 },
2613
2614 {
2615 .name = "mem_hardwall",
2616 .read_u64 = cpuset_read_u64,
2617 .write_u64 = cpuset_write_u64,
2618 .private = FILE_MEM_HARDWALL,
2619 },
2620
2621 {
2622 .name = "sched_load_balance",
2623 .read_u64 = cpuset_read_u64,
2624 .write_u64 = cpuset_write_u64,
2625 .private = FILE_SCHED_LOAD_BALANCE,
2626 },
2627
2628 {
2629 .name = "sched_relax_domain_level",
2630 .read_s64 = cpuset_read_s64,
2631 .write_s64 = cpuset_write_s64,
2632 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
2633 },
2634
2635 {
2636 .name = "memory_migrate",
2637 .read_u64 = cpuset_read_u64,
2638 .write_u64 = cpuset_write_u64,
2639 .private = FILE_MEMORY_MIGRATE,
2640 },
2641
2642 {
2643 .name = "memory_pressure",
2644 .read_u64 = cpuset_read_u64,
2645 .private = FILE_MEMORY_PRESSURE,
2646 },
2647
2648 {
2649 .name = "memory_spread_page",
2650 .read_u64 = cpuset_read_u64,
2651 .write_u64 = cpuset_write_u64,
2652 .private = FILE_SPREAD_PAGE,
2653 },
2654
2655 {
2656 .name = "memory_spread_slab",
2657 .read_u64 = cpuset_read_u64,
2658 .write_u64 = cpuset_write_u64,
2659 .private = FILE_SPREAD_SLAB,
2660 },
2661
2662 {
2663 .name = "memory_pressure_enabled",
2664 .flags = CFTYPE_ONLY_ON_ROOT,
2665 .read_u64 = cpuset_read_u64,
2666 .write_u64 = cpuset_write_u64,
2667 .private = FILE_MEMORY_PRESSURE_ENABLED,
2668 },
2669
2670 { }
2671};
2672
2673
2674
2675
2676
2677static struct cftype dfl_files[] = {
2678 {
2679 .name = "cpus",
2680 .seq_show = cpuset_common_seq_show,
2681 .write = cpuset_write_resmask,
2682 .max_write_len = (100U + 6 * NR_CPUS),
2683 .private = FILE_CPULIST,
2684 .flags = CFTYPE_NOT_ON_ROOT,
2685 },
2686
2687 {
2688 .name = "mems",
2689 .seq_show = cpuset_common_seq_show,
2690 .write = cpuset_write_resmask,
2691 .max_write_len = (100U + 6 * MAX_NUMNODES),
2692 .private = FILE_MEMLIST,
2693 .flags = CFTYPE_NOT_ON_ROOT,
2694 },
2695
2696 {
2697 .name = "cpus.effective",
2698 .seq_show = cpuset_common_seq_show,
2699 .private = FILE_EFFECTIVE_CPULIST,
2700 },
2701
2702 {
2703 .name = "mems.effective",
2704 .seq_show = cpuset_common_seq_show,
2705 .private = FILE_EFFECTIVE_MEMLIST,
2706 },
2707
2708 {
2709 .name = "cpus.partition",
2710 .seq_show = sched_partition_show,
2711 .write = sched_partition_write,
2712 .private = FILE_PARTITION_ROOT,
2713 .flags = CFTYPE_NOT_ON_ROOT,
2714 },
2715
2716 {
2717 .name = "cpus.subpartitions",
2718 .seq_show = cpuset_common_seq_show,
2719 .private = FILE_SUBPARTS_CPULIST,
2720 .flags = CFTYPE_DEBUG,
2721 },
2722
2723 { }
2724};
2725
2726
2727
2728
2729
2730
2731
2732static struct cgroup_subsys_state *
2733cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2734{
2735 struct cpuset *cs;
2736
2737 if (!parent_css)
2738 return &top_cpuset.css;
2739
2740 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2741 if (!cs)
2742 return ERR_PTR(-ENOMEM);
2743
2744 if (alloc_cpumasks(cs, NULL)) {
2745 kfree(cs);
2746 return ERR_PTR(-ENOMEM);
2747 }
2748
2749 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2750 nodes_clear(cs->mems_allowed);
2751 nodes_clear(cs->effective_mems);
2752 fmeter_init(&cs->fmeter);
2753 cs->relax_domain_level = -1;
2754
2755 return &cs->css;
2756}
2757
2758static int cpuset_css_online(struct cgroup_subsys_state *css)
2759{
2760 struct cpuset *cs = css_cs(css);
2761 struct cpuset *parent = parent_cs(cs);
2762 struct cpuset *tmp_cs;
2763 struct cgroup_subsys_state *pos_css;
2764
2765 if (!parent)
2766 return 0;
2767
2768 get_online_cpus();
2769 percpu_down_write(&cpuset_rwsem);
2770
2771 set_bit(CS_ONLINE, &cs->flags);
2772 if (is_spread_page(parent))
2773 set_bit(CS_SPREAD_PAGE, &cs->flags);
2774 if (is_spread_slab(parent))
2775 set_bit(CS_SPREAD_SLAB, &cs->flags);
2776
2777 cpuset_inc();
2778
2779 spin_lock_irq(&callback_lock);
2780 if (is_in_v2_mode()) {
2781 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
2782 cs->effective_mems = parent->effective_mems;
2783 cs->use_parent_ecpus = true;
2784 parent->child_ecpus_count++;
2785 }
2786 spin_unlock_irq(&callback_lock);
2787
2788 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2789 goto out_unlock;
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804 rcu_read_lock();
2805 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2806 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2807 rcu_read_unlock();
2808 goto out_unlock;
2809 }
2810 }
2811 rcu_read_unlock();
2812
2813 spin_lock_irq(&callback_lock);
2814 cs->mems_allowed = parent->mems_allowed;
2815 cs->effective_mems = parent->mems_allowed;
2816 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2817 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2818 spin_unlock_irq(&callback_lock);
2819out_unlock:
2820 percpu_up_write(&cpuset_rwsem);
2821 put_online_cpus();
2822 return 0;
2823}
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836static void cpuset_css_offline(struct cgroup_subsys_state *css)
2837{
2838 struct cpuset *cs = css_cs(css);
2839
2840 get_online_cpus();
2841 percpu_down_write(&cpuset_rwsem);
2842
2843 if (is_partition_root(cs))
2844 update_prstate(cs, 0);
2845
2846 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2847 is_sched_load_balance(cs))
2848 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2849
2850 if (cs->use_parent_ecpus) {
2851 struct cpuset *parent = parent_cs(cs);
2852
2853 cs->use_parent_ecpus = false;
2854 parent->child_ecpus_count--;
2855 }
2856
2857 cpuset_dec();
2858 clear_bit(CS_ONLINE, &cs->flags);
2859
2860 percpu_up_write(&cpuset_rwsem);
2861 put_online_cpus();
2862}
2863
2864static void cpuset_css_free(struct cgroup_subsys_state *css)
2865{
2866 struct cpuset *cs = css_cs(css);
2867
2868 free_cpuset(cs);
2869}
2870
2871static void cpuset_bind(struct cgroup_subsys_state *root_css)
2872{
2873 percpu_down_write(&cpuset_rwsem);
2874 spin_lock_irq(&callback_lock);
2875
2876 if (is_in_v2_mode()) {
2877 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2878 top_cpuset.mems_allowed = node_possible_map;
2879 } else {
2880 cpumask_copy(top_cpuset.cpus_allowed,
2881 top_cpuset.effective_cpus);
2882 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2883 }
2884
2885 spin_unlock_irq(&callback_lock);
2886 percpu_up_write(&cpuset_rwsem);
2887}
2888
2889
2890
2891
2892
2893
2894static void cpuset_fork(struct task_struct *task)
2895{
2896 if (task_css_is_root(task, cpuset_cgrp_id))
2897 return;
2898
2899 set_cpus_allowed_ptr(task, current->cpus_ptr);
2900 task->mems_allowed = current->mems_allowed;
2901}
2902
2903struct cgroup_subsys cpuset_cgrp_subsys = {
2904 .css_alloc = cpuset_css_alloc,
2905 .css_online = cpuset_css_online,
2906 .css_offline = cpuset_css_offline,
2907 .css_free = cpuset_css_free,
2908 .can_attach = cpuset_can_attach,
2909 .cancel_attach = cpuset_cancel_attach,
2910 .attach = cpuset_attach,
2911 .post_attach = cpuset_post_attach,
2912 .bind = cpuset_bind,
2913 .fork = cpuset_fork,
2914 .legacy_cftypes = legacy_files,
2915 .dfl_cftypes = dfl_files,
2916 .early_init = true,
2917 .threaded = true,
2918};
2919
2920
2921
2922
2923
2924
2925
2926int __init cpuset_init(void)
2927{
2928 BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
2929
2930 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2931 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2932 BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
2933
2934 cpumask_setall(top_cpuset.cpus_allowed);
2935 nodes_setall(top_cpuset.mems_allowed);
2936 cpumask_setall(top_cpuset.effective_cpus);
2937 nodes_setall(top_cpuset.effective_mems);
2938
2939 fmeter_init(&top_cpuset.fmeter);
2940 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2941 top_cpuset.relax_domain_level = -1;
2942
2943 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2944
2945 return 0;
2946}
2947
2948
2949
2950
2951
2952
2953
2954
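/*
 * remove_tasks_in_empty_cpuset - move tasks out of a cpuset that has lost
 * all CPUs or memory nodes, into the nearest ancestor that still has some.
 */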
2955static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2956{
2957 struct cpuset *parent;
2958
2959
2960
2961
2962
2963 parent = parent_cs(cs);
2964 while (cpumask_empty(parent->cpus_allowed) ||
2965 nodes_empty(parent->mems_allowed))
2966 parent = parent_cs(parent);
2967
2968 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2969 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2970 pr_cont_cgroup_name(cs->css.cgroup);
2971 pr_cont("\n");
2972 }
2973}
2974
2975static void
2976hotplug_update_tasks_legacy(struct cpuset *cs,
2977 struct cpumask *new_cpus, nodemask_t *new_mems,
2978 bool cpus_updated, bool mems_updated)
2979{
2980 bool is_empty;
2981
2982 spin_lock_irq(&callback_lock);
2983 cpumask_copy(cs->cpus_allowed, new_cpus);
2984 cpumask_copy(cs->effective_cpus, new_cpus);
2985 cs->mems_allowed = *new_mems;
2986 cs->effective_mems = *new_mems;
2987 spin_unlock_irq(&callback_lock);
2988
	/*
	 * Don't call update_tasks_cpumask()/update_tasks_nodemask() if
	 * the cpuset becomes empty, as its tasks will be migrated to an
	 * ancestor below.
	 */
2993 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2994 update_tasks_cpumask(cs);
2995 if (mems_updated && !nodes_empty(cs->mems_allowed))
2996 update_tasks_nodemask(cs);
2997
2998 is_empty = cpumask_empty(cs->cpus_allowed) ||
2999 nodes_empty(cs->mems_allowed);
3000
3001 percpu_up_write(&cpuset_rwsem);
3002
	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset, so it has to be done outside of any lock.
	 */
3008 if (is_empty)
3009 remove_tasks_in_empty_cpuset(cs);
3010
3011 percpu_down_write(&cpuset_rwsem);
3012}
3013
3014static void
3015hotplug_update_tasks(struct cpuset *cs,
3016 struct cpumask *new_cpus, nodemask_t *new_mems,
3017 bool cpus_updated, bool mems_updated)
3018{
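	/*
	 * A cpuset emptied by hotplug falls back to its parent's effective
	 * CPUs and memory nodes instead of going empty itself.
	 */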
3019 if (cpumask_empty(new_cpus))
3020 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3021 if (nodes_empty(*new_mems))
3022 *new_mems = parent_cs(cs)->effective_mems;
3023
3024 spin_lock_irq(&callback_lock);
3025 cpumask_copy(cs->effective_cpus, new_cpus);
3026 cs->effective_mems = *new_mems;
3027 spin_unlock_irq(&callback_lock);
3028
3029 if (cpus_updated)
3030 update_tasks_cpumask(cs);
3031 if (mems_updated)
3032 update_tasks_nodemask(cs);
3033}
3034
3035static bool force_rebuild;
3036
3037void cpuset_force_rebuild(void)
3038{
3039 force_rebuild = true;
3040}
3041
/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
3051static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3052{
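	/* static masks: too large for the stack, only used under cpuset_rwsem */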
3053 static cpumask_t new_cpus;
3054 static nodemask_t new_mems;
3055 bool cpus_updated;
3056 bool mems_updated;
3057 struct cpuset *parent;
3058retry:
3059 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3060
3061 percpu_down_write(&cpuset_rwsem);
3062
	/*
	 * We have raced with task attaching.  Wait until attaching is
	 * finished, so we won't attach a task to an empty cpuset.
	 */
3067 if (cs->attach_in_progress) {
3068 percpu_up_write(&cpuset_rwsem);
3069 goto retry;
3070 }
3071
3072 parent = parent_cs(cs);
3073 compute_effective_cpumask(&new_cpus, cs, parent);
3074 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3075
3076 if (cs->nr_subparts_cpus)
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus.
		 */
3081 cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3082
3083 if (!tmp || !cs->partition_root_state)
3084 goto update_tasks;
3085
	/*
	 * In the unlikely event that a partition root has empty
	 * effective_cpus or its parent becomes erroneous, we have to
	 * transition it to the erroneous state.
	 */
3091 if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3092 (parent->partition_root_state == PRS_ERROR))) {
3093 if (cs->nr_subparts_cpus) {
3094 cs->nr_subparts_cpus = 0;
3095 cpumask_clear(cs->subparts_cpus);
3096 compute_effective_cpumask(&new_cpus, cs, parent);
3097 }
3098
		/*
		 * Disable the partition and put this cpuset into the
		 * erroneous state if the parent is already erroneous or
		 * no usable CPU is left after reclaiming the child
		 * partitions' CPUs.
		 */
3105 if ((parent->partition_root_state == PRS_ERROR) ||
3106 cpumask_empty(&new_cpus)) {
3107 update_parent_subparts_cpumask(cs, partcmd_disable,
3108 NULL, tmp);
3109 cs->partition_root_state = PRS_ERROR;
3110 }
3111 cpuset_force_rebuild();
3112 }
3113
	/*
	 * On the other hand, an erroneous partition root may be transitioned
	 * back to a regular one or a partition root with no CPU allocated
	 * from the parent may change to erroneous.
	 */
3119 if (is_partition_root(parent) &&
3120 ((cs->partition_root_state == PRS_ERROR) ||
3121 !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3122 update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3123 cpuset_force_rebuild();
3124
3125update_tasks:
3126 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3127 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3128
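	/*
	 * On the default hierarchy the cpuset keeps running on its parent's
	 * resources; on the legacy hierarchy an emptied cpuset has its tasks
	 * moved to an ancestor instead.
	 */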
3129 if (is_in_v2_mode())
3130 hotplug_update_tasks(cs, &new_cpus, &new_mems,
3131 cpus_updated, mems_updated);
3132 else
3133 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
3134 cpus_updated, mems_updated);
3135
3136 percpu_up_write(&cpuset_rwsem);
3137}
3138
/**
 * cpuset_hotplug_workfn - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
3155static void cpuset_hotplug_workfn(struct work_struct *work)
3156{
3157 static cpumask_t new_cpus;
3158 static nodemask_t new_mems;
3159 bool cpus_updated, mems_updated;
3160 bool on_dfl = is_in_v2_mode();
3161 struct tmpmasks tmp, *ptmp = NULL;
3162
3163 if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3164 ptmp = &tmp;
3165
3166 percpu_down_write(&cpuset_rwsem);
3167
	/* fetch the available cpus/mems and find out which changed how */
3169 cpumask_copy(&new_cpus, cpu_active_mask);
3170 new_mems = node_states[N_MEMORY];
3171
	/*
	 * If subparts_cpus is populated, it is likely that the check below
	 * will produce a false positive on cpus_updated when the cpu list
	 * isn't changed.  It is extra work, but it is better to be safe.
	 */
3177 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
3178 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3179
	/* synchronize cpus_allowed to cpu_active_mask */
3181 if (cpus_updated) {
3182 spin_lock_irq(&callback_lock);
3183 if (!on_dfl)
3184 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus.  If no CPU is left,
		 * we clear the subparts_cpus & let the child partitions
		 * fight for the CPUs again.
		 */
3191 if (top_cpuset.nr_subparts_cpus) {
3192 if (cpumask_subset(&new_cpus,
3193 top_cpuset.subparts_cpus)) {
3194 top_cpuset.nr_subparts_cpus = 0;
3195 cpumask_clear(top_cpuset.subparts_cpus);
3196 } else {
3197 cpumask_andnot(&new_cpus, &new_cpus,
3198 top_cpuset.subparts_cpus);
3199 }
3200 }
3201 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3202 spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
3204 }
3205
	/* synchronize mems_allowed to N_MEMORY */
3207 if (mems_updated) {
3208 spin_lock_irq(&callback_lock);
3209 if (!on_dfl)
3210 top_cpuset.mems_allowed = new_mems;
3211 top_cpuset.effective_mems = new_mems;
3212 spin_unlock_irq(&callback_lock);
3213 update_tasks_nodemask(&top_cpuset);
3214 }
3215
3216 percpu_up_write(&cpuset_rwsem);
3217
	/* if cpus or mems changed, we need to propagate to descendants */
3219 if (cpus_updated || mems_updated) {
3220 struct cpuset *cs;
3221 struct cgroup_subsys_state *pos_css;
3222
3223 rcu_read_lock();
3224 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3225 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3226 continue;
3227 rcu_read_unlock();
3228
3229 cpuset_hotplug_update_tasks(cs, ptmp);
3230
3231 rcu_read_lock();
3232 css_put(&cs->css);
3233 }
3234 rcu_read_unlock();
3235 }
3236
	/* rebuild sched domains if cpus_allowed has changed */
3238 if (cpus_updated || force_rebuild) {
3239 force_rebuild = false;
3240 rebuild_sched_domains();
3241 }
3242
3243 free_cpumasks(NULL, ptmp);
3244}
3245
3246void cpuset_update_active_cpus(void)
3247{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 */
3253 schedule_work(&cpuset_hotplug_work);
3254}
3255
3256void cpuset_wait_for_hotplug(void)
3257{
3258 flush_work(&cpuset_hotplug_work);
3259}
3260
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
3266static int cpuset_track_online_nodes(struct notifier_block *self,
3267 unsigned long action, void *arg)
3268{
3269 schedule_work(&cpuset_hotplug_work);
3270 return NOTIFY_OK;
3271}
3272
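/* invoked via the memory hotplug notifier chain, see cpuset_init_smp() */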
3273static struct notifier_block cpuset_track_online_nodes_nb = {
3274 .notifier_call = cpuset_track_online_nodes,
3275 .priority = 10,
3276};
3277
/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized.
 */
3283void __init cpuset_init_smp(void)
3284{
3285 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
3286 top_cpuset.mems_allowed = node_states[N_MEMORY];
3287 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3288
3289 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3290 top_cpuset.effective_mems = node_states[N_MEMORY];
3291
3292 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
3293
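	/* ordered workqueue on which deferred mm migration work is queued */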
3294 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3295 BUG_ON(!cpuset_migrate_mm_wq);
3296}
3297
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/
3309void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3310{
3311 unsigned long flags;
3312
3313 spin_lock_irqsave(&callback_lock, flags);
3314 rcu_read_lock();
3315 guarantee_online_cpus(task_cs(tsk), pmask);
3316 rcu_read_unlock();
3317 spin_unlock_irqrestore(&callback_lock, flags);
3318}
3319
/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed.  In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 **/
3332void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3333{
3334 rcu_read_lock();
3335 do_set_cpus_allowed(tsk, is_in_v2_mode() ?
3336 task_cs(tsk)->cpus_allowed : cpu_possible_mask);
3337 rcu_read_unlock();
3338
	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed.  However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
3356}
3357
3358void __init cpuset_init_current_mems_allowed(void)
3359{
3360 nodes_setall(current->mems_allowed);
3361}
3362
3363
/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
3373nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
3374{
3375 nodemask_t mask;
3376 unsigned long flags;
3377
3378 spin_lock_irqsave(&callback_lock, flags);
3379 rcu_read_lock();
3380 guarantee_online_mems(task_cs(tsk), &mask);
3381 rcu_read_unlock();
3382 spin_unlock_irqrestore(&callback_lock, flags);
3383
3384 return mask;
3385}
3386
3387
/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
3393int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
3394{
3395 return nodes_intersects(*nodemask, current->mems_allowed);
3396}
3397
3398
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
3404static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
3405{
3406 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
3407 cs = parent_cs(cs);
3408 return cs;
3409}
3410
3411
/*
 * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and
 * this node is set in the nearest hardwalled cpuset ancestor to current's
 * cpuset, yes.  If current has access to memory reserves as an oom victim,
 * yes.  Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, and do not
 * allow allocations outside the current task's cpuset unless the task has
 * been OOM killed.  GFP_KERNEL allocations are not so marked, so can escape
 * to the nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The __alloc_pages()
 * routine only calls here with the __GFP_HARDWALL bit _not_ set if it's a
 * GFP_KERNEL allocation and all nodes in the current task's mems_allowed
 * came up empty on the first pass over the zonelist.  So only GFP_KERNEL
 * allocations, if all nodes in the cpuset are short of memory, might
 * require taking the callback_lock.
 *
 * The combined effect of the checks below is that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	tsk_is_oom_victim - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 */
3451bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
3452{
3453 struct cpuset *cs;
3454 int allowed;
3455 unsigned long flags;
3456
3457 if (in_interrupt())
3458 return true;
3459 if (node_isset(node, current->mems_allowed))
3460 return true;
3461
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
3465 if (unlikely(tsk_is_oom_victim(current)))
3466 return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;
3469
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;
3472
	/* Not hardwall and node nearly allowed so try to allocate */
3474 spin_lock_irqsave(&callback_lock, flags);
3475
3476 rcu_read_lock();
3477 cs = nearest_hardwall_ancestor(task_cs(current));
3478 allowed = node_isset(node, cs->mems_allowed);
3479 rcu_read_unlock();
3480
3481 spin_unlock_irqrestore(&callback_lock, flags);
3482 return allowed;
3483}
3484
3485
/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
3512static int cpuset_spread_node(int *rotor)
3513{
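	/* advance the per-task rotor to the next node in mems_allowed and return it */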
3514 return *rotor = next_node_in(*rotor, current->mems_allowed);
3515}
3516
3517int cpuset_mem_spread_node(void)
3518{
3519 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
3520 current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
3524}
3525
3526int cpuset_slab_spread_node(void)
3527{
3528 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
3529 current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
3533}
3534
3535EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
3536
3537
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/
3548int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
3549 const struct task_struct *tsk2)
3550{
3551 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
3552}
3553
3554
/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
3560void cpuset_print_current_mems_allowed(void)
3561{
3562 struct cgroup *cgrp;
3563
3564 rcu_read_lock();
3565
3566 cgrp = task_cs(current)->css.cgroup;
3567 pr_cont(",cpuset=");
3568 pr_cont_cgroup_name(cgrp);
3569 pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));
3571
3572 rcu_read_unlock();
3573}
3574
/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
3581int cpuset_memory_pressure_enabled __read_mostly;
3582
/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */
3601void __cpuset_memory_pressure_bump(void)
3602{
3603 rcu_read_lock();
3604 fmeter_markevent(&task_cs(current)->fmeter);
3605 rcu_read_unlock();
3606}
3607
3608#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */
3618int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
3619 struct pid *pid, struct task_struct *tsk)
3620{
3621 char *buf;
3622 struct cgroup_subsys_state *css;
3623 int retval;
3624
3625 retval = -ENOMEM;
3626 buf = kmalloc(PATH_MAX, GFP_KERNEL);
3627 if (!buf)
3628 goto out;
3629
3630 css = task_get_css(tsk, cpuset_cgrp_id);
3631 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
3632 current->nsproxy->cgroup_ns);
3633 css_put(css);
3634 if (retval >= PATH_MAX)
3635 retval = -ENAMETOOLONG;
3636 if (retval < 0)
3637 goto out_free;
3638 seq_puts(m, buf);
3639 seq_putc(m, '\n');
3640 retval = 0;
3641out_free:
3642 kfree(buf);
3643out:
3644 return retval;
3645}
3646#endif
3647
/* Display task mems_allowed in /proc/<pid>/status file. */
3649void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
3650{
3651 seq_printf(m, "Mems_allowed:\t%*pb\n",
3652 nodemask_pr_args(&task->mems_allowed));
3653 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
3654 nodemask_pr_args(&task->mems_allowed));
3655}
3656