/*
 *  kernel/cgroup/cpuset.c
 *
 *  Processor and memory placement constraints for sets of tasks
 *  (cpusets), implemented as a cgroup controller.
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See the "Frequency meter" comments further below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or modify of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * User-configured CPUs and memory nodes, and the effective
	 * masks actually granted after applying ancestor constraints
	 * and CPU/memory hotplug.  The effective masks are always
	 * subsets of what the hierarchy above allows.
	 */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and memory nodes inside this cpuset */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * CPUs allocated to child sub-partitions (default hierarchy
	 * only).  Always a subset of cpus_allowed and disjoint from
	 * effective_cpus.
	 */
	cpumask_var_t subparts_cpus;

	/*
	 * The memory nodes the tasks in this cpuset last had; used to
	 * detect node changes across attach and hotplug.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and
	 * ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of CPUs in subparts_cpus */
	int nr_subparts_cpus;

	/* partition root state (see the PRS_* constants below) */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus  - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;
};

/*
 * Partition root states:
 *
 *   0 - not a partition root
 *   1 - partition root
 *  -1 - invalid partition root
 *
 * A partition root that can no longer satisfy its constraints (e.g. all
 * of its CPUs have gone offline) drops into the -1 (PRS_ERROR) state
 * until the condition clears.
 */
#define PRS_DISABLED		0
#define PRS_ENABLED		1
#define PRS_ERROR		-1

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid having to deal with stack allocation and
 * deallocation on each call.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_root(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ENABLED,
};
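
/*
 * For orientation, a minimal and purely illustrative cgroup v2 session
 * that exercises this controller might look like (paths and mask values
 * are hypothetical):
 *
 *	# echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control
 *	# mkdir /sys/fs/cgroup/A
 *	# echo 0-3 > /sys/fs/cgroup/A/cpuset.cpus
 *	# echo 0 > /sys/fs/cgroup/A/cpuset.mems
 *	# echo $$ > /sys/fs/cgroup/A/cgroup.procs
 *
 * The file names correspond to the dfl_files[] table defined later in
 * this file.
 */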

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walks @parent_cs' online children.  Must be called under rcu_read_lock().
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk the descendants of
 *
 * Walks @root_cs' online descendants in pre-order.  Must be called under
 * rcu_read_lock().  A subtree can be skipped by advancing @pos_css with
 * css_rightmost_descendant(), as several callers below do.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_rwsem and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.
 *
 * A task must hold both locks to modify cpusets.  Any code path that only
 * needs to read cpuset state may take either lock: holding cpuset_rwsem
 * (even for read) keeps the cpusets stable, while callback_lock guards the
 * short read sections used from hotter paths.  callback_lock nests inside
 * cpuset_rwsem, never the other way around, and because it can be taken
 * from interrupt-handling paths it must be acquired with interrupts
 * disabled from process context.
 */

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);

void cpuset_read_lock(void)
{
	percpu_down_read(&cpuset_rwsem);
}

void cpuset_read_unlock(void)
{
	percpu_up_read(&cpuset_rwsem);
}

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously
 * from a workqueue to avoid lock ordering problems.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Cgroup v2 behavior is used on the default hierarchy and on the
 * "cpuset_v2_mode" legacy hierarchy.  With v2 behavior, hotplug events
 * only modify the effective masks and leave the user-configured masks
 * untouched, so the original configuration is preserved.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
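
/*
 * Illustrative only: on a v1 (legacy) hierarchy, the v2 behavior above
 * can be requested at mount time with something like
 *
 *	# mount -t cgroup -o cpuset,cpuset_v2_mode none /sys/fs/cgroup/cpuset
 *
 * The mount point is hypothetical; the option corresponds to the
 * CGRP_ROOT_CPUSET_V2_MODE flag tested in is_in_v2_mode().
 */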

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that is online.
 * If no ancestor (walking up toward top_cpuset) has any online CPUs in
 * its effective mask, fall back to cpu_online_mask.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that are
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * Update a task's page/slab spread flags to match its cpuset's settings.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * memory nodes are a subset of the other's, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_rwsem.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs:  the cpuset that has three cpumasks to be allocated
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->subparts_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	return 0;

free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset or tmpmasks structure
 * @cs:  the cpuset whose cpumasks are to be freed
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->subparts_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	/* kmemdup() copied stale mask pointers; give the trial its own copies */
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset (cur)
 * with those values in the trial cpuset (trial), would our various
 * subset and exclusive rules still be valid?  Presumes cpuset_rwsem held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations such as
 * list traversal that depend on the actual address of the cpuset in the
 * list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with perhaps
 * one or more of the fields cpus_allowed, mems_allowed, or flags changed
 * to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
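
/*
 * A concrete (hypothetical) failure case for the exclusivity rule above:
 * siblings A and B under the same parent, A with cpuset.cpus = 0-3 and
 * cpu_exclusive set.  A write of "2-5" to B's cpuset.cpus builds a trial
 * cpuset that intersects A's mask, so validate_change() returns -EINVAL
 * and the write is rejected.
 */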

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs (the set
 * of non-overlapping cpumask_var_ts in the array doms[]) and passes it
 * back, along with an array of sched domain attributes, to the caller,
 * which in turn hands them to the scheduler via partition_sched_domains().
 * CPUs in no cpuset marked for load balancing end up in no sched domain.
 *
 * Finding the best partition is a two-pass process: first collect the
 * set of "relevant" cpusets (load-balanced partition roots and, on
 * legacy hierarchies, load-balanced cpusets with CPUs); then repeatedly
 * merge any two whose effective CPUs overlap until a fixed point is
 * reached.  Each surviving equivalence class becomes one sched domain.
 *
 * Must be called with cpuset_rwsem held.
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The latter:
		 * all child cpusets contain a subset of the parent's cpus,
		 * so just skip them, then let update_domain_attr_tree()
		 * calculate relax_domain_level of the corresponding sched
		 * domain.
		 *
		 * If root is load-balancing, we can skip @cp if it is a
		 * subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
			continue;

		if (root_load_balance &&
		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree if not a partition root */
		if (!is_partition_root(cp))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
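
/*
 * Worked example with hypothetical masks: given three candidate cpusets
 * A (effective cpus 0-1), B (1-2) and C (4-5), the merge loop above folds
 * A and B into one partition because their masks overlap on CPU 1, while
 * C stays separate.  The result is ndoms == 2 with doms[] = { 0-2, 4-5 }.
 */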

static void update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

static void rebuild_root_domains(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;

	percpu_rwsem_assert_held(&cpuset_rwsem);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	/*
	 * Clear default root domain DL accounting, it will be computed again
	 * if a task belongs to it.
	 */
	dl_clear_root_domain(&def_root_domain);

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	rebuild_root_domains();
	mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_rwsem held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	struct cpuset *cs;
	int ndoms;

	lockdep_assert_cpus_held();
	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only
	 * top_cpuset is enough to detect racing CPU offlines.
	 */
	if (!top_cpuset.nr_subparts_cpus &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/*
	 * With subpartition CPUs, however, the effective CPUs of a
	 * partition root should only be a subset of the active CPUs.
	 * Since a CPU in any partition root could be offlined, all must
	 * be checked.
	 */
	if (top_cpuset.nr_subparts_cpus) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_root(cs)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
				rcu_read_unlock();
				return;
			}
		}
		rcu_read_unlock();
	}

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	rebuild_sched_domains_locked();
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset whose effective_cpus is being recomputed
 * @parent: the parent cpuset
 *
 * If the parent has subpartition CPUs, include them in the list of
 * allowable CPUs in computing the new effective_cpus mask.  Since offlined
 * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
 * to mask those out.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	if (parent->nr_subparts_cpus) {
		cpumask_or(new_cpus, parent->effective_cpus,
			   parent->subparts_cpus);
		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
	} else {
		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
	}
}

/*
 * Commands for update_parent_subparts_cpumask
 */
enum subparts_cmd {
	partcmd_enable,		/* Enable partition root */
	partcmd_disable,	/* Disable partition root */
	partcmd_update,		/* Update parent's subparts_cpus */
};
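
/*
 * Illustrative (hypothetical) partcmd_enable example: parent P has
 * effective_cpus = 0-7 and child C has cpus_allowed = 0-3.  Turning C
 * into a partition root moves CPUs 0-3 into P->subparts_cpus and leaves
 * P->effective_cpus = 4-7, so C's CPUs are no longer used by tasks in P.
 * partcmd_disable reverses this; partcmd_update reconciles the masks
 * after C's cpus_allowed changes.
 */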

/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root.  The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root.  Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus are taken back and returned to parent's
 * effective_cpus.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list
 * is to be changed from cpus_allowed to newmask.  Otherwise, cpus_allowed
 * is assumed to remain the same.  The cpuset should either be a partition
 * root or an invalid partition root, and the partition root state may
 * change as a result.
 *
 * The function returns 1 if changes to parent's subparts_cpus and
 * effective_cpus happen (so the caller must update the parent's tasks and
 * may need to rebuild sched domains), 0 if nothing needed to be done, or
 * an error code on failure.  Called with cpuset_rwsem held.
 */
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
					  struct cpumask *newmask,
					  struct tmpmasks *tmp)
{
	struct cpuset *parent = parent_cs(cpuset);
	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
	bool part_error = false;	/* Partition error? */

	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * The parent must be a partition root.
	 * The new cpumask, if present, or the current cpus_allowed must
	 * not be empty.
	 */
	if (!is_partition_root(parent) ||
	   (newmask && cpumask_empty(newmask)) ||
	   (!newmask && cpumask_empty(cpuset->cpus_allowed)))
		return -EINVAL;

	/*
	 * Enabling/disabling partition root is not allowed if there are
	 * online children.
	 */
	if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
		return -EBUSY;

	/*
	 * Enabling partition root is not allowed if not all the CPUs can
	 * be granted from parent's effective_cpus or at least one CPU
	 * will be left after that.
	 */
	if ((cmd == partcmd_enable) &&
	   (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
	     cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
		return -EINVAL;

	/*
	 * A cpumask update cannot make parent's effective_cpus become empty.
	 */
	adding = deleting = false;
	if (cmd == partcmd_enable) {
		cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
		adding = true;
	} else if (cmd == partcmd_disable) {
		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				       parent->subparts_cpus);
	} else if (newmask) {
		/*
		 * partcmd_update with newmask:
		 *
		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
		 * addmask = newmask & parent->effective_cpus
		 *		     & ~parent->subparts_cpus
		 */
		cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
		deleting = cpumask_and(tmp->delmask, tmp->delmask,
				       parent->subparts_cpus);

		cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
					parent->subparts_cpus);
		/*
		 * Return error if the new effective_cpus could become empty.
		 */
		if (adding &&
		    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
			if (!deleting)
				return -EINVAL;
			/*
			 * As some of the CPUs in subparts_cpus might have
			 * been offlined, we need to compute the real delmask
			 * to confirm that.
			 */
			if (!cpumask_and(tmp->addmask, tmp->delmask,
					 cpu_active_mask))
				return -EINVAL;
			cpumask_copy(tmp->addmask, parent->effective_cpus);
		}
	} else {
		/*
		 * partcmd_update w/o newmask:
		 *
		 * addmask = cpus_allowed & parent->effective_cpus
		 *
		 * Note that parent's subparts_cpus may have been
		 * pre-shrunk in case there is a change in the cpu list.
		 * So no deletion is needed.
		 */
		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
				     parent->effective_cpus);
		part_error = cpumask_equal(tmp->addmask,
					   parent->effective_cpus);
	}

	if (cmd == partcmd_update) {
		int prev_prs = cpuset->partition_root_state;

		/*
		 * Check for possible transition between PRS_ENABLED
		 * and PRS_ERROR.
		 */
		switch (cpuset->partition_root_state) {
		case PRS_ENABLED:
			if (part_error)
				cpuset->partition_root_state = PRS_ERROR;
			break;
		case PRS_ERROR:
			if (!part_error)
				cpuset->partition_root_state = PRS_ENABLED;
			break;
		}
		/*
		 * Set part_error if previously in invalid state.
		 */
		part_error = (prev_prs == PRS_ERROR);
	}

	if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
		return 0;	/* Nothing needs to be done */

	if (cpuset->partition_root_state == PRS_ERROR) {
		/*
		 * Remove all its cpus from parent's subparts_cpus.
		 */
		adding = false;
		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				       parent->subparts_cpus);
	}

	if (!adding && !deleting)
		return 0;

	/*
	 * Change the parent's subparts_cpus.
	 * Newly added CPUs will be removed from effective_cpus and
	 * newly deleted ones will be added back to effective_cpus.
	 */
	spin_lock_irq(&callback_lock);
	if (adding) {
		cpumask_or(parent->subparts_cpus,
			   parent->subparts_cpus, tmp->addmask);
		cpumask_andnot(parent->effective_cpus,
			       parent->effective_cpus, tmp->addmask);
	}
	if (deleting) {
		cpumask_andnot(parent->subparts_cpus,
			       parent->subparts_cpus, tmp->delmask);
		/*
		 * Some of the CPUs in subparts_cpus might have been offlined.
		 */
		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
		cpumask_or(parent->effective_cpus,
			   parent->effective_cpus, tmp->delmask);
	}

	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
	spin_unlock_irq(&callback_lock);

	return cmd == partcmd_update;
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_rwsem held.
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		compute_effective_cpumask(tmp->new_cpus, cp, parent);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
			if (!cp->use_parent_ecpus) {
				cp->use_parent_ecpus = true;
				parent->child_ecpus_count++;
			}
		} else if (cp->use_parent_ecpus) {
			cp->use_parent_ecpus = false;
			WARN_ON_ONCE(!parent->child_ecpus_count);
			parent->child_ecpus_count--;
		}

		/*
		 * Skip the whole subtree if the cpumask remains the same
		 * and has no partition root state.
		 */
		if (!cp->partition_root_state &&
		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		/*
		 * update_parent_subparts_cpumask() should have been called
		 * for cs already in update_cpumask().  We should also call
		 * update_tasks_cpumask() again for tasks in the parent
		 * cpuset if the parent's subparts_cpus changes.
		 */
		if ((cp != cs) && cp->partition_root_state) {
			switch (parent->partition_root_state) {
			case PRS_DISABLED:
				/*
				 * If parent is not a partition root or an
				 * invalid partition root, clear the state
				 * and the CS_CPU_EXCLUSIVE flag.
				 */
				WARN_ON_ONCE(cp->partition_root_state
					     != PRS_ERROR);
				cp->partition_root_state = 0;

				/*
				 * clear_bit() is an atomic operation and
				 * readers aren't interested in the state
				 * of the CS_CPU_EXCLUSIVE bit.  So we can
				 * just update the flag without holding
				 * the callback_lock.
				 */
				clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
				break;

			case PRS_ENABLED:
				if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
					update_tasks_cpumask(parent);
				break;

			case PRS_ERROR:
				/*
				 * When parent is invalid, it has to be too.
				 */
				cp->partition_root_state = PRS_ERROR;
				if (cp->nr_subparts_cpus) {
					cp->nr_subparts_cpus = 0;
					cpumask_clear(cp->subparts_cpus);
				}
				break;
			}
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);

		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		if (cp->nr_subparts_cpus &&
		   (cp->partition_root_state != PRS_ENABLED)) {
			cp->nr_subparts_cpus = 0;
			cpumask_clear(cp->subparts_cpus);
		} else if (cp->nr_subparts_cpus) {
			/*
			 * Make sure that effective_cpus & subparts_cpus
			 * are mutually exclusive.
			 *
			 * In the unlikely event that effective_cpus
			 * becomes empty, we clear cp->nr_subparts_cpus and
			 * let its child partition roots compete for the
			 * CPUs again.
			 */
			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
				       cp->subparts_cpus);
			if (cpumask_empty(cp->effective_cpus)) {
				cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				cpumask_clear(cp->subparts_cpus);
				cp->nr_subparts_cpus = 0;
			} else if (!cpumask_subset(cp->subparts_cpus,
						   tmp->new_cpus)) {
				cpumask_andnot(cp->subparts_cpus,
					cp->subparts_cpus, tmp->new_cpus);
				cp->nr_subparts_cpus
					= cpumask_weight(cp->subparts_cpus);
			}
		}
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * On legacy hierarchy, if the effective cpumask of any non-
		 * empty cpuset is changed, we need to rebuild sched domains.
		 * On default hierarchy, the cpuset needs to be a partition
		 * root as well.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp) &&
		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
		    is_partition_root(cp)))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}
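
/*
 * Example of the propagation above (hypothetical masks): shrinking a
 * parent's cpuset.cpus from 0-7 to 0-3 re-runs compute_effective_cpumask()
 * for every descendant, so a child with cpus_allowed = 2-5 sees its
 * effective_cpus contract to 2-3 and its tasks are migrated off CPUs 4-5
 * by update_tasks_cpumask().
 */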

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp)
{
	struct cpuset *sibling;
	struct cgroup_subsys_state *pos_css;

	/*
	 * Check all its siblings and call update_cpumasks_hier()
	 * if their use_parent_ecpus flag is set in order for them
	 * to use the right effective_cpus value.
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, pos_css, parent) {
		if (sibling == cs)
			continue;
		if (!sibling->use_parent_ecpus)
			continue;

		update_cpumasks_hier(sibling, tmp);
	}
	rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;
	struct tmpmasks tmp;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

#ifdef CONFIG_CPUMASK_OFFSTACK
	/*
	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
	 * to allocated cpumasks.
	 */
	tmp.addmask  = trialcs->subparts_cpus;
	tmp.delmask  = trialcs->effective_cpus;
	tmp.new_cpus = trialcs->cpus_allowed;
#endif

	if (cs->partition_root_state) {
		/* Cpumask of a partition root cannot be empty */
		if (cpumask_empty(trialcs->cpus_allowed))
			return -EINVAL;
		if (update_parent_subparts_cpumask(cs, partcmd_update,
					trialcs->cpus_allowed, &tmp) < 0)
			return -EINVAL;
	}

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

	/*
	 * Make sure that subparts_cpus stays a subset of cpus_allowed by
	 * keeping only the CPUs that are still allowed.  (The original
	 * code used cpumask_andnot() here, which kept exactly the wrong
	 * set of CPUs.)
	 */
	if (cs->nr_subparts_cpus) {
		cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
			    cs->cpus_allowed);
		cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
	}
	spin_unlock_irq(&callback_lock);

	update_cpumasks_hier(cs, &tmp);

	if (cs->partition_root_state) {
		struct cpuset *parent = parent_cs(cs);

		/*
		 * For partition root, update the cpumasks of sibling
		 * cpusets if they use parent's effective_cpus.
		 */
		if (parent->child_ecpus_count)
			update_sibling_cpumasks(parent, cs, &tmp);
	}
	return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *mwork;

	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqcount to safely update both
 * tsk->mems_allowed and the task's mempolicy.  If the task is allocating
 * in parallel, it might temporarily see an empty intersection, which
 * results in a seqcount check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}
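
/*
 * Note on the ordering above: the union (nodes_or) is written before the
 * final assignment so that a concurrent reader of tsk->mems_allowed
 * between the two stores still sees a superset of both the old and new
 * masks rather than a transiently empty one; the seqcount then forces
 * such readers to retry if they raced with the update.
 */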

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_rwsem */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;	/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_rwsem, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_rwsem held.
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (is_in_v2_mode() && nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement of a cpuset.
 * Needs to validate the request, update the cpuset's mems_allowed, and
 * for each task in the cpuset, update mems_allowed and rebind the task's
 * mempolicy and any vma mempolicies; if the cpuset is marked
 * 'memory_migrate', migrate the tasks' pages to the new memory.
 *
 * Call with cpuset_rwsem held.  May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/* use trialcs->mems_allowed as a temp variable */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
	return retval;
}

bool current_cpuset_is_being_rebound(void)
{
	bool ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_rwsem held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_rwsem held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs);
out:
	free_cpuset(trialcs);
	return err;
}

/*
 * update_prstate - update partition_root_state
 * cs:	the cpuset to update
 * val: 0 - disabled, 1 - enabled
 *
 * Call with cpuset_rwsem held.
 */
static int update_prstate(struct cpuset *cs, int val)
{
	int err;
	struct cpuset *parent = parent_cs(cs);
	struct tmpmasks tmp;

	if ((val != 0) && (val != 1))
		return -EINVAL;
	if (val == cs->partition_root_state)
		return 0;

	/*
	 * Cannot force a partial or invalid partition root to a full
	 * partition root.
	 */
	if (val && cs->partition_root_state)
		return -EINVAL;

	if (alloc_cpumasks(NULL, &tmp))
		return -ENOMEM;

	err = -EINVAL;
	if (!cs->partition_root_state) {
		/*
		 * Turning on partition root requires setting the
		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
		 * cannot be NULL.
		 */
		if (cpumask_empty(cs->cpus_allowed))
			goto out;

		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
		if (err)
			goto out;

		err = update_parent_subparts_cpumask(cs, partcmd_enable,
						     NULL, &tmp);
		if (err) {
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
			goto out;
		}
		cs->partition_root_state = PRS_ENABLED;
	} else {
		/*
		 * Turning off partition root will clear the
		 * CS_CPU_EXCLUSIVE bit.
		 */
		if (cs->partition_root_state == PRS_ERROR) {
			cs->partition_root_state = 0;
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
			err = 0;
			goto out;
		}

		err = update_parent_subparts_cpumask(cs, partcmd_disable,
						     NULL, &tmp);
		if (err)
			goto out;

		cs->partition_root_state = 0;

		/* Turning off CS_CPU_EXCLUSIVE will not return error */
		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
	}

	/*
	 * Update cpumask of parent's tasks except when it is the top
	 * cpuset as some system daemons cannot be mapped to other CPUs.
	 */
	if (parent != &top_cpuset)
		update_tasks_cpumask(parent);

	if (parent->child_ecpus_count)
		update_sibling_cpumasks(parent, cs, &tmp);

	rebuild_sched_domains_locked();
out:
	free_cpumasks(NULL, &tmp);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * The filter works on the number of events marked per unit time.
 * It is a single-pole low-pass recursive (IIR) filter with a one
 * second time unit.  Arithmetic is done using 32-bit integers scaled
 * to simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from fmeter_getrate() will be
 * cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */
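
/*
 * Worked example of the decay arithmetic (hypothetical numbers): with
 * val == 1000 and no events for one tick, fmeter_update() computes
 * val = (933 * 1000) / 1000 = 933.  After ten such ticks,
 * val ~= 1000 * 0.933^10 ~= 500, i.e. the advertised 10-second
 * half-life at one tick per second.
 */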

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!is_in_v2_mode() &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	cgroup_taskset_for_each(task, css, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	return ret;
}

static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;

	cgroup_taskset_first(tset, &css);

	percpu_down_write(&cpuset_rwsem);
	css_cs(css)->attach_in_progress--;
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_rwsem */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		/*
		 * can_attach beforehand should guarantee that we don't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders.  This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same with mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodesets that we
			 * migrate mm from.
			 */
			if (is_memory_migrate(cs))
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
			else
				mmput(mm);
		}
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	percpu_up_write(&cpuset_rwsem);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_SUBPARTS_CPULIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_PARTITION_ROOT,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	return retval;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_rwsem anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_cpuset(trialcs);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer larger than the maximum needed.  If the read were split across
 * multiple calls, the mask could change between them and the pieces
 * would not form a consistent snapshot.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	case FILE_SUBPARTS_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int sched_partition_show(struct seq_file *seq, void *v)
{
	struct cpuset *cs = css_cs(seq_css(seq));

	switch (cs->partition_root_state) {
	case PRS_ENABLED:
		seq_puts(seq, "root\n");
		break;
	case PRS_DISABLED:
		seq_puts(seq, "member\n");
		break;
	case PRS_ERROR:
		seq_puts(seq, "root invalid\n");
		break;
	}
	return 0;
}

static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	int val;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
	 */
	if (!strcmp(buf, "root"))
		val = PRS_ENABLED;
	else if (!strcmp(buf, "member"))
		val = PRS_DISABLED;
	else
		return -EINVAL;

	css_get(&cs->css);
	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	retval = update_prstate(cs, val);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	css_put(&cs->css);
	return retval ?: nbytes;
}
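
/*
 * Illustrative only: from userspace, a cgroup is turned into a partition
 * root (or back into a plain member) by writing to this file, e.g.:
 *
 *	# echo root > /sys/fs/cgroup/A/cpuset.cpus.partition
 *	# cat /sys/fs/cgroup/A/cpuset.cpus.partition
 *	root
 *
 * "root invalid" is only ever reported, never written; it corresponds to
 * the PRS_ERROR state handled in sched_partition_show() above.
 */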

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype legacy_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
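
/*
 * On legacy (v1) hierarchies the cgroup core prefixes these names with
 * the controller name, so "cpus" above appears to userspace as
 * "cpuset.cpus", "mems" as "cpuset.mems", and so on.
 */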

/*
 * This is currently a minimal set for the default hierarchy.  It can be
 * expanded later on by migrating more features and control files from v1.
 */
static struct cftype dfl_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "mems.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpus.partition",
		.seq_show = sched_partition_show,
		.write = sched_partition_write,
		.private = FILE_PARTITION_ROOT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.subpartitions",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_SUBPARTS_CPULIST,
		.flags = CFTYPE_DEBUG,
	},

	{ }	/* terminate */
};

/*
 * cpuset_css_alloc - allocate a cpuset css
 * @parent_css: the parent css; NULL when setting up top_cpuset
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *cs;

	if (!parent_css)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);

	if (alloc_cpumasks(cs, NULL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	nodes_clear(cs->mems_allowed);
	nodes_clear(cs->effective_mems);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	return &cs->css;
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	if (!parent)
		return 0;

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	cpuset_inc();

	spin_lock_irq(&callback_lock);
	if (is_in_v2_mode()) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
		cs->use_parent_ecpus = true;
		parent->child_ecpus_count++;
	}
	spin_unlock_irq(&callback_lock);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
	return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().  That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition' off.
 */
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	get_online_cpus();
	percpu_down_write(&cpuset_rwsem);

	if (is_partition_root(cs))
		update_prstate(cs, 0);

	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	if (cs->use_parent_ecpus) {
		struct cpuset *parent = parent_cs(cs);

		cs->use_parent_ecpus = false;
		parent->child_ecpus_count--;
	}

	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	percpu_up_write(&cpuset_rwsem);
	put_online_cpus();
}

static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	free_cpuset(cs);
}

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
	percpu_down_write(&cpuset_rwsem);
	spin_lock_irq(&callback_lock);

	if (is_in_v2_mode()) {
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
		top_cpuset.mems_allowed = node_possible_map;
	} else {
		cpumask_copy(top_cpuset.cpus_allowed,
			     top_cpuset.effective_cpus);
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
	}

	spin_unlock_irq(&callback_lock);
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{
	if (task_css_is_root(task, cpuset_cgrp_id))
		return;

	set_cpus_allowed_ptr(task, current->cpus_ptr);
	task->mems_allowed = current->mems_allowed;
}

struct cgroup_subsys cpuset_cgrp_subsys = {
	.css_alloc	= cpuset_css_alloc,
	.css_online	= cpuset_css_online,
	.css_offline	= cpuset_css_offline,
	.css_free	= cpuset_css_free,
	.can_attach	= cpuset_can_attach,
	.cancel_attach	= cpuset_cancel_attach,
	.attach		= cpuset_attach,
	.post_attach	= cpuset_post_attach,
	.bind		= cpuset_bind,
	.fork		= cpuset_fork,
	.legacy_cftypes	= legacy_files,
	.dfl_cftypes	= dfl_files,
	.early_init	= true,
	.threaded	= true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 **/
int __init cpuset_init(void)
{
	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));

	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);
	cpumask_setall(top_cpuset.effective_cpus);
	nodes_setall(top_cpuset.effective_mems);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

	return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}
2965
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        bool is_empty;

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        spin_unlock_irq(&callback_lock);

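        /*
         * Don't call update_tasks_cpumask() if the cpuset becomes empty,
         * as the tasks will be migrated to an ancestor.
         */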
        if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                update_tasks_cpumask(cs);
        if (mems_updated && !nodes_empty(cs->mems_allowed))
                update_tasks_nodemask(cs);

        is_empty = cpumask_empty(cs->cpus_allowed) ||
                   nodes_empty(cs->mems_allowed);

        percpu_up_write(&cpuset_rwsem);

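        /*
         * Move tasks to the nearest ancestor with execution resources.
         * This is a full cgroup operation that will also call back into
         * cpuset, so it must be done outside cpuset_rwsem.
         */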
        if (is_empty)
                remove_tasks_in_empty_cpuset(cs);

        percpu_down_write(&cpuset_rwsem);
}

static void
hotplug_update_tasks(struct cpuset *cs,
                     struct cpumask *new_cpus, nodemask_t *new_mems,
                     bool cpus_updated, bool mems_updated)
{
        if (cpumask_empty(new_cpus))
                cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
        if (nodes_empty(*new_mems))
                *new_mems = parent_cs(cs)->effective_mems;

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->effective_mems = *new_mems;
        spin_unlock_irq(&callback_lock);

        if (cpus_updated)
                update_tasks_cpumask(cs);
        if (mems_updated)
                update_tasks_nodemask(cs);
}

static bool force_rebuild;

void cpuset_force_rebuild(void)
{
        force_rebuild = true;
}

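/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */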
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated;
        bool mems_updated;
        struct cpuset *parent;
retry:
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

        percpu_down_write(&cpuset_rwsem);

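        /*
         * We have raced with task attaching.  We wait until attaching
         * is finished, so we won't attach a task to an empty cpuset.
         */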
        if (cs->attach_in_progress) {
                percpu_up_write(&cpuset_rwsem);
                goto retry;
        }

        parent = parent_cs(cs);
        compute_effective_cpumask(&new_cpus, cs, parent);
        nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

        if (cs->nr_subparts_cpus)
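                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus.
                 */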
                cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);

        if (!tmp || !cs->partition_root_state)
                goto update_tasks;

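        /*
         * In the unlikely event that a partition root has empty
         * effective_cpus or its parent becomes erroneous, we have to
         * transition it to the erroneous state.
         */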
        if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
           (parent->partition_root_state == PRS_ERROR))) {
                if (cs->nr_subparts_cpus) {
                        cs->nr_subparts_cpus = 0;
                        cpumask_clear(cs->subparts_cpus);
                        compute_effective_cpumask(&new_cpus, cs, parent);
                }

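                /*
                 * If the effective_cpus is empty because the child
                 * partitions take away all the CPUs, we can keep
                 * the current partition and let the child partitions
                 * fight for available CPUs.
                 */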
                if ((parent->partition_root_state == PRS_ERROR) ||
                     cpumask_empty(&new_cpus)) {
                        update_parent_subparts_cpumask(cs, partcmd_disable,
                                                       NULL, tmp);
                        cs->partition_root_state = PRS_ERROR;
                }
                cpuset_force_rebuild();
        }

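        /*
         * On the other hand, an erroneous partition root may be transitioned
         * back to a regular one or a partition root with no CPU allocated
         * from the parent may change to erroneous.
         */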
        if (is_partition_root(parent) &&
           ((cs->partition_root_state == PRS_ERROR) ||
            !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
             update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
                cpuset_force_rebuild();

update_tasks:
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);

        if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
                hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
                                            cpus_updated, mems_updated);

        percpu_up_write(&cpuset_rwsem);
}

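/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: unused
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */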
static void cpuset_hotplug_workfn(struct work_struct *work)
{
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = is_in_v2_mode();
        struct tmpmasks tmp, *ptmp = NULL;

        if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                ptmp = &tmp;

        percpu_down_write(&cpuset_rwsem);

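        /* fetch the available cpus/mems and find out which changed how */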
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];

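        /*
         * If subparts_cpus is populated, it is likely that the check below
         * will produce a false positive on cpus_updated when the cpu list
         * isn't changed.  It is extra work, but it is better to be safe.
         */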
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

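        /* synchronize cpus_allowed to cpu_active_mask */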
        if (cpus_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
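                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus.  If no CPU is left,
                 * we clear the subparts_cpus & let the child partitions
                 * fight for the CPUs again.
                 */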
                if (top_cpuset.nr_subparts_cpus) {
                        if (cpumask_subset(&new_cpus,
                                           top_cpuset.subparts_cpus)) {
                                top_cpuset.nr_subparts_cpus = 0;
                                cpumask_clear(top_cpuset.subparts_cpus);
                        } else {
                                cpumask_andnot(&new_cpus, &new_cpus,
                                               top_cpuset.subparts_cpus);
                        }
                }
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                spin_unlock_irq(&callback_lock);
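                /* we don't mess with cpumasks of tasks in top_cpuset */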
        }

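        /* synchronize mems_allowed to N_MEMORY */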
        if (mems_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        top_cpuset.mems_allowed = new_mems;
                top_cpuset.effective_mems = new_mems;
                spin_unlock_irq(&callback_lock);
                update_tasks_nodemask(&top_cpuset);
        }

        percpu_up_write(&cpuset_rwsem);

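        /* if cpus or mems changed, we need to propagate to descendants */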
        if (cpus_updated || mems_updated) {
                struct cpuset *cs;
                struct cgroup_subsys_state *pos_css;

                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (cs == &top_cpuset || !css_tryget_online(&cs->css))
                                continue;
                        rcu_read_unlock();

                        cpuset_hotplug_update_tasks(cs, ptmp);

                        rcu_read_lock();
                        css_put(&cs->css);
                }
                rcu_read_unlock();
        }

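        /* rebuild sched domains if cpus_allowed has changed */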
        if (cpus_updated || force_rebuild) {
                force_rebuild = false;
                rebuild_sched_domains();
        }

        free_cpumasks(NULL, ptmp);
}

void cpuset_update_active_cpus(void)
{
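        /*
         * We're inside cpu hotplug critical region which usually nests
         * inside cgroup synchronization.  Bounce actual hotplug processing
         * to a work item to avoid reverse locking order.
         */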
        schedule_work(&cpuset_hotplug_work);
}

void cpuset_wait_for_hotplug(void)
{
        flush_work(&cpuset_hotplug_work);
}

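/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */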
static int cpuset_track_online_nodes(struct notifier_block *self,
                                     unsigned long action, void *arg)
{
        schedule_work(&cpuset_hotplug_work);
        return NOTIFY_OK;
}

static struct notifier_block cpuset_track_online_nodes_nb = {
        .notifier_call = cpuset_track_online_nodes,
        .priority = 10,
};

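/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */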
void __init cpuset_init_smp(void)
{
        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        top_cpuset.mems_allowed = node_states[N_MEMORY];
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
        top_cpuset.effective_mems = node_states[N_MEMORY];

        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

        cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
        BUG_ON(!cpuset_migrate_mm_wq);
}

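/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/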
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
        unsigned long flags;

        spin_lock_irqsave(&callback_lock, flags);
        rcu_read_lock();
        guarantee_online_cpus(task_cs(tsk), pmask);
        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, flags);
}

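/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed.  In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 **/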
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
        rcu_read_lock();
        do_set_cpus_allowed(tsk, is_in_v2_mode() ?
                task_cs(tsk)->cpus_allowed : cpu_possible_mask);
        rcu_read_unlock();

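        /*
         * We own tsk->cpus_allowed, nobody can change it under us.
         *
         * But we used cs && cs->cpus_allowed lockless and thus can
         * race with cgroup_attach_task() or update_cpumask() and get
         * the wrong tsk->cpus_allowed.  However, both cases imply the
         * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
         * which takes task_rq_lock().
         *
         * If we are called after it dropped the lock we must see all
         * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporarily
         * set any mask even if it is not right from task_cs() pov,
         * the pending set_cpus_allowed_ptr() will fix things.
         *
         * select_fallback_rq() will fix things up and set cpu_possible_mask
         * if required.
         */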
}

void __init cpuset_init_current_mems_allowed(void)
{
        nodes_setall(current->mems_allowed);
}

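/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/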
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
        nodemask_t mask;
        unsigned long flags;

        spin_lock_irqsave(&callback_lock, flags);
        rcu_read_lock();
        guarantee_online_mems(task_cs(tsk), &mask);
        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, flags);

        return mask;
}

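/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */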
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return nodes_intersects(*nodemask, current->mems_allowed);
}

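/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */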
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
        while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
                cs = parent_cs(cs);
        return cs;
}

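/**
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes.  If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  So only GFP_KERNEL
 * allocations, if all nodes in the cpuset are short of memory, might
 * require taking the callback_lock.
 */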
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        struct cpuset *cs;
        int allowed;
        unsigned long flags;

        if (in_interrupt())
                return true;
        if (node_isset(node, current->mems_allowed))
                return true;

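        /*
         * Allow tasks that have access to memory reserves because they have
         * been OOM killed to get memory anywhere.
         */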
        if (unlikely(tsk_is_oom_victim(current)))
                return true;
        if (gfp_mask & __GFP_HARDWALL)
                return false;

        if (current->flags & PF_EXITING)
                return true;

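        /* Not hardwall and node outside mems_allowed: scan up cpusets */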
        spin_lock_irqsave(&callback_lock, flags);

        rcu_read_lock();
        cs = nearest_hardwall_ancestor(task_cs(current));
        allowed = node_isset(node, cs->mems_allowed);
        rcu_read_unlock();

        spin_unlock_irqrestore(&callback_lock, flags);
        return allowed;
}

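/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.
 */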
static int cpuset_spread_node(int *rotor)
{
        return *rotor = next_node_in(*rotor, current->mems_allowed);
}

int cpuset_mem_spread_node(void)
{
        if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
                current->cpuset_mem_spread_rotor =
                        node_random(&current->mems_allowed);

        return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
        if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
                current->cpuset_slab_spread_rotor =
                        node_random(&current->mems_allowed);

        return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

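/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/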
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                   const struct task_struct *tsk2)
{
        return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

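/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */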
void cpuset_print_current_mems_allowed(void)
{
        struct cgroup *cgrp;

        rcu_read_lock();

        cgrp = task_cs(current)->css.cgroup;
        pr_cont(",cpuset=");
        pr_cont_cgroup_name(cgrp);
        pr_cont(",mems_allowed=%*pbl",
                nodemask_pr_args(&current->mems_allowed));

        rcu_read_unlock();
}

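/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */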
int cpuset_memory_pressure_enabled __read_mostly;

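/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */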
void __cpuset_memory_pressure_bump(void)
{
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
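/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_rwsem, keeping cpuset_attach() from changing
 *    it anyway.
 */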
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        char *buf;
        struct cgroup_subsys_state *css;
        int retval;

        retval = -ENOMEM;
        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                goto out;

        css = task_get_css(tsk, cpuset_cgrp_id);
        retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
                                current->nsproxy->cgroup_ns);
        css_put(css);
        if (retval >= PATH_MAX)
                retval = -ENAMETOOLONG;
        if (retval < 0)
                goto out_free;
        seq_puts(m, buf);
        seq_putc(m, '\n');
        retval = 0;
out_free:
        kfree(buf);
out:
        return retval;
}
#endif

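/* Display task mems_allowed in /proc/<pid>/status file. */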
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
        seq_printf(m, "Mems_allowed:\t%*pb\n",
                   nodemask_pr_args(&task->mems_allowed));
        seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
                   nodemask_pr_args(&task->mems_allowed));
}