/*
 *  kernel/cgroup/cpuset.c
 *
 *  Processor and memory placement constraints for sets of tasks.
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

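/* See "Frequency meter" comments, below. */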
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks is the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same with effective masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * CPUs allocated to child sub-partitions (default hierarchy only)
	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
	 * - effective_cpus and subparts_cpus are mutually exclusive.
	 *
	 * effective_cpus contains only onlined CPUs, but subparts_cpus
	 * may have offlined ones.
	 */
	cpumask_var_t subparts_cpus;

	/*
	 * This is old Memory Nodes tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of CPUs in subparts_cpus */
	int nr_subparts_cpus;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;

	/* Handle for cpuset.cpus.partition */
	struct cgroup_file partition_file;
};

/*
 * Partition root states:
 *
 *   0 - not a partition root
 *
 *   1 - partition root
 *
 *  -1 - invalid partition root
 *       None of the cpus in cpus_allowed can be put into the parent's
 *       subparts_cpus. In this case, the cpuset is not a real partition
 *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
 *       and the cpuset can be restored back to a partition root if the
 *       parent cpuset can give more CPUs back to this child cpuset.
 */
#define PRS_DISABLED		0
#define PRS_ENABLED		1
#define PRS_ERROR		-1
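/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid frequent allocation/free.
 */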
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_root(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

/*
 * Send notification event of whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs,
					   int old_prs, int new_prs)
{
	if (old_prs != new_prs)
		cgroup_file_notify(&cs->partition_file);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ENABLED,
};
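/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */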
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
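/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk ancestor of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.
 */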
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
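/*
 * There are two global locks guarding cpuset structures - cpuset_rwsem and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.
 *
 * A task must hold both locks to modify cpusets.  Any task can acquire
 * cpuset_rwsem for write (by any path that wishes to modify cpusets), blocking
 * everyone else.  The callback_lock is a spinlock that can be taken from
 * interrupt-disabled contexts by code that needs to query cpusets, such as
 * the page allocator; it guards only small windows while reading or writing
 * the cpumasks and nodemasks.
 */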
DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);

void cpuset_read_lock(void)
{
	percpu_down_read(&cpuset_rwsem);
}

void cpuset_read_unlock(void)
{
	percpu_up_read(&cpuset_rwsem);
}

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" change effective masks even if the
 * cpuset is not a partition root itself.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
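/*
 * Return in pmask the portion of a task's cpusets's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */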
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
		cpumask_copy(pmask, cpu_online_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			goto out_unlock;
		}
	}
	cpumask_and(pmask, pmask, cs->effective_cpus);

out_unlock:
	rcu_read_unlock();
}
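/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_rwsem held.
 */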
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_rwsem.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs:  the cpuset that have the cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->subparts_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	return 0;

free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a tmpmasks structure
 * @cs:  the cpuset that have the cpumasks to be free.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->subparts_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}
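/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_rwsem held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */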
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_rwsem held.  */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}
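/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets.  For our purposes, rebuilding
 *	   the schedulers sched domains, we can ignore !is_sched_load_
 *	   balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 */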
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 *
		 * If root is load-balancing, we can skip @cp if it
		 * is a subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
			continue;

		if (root_load_balance &&
		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree if not a partition root */
		if (!is_partition_root(cp))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

static void update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

static void rebuild_root_domains(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;

	percpu_rwsem_assert_held(&cpuset_rwsem);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	/*
	 * Clear default root domain DL accounting, it will be computed again
	 * if a task belongs to it.
	 */
	dl_clear_root_domain(&def_root_domain);

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	rebuild_root_domains();
	mutex_unlock(&sched_domains_mutex);
}
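/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_rwsem held.  Takes cpus_read_lock().
 */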
static void rebuild_sched_domains_locked(void)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	struct cpuset *cs;
	int ndoms;

	lockdep_assert_cpus_held();
	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only top_cpuset
	 * is enough to detect racing CPU offlines.
	 */
	if (!top_cpuset.nr_subparts_cpus &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/*
	 * With subpartition CPUs, however, the effective CPUs of a partition
	 * root should be only a subset of the active CPUs.  Since a CPU in
	 * any partition root could be offlined, all must be checked.
	 */
	if (top_cpuset.nr_subparts_cpus) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_root(cs)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
				rcu_read_unlock();
				return;
			}
		}
		rcu_read_unlock();
	}

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	rebuild_sched_domains_locked();
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
}
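/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */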
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}
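/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset the need to recompute the new effective_cpus mask
 * @parent: the parent cpuset
 *
 * If the parent has subpartition CPUs, include them in the list of
 * allowable CPUs in computing the new effective_cpus mask. Since offlined
 * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
 * to mask those out.
 */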
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	if (parent->nr_subparts_cpus) {
		cpumask_or(new_cpus, parent->effective_cpus,
			   parent->subparts_cpus);
		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
	} else {
		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
	}
}

/*
 * Commands for update_parent_subparts_cpumask
 */
enum subparts_cmd {
	partcmd_enable,		/* Enable partition root */
	partcmd_disable,	/* Disable partition root */
	partcmd_update,		/* Update parent's subparts_cpus */
};
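/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root.  The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus.  The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root.  Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
 * into parent's effective_cpus.  0 should always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from cpus_allowed to newmask.  Otherwise, cpus_allowed is
 * assumed to remain the same.  The cpuset should either be a partition root
 * or an invalid partition root.  The partition root state may change if
 * newmask is NULL and none of the requested CPUs can be granted by the
 * parent.  The function will return 1 if changes to parent's subparts_cpus
 * and effective_cpus happen or 0 otherwise.  Error code should only be
 * returned when newmask is non-NULL.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate().  The partcmd_update command is used by
 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
 * newmask set.
 *
 * Because of the implicit cpu exclusive nature of a partition root,
 * cpumask changes that violates the cpu exclusivity rule will not be
 * permitted when checked by validate_change().
 */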
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
					  struct cpumask *newmask,
					  struct tmpmasks *tmp)
{
	struct cpuset *parent = parent_cs(cpuset);
	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
	int old_prs, new_prs;
	bool part_error = false;	/* Partition error? */

	percpu_rwsem_assert_held(&cpuset_rwsem);

	/*
	 * The parent must be a partition root.
	 * The new cpumask, if present, or the current cpus_allowed must
	 * not be empty.
	 */
	if (!is_partition_root(parent) ||
	   (newmask && cpumask_empty(newmask)) ||
	   (!newmask && cpumask_empty(cpuset->cpus_allowed)))
		return -EINVAL;

	/*
	 * Enabling/disabling partition root is not allowed if there are
	 * online children.
	 */
	if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
		return -EBUSY;

	/*
	 * Enabling partition root is not allowed if not all the CPUs
	 * can be granted from parent's effective_cpus or at least one
	 * CPU will be left after that.
	 */
	if ((cmd == partcmd_enable) &&
	   (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
	     cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
		return -EINVAL;

	/*
	 * A cpumask update cannot make parent's effective_cpus become empty.
	 */
	adding = deleting = false;
	old_prs = new_prs = cpuset->partition_root_state;
	if (cmd == partcmd_enable) {
		cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
		adding = true;
	} else if (cmd == partcmd_disable) {
		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				       parent->subparts_cpus);
	} else if (newmask) {
		/*
		 * partcmd_update with newmask:
		 *
		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
		 * addmask = newmask & parent->effective_cpus
		 *		     & ~parent->subparts_cpus
		 */
		cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
		deleting = cpumask_and(tmp->delmask, tmp->delmask,
				       parent->subparts_cpus);

		cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
					parent->subparts_cpus);
		/*
		 * Return error if the new effective_cpus could become empty.
		 */
		if (adding &&
		    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
			if (!deleting)
				return -EINVAL;
			/*
			 * As some of the CPUs in subparts_cpus might have
			 * been offlined, we need to compute the real delmask
			 * to confirm that.
			 */
			if (!cpumask_and(tmp->addmask, tmp->delmask,
					 cpu_active_mask))
				return -EINVAL;
			cpumask_copy(tmp->addmask, parent->effective_cpus);
		}
	} else {
		/*
		 * partcmd_update w/o newmask:
		 *
		 * addmask = cpus_allowed & parent->effective_cpus
		 *
		 * Note that parent's subparts_cpus may have been
		 * pre-shrunk in case there is a change in the cpu list.
		 * So no deletion is needed.
		 */
		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
				     parent->effective_cpus);
		part_error = cpumask_equal(tmp->addmask,
					   parent->effective_cpus);
	}

	if (cmd == partcmd_update) {
		int prev_prs = cpuset->partition_root_state;

		/*
		 * Check for possible transition between PRS_ENABLED
		 * and PRS_ERROR.
		 */
		switch (cpuset->partition_root_state) {
		case PRS_ENABLED:
			if (part_error)
				new_prs = PRS_ERROR;
			break;
		case PRS_ERROR:
			if (!part_error)
				new_prs = PRS_ENABLED;
			break;
		}
		/*
		 * Set part_error if previously in invalid state.
		 */
		part_error = (prev_prs == PRS_ERROR);
	}

	if (!part_error && (new_prs == PRS_ERROR))
		return 0;	/* Nothing need to be done */

	if (new_prs == PRS_ERROR) {
		/*
		 * Remove all its cpus from parent's subparts_cpus.
		 */
		adding = false;
		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				       parent->subparts_cpus);
	}

	if (!adding && !deleting && (new_prs == old_prs))
		return 0;

	/*
	 * Change the parent's subparts_cpus.
	 * Newly added CPUs will be removed from effective_cpus and
	 * newly deleted ones will be added back to effective_cpus.
	 */
	spin_lock_irq(&callback_lock);
	if (adding) {
		cpumask_or(parent->subparts_cpus,
			   parent->subparts_cpus, tmp->addmask);
		cpumask_andnot(parent->effective_cpus,
			       parent->effective_cpus, tmp->addmask);
	}
	if (deleting) {
		cpumask_andnot(parent->subparts_cpus,
			       parent->subparts_cpus, tmp->delmask);
		/*
		 * Some of the CPUs in subparts_cpus might have been offlined.
		 */
		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
		cpumask_or(parent->effective_cpus,
			   parent->effective_cpus, tmp->delmask);
	}

	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);

	if (old_prs != new_prs)
		cpuset->partition_root_state = new_prs;

	spin_unlock_irq(&callback_lock);
	notify_partition_change(cpuset, old_prs, new_prs);

	return cmd == partcmd_update;
}
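/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When configured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
 *
 * Called with cpuset_rwsem held
 */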
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;
	int old_prs, new_prs;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		compute_effective_cpumask(tmp->new_cpus, cp, parent);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
			if (!cp->use_parent_ecpus) {
				cp->use_parent_ecpus = true;
				parent->child_ecpus_count++;
			}
		} else if (cp->use_parent_ecpus) {
			cp->use_parent_ecpus = false;
			WARN_ON_ONCE(!parent->child_ecpus_count);
			parent->child_ecpus_count--;
		}

		/*
		 * Skip the whole subtree if the cpumask remains the same
		 * and has no partition root state.
		 */
		if (!cp->partition_root_state &&
		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		/*
		 * update_parent_subparts_cpumask() should have been called
		 * for cs already in update_cpumask(). We should also call
		 * update_tasks_cpumask() again for tasks in the parent
		 * cpuset if the parent's subparts_cpus changes.
		 */
		old_prs = new_prs = cp->partition_root_state;
		if ((cp != cs) && old_prs) {
			switch (parent->partition_root_state) {
			case PRS_DISABLED:
				/*
				 * If parent is not a partition root or an
				 * invalid partition root, clear its state
				 * and its CS_CPU_EXCLUSIVE flag.
				 */
				WARN_ON_ONCE(cp->partition_root_state
					     != PRS_ERROR);
				new_prs = PRS_DISABLED;

				/*
				 * clear_bit() is an atomic operation and
				 * readers aren't interested in the state
				 * of CS_CPU_EXCLUSIVE anyway. So we can
				 * just update the flag without holding
				 * the callback_lock.
				 */
				clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
				break;

			case PRS_ENABLED:
				if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
					update_tasks_cpumask(parent);
				break;

			case PRS_ERROR:
				/*
				 * When parent is invalid, it has to be too.
				 */
				new_prs = PRS_ERROR;
				break;
			}
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);

		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
			cp->nr_subparts_cpus = 0;
			cpumask_clear(cp->subparts_cpus);
		} else if (cp->nr_subparts_cpus) {
			/*
			 * Make sure that effective_cpus & subparts_cpus
			 * are mutually exclusive.
			 *
			 * In the unlikely event that effective_cpus
			 * becomes empty, we clear cp->nr_subparts_cpus and
			 * let its child partition roots to compete for
			 * CPUs again.
			 */
			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
				       cp->subparts_cpus);
			if (cpumask_empty(cp->effective_cpus)) {
				cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				cpumask_clear(cp->subparts_cpus);
				cp->nr_subparts_cpus = 0;
			} else if (!cpumask_subset(cp->subparts_cpus,
						   tmp->new_cpus)) {
				cpumask_andnot(cp->subparts_cpus,
					cp->subparts_cpus, tmp->new_cpus);
				cp->nr_subparts_cpus
					= cpumask_weight(cp->subparts_cpus);
			}
		}

		if (new_prs != old_prs)
			cp->partition_root_state = new_prs;

		spin_unlock_irq(&callback_lock);
		notify_partition_change(cp, old_prs, new_prs);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * On legacy hierarchy, if the effective cpumask of any non-
		 * empty cpuset is changed, we need to rebuild sched domains.
		 * On default hierarchy, the cpuset needs to be a partition
		 * root as well.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp) &&
		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
		    is_partition_root(cp)))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp)
{
	struct cpuset *sibling;
	struct cgroup_subsys_state *pos_css;

	/*
	 * Check all its siblings and call update_cpumasks_hier()
	 * if their use_parent_ecpus flag is set in order for them
	 * to use the right effective_cpus value.
	 */
	rcu_read_lock();
	cpuset_for_each_child(sibling, pos_css, parent) {
		if (sibling == cs)
			continue;
		if (!sibling->use_parent_ecpus)
			continue;

		update_cpumasks_hier(sibling, tmp);
	}
	rcu_read_unlock();
}
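/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */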
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;
	struct tmpmasks tmp;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

#ifdef CONFIG_CPUMASK_OFFSTACK
	/*
	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
	 * to allocated cpumasks.
	 */
	tmp.addmask  = trialcs->subparts_cpus;
	tmp.delmask  = trialcs->effective_cpus;
	tmp.new_cpus = trialcs->cpus_allowed;
#endif

	if (cs->partition_root_state) {
		/* Cpumask of a partition root cannot be empty */
		if (cpumask_empty(trialcs->cpus_allowed))
			return -EINVAL;
		if (update_parent_subparts_cpumask(cs, partcmd_update,
					trialcs->cpus_allowed, &tmp) < 0)
			return -EINVAL;
	}

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

	/*
	 * Make sure that subparts_cpus is a subset of cpus_allowed.
	 */
	if (cs->nr_subparts_cpus) {
		cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
			       cs->cpus_allowed);
		cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
	}
	spin_unlock_irq(&callback_lock);

	update_cpumasks_hier(cs, &tmp);

	if (cs->partition_root_state) {
		struct cpuset *parent = parent_cs(cs);

		/*
		 * For partition root, update the cpumasks of sibling
		 * cpusets if they use parent's effective_cpus.
		 */
		if (parent->child_ecpus_count)
			update_sibling_cpumasks(parent, cs, &tmp);
	}
	return 0;
}
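/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */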
struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *mwork;

	if (nodes_equal(*from, *to)) {
		mmput(mm);
		return;
	}

	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}
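/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 */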
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}

static void *cpuset_being_rebound;
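/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_rwsem held,
 * cpuset membership stays stable.
 */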
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_rwsem */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_rwsem, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Leave policies which were at attach-time their
	 * attach-time value.
	 */
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When configured nodemask is changed, the effective nodemasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same with mems_allowed.
 *
 * Called with cpuset_rwsem held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (is_in_v2_mode() && nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}
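/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cpuset_rwsem held.  May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */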
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/* use trialcs->mems_allowed as a temp variable */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
	return retval;
}

bool current_cpuset_is_being_rebound(void)
{
	bool ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_rwsem held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}
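/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset flagbits)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_rwsem held.
 */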
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs);
out:
	free_cpuset(trialcs);
	return err;
}
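/*
 * update_prstate - update partition_root_state
 * cs: the cpuset to update
 * new_prs: new partition root state
 *
 * Call with cpuset_rwsem held.
 */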
static int update_prstate(struct cpuset *cs, int new_prs)
{
	int err, old_prs = cs->partition_root_state;
	struct cpuset *parent = parent_cs(cs);
	struct tmpmasks tmpmask;

	if (old_prs == new_prs)
		return 0;

	/*
	 * Cannot force a partial or invalid partition root to a full
	 * partition root.
	 */
	if (new_prs && (old_prs == PRS_ERROR))
		return -EINVAL;

	if (alloc_cpumasks(NULL, &tmpmask))
		return -ENOMEM;

	err = -EINVAL;
	if (!old_prs) {
		/*
		 * Turning on partition root requires setting the
		 * CS_CPU_EXCLUSIVE bit implicitly as well as cpus_allowed
		 * cannot be NULL.
		 */
		if (cpumask_empty(cs->cpus_allowed))
			goto out;

		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
		if (err)
			goto out;

		err = update_parent_subparts_cpumask(cs, partcmd_enable,
						     NULL, &tmpmask);
		if (err) {
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
			goto out;
		}
	} else {
		/*
		 * Turning off partition root will clear the
		 * CS_CPU_EXCLUSIVE bit implicitly.
		 */
		if (old_prs == PRS_ERROR) {
			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
			err = 0;
			goto out;
		}

		err = update_parent_subparts_cpumask(cs, partcmd_disable,
						     NULL, &tmpmask);
		if (err)
			goto out;

		/* Turning off CS_CPU_EXCLUSIVE will not return error */
		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
	}

	/*
	 * Update cpumask of parent's tasks except when it is the top
	 * cpuset as some system daemons cannot be mapped to other CPUs.
	 */
	if (parent != &top_cpuset)
		update_tasks_cpumask(parent);

	if (parent->child_ecpus_count)
		update_sibling_cpumasks(parent, cs, &tmpmask);

	rebuild_sched_domains_locked();
out:
	if (!err) {
		spin_lock_irq(&callback_lock);
		cs->partition_root_state = new_prs;
		spin_unlock_irq(&callback_lock);
		notify_partition_change(cs, old_prs, new_prs);
	}

	free_cpumasks(NULL, &tmpmask);
	return err;
}
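/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 */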
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!is_in_v2_mode() &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	cgroup_taskset_for_each(task, css, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	return ret;
}

static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;

	cgroup_taskset_first(tset, &css);

	percpu_down_write(&cpuset_rwsem);
	css_cs(css)->attach_in_progress--;
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_rwsem */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	percpu_down_write(&cpuset_rwsem);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		if (cs != &top_cpuset)
			guarantee_online_cpus(task, cpus_attach);
		else
			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders. This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same with mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodesets that we
			 * migrate mm from.
			 */
			if (is_memory_migrate(cs))
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
			else
				mmput(mm);
		}
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	percpu_up_write(&cpuset_rwsem);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_SUBPARTS_CPULIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_PARTITION_ROOT,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
	return retval;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_rwsem anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_cpuset(trialcs);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	case FILE_SUBPARTS_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int sched_partition_show(struct seq_file *seq, void *v)
{
	struct cpuset *cs = css_cs(seq_css(seq));

	switch (cs->partition_root_state) {
	case PRS_ENABLED:
		seq_puts(seq, "root\n");
		break;
	case PRS_DISABLED:
		seq_puts(seq, "member\n");
		break;
	case PRS_ERROR:
		seq_puts(seq, "root invalid\n");
		break;
	}
	return 0;
}

static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	int val;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
	 */
	if (!strcmp(buf, "root"))
		val = PRS_ENABLED;
	else if (!strcmp(buf, "member"))
		val = PRS_DISABLED;
	else
		return -EINVAL;

	css_get(&cs->css);
	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	retval = update_prstate(cs, val);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
	css_put(&cs->css);
	return retval ?: nbytes;
}

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype legacy_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};

/*
 * This is currently a minimal set for the default hierarchy. It can be
 * expanded later on by migrating more features and control files from v1.
 */
static struct cftype dfl_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "mems.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpus.partition",
		.seq_show = sched_partition_show,
		.write = sched_partition_write,
		.private = FILE_PARTITION_ROOT,
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cpuset, partition_file),
	},

	{
		.name = "cpus.subpartitions",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_SUBPARTS_CPULIST,
		.flags = CFTYPE_DEBUG,
	},

	{ }	/* terminate */
};
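/*
 *	cpuset_css_alloc - allocate a cpuset css
 *	cgrp:	control group that the new cpuset will be part of
 */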
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *cs;

	if (!parent_css)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);

	if (alloc_cpumasks(cs, NULL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	nodes_clear(cs->mems_allowed);
	nodes_clear(cs->effective_mems);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	/* Set CS_MEMORY_MIGRATE for default hierarchy */
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		__set_bit(CS_MEMORY_MIGRATE, &cs->flags);

	return &cs->css;
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	if (!parent)
		return 0;

	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	cpuset_inc();

	spin_lock_irq(&callback_lock);
	if (is_in_v2_mode()) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
		cs->use_parent_ecpus = true;
		parent->child_ecpus_count++;
	}
	spin_unlock_irq(&callback_lock);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a serious
	 * consideration when a (root) task is being moved out of the
	 * cpuset, we'll fallback to the obvious, the global lock.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
	return 0;
}
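/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition' off.
 */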
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	cpus_read_lock();
	percpu_down_write(&cpuset_rwsem);

	if (is_partition_root(cs))
		update_prstate(cs, 0);

	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	if (cs->use_parent_ecpus) {
		struct cpuset *parent = parent_cs(cs);

		cs->use_parent_ecpus = false;
		parent->child_ecpus_count--;
	}

	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	percpu_up_write(&cpuset_rwsem);
	cpus_read_unlock();
}

static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	free_cpuset(cs);
}

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
	percpu_down_write(&cpuset_rwsem);
	spin_lock_irq(&callback_lock);

	if (is_in_v2_mode()) {
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
		top_cpuset.mems_allowed = node_possible_map;
	} else {
		cpumask_copy(top_cpuset.cpus_allowed,
			     top_cpuset.effective_cpus);
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
	}

	spin_unlock_irq(&callback_lock);
	percpu_up_write(&cpuset_rwsem);
}

/*
 * Make sure the new task conform to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{
	if (task_css_is_root(task, cpuset_cgrp_id))
		return;

	set_cpus_allowed_ptr(task, current->cpus_ptr);
	task->mems_allowed = current->mems_allowed;
}

struct cgroup_subsys cpuset_cgrp_subsys = {
	.css_alloc	= cpuset_css_alloc,
	.css_online	= cpuset_css_online,
	.css_offline	= cpuset_css_offline,
	.css_free	= cpuset_css_free,
	.can_attach	= cpuset_can_attach,
	.cancel_attach	= cpuset_cancel_attach,
	.attach		= cpuset_attach,
	.post_attach	= cpuset_post_attach,
	.bind		= cpuset_bind,
	.fork		= cpuset_fork,
	.legacy_cftypes	= legacy_files,
	.dfl_cftypes	= dfl_files,
	.early_init	= true,
	.threaded	= true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 **/
int __init cpuset_init(void)
{
	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));

	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);
	cpumask_setall(top_cpuset.effective_cpus);
	nodes_setall(top_cpuset.effective_mems);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

	return 0;
}
2984
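/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */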
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void
hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	/*
	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		update_tasks_cpumask(cs);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	percpu_up_write(&cpuset_rwsem);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset.  Should be done outside any lock.
	 */
	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);

	percpu_down_write(&cpuset_rwsem);
}

static void
hotplug_update_tasks(struct cpuset *cs,
		     struct cpumask *new_cpus, nodemask_t *new_mems,
		     bool cpus_updated, bool mems_updated)
{
	if (cpumask_empty(new_cpus))
		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
	if (nodes_empty(*new_mems))
		*new_mems = parent_cs(cs)->effective_mems;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	if (cpus_updated)
		update_tasks_cpumask(cs);
	if (mems_updated)
		update_tasks_nodemask(cs);
}

static bool force_rebuild;

void cpuset_force_rebuild(void)
{
	force_rebuild = true;
}

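/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */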
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
	struct cpuset *parent;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	percpu_down_write(&cpuset_rwsem);

	/*
	 * We have raced with task attaching.  We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		percpu_up_write(&cpuset_rwsem);
		goto retry;
	}

	parent = parent_cs(cs);
	compute_effective_cpumask(&new_cpus, cs, parent);
	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

	if (cs->nr_subparts_cpus)
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus.
		 */
		cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);

	if (!tmp || !cs->partition_root_state)
		goto update_tasks;

	/*
	 * In the unlikely event that a partition root has empty
	 * effective_cpus or its parent becomes erroneous, we have to
	 * transition it to the erroneous state.
	 */
	if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
	   (parent->partition_root_state == PRS_ERROR))) {
		if (cs->nr_subparts_cpus) {
			spin_lock_irq(&callback_lock);
			cs->nr_subparts_cpus = 0;
			cpumask_clear(cs->subparts_cpus);
			spin_unlock_irq(&callback_lock);
			compute_effective_cpumask(&new_cpus, cs, parent);
		}

		/*
		 * If effective_cpus is still empty or the parent is
		 * erroneous, disable the partition in the parent and
		 * switch this partition root to the erroneous state.
		 */
		if ((parent->partition_root_state == PRS_ERROR) ||
		     cpumask_empty(&new_cpus)) {
			int old_prs;

			update_parent_subparts_cpumask(cs, partcmd_disable,
						       NULL, tmp);
			old_prs = cs->partition_root_state;
			if (old_prs != PRS_ERROR) {
				spin_lock_irq(&callback_lock);
				cs->partition_root_state = PRS_ERROR;
				spin_unlock_irq(&callback_lock);
				notify_partition_change(cs, old_prs, PRS_ERROR);
			}
		}
		cpuset_force_rebuild();
	}

	/*
	 * On the other hand, an erroneous partition root may be transitioned
	 * back to a regular one or a partition root with no CPU allocated
	 * from the parent may change to erroneous.
	 */
	if (is_partition_root(parent) &&
	   ((cs->partition_root_state == PRS_ERROR) ||
	    !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
	     update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
		cpuset_force_rebuild();

update_tasks:
	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

	percpu_up_write(&cpuset_rwsem);
}
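/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: unused
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */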
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	percpu_down_write(&cpuset_rwsem);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/*
	 * If subparts_cpus is populated, it is likely that the check below
	 * will produce a false positive on cpus_updated when the cpu list
	 * isn't changed.  It is extra work, but it is better to be safe.
	 */
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/*
	 * In the rare case that hotplug removes all the cpus in
	 * subparts_cpus, we assume that cpus are updated.
	 */
	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
		cpus_updated = true;

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus.  If no CPU is left,
		 * we clear the subparts_cpus & let the child partitions
		 * fight for the CPUs again.
		 */
		if (top_cpuset.nr_subparts_cpus) {
			if (cpumask_subset(&new_cpus,
					   top_cpuset.subparts_cpus)) {
				top_cpuset.nr_subparts_cpus = 0;
				cpumask_clear(top_cpuset.subparts_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       top_cpuset.subparts_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	percpu_up_write(&cpuset_rwsem);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
	}

	free_cpumasks(NULL, ptmp);
}

void cpuset_update_active_cpus(void)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 */
	schedule_work(&cpuset_hotplug_work);
}

void cpuset_wait_for_hotplug(void)
{
	flush_work(&cpuset_hotplug_work);
}

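/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */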
static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}

static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,
};

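/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */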
void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_MEMORY];
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
	top_cpuset.effective_mems = node_states[N_MEMORY];

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}

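/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/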
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	guarantee_online_cpus(tsk, pmask);
	spin_unlock_irqrestore(&callback_lock, flags);
}

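/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed.  In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 **/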
bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	const struct cpumask *cs_mask;
	bool changed = false;

	rcu_read_lock();
	cs_mask = task_cs(tsk)->cpus_allowed;
	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
		do_set_cpus_allowed(tsk, cs_mask);
		changed = true;
	}
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed.  However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
	return changed;
}

void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

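/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/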
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &mask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return mask;
}

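/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */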
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

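/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */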
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

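/*
 * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes.  If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  The net effect of the checks below is:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	tsk_is_oom_victim - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 */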
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;		/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */
	unsigned long flags;

	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

	/* Not hardwall and node brought up by some other means (hotplug) */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}

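/*
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.
 */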
static int cpuset_spread_node(int *rotor)
{
	return *rotor = next_node_in(*rotor, current->mems_allowed);
}

int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

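/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/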
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

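/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */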
void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	pr_cont(",cpuset=");
	pr_cont_cgroup_name(cgrp);
	pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

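/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */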
int cpuset_memory_pressure_enabled __read_mostly;

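/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */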
void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
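/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */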
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	css = task_get_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				current->nsproxy->cgroup_ns);
	css_put(css);
	if (retval >= PATH_MAX)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

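/* Display task mems_allowed in /proc/<pid>/status file. */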
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}