/*
 *  Processor and Memory placement constraints for sets of tasks
 *  (the cpuset cgroup controller).
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * The Memory Nodes that tasks in this cpuset most recently took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemasks updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
#else
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
#endif

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walks @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk ancestor of
 *
 * Walks @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in
 * the iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.
 *
 * A task must hold both locks to modify cpusets.  Holding cpuset_mutex
 * blocks other modifiers, so checks and memory allocations may be done
 * while only that mutex is held; callback_lock is then taken around the
 * actual updates.  Holding only callback_lock grants read-only access to
 * cpusets.  Since callback_lock may be taken from allocation and other
 * hot paths, the kernel memory allocator must never be called while
 * holding it.
 *
 * A task's mems_allowed and mempolicy may be changed by other tasks and
 * are protected by alloc_lock in its task_struct.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
338static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
339{
340 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
341 cs = parent_cs(cs);
342 if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
350 cpumask_copy(pmask, cpu_online_mask);
351 return;
352 }
353 }
354 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
355}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
368static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
369{
370 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
371 cs = parent_cs(cs);
372 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
373}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
380static void cpuset_update_task_spread_flag(struct cpuset *cs,
381 struct task_struct *tsk)
382{
383 if (is_spread_page(cs))
384 task_set_spread_page(tsk);
385 else
386 task_clear_spread_page(tsk);
387
388 if (is_spread_slab(cs))
389 task_set_spread_slab(tsk);
390 else
391 task_clear_spread_slab(tsk);
392}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
402static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
403{
404 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
405 nodes_subset(p->mems_allowed, q->mems_allowed) &&
406 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
407 is_mem_exclusive(p) <= is_mem_exclusive(q);
408}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
414static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
415{
416 struct cpuset *trial;
417
418 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
419 if (!trial)
420 return NULL;
421
422 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
423 goto free_cs;
424 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
425 goto free_cpus;
426
427 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
428 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
429 return trial;
430
431free_cpus:
432 free_cpumask_var(trial->cpus_allowed);
433free_cs:
434 kfree(trial);
435 return NULL;
436}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
442static void free_trial_cpuset(struct cpuset *trial)
443{
444 free_cpumask_var(trial->effective_cpus);
445 free_cpumask_var(trial->cpus_allowed);
446 kfree(trial);
447}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
469static int validate_change(struct cpuset *cur, struct cpuset *trial)
470{
471 struct cgroup_subsys_state *css;
472 struct cpuset *c, *par;
473 int ret;
474
475 rcu_read_lock();
476
477
478 ret = -EBUSY;
479 cpuset_for_each_child(c, css, cur)
480 if (!is_cpuset_subset(c, trial))
481 goto out;
482
483
484 ret = 0;
485 if (cur == &top_cpuset)
486 goto out;
487
488 par = parent_cs(cur);
489
490
491 ret = -EACCES;
492 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
493 !is_cpuset_subset(trial, par))
494 goto out;
495
496
497
498
499
500 ret = -EINVAL;
501 cpuset_for_each_child(c, css, par) {
502 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
503 c != cur &&
504 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
505 goto out;
506 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
507 c != cur &&
508 nodes_intersects(trial->mems_allowed, c->mems_allowed))
509 goto out;
510 }
511
512
513
514
515
516 ret = -ENOSPC;
517 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
518 if (!cpumask_empty(cur->cpus_allowed) &&
519 cpumask_empty(trial->cpus_allowed))
520 goto out;
521 if (!nodes_empty(cur->mems_allowed) &&
522 nodes_empty(trial->mems_allowed))
523 goto out;
524 }
525
526
527
528
529
530 ret = -EBUSY;
531 if (is_cpu_exclusive(cur) &&
532 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
533 trial->cpus_allowed))
534 goto out;
535
536 ret = 0;
537out:
538 rcu_read_unlock();
539 return ret;
540}
541
542#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
547static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
548{
549 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
550}
551
552static void
553update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
554{
555 if (dattr->relax_domain_level < c->relax_domain_level)
556 dattr->relax_domain_level = c->relax_domain_level;
557 return;
558}
559
560static void update_domain_attr_tree(struct sched_domain_attr *dattr,
561 struct cpuset *root_cs)
562{
563 struct cpuset *cp;
564 struct cgroup_subsys_state *pos_css;
565
566 rcu_read_lock();
567 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
568
569 if (cpumask_empty(cp->cpus_allowed)) {
570 pos_css = css_rightmost_descendant(pos_css);
571 continue;
572 }
573
574 if (is_sched_load_balance(cp))
575 update_domain_attr(dattr, cp);
576 }
577 rcu_read_unlock();
578}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when it would be better to just stay put, with the existing
 * sched domains.
 *
 * Must be called with cpuset_mutex held.
 *
 * The output is one cpumask per resulting sched domain (*domains), an
 * optional array of sched_domain_attr, one per domain (*attributes),
 * and the return value is the number of domains generated.
 */
634static int generate_sched_domains(cpumask_var_t **domains,
635 struct sched_domain_attr **attributes)
636{
637 struct cpuset *cp;
638 struct cpuset **csa;
639 int csn;
640 int i, j, k;
641 cpumask_var_t *doms;
642 cpumask_var_t non_isolated_cpus;
643 struct sched_domain_attr *dattr;
644 int ndoms = 0;
645 int nslot;
646 struct cgroup_subsys_state *pos_css;
647
648 doms = NULL;
649 dattr = NULL;
650 csa = NULL;
651
652 if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
653 goto done;
654 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);

	/* Special case for the 99% of systems with one, full, sched domain */
657 if (is_sched_load_balance(&top_cpuset)) {
658 ndoms = 1;
659 doms = alloc_sched_domains(ndoms);
660 if (!doms)
661 goto done;
662
663 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
664 if (dattr) {
665 *dattr = SD_ATTR_INIT;
666 update_domain_attr_tree(dattr, &top_cpuset);
667 }
668 cpumask_and(doms[0], top_cpuset.effective_cpus,
669 non_isolated_cpus);
670
671 goto done;
672 }
673
674 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
675 if (!csa)
676 goto done;
677 csn = 0;
678
679 rcu_read_lock();
680 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
681 if (cp == &top_cpuset)
682 continue;
683
684
685
686
687
688
689
690
691 if (!cpumask_empty(cp->cpus_allowed) &&
692 !(is_sched_load_balance(cp) &&
693 cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
694 continue;
695
696 if (is_sched_load_balance(cp))
697 csa[csn++] = cp;
698
699
700 pos_css = css_rightmost_descendant(pos_css);
701 }
702 rcu_read_unlock();
703
704 for (i = 0; i < csn; i++)
705 csa[i]->pn = i;
706 ndoms = csn;
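
	/*
	 * Find the best partition: repeatedly merge any two cpusets whose
	 * effective CPU masks overlap into one partition by relabelling
	 * one partition number (pn) to the other, restarting the scan
	 * after every merge.  When no overlaps remain, cpusets that must
	 * share a sched domain carry the same ->pn, and ndoms counts the
	 * distinct partitions left.
	 */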

restart:
710 for (i = 0; i < csn; i++) {
711 struct cpuset *a = csa[i];
712 int apn = a->pn;
713
714 for (j = 0; j < csn; j++) {
715 struct cpuset *b = csa[j];
716 int bpn = b->pn;
717
718 if (apn != bpn && cpusets_overlap(a, b)) {
719 for (k = 0; k < csn; k++) {
720 struct cpuset *c = csa[k];
721
722 if (c->pn == bpn)
723 c->pn = apn;
724 }
725 ndoms--;
726 goto restart;
727 }
728 }
729 }

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
735 doms = alloc_sched_domains(ndoms);
736 if (!doms)
737 goto done;
738
	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
743 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
744
745 for (nslot = 0, i = 0; i < csn; i++) {
746 struct cpuset *a = csa[i];
747 struct cpumask *dp;
748 int apn = a->pn;
749
750 if (apn < 0) {
			/* Skip completed partitions */
752 continue;
753 }
754
755 dp = doms[nslot];
756
757 if (nslot == ndoms) {
758 static int warnings = 10;
759 if (warnings) {
760 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
761 nslot, ndoms, csn, i, apn);
762 warnings--;
763 }
764 continue;
765 }
766
767 cpumask_clear(dp);
768 if (dattr)
769 *(dattr + nslot) = SD_ATTR_INIT;
770 for (j = i; j < csn; j++) {
771 struct cpuset *b = csa[j];
772
773 if (apn == b->pn) {
774 cpumask_or(dp, dp, b->effective_cpus);
775 cpumask_and(dp, dp, non_isolated_cpus);
776 if (dattr)
777 update_domain_attr_tree(dattr + nslot, b);
778
779
780 b->pn = -1;
781 }
782 }
783 nslot++;
784 }
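
	/* every remaining partition should have filled exactly one slot */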
785 BUG_ON(nslot != ndoms);
786
787done:
788 free_cpumask_var(non_isolated_cpus);
789 kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
795 if (doms == NULL)
796 ndoms = 1;
797
798 *domains = doms;
799 *attributes = dattr;
800 return ndoms;
801}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if the 'sched_relax_domain_level'
 * value changes in any cpuset with non-empty 'cpus', then call
 * this routine to rebuild the scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
814static void rebuild_sched_domains_locked(void)
815{
816 struct sched_domain_attr *attr;
817 cpumask_var_t *doms;
818 int ndoms;
819
820 lockdep_assert_held(&cpuset_mutex);
821 get_online_cpus();

	/*
	 * We have raced with CPU hotplug. Don't do anything to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, the hotplug work item will rebuild sched domains.
	 */
828 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
829 goto out;
830
831
832 ndoms = generate_sched_domains(&doms, &attr);
833
834
835 partition_sched_domains(ndoms, doms, attr);
836out:
837 put_online_cpus();
838}
839#else
840static void rebuild_sched_domains_locked(void)
841{
842}
843#endif
844
845void rebuild_sched_domains(void)
846{
847 mutex_lock(&cpuset_mutex);
848 rebuild_sched_domains_locked();
849 mutex_unlock(&cpuset_mutex);
850}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
860static void update_tasks_cpumask(struct cpuset *cs)
861{
862 struct css_task_iter it;
863 struct task_struct *task;
864
865 css_task_iter_start(&cs->css, &it);
866 while ((task = css_task_iter_next(&it)))
867 set_cpus_allowed_ptr(task, cs->effective_cpus);
868 css_task_iter_end(&it);
869}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
883static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
884{
885 struct cpuset *cp;
886 struct cgroup_subsys_state *pos_css;
887 bool need_rebuild_sched_domains = false;
888
889 rcu_read_lock();
890 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
891 struct cpuset *parent = parent_cs(cp);
892
893 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
899 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
900 cpumask_empty(new_cpus))
901 cpumask_copy(new_cpus, parent->effective_cpus);
902
903
904 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
905 pos_css = css_rightmost_descendant(pos_css);
906 continue;
907 }
908
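		/*
		 * Grab a reference on @cp so it stays alive while we drop
		 * the RCU read lock for the blocking updates below.
		 */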
909 if (!css_tryget_online(&cp->css))
910 continue;
911 rcu_read_unlock();
912
913 spin_lock_irq(&callback_lock);
914 cpumask_copy(cp->effective_cpus, new_cpus);
915 spin_unlock_irq(&callback_lock);
916
917 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
918 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
919
920 update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is changed,
		 * we need to rebuild sched domains.
		 */
926 if (!cpumask_empty(cp->cpus_allowed) &&
927 is_sched_load_balance(cp))
928 need_rebuild_sched_domains = true;
929
930 rcu_read_lock();
931 css_put(&cp->css);
932 }
933 rcu_read_unlock();
934
935 if (need_rebuild_sched_domains)
936 rebuild_sched_domains_locked();
937}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
945static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
946 const char *buf)
947{
948 int retval;
949
950
951 if (cs == &top_cpuset)
952 return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
960 if (!*buf) {
961 cpumask_clear(trialcs->cpus_allowed);
962 } else {
963 retval = cpulist_parse(buf, trialcs->cpus_allowed);
964 if (retval < 0)
965 return retval;
966
967 if (!cpumask_subset(trialcs->cpus_allowed,
968 top_cpuset.cpus_allowed))
969 return -EINVAL;
970 }
971
972
973 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
974 return 0;
975
976 retval = validate_change(cs, trialcs);
977 if (retval < 0)
978 return retval;
979
980 spin_lock_irq(&callback_lock);
981 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
982 spin_unlock_irq(&callback_lock);
983
984
985 update_cpumasks_hier(cs, trialcs->cpus_allowed);
986 return 0;
987}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */
997struct cpuset_migrate_mm_work {
998 struct work_struct work;
999 struct mm_struct *mm;
1000 nodemask_t from;
1001 nodemask_t to;
1002};
1003
1004static void cpuset_migrate_mm_workfn(struct work_struct *work)
1005{
1006 struct cpuset_migrate_mm_work *mwork =
1007 container_of(work, struct cpuset_migrate_mm_work, work);
1008
1009
1010 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1011 mmput(mwork->mm);
1012 kfree(mwork);
1013}
1014
1015static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1016 const nodemask_t *to)
1017{
1018 struct cpuset_migrate_mm_work *mwork;
1019
1020 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1021 if (mwork) {
1022 mwork->mm = mm;
1023 mwork->from = *from;
1024 mwork->to = *to;
1025 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1026 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1027 } else {
1028 mmput(mm);
1029 }
1030}
1031
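/* Wait for any mm migrations queued by cpuset_migrate_mm() to finish. */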
1032static void cpuset_post_attach(void)
1033{
1034 flush_workqueue(cpuset_migrate_mm_wq);
1035}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqcount to safely update both
 * tsk->mems_allowed and the task's mempolicy.  If the task is allocating
 * in parallel, it might temporarily see an empty intersection, which
 * results in a seqcount check and retry before OOM or allocation failure.
 */
1047static void cpuset_change_task_nodemask(struct task_struct *tsk,
1048 nodemask_t *newmems)
1049{
1050 task_lock(tsk);
1051
1052 local_irq_disable();
1053 write_seqcount_begin(&tsk->mems_allowed_seq);
1054
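	/*
	 * Widen mems_allowed first and only then set it to *newmems, so a
	 * concurrent allocator never observes an empty nodemask while the
	 * mempolicy is being rebound.
	 */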
1055 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1056 mpol_rebind_task(tsk, newmems);
1057 tsk->mems_allowed = *newmems;
1058
1059 write_seqcount_end(&tsk->mems_allowed_seq);
1060 local_irq_enable();
1061
1062 task_unlock(tsk);
1063}
1064
1065static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
1075static void update_tasks_nodemask(struct cpuset *cs)
1076{
1077 static nodemask_t newmems;
1078 struct css_task_iter it;
1079 struct task_struct *task;
1080
1081 cpuset_being_rebound = cs;
1082
1083 guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
1095 css_task_iter_start(&cs->css, &it);
1096 while ((task = css_task_iter_next(&it))) {
1097 struct mm_struct *mm;
1098 bool migrate;
1099
1100 cpuset_change_task_nodemask(task, &newmems);
1101
1102 mm = get_task_mm(task);
1103 if (!mm)
1104 continue;
1105
1106 migrate = is_memory_migrate(cs);
1107
1108 mpol_rebind_mm(mm, &cs->mems_allowed);
1109 if (migrate)
1110 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1111 else
1112 mmput(mm);
1113 }
1114 css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
1120 cs->old_mems_allowed = newmems;
1121
1122
1123 cpuset_being_rebound = NULL;
1124}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
1138static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1139{
1140 struct cpuset *cp;
1141 struct cgroup_subsys_state *pos_css;
1142
1143 rcu_read_lock();
1144 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1145 struct cpuset *parent = parent_cs(cp);
1146
1147 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1148
1149
1150
1151
1152
1153 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1154 nodes_empty(*new_mems))
1155 *new_mems = parent->effective_mems;
1156
1157
1158 if (nodes_equal(*new_mems, cp->effective_mems)) {
1159 pos_css = css_rightmost_descendant(pos_css);
1160 continue;
1161 }
1162
1163 if (!css_tryget_online(&cp->css))
1164 continue;
1165 rcu_read_unlock();
1166
1167 spin_lock_irq(&callback_lock);
1168 cp->effective_mems = *new_mems;
1169 spin_unlock_irq(&callback_lock);
1170
1171 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1172 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1173
1174 update_tasks_nodemask(cp);
1175
1176 rcu_read_lock();
1177 css_put(&cp->css);
1178 }
1179 rcu_read_unlock();
1180}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
1195static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1196 const char *buf)
1197{
1198 int retval;

	/*
	 * top_cpuset.mems_allowed tracking is handled by
	 * cpuset_hotplug_workfn().
	 */
1204 if (cs == &top_cpuset) {
1205 retval = -EACCES;
1206 goto done;
1207 }

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
1215 if (!*buf) {
1216 nodes_clear(trialcs->mems_allowed);
1217 } else {
1218 retval = nodelist_parse(buf, trialcs->mems_allowed);
1219 if (retval < 0)
1220 goto done;
1221
1222 if (!nodes_subset(trialcs->mems_allowed,
1223 top_cpuset.mems_allowed)) {
1224 retval = -EINVAL;
1225 goto done;
1226 }
1227 }
1228
1229 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1230 retval = 0;
1231 goto done;
1232 }
1233 retval = validate_change(cs, trialcs);
1234 if (retval < 0)
1235 goto done;
1236
1237 spin_lock_irq(&callback_lock);
1238 cs->mems_allowed = trialcs->mems_allowed;
1239 spin_unlock_irq(&callback_lock);
1240
1241
1242 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1243done:
1244 return retval;
1245}
1246
1247int current_cpuset_is_being_rebound(void)
1248{
1249 int ret;
1250
1251 rcu_read_lock();
1252 ret = task_cs(current) == cpuset_being_rebound;
1253 rcu_read_unlock();
1254
1255 return ret;
1256}
1257
1258static int update_relax_domain_level(struct cpuset *cs, s64 val)
1259{
1260#ifdef CONFIG_SMP
1261 if (val < -1 || val >= sched_domain_level_max)
1262 return -EINVAL;
1263#endif
1264
1265 if (val != cs->relax_domain_level) {
1266 cs->relax_domain_level = val;
1267 if (!cpumask_empty(cs->cpus_allowed) &&
1268 is_sched_load_balance(cs))
1269 rebuild_sched_domains_locked();
1270 }
1271
1272 return 0;
1273}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
1283static void update_tasks_flags(struct cpuset *cs)
1284{
1285 struct css_task_iter it;
1286 struct task_struct *task;
1287
1288 css_task_iter_start(&cs->css, &it);
1289 while ((task = css_task_iter_next(&it)))
1290 cpuset_update_task_spread_flag(cs, task);
1291 css_task_iter_end(&it);
1292}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset flagbits)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
 */
1303static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1304 int turning_on)
1305{
1306 struct cpuset *trialcs;
1307 int balance_flag_changed;
1308 int spread_flag_changed;
1309 int err;
1310
1311 trialcs = alloc_trial_cpuset(cs);
1312 if (!trialcs)
1313 return -ENOMEM;
1314
1315 if (turning_on)
1316 set_bit(bit, &trialcs->flags);
1317 else
1318 clear_bit(bit, &trialcs->flags);
1319
1320 err = validate_change(cs, trialcs);
1321 if (err < 0)
1322 goto out;
1323
1324 balance_flag_changed = (is_sched_load_balance(cs) !=
1325 is_sched_load_balance(trialcs));
1326
1327 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1328 || (is_spread_page(cs) != is_spread_page(trialcs)));
1329
1330 spin_lock_irq(&callback_lock);
1331 cs->flags = trialcs->flags;
1332 spin_unlock_irq(&callback_lock);
1333
1334 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1335 rebuild_sched_domains_locked();
1336
1337 if (spread_flag_changed)
1338 update_tasks_flags(cs);
1339out:
1340 free_trial_cpuset(trialcs);
1341 return err;
1342}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * The filter works on the number of events marked per unit time.
 * It is a single-pole low-pass recursive (IIR) digital filter with
 * a time unit of one second.  Arithmetic is done using 32-bit
 * integers scaled to simulate 3 decimal digits of precision
 * (multiplied by 1000, the FM_SCALE).
 *
 * With an FM_COEF of 933, and a time base of one second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, the rate returned from fmeter_getrate() will be cut in
 * half every 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stuck at zero anyway.  Limit the count of unprocessed
 * events to FM_MAXCNT, so as to avoid overflowing the 32-bit math.
 */
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)	/* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
1395static void fmeter_init(struct fmeter *fmp)
1396{
1397 fmp->cnt = 0;
1398 fmp->val = 0;
1399 fmp->time = 0;
1400 spin_lock_init(&fmp->lock);
1401}

/* Internal meter update - process cnt events and update value */
1404static void fmeter_update(struct fmeter *fmp)
1405{
1406 time64_t now;
1407 u32 ticks;
1408
1409 now = ktime_get_seconds();
1410 ticks = now - fmp->time;
1411
1412 if (ticks == 0)
1413 return;
1414
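	/*
	 * Decay the stored value once per elapsed second:
	 * val = val * FM_COEF / FM_SCALE, i.e. val *= 0.933.
	 * Since 0.933^10 is roughly 0.5, the filter has a half-life of
	 * about ten seconds.
	 */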
1415 ticks = min(FM_MAXTICKS, ticks);
1416 while (ticks-- > 0)
1417 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1418 fmp->time = now;
1419
1420 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1421 fmp->cnt = 0;
1422}

/* Process any previous ticks, then bump cnt by one (times scale). */
1425static void fmeter_markevent(struct fmeter *fmp)
1426{
1427 spin_lock(&fmp->lock);
1428 fmeter_update(fmp);
1429 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1430 spin_unlock(&fmp->lock);
1431}

/* Process any previous ticks, then return current value. */
1434static int fmeter_getrate(struct fmeter *fmp)
1435{
1436 int val;
1437
1438 spin_lock(&fmp->lock);
1439 fmeter_update(fmp);
1440 val = fmp->val;
1441 spin_unlock(&fmp->lock);
1442 return val;
1443}
1444
1445static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1448static int cpuset_can_attach(struct cgroup_taskset *tset)
1449{
1450 struct cgroup_subsys_state *css;
1451 struct cpuset *cs;
1452 struct task_struct *task;
1453 int ret;
1454
1455
1456 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
1457 cs = css_cs(css);
1458
1459 mutex_lock(&cpuset_mutex);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
1462 ret = -ENOSPC;
1463 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1465 goto out_unlock;
1466
1467 cgroup_taskset_for_each(task, css, tset) {
1468 ret = task_can_attach(task, cs->cpus_allowed);
1469 if (ret)
1470 goto out_unlock;
1471 ret = security_task_setscheduler(task);
1472 if (ret)
1473 goto out_unlock;
1474 }

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
1480 cs->attach_in_progress++;
1481 ret = 0;
1482out_unlock:
1483 mutex_unlock(&cpuset_mutex);
1484 return ret;
1485}
1486
1487static void cpuset_cancel_attach(struct cgroup_taskset *tset)
1488{
1489 struct cgroup_subsys_state *css;
1490 struct cpuset *cs;
1491
1492 cgroup_taskset_first(tset, &css);
1493 cs = css_cs(css);
1494
1495 mutex_lock(&cpuset_mutex);
1496 css_cs(css)->attach_in_progress--;
1497 mutex_unlock(&cpuset_mutex);
1498}

/*
 * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;
1506
1507static void cpuset_attach(struct cgroup_taskset *tset)
1508{
	/* static buf protected by cpuset_mutex */
1510 static nodemask_t cpuset_attach_nodemask_to;
1511 struct task_struct *task;
1512 struct task_struct *leader;
1513 struct cgroup_subsys_state *css;
1514 struct cpuset *cs;
1515 struct cpuset *oldcs = cpuset_attach_old_cs;
1516
1517 cgroup_taskset_first(tset, &css);
1518 cs = css_cs(css);
1519
1520 mutex_lock(&cpuset_mutex);
1521
1522
1523 if (cs == &top_cpuset)
1524 cpumask_copy(cpus_attach, cpu_possible_mask);
1525 else
1526 guarantee_online_cpus(cs, cpus_attach);
1527
1528 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1529
1530 cgroup_taskset_for_each(task, css, tset) {
		/*
		 * can_attach beforehand should guarantee that we don't
		 * fail.  TODO: have a better way to handle failure here
		 */
1535 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1536
1537 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1538 cpuset_update_task_spread_flag(cs, task);
1539 }

	/*
	 * Change mm for all threadgroup leaders. This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
1545 cpuset_attach_nodemask_to = cs->effective_mems;
1546 cgroup_taskset_for_each_leader(leader, css, tset) {
1547 struct mm_struct *mm = get_task_mm(leader);
1548
1549 if (mm) {
1550 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodeset that we
			 * migrate mm from.
			 */
1560 if (is_memory_migrate(cs))
1561 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1562 &cpuset_attach_nodemask_to);
1563 else
1564 mmput(mm);
1565 }
1566 }
1567
1568 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1569
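	/*
	 * The matching increment was done in cpuset_can_attach();
	 * cpuset_hotplug_update_tasks() waits on cpuset_attach_wq for this
	 * count to reach zero before reprocessing the cpuset.
	 */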
1570 cs->attach_in_progress--;
1571 if (!cs->attach_in_progress)
1572 wake_up(&cpuset_attach_wq);
1573
1574 mutex_unlock(&cpuset_mutex);
1575}

/* The various types of files and directories in a cpuset file system */
1579typedef enum {
1580 FILE_MEMORY_MIGRATE,
1581 FILE_CPULIST,
1582 FILE_MEMLIST,
1583 FILE_EFFECTIVE_CPULIST,
1584 FILE_EFFECTIVE_MEMLIST,
1585 FILE_CPU_EXCLUSIVE,
1586 FILE_MEM_EXCLUSIVE,
1587 FILE_MEM_HARDWALL,
1588 FILE_SCHED_LOAD_BALANCE,
1589 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1590 FILE_MEMORY_PRESSURE_ENABLED,
1591 FILE_MEMORY_PRESSURE,
1592 FILE_SPREAD_PAGE,
1593 FILE_SPREAD_SLAB,
1594} cpuset_filetype_t;
1595
1596static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1597 u64 val)
1598{
1599 struct cpuset *cs = css_cs(css);
1600 cpuset_filetype_t type = cft->private;
1601 int retval = 0;
1602
1603 mutex_lock(&cpuset_mutex);
1604 if (!is_cpuset_online(cs)) {
1605 retval = -ENODEV;
1606 goto out_unlock;
1607 }
1608
1609 switch (type) {
1610 case FILE_CPU_EXCLUSIVE:
1611 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1612 break;
1613 case FILE_MEM_EXCLUSIVE:
1614 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1615 break;
1616 case FILE_MEM_HARDWALL:
1617 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1618 break;
1619 case FILE_SCHED_LOAD_BALANCE:
1620 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1621 break;
1622 case FILE_MEMORY_MIGRATE:
1623 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1624 break;
1625 case FILE_MEMORY_PRESSURE_ENABLED:
1626 cpuset_memory_pressure_enabled = !!val;
1627 break;
1628 case FILE_SPREAD_PAGE:
1629 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1630 break;
1631 case FILE_SPREAD_SLAB:
1632 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1633 break;
1634 default:
1635 retval = -EINVAL;
1636 break;
1637 }
1638out_unlock:
1639 mutex_unlock(&cpuset_mutex);
1640 return retval;
1641}
1642
1643static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1644 s64 val)
1645{
1646 struct cpuset *cs = css_cs(css);
1647 cpuset_filetype_t type = cft->private;
1648 int retval = -ENODEV;
1649
1650 mutex_lock(&cpuset_mutex);
1651 if (!is_cpuset_online(cs))
1652 goto out_unlock;
1653
1654 switch (type) {
1655 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1656 retval = update_relax_domain_level(cs, val);
1657 break;
1658 default:
1659 retval = -EINVAL;
1660 break;
1661 }
1662out_unlock:
1663 mutex_unlock(&cpuset_mutex);
1664 return retval;
1665}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
1670static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1671 char *buf, size_t nbytes, loff_t off)
1672{
1673 struct cpuset *cs = css_cs(of_css(of));
1674 struct cpuset *trialcs;
1675 int retval = -ENODEV;
1676
1677 buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
	 * hierarchies.
	 */
1698 css_get(&cs->css);
1699 kernfs_break_active_protection(of->kn);
1700 flush_work(&cpuset_hotplug_work);
1701
1702 mutex_lock(&cpuset_mutex);
1703 if (!is_cpuset_online(cs))
1704 goto out_unlock;
1705
1706 trialcs = alloc_trial_cpuset(cs);
1707 if (!trialcs) {
1708 retval = -ENOMEM;
1709 goto out_unlock;
1710 }
1711
1712 switch (of_cft(of)->private) {
1713 case FILE_CPULIST:
1714 retval = update_cpumask(cs, trialcs, buf);
1715 break;
1716 case FILE_MEMLIST:
1717 retval = update_nodemask(cs, trialcs, buf);
1718 break;
1719 default:
1720 retval = -EINVAL;
1721 break;
1722 }
1723
1724 free_trial_cpuset(trialcs);
1725out_unlock:
1726 mutex_unlock(&cpuset_mutex);
1727 kernfs_unbreak_active_protection(of->kn);
1728 css_put(&cs->css);
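	/* wait for any mm migrations queued by update_nodemask() to finish */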
1729 flush_workqueue(cpuset_migrate_mm_wq);
1730 return retval ?: nbytes;
1731}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
1741static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1742{
1743 struct cpuset *cs = css_cs(seq_css(sf));
1744 cpuset_filetype_t type = seq_cft(sf)->private;
1745 int ret = 0;
1746
1747 spin_lock_irq(&callback_lock);
1748
1749 switch (type) {
1750 case FILE_CPULIST:
1751 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
1752 break;
1753 case FILE_MEMLIST:
1754 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
1755 break;
1756 case FILE_EFFECTIVE_CPULIST:
1757 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
1758 break;
1759 case FILE_EFFECTIVE_MEMLIST:
1760 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
1761 break;
1762 default:
1763 ret = -EINVAL;
1764 }
1765
1766 spin_unlock_irq(&callback_lock);
1767 return ret;
1768}
1769
1770static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1771{
1772 struct cpuset *cs = css_cs(css);
1773 cpuset_filetype_t type = cft->private;
1774 switch (type) {
1775 case FILE_CPU_EXCLUSIVE:
1776 return is_cpu_exclusive(cs);
1777 case FILE_MEM_EXCLUSIVE:
1778 return is_mem_exclusive(cs);
1779 case FILE_MEM_HARDWALL:
1780 return is_mem_hardwall(cs);
1781 case FILE_SCHED_LOAD_BALANCE:
1782 return is_sched_load_balance(cs);
1783 case FILE_MEMORY_MIGRATE:
1784 return is_memory_migrate(cs);
1785 case FILE_MEMORY_PRESSURE_ENABLED:
1786 return cpuset_memory_pressure_enabled;
1787 case FILE_MEMORY_PRESSURE:
1788 return fmeter_getrate(&cs->fmeter);
1789 case FILE_SPREAD_PAGE:
1790 return is_spread_page(cs);
1791 case FILE_SPREAD_SLAB:
1792 return is_spread_slab(cs);
1793 default:
1794 BUG();
1795 }

	/* Unreachable but makes gcc happy */
1798 return 0;
1799}
1800
1801static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1802{
1803 struct cpuset *cs = css_cs(css);
1804 cpuset_filetype_t type = cft->private;
1805 switch (type) {
1806 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1807 return cs->relax_domain_level;
1808 default:
1809 BUG();
1810 }

	/* Unreachable but makes gcc happy */
1813 return 0;
1814}

/*
 * for the common functions, 'private' gives the type of file
 */
1821static struct cftype files[] = {
1822 {
1823 .name = "cpus",
1824 .seq_show = cpuset_common_seq_show,
1825 .write = cpuset_write_resmask,
1826 .max_write_len = (100U + 6 * NR_CPUS),
1827 .private = FILE_CPULIST,
1828 },
1829
1830 {
1831 .name = "mems",
1832 .seq_show = cpuset_common_seq_show,
1833 .write = cpuset_write_resmask,
1834 .max_write_len = (100U + 6 * MAX_NUMNODES),
1835 .private = FILE_MEMLIST,
1836 },
1837
1838 {
1839 .name = "effective_cpus",
1840 .seq_show = cpuset_common_seq_show,
1841 .private = FILE_EFFECTIVE_CPULIST,
1842 },
1843
1844 {
1845 .name = "effective_mems",
1846 .seq_show = cpuset_common_seq_show,
1847 .private = FILE_EFFECTIVE_MEMLIST,
1848 },
1849
1850 {
1851 .name = "cpu_exclusive",
1852 .read_u64 = cpuset_read_u64,
1853 .write_u64 = cpuset_write_u64,
1854 .private = FILE_CPU_EXCLUSIVE,
1855 },
1856
1857 {
1858 .name = "mem_exclusive",
1859 .read_u64 = cpuset_read_u64,
1860 .write_u64 = cpuset_write_u64,
1861 .private = FILE_MEM_EXCLUSIVE,
1862 },
1863
1864 {
1865 .name = "mem_hardwall",
1866 .read_u64 = cpuset_read_u64,
1867 .write_u64 = cpuset_write_u64,
1868 .private = FILE_MEM_HARDWALL,
1869 },
1870
1871 {
1872 .name = "sched_load_balance",
1873 .read_u64 = cpuset_read_u64,
1874 .write_u64 = cpuset_write_u64,
1875 .private = FILE_SCHED_LOAD_BALANCE,
1876 },
1877
1878 {
1879 .name = "sched_relax_domain_level",
1880 .read_s64 = cpuset_read_s64,
1881 .write_s64 = cpuset_write_s64,
1882 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1883 },
1884
1885 {
1886 .name = "memory_migrate",
1887 .read_u64 = cpuset_read_u64,
1888 .write_u64 = cpuset_write_u64,
1889 .private = FILE_MEMORY_MIGRATE,
1890 },
1891
1892 {
1893 .name = "memory_pressure",
1894 .read_u64 = cpuset_read_u64,
1895 .private = FILE_MEMORY_PRESSURE,
1896 },
1897
1898 {
1899 .name = "memory_spread_page",
1900 .read_u64 = cpuset_read_u64,
1901 .write_u64 = cpuset_write_u64,
1902 .private = FILE_SPREAD_PAGE,
1903 },
1904
1905 {
1906 .name = "memory_spread_slab",
1907 .read_u64 = cpuset_read_u64,
1908 .write_u64 = cpuset_write_u64,
1909 .private = FILE_SPREAD_SLAB,
1910 },
1911
1912 {
1913 .name = "memory_pressure_enabled",
1914 .flags = CFTYPE_ONLY_ON_ROOT,
1915 .read_u64 = cpuset_read_u64,
1916 .write_u64 = cpuset_write_u64,
1917 .private = FILE_MEMORY_PRESSURE_ENABLED,
1918 },
1919
1920 { }
1921};

/*
 *	cpuset_css_alloc - allocate a cpuset css
 *	cgrp:	control group that the new cpuset will be part of
 */
1928static struct cgroup_subsys_state *
1929cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1930{
1931 struct cpuset *cs;
1932
1933 if (!parent_css)
1934 return &top_cpuset.css;
1935
1936 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1937 if (!cs)
1938 return ERR_PTR(-ENOMEM);
1939 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1940 goto free_cs;
1941 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1942 goto free_cpus;
1943
1944 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1945 cpumask_clear(cs->cpus_allowed);
1946 nodes_clear(cs->mems_allowed);
1947 cpumask_clear(cs->effective_cpus);
1948 nodes_clear(cs->effective_mems);
1949 fmeter_init(&cs->fmeter);
1950 cs->relax_domain_level = -1;
1951
1952 return &cs->css;
1953
1954free_cpus:
1955 free_cpumask_var(cs->cpus_allowed);
1956free_cs:
1957 kfree(cs);
1958 return ERR_PTR(-ENOMEM);
1959}
1960
1961static int cpuset_css_online(struct cgroup_subsys_state *css)
1962{
1963 struct cpuset *cs = css_cs(css);
1964 struct cpuset *parent = parent_cs(cs);
1965 struct cpuset *tmp_cs;
1966 struct cgroup_subsys_state *pos_css;
1967
1968 if (!parent)
1969 return 0;
1970
1971 mutex_lock(&cpuset_mutex);
1972
1973 set_bit(CS_ONLINE, &cs->flags);
1974 if (is_spread_page(parent))
1975 set_bit(CS_SPREAD_PAGE, &cs->flags);
1976 if (is_spread_slab(parent))
1977 set_bit(CS_SPREAD_SLAB, &cs->flags);
1978
1979 cpuset_inc();
1980
1981 spin_lock_irq(&callback_lock);
1982 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
1983 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1984 cs->effective_mems = parent->effective_mems;
1985 }
1986 spin_unlock_irq(&callback_lock);
1987
1988 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1989 goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
2004 rcu_read_lock();
2005 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2006 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2007 rcu_read_unlock();
2008 goto out_unlock;
2009 }
2010 }
2011 rcu_read_unlock();
2012
2013 spin_lock_irq(&callback_lock);
2014 cs->mems_allowed = parent->mems_allowed;
2015 cs->effective_mems = parent->mems_allowed;
2016 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2017 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2018 spin_unlock_irq(&callback_lock);
2019out_unlock:
2020 mutex_unlock(&cpuset_mutex);
2021 return 0;
2022}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().
 */
2030static void cpuset_css_offline(struct cgroup_subsys_state *css)
2031{
2032 struct cpuset *cs = css_cs(css);
2033
2034 mutex_lock(&cpuset_mutex);
2035
2036 if (is_sched_load_balance(cs))
2037 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2038
2039 cpuset_dec();
2040 clear_bit(CS_ONLINE, &cs->flags);
2041
2042 mutex_unlock(&cpuset_mutex);
2043}
2044
2045static void cpuset_css_free(struct cgroup_subsys_state *css)
2046{
2047 struct cpuset *cs = css_cs(css);
2048
2049 free_cpumask_var(cs->effective_cpus);
2050 free_cpumask_var(cs->cpus_allowed);
2051 kfree(cs);
2052}
2053
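/*
 * cpuset_bind() is called when the cpuset controller is (re)bound to a
 * hierarchy.  On the default hierarchy the root's configured masks track
 * all possible CPUs and nodes; on a legacy hierarchy they are snapped to
 * the currently effective masks.
 */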
2054static void cpuset_bind(struct cgroup_subsys_state *root_css)
2055{
2056 mutex_lock(&cpuset_mutex);
2057 spin_lock_irq(&callback_lock);
2058
2059 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2060 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2061 top_cpuset.mems_allowed = node_possible_map;
2062 } else {
2063 cpumask_copy(top_cpuset.cpus_allowed,
2064 top_cpuset.effective_cpus);
2065 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2066 }
2067
2068 spin_unlock_irq(&callback_lock);
2069 mutex_unlock(&cpuset_mutex);
2070}

/*
 * Make sure the new task conform to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
2077static void cpuset_fork(struct task_struct *task)
2078{
2079 if (task_css_is_root(task, cpuset_cgrp_id))
2080 return;
2081
	set_cpus_allowed_ptr(task, &current->cpus_allowed);
2083 task->mems_allowed = current->mems_allowed;
2084}
2085
2086struct cgroup_subsys cpuset_cgrp_subsys = {
2087 .css_alloc = cpuset_css_alloc,
2088 .css_online = cpuset_css_online,
2089 .css_offline = cpuset_css_offline,
2090 .css_free = cpuset_css_free,
2091 .can_attach = cpuset_can_attach,
2092 .cancel_attach = cpuset_cancel_attach,
2093 .attach = cpuset_attach,
2094 .post_attach = cpuset_post_attach,
2095 .bind = cpuset_bind,
2096 .fork = cpuset_fork,
2097 .legacy_cftypes = files,
2098 .early_init = true,
2099};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system
 **/
2107int __init cpuset_init(void)
2108{
2109 int err = 0;
2110
2111 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2112 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2113
2114 cpumask_setall(top_cpuset.cpus_allowed);
2115 nodes_setall(top_cpuset.mems_allowed);
2116 cpumask_setall(top_cpuset.effective_cpus);
2117 nodes_setall(top_cpuset.effective_mems);
2118
2119 fmeter_init(&top_cpuset.fmeter);
2120 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2121 top_cpuset.relax_domain_level = -1;
2122
2123 err = register_filesystem(&cpuset_fs_type);
2124 if (err < 0)
2125 return err;
2126
2127 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2128
2129 return 0;
2130}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
2139static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2140{
2141 struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
2147 parent = parent_cs(cs);
2148 while (cpumask_empty(parent->cpus_allowed) ||
2149 nodes_empty(parent->mems_allowed))
2150 parent = parent_cs(parent);
2151
2152 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2153 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2154 pr_cont_cgroup_name(cs->css.cgroup);
2155 pr_cont("\n");
2156 }
2157}
2158
2159static void
2160hotplug_update_tasks_legacy(struct cpuset *cs,
2161 struct cpumask *new_cpus, nodemask_t *new_mems,
2162 bool cpus_updated, bool mems_updated)
2163{
2164 bool is_empty;
2165
2166 spin_lock_irq(&callback_lock);
2167 cpumask_copy(cs->cpus_allowed, new_cpus);
2168 cpumask_copy(cs->effective_cpus, new_cpus);
2169 cs->mems_allowed = *new_mems;
2170 cs->effective_mems = *new_mems;
2171 spin_unlock_irq(&callback_lock);

	/*
	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
2177 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2178 update_tasks_cpumask(cs);
2179 if (mems_updated && !nodes_empty(cs->mems_allowed))
2180 update_tasks_nodemask(cs);
2181
2182 is_empty = cpumask_empty(cs->cpus_allowed) ||
2183 nodes_empty(cs->mems_allowed);
2184
2185 mutex_unlock(&cpuset_mutex);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset.  Should be done outside any lock.
	 */
2192 if (is_empty)
2193 remove_tasks_in_empty_cpuset(cs);
2194
2195 mutex_lock(&cpuset_mutex);
2196}
2197
2198static void
2199hotplug_update_tasks(struct cpuset *cs,
2200 struct cpumask *new_cpus, nodemask_t *new_mems,
2201 bool cpus_updated, bool mems_updated)
2202{
2203 if (cpumask_empty(new_cpus))
2204 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2205 if (nodes_empty(*new_mems))
2206 *new_mems = parent_cs(cs)->effective_mems;
2207
2208 spin_lock_irq(&callback_lock);
2209 cpumask_copy(cs->effective_cpus, new_cpus);
2210 cs->effective_mems = *new_mems;
2211 spin_unlock_irq(&callback_lock);
2212
2213 if (cpus_updated)
2214 update_tasks_cpumask(cs);
2215 if (mems_updated)
2216 update_tasks_nodemask(cs);
2217}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs or its descendants become empty,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
2227static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2228{
2229 static cpumask_t new_cpus;
2230 static nodemask_t new_mems;
2231 bool cpus_updated;
2232 bool mems_updated;
2233retry:
2234 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2235
2236 mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching.  We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
2242 if (cs->attach_in_progress) {
2243 mutex_unlock(&cpuset_mutex);
2244 goto retry;
2245 }
2246
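	/*
	 * Recompute what this cpuset may still use: its configured masks
	 * restricted to what its parent can actually offer after the
	 * hot-unplug.
	 */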
2247 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2248 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2249
2250 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2251 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2252
2253 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2254 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2255 cpus_updated, mems_updated);
2256 else
2257 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2258 cpus_updated, mems_updated);
2259
2260 mutex_unlock(&cpuset_mutex);
2261}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
2279static void cpuset_hotplug_workfn(struct work_struct *work)
2280{
2281 static cpumask_t new_cpus;
2282 static nodemask_t new_mems;
2283 bool cpus_updated, mems_updated;
2284 bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
2285
2286 mutex_lock(&cpuset_mutex);
2287
2288
2289 cpumask_copy(&new_cpus, cpu_active_mask);
2290 new_mems = node_states[N_MEMORY];
2291
2292 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2293 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2294
2295
2296 if (cpus_updated) {
2297 spin_lock_irq(&callback_lock);
2298 if (!on_dfl)
2299 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2300 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2301 spin_unlock_irq(&callback_lock);
2302
2303 }
2304
2305
2306 if (mems_updated) {
2307 spin_lock_irq(&callback_lock);
2308 if (!on_dfl)
2309 top_cpuset.mems_allowed = new_mems;
2310 top_cpuset.effective_mems = new_mems;
2311 spin_unlock_irq(&callback_lock);
2312 update_tasks_nodemask(&top_cpuset);
2313 }
2314
2315 mutex_unlock(&cpuset_mutex);
2316
2317
2318 if (cpus_updated || mems_updated) {
2319 struct cpuset *cs;
2320 struct cgroup_subsys_state *pos_css;
2321
2322 rcu_read_lock();
2323 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2324 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2325 continue;
2326 rcu_read_unlock();
2327
2328 cpuset_hotplug_update_tasks(cs);
2329
2330 rcu_read_lock();
2331 css_put(&cs->css);
2332 }
2333 rcu_read_unlock();
2334 }
2335
2336
2337 if (cpus_updated)
2338 rebuild_sched_domains();
2339}
2340
2341void cpuset_update_active_cpus(void)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 *
	 * We still need to do partition_sched_domains() synchronously;
	 * otherwise, the scheduler will get confused and put tasks to the
	 * dead CPU.  Fall back to the default single domain.
	 * cpuset_hotplug_workfn() will rebuild it as necessary.
	 */
2353 partition_sched_domains(1, NULL, NULL);
2354 schedule_work(&cpuset_hotplug_work);
2355}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
2362static int cpuset_track_online_nodes(struct notifier_block *self,
2363 unsigned long action, void *arg)
2364{
2365 schedule_work(&cpuset_hotplug_work);
2366 return NOTIFY_OK;
2367}
2368
2369static struct notifier_block cpuset_track_online_nodes_nb = {
2370 .notifier_call = cpuset_track_online_nodes,
2371 .priority = 10,
2372};

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
2379void __init cpuset_init_smp(void)
2380{
2381 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2382 top_cpuset.mems_allowed = node_states[N_MEMORY];
2383 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2384
2385 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2386 top_cpuset.effective_mems = node_states[N_MEMORY];
2387
2388 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2389
2390 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
2391 BUG_ON(!cpuset_migrate_mm_wq);
2392}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/
2405void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2406{
2407 unsigned long flags;
2408
2409 spin_lock_irqsave(&callback_lock, flags);
2410 rcu_read_lock();
2411 guarantee_online_cpus(task_cs(tsk), pmask);
2412 rcu_read_unlock();
2413 spin_unlock_irqrestore(&callback_lock, flags);
2414}
2415
2416void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2417{
2418 rcu_read_lock();
2419 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2420 rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed.  However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
2439}
2440
2441void __init cpuset_init_current_mems_allowed(void)
2442{
2443 nodes_setall(current->mems_allowed);
2444}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
2456nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2457{
2458 nodemask_t mask;
2459 unsigned long flags;
2460
2461 spin_lock_irqsave(&callback_lock, flags);
2462 rcu_read_lock();
2463 guarantee_online_mems(task_cs(tsk), &mask);
2464 rcu_read_unlock();
2465 spin_unlock_irqrestore(&callback_lock, flags);
2466
2467 return mask;
2468}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
2476int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2477{
2478 return nodes_intersects(*nodemask, current->mems_allowed);
2479}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
2487static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2488{
2489 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2490 cs = parent_cs(cs);
2491 return cs;
2492}

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and
 * this node is in the nearest hardwalled cpuset ancestor to current's
 * cpuset, yes.  If the task has been OOM killed and has access to memory
 * reserves as specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * Allowing allocations outside the current cpuset, but below the
 * nearest enclosing hardwalled ancestor, is enabled only for
 * GFP_KERNEL allocations.
 */
2534bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
2535{
2536 struct cpuset *cs;
2537 int allowed;
2538 unsigned long flags;
2539
2540 if (in_interrupt())
2541 return true;
2542 if (node_isset(node, current->mems_allowed))
2543 return true;

	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
2548 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2549 return true;
2550 if (gfp_mask & __GFP_HARDWALL)
2551 return false;
2552
2553 if (current->flags & PF_EXITING)
2554 return true;

	/* Not hardwall and node outside mems_allowed: scan up cpuset hierarchy */
2557 spin_lock_irqsave(&callback_lock, flags);
2558
2559 rcu_read_lock();
2560 cs = nearest_hardwall_ancestor(task_cs(current));
2561 allowed = node_isset(node, cs->mems_allowed);
2562 rcu_read_unlock();
2563
2564 spin_unlock_irqrestore(&callback_lock, flags);
2565 return allowed;
2566}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * The returned node is only a suggested starting point; the zonelist
 * passed to __alloc_pages() still includes all nodes, and the slab
 * allocator falls back to the local node if handed an offline node.
 */
2595static int cpuset_spread_node(int *rotor)
2596{
2597 return *rotor = next_node_in(*rotor, current->mems_allowed);
2598}
2599
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
2617
2618EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the two tasks might be able to free some memory needed
 * by the other.
 **/
2631int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2632 const struct task_struct *tsk2)
2633{
2634 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2635}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
2643void cpuset_print_current_mems_allowed(void)
2644{
2645 struct cgroup *cgrp;
2646
2647 rcu_read_lock();
2648
2649 cgrp = task_cs(current)->css.cgroup;
2650 pr_info("%s cpuset=", current->comm);
2651 pr_cont_cgroup_name(cgrp);
	pr_cont(" mems_allowed=%*pbl\n",
		nodemask_pr_args(&current->mems_allowed));
2654
2655 rcu_read_unlock();
2656}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
2664int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/
2684void __cpuset_memory_pressure_bump(void)
2685{
2686 rcu_read_lock();
2687 fmeter_markevent(&task_cs(current)->fmeter);
2688 rcu_read_unlock();
2689}
2690
2691#ifdef CONFIG_PROC_PID_CPUSET

/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
2701int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2702 struct pid *pid, struct task_struct *tsk)
2703{
2704 char *buf;
2705 struct cgroup_subsys_state *css;
2706 int retval;
2707
2708 retval = -ENOMEM;
2709 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2710 if (!buf)
2711 goto out;
2712
2713 css = task_get_css(tsk, cpuset_cgrp_id);
2714 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
2715 current->nsproxy->cgroup_ns);
2716 css_put(css);
2717 if (retval >= PATH_MAX)
2718 retval = -ENAMETOOLONG;
2719 if (retval < 0)
2720 goto out_free;
2721 seq_puts(m, buf);
2722 seq_putc(m, '\n');
2723 retval = 0;
2724out_free:
2725 kfree(buf);
2726out:
2727 return retval;
2728}
2729#endif

/* Display task mems_allowed in /proc/<pid>/status file. */
2732void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2733{
2734 seq_printf(m, "Mems_allowed:\t%*pb\n",
2735 nodemask_pr_args(&task->mems_allowed));
2736 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
2737 nodemask_pr_args(&task->mems_allowed));
2738}
2739