/*
 * kernel/cgroup/cpuset.c
 *
 * Processor and memory placement constraints for sets of tasks:
 * the "cpuset" cgroup controller.
 */
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/export.h>
41#include <linux/mount.h>
42#include <linux/namei.h>
43#include <linux/pagemap.h>
44#include <linux/proc_fs.h>
45#include <linux/rcupdate.h>
46#include <linux/sched.h>
47#include <linux/sched/mm.h>
48#include <linux/sched/task.h>
49#include <linux/seq_file.h>
50#include <linux/security.h>
51#include <linux/slab.h>
52#include <linux/spinlock.h>
53#include <linux/stat.h>
54#include <linux/string.h>
55#include <linux/time.h>
56#include <linux/time64.h>
57#include <linux/backing-dev.h>
58#include <linux/sort.h>
59#include <linux/oom.h>
60
61#include <linux/uaccess.h>
62#include <linux/atomic.h>
63#include <linux/mutex.h>
64#include <linux/cgroup.h>
65#include <linux/wait.h>
66
67DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
68DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
69
70
71
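/*
 * Frequency meter: a simple digital low-pass filter used to track the
 * rate of "memory pressure" events per cpuset.  fmeter_update() decays
 * ->val once per second and folds in the events counted in ->cnt since
 * the last update; see the FM_* constants below for the parameters.
 */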
72struct fmeter {
73 int cnt;
74 int val;
75 time64_t time;
76 spinlock_t lock;
77};
78
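/*
 * Per-cgroup cpuset state.  cpus_allowed/mems_allowed hold the
 * user-configured masks; effective_cpus/effective_mems hold the masks
 * actually applied to tasks, i.e. the configured masks restricted by
 * the ancestors and by CPU/memory hotplug.
 */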
79struct cpuset {
80 struct cgroup_subsys_state css;
81
82 unsigned long flags;
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105 cpumask_var_t cpus_allowed;
106 nodemask_t mems_allowed;
107
108
109 cpumask_var_t effective_cpus;
110 nodemask_t effective_mems;
111
112
113
114
115
116
117
118
119
120
121
122 nodemask_t old_mems_allowed;
123
124 struct fmeter fmeter;
125
126
127
128
129
130 int attach_in_progress;
131
132
133 int pn;
134
135
136 int relax_domain_level;
137};
138
139static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
140{
141 return css ? container_of(css, struct cpuset, css) : NULL;
142}
143
144
145static inline struct cpuset *task_cs(struct task_struct *task)
146{
147 return css_cs(task_css(task, cpuset_cgrp_id));
148}
149
150static inline struct cpuset *parent_cs(struct cpuset *cs)
151{
152 return css_cs(cs->css.parent);
153}
154
155#ifdef CONFIG_NUMA
156static inline bool task_has_mempolicy(struct task_struct *task)
157{
158 return task->mempolicy;
159}
160#else
161static inline bool task_has_mempolicy(struct task_struct *task)
162{
163 return false;
164}
165#endif
166
167
168
169typedef enum {
170 CS_ONLINE,
171 CS_CPU_EXCLUSIVE,
172 CS_MEM_EXCLUSIVE,
173 CS_MEM_HARDWALL,
174 CS_MEMORY_MIGRATE,
175 CS_SCHED_LOAD_BALANCE,
176 CS_SPREAD_PAGE,
177 CS_SPREAD_SLAB,
178} cpuset_flagbits_t;
179
180
181static inline bool is_cpuset_online(struct cpuset *cs)
182{
183 return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
184}
185
186static inline int is_cpu_exclusive(const struct cpuset *cs)
187{
188 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
189}
190
191static inline int is_mem_exclusive(const struct cpuset *cs)
192{
193 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
194}
195
196static inline int is_mem_hardwall(const struct cpuset *cs)
197{
198 return test_bit(CS_MEM_HARDWALL, &cs->flags);
199}
200
201static inline int is_sched_load_balance(const struct cpuset *cs)
202{
203 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
204}
205
206static inline int is_memory_migrate(const struct cpuset *cs)
207{
208 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
209}
210
211static inline int is_spread_page(const struct cpuset *cs)
212{
213 return test_bit(CS_SPREAD_PAGE, &cs->flags);
214}
215
216static inline int is_spread_slab(const struct cpuset *cs)
217{
218 return test_bit(CS_SPREAD_SLAB, &cs->flags);
219}
220
221static struct cpuset top_cpuset = {
222 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
223 (1 << CS_MEM_EXCLUSIVE)),
224};
225
226
227
228
229
230
231
232
233
234
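/*
 * cpuset_for_each_child - iterate over the online child cpusets of
 * @parent_cs.  Must be called under rcu_read_lock().
 */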
235#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
236 css_for_each_child((pos_css), &(parent_cs)->css) \
237 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
238
239
240
241
242
243
244
245
246
247
248
249
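/*
 * cpuset_for_each_descendant_pre - pre-order walk of the online cpusets
 * in the subtree rooted at @root_cs (including @root_cs itself).  Must
 * be called under rcu_read_lock(); callers may skip a subtree with
 * css_rightmost_descendant(), as the users below do.
 */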
250#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
251 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
252 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
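/*
 * Locking: cpuset_mutex serializes all cpuset modifications and is held
 * by writers for the duration of an update.  callback_lock is a
 * spinlock taken (irq-safe) around the fields that hot paths such as
 * the page allocator read, since those paths cannot sleep on the mutex.
 * Writers take both; readers take whichever suits their context.
 */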
290static DEFINE_MUTEX(cpuset_mutex);
291static DEFINE_SPINLOCK(callback_lock);
292
293static struct workqueue_struct *cpuset_migrate_mm_wq;
294
295
296
297
298static void cpuset_hotplug_workfn(struct work_struct *work);
299static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
300
301static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
302
303
304
305
306
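/*
 * True when cpuset is attached to the default (cgroup2) hierarchy, or
 * mounted on a legacy hierarchy with the v2-mode mount option.
 */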
307static inline bool is_in_v2_mode(void)
308{
309 return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
310 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
311}
312
313
314
315
316
317
318static struct dentry *cpuset_mount(struct file_system_type *fs_type,
319 int flags, const char *unused_dev_name, void *data)
320{
321 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
322 struct dentry *ret = ERR_PTR(-ENODEV);
323 if (cgroup_fs) {
324 char mountopts[] =
325 "cpuset,noprefix,"
326 "release_agent=/sbin/cpuset_release_agent";
327 ret = cgroup_fs->mount(cgroup_fs, flags,
328 unused_dev_name, mountopts);
329 put_filesystem(cgroup_fs);
330 }
331 return ret;
332}
333
334static struct file_system_type cpuset_fs_type = {
335 .name = "cpuset",
336 .mount = cpuset_mount,
337};
338
339
340
341
342
343
344
345
346
347
348
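/*
 * Return in *pmask the portion of a cpuset's effective_cpus that is
 * online.  Walk up the hierarchy while the intersection is empty; if
 * even the top cpuset has no online effective CPU (a transient hotplug
 * window), fall back to cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */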
349static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
350{
351 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
352 cs = parent_cs(cs);
353 if (unlikely(!cs)) {
354
355
356
357
358
359
360
361 cpumask_copy(pmask, cpu_online_mask);
362 return;
363 }
364 }
365 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
366}
367
368
369
370
371
372
373
374
375
376
377
378
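/*
 * Return in *pmask the portion of a cpuset's effective_mems that has
 * memory online.  Walk up the hierarchy until such a cpuset is found;
 * the top cpuset always has some memory online.
 *
 * Call with callback_lock or cpuset_mutex held.
 */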
379static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
380{
381 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
382 cs = parent_cs(cs);
383 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
384}
385
386
387
388
389
390
391static void cpuset_update_task_spread_flag(struct cpuset *cs,
392 struct task_struct *tsk)
393{
394 if (is_spread_page(cs))
395 task_set_spread_page(tsk);
396 else
397 task_clear_spread_page(tsk);
398
399 if (is_spread_slab(cs))
400 task_set_spread_slab(tsk);
401 else
402 task_clear_spread_slab(tsk);
403}
404
405
406
407
408
409
410
411
412
413static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
414{
415 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
416 nodes_subset(p->mems_allowed, q->mems_allowed) &&
417 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
418 is_mem_exclusive(p) <= is_mem_exclusive(q);
419}
420
421
422
423
424
425static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
426{
427 struct cpuset *trial;
428
429 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
430 if (!trial)
431 return NULL;
432
433 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
434 goto free_cs;
435 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
436 goto free_cpus;
437
438 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
439 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
440 return trial;
441
442free_cpus:
443 free_cpumask_var(trial->cpus_allowed);
444free_cs:
445 kfree(trial);
446 return NULL;
447}
448
449
450
451
452
453static void free_trial_cpuset(struct cpuset *trial)
454{
455 free_cpumask_var(trial->effective_cpus);
456 free_cpumask_var(trial->cpus_allowed);
457 kfree(trial);
458}
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
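/*
 * validate_change() - validate a proposed change to cpuset @cur.
 *
 * @trial is a copy of @cur with the requested change applied.  The
 * change is rejected if it would break the cpuset invariants: children
 * must remain subsets of their parent, exclusive siblings must not
 * overlap, a cpuset with attached tasks (or an attach in progress) must
 * not be left without CPUs or memory nodes, and a cpu_exclusive cpuset
 * may only shrink if the scheduler agrees the remaining CPUs suffice.
 *
 * Called with cpuset_mutex held; takes rcu_read_lock to walk siblings.
 */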
480static int validate_change(struct cpuset *cur, struct cpuset *trial)
481{
482 struct cgroup_subsys_state *css;
483 struct cpuset *c, *par;
484 int ret;
485
486 rcu_read_lock();
487
488
489 ret = -EBUSY;
490 cpuset_for_each_child(c, css, cur)
491 if (!is_cpuset_subset(c, trial))
492 goto out;
493
494
495 ret = 0;
496 if (cur == &top_cpuset)
497 goto out;
498
499 par = parent_cs(cur);
500
501
502 ret = -EACCES;
503 if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
504 goto out;
505
506
507
508
509
510 ret = -EINVAL;
511 cpuset_for_each_child(c, css, par) {
512 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
513 c != cur &&
514 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
515 goto out;
516 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
517 c != cur &&
518 nodes_intersects(trial->mems_allowed, c->mems_allowed))
519 goto out;
520 }
521
522
523
524
525
526 ret = -ENOSPC;
527 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
528 if (!cpumask_empty(cur->cpus_allowed) &&
529 cpumask_empty(trial->cpus_allowed))
530 goto out;
531 if (!nodes_empty(cur->mems_allowed) &&
532 nodes_empty(trial->mems_allowed))
533 goto out;
534 }
535
536
537
538
539
540 ret = -EBUSY;
541 if (is_cpu_exclusive(cur) &&
542 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
543 trial->cpus_allowed))
544 goto out;
545
546 ret = 0;
547out:
548 rcu_read_unlock();
549 return ret;
550}
551
552#ifdef CONFIG_SMP
553
554
555
556
557static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
558{
559 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
560}
561
562static void
563update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
564{
565 if (dattr->relax_domain_level < c->relax_domain_level)
566 dattr->relax_domain_level = c->relax_domain_level;
567 return;
568}
569
570static void update_domain_attr_tree(struct sched_domain_attr *dattr,
571 struct cpuset *root_cs)
572{
573 struct cpuset *cp;
574 struct cgroup_subsys_state *pos_css;
575
576 rcu_read_lock();
577 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
578
579 if (cpumask_empty(cp->cpus_allowed)) {
580 pos_css = css_rightmost_descendant(pos_css);
581 continue;
582 }
583
584 if (is_sched_load_balance(cp))
585 update_domain_attr(dattr, cp);
586 }
587 rcu_read_unlock();
588}
589
590
591static inline int nr_cpusets(void)
592{
593
594 return static_key_count(&cpusets_enabled_key.key) + 1;
595}
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
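/*
 * generate_sched_domains() - build the set of scheduler-domain cpumasks
 * implied by the cpusets that have sched_load_balance enabled.
 *
 * Finds the load-balanced cpusets, merges any whose effective CPUs
 * overlap into a single partition, and returns the number of partitions
 * along with one cpumask (and optional sched_domain_attr) per
 * partition.  Isolated CPUs (isolcpus=) are excluded throughout.  If
 * the top cpuset itself is load-balanced, a single all-covering domain
 * is returned.
 *
 * Must be called with cpuset_mutex held.
 */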
651static int generate_sched_domains(cpumask_var_t **domains,
652 struct sched_domain_attr **attributes)
653{
654 struct cpuset *cp;
655 struct cpuset **csa;
656 int csn;
657 int i, j, k;
658 cpumask_var_t *doms;
659 cpumask_var_t non_isolated_cpus;
660 struct sched_domain_attr *dattr;
661 int ndoms = 0;
662 int nslot;
663 struct cgroup_subsys_state *pos_css;
664
665 doms = NULL;
666 dattr = NULL;
667 csa = NULL;
668
669 if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
670 goto done;
671 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
672
673
674 if (is_sched_load_balance(&top_cpuset)) {
675 ndoms = 1;
676 doms = alloc_sched_domains(ndoms);
677 if (!doms)
678 goto done;
679
680 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
681 if (dattr) {
682 *dattr = SD_ATTR_INIT;
683 update_domain_attr_tree(dattr, &top_cpuset);
684 }
685 cpumask_and(doms[0], top_cpuset.effective_cpus,
686 non_isolated_cpus);
687
688 goto done;
689 }
690
691 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
692 if (!csa)
693 goto done;
694 csn = 0;
695
696 rcu_read_lock();
697 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
698 if (cp == &top_cpuset)
699 continue;
700
701
702
703
704
705
706
707
708 if (!cpumask_empty(cp->cpus_allowed) &&
709 !(is_sched_load_balance(cp) &&
710 cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
711 continue;
712
713 if (is_sched_load_balance(cp))
714 csa[csn++] = cp;
715
716
717 pos_css = css_rightmost_descendant(pos_css);
718 }
719 rcu_read_unlock();
720
721 for (i = 0; i < csn; i++)
722 csa[i]->pn = i;
723 ndoms = csn;
724
725restart:
726
727 for (i = 0; i < csn; i++) {
728 struct cpuset *a = csa[i];
729 int apn = a->pn;
730
731 for (j = 0; j < csn; j++) {
732 struct cpuset *b = csa[j];
733 int bpn = b->pn;
734
735 if (apn != bpn && cpusets_overlap(a, b)) {
736 for (k = 0; k < csn; k++) {
737 struct cpuset *c = csa[k];
738
739 if (c->pn == bpn)
740 c->pn = apn;
741 }
742 ndoms--;
743 goto restart;
744 }
745 }
746 }
747
748
749
750
751
752 doms = alloc_sched_domains(ndoms);
753 if (!doms)
754 goto done;
755
756
757
758
759
760 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
761
762 for (nslot = 0, i = 0; i < csn; i++) {
763 struct cpuset *a = csa[i];
764 struct cpumask *dp;
765 int apn = a->pn;
766
767 if (apn < 0) {
768
769 continue;
770 }
771
772 dp = doms[nslot];
773
774 if (nslot == ndoms) {
775 static int warnings = 10;
776 if (warnings) {
777 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
778 nslot, ndoms, csn, i, apn);
779 warnings--;
780 }
781 continue;
782 }
783
784 cpumask_clear(dp);
785 if (dattr)
786 *(dattr + nslot) = SD_ATTR_INIT;
787 for (j = i; j < csn; j++) {
788 struct cpuset *b = csa[j];
789
790 if (apn == b->pn) {
791 cpumask_or(dp, dp, b->effective_cpus);
792 cpumask_and(dp, dp, non_isolated_cpus);
793 if (dattr)
794 update_domain_attr_tree(dattr + nslot, b);
795
796
797 b->pn = -1;
798 }
799 }
800 nslot++;
801 }
802 BUG_ON(nslot != ndoms);
803
804done:
805 free_cpumask_var(non_isolated_cpus);
806 kfree(csa);
807
808
809
810
811
812 if (doms == NULL)
813 ndoms = 1;
814
815 *domains = doms;
816 *attributes = dattr;
817 return ndoms;
818}
819
820
821
822
823
824
825
826
827
828
829
830
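/*
 * Regenerate and repartition the scheduler domains.  Called after a
 * change that can affect domain placement (CPU masks, load-balance
 * flags, relax_domain_level, hotplug).  Bails out if the top cpuset's
 * effective CPUs do not match cpu_active_mask, which means a hotplug
 * operation is still in flight and will rebuild the domains itself.
 */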
831static void rebuild_sched_domains_locked(void)
832{
833 struct sched_domain_attr *attr;
834 cpumask_var_t *doms;
835 int ndoms;
836
837 lockdep_assert_held(&cpuset_mutex);
838 get_online_cpus();
839
840
841
842
843
844
845 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
846 goto out;
847
848
849 ndoms = generate_sched_domains(&doms, &attr);
850
851
852 partition_sched_domains(ndoms, doms, attr);
853out:
854 put_online_cpus();
855}
856#else
857static void rebuild_sched_domains_locked(void)
858{
859}
860#endif
861
862void rebuild_sched_domains(void)
863{
864 mutex_lock(&cpuset_mutex);
865 rebuild_sched_domains_locked();
866 mutex_unlock(&cpuset_mutex);
867}
868
869
870
871
872
873
874
875
876
877static void update_tasks_cpumask(struct cpuset *cs)
878{
879 struct css_task_iter it;
880 struct task_struct *task;
881
882 css_task_iter_start(&cs->css, 0, &it);
883 while ((task = css_task_iter_next(&it)))
884 set_cpus_allowed_ptr(task, cs->effective_cpus);
885 css_task_iter_end(&it);
886}
887
888
889
890
891
892
893
894
895
896
897
898
899
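/*
 * update_cpumasks_hier() - propagate a CPU mask change down the
 * hierarchy.
 *
 * For each descendant, recompute effective_cpus as the intersection of
 * its own cpus_allowed with the parent's effective_cpus (in v2 mode an
 * empty result falls back to the parent's mask), push the new mask to
 * the member tasks, and rebuild the sched domains if a load-balanced
 * cpuset changed.  @new_cpus is scratch storage provided by the caller.
 */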
900static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
901{
902 struct cpuset *cp;
903 struct cgroup_subsys_state *pos_css;
904 bool need_rebuild_sched_domains = false;
905
906 rcu_read_lock();
907 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
908 struct cpuset *parent = parent_cs(cp);
909
910 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
911
912
913
914
915
916 if (is_in_v2_mode() && cpumask_empty(new_cpus))
917 cpumask_copy(new_cpus, parent->effective_cpus);
918
919
920 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
921 pos_css = css_rightmost_descendant(pos_css);
922 continue;
923 }
924
925 if (!css_tryget_online(&cp->css))
926 continue;
927 rcu_read_unlock();
928
929 spin_lock_irq(&callback_lock);
930 cpumask_copy(cp->effective_cpus, new_cpus);
931 spin_unlock_irq(&callback_lock);
932
933 WARN_ON(!is_in_v2_mode() &&
934 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
935
936 update_tasks_cpumask(cp);
937
938
939
940
941
942 if (!cpumask_empty(cp->cpus_allowed) &&
943 is_sched_load_balance(cp))
944 need_rebuild_sched_domains = true;
945
946 rcu_read_lock();
947 css_put(&cp->css);
948 }
949 rcu_read_unlock();
950
951 if (need_rebuild_sched_domains)
952 rebuild_sched_domains_locked();
953}
954
955
956
957
958
959
960
961static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
962 const char *buf)
963{
964 int retval;
965
966
967 if (cs == &top_cpuset)
968 return -EACCES;
969
970
971
972
973
974
975
976 if (!*buf) {
977 cpumask_clear(trialcs->cpus_allowed);
978 } else {
979 retval = cpulist_parse(buf, trialcs->cpus_allowed);
980 if (retval < 0)
981 return retval;
982
983 if (!cpumask_subset(trialcs->cpus_allowed,
984 top_cpuset.cpus_allowed))
985 return -EINVAL;
986 }
987
988
989 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
990 return 0;
991
992 retval = validate_change(cs, trialcs);
993 if (retval < 0)
994 return retval;
995
996 spin_lock_irq(&callback_lock);
997 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
998 spin_unlock_irq(&callback_lock);
999
1000
1001 update_cpumasks_hier(cs, trialcs->cpus_allowed);
1002 return 0;
1003}
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013struct cpuset_migrate_mm_work {
1014 struct work_struct work;
1015 struct mm_struct *mm;
1016 nodemask_t from;
1017 nodemask_t to;
1018};
1019
1020static void cpuset_migrate_mm_workfn(struct work_struct *work)
1021{
1022 struct cpuset_migrate_mm_work *mwork =
1023 container_of(work, struct cpuset_migrate_mm_work, work);
1024
1025
1026 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1027 mmput(mwork->mm);
1028 kfree(mwork);
1029}
1030
1031static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1032 const nodemask_t *to)
1033{
1034 struct cpuset_migrate_mm_work *mwork;
1035
1036 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1037 if (mwork) {
1038 mwork->mm = mm;
1039 mwork->from = *from;
1040 mwork->to = *to;
1041 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1042 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1043 } else {
1044 mmput(mm);
1045 }
1046}
1047
1048static void cpuset_post_attach(void)
1049{
1050 flush_workqueue(cpuset_migrate_mm_wq);
1051}
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063static void cpuset_change_task_nodemask(struct task_struct *tsk,
1064 nodemask_t *newmems)
1065{
1066 task_lock(tsk);
1067
1068 local_irq_disable();
1069 write_seqcount_begin(&tsk->mems_allowed_seq);
1070
1071 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1072 mpol_rebind_task(tsk, newmems);
1073 tsk->mems_allowed = *newmems;
1074
1075 write_seqcount_end(&tsk->mems_allowed_seq);
1076 local_irq_enable();
1077
1078 task_unlock(tsk);
1079}
1080
1081static void *cpuset_being_rebound;
1082
1083
1084
1085
1086
1087
1088
1089
1090
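/*
 * Update the memory nodemask of every task in cpuset @cs to the
 * cpuset's online effective_mems, rebinding each task's mempolicy and,
 * if memory_migrate is set, migrating its pages to the new nodes.
 */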
1091static void update_tasks_nodemask(struct cpuset *cs)
1092{
1093 static nodemask_t newmems;
1094 struct css_task_iter it;
1095 struct task_struct *task;
1096
1097 cpuset_being_rebound = cs;
1098
1099 guarantee_online_mems(cs, &newmems);
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111 css_task_iter_start(&cs->css, 0, &it);
1112 while ((task = css_task_iter_next(&it))) {
1113 struct mm_struct *mm;
1114 bool migrate;
1115
1116 cpuset_change_task_nodemask(task, &newmems);
1117
1118 mm = get_task_mm(task);
1119 if (!mm)
1120 continue;
1121
1122 migrate = is_memory_migrate(cs);
1123
1124 mpol_rebind_mm(mm, &cs->mems_allowed);
1125 if (migrate)
1126 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1127 else
1128 mmput(mm);
1129 }
1130 css_task_iter_end(&it);
1131
1132
1133
1134
1135
1136 cs->old_mems_allowed = newmems;
1137
1138
1139 cpuset_being_rebound = NULL;
1140}
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
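/*
 * update_nodemasks_hier() - propagate a memory nodemask change down the
 * hierarchy; the memory analogue of update_cpumasks_hier().  @new_mems
 * is scratch storage provided by the caller.
 */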
1154static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1155{
1156 struct cpuset *cp;
1157 struct cgroup_subsys_state *pos_css;
1158
1159 rcu_read_lock();
1160 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1161 struct cpuset *parent = parent_cs(cp);
1162
1163 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1164
1165
1166
1167
1168
1169 if (is_in_v2_mode() && nodes_empty(*new_mems))
1170 *new_mems = parent->effective_mems;
1171
1172
1173 if (nodes_equal(*new_mems, cp->effective_mems)) {
1174 pos_css = css_rightmost_descendant(pos_css);
1175 continue;
1176 }
1177
1178 if (!css_tryget_online(&cp->css))
1179 continue;
1180 rcu_read_unlock();
1181
1182 spin_lock_irq(&callback_lock);
1183 cp->effective_mems = *new_mems;
1184 spin_unlock_irq(&callback_lock);
1185
1186 WARN_ON(!is_in_v2_mode() &&
1187 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1188
1189 update_tasks_nodemask(cp);
1190
1191 rcu_read_lock();
1192 css_put(&cp->css);
1193 }
1194 rcu_read_unlock();
1195}
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1211 const char *buf)
1212{
1213 int retval;
1214
1215
1216
1217
1218
1219 if (cs == &top_cpuset) {
1220 retval = -EACCES;
1221 goto done;
1222 }
1223
1224
1225
1226
1227
1228
1229
1230 if (!*buf) {
1231 nodes_clear(trialcs->mems_allowed);
1232 } else {
1233 retval = nodelist_parse(buf, trialcs->mems_allowed);
1234 if (retval < 0)
1235 goto done;
1236
1237 if (!nodes_subset(trialcs->mems_allowed,
1238 top_cpuset.mems_allowed)) {
1239 retval = -EINVAL;
1240 goto done;
1241 }
1242 }
1243
1244 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1245 retval = 0;
1246 goto done;
1247 }
1248 retval = validate_change(cs, trialcs);
1249 if (retval < 0)
1250 goto done;
1251
1252 spin_lock_irq(&callback_lock);
1253 cs->mems_allowed = trialcs->mems_allowed;
1254 spin_unlock_irq(&callback_lock);
1255
1256
1257 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1258done:
1259 return retval;
1260}
1261
1262int current_cpuset_is_being_rebound(void)
1263{
1264 int ret;
1265
1266 rcu_read_lock();
1267 ret = task_cs(current) == cpuset_being_rebound;
1268 rcu_read_unlock();
1269
1270 return ret;
1271}
1272
1273static int update_relax_domain_level(struct cpuset *cs, s64 val)
1274{
1275#ifdef CONFIG_SMP
1276 if (val < -1 || val >= sched_domain_level_max)
1277 return -EINVAL;
1278#endif
1279
1280 if (val != cs->relax_domain_level) {
1281 cs->relax_domain_level = val;
1282 if (!cpumask_empty(cs->cpus_allowed) &&
1283 is_sched_load_balance(cs))
1284 rebuild_sched_domains_locked();
1285 }
1286
1287 return 0;
1288}
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298static void update_tasks_flags(struct cpuset *cs)
1299{
1300 struct css_task_iter it;
1301 struct task_struct *task;
1302
1303 css_task_iter_start(&cs->css, 0, &it);
1304 while ((task = css_task_iter_next(&it)))
1305 cpuset_update_task_spread_flag(cs, task);
1306 css_task_iter_end(&it);
1307}
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
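/*
 * update_flag() - turn a cpuset flag bit on or off, after validating
 * the resulting state.  Rebuilds the sched domains when the
 * load-balance setting changed and updates member tasks when a spread
 * flag changed.
 */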
1318static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1319 int turning_on)
1320{
1321 struct cpuset *trialcs;
1322 int balance_flag_changed;
1323 int spread_flag_changed;
1324 int err;
1325
1326 trialcs = alloc_trial_cpuset(cs);
1327 if (!trialcs)
1328 return -ENOMEM;
1329
1330 if (turning_on)
1331 set_bit(bit, &trialcs->flags);
1332 else
1333 clear_bit(bit, &trialcs->flags);
1334
1335 err = validate_change(cs, trialcs);
1336 if (err < 0)
1337 goto out;
1338
1339 balance_flag_changed = (is_sched_load_balance(cs) !=
1340 is_sched_load_balance(trialcs));
1341
1342 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1343 || (is_spread_page(cs) != is_spread_page(trialcs)));
1344
1345 spin_lock_irq(&callback_lock);
1346 cs->flags = trialcs->flags;
1347 spin_unlock_irq(&callback_lock);
1348
1349 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1350 rebuild_sched_domains_locked();
1351
1352 if (spread_flag_changed)
1353 update_tasks_flags(cs);
1354out:
1355 free_trial_cpuset(trialcs);
1356 return err;
1357}
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
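/*
 * Frequency meter parameters.  Each elapsed second the accumulated
 * value decays by FM_COEF/FM_SCALE (933/1000, i.e. roughly a ten second
 * half-life) and the events counted since the last update are folded in
 * scaled by (FM_SCALE - FM_COEF)/FM_SCALE.  For example, a single burst
 * of FM_SCALE events reads back as 67 one second later and has decayed
 * to about 33 ten seconds after that.  FM_MAXTICKS bounds the decay
 * loop and FM_MAXCNT caps the per-interval event count.
 */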
1404#define FM_COEF 933
1405#define FM_MAXTICKS ((u32)99)
1406#define FM_MAXCNT 1000000
1407#define FM_SCALE 1000
1408
1409
1410static void fmeter_init(struct fmeter *fmp)
1411{
1412 fmp->cnt = 0;
1413 fmp->val = 0;
1414 fmp->time = 0;
1415 spin_lock_init(&fmp->lock);
1416}
1417
1418
1419static void fmeter_update(struct fmeter *fmp)
1420{
1421 time64_t now;
1422 u32 ticks;
1423
1424 now = ktime_get_seconds();
1425 ticks = now - fmp->time;
1426
1427 if (ticks == 0)
1428 return;
1429
1430 ticks = min(FM_MAXTICKS, ticks);
1431 while (ticks-- > 0)
1432 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1433 fmp->time = now;
1434
1435 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1436 fmp->cnt = 0;
1437}
1438
1439
1440static void fmeter_markevent(struct fmeter *fmp)
1441{
1442 spin_lock(&fmp->lock);
1443 fmeter_update(fmp);
1444 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1445 spin_unlock(&fmp->lock);
1446}
1447
1448
1449static int fmeter_getrate(struct fmeter *fmp)
1450{
1451 int val;
1452
1453 spin_lock(&fmp->lock);
1454 fmeter_update(fmp);
1455 val = fmp->val;
1456 spin_unlock(&fmp->lock);
1457 return val;
1458}
1459
1460static struct cpuset *cpuset_attach_old_cs;
1461
1462
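/*
 * Called by the cgroup core to check whether the tasks in @tset may be
 * attached to this cpuset.  Fails with -ENOSPC if the cpuset has no
 * CPUs or memory nodes (legacy hierarchy only), checks each task with
 * task_can_attach() and the security hook, and bumps attach_in_progress
 * so the cpuset cannot be emptied before cpuset_attach() completes.
 */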
1463static int cpuset_can_attach(struct cgroup_taskset *tset)
1464{
1465 struct cgroup_subsys_state *css;
1466 struct cpuset *cs;
1467 struct task_struct *task;
1468 int ret;
1469
1470
1471 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
1472 cs = css_cs(css);
1473
1474 mutex_lock(&cpuset_mutex);
1475
1476
1477 ret = -ENOSPC;
1478 if (!is_in_v2_mode() &&
1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1480 goto out_unlock;
1481
1482 cgroup_taskset_for_each(task, css, tset) {
1483 ret = task_can_attach(task, cs->cpus_allowed);
1484 if (ret)
1485 goto out_unlock;
1486 ret = security_task_setscheduler(task);
1487 if (ret)
1488 goto out_unlock;
1489 }
1490
1491
1492
1493
1494
1495 cs->attach_in_progress++;
1496 ret = 0;
1497out_unlock:
1498 mutex_unlock(&cpuset_mutex);
1499 return ret;
1500}
1501
1502static void cpuset_cancel_attach(struct cgroup_taskset *tset)
1503{
1504 struct cgroup_subsys_state *css;
1505 struct cpuset *cs;
1506
1507 cgroup_taskset_first(tset, &css);
1508 cs = css_cs(css);
1509
1510 mutex_lock(&cpuset_mutex);
1511 css_cs(css)->attach_in_progress--;
1512 mutex_unlock(&cpuset_mutex);
1513}
1514
1515
1516
1517
1518
1519
1520static cpumask_var_t cpus_attach;
1521
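/*
 * Actually move the tasks: pin each task's CPU mask to cpus_attach (the
 * cpuset's online CPUs, or cpu_possible_mask for the top cpuset) and
 * its nodemask to the cpuset's online mems, then rebind and optionally
 * migrate the mm of each thread-group leader.  Memory migration runs
 * asynchronously on cpuset_migrate_mm_wq and is flushed in
 * cpuset_post_attach().
 */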
1522static void cpuset_attach(struct cgroup_taskset *tset)
1523{
1524
1525 static nodemask_t cpuset_attach_nodemask_to;
1526 struct task_struct *task;
1527 struct task_struct *leader;
1528 struct cgroup_subsys_state *css;
1529 struct cpuset *cs;
1530 struct cpuset *oldcs = cpuset_attach_old_cs;
1531
1532 cgroup_taskset_first(tset, &css);
1533 cs = css_cs(css);
1534
1535 mutex_lock(&cpuset_mutex);
1536
1537
1538 if (cs == &top_cpuset)
1539 cpumask_copy(cpus_attach, cpu_possible_mask);
1540 else
1541 guarantee_online_cpus(cs, cpus_attach);
1542
1543 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1544
1545 cgroup_taskset_for_each(task, css, tset) {
1546
1547
1548
1549
1550 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1551
1552 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1553 cpuset_update_task_spread_flag(cs, task);
1554 }
1555
1556
1557
1558
1559
1560 cpuset_attach_nodemask_to = cs->effective_mems;
1561 cgroup_taskset_for_each_leader(leader, css, tset) {
1562 struct mm_struct *mm = get_task_mm(leader);
1563
1564 if (mm) {
1565 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575 if (is_memory_migrate(cs))
1576 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1577 &cpuset_attach_nodemask_to);
1578 else
1579 mmput(mm);
1580 }
1581 }
1582
1583 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1584
1585 cs->attach_in_progress--;
1586 if (!cs->attach_in_progress)
1587 wake_up(&cpuset_attach_wq);
1588
1589 mutex_unlock(&cpuset_mutex);
1590}
1591
1592
1593
1594typedef enum {
1595 FILE_MEMORY_MIGRATE,
1596 FILE_CPULIST,
1597 FILE_MEMLIST,
1598 FILE_EFFECTIVE_CPULIST,
1599 FILE_EFFECTIVE_MEMLIST,
1600 FILE_CPU_EXCLUSIVE,
1601 FILE_MEM_EXCLUSIVE,
1602 FILE_MEM_HARDWALL,
1603 FILE_SCHED_LOAD_BALANCE,
1604 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1605 FILE_MEMORY_PRESSURE_ENABLED,
1606 FILE_MEMORY_PRESSURE,
1607 FILE_SPREAD_PAGE,
1608 FILE_SPREAD_SLAB,
1609} cpuset_filetype_t;
1610
1611static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1612 u64 val)
1613{
1614 struct cpuset *cs = css_cs(css);
1615 cpuset_filetype_t type = cft->private;
1616 int retval = 0;
1617
1618 mutex_lock(&cpuset_mutex);
1619 if (!is_cpuset_online(cs)) {
1620 retval = -ENODEV;
1621 goto out_unlock;
1622 }
1623
1624 switch (type) {
1625 case FILE_CPU_EXCLUSIVE:
1626 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1627 break;
1628 case FILE_MEM_EXCLUSIVE:
1629 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1630 break;
1631 case FILE_MEM_HARDWALL:
1632 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1633 break;
1634 case FILE_SCHED_LOAD_BALANCE:
1635 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1636 break;
1637 case FILE_MEMORY_MIGRATE:
1638 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1639 break;
1640 case FILE_MEMORY_PRESSURE_ENABLED:
1641 cpuset_memory_pressure_enabled = !!val;
1642 break;
1643 case FILE_SPREAD_PAGE:
1644 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1645 break;
1646 case FILE_SPREAD_SLAB:
1647 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1648 break;
1649 default:
1650 retval = -EINVAL;
1651 break;
1652 }
1653out_unlock:
1654 mutex_unlock(&cpuset_mutex);
1655 return retval;
1656}
1657
1658static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1659 s64 val)
1660{
1661 struct cpuset *cs = css_cs(css);
1662 cpuset_filetype_t type = cft->private;
1663 int retval = -ENODEV;
1664
1665 mutex_lock(&cpuset_mutex);
1666 if (!is_cpuset_online(cs))
1667 goto out_unlock;
1668
1669 switch (type) {
1670 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1671 retval = update_relax_domain_level(cs, val);
1672 break;
1673 default:
1674 retval = -EINVAL;
1675 break;
1676 }
1677out_unlock:
1678 mutex_unlock(&cpuset_mutex);
1679 return retval;
1680}
1681
1682
1683
1684
1685static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1686 char *buf, size_t nbytes, loff_t off)
1687{
1688 struct cpuset *cs = css_cs(of_css(of));
1689 struct cpuset *trialcs;
1690 int retval = -ENODEV;
1691
1692 buf = strstrip(buf);
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713 css_get(&cs->css);
1714 kernfs_break_active_protection(of->kn);
1715 flush_work(&cpuset_hotplug_work);
1716
1717 mutex_lock(&cpuset_mutex);
1718 if (!is_cpuset_online(cs))
1719 goto out_unlock;
1720
1721 trialcs = alloc_trial_cpuset(cs);
1722 if (!trialcs) {
1723 retval = -ENOMEM;
1724 goto out_unlock;
1725 }
1726
1727 switch (of_cft(of)->private) {
1728 case FILE_CPULIST:
1729 retval = update_cpumask(cs, trialcs, buf);
1730 break;
1731 case FILE_MEMLIST:
1732 retval = update_nodemask(cs, trialcs, buf);
1733 break;
1734 default:
1735 retval = -EINVAL;
1736 break;
1737 }
1738
1739 free_trial_cpuset(trialcs);
1740out_unlock:
1741 mutex_unlock(&cpuset_mutex);
1742 kernfs_unbreak_active_protection(of->kn);
1743 css_put(&cs->css);
1744 flush_workqueue(cpuset_migrate_mm_wq);
1745 return retval ?: nbytes;
1746}
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1757{
1758 struct cpuset *cs = css_cs(seq_css(sf));
1759 cpuset_filetype_t type = seq_cft(sf)->private;
1760 int ret = 0;
1761
1762 spin_lock_irq(&callback_lock);
1763
1764 switch (type) {
1765 case FILE_CPULIST:
1766 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
1767 break;
1768 case FILE_MEMLIST:
1769 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
1770 break;
1771 case FILE_EFFECTIVE_CPULIST:
1772 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
1773 break;
1774 case FILE_EFFECTIVE_MEMLIST:
1775 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
1776 break;
1777 default:
1778 ret = -EINVAL;
1779 }
1780
1781 spin_unlock_irq(&callback_lock);
1782 return ret;
1783}
1784
1785static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1786{
1787 struct cpuset *cs = css_cs(css);
1788 cpuset_filetype_t type = cft->private;
1789 switch (type) {
1790 case FILE_CPU_EXCLUSIVE:
1791 return is_cpu_exclusive(cs);
1792 case FILE_MEM_EXCLUSIVE:
1793 return is_mem_exclusive(cs);
1794 case FILE_MEM_HARDWALL:
1795 return is_mem_hardwall(cs);
1796 case FILE_SCHED_LOAD_BALANCE:
1797 return is_sched_load_balance(cs);
1798 case FILE_MEMORY_MIGRATE:
1799 return is_memory_migrate(cs);
1800 case FILE_MEMORY_PRESSURE_ENABLED:
1801 return cpuset_memory_pressure_enabled;
1802 case FILE_MEMORY_PRESSURE:
1803 return fmeter_getrate(&cs->fmeter);
1804 case FILE_SPREAD_PAGE:
1805 return is_spread_page(cs);
1806 case FILE_SPREAD_SLAB:
1807 return is_spread_slab(cs);
1808 default:
1809 BUG();
1810 }
1811
1812
1813 return 0;
1814}
1815
1816static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1817{
1818 struct cpuset *cs = css_cs(css);
1819 cpuset_filetype_t type = cft->private;
1820 switch (type) {
1821 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1822 return cs->relax_domain_level;
1823 default:
1824 BUG();
1825 }
1826
1827
1828 return 0;
1829}
1830
1831
1832
1833
1834
1835
1836static struct cftype files[] = {
1837 {
1838 .name = "cpus",
1839 .seq_show = cpuset_common_seq_show,
1840 .write = cpuset_write_resmask,
1841 .max_write_len = (100U + 6 * NR_CPUS),
1842 .private = FILE_CPULIST,
1843 },
1844
1845 {
1846 .name = "mems",
1847 .seq_show = cpuset_common_seq_show,
1848 .write = cpuset_write_resmask,
1849 .max_write_len = (100U + 6 * MAX_NUMNODES),
1850 .private = FILE_MEMLIST,
1851 },
1852
1853 {
1854 .name = "effective_cpus",
1855 .seq_show = cpuset_common_seq_show,
1856 .private = FILE_EFFECTIVE_CPULIST,
1857 },
1858
1859 {
1860 .name = "effective_mems",
1861 .seq_show = cpuset_common_seq_show,
1862 .private = FILE_EFFECTIVE_MEMLIST,
1863 },
1864
1865 {
1866 .name = "cpu_exclusive",
1867 .read_u64 = cpuset_read_u64,
1868 .write_u64 = cpuset_write_u64,
1869 .private = FILE_CPU_EXCLUSIVE,
1870 },
1871
1872 {
1873 .name = "mem_exclusive",
1874 .read_u64 = cpuset_read_u64,
1875 .write_u64 = cpuset_write_u64,
1876 .private = FILE_MEM_EXCLUSIVE,
1877 },
1878
1879 {
1880 .name = "mem_hardwall",
1881 .read_u64 = cpuset_read_u64,
1882 .write_u64 = cpuset_write_u64,
1883 .private = FILE_MEM_HARDWALL,
1884 },
1885
1886 {
1887 .name = "sched_load_balance",
1888 .read_u64 = cpuset_read_u64,
1889 .write_u64 = cpuset_write_u64,
1890 .private = FILE_SCHED_LOAD_BALANCE,
1891 },
1892
1893 {
1894 .name = "sched_relax_domain_level",
1895 .read_s64 = cpuset_read_s64,
1896 .write_s64 = cpuset_write_s64,
1897 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1898 },
1899
1900 {
1901 .name = "memory_migrate",
1902 .read_u64 = cpuset_read_u64,
1903 .write_u64 = cpuset_write_u64,
1904 .private = FILE_MEMORY_MIGRATE,
1905 },
1906
1907 {
1908 .name = "memory_pressure",
1909 .read_u64 = cpuset_read_u64,
1910 .private = FILE_MEMORY_PRESSURE,
1911 },
1912
1913 {
1914 .name = "memory_spread_page",
1915 .read_u64 = cpuset_read_u64,
1916 .write_u64 = cpuset_write_u64,
1917 .private = FILE_SPREAD_PAGE,
1918 },
1919
1920 {
1921 .name = "memory_spread_slab",
1922 .read_u64 = cpuset_read_u64,
1923 .write_u64 = cpuset_write_u64,
1924 .private = FILE_SPREAD_SLAB,
1925 },
1926
1927 {
1928 .name = "memory_pressure_enabled",
1929 .flags = CFTYPE_ONLY_ON_ROOT,
1930 .read_u64 = cpuset_read_u64,
1931 .write_u64 = cpuset_write_u64,
1932 .private = FILE_MEMORY_PRESSURE_ENABLED,
1933 },
1934
1935 { }
1936};
1937
1938
1939
1940
1941
1942
1943static struct cgroup_subsys_state *
1944cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1945{
1946 struct cpuset *cs;
1947
1948 if (!parent_css)
1949 return &top_cpuset.css;
1950
1951 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1952 if (!cs)
1953 return ERR_PTR(-ENOMEM);
1954 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1955 goto free_cs;
1956 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1957 goto free_cpus;
1958
1959 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1960 cpumask_clear(cs->cpus_allowed);
1961 nodes_clear(cs->mems_allowed);
1962 cpumask_clear(cs->effective_cpus);
1963 nodes_clear(cs->effective_mems);
1964 fmeter_init(&cs->fmeter);
1965 cs->relax_domain_level = -1;
1966
1967 return &cs->css;
1968
1969free_cpus:
1970 free_cpumask_var(cs->cpus_allowed);
1971free_cs:
1972 kfree(cs);
1973 return ERR_PTR(-ENOMEM);
1974}
1975
1976static int cpuset_css_online(struct cgroup_subsys_state *css)
1977{
1978 struct cpuset *cs = css_cs(css);
1979 struct cpuset *parent = parent_cs(cs);
1980 struct cpuset *tmp_cs;
1981 struct cgroup_subsys_state *pos_css;
1982
1983 if (!parent)
1984 return 0;
1985
1986 mutex_lock(&cpuset_mutex);
1987
1988 set_bit(CS_ONLINE, &cs->flags);
1989 if (is_spread_page(parent))
1990 set_bit(CS_SPREAD_PAGE, &cs->flags);
1991 if (is_spread_slab(parent))
1992 set_bit(CS_SPREAD_SLAB, &cs->flags);
1993
1994 cpuset_inc();
1995
1996 spin_lock_irq(&callback_lock);
1997 if (is_in_v2_mode()) {
1998 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1999 cs->effective_mems = parent->effective_mems;
2000 }
2001 spin_unlock_irq(&callback_lock);
2002
2003 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2004 goto out_unlock;
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019 rcu_read_lock();
2020 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2021 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2022 rcu_read_unlock();
2023 goto out_unlock;
2024 }
2025 }
2026 rcu_read_unlock();
2027
2028 spin_lock_irq(&callback_lock);
2029 cs->mems_allowed = parent->mems_allowed;
2030 cs->effective_mems = parent->mems_allowed;
2031 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2032 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2033 spin_unlock_irq(&callback_lock);
2034out_unlock:
2035 mutex_unlock(&cpuset_mutex);
2036 return 0;
2037}
2038
2039
2040
2041
2042
2043
2044
2045static void cpuset_css_offline(struct cgroup_subsys_state *css)
2046{
2047 struct cpuset *cs = css_cs(css);
2048
2049 mutex_lock(&cpuset_mutex);
2050
2051 if (is_sched_load_balance(cs))
2052 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2053
2054 cpuset_dec();
2055 clear_bit(CS_ONLINE, &cs->flags);
2056
2057 mutex_unlock(&cpuset_mutex);
2058}
2059
2060static void cpuset_css_free(struct cgroup_subsys_state *css)
2061{
2062 struct cpuset *cs = css_cs(css);
2063
2064 free_cpumask_var(cs->effective_cpus);
2065 free_cpumask_var(cs->cpus_allowed);
2066 kfree(cs);
2067}
2068
2069static void cpuset_bind(struct cgroup_subsys_state *root_css)
2070{
2071 mutex_lock(&cpuset_mutex);
2072 spin_lock_irq(&callback_lock);
2073
2074 if (is_in_v2_mode()) {
2075 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2076 top_cpuset.mems_allowed = node_possible_map;
2077 } else {
2078 cpumask_copy(top_cpuset.cpus_allowed,
2079 top_cpuset.effective_cpus);
2080 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2081 }
2082
2083 spin_unlock_irq(&callback_lock);
2084 mutex_unlock(&cpuset_mutex);
2085}
2086
2087
2088
2089
2090
2091
2092static void cpuset_fork(struct task_struct *task)
2093{
2094 if (task_css_is_root(task, cpuset_cgrp_id))
2095 return;
2096
	set_cpus_allowed_ptr(task, &current->cpus_allowed);
2098 task->mems_allowed = current->mems_allowed;
2099}
2100
2101struct cgroup_subsys cpuset_cgrp_subsys = {
2102 .css_alloc = cpuset_css_alloc,
2103 .css_online = cpuset_css_online,
2104 .css_offline = cpuset_css_offline,
2105 .css_free = cpuset_css_free,
2106 .can_attach = cpuset_can_attach,
2107 .cancel_attach = cpuset_cancel_attach,
2108 .attach = cpuset_attach,
2109 .post_attach = cpuset_post_attach,
2110 .bind = cpuset_bind,
2111 .fork = cpuset_fork,
2112 .legacy_cftypes = files,
2113 .early_init = true,
2114};
2115
2116
2117
2118
2119
2120
2121
2122int __init cpuset_init(void)
2123{
2124 int err = 0;
2125
2126 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2127 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2128
2129 cpumask_setall(top_cpuset.cpus_allowed);
2130 nodes_setall(top_cpuset.mems_allowed);
2131 cpumask_setall(top_cpuset.effective_cpus);
2132 nodes_setall(top_cpuset.effective_mems);
2133
2134 fmeter_init(&top_cpuset.fmeter);
2135 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2136 top_cpuset.relax_domain_level = -1;
2137
2138 err = register_filesystem(&cpuset_fs_type);
2139 if (err < 0)
2140 return err;
2141
2142 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2143
2144 return 0;
2145}
2146
2147
2148
2149
2150
2151
2152
2153
2154static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2155{
2156 struct cpuset *parent;
2157
2158
2159
2160
2161
2162 parent = parent_cs(cs);
2163 while (cpumask_empty(parent->cpus_allowed) ||
2164 nodes_empty(parent->mems_allowed))
2165 parent = parent_cs(parent);
2166
2167 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2168 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2169 pr_cont_cgroup_name(cs->css.cgroup);
2170 pr_cont("\n");
2171 }
2172}
2173
2174static void
2175hotplug_update_tasks_legacy(struct cpuset *cs,
2176 struct cpumask *new_cpus, nodemask_t *new_mems,
2177 bool cpus_updated, bool mems_updated)
2178{
2179 bool is_empty;
2180
2181 spin_lock_irq(&callback_lock);
2182 cpumask_copy(cs->cpus_allowed, new_cpus);
2183 cpumask_copy(cs->effective_cpus, new_cpus);
2184 cs->mems_allowed = *new_mems;
2185 cs->effective_mems = *new_mems;
2186 spin_unlock_irq(&callback_lock);
2187
2188
2189
2190
2191
2192 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2193 update_tasks_cpumask(cs);
2194 if (mems_updated && !nodes_empty(cs->mems_allowed))
2195 update_tasks_nodemask(cs);
2196
2197 is_empty = cpumask_empty(cs->cpus_allowed) ||
2198 nodes_empty(cs->mems_allowed);
2199
2200 mutex_unlock(&cpuset_mutex);
2201
2202
2203
2204
2205
2206
2207 if (is_empty)
2208 remove_tasks_in_empty_cpuset(cs);
2209
2210 mutex_lock(&cpuset_mutex);
2211}
2212
2213static void
2214hotplug_update_tasks(struct cpuset *cs,
2215 struct cpumask *new_cpus, nodemask_t *new_mems,
2216 bool cpus_updated, bool mems_updated)
2217{
2218 if (cpumask_empty(new_cpus))
2219 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2220 if (nodes_empty(*new_mems))
2221 *new_mems = parent_cs(cs)->effective_mems;
2222
2223 spin_lock_irq(&callback_lock);
2224 cpumask_copy(cs->effective_cpus, new_cpus);
2225 cs->effective_mems = *new_mems;
2226 spin_unlock_irq(&callback_lock);
2227
2228 if (cpus_updated)
2229 update_tasks_cpumask(cs);
2230 if (mems_updated)
2231 update_tasks_nodemask(cs);
2232}
2233
2234
2235
2236
2237
2238
2239
2240
2241
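/*
 * cpuset_hotplug_update_tasks() - bring one cpuset in line with its
 * parent's post-hotplug effective masks.  On the legacy hierarchy a
 * cpuset left without CPUs or memory has its tasks moved to the nearest
 * non-empty ancestor; on the default hierarchy the effective masks
 * simply fall back to the parent's.
 */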
2242static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2243{
2244 static cpumask_t new_cpus;
2245 static nodemask_t new_mems;
2246 bool cpus_updated;
2247 bool mems_updated;
2248retry:
2249 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2250
2251 mutex_lock(&cpuset_mutex);
2252
2253
2254
2255
2256
2257 if (cs->attach_in_progress) {
2258 mutex_unlock(&cpuset_mutex);
2259 goto retry;
2260 }
2261
2262 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2263 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2264
2265 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2266 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2267
2268 if (is_in_v2_mode())
2269 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2270 cpus_updated, mems_updated);
2271 else
2272 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2273 cpus_updated, mems_updated);
2274
2275 mutex_unlock(&cpuset_mutex);
2276}
2277
2278static bool force_rebuild;
2279
2280void cpuset_force_rebuild(void)
2281{
2282 force_rebuild = true;
2283}
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
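/*
 * CPU/memory hotplug handler, run from a workqueue.  Synchronizes the
 * top cpuset's masks with cpu_active_mask and node_states[N_MEMORY],
 * propagates the change to every descendant cpuset, and rebuilds the
 * scheduler domains if the set of active CPUs changed (or a rebuild was
 * forced).
 */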
2301static void cpuset_hotplug_workfn(struct work_struct *work)
2302{
2303 static cpumask_t new_cpus;
2304 static nodemask_t new_mems;
2305 bool cpus_updated, mems_updated;
2306 bool on_dfl = is_in_v2_mode();
2307
2308 mutex_lock(&cpuset_mutex);
2309
2310
2311 cpumask_copy(&new_cpus, cpu_active_mask);
2312 new_mems = node_states[N_MEMORY];
2313
2314 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2315 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2316
2317
2318 if (cpus_updated) {
2319 spin_lock_irq(&callback_lock);
2320 if (!on_dfl)
2321 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2322 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2323 spin_unlock_irq(&callback_lock);
2324
2325 }
2326
2327
2328 if (mems_updated) {
2329 spin_lock_irq(&callback_lock);
2330 if (!on_dfl)
2331 top_cpuset.mems_allowed = new_mems;
2332 top_cpuset.effective_mems = new_mems;
2333 spin_unlock_irq(&callback_lock);
2334 update_tasks_nodemask(&top_cpuset);
2335 }
2336
2337 mutex_unlock(&cpuset_mutex);
2338
2339
2340 if (cpus_updated || mems_updated) {
2341 struct cpuset *cs;
2342 struct cgroup_subsys_state *pos_css;
2343
2344 rcu_read_lock();
2345 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2346 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2347 continue;
2348 rcu_read_unlock();
2349
2350 cpuset_hotplug_update_tasks(cs);
2351
2352 rcu_read_lock();
2353 css_put(&cs->css);
2354 }
2355 rcu_read_unlock();
2356 }
2357
2358
2359 if (cpus_updated || force_rebuild) {
2360 force_rebuild = false;
2361 rebuild_sched_domains();
2362 }
2363}
2364
2365void cpuset_update_active_cpus(void)
2366{
2367
2368
2369
2370
2371
2372 schedule_work(&cpuset_hotplug_work);
2373}
2374
2375void cpuset_wait_for_hotplug(void)
2376{
2377 flush_work(&cpuset_hotplug_work);
2378}
2379
2380
2381
2382
2383
2384
2385static int cpuset_track_online_nodes(struct notifier_block *self,
2386 unsigned long action, void *arg)
2387{
2388 schedule_work(&cpuset_hotplug_work);
2389 return NOTIFY_OK;
2390}
2391
2392static struct notifier_block cpuset_track_online_nodes_nb = {
2393 .notifier_call = cpuset_track_online_nodes,
2394 .priority = 10,
2395};
2396
2397
2398
2399
2400
2401
2402void __init cpuset_init_smp(void)
2403{
2404 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2405 top_cpuset.mems_allowed = node_states[N_MEMORY];
2406 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2407
2408 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2409 top_cpuset.effective_mems = node_states[N_MEMORY];
2410
2411 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2412
2413 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
2414 BUG_ON(!cpuset_migrate_mm_wq);
2415}
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2429{
2430 unsigned long flags;
2431
2432 spin_lock_irqsave(&callback_lock, flags);
2433 rcu_read_lock();
2434 guarantee_online_cpus(task_cs(tsk), pmask);
2435 rcu_read_unlock();
2436 spin_unlock_irqrestore(&callback_lock, flags);
2437}
2438
2439void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2440{
2441 rcu_read_lock();
2442 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2443 rcu_read_unlock();
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462}
2463
2464void __init cpuset_init_current_mems_allowed(void)
2465{
2466 nodes_setall(current->mems_allowed);
2467}
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2480{
2481 nodemask_t mask;
2482 unsigned long flags;
2483
2484 spin_lock_irqsave(&callback_lock, flags);
2485 rcu_read_lock();
2486 guarantee_online_mems(task_cs(tsk), &mask);
2487 rcu_read_unlock();
2488 spin_unlock_irqrestore(&callback_lock, flags);
2489
2490 return mask;
2491}
2492
2493
2494
2495
2496
2497
2498
2499int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2500{
2501 return nodes_intersects(*nodemask, current->mems_allowed);
2502}
2503
2504
2505
2506
2507
2508
2509
2510static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2511{
2512 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2513 cs = parent_cs(cs);
2514 return cs;
2515}
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
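/*
 * __cpuset_node_allowed() - may current allocate on memory node @node?
 *
 * Allowed unconditionally in interrupt context, when the node is in
 * current->mems_allowed, or when current is an OOM victim.  A
 * __GFP_HARDWALL request outside mems_allowed is then denied; otherwise
 * exiting tasks are allowed, and everything else is allowed only if the
 * node belongs to the nearest mem_exclusive or mem_hardwall ancestor
 * cpuset.
 */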
2557bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
2558{
2559 struct cpuset *cs;
2560 int allowed;
2561 unsigned long flags;
2562
2563 if (in_interrupt())
2564 return true;
2565 if (node_isset(node, current->mems_allowed))
2566 return true;
2567
2568
2569
2570
2571 if (unlikely(tsk_is_oom_victim(current)))
2572 return true;
2573 if (gfp_mask & __GFP_HARDWALL)
2574 return false;
2575
2576 if (current->flags & PF_EXITING)
2577 return true;
2578
2579
2580 spin_lock_irqsave(&callback_lock, flags);
2581
2582 rcu_read_lock();
2583 cs = nearest_hardwall_ancestor(task_cs(current));
2584 allowed = node_isset(node, cs->mems_allowed);
2585 rcu_read_unlock();
2586
2587 spin_unlock_irqrestore(&callback_lock, flags);
2588 return allowed;
2589}
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
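/*
 * Rotor-based node selection for memory_spread_page/memory_spread_slab:
 * each call returns the next node in current->mems_allowed after the
 * per-task rotor, wrapping around, so spread allocations are
 * distributed evenly across the allowed nodes.
 */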
2618static int cpuset_spread_node(int *rotor)
2619{
2620 return *rotor = next_node_in(*rotor, current->mems_allowed);
2621}
2622
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
2631
int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
2640
2641EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2655 const struct task_struct *tsk2)
2656{
2657 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2658}
2659
2660
2661
2662
2663
2664
2665
2666void cpuset_print_current_mems_allowed(void)
2667{
2668 struct cgroup *cgrp;
2669
2670 rcu_read_lock();
2671
2672 cgrp = task_cs(current)->css.cgroup;
2673 pr_info("%s cpuset=", current->comm);
2674 pr_cont_cgroup_name(cgrp);
	pr_cont(" mems_allowed=%*pbl\n",
		nodemask_pr_args(&current->mems_allowed));
2677
2678 rcu_read_unlock();
2679}
2680
2681
2682
2683
2684
2685
2686
2687int cpuset_memory_pressure_enabled __read_mostly;
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707void __cpuset_memory_pressure_bump(void)
2708{
2709 rcu_read_lock();
2710 fmeter_markevent(&task_cs(current)->fmeter);
2711 rcu_read_unlock();
2712}
2713
2714#ifdef CONFIG_PROC_PID_CPUSET
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2725 struct pid *pid, struct task_struct *tsk)
2726{
2727 char *buf;
2728 struct cgroup_subsys_state *css;
2729 int retval;
2730
2731 retval = -ENOMEM;
2732 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2733 if (!buf)
2734 goto out;
2735
2736 css = task_get_css(tsk, cpuset_cgrp_id);
2737 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
2738 current->nsproxy->cgroup_ns);
2739 css_put(css);
2740 if (retval >= PATH_MAX)
2741 retval = -ENAMETOOLONG;
2742 if (retval < 0)
2743 goto out_free;
2744 seq_puts(m, buf);
2745 seq_putc(m, '\n');
2746 retval = 0;
2747out_free:
2748 kfree(buf);
2749out:
2750 return retval;
2751}
2752#endif
2753
2754
2755void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2756{
2757 seq_printf(m, "Mems_allowed:\t%*pb\n",
2758 nodemask_pr_args(&task->mems_allowed));
2759 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
2760 nodemask_pr_args(&task->mems_allowed));
2761}
2762